In [1]:
import itertools
import json
import os
import sys

import pandas as pd
from tqdm import tqdm

from config import (FEATURES_FILE, MAJOR_ELECTIONS_DATES_FILE,
                    PROCESSED_DATA_DIR, SLIDING_WINDOW_FILE,
                    TEN_PCT_SAMPLE_FILE, election_date_to_feature_names)

In [3]:
# Find the percent of all samples which have each election date.
#
# For every CSV in PROCESSED_DATA_DIR (except the pre-built ten-percent sample file)
# this adds the file's row count to a running total and, for each major election
# date, adds the sum of its "Election <date> Presence" column to that election's
# tally. The tallies are then printed as counts and as a share of all rows.

# Use a context manager so the JSON file handle is closed right after parsing
with open(MAJOR_ELECTIONS_DATES_FILE) as dates_file:
    major_elections = json.load(dates_file)["dates"]

num_total = 0
num_sampled = [0] * len(major_elections)

for file in tqdm(
    os.listdir(PROCESSED_DATA_DIR), desc="Reading files", unit="file", colour="green"
):
    # Only the per-file CSVs are counted; the ten-percent sample is skipped
    if not file.endswith(".csv") or file == TEN_PCT_SAMPLE_FILE:
        continue

    df = pd.read_csv(os.path.join(PROCESSED_DATA_DIR, file))

    num_total += len(df)

    # enumerate() gives each date its own slot directly; a list.index() lookup
    # would rescan the list on every pass and always hit the first duplicate
    for election_idx, date in enumerate(major_elections):
        presence_column = f"Election {date} Presence"

        # A file without this election's presence column contributes nothing
        if presence_column in df.columns:
            num_sampled[election_idx] += df[presence_column].sum()

for date, count in zip(major_elections, num_sampled):
    # Guard the share so an empty data directory reports 0% instead of raising
    share = count / num_total * 100 if num_total else 0.0
    print(f"{date}: has {count} samples and {share:.2f}% existance")


Reading files: 100%|[32m██████████[0m| 70/70 [02:01<00:00,  1.73s/file]

11/04/2003: has 1860273 samples and 19.67% existance
04/27/2004: has 3437031 samples and 36.35% existance
11/02/2004: has 4484242 samples and 47.42% existance
05/17/2005: has 6635385 samples and 70.17% existance
11/08/2005: has 7313625 samples and 77.34% existance
05/16/2006: has 7343628 samples and 77.66% existance
11/07/2006: has 7343628 samples and 77.66% existance
05/15/2007: has 8471785 samples and 89.59% existance
11/06/2007: has 8471785 samples and 89.59% existance
04/22/2008: has 9456662 samples and 100.00% existance
11/04/2008: has 9456662 samples and 100.00% existance
05/19/2009: has 9456662 samples and 100.00% existance
11/03/2009: has 9456662 samples and 100.00% existance
05/18/2010: has 9456662 samples and 100.00% existance
11/02/2010: has 9456662 samples and 100.00% existance
05/17/2011: has 9456662 samples and 100.00% existance
11/08/2011: has 9456662 samples and 100.00% existance
04/24/2012: has 9456662 samples and 100.00% existance
11/06/2012: has 9456662 samples and 1




We've now decided which elections to use (2008 Primary through 2023 General) by thresholding the existence rate at 100%. We are left with 30 elections, for which we'll use 10 sliding windows, each made up of 20 feature elections (10 years of history) and 1 response election.

Now, we will build a full dataset, sampling from the data for each model. Optimally, we would sample the full dataset on each sliding window (thus, a 100% sample), but this would result in about 100,000,000 samples. To balance computational cost, we'll sample 10% of the data for now, which will result in about 10,000,000 samples.

In [8]:
# Define the column layout shared by every sliding window: the retained
# demographic features, the history-election features named relative to the
# response election ("T-20" ... "T-1"), and a final "Response" column.

# Context managers close the JSON file handles as soon as they are parsed
with open(MAJOR_ELECTIONS_DATES_FILE) as dates_file:
    feature_election_dates = json.load(dates_file)["feature_dates"]

with open(FEATURES_FILE) as features_file:
    demographic_features = json.load(features_file)["demographic"]

# Allow for 10 years of election history and 1 response variable election
SLIDING_WINDOW_LENGTH = 21
num_sliding_windows = len(feature_election_dates) - SLIDING_WINDOW_LENGTH + 1

# Fail here, with a clear message, rather than later when no windows can be built
if num_sliding_windows < 1:
    raise ValueError(
        f"Need at least {SLIDING_WINDOW_LENGTH} feature election dates to build a "
        f"sliding window, but only {len(feature_election_dates)} were found"
    )

# Use the election names "T-20", "T-19", ..., "T-1" for the feature elections, oldest first.
# However, we've decided to only use elections which have 100% presence, so "Presence" columns are redundant
sliding_election_column_names = [
    feat
    for offset in range(SLIDING_WINDOW_LENGTH - 1, 0, -1)
    for feat in election_date_to_feature_names(f"T-{offset}")
    if "Presence" not in feat
]

# Remove the features which would cause leakage in the sliding window approach
# (only demographic features whose name contains "Gender" are kept)
sliding_demographic_features = [feature for feature in demographic_features if "Gender" in feature]

sliding_features = sliding_demographic_features + sliding_election_column_names + ["Response"]

# Empty frame carrying the target column layout
df = pd.DataFrame(columns=sliding_features)

Above, we've removed some demographic features for this sliding window approach. For example, `Registration Date`, `Last Vote Date`, etc. have been removed. Most importantly, `DOB` has been removed. Including any of these features would require recomputing its value for every sample as of that sample's response election (for example, `Last Vote Date` would have to be the last vote cast before that election). For `DOB`, the age of the voter should be adjusted for each election year. For this version, we have left this out in the interest of testing general model ability on this data. However, we plan to include it in our final version.

In [10]:
# Build the sliding-window dataset: for every processed CSV and every sliding
# window, draw a random sample of rows restricted to that window's columns,
# rename the columns to the shared window layout, and write the concatenation
# of all samples to SLIDING_WINDOW_FILE.

# Fraction of each file's rows drawn for each sliding window.
# NOTE(review): the surrounding narrative describes a 10% sample - confirm 0.01 is intended.
SAMPLE_FRACTION = 0.01

# Base seed; every (file, window) draw gets its own offset from it so that a
# Restart & Run All reproduces the same dataset without reusing one draw everywhere
RANDOM_SEED = 42

# The columns selected for a window do not depend on the file, so build them once
window_columns = []
for window_idx in range(num_sliding_windows):
    # Gather the election features for the current sliding window
    history_dates = feature_election_dates[window_idx : window_idx + SLIDING_WINDOW_LENGTH - 1]
    election_columns = [
        feat
        for date in history_dates
        for feat in election_date_to_feature_names(date)
        if "Presence" not in feat
    ]

    # Gather the response variable election for the current sliding window
    response_column = f"Election {feature_election_dates[window_idx + SLIDING_WINDOW_LENGTH - 1]} Voted"

    window_columns.append(sliding_demographic_features + election_columns + [response_column])

# This cell's own output has the renamed window columns rather than the raw
# per-file ones, so it must not be read back in if it sits in PROCESSED_DATA_DIR
output_path = os.path.abspath(SLIDING_WINDOW_FILE)

dfs = []

# Now, read in the data and sample for each sliding window.
# sorted() pins the file order so the per-draw seeds are stable between runs
for file_idx, file in enumerate(tqdm(
    sorted(os.listdir(PROCESSED_DATA_DIR)), desc="Reading files", unit="file", colour="green"
)):
    if not file.endswith(".csv") or file == TEN_PCT_SAMPLE_FILE:
        continue

    file_path = os.path.join(PROCESSED_DATA_DIR, file)
    if os.path.abspath(file_path) == output_path:
        continue

    voters = pd.read_csv(file_path)

    # Sample each sliding window
    for window_idx, all_selected_columns in enumerate(window_columns):
        df_sample = voters[all_selected_columns].sample(
            frac=SAMPLE_FRACTION,
            random_state=RANDOM_SEED + file_idx * num_sliding_windows + window_idx,
        )
        # Every window maps onto the same relative column names
        df_sample.columns = sliding_features

        dfs.append(df_sample)

df = pd.concat(dfs)
df.to_csv(SLIDING_WINDOW_FILE, index=False)


Reading files: 100%|[32m██████████[0m| 72/72 [02:08<00:00,  1.78s/file]


NOTE: For now, this dataset includes voters who were actually registered after the response election dates. For the next version, we will check the `DOB` and eliminate voters from the sample who could never have been in the voter file at that time. Note that there is still some bias introduced which we cannot overcome, from voters who have been removed from the file since the response election date. Those voters may have died or moved, but we'd like our model to still learn from them. The only solution to this is to request the voter files from those dates, which would (for us) be too time-consuming and expensive to be worth it.