In [1]:
import pandas as pd
from tqdm import tqdm
import os
import json
import itertools
from config import (
    MAJOR_ELECTIONS_DATES_FILE,
    TEN_PCT_SAMPLE_FILE,
    PROCESSED_DATA_DIR,
    FEATURES_FILE,
    election_date_to_feature_names,
)

In [3]:
# Find the percent of all samples which have each election date.
#
# Every CSV in PROCESSED_DATA_DIR (other than TEN_PCT_SAMPLE_FILE) is read in
# full; for each date listed under "dates" in MAJOR_ELECTIONS_DATES_FILE, the
# "Election <date> Presence" column is summed when the file has that column.

# Use a context manager so the JSON file handle is closed deterministically.
with open(MAJOR_ELECTIONS_DATES_FILE) as dates_file:
    major_elections = json.load(dates_file)["dates"]

num_total = 0  # total number of rows across all counted CSV files
num_sampled = [0] * len(major_elections)  # per-date presence sums, parallel to major_elections

for file in tqdm(
    os.listdir(PROCESSED_DATA_DIR), desc="Reading files", unit="file", colour="green"
):
    # Skip non-CSV entries and the 10% sample file.
    # NOTE(review): presumably the sample is derived from the other CSVs and
    # would double-count rows -- confirm.
    if not file.endswith(".csv") or file == TEN_PCT_SAMPLE_FILE:
        continue

    df = pd.read_csv(os.path.join(PROCESSED_DATA_DIR, file))
    num_total += len(df)

    # enumerate() yields the slot directly, avoiding a linear list.index()
    # search per date per file (and mis-attribution if a date were repeated).
    for i, date in enumerate(major_elections):
        presence_column = f"Election {date} Presence"

        # Files lacking the column contribute nothing for that date.
        if presence_column in df.columns:
            num_sampled[i] += df[presence_column].sum()

for date, count in zip(major_elections, num_sampled):
    # Guard against an empty data directory, which would otherwise raise
    # ZeroDivisionError; report 0% in that case.
    pct = count / num_total * 100 if num_total else 0.0
    print(f"{date}: has {count} samples and {pct:.2f}% existance")


Reading files: 100%|[32m██████████[0m| 70/70 [02:01<00:00,  1.73s/file]

11/04/2003: has 1860273 samples and 19.67% existance
04/27/2004: has 3437031 samples and 36.35% existance
11/02/2004: has 4484242 samples and 47.42% existance
05/17/2005: has 6635385 samples and 70.17% existance
11/08/2005: has 7313625 samples and 77.34% existance
05/16/2006: has 7343628 samples and 77.66% existance
11/07/2006: has 7343628 samples and 77.66% existance
05/15/2007: has 8471785 samples and 89.59% existance
11/06/2007: has 8471785 samples and 89.59% existance
04/22/2008: has 9456662 samples and 100.00% existance
11/04/2008: has 9456662 samples and 100.00% existance
05/19/2009: has 9456662 samples and 100.00% existance
11/03/2009: has 9456662 samples and 100.00% existance
05/18/2010: has 9456662 samples and 100.00% existance
11/02/2010: has 9456662 samples and 100.00% existance
05/17/2011: has 9456662 samples and 100.00% existance
11/08/2011: has 9456662 samples and 100.00% existance
04/24/2012: has 9456662 samples and 100.00% existance
11/06/2012: has 9456662 samples and 1




Now, we've decided which election years to use (2008 Primary through 2023 General) by thresholding the existence rate at 100%. We are left with 30 elections, for which we'll use 10 sliding windows of 20 features (10 years) each.

Now, we will build a full dataset, sampling from the data for each model. Optimally, we would use the full dataset on each sliding window (thus, a 100% sample), but this would result in about 100,000,000 samples. To balance computational cost, we'll sample 10% of the data for now.

In [3]:
# Load the election dates used as features and the demographic feature names.
# Context managers ensure both JSON file handles are closed after reading.
with open(MAJOR_ELECTIONS_DATES_FILE) as dates_file:
    feature_election_dates = json.load(dates_file)["feature_dates"]

with open(FEATURES_FILE) as features_file:
    demographic_features = json.load(features_file)["demographic"]

# Allow for 10 years of election history and 1 response variable election
SLIDING_WINDOW_LENGTH = 21
# Number of start positions at which a full window fits within the date list.
num_sliding_windows = len(feature_election_dates) - SLIDING_WINDOW_LENGTH + 1

# Not yet enabled: planned construction of the final column list.
# election_features = itertools.chain.from_iterable(election_date_to_feature_names(date) for date in feature_election_dates)

# final_df_columns = demographic_features + list(election_features)

# Now, read in the data and sample for each sliding window



11