In [46]:
import os
from collections import defaultdict
from math import exp

import numpy as np
import pandas as pd
import plotly.express as px

from utils import get_names2abbrs_dict, get_samples

In [47]:
# Date window for the analysis: both values are passed to get_samples() and
# embedded in the figure titles / output filenames.
start_date, end_date = '2020-05-01', '2020-10-31'

In [48]:
# Load the 2016 battleground labels and keep only the state code and its label.
battleground_df = pd.read_csv(
    'pop_data/2016_pres_labels_battleground.csv'
).loc[:, ['state_po', 'battleground']]

In [49]:
# Lookup table: state postal code -> value of the 'battleground' column.
battleground_states = battleground_df.set_index('state_po')['battleground'].to_dict()

In [50]:
# Raw poll table; it is handed to get_samples() together with the date window.
d_probs = pd.read_csv(filepath_or_buffer='poll_data/all_pres_polls_d_probs.csv')

In [51]:
# Collect the samples for the configured date window.
# NOTE(review): presumably a mapping of state postal code -> sequence of poll
# sample sizes (it is indexed by state and fed to sum/len/np.mean) — confirm
# against utils.get_samples.
state_polls = get_samples(d_probs, start_date, end_date)

# Sorted keys give a stable, alphabetical state ordering for every table.
all_state_pos = sorted(state_polls)

In [52]:
# Full population history for every state.
state_populations = pd.read_csv('pop_data/1900-2019-all-states-populations.csv')

# Lookup table: state postal code -> population in the 2019 rows.
current_populations = (
    state_populations
    .loc[state_populations['year'] == 2019]
    .set_index('state_po')['population']
    .to_dict()
)

In [53]:
def apply_poll_func(poll_func):
    """Evaluate ``poll_func`` for every state and tabulate the results.

    Parameters
    ----------
    poll_func : callable
        Called once per entry of the module-level ``all_state_pos`` with the
        state postal code; its return value is stored in the ``samples`` column.

    Returns
    -------
    pandas.DataFrame
        Columns ``state_po``, ``samples`` and ``battleground`` (the latter
        looked up in the module-level ``battleground_states``), sorted by
        ``battleground`` and then ``state_po``.

    Raises
    ------
    KeyError
        If a state in ``all_state_pos`` is missing from ``battleground_states``.
    """
    rows = []
    for state in all_state_pos:
        rows.append((state, poll_func(state), battleground_states[state]))

    table = pd.DataFrame(rows, columns=['state_po', 'samples', 'battleground'])
    return table.sort_values(by=['battleground', 'state_po'])

In [67]:
def _total_sample(po):
    """Sum of the entries stored in ``state_polls`` for state ``po``."""
    return sum(state_polls[po])


def _mean_sample(po):
    """Arithmetic mean of the entries stored in ``state_polls`` for state ``po``."""
    return np.mean(state_polls[po])


def _poll_count(po):
    """Number of entries stored in ``state_polls`` for state ``po``."""
    return len(state_polls[po])


def _per_1k_population(stat):
    """Wrap ``stat`` so its value is expressed per 1,000 entries of ``current_populations``."""
    def scaled(po):
        return 1000 * stat(po) / current_populations[po]
    return scaled


# Statistic name -> function of a state postal code. The *_pop_adj variants
# divide the raw statistic by the state's population in thousands.
poll_funcs = {
    'sum': _total_sample,
    'mean': _mean_sample,
    'number': _poll_count,
    'sum_pop_adj': _per_1k_population(_total_sample),
    'mean_pop_adj': _per_1k_population(_mean_sample),
    'number_pop_adj': _per_1k_population(_poll_count),
}

# Statistic name -> axis label (also appended to the figure title).
y_axes = {
    'sum': 'Cumulative Sample',
    'mean': 'Mean Sample Size',
    'number': 'Number of Polls',
    'sum_pop_adj': 'Cumulative Sample Per 1k Population',
    'mean_pop_adj': 'Mean Sample Size Per 1k Population',
    'number_pop_adj': 'Number of Polls Per 1k Population',
}

In [69]:
# Render one bar chart per aggregate statistic and save both the image and the
# table behind it.

# Make sure the output folders exist so a fresh checkout does not fail with
# FileNotFoundError; this also creates 'figures' and is a no-op when present.
os.makedirs('figures/data', exist_ok=True)

prefix = f'Aggregate Polling ({start_date} - {end_date}) '
for name, func in poll_funcs.items():
    title = prefix + y_axes[name]
    # Filename = lower-cased title without parentheses, whitespace runs -> '_'
    # (e.g. 'aggregate_polling_2020-05-01_-_2020-10-31_cumulative_sample').
    filename = '_'.join(title.lower().replace('(', '').replace(')', '').split())

    df = apply_poll_func(func)
    fig = px.bar(df, x='state_po', y='samples', color='battleground',
                 labels={'samples': y_axes[name], 'state_po': 'State'},
                 title=title)
    # Fixed canvas size so every exported PNG has identical dimensions.
    fig.update_layout(autosize=False, width=1000, height=500)

    # NOTE(review): write_image relies on plotly's static-image export backend
    # being installed (e.g. kaleido) — confirm the environment provides it.
    fig.write_image(f'figures/{filename}.png')
    df.to_csv(f'figures/data/{filename}.csv', index=False)
    print(f'Wrote to {filename}')

Wrote to aggregate_polling_2020-05-01_-_2020-10-31_cumulative_sample
Wrote to aggregate_polling_2020-05-01_-_2020-10-31_mean_sample_size
Wrote to aggregate_polling_2020-05-01_-_2020-10-31_number_of_polls
Wrote to aggregate_polling_2020-05-01_-_2020-10-31_cumulative_sample_per_1k_population
Wrote to aggregate_polling_2020-05-01_-_2020-10-31_mean_sample_size_per_1k_population
Wrote to aggregate_polling_2020-05-01_-_2020-10-31_number_of_polls_per_1k_population
