In [2]:
import pandas as pd
import numpy as np
import plotly.express as px
from collections import defaultdict
from math import exp
from utils import get_names2abbrs_dict, get_samples

In [3]:
# Polling window (YYYY-MM-DD strings): passed to get_samples and echoed in figure titles.
start_date, end_date = '2020-05-01', '2020-10-31'

In [4]:
# Per-state battleground labels; only the state code and label columns are kept.
battleground_df = pd.read_csv(
    'label_data/2016_pres_labels_battleground.csv'
).loc[:, ['state_po', 'battleground']]

In [5]:
# Lookup table: state code -> battleground label.
battleground_states = battleground_df.set_index('state_po')['battleground'].to_dict()

In [6]:
# Poll data table; handed to get_samples together with the date window to build state_polls.
d_probs = pd.read_csv('poll_data/all_pres_polls_d_probs.csv')

In [7]:
# Per-state poll samples for the configured window
# (presumably get_samples filters on start_date/end_date -- confirm in utils).
state_polls = get_samples(d_probs, start_date, end_date)
# sorted() already returns a new list, so no list() wrapper is needed.
all_state_pos = sorted(state_polls)

In [8]:
state_populations = pd.read_csv('population_data/1900-2019-all-states-populations.csv')
# Keep only the 2019 rows and map each state code to its population figure.
current_populations = (
    state_populations
    .loc[state_populations['year'] == 2019]
    .set_index('state_po')['population']
    .to_dict()
)

In [9]:
def apply_poll_func(poll_func):
    """Evaluate ``poll_func`` for every polled state and tabulate the results.

    Parameters
    ----------
    poll_func : callable
        Called once per entry of the module-level ``all_state_pos`` with the
        state code as its only argument.

    Returns
    -------
    pandas.DataFrame
        Columns ``state_po``, ``samples`` (the value returned by
        ``poll_func``) and ``battleground`` (looked up in the module-level
        ``battleground_states``), ordered by ``battleground`` then
        ``state_po``.

    Raises
    ------
    KeyError
        If a state code has no entry in ``battleground_states``.
    """
    rows = []
    for state_po in all_state_pos:
        rows.append((state_po, poll_func(state_po), battleground_states[state_po]))
    table = pd.DataFrame(rows, columns=['state_po', 'samples', 'battleground'])
    return table.sort_values(by=['battleground', 'state_po'])

In [10]:
def _per_1k_population(poll_func):
    """Wrap ``poll_func`` so its per-state value is expressed per 1,000 of the
    state's population as recorded in ``current_populations``."""
    return lambda po: 1000 * poll_func(po) / current_populations[po]


# Base aggregations over the values stored in state_polls for one state code.
_base_poll_funcs = {
    'sum': lambda po: sum(state_polls[po]),
    'mean': lambda po: np.mean(state_polls[po]),
    'number': lambda po: len(state_polls[po]),
}
# Axis labels for the base aggregations (plain strings: there is nothing to interpolate).
_base_y_axes = {
    'sum': 'Cumulative Sample',
    'mean': 'Mean Sample Size',
    'number': 'Number of Polls',
}

# The '<key>_pop_adj' entries are derived from the base entries so that the
# functions and their labels cannot drift apart. Key order is unchanged:
# the three base keys first, then their population-adjusted counterparts.
poll_funcs = {
    **_base_poll_funcs,
    **{f'{key}_pop_adj': _per_1k_population(func)
       for key, func in _base_poll_funcs.items()},
}
y_axes = {
    **_base_y_axes,
    **{f'{key}_pop_adj': f'{label} Per 1k Population'
       for key, label in _base_y_axes.items()},
}

In [14]:
# For every aggregation: draw a per-state bar chart, save the figure and the
# underlying data, and print a LaTeX table of the per-battleground-group averages.
prefix = f'Aggregate Polling ({start_date} - {end_date}) '
for name, func in poll_funcs.items():
    title = prefix + y_axes[name]
    # File stem: lower-cased title, parentheses dropped, whitespace runs joined by '_'.
    filename = '_'.join(title.lower().replace('(', '').replace(')', '').split())
    df = apply_poll_func(func)

    fig = px.bar(df, x='state_po', y='samples', color='battleground',
                 labels={'samples': y_axes[name], 'state_po': 'State'},
                 title=title)
    fig.update_layout(autosize=False, width=1000, height=500)
    fig.write_image('figures/' + filename + '.png')
    df.to_csv(f'figures/data/{filename}.csv', index=False)

    # Average only the numeric 'samples' column. Leaving the string column
    # state_po in the frame relies on pandas silently dropping it (deprecated)
    # and raises on versions where that behaviour was removed.
    group_means = df.groupby('battleground', as_index=False)['samples'].mean()
    # Raw string: '\_' is not a valid escape sequence in a normal string literal.
    latex_label = 'table:' + filename.replace('_', r'\_')
    print(group_means.to_latex(index=False, caption=title, label=latex_label))
    print(f'Wrote to {filename}')

\begin{table}
\centering
\caption{Aggregate Polling (2020-05-01 - 2020-10-31) Cumulative Sample}
\label{table:aggregate\_polling\_2020-05-01\_-\_2020-10-31\_cumulative\_sample}
\begin{tabular}{lr}
\toprule
 battleground &        samples \\
\midrule
        False &  156638.444444 \\
         True &  269452.800000 \\
\bottomrule
\end{tabular}
\end{table}

Wrote to aggregate_polling_2020-05-01_-_2020-10-31_cumulative_sample
\begin{table}
\centering
\caption{Aggregate Polling (2020-05-01 - 2020-10-31) Mean Sample Size}
\label{table:aggregate\_polling\_2020-05-01\_-\_2020-10-31\_mean\_sample\_size}
\begin{tabular}{lr}
\toprule
 battleground &      samples \\
\midrule
        False &  2835.778209 \\
         True &  2659.970856 \\
\bottomrule
\end{tabular}
\end{table}

Wrote to aggregate_polling_2020-05-01_-_2020-10-31_mean_sample_size
\begin{table}
\centering
\caption{Aggregate Polling (2020-05-01 - 2020-10-31) Number of Polls}
\label{table:aggregate\_polling\_2020-05-01\_-\_2020-10-31\_num