In [1]:
import pandas as pd
from collections import Counter, defaultdict
from us import states
from utils import get_names2abbrs_dict

In [2]:
# Load the raw presidential poll rows that the cells below filter and summarise.
# Alternative input path, left commented out:
# pres_df = pd.read_csv(open('poll_data/president_polls.csv'))
pres_df = pd.read_csv('president_polls_new.csv')

In [3]:
def clean_pres_polls(in_df):
    """Keep only the rows of 2020 general-election Biden-vs-Trump poll questions.

    A (poll_id, question_id) pair is kept when, after restricting to
    ``cycle == 2020`` and ``stage == 'general'``, it has exactly two rows whose
    candidate names are 'Joseph R. Biden Jr.' and 'Donald Trump'.

    Parameters
    ----------
    in_df : pandas.DataFrame
        Poll rows. Must contain 'cycle', 'stage' and every column named in
        ``column_list`` below. The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        The kept rows, restricted to ``column_list`` plus a new 'state_po'
        column. 'state_po' is 'NAT' when 'state' is missing, otherwise the
        value looked up in ``get_names2abbrs_dict()`` (presumably state name ->
        postal abbreviation; confirm against ``utils``). Original row labels
        are preserved. An empty frame with the same columns is returned when
        nothing qualifies.

    Raises
    ------
    KeyError
        If a required column is missing, or a state name is absent from the
        lookup dictionary.
    AssertionError
        If the two rows of a kept question disagree on 'state'.
    """
    column_list = ['question_id', 'poll_id', 'state', 'fte_grade', 'sample_size',
                   'population', 'population_full', 'methodology', 'start_date',
                   'end_date', 'internal', 'partisan', 'tracking', 'created_at',
                   'url', 'answer', 'candidate_name', 'candidate_party', 'pct']
    biden_vs_trump = {'Joseph R. Biden Jr.', 'Donald Trump'}
    names2abbrs = get_names2abbrs_dict()

    # The cycle/stage filter does not depend on the poll or question, so apply
    # it once; selecting column_list here also surfaces missing columns early.
    is_2020_general = (in_df['cycle'] == 2020) & (in_df['stage'] == 'general')
    eligible = in_df.loc[is_2020_general, column_list]

    polls = []
    # One pass over the frame instead of re-masking it for every pair.
    # sort=False keeps groups in first-appearance order.
    for _, group in eligible.groupby(['poll_id', 'question_id'], sort=False):
        if len(group) != 2 or set(group['candidate_name']) != biden_vs_trump:
            continue
        rows = group.copy()  # own copy, so adding a column cannot touch the input
        # Both rows must agree on the state; NaN counts as a single value.
        assert rows['state'].nunique(dropna=False) == 1
        state = rows['state'].iloc[0]
        rows['state_po'] = 'NAT' if pd.isna(state) else names2abbrs[str(state)]
        polls.append(rows)

    if not polls:
        # pd.concat raises on an empty list; return an empty, well-shaped frame.
        return pd.DataFrame(columns=column_list + ['state_po'])
    return pd.concat(polls)

In [4]:
def get_pres_poll_d_prob(in_df):
    """Compute the two-party Democratic share for each Biden-vs-Trump poll question.

    For every (poll_id, question_id) pair, the rows with ``cycle == 2020`` and
    ``stage == 'general'`` are selected. The pair is used only when that
    selection is exactly two rows naming 'Joseph R. Biden Jr.' and
    'Donald Trump'.

    Parameters
    ----------
    in_df : pandas.DataFrame
        Poll rows with the columns 'poll_id', 'question_id', 'cycle', 'stage',
        'candidate_name', 'candidate_party', 'state', 'end_date',
        'sample_size', 'population' and 'pct'. 'end_date' is split on '/' as
        month/day/two-digit-year.

    Returns
    -------
    pandas.DataFrame
        One row per accepted question with the columns 'date' (yyyy-mm-dd
        string), 'poll_id', 'question_id', 'state', 'state_po', 'sample_size',
        'population' and 'd_prob' (DEM pct / (DEM pct + REP pct)), sorted by
        'date' with a fresh 0..n-1 index. 'state' is the string 'nan' and
        'state_po' is 'NAT' when the poll has no state.

    Raises
    ------
    AssertionError
        If an accepted question has more than one distinct state or end date,
        or does not have exactly one DEM row and one REP row.
    KeyError
        If a state name is missing from ``get_names2abbrs_dict()``.
    """
    polls = in_df.copy()
    abbr_by_name = get_names2abbrs_dict()
    head_to_head = {'Joseph R. Biden Jr.', 'Donald Trump'}
    out_columns = ['date', 'poll_id', 'question_id', 'state', 'state_po',
                   'sample_size', 'population', 'd_prob']

    records = []
    # dict.fromkeys gives the distinct ids in first-appearance order.
    for p_id in dict.fromkeys(polls['poll_id']):
        question_ids = polls.loc[polls['poll_id'] == p_id, 'question_id']
        for q_id in dict.fromkeys(question_ids):
            selector = (
                (polls['poll_id'] == p_id)
                & (polls['question_id'] == q_id)
                & (polls['cycle'] == 2020)
                & (polls['stage'] == 'general')
            )
            rows = polls[selector]
            if len(rows) != 2 or set(rows['candidate_name']) != head_to_head:
                continue

            distinct_states = list(set(rows['state']))
            assert len(distinct_states) == 1
            state_str = str(distinct_states[0])
            # A missing state stringifies to 'nan'; those polls are tagged 'NAT'.
            state_po = 'NAT' if state_str == 'nan' else abbr_by_name[state_str]

            assert len(set(rows['end_date'])) == 1
            month, day, year = rows['end_date'].iloc[0].split('/')
            # Zero-padded yyyy-mm-dd so the strings sort chronologically.
            date = f'20{year}-{month.zfill(2)}-{day.zfill(2)}'

            sample_size = rows['sample_size'].iloc[0].item()
            population = rows['population'].iloc[0]

            dem = rows[rows['candidate_party'] == 'DEM']
            rep = rows[rows['candidate_party'] == 'REP']
            assert len(dem) == 1 and len(rep) == 1
            d_pct = dem['pct'].item()
            r_pct = rep['pct'].item()

            records.append([date, p_id, q_id, state_str, state_po,
                            sample_size, population, d_pct / (d_pct + r_pct)])

    summary = pd.DataFrame(records, columns=out_columns)
    return summary.sort_values(by='date').reset_index(drop=True)

In [5]:
# Reduce the raw poll rows to one record per Biden-vs-Trump question,
# carrying its two-party Democratic share in the 'd_prob' column.
d_probs = get_pres_poll_d_prob(pres_df)

In [9]:
# Keep only the questions that report a sample size, renumber the rows, then order by date.
has_sample_size = d_probs['sample_size'].notna()
d_probs = (
    d_probs[has_sample_size]
    .reset_index(drop=True)
    .sort_values(by='date')
)

In [34]:
# Scratch listing of column names: the same names as `column_list` in
# `clean_pres_polls`, plus 'race_id'. These are bare expressions that are never
# assigned, so the cell has no effect beyond echoing its last line.
'question_id', 'poll_id', 'state', 'fte_grade', 'sample_size', 
'population', 'population_full', 'methodology', 'start_date', 
'end_date', 'internal', 'partisan', 'tracking', 'created_at', 
'url', 'race_id', 'answer', 'candidate_name', 'candidate_party', 
'pct'

'pct'

In [11]:
# Write the per-question Democratic shares to CSV; index=False leaves the row index out of the file.
d_probs.to_csv('./poll_data/all_pres_polls_d_probs.csv', index=False)