# Analysis of National Democratic Primary Polling, 2019

Data taken from FiveThirtyEight's aggregation of polls for the 2020 presidential primaries; this notebook looks at the national Democratic primary polls conducted during 2019.


In [1]:
import datetime
import warnings

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, StandardScaler


In [2]:
# FiveThirtyEight's CSV export of presidential primary polls.
# The file is fetched over the network every time this cell runs.
POLLS_CSV_URL = 'https://projects.fivethirtyeight.com/polls-page/president_primary_polls.csv'
data_raw = pd.read_csv(POLLS_CSV_URL)

In [3]:
# Preview the first five rows of the raw export (one row per poll answer).
data_raw.head(n=5)

Unnamed: 0,question_id,poll_id,cycle,state,pollster_id,pollster,sponsor_ids,sponsors,display_name,pollster_rating_id,...,nationwide_batch,created_at,notes,url,stage,party,answer,candidate_id,candidate_name,pct
0,114954,63139,2020,,1102,Emerson College,43,7News,Emerson College,88.0,...,False,12/17/19 21:07,,http://emersonpolling.com/2019/12/18/december-...,primary,REP,Trump,13254,Donald Trump,90.0
1,114954,63139,2020,,1102,Emerson College,43,7News,Emerson College,88.0,...,False,12/17/19 21:07,,http://emersonpolling.com/2019/12/18/december-...,primary,REP,Weld,13351,William F. Weld,4.7
2,114954,63139,2020,,1102,Emerson College,43,7News,Emerson College,88.0,...,False,12/17/19 21:07,,http://emersonpolling.com/2019/12/18/december-...,primary,REP,Walsh,13482,Joe Walsh,5.4
3,114955,63139,2020,,1102,Emerson College,43,7News,Emerson College,88.0,...,False,12/17/19 21:07,,http://emersonpolling.com/2019/12/18/december-...,primary,DEM,Bennet,13352,Michael F. Bennet,0.3
4,114955,63139,2020,,1102,Emerson College,43,7News,Emerson College,88.0,...,False,12/17/19 21:07,,http://emersonpolling.com/2019/12/18/december-...,primary,DEM,Biden,13256,Joseph R. Biden Jr.,32.1


# Tasks

### Clean Data
(can fiddle with how to clean in different ways)

1. Build one feature per candidate holding that candidate's percentage in each poll question (`question_id`).
2. Other features can come from the remaining raw columns (everything except the candidate names and percentages), e.g. `poll_id`, pollster rating, pollster name.


### Analyze Data

## Cleaning Process

First, work on step 1 from the cell above.


In [4]:
# Drop all state polls: national polls are the rows whose `state` is missing.
# A single boolean mask replaces the original row-by-row in-place `drop`, which
# recomputed `isna()` over the whole column on every loop iteration.
# `.copy()` makes the filtered frame independent of the frame it was cut from.
is_national = data_raw['state'].isna()
data_raw = data_raw.loc[is_national].copy()

In [5]:
# Preview the first five rows that remain after the state-poll filter.
data_raw.head(n=5)

Unnamed: 0,question_id,poll_id,cycle,state,pollster_id,pollster,sponsor_ids,sponsors,display_name,pollster_rating_id,...,nationwide_batch,created_at,notes,url,stage,party,answer,candidate_id,candidate_name,pct
0,114954,63139,2020,,1102,Emerson College,43,7News,Emerson College,88.0,...,False,12/17/19 21:07,,http://emersonpolling.com/2019/12/18/december-...,primary,REP,Trump,13254,Donald Trump,90.0
1,114954,63139,2020,,1102,Emerson College,43,7News,Emerson College,88.0,...,False,12/17/19 21:07,,http://emersonpolling.com/2019/12/18/december-...,primary,REP,Weld,13351,William F. Weld,4.7
2,114954,63139,2020,,1102,Emerson College,43,7News,Emerson College,88.0,...,False,12/17/19 21:07,,http://emersonpolling.com/2019/12/18/december-...,primary,REP,Walsh,13482,Joe Walsh,5.4
3,114955,63139,2020,,1102,Emerson College,43,7News,Emerson College,88.0,...,False,12/17/19 21:07,,http://emersonpolling.com/2019/12/18/december-...,primary,DEM,Bennet,13352,Michael F. Bennet,0.3
4,114955,63139,2020,,1102,Emerson College,43,7News,Emerson College,88.0,...,False,12/17/19 21:07,,http://emersonpolling.com/2019/12/18/december-...,primary,DEM,Biden,13256,Joseph R. Biden Jr.,32.1


In [6]:
# Distinct poll questions, in order of first appearance; these become the row labels of data_clean.
question_ids = pd.unique(data_raw['question_id'])

In [None]:
# Build `data_clean`: one row per poll question (indexed by question_id) holding
# poll-level metadata plus one "<candidate>-pct" column per tracked candidate.
#
# The original filled the frame cell by cell with `data_clean.loc[q_id][col] = ...`.
# That is chained assignment (it writes to a temporary under pandas Copy-on-Write)
# and it re-filtered `data_raw` eight times per question. The frame is now
# assembled from whole columns instead.
# Side effect: columns keep the dtypes pandas inferred from the CSV rather than
# all being `object`, so `describe()` summarises the numeric columns.

# Raw columns copied as-is; `fte_grade` is exposed as `pollster_grade`.
meta_columns = ['start_date', 'end_date', 'sample_size', 'population', 'population_full',
                'pollster', 'pollster_id', 'fte_grade']
tracked_candidates = ['Buttigieg', 'Biden', 'Sanders', 'Warren']

# First raw row of every question: the same row the original `.iloc[0]` lookups read.
first_rows = data_raw.drop_duplicates(subset='question_id').set_index('question_id')
data_clean = (
    first_rows.loc[question_ids, meta_columns]
    .rename(columns={'fte_grade': 'pollster_grade'})
    .rename_axis(None)
)

for candidate in tracked_candidates:
    candidate_rows = data_raw.loc[data_raw['answer'] == candidate]
    # Column assignment aligns on question_id, so questions that did not list the
    # candidate get NaN. If a candidate appears twice in one question, the first
    # row is kept instead of raising on the duplicate label.
    data_clean[candidate + '-pct'] = (
        candidate_rows.drop_duplicates(subset='question_id')
        .set_index('question_id')['pct']
    )


In [None]:
# First five rows of the per-question table.
data_clean.head(n=5)

In [None]:
# Summary statistics of the per-question table, using pandas' default column selection.
data_clean.describe(include=None)

In [None]:
# Parse the poll start/end dates into datetime64 values.
# `astype("datetime64")` without a unit is rejected by pandas 2.x, so parse with
# `pd.to_datetime`. This is also safe to re-run on already-parsed columns.
data_clean['start_date'] = pd.to_datetime(data_clean['start_date'])
data_clean['end_date'] = pd.to_datetime(data_clean['end_date'])

In [None]:
# Check that the start dates now show a datetime dtype.
data_clean['start_date'].head(n=5)

## Data Analysis

Now that we have a clean data set, we can do some plotting and analysis

First, we show a plot of how the different candidates are doing as a function of time.
From the results, we can see that it might be useful to do some smoothing...

In [None]:
%matplotlib inline

# Raw (unsmoothed) support per candidate against poll start date.
plt.figure(figsize=(16, 10))

for candidate in ('Buttigieg', 'Biden', 'Sanders', 'Warren'):
    sns.lineplot(x='start_date', y=candidate + '-pct', data=data_clean, label=candidate)

plt.xlabel('Date')
plt.ylabel('Percent Support')
plt.ylim(0, 50)

# Relabel the y ticks as percentages. Only the tick locations are needed, so
# unpack them instead of wrapping the (locations, labels) tuple in np.array.
tick_locs, _ = plt.yticks()
plt.yticks(tick_locs, [str(int(loc)) + '%' for loc in tick_locs]);


### Smoothing

#### Polynomial Regression

In [None]:
# Keep only the questions with no missing value in any column.
# The explicit copy matters: later cells add columns to this frame, and writing
# into the un-copied `dropna` result triggers pandas' SettingWithCopyWarning.
data_clean_drop = data_clean.dropna(axis=0).copy()

In [None]:
# The regressions need a numeric x value, so store each start date as its
# proleptic-Gregorian ordinal (day number).
start_dates = pd.to_datetime(data_clean_drop['start_date'])
data_clean_drop['date_ordinal'] = start_dates.map(lambda stamp: stamp.toordinal())
data_clean_drop.loc[:, ['start_date', 'date_ordinal']].head()

In [None]:
# Smoothed view: a degree-20 polynomial trend per candidate, fitted on the
# ordinal poll start date. Uses the notebook's top-level imports
# (datetime, warnings, matplotlib).

# NumPy 2.0 moved RankWarning into numpy.exceptions; older releases only
# expose it as np.RankWarning.
try:
    from numpy.exceptions import RankWarning
except ImportError:
    RankWarning = np.RankWarning
# Silence the rank warning that a polynomial fit of this order can emit.
warnings.simplefilter('ignore', RankWarning)

sns.set_style("whitegrid")
matplotlib.rcParams.update({'font.size': 18})

deg_order = 20
# (data column, legend label, colour) for each plotted candidate.
candidate_styles = [
    ('Buttigieg-pct', 'Buttigieg', 'blue'),
    ('Biden-pct', 'Biden', 'green'),
    ('Sanders-pct', 'Sanders', 'red'),
    ('Warren-pct', 'Warren', (0.0, 1, 0.0)),
]

plt.figure(figsize=(16, 10))
for column, label, color in candidate_styles:
    sns.regplot(x='date_ordinal', y=column, data=data_clean_drop, order=deg_order,
                truncate=True, scatter_kws={'s': 15, 'alpha': 0.3},
                label=label, color=color)

# The x axis is in ordinal days, so tick positions and limits must stay in
# ordinal units; only the tick *labels* are converted to calendar dates.
# (Passing date objects as positions/limits puts them on Matplotlib's own date
# scale, which since Matplotlib 3.3 no longer coincides with ordinals.)
first_day = data_clean_drop['date_ordinal'].min()
last_day = data_clean_drop['date_ordinal'].max()
plt.xlabel('date')
plt.xlim(first_day - 1, last_day + 1)
tick_days, _ = plt.xticks()
plt.xticks(tick_days, [datetime.date.fromordinal(int(day)).isoformat() for day in tick_days])
# Set the final limits after the ticks, because setting ticks may widen the view.
plt.xlim(first_day, last_day)

plt.ylabel('Percent Support')
plt.ylim(0, 50)
tick_locs, _ = plt.yticks()
plt.yticks(tick_locs, [str(int(loc)) + '%' for loc in tick_locs])

plt.title('Democratic Primary - Candidate Support')

# The scatter points are translucent; make their legend markers opaque.
leg = plt.legend()
# `legend_handles` replaced `legendHandles` in Matplotlib 3.7; support both.
legend_handles = getattr(leg, 'legend_handles', None)
if legend_handles is None:
    legend_handles = leg.legendHandles
for handle in legend_handles:
    handle.set_alpha(1)


### Other ideas

We can use machine learning to predict each poll's bias for each candidate, and then reweight the results based on how much each poll favors a candidate.

For example, if Warren does 5 points better than average in a particular poll, a model can learn that offset, and we can then correct for it.

### Machine Learning poll bias

First, learn based on date, pollster, population, and pollster's FiveThirtyEight rating

Then can later compare to the prediction based on date alone

In [None]:
# Features that hold category labels and must be encoded as integers before modelling.
cat_attributes = ['population', 'pollster_grade']
# Full model feature set: poll date, the categorical features, then the pollster id.
attributes = ['date_ordinal', *cat_attributes, 'pollster_id']

In [None]:
# Model input: the selected feature columns with categorical ones replaced by
# integer codes. LabelEncoder comes from the notebook's top-level imports.
# Take an explicit copy so encoding writes to an independent frame rather than
# to a slice of `data_clean_drop` (which raises SettingWithCopyWarning).
data_clean_prep = data_clean_drop[attributes].copy()

# One encoder per column, so every column's code -> label mapping stays
# available afterwards via `label_encoders[col].classes_`. A single shared
# encoder only remembers the last column it was fitted on.
label_encoders = {}
for col in cat_attributes:
    label_encoders[col] = LabelEncoder()
    data_clean_prep[col] = label_encoders[col].fit_transform(data_clean_prep[col])

In [None]:
# First five rows of the encoded feature matrix.
data_clean_prep.head(n=5)

In [None]:
# Show the row labels of the cleaned frame; the train/validation split below partitions these labels.
data_clean_drop.index

In [None]:
from sklearn.model_selection import train_test_split

# Feature matrix and the row labels that identify each poll question.
X = data_clean_prep
y_index = data_clean_drop.index

# One target series per candidate.
y_butt, y_warren, y_biden, y_sanders = (
    data_clean_drop[column]
    for column in ('Buttigieg-pct', 'Warren-pct', 'Biden-pct', 'Sanders-pct')
)

# Split the row labels instead of a single target, so the same 80/20 partition
# can be applied to all four candidates' targets afterwards.
X_train, X_valid, y_train_index, y_valid_index = train_test_split(
    X, y_index, train_size=0.8, test_size=0.2, random_state=42
)


In [None]:
# Slice every candidate's target with the shared train / validation row labels.
y_train_butt, y_train_warren, y_train_biden, y_train_sanders = (
    target.loc[y_train_index] for target in (y_butt, y_warren, y_biden, y_sanders)
)

y_valid_butt, y_valid_warren, y_valid_biden, y_valid_sanders = (
    target.loc[y_valid_index] for target in (y_butt, y_warren, y_biden, y_sanders)
)



In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# One identically-seeded forest per candidate, trained on the full feature set.
forest_model_butt, forest_model_warren, forest_model_biden, forest_model_sanders = (
    RandomForestRegressor(random_state=42) for _ in range(4)
)

for model, train_target in (
    (forest_model_butt, y_train_butt),
    (forest_model_warren, y_train_warren),
    (forest_model_biden, y_train_biden),
    (forest_model_sanders, y_train_sanders),
):
    model.fit(X_train, train_target)

# Validation-set predictions for each candidate.
butt_preds, warren_preds, biden_preds, sanders_preds = (
    model.predict(X_valid)
    for model in (forest_model_butt, forest_model_warren,
                  forest_model_biden, forest_model_sanders)
)

# Mean absolute error per candidate, printed in the order Buttigieg, Warren, Biden, Sanders.
for preds, valid_target in (
    (butt_preds, y_valid_butt),
    (warren_preds, y_valid_warren),
    (biden_preds, y_valid_biden),
    (sanders_preds, y_valid_sanders),
):
    print(mean_absolute_error(preds, valid_target))

Machine learning based on date of poll alone

In [None]:
# Date-only feature matrices for the baseline models, as (n_samples, 1) arrays.
# They are derived from `data_clean_prep` and the split's row labels rather than
# from the previous `X` / `X_train` / `X_valid`, so the cell can be re-run.
# The original indexed its own output and failed on a second execution.
# Row order matches the original split because `train_test_split` permutes the
# features and the row labels together.
X = data_clean_prep['date_ordinal']
X_train = data_clean_prep.loc[y_train_index, ['date_ordinal']].to_numpy()
X_valid = data_clean_prep.loc[y_valid_index, ['date_ordinal']].to_numpy()

In [None]:
# Baseline: one identically-seeded forest per candidate, trained on poll date only.
(forest_model_butt_dates, forest_model_warren_dates,
 forest_model_biden_dates, forest_model_sanders_dates) = (
    RandomForestRegressor(random_state=42) for _ in range(4)
)

for model, train_target in (
    (forest_model_butt_dates, y_train_butt),
    (forest_model_warren_dates, y_train_warren),
    (forest_model_biden_dates, y_train_biden),
    (forest_model_sanders_dates, y_train_sanders),
):
    model.fit(X_train, train_target)

# Validation-set predictions for each candidate.
butt_preds_dates, warren_preds_dates, biden_preds_dates, sanders_preds_dates = (
    model.predict(X_valid)
    for model in (forest_model_butt_dates, forest_model_warren_dates,
                  forest_model_biden_dates, forest_model_sanders_dates)
)

# Mean absolute error per candidate, printed in the order Buttigieg, Warren, Biden, Sanders.
for preds, valid_target in (
    (butt_preds_dates, y_valid_butt),
    (warren_preds_dates, y_valid_warren),
    (biden_preds_dates, y_valid_biden),
    (sanders_preds_dates, y_valid_sanders),
):
    print(mean_absolute_error(preds, valid_target))

Next, after refitting both models on the full dataset, we subtract the date-only predictions from the full-feature predictions.

We can then average that difference for a particular pollster, a particular population sample, etc., to estimate the bias factor.

In [None]:
# Point X back at the full encoded feature matrix (an earlier cell rebinds X to the date column only).
X = data_clean_prep

In [None]:
# Refit the full-feature forests on every cleaned poll, then predict those same rows.
for model, target in (
    (forest_model_butt, y_butt),
    (forest_model_warren, y_warren),
    (forest_model_biden, y_biden),
    (forest_model_sanders, y_sanders),
):
    model.fit(X, target)

butt_preds, warren_preds, biden_preds, sanders_preds = (
    model.predict(X)
    for model in (forest_model_butt, forest_model_warren,
                  forest_model_biden, forest_model_sanders)
)

In [None]:
# Date-only design matrix with a single column, shape (n_samples, 1).
X_dates = X[['date_ordinal']].to_numpy()

# Refit the date-only forests on every cleaned poll, then predict those same rows.
for model, target in (
    (forest_model_butt_dates, y_butt),
    (forest_model_warren_dates, y_warren),
    (forest_model_biden_dates, y_biden),
    (forest_model_sanders_dates, y_sanders),
):
    model.fit(X_dates, target)

butt_preds_dates, warren_preds_dates, biden_preds_dates, sanders_preds_dates = (
    model.predict(X_dates)
    for model in (forest_model_butt_dates, forest_model_warren_dates,
                  forest_model_biden_dates, forest_model_sanders_dates)
)

In [None]:
# Per-poll difference: full-feature prediction minus date-only prediction.
for diff_column, full_preds, date_preds in (
    ('prediction_diff-Buttigieg', butt_preds, butt_preds_dates),
    ('prediction_diff-Warren', warren_preds, warren_preds_dates),
    ('prediction_diff-Biden', biden_preds, biden_preds_dates),
    ('prediction_diff-Sanders', sanders_preds, sanders_preds_dates),
):
    data_clean_drop[diff_column] = full_preds - date_preds


### "House Effect" Adjustments

In [None]:
# Report the mean prediction difference per candidate for every pollster that
# contributed more than five polls.
bias_report_columns = (
    ('Buttigieg bias: ', 'prediction_diff-Buttigieg'),
    ('Warren bias: ', 'prediction_diff-Warren'),
    ('Biden bias: ', 'prediction_diff-Biden'),
    ('Sanders bias: ', 'prediction_diff-Sanders'),
)

for pollster in data_clean_drop.pollster.unique():
    data_pollster = data_clean_drop[data_clean_drop['pollster'] == pollster]
    poll_count = data_pollster.pollster.count()
    if not poll_count > 5:
        continue
    print(pollster + ' (poll count: ' + str(poll_count) + ')')
    for label, diff_column in bias_report_columns:
        print('\t' + label + '%0.2f' % data_pollster[diff_column].mean())


In [None]:
# Seed each per-pollster bias column with the row's own index label; the next
# cell swaps these placeholder labels for the pollster's mean bias.
for bias_column in ('pollster_Buttigieg_bias', 'pollster_Warren_bias',
                    'pollster_Biden_bias', 'pollster_Sanders_bias'):
    data_clean_drop[bias_column] = data_clean_drop.index


In [None]:
# House effect per poll: the mean prediction difference over all polls by the
# same pollster, broadcast back onto every row of that pollster.
# The original built its row mask from `data_clean` (a different, longer index
# than `data_clean_drop`) and then used `replace` to swap question ids for the
# mean, one pollster at a time. A grouped transform does the same in one step
# and cannot confuse a question id with an already-written mean.
for candidate in ('Buttigieg', 'Warren', 'Biden', 'Sanders'):
    data_clean_drop['pollster_' + candidate + '_bias'] = (
        data_clean_drop
        .groupby('pollster')['prediction_diff-' + candidate]
        .transform('mean')
    )



In [None]:
# Spot-check the first five per-poll Buttigieg bias values.
data_clean_drop['pollster_Buttigieg_bias'].head(n=5)

In [None]:
# Adjusted support = reported percentage minus the pollster's bias for that candidate.
for adj_column, pct_column, bias_column in (
    ('Buttigieg_adj', 'Buttigieg-pct', 'pollster_Buttigieg_bias'),
    ('Warren_adj', 'Warren-pct', 'pollster_Warren_bias'),
    ('Biden_adj', 'Biden-pct', 'pollster_Biden_bias'),
    ('Sanders_adj', 'Sanders-pct', 'pollster_Sanders_bias'),
):
    data_clean_drop[adj_column] = data_clean_drop[pct_column] - data_clean_drop[bias_column]

In [None]:
%matplotlib inline
# Degree-20 polynomial trend per candidate on the house-effect-adjusted support.
# Uses `datetime` from the notebook's top-level imports.
deg_order = 20
# (data column, legend label, colour) for each plotted candidate.
candidate_styles = [
    ('Buttigieg_adj', 'Buttigieg', 'blue'),
    ('Biden_adj', 'Biden', 'green'),
    ('Sanders_adj', 'Sanders', 'red'),
    ('Warren_adj', 'Warren', (0.0, 1, 0.0)),
]

plt.figure(figsize=(16, 10))
for column, label, color in candidate_styles:
    sns.regplot(x='date_ordinal', y=column, data=data_clean_drop, order=deg_order,
                truncate=True, scatter_kws={'s': 15, 'alpha': 0.3},
                label=label, color=color)

# The x axis is in ordinal days, so tick positions and limits must stay in
# ordinal units; only the tick *labels* are converted to calendar dates.
# (Passing date objects as positions/limits puts them on Matplotlib's own date
# scale, which since Matplotlib 3.3 no longer coincides with ordinals.)
first_day = data_clean_drop['date_ordinal'].min()
last_day = data_clean_drop['date_ordinal'].max()
plt.xlabel('date')
plt.xlim(first_day - 1, last_day + 1)
tick_days, _ = plt.xticks()
plt.xticks(tick_days, [datetime.date.fromordinal(int(day)).isoformat() for day in tick_days])
# Set the final limits after the ticks, because setting ticks may widen the view.
plt.xlim(first_day, last_day)

plt.ylabel('Percent Support')
plt.ylim(0, 50)
tick_locs, _ = plt.yticks()
plt.yticks(tick_locs, [str(int(loc)) + '%' for loc in tick_locs])

plt.title('Democratic Primary - Candidate Support with Adjustments')

# The scatter points are translucent; make their legend markers opaque.
leg = plt.legend()
# `legend_handles` replaced `legendHandles` in Matplotlib 3.7; support both.
legend_handles = getattr(leg, 'legend_handles', None)
if legend_handles is None:
    legend_handles = leg.legendHandles
for handle in legend_handles:
    handle.set_alpha(1)

In [None]:
%matplotlib inline
# Unsmoothed line plot of the house-effect-adjusted support per candidate.
# Uses `datetime` from the notebook's top-level imports. The unused
# `deg_order` assignment from the original cell is dropped.
# (data column, legend label, colour) for each plotted candidate.
candidate_styles = [
    ('Buttigieg_adj', 'Buttigieg', 'blue'),
    ('Biden_adj', 'Biden', 'green'),
    ('Sanders_adj', 'Sanders', 'red'),
    ('Warren_adj', 'Warren', (0.0, 1, 0.0)),
]

plt.figure(figsize=(16, 10))
for column, label, color in candidate_styles:
    sns.lineplot(x='date_ordinal', y=column, data=data_clean_drop,
                 label=label, color=color)

# The x axis is in ordinal days, so tick positions and limits must stay in
# ordinal units; only the tick *labels* are converted to calendar dates.
# (Passing date objects as positions/limits puts them on Matplotlib's own date
# scale, which since Matplotlib 3.3 no longer coincides with ordinals.)
first_day = data_clean_drop['date_ordinal'].min()
last_day = data_clean_drop['date_ordinal'].max()
plt.xlabel('date')
plt.xlim(first_day - 1, last_day + 1)
tick_days, _ = plt.xticks()
plt.xticks(tick_days, [datetime.date.fromordinal(int(day)).isoformat() for day in tick_days])
# Set the final limits after the ticks, because setting ticks may widen the view.
plt.xlim(first_day, last_day)

plt.ylabel('Percent Support')
plt.ylim(0, 50)
tick_locs, _ = plt.yticks()
plt.yticks(tick_locs, [str(int(loc)) + '%' for loc in tick_locs])

plt.title('Democratic Primary - Candidate Support with Adjustments')

# Force fully opaque legend entries.
leg = plt.legend()
# `legend_handles` replaced `legendHandles` in Matplotlib 3.7; support both.
legend_handles = getattr(leg, 'legend_handles', None)
if legend_handles is None:
    legend_handles = leg.legendHandles
for handle in legend_handles:
    handle.set_alpha(1)