In [1]:
# Load data
import pandas as pd
ssc_excel = pd.read_excel('2020ssc_public.xlsx')
print(ssc_excel.shape)

(7339, 238)


In [2]:
# Only keep columns up to primary candidate choice
ssc = ssc_excel[ssc_excel.columns[:50]]

In [3]:
# Only look at American samples
ssc = ssc[ssc['Country'].str.match('United States')]
ssc = ssc.drop('Country', axis = 1)
print(ssc.shape)

(4362, 49)


In [4]:
# Replace whitespace with NaN
import numpy as np
ssc = ssc.replace(r'^\s*$', np.nan, regex = True)

In [5]:
# Only look at candidates up to Biden
candidates = ssc['DemocraticNominee'].value_counts()[:6].index
ssc = ssc[ssc['DemocraticNominee'].isin(candidates)]
print(ssc.shape)

(3422, 49)


In [6]:
# Drop columns with lots of NAs
lots_of_nas = ssc.isna().sum().sort_values()[-11:].index
ssc = ssc.drop(lots_of_nas, axis = 1)
print(lots_of_nas, 'columns were dropped')
print(ssc.shape)

Index(['Referrals', 'SATpreparationtype', 'SATpreparationamount',
       'SATfirsttime', 'SATscoremath', 'SATscoreverbalreading', 'LifeEffects',
       'ReligiousDenomination', 'IQ', 'MeetupApproval', 'LifeEffects2'],
      dtype='object') columns were dropped
(3422, 38)


In [7]:
# Drop irrelevant columns
ssc = ssc.drop(['PreviousSurveys', 'Public', 
                'LengthofTime', 'Comment', 'Subreddit', 'Discord', 'Ads', 'Meetup', 'LWID', 'EAID'], axis = 1)
print(ssc.shape)

(3422, 28)


In [8]:
# Drop rows with NAs
ssc = ssc.dropna(axis = 0)
print(ssc.shape)

(2850, 28)


In [9]:
# Kept 80% of data without imputation
round(100*2850/3422, 2)

83.28

In [10]:
# Drop nonsense age row
print(ssc['Age'].sort_values()[-1:])
ssc = ssc.drop(417, axis = 0)
print(ssc.shape)
age = pd.to_numeric(ssc['Age'])
ssc['Age'] = age

417    4600000000000000262563516096904207430093507854...
Name: Age, dtype: object
(2849, 28)


In [11]:
ssc['DemocraticNominee'].value_counts()

Andrew Yang         808
Elizabeth Warren    621
Bernie Sanders      524
Pete Buttigieg      361
Tulsi Gabbard       310
Joe Biden           225
Name: DemocraticNominee, dtype: int64

In [12]:
# Random guess = 17%, always guess the top candidate = 28%
ssc['DemocraticNominee'].value_counts()/len(ssc)

Andrew Yang         0.283608
Elizabeth Warren    0.217971
Bernie Sanders      0.183924
Pete Buttigieg      0.126711
Tulsi Gabbard       0.108810
Joe Biden           0.078975
Name: DemocraticNominee, dtype: float64

In [13]:
# Code race as white vs. non-white since 2526/2849 samples are white
ssc['White'] = ssc['Race'] == 'White (non-Hispanic)'
ssc = ssc.drop('Race', axis = 1)

In [14]:
# Code sex as male vs. non-male
ssc['Male'] = ssc['Sex'] == 'Male'
ssc = ssc.drop('Sex', axis = 1)

In [15]:
# Code gender as cisgender vs. non-cisgender
cisgender = ssc['Gender'].value_counts()[:2].index
ssc['Cisgender'] = ssc['Gender'].isin(cisgender)
ssc = ssc.drop('Gender', axis = 1)

In [16]:
# Code sexuality as heterosexual vs. non-heterosexual
ssc['Heterosexual'] = ssc['SexualOrientation'] == 'Heterosexual'
ssc = ssc.drop('SexualOrientation', axis = 1)

In [17]:
# Code relationship style as monogamous vs. non-monogamous
ssc['Monogamous'] = ssc['Relationshipstyle'] == 'Prefer monogamous'
ssc = ssc.drop('Relationshipstyle', axis = 1)

In [18]:
# Encode relationship status as single vs. non-single
ssc['Single'] = ssc['RelationshipStatus'] == 'Single'
ssc = ssc.drop('RelationshipStatus', axis = 1)

In [19]:
# Code children as true or false
ssc['HasChildren'] = ssc['Children'] > 0
ssc = ssc.drop('Children', axis = 1)

In [20]:
# Extract student from work status
ssc['Student'] = ssc['WorkStatus'] == 'Student'
ssc = ssc.drop('WorkStatus', axis = 1)

In [21]:
# Drop profession, too many options
ssc = ssc.drop('Profession', axis = 1)

In [22]:
# Quantify degree
degree = {'Bachelor\'s': 1, 'Master\'s': 2, 'High school': 0, 'Ph D.': 3, 'JD': 3, '2 year degree': 1,
       'None': 0, 'MD': 3, 'Other professional degree': 1}
degree_quantified = []
for row in ssc['Degree']:
    degree_quantified.append(degree[row])
ssc['DegreeQuantified'] = degree_quantified
ssc = ssc.drop('Degree', axis = 1)

In [23]:
# Drop EducationComplete
ssc = ssc.drop('EducationComplete', axis = 1)

In [24]:
# Quantify religion
religion = {'Atheist and not spiritual': 0, 
            'Atheist but spiritual': 1, 
            'Agnostic': 2, 
            'Deist/Pantheist/etc.': 3,
            'Lukewarm theist': 4,
            'Committed theist': 5}
religion_quantified = []
for row in ssc['ReligiousViews']:
    religion_quantified.append(religion[row])
ssc['Religiosity'] = religion_quantified
ssc = ssc.drop('ReligiousViews', axis = 1)

In [25]:
# Code religious background
top_three = ['Christian (Protestant)', 'Christian (Catholic)', 'Jewish']
ssc['Protestant'] = ssc['ReligiousBackground'] == 'Christian (Protestant)'
ssc['Catholic'] = ssc['ReligiousBackground'] == 'Christian (Catholic)'
ssc['Jewish'] = ssc['ReligiousBackground'] == 'Jewish'
ssc['Other'] = ~ssc['ReligiousBackground'].isin(top_three)
ssc = ssc.drop(['ReligiousBackground', 'Other'], axis = 1)

In [26]:
# Drop moral views
ssc = ssc.drop('MoralViews', axis = 1)

In [27]:
# One-hot encode affiliations
affiliations = list(ssc['PoliticalAffiliation'].value_counts().index)
for x, affiliation in enumerate(affiliations):
    affiliations[x] = affiliation.split(',')[0]
ssc[affiliations] = pd.get_dummies(ssc['PoliticalAffiliation'])
ssc = ssc.drop(['PoliticalAffiliation', 'Liberal'], axis = 1)

In [28]:
# Encode parties
ssc['AmericanParties'].value_counts()
two_parties = ['Democratic Party', 'Republican Party']
ssc['Democratic Party'] = ssc['AmericanParties'] == 'Democratic Party'
ssc['Republican Party'] = ssc['AmericanParties'] == 'Republican Party'
ssc['Neither Party'] = ~ssc['AmericanParties'].isin(two_parties)
ssc = ssc.drop(['AmericanParties', 'Neither Party'], axis = 1)

In [29]:
# One-hot encode political change
political_change = ssc['PoliticalChange'].value_counts().index
ssc[political_change] = pd.get_dummies(ssc['PoliticalChange'])
ssc = ssc.drop(['PoliticalChange', 'No significant change'], axis = 1)

In [30]:
# Convert boolean to 1/0
y = ssc['DemocraticNominee']
ssc = ssc.drop('DemocraticNominee', axis = 1)
ssc = ssc.astype(int)
ssc['DemocraticNominee'] = y

In [31]:
# Rename long columns
ssc.columns = ['Age', 'PoliticalSpectrum', 'PoliticalInterest', 'GlobalWarming',
       'Immigration', 'MinimumWage', 'Feminism', 'HumanBiodiversity',
       'BasicIncome', 'DonaldTrump', 'White', 'Male', 'Cisgender',
       'Heterosexual', 'Monogamous', 'Single', 'HasChildren', 'Student',
       'DegreeQuantified', 'Religiosity', 'Protestant', 'Catholic', 'Jewish',
       'Social democratic', 'Libertarian', 'Conservative', 'Neoreactionary',
       'Alt-right', 'Marxist', 'Democratic Party', 'Republican Party',
       'Some other change', 'Further left', 'Further right', 'DemocraticNominee']

In [32]:
# Separate features from target
X = ssc.drop('DemocraticNominee', axis = 1)

In [33]:
# Examine multicollinearity
from statsmodels.stats.outliers_influence import variance_inflation_factor
vif = pd.DataFrame(columns = ['Features', 'VIF'])
vif['Features'] = X.columns
for i in range(X.shape[1]):
    vif.loc[i, 'VIF'] = variance_inflation_factor(X.values, i).round(1)
# Remove constant row
vif = vif[1:]
# Remove VIF > X
vif_remove = list(vif[vif['VIF'] > 20]['Features'])
print(vif_remove, "columns have been removed")
X = X.drop(vif_remove, axis = 1)

['PoliticalSpectrum', 'Cisgender'] columns have been removed


In [34]:
# Examine multicollinearity again
from statsmodels.stats.outliers_influence import variance_inflation_factor
vif = pd.DataFrame(columns = ['Features', 'VIF'])
vif['Features'] = X.columns
for i in range(X.shape[1]):
    vif.loc[i, 'VIF'] = variance_inflation_factor(X.values, i).round(1)
# Remove constant row
vif = vif[1:]
vif.sort_values('VIF', ascending = False)[:5]

Unnamed: 0,Features,VIF
3,Immigration,17.8
5,Feminism,16.1
1,PoliticalInterest,14.5
26,Marxist,14.5
22,Libertarian,13.8


In [35]:
# Train test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, stratify = y, random_state = 20201203)
print(X_train.shape)
print(X_test.shape)

(2279, 32)
(570, 32)


In [36]:
# Scaling
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
columns = X_train.columns
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [37]:
# Logistic regression with grid search
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
import numpy as np
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
grid = {'C': [0.01, 0.1, 1], 'penalty' :['elasticnet'], 'solver': ['saga'], 
        'l1_ratio': [0, 0.5, 1], 'class_weight': ['balanced', None]}
gridsearch = GridSearchCV(estimator = logreg, param_grid = grid, n_jobs = -1,                      
                          cv = 5, return_train_score = True, scoring = 'balanced_accuracy')
gridsearch.fit(X_train, y_train)
logreg_models = pd.DataFrame(gridsearch.cv_results_).sort_values('mean_test_score', ascending = False)
print('LR mean test score:', round(logreg_models['mean_test_score'].iloc[0], 3))
print('LR std test score:', round(logreg_models['std_test_score'].iloc[0], 3))
print('LR best parameters', logreg_models['params'].iloc[0])

LR mean test score: 0.443
LR std test score: 0.013
LR best parameters {'C': 0.1, 'class_weight': 'balanced', 'l1_ratio': 0.5, 'penalty': 'elasticnet', 'solver': 'saga'}


In [38]:
# Train with best paraeters
logreg = LogisticRegression(
 C = logreg_models['params'].iloc[0]['C'],
 class_weight = logreg_models['params'].iloc[0]['class_weight'],
 l1_ratio = logreg_models['params'].iloc[0]['l1_ratio'],
 penalty = 'elasticnet',
 solver = 'saga')
logreg.fit(X_train, y_train)

LogisticRegression(C=0.1, class_weight='balanced', l1_ratio=0.5,
                   penalty='elasticnet', solver='saga')

In [39]:
# Sort features by beta value for Andrew Yang
feature_coefs = pd.DataFrame(columns = ['Feature', 'Beta'])
feature_coefs['Feature'] = columns
coefs = logreg.coef_[0].round(2)
feature_coefs['Beta'] = coefs
feature_coefs.sort_values('Beta').iloc[[0, 1, -2, -1]]

Unnamed: 0,Feature,Beta
5,Feminism,-0.18
27,Democratic Party,-0.16
8,DonaldTrump,0.14
7,BasicIncome,0.68


In [40]:
# Sort features by beta value for Bernie Sanders
coefs = logreg.coef_[1].round(2)
feature_coefs['Beta'] = coefs
feature_coefs.sort_values('Beta').iloc[[0, 1, -2, -1]]

Unnamed: 0,Feature,Beta
0,Age,-0.2
15,Student,-0.14
26,Marxist,0.66
4,MinimumWage,0.68


In [41]:
# Sort features by beta value for Elizabeth Warren
coefs = logreg.coef_[2].round(2)
feature_coefs['Beta'] = coefs
feature_coefs.sort_values('Beta').iloc[[0, 1, -2, -1]]

Unnamed: 0,Feature,Beta
2,GlobalWarming,-0.36
8,DonaldTrump,-0.35
26,Marxist,0.26
5,Feminism,0.4


In [42]:
# Sort features by beta value for Joe Biden
coefs = logreg.coef_[3].round(2)
feature_coefs['Beta'] = coefs
feature_coefs.sort_values('Beta').iloc[[0, 1, -2, -1]]

Unnamed: 0,Feature,Beta
7,BasicIncome,-0.39
29,Some other change,-0.22
21,Social democratic,0.23
11,Heterosexual,0.35


In [43]:
# Sort features by beta value for Pete Buttigieg
coefs = logreg.coef_[4].round(2)
feature_coefs['Beta'] = coefs
feature_coefs.sort_values('Beta').iloc[[0, 1, -2, -1]]

Unnamed: 0,Feature,Beta
8,DonaldTrump,-0.28
7,BasicIncome,-0.24
3,Immigration,0.19
22,Libertarian,0.26


In [44]:
# Sort features by beta value for Tulsi Gabbard
coefs = logreg.coef_[5].round(2)
feature_coefs['Beta'] = coefs
feature_coefs.sort_values('Beta').iloc[[0, 1, -2, -1]]

Unnamed: 0,Feature,Beta
22,Libertarian,-0.45
27,Democratic Party,-0.31
0,Age,0.24
8,DonaldTrump,0.46


In [45]:
# Create confusion matrix
from sklearn.metrics import confusion_matrix
y_pred_logreg = logreg.predict(X_train)
cm = confusion_matrix(y_train, y_pred_logreg)

In [46]:
# Compute % correct per candidate
cm = pd.DataFrame(cm)
cm.columns = logreg.classes_
cm.index = logreg.classes_
for candidate in logreg.classes_:
    print(candidate, round(100*cm.loc[candidate,candidate]/cm.loc[candidate].sum(),2), '% correct')

Andrew Yang 41.33 % correct
Bernie Sanders 54.65 % correct
Elizabeth Warren 52.31 % correct
Joe Biden 40.56 % correct
Pete Buttigieg 32.53 % correct
Tulsi Gabbard 68.15 % correct


In [47]:
# Evaluate on the test set
from sklearn.metrics import balanced_accuracy_score
test_logreg = logreg.predict(X_test)
print('LR Balanced accuracy =', balanced_accuracy_score(y_test, test_logreg).round(3))

LR Balanced accuracy = 0.476
