# Predicting the candidate from contributor data

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
import statsmodels.api as sm
%matplotlib inline

In [2]:
# Load the cleaned contribution/census table; the first CSV column
# becomes the DataFrame index.
df = pd.read_csv(
    'clean_data/2019-fec-contr-census.csv',
    index_col=0,
)
df.head(5)

Unnamed: 0,cand_nm,contbr_zip,contb_receipt_amt,converted_date,income,target
0,"Sanders, Bernard",20001,100.0,201906,85976.0,2
1,"Sanders, Bernard",20001,3.0,201906,85976.0,2
2,"Sanders, Bernard",20001,27.0,201906,85976.0,2
3,"Sanders, Bernard",20007,3.0,201903,119267.0,2
4,"Sanders, Bernard",20001,27.0,201906,85976.0,2


In [3]:
# Response vector: the integer-coded `target` column.
y = df['target']

In [4]:
# Cast ZIP codes to strings so get_dummies one-hot encodes them as
# categories rather than passing them through as numbers.
df['contbr_zip'] = df['contbr_zip'].astype(str)

# Predictor columns: one categorical (ZIP) and two numeric.
X_feats = ['contbr_zip', 'converted_date', 'contb_receipt_amt']

# drop_first=True omits the first ZIP level, which then acts as the
# baseline category (all ZIP dummies equal to 0).
X = pd.get_dummies(df[X_feats], drop_first=True)

print(X.shape)
X.head()

(11502, 23)


Unnamed: 0,converted_date,contb_receipt_amt,contbr_zip_20002,contbr_zip_20003,contbr_zip_20004,contbr_zip_20005,contbr_zip_20006,contbr_zip_20007,contbr_zip_20008,contbr_zip_20009,...,contbr_zip_20015,contbr_zip_20016,contbr_zip_20017,contbr_zip_20018,contbr_zip_20019,contbr_zip_20020,contbr_zip_20024,contbr_zip_20032,contbr_zip_20036,contbr_zip_20037
0,201906,100.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,201906,3.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,201906,27.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,201903,3.0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,201906,27.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
# 80/20 train/test split, stratified on the target so both parts keep the
# same class proportions; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=0,
)

In [6]:
# Learn the standardization parameters from the training split only, then
# apply them to that same split.
scaler = StandardScaler().fit(X_train)
scaled_X_train = scaler.transform(X_train)

In [7]:
# Multinomial logistic regression with an L2 penalty.
#   C=2           -> inverse regularization strength
#   max_iter=1000 -> iteration cap for the lbfgs solver
# The per-class weights are hand-specified; their derivation is not shown
# in this notebook -- NOTE(review): document where these values come from.
clf = LogisticRegression(
    penalty='l2',
    C=2,
    solver='lbfgs',
    multi_class='multinomial',
    max_iter=1000,
    class_weight={0: .89, 1: .78, 2: .98, 3: .87},
)

In [8]:
# Fit on the standardized training data and keep the in-sample
# predictions.
y_hat_train = clf.fit(scaled_X_train, y_train).predict(scaled_X_train)

# Mean accuracy on the training split.
clf.score(scaled_X_train, y_train)

0.44027823062710575

In [9]:
# create confusion matrix
# Candidate names listed in target-code order. The codes 0..3 are assumed
# to map to Biden, Buttigieg, Sanders, Warren in that order -- TODO confirm
# against the step that encoded `target`.
candidates = ['Biden', 'Buttigieg', 'Sanders', 'Warren']
class_labels = list(range(len(candidates)))

# Pass `labels` explicitly so the matrix is always len(candidates) square
# and its row/column order is fixed by the list above rather than inferred
# from whichever classes happen to occur in y_train / y_hat_train.
cm = confusion_matrix(y_train, y_hat_train, labels=class_labels)

# Rows are the true classes, columns are the predicted classes.
conf_matrix = pd.DataFrame(cm,
                           index=['actual ' + name for name in candidates],
                           columns=candidates)
conf_matrix

Unnamed: 0,Biden,Buttigieg,Sanders,Warren
actual Biden,120,435,293,289
actual Buttigieg,86,1197,539,1174
actual Sanders,18,241,992,930
actual Warren,31,488,626,1742


In [10]:
# Standardize the held-out split with the already-fitted scaler (transform
# only, no refit), then report mean accuracy on it.
scaled_X_test = scaler.transform(X_test)
clf.score(X=scaled_X_test, y=y_test)

0.4211212516297262