In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import csv
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, roc_curve, precision_score, accuracy_score
%matplotlib inline

In [5]:
df = pd.read_csv('final_data/contr-income.csv', index_col=0)
df.head()

Unnamed: 0,cand_nm,contbr_zip,contbr_occupation,contb_receipt_amt,occ_cat,converted_date,zip,income
0,"Sanders, Bernard",20001,UNION REPRESENTATIVE,100.0,union representative,201906,20001.0,85976.0
1,"Sanders, Bernard",20001,UNION REPRESENTATIVE,3.0,union representative,201906,20001.0,85976.0
2,"Sanders, Bernard",20001,UNION REPRESENTATIVE,27.0,union representative,201906,20001.0,85976.0
3,"Sanders, Bernard",20007,IT,3.0,it,201903,20007.0,119267.0
4,"Sanders, Bernard",20001,SOFTWARE DEVELOPER,27.0,it,201906,20001.0,85976.0


In [6]:
# mean contribution amount by candidate
df.groupby('cand_nm').mean()['contb_receipt_amt']

cand_nm
Biden, Joseph R Jr    455.965489
Buttigieg, Pete       316.766818
Sanders, Bernard       59.188698
Warren, Elizabeth     101.616753
Name: contb_receipt_amt, dtype: float64

In [7]:
# mode zip code 
df.groupby('cand_nm').mean()['income']

cand_nm
Biden, Joseph R Jr    103579.781844
Buttigieg, Pete        96094.845969
Sanders, Bernard       91437.470653
Warren, Elizabeth      92991.747298
Name: income, dtype: float64

In [111]:
df.groupby('cand_nm').max()['occ_cat']

cand_nm
Biden, Joseph R Jr        volunteer
Buttigieg, Pete                wine
Sanders, Bernard               yoga
Warren, Elizabeth     writer/editor
Name: occ_cat, dtype: object

In [9]:
df.groupby('cand_nm').min()['converted_date']


cand_nm
Biden, Joseph R Jr    201904
Buttigieg, Pete       201901
Sanders, Bernard      201902
Warren, Elizabeth     201901
Name: converted_date, dtype: int64

In [10]:
# contribution amount by month for each candidate 
df.groupby(['cand_nm','converted_date']).mean()['contb_receipt_amt']

cand_nm             converted_date
Biden, Joseph R Jr  201904            711.465022
                    201905            447.095444
                    201906            596.462398
                    201907            197.217160
                    201908            516.234118
                    201909            251.383166
Buttigieg, Pete     201901            321.428571
                    201902            571.052632
                    201903            481.066948
                    201904            533.287420
                    201905            683.124096
                    201906            325.153974
                    201907            160.074282
                    201908             94.050600
                    201909            148.499467
Sanders, Bernard    201902            307.370727
                    201903             68.412271
                    201904             65.967037
                    201905             58.492265
                    201906        

In [11]:
df['cand_num'] = df['cand_nm']

In [12]:
df.head()
df.shape

(11502, 9)

## Label encoding

In [13]:
# encode the candidates as specific labels 
y = df.cand_nm
le = LabelEncoder()
le.fit(y)
num_y = le.transform(y)
df.cand_num = num_y

In [14]:
print(df.cand_num.value_counts())
print(df.cand_nm.value_counts())

1    3746
3    3609
2    2726
0    1421
Name: cand_num, dtype: int64
Buttigieg, Pete       3746
Warren, Elizabeth     3609
Sanders, Bernard      2726
Biden, Joseph R Jr    1421
Name: cand_nm, dtype: int64


## Split dataset into train and test

In [98]:
# X predictor variables 
x_feats = ['converted_date','income','occ_cat','contb_receipt_amt']
X = pd.get_dummies(df[x_feats],drop_first=True, dtype=float)

In [99]:
# target variable is candidate's numeric value 
y = df.cand_num

In [100]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, stratify=y, test_size=.2)

## Scale x values 

In [101]:
scaler = StandardScaler().fit(X_train)

columns = X_train.columns 

scaled_train = scaler.transform(X_train)
X_train_scaled = pd.DataFrame(scaled_train, columns = columns)

## Call logistic regression function

In [102]:
logreg = LogisticRegression(C=1e9, solver='newton-cg', max_iter=1000)
multi = LogisticRegression(C=1e9, solver='newton-cg', multi_class='multinomial', max_iter=1000)

In [103]:
model_log = logreg.fit(X_train_scaled,y_train)
multi_model = multi.fit(X_train_scaled, y_train)



In [104]:
print(logreg.score(X_train_scaled, y_train))
print(multi.score(X_train_scaled, y_train))

0.5098358874035431
0.51450929246821


In [105]:
y_hat_train = logreg.predict(X_train)

In [106]:
residuals = np.abs(y_train - y_hat_train)
print(pd.Series(residuals).value_counts())
print(pd.Series(residuals).value_counts(normalize=True))

1    2996
3    2887
2    2181
0    1137
Name: cand_num, dtype: int64
1    0.325617
3    0.313770
2    0.237039
0    0.123574
Name: cand_num, dtype: float64


In [107]:
# create confusion matrix 
cm = confusion_matrix(y_train, y_hat_train)
cm

array([[1137,    0,    0,    0],
       [2996,    0,    0,    0],
       [2181,    0,    0,    0],
       [2887,    0,    0,    0]])

In [108]:
print('intercept:', logreg.intercept_)
print('coef:', logreg.coef_)

intercept: [-3.7689325  -1.54457256 -3.04204368 -2.06197147]
coef: [[ 0.11720071  0.28782184  0.34705795 ... -0.49794072 -0.78959388
  -0.49186117]
 [-0.10257106  0.03417873  0.30798379 ...  0.53314022 -0.30128909
  -0.18717642]
 [-0.06656317 -0.05803454 -1.67994676 ... -0.18062846 -0.32823707
   0.49479682]
 [ 0.00774707 -0.0991788  -0.70795719 ... -0.18433756  0.78773326
  -0.18900606]]


In [109]:
print('precision score:',precision_score(y_train, y_hat_train, average='macro'))

precision score: 0.03089338115422237


  'precision', 'predicted', average, warn_for)


In [110]:
print('accuracy score:',accuracy_score(y_train, y_hat_train))

accuracy score: 0.12357352461688947
