In [257]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import csv
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, roc_curve, precision_score, accuracy_score
from sklearn.feature_selection import RFE 
%matplotlib inline

In [204]:
df = pd.read_csv('final_data/contr-income.csv', index_col=0)
df.head()

Unnamed: 0,cand_nm,contbr_zip,contbr_occupation,contb_receipt_amt,occ_cat,converted_date,zip,income
0,"Sanders, Bernard",20001,UNION REPRESENTATIVE,100.0,union representative,201906,20001.0,85976.0
1,"Sanders, Bernard",20001,UNION REPRESENTATIVE,3.0,union representative,201906,20001.0,85976.0
2,"Sanders, Bernard",20001,UNION REPRESENTATIVE,27.0,union representative,201906,20001.0,85976.0
3,"Sanders, Bernard",20007,IT,3.0,it,201903,20007.0,119267.0
4,"Sanders, Bernard",20001,SOFTWARE DEVELOPER,27.0,it,201906,20001.0,85976.0


In [205]:
# mean contribution amount by candidate
df.groupby('cand_nm').mean()['contb_receipt_amt']

cand_nm
Biden, Joseph R Jr    455.965489
Buttigieg, Pete       316.766818
Sanders, Bernard       59.188698
Warren, Elizabeth     101.616753
Name: contb_receipt_amt, dtype: float64

In [206]:
# mode zip code 
df.groupby('cand_nm').mean()['income']

cand_nm
Biden, Joseph R Jr    103579.781844
Buttigieg, Pete        96094.845969
Sanders, Bernard       91437.470653
Warren, Elizabeth      92991.747298
Name: income, dtype: float64

In [207]:
df.groupby('cand_nm').max()['occ_cat']

cand_nm
Biden, Joseph R Jr        volunteer
Buttigieg, Pete                wine
Sanders, Bernard               yoga
Warren, Elizabeth     writer/editor
Name: occ_cat, dtype: object

In [208]:
df.groupby('cand_nm').min()['converted_date']


cand_nm
Biden, Joseph R Jr    201904
Buttigieg, Pete       201901
Sanders, Bernard      201902
Warren, Elizabeth     201901
Name: converted_date, dtype: int64

In [209]:
# contribution amount by month for each candidate 
df.groupby(['cand_nm','converted_date']).mean()['contb_receipt_amt']

cand_nm             converted_date
Biden, Joseph R Jr  201904            711.465022
                    201905            447.095444
                    201906            596.462398
                    201907            197.217160
                    201908            516.234118
                    201909            251.383166
Buttigieg, Pete     201901            321.428571
                    201902            571.052632
                    201903            481.066948
                    201904            533.287420
                    201905            683.124096
                    201906            325.153974
                    201907            160.074282
                    201908             94.050600
                    201909            148.499467
Sanders, Bernard    201902            307.370727
                    201903             68.412271
                    201904             65.967037
                    201905             58.492265
                    201906        

In [210]:
df['cand_num'] = df['cand_nm']

In [211]:
df.head()
df.shape

(11502, 9)

## Label encoding

In [212]:
# encode the candidates as specific labels 
y = df.cand_nm
le = LabelEncoder()
le.fit(y)
num_y = le.transform(y)
df.cand_num = num_y

In [213]:
print(df.cand_num.value_counts())
print(df.cand_nm.value_counts())

1    3746
3    3609
2    2726
0    1421
Name: cand_num, dtype: int64
Buttigieg, Pete       3746
Warren, Elizabeth     3609
Sanders, Bernard      2726
Biden, Joseph R Jr    1421
Name: cand_nm, dtype: int64


In [214]:
# get dummies for occ_cat and converted_date? 
len(pd.get_dummies(data=df, columns=['converted_date'], drop_first=True).columns)
df_dummy_dates = pd.get_dummies(data=df, columns=['converted_date'], drop_first=True)
df_dummy_dates.head()


Unnamed: 0,cand_nm,contbr_zip,contbr_occupation,contb_receipt_amt,occ_cat,zip,income,cand_num,converted_date_201902,converted_date_201903,converted_date_201904,converted_date_201905,converted_date_201906,converted_date_201907,converted_date_201908,converted_date_201909
0,"Sanders, Bernard",20001,UNION REPRESENTATIVE,100.0,union representative,20001.0,85976.0,2,0,0,0,0,1,0,0,0
1,"Sanders, Bernard",20001,UNION REPRESENTATIVE,3.0,union representative,20001.0,85976.0,2,0,0,0,0,1,0,0,0
2,"Sanders, Bernard",20001,UNION REPRESENTATIVE,27.0,union representative,20001.0,85976.0,2,0,0,0,0,1,0,0,0
3,"Sanders, Bernard",20007,IT,3.0,it,20007.0,119267.0,2,0,1,0,0,0,0,0,0
4,"Sanders, Bernard",20001,SOFTWARE DEVELOPER,27.0,it,20001.0,85976.0,2,0,0,0,0,1,0,0,0


In [215]:
# creating dummies for occ_cat 

len(pd.get_dummies(data=df, columns=['occ_cat'], drop_first=True).columns)
df_dummy_all = pd.get_dummies(data=df, columns=['converted_date', 'occ_cat'], drop_first=True)

In [216]:
# 404 columns 
df_dummy_all.head().columns

Index(['cand_nm', 'contbr_zip', 'contbr_occupation', 'contb_receipt_amt',
       'zip', 'income', 'cand_num', 'converted_date_201902',
       'converted_date_201903', 'converted_date_201904',
       ...
       'occ_cat_ux design', 'occ_cat_val', 'occ_cat_venture capital',
       'occ_cat_veterinarian', 'occ_cat_video and film production',
       'occ_cat_volunteer', 'occ_cat_welder', 'occ_cat_wine',
       'occ_cat_writer/editor', 'occ_cat_yoga'],
      dtype='object', length=404)

## Split dataset into train and test

In [241]:
# X predictor variables 
x_feats = ['converted_date','occ_cat', 'income','contb_receipt_amt']
X = pd.get_dummies(df[x_feats],drop_first=True, dtype=float)

In [242]:
# target variable is candidate's numeric value 
y = df.cand_num

In [243]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, stratify=y, test_size=.2)


In [244]:
scaler = StandardScaler().fit(X_train)

columns = X_train.columns 

scaled_train = scaler.transform(X_train)
X_train_scaled = pd.DataFrame(scaled_train, columns = columns)

## Logistic Regression

In [264]:
logreg = LogisticRegression(C=1e9, solver='lbfgs', multi_class='auto', max_iter=1000)


In [246]:
model_log = logreg.fit(X_train_scaled,y_train)
model_log

LogisticRegression(C=1000000000.0, class_weight=None, dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=1000, multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [247]:
logreg.score(X_train_scaled, y_train)

0.51450929246821

In [248]:
y_hat_train = logreg.predict(X_train)


In [249]:
residuals = np.abs(y_train - y_hat_train)
print(pd.Series(residuals).value_counts())
print(pd.Series(residuals).value_counts(normalize=True))

1    2996
3    2887
2    2181
0    1137
Name: cand_num, dtype: int64
1    0.325617
3    0.313770
2    0.237039
0    0.123574
Name: cand_num, dtype: float64


In [250]:
cm = confusion_matrix(y_train, y_hat_train)
cm

array([[1137,    0,    0,    0],
       [2996,    0,    0,    0],
       [2181,    0,    0,    0],
       [2887,    0,    0,    0]])

In [251]:
logreg.intercept_

array([-1.63073709,  2.06605207, -1.41049341,  0.97517843])

In [252]:
logreg.coef_

array([[ 0.1196791 ,  0.2051216 ,  0.84077857, ..., -0.42629674,
        -0.51431028, -0.381101  ],
       [-0.04648365, -0.01558936,  0.71366867, ...,  1.13248606,
        -0.69411804, -0.43714939],
       [-0.06863427, -0.08496606, -1.30615112, ..., -0.31561665,
        -0.86798849,  1.23456567],
       [-0.00456119, -0.10456619, -0.24829612, ..., -0.39057267,
         2.07641681, -0.41631527]])

In [258]:
print('precision score:',precision_score(y_train, y_hat_train, average='macro'))

precision score: 0.03089338115422237


In [262]:
print('accuracy score:',accuracy_score(y_train, y_hat_train))

accuracy score: 0.12357352461688947
