# Technical Notebook 3 - Logistic Regression

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import csv
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression, Ridge, Lasso
from sklearn.metrics import confusion_matrix, roc_curve, precision_score, accuracy_score
import statsmodels.api as sm
%matplotlib inline

In [3]:
# read in CSV saved after cleaning in technical notebook 2 
df = pd.read_csv('final_data/contr-income.csv', 
                 index_col=0
                )
df.shape

(11502, 8)

In [4]:
# mean contribution amount by candidate
df.groupby('cand_nm').mean()['contb_receipt_amt']

cand_nm
Biden, Joseph R Jr    455.965489
Buttigieg, Pete       316.766818
Sanders, Bernard       59.188698
Warren, Elizabeth     101.616753
Name: contb_receipt_amt, dtype: float64

In [5]:
# mode zip code 
df.groupby('cand_nm').mean()['income']

cand_nm
Biden, Joseph R Jr    103579.781844
Buttigieg, Pete        96094.845969
Sanders, Bernard       91437.470653
Warren, Elizabeth      92991.747298
Name: income, dtype: float64

In [6]:
df.groupby('cand_nm').min()['converted_date']


cand_nm
Biden, Joseph R Jr    201904
Buttigieg, Pete       201901
Sanders, Bernard      201902
Warren, Elizabeth     201901
Name: converted_date, dtype: int64

In [7]:
# contribution amount by month for each candidate 
df.groupby(['cand_nm','converted_date']).mean()['contb_receipt_amt']

cand_nm             converted_date
Biden, Joseph R Jr  201904            711.465022
                    201905            447.095444
                    201906            596.462398
                    201907            197.217160
                    201908            516.234118
                    201909            251.383166
Buttigieg, Pete     201901            321.428571
                    201902            571.052632
                    201903            481.066948
                    201904            533.287420
                    201905            683.124096
                    201906            325.153974
                    201907            160.074282
                    201908             94.050600
                    201909            148.499467
Sanders, Bernard    201902            307.370727
                    201903             68.412271
                    201904             65.967037
                    201905             58.492265
                    201906        

In [8]:
df['cand_num'] = df['cand_nm']

In [9]:
df.head()
df.shape

(11502, 9)

## Label encoding

In [10]:
# encode the candidates as specific labels 
y = df.cand_nm
le = LabelEncoder()
le.fit(y)
num_y = le.transform(y)
df.cand_num = num_y

In [11]:
print(df.cand_nm.value_counts())
print(df.cand_num.value_counts())

Buttigieg, Pete       3746
Warren, Elizabeth     3609
Sanders, Bernard      2726
Biden, Joseph R Jr    1421
Name: cand_nm, dtype: int64
1    3746
3    3609
2    2726
0    1421
Name: cand_num, dtype: int64


## Split dataset into train and test

In [12]:
# X predictor variables 
x_feats = ['converted_date',
           'income',
           'occ_cat',
           'contb_receipt_amt']
X = pd.get_dummies(df[x_feats],
                   drop_first=True, 
                   dtype=float)

In [13]:
# target variable is candidate's numeric value 
y = df.cand_num

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    random_state=0, 
                                                    stratify=y, 
                                                    test_size=0.2
                                                   )

## Scale x values 

In [15]:
scaler = StandardScaler().fit(X_train)

columns = X_train.columns 

scaled_train = scaler.transform(X_train)
X_train_scaled = pd.DataFrame(scaled_train, 
                              columns = columns
                             )

In [16]:
y_train = list(y_train)

## Call logistic regression function

In [17]:
multi = LogisticRegression(C=1e9, 
                           solver='lbfgs', 
                           multi_class='multinomial', 
                           max_iter=1000, 
                           penalty='l2')

## Fit the model

In [18]:
multi_model = multi.fit(X_train_scaled, y_train)

## Validate fit

In [19]:
y_hat_train = multi.predict(X_train_scaled)

In [20]:
len(multi.coef_[0])

15

## Metrics 

In [21]:
# multinomial performs better than logreg
print(multi.score(X_train_scaled, y_train))

0.4420171720465167


In [22]:
# create confusion matrix
cm = confusion_matrix(y_train, y_hat_train)
conf_matrix = pd.DataFrame(cm, index=['actual Biden',
                                      'actual Buttigieg',
                                      'actual Sanders',
                                      'actual Warren'], 
                           columns= ['Biden', 
                                     'Buttigieg',
                                     'Sanders',
                                     'Warren'])
conf_matrix

Unnamed: 0,Biden,Buttigieg,Sanders,Warren
actual Biden,30,653,38,416
actual Buttigieg,21,1629,216,1130
actual Sanders,1,376,681,1123
actual Warren,10,804,346,1727


## Lasso and Ridge

In [None]:
rr = Ridge(alpha=100)
rr.fit(X_train_scaled, 
       y_train
      )

In [None]:
ridge_train_score = rr.score(X_train_scaled, 
                             y_train
                            )
print(ridge_train_score)

In [None]:
lasso = Lasso(alpha=.001)
lasso.fit(X_train_scaled, 
          y_train
         )
lasso_train_score = lasso.score(X_train_scaled, 
                                y_train
                               )
print(lasso_train_score)
coeff_used = np.sum(lasso.coef_!=0)
print(coeff_used)

  ## Tuning  parameters

In [None]:
# liblinear doesn't support multinomial backend (error message)
# tuned C to 2 from 1e9 originally 

multi = LogisticRegression(C=2, 
                           solver='lbfgs',
                           multi_class='multinomial', 
                           max_iter=1000, 
                           penalty='l2')

In [None]:
multi_model = multi.fit(X_train_scaled, y_train)

In [None]:
y_hat_train = multi.predict(X_train_scaled)

In [None]:
print(multi.score(X_train_scaled, y_train))

In [None]:
cm = confusion_matrix(y_train, y_hat_train)
conf_matrix = pd.DataFrame(cm, index=['actual Biden',
                                      'actual Buttigieg',
                                      'actual Sanders',
                                      'actual Warren'], 
                           columns= ['Biden', 
                                     'Buttigieg',
                                     'Sanders',
                                     'Warren'])
conf_matrix

In [None]:
# changed weights to preference sensitivity to Biden
multi = LogisticRegression(C=2, 
                           solver='lbfgs',
                           multi_class='multinomial', 
                           max_iter=1000, 
                           penalty='l2',
                           class_weight={0: 1, 1: .3, 2: .4, 3: .3})

In [None]:
# fit model 
multi_model = multi.fit(X_train_scaled, y_train)
# predit y based on X_train_scaled
y_hat_train = multi.predict(X_train_scaled)
print(multi.score(X_train_scaled, y_train))

In [None]:
# confusion matrix 
cm = confusion_matrix(y_train, y_hat_train)
conf_matrix = pd.DataFrame(cm, index=['actual Biden',
                                      'actual Buttigieg',
                                      'actual Sanders',
                                      'actual Warren'], 
                           columns= ['Biden', 
                                     'Buttigieg',
                                     'Sanders',
                                     'Warren'])
conf_matrix

## Cross Validate

In [None]:
# k = 3 
multi_scores = cross_val_score(multi_model,
                        X_train_scaled, 
                        y_train, 
                        cv=3, 
                        )
multi_scores

## Test data

In [23]:
# scale test data 
scaler = StandardScaler().fit(X_test)

columns = X_test.columns 

scaled_test = scaler.transform(X_test)
X_test_scaled = pd.DataFrame(scaled_test, 
                              columns = columns
                             )

In [24]:
y_hat_test = multi.predict(X_test_scaled)

In [25]:
# 44.763% accuracywith the first function (optimizing for accuracy)
print(multi.score(X_test_scaled, y_test))

0.44763146458061714
