# Technical Notebook 4 - LogReg

In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import csv
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, roc_curve, precision_score, accuracy_score
import statsmodels.api as sm
%matplotlib inline

In [3]:
# read in CSV saved after cleaning in technical notebook 2 
df = pd.read_csv('final_data/contr-income.csv', 
                 index_col=0
                )
df.shape

(11502, 8)

## Label encoding

In [10]:
# encode the candidates as specific labels 
y = df.cand_nm
le = LabelEncoder()
le.fit(y)
num_y = le.transform(y)
df.cand_num = num_y

In [11]:
print(df.cand_nm.value_counts())
print(df.cand_num.value_counts())

Buttigieg, Pete       3746
Warren, Elizabeth     3609
Sanders, Bernard      2726
Biden, Joseph R Jr    1421
Name: cand_nm, dtype: int64
1    3746
3    3609
2    2726
0    1421
Name: cand_num, dtype: int64


## Split dataset into train and test

In [6]:
# X predictor variables 
x_feats = ['converted_date',
           'income',
           'contb_receipt_amt']
X = pd.get_dummies(df[x_feats],
                   drop_first=True, 
                   dtype=float)

In [7]:
# target variable is candidate's numeric value 
y = df.cand_num

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    random_state=0, 
                                                    stratify=y, 
                                                    test_size=0.2
                                                   )

## Scale x values 

In [9]:
scaler = StandardScaler().fit(X_train)

columns = X_train.columns 

scaled_train = scaler.transform(X_train)
X_train_scaled = pd.DataFrame(scaled_train, 
                              columns = columns
                             )

## Call logistic regression function

In [12]:
multi = LogisticRegression(C=1e9, 
                           solver='lbfgs', 
                           multi_class='multinomial', 
                           max_iter=1000, 
                           penalty='l2')

## Fit the model

In [25]:
multi_model = multi.fit(X_train_scaled, y_train)

## Validate fit

In [26]:
y_hat_train = multi.predict(X_train_scaled)

## Metrics 

In [27]:
# multinomial performs better than logreg
print(multi.score(X_train_scaled, y_train))

0.4040865123356157


In [28]:
# create confusion matrix
cm = confusion_matrix(y_train, y_hat_train)
conf_matrix = pd.DataFrame(cm, index=['actual Biden',
                                      'actual Buttigieg',
                                      'actual Sanders',
                                      'actual Warren'], 
                           columns= ['Biden', 
                                     'Buttigieg',
                                     'Sanders',
                                     'Warren'])
conf_matrix

Unnamed: 0,Biden,Buttigieg,Sanders,Warren
actual Biden,14,492,2,629
actual Buttigieg,14,1237,3,1742
actual Sanders,1,195,97,1888
actual Warren,2,474,41,2370


  ## Tuning  parameters

In [56]:
# tuned C to 2 from 1e9 originally 

multi = LogisticRegression(C=2, 
                           solver='lbfgs',
                           multi_class='multinomial', 
                           max_iter=2000, 
                           penalty='l2')

In [57]:
multi_model = multi.fit(X_train_scaled, y_train)

In [58]:
y_hat_train = multi.predict(X_train_scaled)

In [59]:
print(multi.score(X_train_scaled, y_train))

0.4040865123356157


In [60]:
cm = confusion_matrix(y_train, y_hat_train)
conf_matrix = pd.DataFrame(cm, index=['actual Biden',
                                      'actual Buttigieg',
                                      'actual Sanders',
                                      'actual Warren'], 
                           columns= ['Biden', 
                                     'Buttigieg',
                                     'Sanders',
                                     'Warren'])
conf_matrix

Unnamed: 0,Biden,Buttigieg,Sanders,Warren
actual Biden,14,492,2,629
actual Buttigieg,14,1237,3,1742
actual Sanders,1,195,97,1888
actual Warren,2,474,41,2370


In [61]:
# changed weights to preference sensitivity to Biden
multi = LogisticRegression(C=2, 
                           solver='lbfgs',
                           multi_class='multinomial', 
                           max_iter=1000, 
                           penalty='l2',
                           class_weight={0: 1, 1: .3, 2: .4, 3: .3})

In [62]:
# confusion matrix 
cm = confusion_matrix(y_train, y_hat_train)
conf_matrix = pd.DataFrame(cm, index=['actual Biden',
                                      'actual Buttigieg',
                                      'actual Sanders',
                                      'actual Warren'], 
                           columns= ['Biden', 
                                     'Buttigieg',
                                     'Sanders',
                                     'Warren'])
conf_matrix

Unnamed: 0,Biden,Buttigieg,Sanders,Warren
actual Biden,14,492,2,629
actual Buttigieg,14,1237,3,1742
actual Sanders,1,195,97,1888
actual Warren,2,474,41,2370


## Cross Validate

In [63]:
# k = 3 
multi_scores = cross_val_score(multi_model,
                        X_train_scaled, 
                        y_train, 
                        cv=3, 
                        )
multi_scores

array([0.40156454, 0.39713075, 0.39693412])