# Technical Notebook 4 - LogReg

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
import statsmodels.api as sm
%matplotlib inline

In [2]:
# Load the cleaned CSV that technical notebook 2 wrote out.
# index_col=0 makes the first CSV column the row index instead of a data column.
df = pd.read_csv('final_data/contr-income.csv', index_col=0)
df.shape

(11502, 8)

## Label encoding

In [3]:
# Encode each candidate name as an integer class label and store the codes
# in a new `cand_num` column. `le` stays in scope holding the fitted
# name-to-label mapping.
le = LabelEncoder()
df['cand_num'] = le.fit_transform(df.cand_nm)
y = df.cand_num

In [4]:
# Print class counts for the candidate names and then for their numeric
# codes, so the two listings can be matched up by count.
for col in ('cand_nm', 'cand_num'):
    print(df[col].value_counts())

Buttigieg, Pete       3746
Warren, Elizabeth     3609
Sanders, Bernard      2726
Biden, Joseph R Jr    1421
Name: cand_nm, dtype: int64
1    3746
3    3609
2    2726
0    1421
Name: cand_num, dtype: int64


## Convert zip code to dummy variables

In [5]:
# Cast zip codes to strings so they are dummy-encoded as categories later
# instead of being treated as a numeric feature.
df['contbr_zip'] = df['contbr_zip'].astype(str)
df.dtypes

cand_nm               object
contbr_zip            object
contbr_occupation     object
contb_receipt_amt    float64
occ_cat               object
converted_date         int64
zip                  float64
income               float64
cand_num               int64
dtype: object

In [6]:
# Predictor columns used by the model.
x_feats = ['converted_date', 'contbr_zip', 'contb_receipt_amt']

# One-hot encode the string-typed zip column; numeric columns pass through
# unchanged. drop_first=True drops the first zip level so it acts as the baseline.
X = pd.get_dummies(df[x_feats], drop_first=True)
X.head()

Unnamed: 0,converted_date,contb_receipt_amt,contbr_zip_20002,contbr_zip_20003,contbr_zip_20004,contbr_zip_20005,contbr_zip_20006,contbr_zip_20007,contbr_zip_20008,contbr_zip_20009,...,contbr_zip_20015,contbr_zip_20016,contbr_zip_20017,contbr_zip_20018,contbr_zip_20019,contbr_zip_20020,contbr_zip_20024,contbr_zip_20032,contbr_zip_20036,contbr_zip_20037
0,201906,100.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,201906,3.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,201906,27.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,201903,3.0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,201906,27.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
# Target: the integer-encoded candidate label.
y = df['cand_num']

## Split into train and test

In [8]:
# Hold out 20% of the rows for testing. Stratifying on y keeps the candidate
# class proportions the same in both splits; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=0,
)

## Scale and transform training data

In [10]:
# Learn per-column scaling statistics from the training rows only, then
# standardize the training features with them.
scaler = StandardScaler()
scaler.fit(X_train)
X_scaled_train = scaler.transform(X_train)
X_scaled_train

array([[-0.87874505,  5.57690606, -0.32499792, ..., -0.05005989,
        -0.15394241, -0.1674655 ],
       [ 1.16934138, -0.39667775, -0.32499792, ..., -0.05005989,
        -0.15394241, -0.1674655 ],
       [-2.41480987,  1.70911798, -0.32499792, ..., -0.05005989,
        -0.15394241, -0.1674655 ],
       ...,
       [-0.36672344, -0.43320686, -0.32499792, ..., -0.05005989,
        -0.15394241, -0.1674655 ],
       [ 1.16934138, -0.43320686, -0.32499792, ..., -0.05005989,
        -0.15394241, -0.1674655 ],
       [ 1.16934138, -0.41816547, -0.32499792, ..., -0.05005989,
        -0.15394241,  5.97137928]])

## Create classifier

In [None]:
# Per-class weights keyed by the encoded candidate label (0-3).
# NOTE(review): the rationale for these specific values is not recorded
# in this notebook - confirm where they came from.
class_weights = {0: .89, 1: .78, 2: .98, 3: .87}

# Multinomial logistic regression with L2 regularization.
# NOTE(review): `multi_class` is deprecated in recent scikit-learn
# releases - confirm against the installed version.
clf = LogisticRegression(
    penalty='l2',
    C=2,
    solver='lbfgs',
    multi_class='multinomial',
    max_iter=1000,
    class_weight=class_weights,
)

## Fit the model and score it on the training data

In [None]:
# Fit on the standardized training features and keep the training-set
# predictions; the cell's displayed value is the training accuracy.
y_hat_train = clf.fit(X_scaled_train, y_train).predict(X_scaled_train)
clf.score(X_scaled_train, y_train)

## Check confusion matrix

In [None]:
# Confusion matrix for the training predictions: rows are the true classes,
# columns are the predicted classes. Label order follows the encoded
# candidate labels 0-3 (Biden, Buttigieg, Sanders, Warren).
row_labels = ['actual Biden', 'actual Buttigieg', 'actual Sanders', 'actual Warren']
col_labels = ['Biden', 'Buttigieg', 'Sanders', 'Warren']

cm = confusion_matrix(y_train, y_hat_train)
conf_matrix = pd.DataFrame(cm, index=row_labels, columns=col_labels)
conf_matrix

In [None]:
# The classifier was fit on standardized features, so the held-out features
# must go through the same scaler (fitted on the training rows only)
# before scoring. Scoring the raw X_test would feed the model inputs on a
# different scale than it was trained on and give a meaningless accuracy.
X_scaled_test = scaler.transform(X_test)
clf.score(X_scaled_test, y_test)

## Optimize for Sanders metric
Correctly classify as many Sanders contributors as possible, i.e. maximize recall for the Sanders class.