# Technical Notebook 5 - Other Models

In [39]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import csv
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import confusion_matrix, roc_curve, precision_score, accuracy_score
import statsmodels.api as sm
%matplotlib inline

In [40]:
# Load the cleaned contributions/income table saved by technical notebook 2;
# the file's first column is used as the DataFrame index.
df = pd.read_csv('final_data/contr-income.csv', index_col=0)
# (rows, columns) of the loaded frame
df.shape

(11502, 8)

## Label encoding

In [41]:
# Placeholder only: copies the candidate-name strings into `cand_num`.
# NOTE(review): the label-encoding cell that follows overwrites this column
# with integer codes before anything reads it, so this assignment is redundant.
df['cand_num'] = df['cand_nm']

In [42]:
# Map each candidate name to an integer class label, stored in `cand_num`.
# `le` is left fitted so the name <-> code mapping stays available afterwards.
y = df['cand_nm']
le = LabelEncoder()
df['cand_num'] = le.fit_transform(y)


In [43]:
# Print class sizes for the name column and for its encoded counterpart, one
# after the other, so the two distributions can be compared by eye.
print(df['cand_nm'].value_counts(), df['cand_num'].value_counts(), sep='\n')

Buttigieg, Pete       3746
Warren, Elizabeth     3609
Sanders, Bernard      2726
Biden, Joseph R Jr    1421
Name: cand_nm, dtype: int64
1    3746
3    3609
2    2726
0    1421
Name: cand_num, dtype: int64


## Split dataset into train and test

In [44]:
# Predictor columns used by the model
x_feats = ['converted_date', 'contbr_zip', 'contb_receipt_amt']

# One-hot encode any categorical predictors (first level dropped, dummies
# stored as float); columns that are already numeric pass through unchanged.
X = pd.get_dummies(df.loc[:, x_feats], dtype=float, drop_first=True)

In [45]:
# Pairwise Pearson correlation between the predictors, displayed as the cell output
corr = X.corr(method='pearson')
corr

Unnamed: 0,converted_date,contbr_zip,contb_receipt_amt
converted_date,1.0,0.050477,-0.185717
contbr_zip,0.050477,1.0,0.003237
contb_receipt_amt,-0.185717,0.003237,1.0


In [46]:
# Response variable: the integer-encoded candidate label
y = df['cand_num']

In [47]:
# Hold out 20% of the rows for testing. Stratifying on y keeps the candidate
# proportions the same in both partitions; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=0
)

## Scale X values

In [48]:
# Fit the scaler on the training predictors only, so that no statistics from
# the held-out test rows leak into the transform.
scaler = StandardScaler().fit(X_train)

columns = X_train.columns

# transform() returns a bare NumPy array by default, which drops both the
# column names and the row index. Rebuild the DataFrame with the original
# columns AND the original index so X_train_scaled stays row-aligned with
# y_train (which still carries the shuffled index produced by the split).
scaled_train = scaler.transform(X_train)
X_train_scaled = pd.DataFrame(scaled_train,
                              columns=columns,
                              index=X_train.index
                             )