# Models Predicting Candidate Support

The models below predict strong or leaning support for the Republican candidate in the initial wave.

### Data Wrangling

In [None]:
import pandas as pd
from sklearn.metrics import classification_report
from sklearn import tree
from sklearn.linear_model import LogisticRegression

#load the indicator file, keyed by voter id
source_csv = 'FX_indicators_2020.csv'
data = pd.read_csv(source_csv, index_col='VOTER_ID')

#preview the first five records
data.head()

Unnamed: 0_level_0,SET_NO,OPP_SEX,AGE,HH_ND,HH_NR,HH_NI,MED_AGE,MED_AGE_M,MED_AGE_F,NH_WHITE,...,MOVED_RDMA,MOVED_DRMA,MOVED_AWMA,MOVED_ADMA,MOVED_ARMA,MOVED_RDMB,MOVED_DRMB,MOVED_AWMB,MOVED_ADMB,MOVED_ARMB
VOTER_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
84508,3,0,25,4,0,0,38,39,38,76,...,,,,,,,,,,
608312,1,0,35,0,0,3,46,46,46,88,...,,,,,,,,,,
222821,3,0,73,3,0,0,42,36,48,45,...,,,,,,,,,,
137882,2,0,54,1,1,0,37,34,51,61,...,,,,,,,,,,
531303,2,0,51,0,3,0,46,46,46,88,...,,,,,,,,,,


In [None]:
#create a list of dependent variables
dv_list = ['CAND1S','CAND2S','MESSAGE','VG_14_DV','D2','R2','D3','R3','I3','CAND1_UND','CAND1_SD2','CAND1_SDA','CAND1_LD2',
           'CAND1_LDA', 'CAND1_SR2','CAND1_SRA','CAND1_LRA','CAND1_LR2','CAND1_SFT','CAND2_UND','CAND2_SD2','CAND2_SDA',
           'CAND2_LD2','CAND2_LDA', 'CAND2_SR2','CAND2_SRA','CAND2_LRA','CAND2_LR2','CAND2_SFT','MOVED_RD','MOVED_DR',
           'MOVED_AW','MOVED_U','MOVED_AD', 'MOVED_AR','MOVED_RDMA','MOVED_DRMA','MOVED_AWMA','MOVED_ADMA','MOVED_ARMA',
           'MOVED_RDMB','MOVED_DRMB','MOVED_AWMB','MOVED_ADMB','MOVED_ARMB']

#identify the dependent variable for this model
dv = 'CAND1_LR2'

#fail loudly if the chosen dependent variable is not one of the known ones
#(list.remove() used to raise the same error type in this situation)
if dv not in dv_list:
    raise ValueError('{!r} is not in dv_list'.format(dv))

#create the drop list as a NEW list holding every dependent variable except dv;
#a plain `fields_to_drop = dv_list` only aliases the list, so removing dv from it
#would silently remove dv from dv_list as well
fields_to_drop = [field for field in dv_list if field != dv]

#remove all unneeded fields (every other dependent variable) so they cannot be
#used as predictors; drop() returns a new frame that is bound back to `data`
data = data.drop(columns=fields_to_drop)

data.head(5)

Unnamed: 0_level_0,SET_NO,OPP_SEX,AGE,HH_ND,HH_NR,HH_NI,MED_AGE,MED_AGE_M,MED_AGE_F,NH_WHITE,...,DOITYOURSE,FINANCIALM,RELIGIOUSC,POLITICALC,MEDIANEDUC,PRS16_PD,PRS16_PR,MSG_A,MSG_B,CAND2_LRA
VOTER_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
84508,3,0,25,4,0,0,38,39,38,76,...,0,0,0,0,12,48.8,49.6,0,0,
608312,1,0,35,0,0,3,46,46,46,88,...,0,0,1,1,16,48.4,49.9,0,0,
222821,3,0,73,3,0,0,42,36,48,45,...,0,0,1,1,12,47.8,50.5,0,0,
137882,2,0,54,1,1,0,37,34,51,61,...,0,0,0,0,12,54.9,43.4,1,0,
531303,2,0,51,0,3,0,46,46,46,88,...,0,0,1,1,15,48.3,50.1,0,0,


In [None]:
#change all Y/N to 1/0 for target variable
#assign the result back to the column: calling replace(inplace=True) on data[dv]
#mutates an intermediate Series and is not guaranteed to update `data` itself
#(it does not under pandas Copy-on-Write)
data[dv] = data[dv].replace({'N': 0, 'Y': 1})

#drop null records from target variable and save to new dataset
#(`data` keeps every row so the whole file can still be scored later)
data_all = data.dropna(subset=[dv])
data_all.head(5)

Unnamed: 0_level_0,SET_NO,OPP_SEX,AGE,HH_ND,HH_NR,HH_NI,MED_AGE,MED_AGE_M,MED_AGE_F,NH_WHITE,...,DOITYOURSE,FINANCIALM,RELIGIOUSC,POLITICALC,MEDIANEDUC,PRS16_PD,PRS16_PR,MSG_A,MSG_B,CAND2_LRA
VOTER_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
404213,1,29,78,0,0,2,39,26,50,37,...,0,0,0,0,12,43.4,55.0,0,1,1.0
136981,2,0,29,5,0,0,50,49,52,82,...,0,0,1,0,12,45.6,52.7,0,0,0.0
532172,1,0,26,2,0,2,37,35,38,87,...,0,0,0,0,12,49.2,49.1,0,1,0.0
208471,1,0,57,1,2,1,37,37,38,54,...,0,0,0,0,12,45.2,53.2,0,1,1.0
261184,3,0,50,0,0,2,38,38,39,83,...,0,0,0,0,12,39.5,58.9,0,0,1.0


In [None]:
#create a train/test split using set 1 and 2 for training and set 3 for testing
#drop() is used without inplace=True: it returns a new frame, so the pop() below
#works on an independent object rather than on a filtered slice of data_all
#(the in-place version raised SettingWithCopyWarning)
df_train = data_all[data_all['SET_NO'].isin([1, 2])].drop(columns='SET_NO') # Remove `SET_NO` as an indicator
y_train = df_train.pop(dv).values  # pop() removes the target column from df_train
X_train = df_train                 # what is left are the predictors
df_test = data_all[data_all['SET_NO'] == 3].drop(columns='SET_NO') # Remove `SET_NO` as an indicator
y_test = df_test.pop(dv).values
X_test = df_test

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


## Decision Tree Model

In [None]:
#create and fit the decision tree model to the dataset
#min_samples_leaf=500 keeps every leaf at 500+ training records
#random_state is fixed so that re-running the notebook rebuilds the same tree
#(scikit-learn permutes the candidate features at each split)
clf = tree.DecisionTreeClassifier(min_samples_leaf=500, random_state=0)
clf.fit(X_train, y_train)

#predict the class of every record in the test set
y_pred = clf.predict(X_test)

In [None]:
#evaluate the model: precision / recall / f1 of the predictions against y_test
dt_report = classification_report(y_test, y_pred)
print(dt_report)

              precision    recall  f1-score   support

         0.0       0.72      0.82      0.77      7574
         1.0       0.74      0.61      0.67      6260

    accuracy                           0.73     13834
   macro avg       0.73      0.72      0.72     13834
weighted avg       0.73      0.73      0.72     13834



In [None]:
#show importance in tree for each feature
#lift the row display limit so every feature is listed
pd.set_option('display.max_rows', None)

#label each importance value with its feature (column) name
feature_importance = pd.Series(clf.feature_importances_, index=X_test.columns)
feature_importance

OPP_SEX       0.000000
AGE           0.038279
HH_ND         0.477553
HH_NR         0.118669
HH_NI         0.009309
MED_AGE       0.000000
MED_AGE_M     0.000000
MED_AGE_F     0.002094
NH_WHITE      0.037445
NH_AA         0.000000
NH_NATAM      0.000000
NH_ASIAN      0.000000
NH_HPI        0.000000
NH_OTHER      0.000000
NH_MULT       0.000000
HISP          0.000000
COMM_LT10     0.000000
COMM_609P     0.000000
MED_HH_INC    0.000000
COMM_CAR      0.000000
COMM_CP       0.000000
COMM_PT       0.000000
COMM_WALK     0.000000
KIDS          0.000377
KIDS_MC       0.000000
M_NEV_MAR     0.000000
M_MAR         0.000000
M_MAR_SP      0.000000
M_MAR_SNP     0.002386
F_NEV_MAR     0.000000
F_MAR         0.000000
F_MAR_SP      0.000000
F_MAR_SNP     0.000000
ED_ASSOC      0.000000
ED_BACH       0.002999
ED_MD         0.037407
ED_PROF       0.000000
ED_DOC        0.005948
ED_4COL       0.102994
GENDER_F      0.099916
GENDER_M      0.013250
H_AFDLN3P     0.000000
H_AFSSLN3P    0.000000
H_F1       

## Logistic Regression Model

In [None]:
#create and fit the logistic regression model to the dataset
#C is the inverse regularization strength in scikit-learn, so C=1e42 leaves the
#l2 penalty with practically no effect
lr = LogisticRegression(penalty="l2", C=1e42, solver='liblinear').fit(X_train, y_train)

#predict the class of every record in the test set
y_pred = lr.predict(X_test)

In [None]:
#evaluate the model: precision / recall / f1 of the predictions against y_test
lr_report = classification_report(y_test, y_pred)
print(lr_report)

              precision    recall  f1-score   support

         0.0       0.73      0.78      0.76      7574
         1.0       0.71      0.65      0.68      6260

    accuracy                           0.72     13834
   macro avg       0.72      0.72      0.72     13834
weighted avg       0.72      0.72      0.72     13834



## Scoring and exporting both models

In [None]:
# Remove extraneous fields (the DV and `SET_NO`) before scoring.
# errors='ignore' makes the cell safe to re-run once the columns are already gone;
# `data` is rebound so later cells keep seeing the feature-only frame.
data = data.drop(columns=[dv, 'SET_NO'], errors='ignore')

#Decision Tree scoring
# Score the whole file.
# The DataFrame itself is passed (not data.values) so the model receives the
# column names it was fitted with; the bare array triggered the
# "X does not have valid feature names" warning.
pred = clf.predict_proba(data)[:, 1]  # column 1 = probability of the second class in clf.classes_
scores = pd.DataFrame({dv: pred}, index=data.index)

#export the file to csv
model_type = 'DTcand_support'
# The previous str.format() call had no placeholders and therefore did nothing;
# the resulting file name ('DTcand_support.csv') is unchanged.
filename = '{}.csv'.format(model_type)
scores.to_csv(filename)

  "X does not have valid feature names, but"


In [None]:
#Logistic Regression Scoring
# Score the whole file.
# Drop the DV and `SET_NO` here too (ignored when they are already gone) so this
# cell does not rely on an earlier cell having stripped them from `data`.
features = data.drop(columns=[dv, 'SET_NO'], errors='ignore')

# The DataFrame itself is passed (not .values) so the model receives the column
# names it was fitted with; the bare array triggered the
# "X does not have valid feature names" warning.
pred = lr.predict_proba(features)[:, 1]  # column 1 = probability of the second class in lr.classes_
scores = pd.DataFrame({dv: pred}, index=features.index)

#export the file to csv
model_type = 'LRcand_support'
# The previous str.format() call had no placeholders and therefore did nothing;
# the resulting file name ('LRcand_support.csv') is unchanged.
filename = '{}.csv'.format(model_type)
scores.to_csv(filename)

  "X does not have valid feature names, but"
