#Decision Tree Model Predicting Turnout

##Load libraries and data set

In [1]:
import pandas as pd
from sklearn.metrics import classification_report
from sklearn import tree

data = pd.read_csv('FX_indicators_2020.csv', index_col='VOTER_ID')
data.head(5)

Unnamed: 0_level_0,SET_NO,OPP_SEX,AGE,HH_ND,HH_NR,HH_NI,MED_AGE,MED_AGE_M,MED_AGE_F,NH_WHITE,...,MOVED_RDMA,MOVED_DRMA,MOVED_AWMA,MOVED_ADMA,MOVED_ARMA,MOVED_RDMB,MOVED_DRMB,MOVED_AWMB,MOVED_ADMB,MOVED_ARMB
VOTER_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
84508,3,0,25,4,0,0,38,39,38,76,...,,,,,,,,,,
608312,1,0,35,0,0,3,46,46,46,88,...,,,,,,,,,,
222821,3,0,73,3,0,0,42,36,48,45,...,,,,,,,,,,
137882,2,0,54,1,1,0,37,34,51,61,...,,,,,,,,,,
531303,2,0,51,0,3,0,46,46,46,88,...,,,,,,,,,,


##Data Wrangling

In [2]:
#create a model type for csv output file
model_type = 'DTturnout'

#create a list of dependent variables
dv_list = ['CAND1S','CAND2S','MESSAGE','VG_14_DV','D2','R2','D3','R3','I3','CAND1_UND','CAND1_SD2','CAND1_SDA','CAND1_LD2',
           'CAND1_LDA', 'CAND1_SR2','CAND1_SRA','CAND1_LRA','CAND1_LR2','CAND1_SFT','CAND2_UND','CAND2_SD2','CAND2_SDA',
           'CAND2_LD2','CAND2_LDA', 'CAND2_SR2','CAND2_SRA','CAND2_LRA','CAND2_LR2','CAND2_SFT','MOVED_RD','MOVED_DR',
           'MOVED_AW','MOVED_U','MOVED_AD', 'MOVED_AR','MOVED_RDMA','MOVED_DRMA','MOVED_AWMA','MOVED_ADMA','MOVED_ARMA',
           'MOVED_RDMB','MOVED_DRMB','MOVED_AWMB','MOVED_ADMB','MOVED_ARMB']

#identify the dependent variable for this model
dv = 'VG_14_DV'

#create a drop variable and remove the dependent variable from the list
fields_to_drop = dv_list
fields_to_drop.remove(dv)

#add indicators that need to be dropped to the drop variable
fields_to_drop = fields_to_drop + ['VPP_12','VPP_16','VPR_12','VPR_14','VPR_16','VG_08','VG_12','VG_14','VG_16',
                                   'PP_PELIG','PR_PELIG', 'AP_PELIG','G_PELIG','E_PELIG','NL5G','NL3PR','NL5AP','NL2PP',
                                   'PRS16_PD','PRS16_PR']

#remove all unneeded fields
data.drop(fields_to_drop, axis=1, inplace=True)

data.head(5)

Unnamed: 0_level_0,SET_NO,OPP_SEX,AGE,HH_ND,HH_NR,HH_NI,MED_AGE,MED_AGE_M,MED_AGE_F,NH_WHITE,...,CULINARYIN,HEALTHFITN,DOITYOURSE,FINANCIALM,RELIGIOUSC,POLITICALC,MEDIANEDUC,MSG_A,MSG_B,VG_14_DV
VOTER_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
84508,3,0,25,4,0,0,38,39,38,76,...,0,1,0,0,0,0,12,0,0,
608312,1,0,35,0,0,3,46,46,46,88,...,0,1,0,0,1,1,16,0,0,
222821,3,0,73,3,0,0,42,36,48,45,...,2,1,0,0,1,1,12,0,0,N
137882,2,0,54,1,1,0,37,34,51,61,...,0,1,0,0,0,0,12,1,0,Y
531303,2,0,51,0,3,0,46,46,46,88,...,0,1,0,0,1,1,15,0,0,Y


In [3]:
#change all Y/N to 1/0 for target variable
data[dv].replace({'N':0, 'Y':1}, inplace=True)

#drop null records from target variable and save to new dataset
data_all = data.dropna(subset=[dv])

data_all.head(5)

Unnamed: 0_level_0,SET_NO,OPP_SEX,AGE,HH_ND,HH_NR,HH_NI,MED_AGE,MED_AGE_M,MED_AGE_F,NH_WHITE,...,CULINARYIN,HEALTHFITN,DOITYOURSE,FINANCIALM,RELIGIOUSC,POLITICALC,MEDIANEDUC,MSG_A,MSG_B,VG_14_DV
VOTER_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
222821,3,0,73,3,0,0,42,36,48,45,...,2,1,0,0,1,1,12,0,0,0.0
137882,2,0,54,1,1,0,37,34,51,61,...,0,1,0,0,0,0,12,1,0,1.0
531303,2,0,51,0,3,0,46,46,46,88,...,0,1,0,0,1,1,15,0,0,1.0
367387,2,0,32,1,0,2,41,34,42,23,...,0,1,0,0,1,1,12,0,0,0.0
65445,2,0,76,0,2,0,32,32,33,67,...,0,1,0,0,1,1,14,0,0,1.0


In [4]:
#create a train/test split using set 1 and 2 for training and set 3 for testing

df_train = data_all[(data_all['SET_NO'] == 1) | (data_all['SET_NO'] == 2)]
df_train.drop('SET_NO', axis=1, inplace=True) # Remove `SET_NO` as a indicator

y_train = df_train.pop(dv).values
X_train = df_train

df_test = data_all[data_all['SET_NO'] == 3]
df_test.drop('SET_NO', axis=1, inplace=True) # Remove `SET_NO` as a indicator

y_test = df_test.pop(dv).values
X_test = df_test


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


##Build/Run/Evaluate Model (full dataset)

In [5]:
#create and fit the decision tree model to the dataset
clf = tree.DecisionTreeClassifier(min_samples_leaf=500)
clf.fit(X_train, y_train) 
y_pred = clf.predict(X_test) 

In [6]:
#evaluate the model
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       0.68      0.73      0.70     38717
         1.0       0.81      0.77      0.79     57073

    accuracy                           0.75     95790
   macro avg       0.74      0.75      0.74     95790
weighted avg       0.75      0.75      0.75     95790



In [7]:
#show importance in tree for each feature
pd.set_option('display.max_rows', None)
pd.Series(clf.feature_importances_, index=X_test.columns)


OPP_SEX       0.000000
AGE           0.064138
HH_ND         0.002105
HH_NR         0.006672
HH_NI         0.011370
MED_AGE       0.000050
MED_AGE_M     0.000442
MED_AGE_F     0.000249
NH_WHITE      0.000660
NH_AA         0.000953
NH_NATAM      0.000000
NH_ASIAN      0.000067
NH_HPI        0.000000
NH_OTHER      0.000000
NH_MULT       0.000124
HISP          0.000190
COMM_LT10     0.000269
COMM_609P     0.000665
MED_HH_INC    0.001391
COMM_CAR      0.000237
COMM_CP       0.000259
COMM_PT       0.000345
COMM_WALK     0.000290
KIDS          0.000496
KIDS_MC       0.000414
M_NEV_MAR     0.000064
M_MAR         0.000149
M_MAR_SP      0.000086
M_MAR_SNP     0.000033
F_NEV_MAR     0.001694
F_MAR         0.000619
F_MAR_SP      0.000804
F_MAR_SNP     0.000307
ED_ASSOC      0.000000
ED_BACH       0.000948
ED_MD         0.000851
ED_PROF       0.000000
ED_DOC        0.000000
ED_4COL       0.001060
GENDER_F      0.000931
GENDER_M      0.000269
H_AFDLN3P     0.000000
H_AFSSLN3P    0.000000
H_F1       

##Score the Voter IDs and Export

In [8]:
# Remove extraneous fields (the DV and `SET_NO`) before scoring.
data.drop([dv, 'SET_NO'], axis=1, inplace=True)

# Score the whole file.
pred = clf.predict_proba(data.values)[:, 1]
scores = pd.DataFrame({dv: pred}, index=data.index)

  "X does not have valid feature names, but"


In [9]:
filename = 'DTturnout.csv'.format(dv, model_type)
scores.to_csv(filename)