#Combined (Ensemble) Model predicting Partisanship

###Data Wrangling

In [1]:
import pandas as pd
from sklearn.metrics import classification_report
from sklearn import tree
from sklearn.linear_model import LogisticRegression

data = pd.read_csv('FX_indicators_2020.csv', index_col='VOTER_ID')

data.head(5)

Unnamed: 0_level_0,SET_NO,OPP_SEX,AGE,HH_ND,HH_NR,HH_NI,MED_AGE,MED_AGE_M,MED_AGE_F,NH_WHITE,...,MOVED_RDMA,MOVED_DRMA,MOVED_AWMA,MOVED_ADMA,MOVED_ARMA,MOVED_RDMB,MOVED_DRMB,MOVED_AWMB,MOVED_ADMB,MOVED_ARMB
VOTER_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
84508,3,0,25,4,0,0,38,39,38,76,...,,,,,,,,,,
608312,1,0,35,0,0,3,46,46,46,88,...,,,,,,,,,,
222821,3,0,73,3,0,0,42,36,48,45,...,,,,,,,,,,
137882,2,0,54,1,1,0,37,34,51,61,...,,,,,,,,,,
531303,2,0,51,0,3,0,46,46,46,88,...,,,,,,,,,,


In [2]:
#create a list of dependent variables
dv_list = ['CAND1S','CAND2S','MESSAGE','VG_14_DV','D2','R2','D3','R3','I3','CAND1_UND','CAND1_SD2','CAND1_SDA','CAND1_LD2',
           'CAND1_LDA', 'CAND1_SR2','CAND1_SRA','CAND1_LRA','CAND1_LR2','CAND1_SFT','CAND2_UND','CAND2_SD2','CAND2_SDA',
           'CAND2_LD2','CAND2_LDA', 'CAND2_SR2','CAND2_SRA','CAND2_LRA','CAND2_LR2','CAND2_SFT','MOVED_RD','MOVED_DR',
           'MOVED_AW','MOVED_U','MOVED_AD', 'MOVED_AR','MOVED_RDMA','MOVED_DRMA','MOVED_AWMA','MOVED_ADMA','MOVED_ARMA',
           'MOVED_RDMB','MOVED_DRMB','MOVED_AWMB','MOVED_ADMB','MOVED_ARMB']

#identify the dependent variable for this model
dv = 'R2'

#create a drop variable and remove the dependent variable from the list
fields_to_drop = dv_list
fields_to_drop.remove(dv)

#add indicators that need to be dropped to the drop variable
fields_to_drop = fields_to_drop + ['PARTY_D','PARTY_I','PARTY_R','HHP_D', \
'HHP_DD','HHP_DI','HHP_DR','HHP_I','HHP_II','HHP_R','HHP_RI','HHP_RR','HH_ND', \
'HH_NR','HH_NI']

#remove all unneeded fields
data.drop(fields_to_drop, axis=1, inplace=True)

data.head(5)

Unnamed: 0_level_0,SET_NO,OPP_SEX,AGE,MED_AGE,MED_AGE_M,MED_AGE_F,NH_WHITE,NH_AA,NH_NATAM,NH_ASIAN,...,DOITYOURSE,FINANCIALM,RELIGIOUSC,POLITICALC,MEDIANEDUC,PRS16_PD,PRS16_PR,MSG_A,MSG_B,R2
VOTER_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
84508,3,0,25,38,39,38,76,17,1,1,...,0,0,0,0,12,48.8,49.6,0,0,N
608312,1,0,35,46,46,46,88,2,0,4,...,0,0,1,1,16,48.4,49.9,0,0,
222821,3,0,73,42,36,48,45,35,3,0,...,0,0,1,1,12,47.8,50.5,0,0,
137882,2,0,54,37,34,51,61,34,1,0,...,0,0,0,0,12,54.9,43.4,1,0,N
531303,2,0,51,46,46,46,88,2,0,4,...,0,0,1,1,15,48.3,50.1,0,0,N


In [3]:
#change all Y/N to 1/0 for target variable 
data[dv].replace({'N':0, 'Y':1}, inplace=True)

#drop null records from target variable and save to new dataset 
data_all = data.dropna(subset=[dv]) 
data_all.head(5) 

Unnamed: 0_level_0,SET_NO,OPP_SEX,AGE,MED_AGE,MED_AGE_M,MED_AGE_F,NH_WHITE,NH_AA,NH_NATAM,NH_ASIAN,...,DOITYOURSE,FINANCIALM,RELIGIOUSC,POLITICALC,MEDIANEDUC,PRS16_PD,PRS16_PR,MSG_A,MSG_B,R2
VOTER_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
84508,3,0,25,38,39,38,76,17,1,1,...,0,0,0,0,12,48.8,49.6,0,0,0.0
137882,2,0,54,37,34,51,61,34,1,0,...,0,0,0,0,12,54.9,43.4,1,0,0.0
531303,2,0,51,46,46,46,88,2,0,4,...,0,0,1,1,15,48.3,50.1,0,0,0.0
367387,2,0,32,41,34,42,23,64,0,7,...,0,0,1,1,12,40.6,57.8,0,0,0.0
257339,3,0,41,41,34,42,23,64,0,7,...,0,0,1,1,12,44.3,54.1,0,0,0.0


In [4]:
#create a train/test split using set 1 and 2 for training and set 3 for testing 
df_train = data_all[(data_all['SET_NO'] == 1) | (data_all['SET_NO'] == 2)] 
df_train.drop('SET_NO', axis=1, inplace=True) # Remove `SET_NO` as a indicator 
y_train = df_train.pop(dv).values
X_train = df_train 
df_test = data_all[data_all['SET_NO'] == 3] 
df_test.drop('SET_NO', axis=1, inplace=True) # Remove `SET_NO` as a indicator 
y_test = df_test.pop(dv).values 
X_test = df_test 

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


##Decision Tree Model

In [5]:
#create and fit the decision tree model to the dataset
clf = tree.DecisionTreeClassifier(min_samples_leaf=500)
clf.fit(X_train, y_train) 
y_pred = clf.predict(X_test) 

In [6]:
#evaluate the model
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       0.67      0.83      0.74     61591
         1.0       0.53      0.32      0.39     36326

    accuracy                           0.64     97917
   macro avg       0.60      0.57      0.57     97917
weighted avg       0.62      0.64      0.61     97917



In [7]:
#show importance in tree for each feature
pd.set_option('display.max_rows', None)
pd.Series(clf.feature_importances_, index=X_test.columns)

OPP_SEX       0.000000
AGE           0.006275
MED_AGE       0.001186
MED_AGE_M     0.000999
MED_AGE_F     0.001664
NH_WHITE      0.001023
NH_AA         0.000744
NH_NATAM      0.000763
NH_ASIAN      0.000117
NH_HPI        0.000000
NH_OTHER      0.000113
NH_MULT       0.000854
HISP          0.000452
COMM_LT10     0.002828
COMM_609P     0.001397
MED_HH_INC    0.004385
COMM_CAR      0.001098
COMM_CP       0.001874
COMM_PT       0.001193
COMM_WALK     0.000835
KIDS          0.001314
KIDS_MC       0.001574
M_NEV_MAR     0.001085
M_MAR         0.000309
M_MAR_SP      0.000868
M_MAR_SNP     0.000848
F_NEV_MAR     0.001889
F_MAR         0.000815
F_MAR_SP      0.001039
F_MAR_SNP     0.001111
ED_ASSOC      0.000214
ED_BACH       0.000490
ED_MD         0.000408
ED_PROF       0.000757
ED_DOC        0.002064
ED_4COL       0.001063
GENDER_F      0.000071
GENDER_M      0.000622
H_AFDLN3P     0.000000
H_AFSSLN3P    0.000000
H_F1          0.000000
H_FFDLN2      0.000000
H_FFSLN2      0.000000
H_M1       

##Logistic Regression Model

In [8]:
#create and fit the logistic regression model to the dataset
lr = LogisticRegression(penalty="l2", C=1e42, solver='liblinear')
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)

In [9]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       0.66      0.88      0.76     61591
         1.0       0.55      0.24      0.34     36326

    accuracy                           0.65     97917
   macro avg       0.61      0.56      0.55     97917
weighted avg       0.62      0.65      0.60     97917



##Scoring and exporting both models

In [10]:
# Remove extraneous fields (the DV and `SET_NO`) before scoring.
data.drop([dv, 'SET_NO'], axis=1, inplace=True)

#Decision Tree scoring
# Score the whole file.
pred = clf.predict_proba(data.values)[:, 1]
scores = pd.DataFrame({dv: pred}, index=data.index)

#export the file to csv
model_type = 'DTcand_support'
filename = 'DTcand_support.csv'.format(dv, model_type)
scores.to_csv(filename)

  "X does not have valid feature names, but"


In [11]:
#Logistic Regression Scoring
# Score the whole file. 
pred2 = lr.predict_proba(data.values)[:, 1] 
scores2 = pd.DataFrame({dv: pred2}, index=data.index) 

#export the file to csv
model_type2 = 'LRcand_support'
filename2 = 'LRcand_support.csv'.format(dv, model_type2) 
scores2.to_csv(filename2) 

  "X does not have valid feature names, but"


##Combined Model (Max Voting Method)

In [12]:
#Max voting model
from sklearn.ensemble import VotingClassifier

model = VotingClassifier(estimators=[('lr', lr), ('dt', clf)], voting='soft')
model.fit(X_train,y_train)
model.score(X_test,y_test)

0.6451382293166662

In [13]:
# Score the whole file. 
pred3 = model.predict_proba(data.values)[:, 1] 
scores3 = pd.DataFrame({dv: pred3}, index=data.index) 

#export the file to csv
model_type = 'MaxEnsemble_cand_support'
filename = 'MaxEnsemble_cand_support.csv'.format(dv, model_type)
scores3.to_csv(filename)

  "X does not have valid feature names, but"
  "X does not have valid feature names, but"


In [14]:
  #group the sorted predictions into 5 roughly equal groups and calculate the mean
  groups = [int(5 * i/len(scores3.R2)) for i in range(len(scores3.R2))]
  meanPercentile = scores3.R2.groupby(groups).mean()
  #divide the mean prediction to get the mean response
  meanResponse = meanPercentile / scores3.R2.mean()
  meanResponse.index = (meanResponse.index + 1) * 20
  print ("Lift by Quintile\n", meanResponse)

Lift by Quintile
 20     0.997856
40     1.000392
60     1.000686
80     1.000740
100    1.000326
Name: R2, dtype: float64


##Combined Model (Average Model Method)

In [15]:
pred1=lr.predict(X_test)
pred2=clf.predict(X_test)

finalpred=(pred1+pred2)/2

In [16]:
pred1=lr.predict_proba(data.values)[:, 1] 
pred2=clf.predict_proba(data.values)[:, 1] 

finalpred=(pred1+pred2)/2

scores4 = pd.DataFrame({dv: finalpred}, index=data.index) 

#export the file to csv
model_type = 'AvgEnsemble_cand_support'
filename = 'AvgEnsemble_cand_support.csv'.format(dv, model_type)
scores4.to_csv(filename)

  "X does not have valid feature names, but"
  "X does not have valid feature names, but"


In [17]:
  #group the sorted predictions into 5 roughly equal groups and calculate the mean
  groups = [int(5 * i/len(scores4.R2)) for i in range(len(scores4.R2))]
  meanPercentile = scores4.R2.groupby(groups).mean()
  #divide the mean prediction to get the mean response
  meanResponse = meanPercentile / scores4.R2.mean()
  meanResponse.index = (meanResponse.index + 1) * 20
  print ("Lift by Quintile\n", meanResponse)

Lift by Quintile
 20     0.997856
40     1.000392
60     1.000686
80     1.000740
100    1.000326
Name: R2, dtype: float64
