#Random Forest Model Predicting Likely Support of Democratic Candidate

For this model, I will be focusing on people who received a message and moved to be more supportive of the Republican candidate. The predictor variable I will be using is MOVED_AR. I will be tweaking uplift for each message with MSG_A and MSG_B indicator variables.

##Load libraries and data set

In [None]:
import pandas as pd
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier

data = pd.read_csv('FX_indicators_2020.csv', index_col='VOTER_ID')

data.head(5)

Unnamed: 0_level_0,SET_NO,OPP_SEX,AGE,HH_ND,HH_NR,HH_NI,MED_AGE,MED_AGE_M,MED_AGE_F,NH_WHITE,...,MOVED_RDMA,MOVED_DRMA,MOVED_AWMA,MOVED_ADMA,MOVED_ARMA,MOVED_RDMB,MOVED_DRMB,MOVED_AWMB,MOVED_ADMB,MOVED_ARMB
VOTER_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
84508,3,0,25,4,0,0,38,39,38,76,...,,,,,,,,,,
608312,1,0,35,0,0,3,46,46,46,88,...,,,,,,,,,,
222821,3,0,73,3,0,0,42,36,48,45,...,,,,,,,,,,
137882,2,0,54,1,1,0,37,34,51,61,...,,,,,,,,,,
531303,2,0,51,0,3,0,46,46,46,88,...,,,,,,,,,,


##Data Wrangling

In [None]:
#create a model type for csv output file
model_type ='RFcand_move'

#create a list of dependent variables
dv_list = ['CAND1S','CAND2S','MESSAGE','VG_14_DV','D2','R2','D3','R3','I3','CAND1_UND','CAND1_SD2','CAND1_SDA','CAND1_LD2',
           'CAND1_LDA', 'CAND1_SR2','CAND1_SRA','CAND1_LRA','CAND1_LR2','CAND1_SFT','CAND2_UND','CAND2_SD2','CAND2_SDA',
           'CAND2_LD2','CAND2_LDA', 'CAND2_SR2','CAND2_SRA','CAND2_LRA','CAND2_LR2','CAND2_SFT','MOVED_RD','MOVED_DR',
           'MOVED_AW','MOVED_U','MOVED_AD', 'MOVED_AR','MOVED_RDMA','MOVED_DRMA','MOVED_AWMA','MOVED_ADMA','MOVED_ARMA',
           'MOVED_RDMB','MOVED_DRMB','MOVED_AWMB','MOVED_ADMB','MOVED_ARMB']

#identify the dependent variable for this model
dv = 'MOVED_AR'

#create a drop variable and remove the dependent variable from the list
fields_to_drop = dv_list
fields_to_drop.remove(dv)

#remove all unneeded fields
data.drop(fields_to_drop, axis=1, inplace=True)

data.head(5)

Unnamed: 0_level_0,SET_NO,OPP_SEX,AGE,HH_ND,HH_NR,HH_NI,MED_AGE,MED_AGE_M,MED_AGE_F,NH_WHITE,...,DOITYOURSE,FINANCIALM,RELIGIOUSC,POLITICALC,MEDIANEDUC,PRS16_PD,PRS16_PR,MSG_A,MSG_B,MOVED_AR
VOTER_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
84508,3,0,25,4,0,0,38,39,38,76,...,0,0,0,0,12,48.8,49.6,0,0,
608312,1,0,35,0,0,3,46,46,46,88,...,0,0,1,1,16,48.4,49.9,0,0,
222821,3,0,73,3,0,0,42,36,48,45,...,0,0,1,1,12,47.8,50.5,0,0,
137882,2,0,54,1,1,0,37,34,51,61,...,0,0,0,0,12,54.9,43.4,1,0,
531303,2,0,51,0,3,0,46,46,46,88,...,0,0,1,1,15,48.3,50.1,0,0,


In [None]:
#change all Y/N to 1/0 for target variable 
data[dv].replace({'N':0, 'Y':1}, inplace=True)

#drop null records from target variable and save to new dataset 
data_all = data.dropna(subset=[dv]) 
data_all.head(5) 

Unnamed: 0_level_0,SET_NO,OPP_SEX,AGE,HH_ND,HH_NR,HH_NI,MED_AGE,MED_AGE_M,MED_AGE_F,NH_WHITE,...,DOITYOURSE,FINANCIALM,RELIGIOUSC,POLITICALC,MEDIANEDUC,PRS16_PD,PRS16_PR,MSG_A,MSG_B,MOVED_AR
VOTER_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
404213,1,29,78,0,0,2,39,26,50,37,...,0,0,0,0,12,43.4,55.0,0,1,0.0
136981,2,0,29,5,0,0,50,49,52,82,...,0,0,1,0,12,45.6,52.7,0,0,0.0
532172,1,0,26,2,0,2,37,35,38,87,...,0,0,0,0,12,49.2,49.1,0,1,0.0
208471,1,0,57,1,2,1,37,37,38,54,...,0,0,0,0,12,45.2,53.2,0,1,0.0
261184,3,0,50,0,0,2,38,38,39,83,...,0,0,0,0,12,39.5,58.9,0,0,1.0


In [None]:
#create a train/test split using set 1 and 2 for training and set 3 for testing 
df_train = data_all[(data_all['SET_NO'] == 1) | (data_all['SET_NO'] == 2)] 
df_train.drop('SET_NO', axis=1, inplace=True) # Remove `SET_NO` as a indicator 
y_train = df_train.pop(dv).values
X_train = df_train 
df_test = data_all[data_all['SET_NO'] == 3] 
df_test.drop('SET_NO', axis=1, inplace=True) # Remove `SET_NO` as a indicator 
y_test = df_test.pop(dv).values 
X_test = df_test 

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [None]:
#create and fit the random forest model to the dataset
rf = RandomForestClassifier(n_estimators=100, random_state=1)
rf.fit(X_train, y_train)

RandomForestClassifier(random_state=1)

##Build Model for Message A and Calculate Uplift

In [None]:
#Calculating the Uplift for Message A
#create a copy to modify data
uplift_df_A = X_test.copy()

uplift_df_A.MSG_A = 1
predTreatmentA = rf.predict_proba(uplift_df_A)
uplift_df_A.MSG_A = 0
predControlA = rf.predict_proba(uplift_df_A)

In [None]:
#uplift results
upliftResult_df_A = pd.DataFrame({
    'probMessage': predTreatmentA[:,1],
    'probNoMessage': predControlA[:,1],
    'uplift': predTreatmentA[:,1] - predControlA[:,1],},
     index=uplift_df_A.index)
upliftResult_df_A.head(20)

Unnamed: 0_level_0,probMessage,probNoMessage,uplift
VOTER_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
261184,0.4,0.38,0.02
511179,0.02,0.01,0.01
623265,0.0,0.0,0.0
596038,0.0,0.0,0.0
218534,0.22,0.23,-0.01
9110,0.0,0.0,0.0
298522,0.05,0.04,0.01
59853,0.0,0.0,0.0
157607,0.17,0.18,-0.01
376292,0.56,0.58,-0.02


In [None]:
#export the file to csv
upliftResult_df_A.to_csv('Uplift_msgA.csv')

##Build Model for Message B and Calculate Uplift

In [None]:
#Calculating the Uplift for Message A
#create a copy to modify data
uplift_df_B = X_test.copy()

uplift_df_B.MSG_B = 1
predTreatmentB = rf.predict_proba(uplift_df_B)
uplift_df_B.MSG_B = 0
predControlB = rf.predict_proba(uplift_df_B)

In [None]:
#uplift results
upliftResult_df_B = pd.DataFrame({
    'probMessage': predTreatmentB[:,1],
    'probNoMessage': predControlB[:,1],
    'uplift': predTreatmentB[:,1] - predControlB[:,1],},
     index=uplift_df_B.index)
upliftResult_df_B.head(20)

Unnamed: 0_level_0,probMessage,probNoMessage,uplift
VOTER_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
261184,0.37,0.38,-0.01
511179,0.01,0.01,0.0
623265,0.0,0.0,0.0
596038,0.0,0.0,0.0
218534,0.19,0.23,-0.04
9110,0.0,0.0,0.0
298522,0.04,0.05,-0.01
59853,0.0,0.0,0.0
157607,0.16,0.17,-0.01
376292,0.62,0.56,0.06


In [None]:
#export the file to csv
upliftResult_df_B.to_csv('Uplift_msgB.csv')