### Forward Stepwise Selection Model
by Annie Zhou

In [None]:
def stepwise_trans(data):
    """Return a copy of ``data`` prepared for forward stepwise selection.

    Binary categorical columns are recoded to 0/1 indicators, ``education``
    is expanded into dummy columns (first level dropped, names suffixed with
    ``_edu``), and the original categorical source columns plus ``poutcome``
    are removed. The input frame itself is not modified.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``job``, ``marital``, ``housing``,
        ``contact``, ``loan``, ``education`` and ``poutcome``.

    Returns
    -------
    pandas.DataFrame
        The transformed copy.
    """
    frame = data.copy()

    # (target column, source column, recoding) -- values that are not listed
    # in a recoding are left unchanged by ``replace``.
    binary_recodes = (
        ('employed', 'job', {'Employed': 1, 'Unemployed': 0}),
        ('married', 'marital', {'married': 1, 'single': 0}),
        ('housing', 'housing', {'yes': 1, 'no': 0}),
        ('contact_cell', 'contact', {'cellular': 1, 'telephone': 0}),
        ('loan', 'loan', {'yes': 1, 'no': 0}),
    )
    for target, source, recoding in binary_recodes:
        frame[target] = frame[source].replace(recoding)

    # One dummy column per education level except the first one.
    education_dummies = pd.get_dummies(frame['education'], drop_first=True)
    frame = pd.concat([frame, education_dummies.add_suffix('_edu')], axis=1)

    # NOTE: dummy encoding of `month` and `day_of_week` was disabled in the
    # original notebook and is not applied here.

    # The raw categorical columns are no longer needed once encoded.
    return frame.drop(
        columns=['marital', 'contact', 'job', 'education', 'poutcome']
    )

In [None]:
# Run the same encoding transform on the training and the test split.
# `train` and `test` are defined elsewhere in the notebook (not shown here).
stepwise_train = stepwise_trans(train)
stepwise_test = stepwise_trans(test)

In [None]:
# Check which columns are still non-numeric after the transform.
# NOTE: `_get_numeric_data` is a private pandas helper; it is kept so the
# numeric / non-numeric split is exactly the one used before.
num_cols = stepwise_train._get_numeric_data().columns
# Walk the frame's own column order rather than taking a set difference, so
# the resulting list is the same from run to run.
cat_cols = [col for col in stepwise_train.columns if col not in num_cols]
cat_cols

In [None]:
# Remove the non-numeric columns found above from the training frame, then
# display the column names that remain.
stepwise_train = stepwise_train.drop(columns=cat_cols)
stepwise_train.columns

In [None]:
# Candidate predictor matrix: every training column except the response `y_dum`.
X = stepwise_train.drop(columns="y_dum")

In [None]:
# check for multicollinearity

from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant

# Add the intercept column (`const`) before computing the VIFs.
X = add_constant(X)

# Convert the design matrix once, as float: re-reading `X.values` for every
# column is wasted work, and a frame that mixes numeric and boolean columns
# (e.g. boolean dummy columns) would otherwise come back as an object array.
vif_matrix = X.to_numpy(dtype=float)

# One row per column of X (including `const`) with its variance inflation factor.
vif_data = pd.DataFrame({
    "predictor": X.columns,
    "VIF": [
        variance_inflation_factor(vif_matrix, col_idx)
        for col_idx in range(vif_matrix.shape[1])
    ],
})

print(vif_data)

In [None]:
# Remove every predictor whose VIF is 10 or larger.
drop_var = vif_data.loc[vif_data['VIF'] >= 10, 'predictor']
X = X.drop(columns=drop_var)

# The intercept column must not survive this step: the interaction cell below
# multiplies `stepwise_train` columns named after X's columns, and
# `stepwise_train` has no `const` column. Drop it explicitly instead of
# relying on its VIF happening to exceed the threshold.
X = X.drop(columns='const', errors='ignore')

X.columns

In [None]:
# Snapshot of the main-effect predictors, taken before interaction columns are
# added to X. `.copy()` is required here: a plain assignment would only create
# a second name for the same frame, so the "backup" would grow together with X.
X_backup = X.copy()

In [None]:
# Add every pairwise (2-factor) interaction of the main-effect predictors to
# the training frame, the test frame and the candidate matrix X. Each new
# column is named "<first>_<second>".
for first, second in itertools.combinations(X_backup.columns, 2):
    term = '_'.join((first, second))
    stepwise_train[term] = stepwise_train[first] * stepwise_train[second]
    stepwise_test[term] = stepwise_test[first] * stepwise_test[second]
    # X receives the training values of the new interaction column.
    X.loc[:, term] = stepwise_train.loc[:, term]

In [None]:
def processSubset(predictor_subset):
    """Fit a logit model of ``y_dum`` on the given predictors.

    The model is fitted on the global ``stepwise_train`` frame through the
    formula interface bound to ``sm`` (imported elsewhere in the notebook).

    Parameters
    ----------
    predictor_subset : list of str
        Column names joined with ``+`` to form the right-hand side of the
        formula.

    Returns
    -------
    dict
        ``{"model": <fitted result>, "bic": <its BIC>}``.
    """
    formula = 'y_dum~' + '+'.join(predictor_subset)
    fitted = sm.logit(formula, data=stepwise_train).fit()
    return {"model": fitted, "bic": fitted.bic}

def forward(predictors):
    """Add the single best remaining predictor to the current model.

    Every column of the global ``X`` that is not already in ``predictors`` is
    tried in turn: ``processSubset`` fits a model on
    ``predictors + [candidate]``, and the candidate whose model has the
    lowest BIC is selected.

    Parameters
    ----------
    predictors : list of str
        Names of the predictors already in the model.

    Returns
    -------
    pandas.Series
        The winning row, with entries ``"model"`` (the fitted result) and
        ``"bic"``.
    """
    # Candidates are the columns of X not yet in the model.
    already_chosen = set(predictors)
    remaining_predictors = [p for p in X.columns if p not in already_chosen]

    results = [processSubset(predictors + [p]) for p in remaining_predictors]

    # One row per candidate model.
    models = pd.DataFrame(results)

    # Pick the model with the LOWEST BIC. `idxmin` returns the row label,
    # which is what `.loc` expects (`argmin` returns a position instead).
    best_model = models.loc[models['bic'].idxmin()]

    print("Processed ", models.shape[0], "models on", len(predictors)+1, "predictors.")

    return best_model

def forward_selection():
    """Run forward stepwise selection over all columns of the global ``X``.

    Starting from an empty predictor list, each step calls ``forward`` to add
    the best remaining predictor, until every column of ``X`` has been added.
    The total elapsed time is printed at the end.

    Returns
    -------
    pandas.DataFrame
        Indexed 1..p (one row per model size) with columns ``"bic"`` and
        ``"model"``.
    """
    models_best = pd.DataFrame(columns=["bic", "model"])

    started = time.time()
    selected = []

    n_steps = len(X.columns)
    for step in range(1, n_steps + 1):
        winner = forward(selected)
        models_best.loc[step] = winner
        # Carry the winning model's terms into the next step; the first
        # coefficient is skipped (presumably the intercept the formula
        # interface lists first).
        selected = list(winner["model"].params.index[1:])

    finished = time.time()
    print("Total elapsed time:", (finished-started), "seconds.")
    return models_best

In [None]:
# check for collinearity in interaction terms
X = add_constant(X)

# Convert once, as float: re-reading `X.values` for every column is wasted
# work, and a frame that mixes numeric and boolean columns would otherwise
# come back as an object array.
vif_matrix = X.to_numpy(dtype=float)

vif_data = pd.DataFrame({
    "predictor": X.columns,
    "VIF": [
        variance_inflation_factor(vif_matrix, col_idx)
        for col_idx in range(vif_matrix.shape[1])
    ],
})

# Remove the predictors whose VIF came out as NaN, then take the intercept
# column back out so X only holds candidate predictors.
drop_var = vif_data.loc[vif_data['VIF'].isna(), 'predictor']
X = X.drop(columns=drop_var)
X = X.drop(columns='const')

In [None]:
# Run forward stepwise selection over the candidate predictors in X and keep
# the best model found at each model size.
# takes about 5 minutes to run

models_best = forward_selection()

In [None]:
def _plot_criterion(position, values, label):
    """Plot one criterion curve in subplot ``position`` and mark its minimum.

    Returns the index label (model size) at which ``values`` is smallest.
    """
    best_size = values.idxmin()
    plt.subplot(1, 2, position)
    plt.plot(values)
    # Red dot on the smallest value; x is the index label so it lines up with
    # the curve, which is plotted against the same index.
    plt.plot(best_size, values.min(), "or")
    plt.xlabel('# Predictors')
    plt.ylabel(label)
    return best_size


def best_sub_plots():
    """Plot AIC and BIC against model size for the global ``models_best``.

    Draws a 1x2 grid (AIC on the left, BIC on the right), marks the minimum
    of each curve with a red dot and prints the model size at which each
    criterion is smallest. Both criteria are read from the fitted results in
    the ``"model"`` column.
    """
    plt.figure(figsize=(15,8))
    plt.rcParams.update({'font.size': 18, 'lines.markersize': 10})

    # Select the column by name: positional `row[1]` access on a
    # string-labelled row is deprecated in pandas.
    aic = models_best['model'].apply(lambda fitted: fitted.aic)
    bic = models_best['model'].apply(lambda fitted: fitted.bic)

    # For both criteria the model with the SMALLEST value is preferred.
    best_aic = _plot_criterion(1, aic, 'AIC')
    best_bic = _plot_criterion(2, bic, 'BIC')

    print("Predictor number with min. AIC: ", best_aic)
    print("Predictor number with min. BIC: ", best_bic)

In [None]:
# Plot AIC and BIC against model size for the forward-selection results.
best_sub_plots()

In [None]:
# Take the model stored under index label 24 (the 24-predictor step of the
# selection path) and show its summary.
# NOTE(review): 24 is hard-coded -- presumably read off the AIC/BIC plots;
# confirm it still applies after re-running the selection.
best_fwd_reg_model = models_best.loc[24, 'model']
best_fwd_reg_model.summary()

In [None]:
# Confusion matrix of the selected model on the training data.
# NOTE(review): `confusion_matrix_data` is defined elsewhere; 0.3 is
# presumably the classification cutoff -- confirm against its definition.
confusion_matrix_data(stepwise_train,stepwise_train.y_dum,best_fwd_reg_model,0.3)

In [None]:
# Confusion matrix of the selected model on the test data.
# NOTE(review): `confusion_matrix_data` is defined elsewhere; 0.3 is
# presumably the classification cutoff -- confirm against its definition.
confusion_matrix_data(stepwise_test,stepwise_test.y_dum,best_fwd_reg_model,0.3)

### Code fitting the final model

In [None]:
# Fit the final model directly from its explicit formula (one term per line;
# adjacent string literals are concatenated into a single formula string).
fwd_model = sm.logit(
    formula=(
        'y_dum~pdays_cons_price_idx'
        '+campaign_cons_conf_idx'
        '+cons_price_idx_contact_cell'
        '+loan_cons_conf_idx'
        '+pdays_previous'
        '+previous_Tertiary_edu'
        '+contact_cell_Secondary_edu'
        '+contact_cell_Tertiary_edu'
        '+age_employed'
        '+housing_contact_cell'
        '+loan_Tertiary_edu'
        '+loan_cons_price_idx'
        '+contact_cell'
        '+pdays'
        '+cons_conf_idx_contact_cell'
        '+cons_price_idx_cons_conf_idx'
        '+housing_campaign'
        '+cons_conf_idx'
        '+cons_price_idx'
        '+loan_contact_cell'
        '+housing_loan'
        '+previous_cons_conf_idx'
        '+previous_cons_price_idx'
        '+previous'
    ),
    data=stepwise_train,
).fit()

fwd_model.summary()


In [None]:
# Names of all coefficients of the model picked from the selection path,
# displayed as a plain list.
fwd_params = list(best_fwd_reg_model.params.index)
fwd_params

In [None]:
# Evaluate the refitted final model (`fwd_model`) on the training data. This
# section previously re-evaluated `best_fwd_reg_model`, which only repeated
# the earlier evaluation cell and left `fwd_model` unevaluated.
# NOTE(review): `confusion_matrix_data` is defined elsewhere; 0.3 is
# presumably the classification cutoff -- confirm against its definition.
confusion_matrix_data(stepwise_train,stepwise_train.y_dum,fwd_model,0.3)

In [None]:
# Evaluate the refitted final model (`fwd_model`) on the test data. This
# section previously re-evaluated `best_fwd_reg_model`, which only repeated
# the earlier evaluation cell and left `fwd_model` unevaluated.
# NOTE(review): `confusion_matrix_data` is defined elsewhere; 0.3 is
# presumably the classification cutoff -- confirm against its definition.
confusion_matrix_data(stepwise_test,stepwise_test.y_dum,fwd_model,0.3)