## 1. Import Dependencies

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import statistics as stats
from sklearn import preprocessing
import plotly.express as px
from string import ascii_letters
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.compose import make_column_transformer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.base import clone
import shap

## 2. Import cleaned train dataset

In [2]:
# load the cleaned training data from disk
train_csv_path = '../data/train_cleaned.csv'
df = pd.read_csv(train_csv_path)

## 3. Creating Functions for Modeling
### 3a) Train Test Split

In [3]:
# random train/test split helper (no scaling is applied here)
def set_train_test(data, input_seed, pct_test):
    """Randomly partition `data` into a training frame and a testing frame.

    Keyword arguments:
    data (pd.DataFrame) -- all rows to split; should hold every feature column of interest
        plus the target column. Extra columns are carried along untouched.
    input_seed (int) -- seed forwarded as `random_state` so the split is repeatable.
    pct_test (float) -- fraction of the rows held out for testing; forwarded as `test_size`.

    Returns:
    (train_df, test_df) -- the two pieces returned by `train_test_split`, in that order.
    """
    split_parts = train_test_split(data, test_size=pct_test, random_state=input_seed)
    train_part, test_part = split_parts
    return train_part, test_part

### 3b) Outline model stats

In [4]:
def helper_acc_stats(y_actual_test, y_pred_test, model_name, features):
    """Build a one-row DataFrame summarising how a model scored on the test data.

    Keyword arguments:
    y_actual_test (pd.Series or numpy.ndarray) -- the actual y values.
    y_pred_test (pd.Series or numpy.ndarray) -- the predicted y values.
    model_name (str) -- label stored in the 'model_name' column.
    features (list) -- column names used by the model; stored as a single cell
        in the 'features' column.

    Returns:
    model_stats (pd.DataFrame) -- one row with the columns model_name, features,
        test_accuracy, test_precision, test_recall, test_f1 and test_roc_auc.
    """
    # each scorer takes the real y first and the predicted y second
    scorers = {
        'test_accuracy': accuracy_score,
        'test_precision': precision_score,
        'test_recall': recall_score,
        'test_f1': f1_score,
        'test_roc_auc': roc_auc_score,
    }

    # wrapping `features` in a list keeps the whole list in one cell and
    # gives the frame its single row
    row = {'model_name': model_name, 'features': [features]}
    for column, scorer in scorers.items():
        row[column] = scorer(y_actual_test, y_pred_test)

    return pd.DataFrame(row)

# write stats to a csv
def helper_write(stats, file_path):
    """Append the rows in `stats` to a results CSV, creating the file if needed.

    Keyword arguments:
    stats (pd.DataFrame) -- rows to store. Inside this function the name hides the
        module-level `stats` alias; it is kept so keyword callers keep working.
    file_path (str) -- path of the CSV file to append to (or to create).

    Raises:
    Any error other than "file missing" or "file empty" is propagated, so an
    existing results file is never replaced just because it could not be read
    or appended to.
    """
    try:
        df_results = pd.read_csv(file_path)
    except (FileNotFoundError, pd.errors.EmptyDataError):
        # nothing to append to yet: start the file with these rows
        print(f'There is no CSV called {file_path}')
        stats.to_csv(file_path, index=False)
        return

    # the append happens outside the try block on purpose: a failure here must
    # surface instead of falling into the "create a new file" branch above
    df_results = pd.concat([df_results, stats])
    df_results.to_csv(file_path, index=False)

### 3c) Outline how to evaluate model, for forward step feature selection, testing, and validation

In [5]:
def evaluate_model(input_name, input_model, training_data, testing_data, features, target,
                   results_path='results.csv'):
    """Fit a fresh copy of a model, score it on the test data and log the scores to a CSV.

    Keyword arguments:
    input_name (str) -- what you want to call the model. It is saved in the results CSV.
    input_model (sklearn model) -- estimator to evaluate. It is cloned before fitting,
        so the object passed in is left unfitted.
    training_data (pd.DataFrame) -- rows used to fit the model.
    testing_data (pd.DataFrame) -- rows used to score the model.
    features (list) -- list of the column names used in this model to create the predictions.
    target (str) -- name of the column that we want to predict.
    results_path (str) -- CSV file the stats row is appended to (default 'results.csv').

    Returns:
    model -- the fitted clone of `input_model`.
    """

    # set training/test X/y
    X_train = training_data[features]
    y_train = training_data[target]
    X_test = testing_data[features]
    y_test = testing_data[target]

    # fit a clone so repeated calls with the same estimator never share fitted state
    model = clone(input_model)
    model.fit(X_train, y_train)

    # get predictions on the test data
    y_test_pred = model.predict(X_test)

    # generate stats (dataframe) of the accuracy, precision, recall, f1, and roc auc
    model_stats = helper_acc_stats(
        y_actual_test = y_test,
        y_pred_test = y_test_pred,
        model_name = input_name,
        features = features,
    )
    # write the stats to a csv so we can look at them later
    helper_write(stats = model_stats, file_path = results_path)

    return model

### 3d) Feature Selection + Run Model

In [14]:
def forward_selection(model, data, all_feats, target, model_name, input_seed=1, pct_test=0.2):
    """Greedy forward feature selection scored by ROC AUC on the held-out split.

    Every round tries adding each not-yet-selected feature to the current set,
    evaluates the model with `evaluate_model` (which also logs each attempt to the
    results CSV) and keeps the candidate set with the highest ROC AUC. Rounds
    continue until every feature has been added.

    Keyword arguments:
    model (sklearn model) -- estimator to evaluate; `evaluate_model` fits a clone of it.
    data (pd.DataFrame) -- all rows, including the feature columns and the target.
    all_feats (list) -- candidate feature column names.
    target (str) -- name of the column that we want to predict.
    model_name (str) -- label written to the results CSV for every attempt.
    input_seed (int) -- seed for the train/test split (default 1).
    pct_test (float) -- fraction of rows held out for scoring (default 0.2).

    Returns:
    best_overall_feats (list) -- the highest-scoring feature set seen in any round
        (empty when `all_feats` is empty). Ties keep the earlier set.
    """
    # NOTE(review): candidates are ranked on the same held-out split that is later
    # reported as the test score, so that score is optimistic -- consider a separate
    # validation split.
    train_df, test_df = set_train_test(data = data, input_seed = input_seed, pct_test = pct_test)

    best_model_feats = []  # feature set carried into the next round
    best_overall_feats = []  # best-scoring feature set seen in any round
    best_overall_roc_auc = float('-inf')

    for _ in range(len(all_feats)):

        prop_feats = []  # best candidate set found in this round
        best_round_roc_auc = float('-inf')

        for c in all_feats:
            # if the feat that we are considering adding is already in our current best model, then skip
            if c in best_model_feats:
                continue

            # this is a list of the features we are going to test for this iteration
            feats_to_test = best_model_feats + [c]

            fitted_model = evaluate_model(model_name, model, train_df, test_df, feats_to_test, target)

            # evaluate_model only returns the fitted model, so score it again here
            y_pred = fitted_model.predict(test_df[feats_to_test])
            roc_auc_stat = roc_auc_score(test_df[target], y_pred)

            if roc_auc_stat > best_round_roc_auc:
                best_round_roc_auc = roc_auc_stat
                prop_feats = feats_to_test

        # no candidate left (e.g. duplicate names in all_feats): stop instead of
        # resetting the selected set to an empty list
        if not prop_feats:
            break

        best_model_feats = prop_feats

        if best_round_roc_auc > best_overall_roc_auc:
            best_overall_roc_auc = best_round_roc_auc
            best_overall_feats = prop_feats

    return best_overall_feats

## 4. Run Models

In [19]:
# define starting features
def starting_features(data):
    features = []
    for i in data.columns:
        if i != 'Loan_Status':
            features.append(i)
    return features

In [20]:
starting_features = starting_features(df)

In [22]:
# forward selection with a logistic regression model
forward_selection(
    model=LogisticRegression(max_iter=500),
    data=df,
    all_feats=starting_features,
    target='Loan_Status',
    model_name='Logistic Regression',
)

There is no CSV called results.csv


In [24]:
# run random forest classifier model
#
# You need a bit more than 8GB RAM for predicting 2 classes with 256 trees.
# Of course, you can use a lower number, but gradually you'll notice worse performance.
forward_selection(
    model=RandomForestClassifier(n_estimators=256),
    data=df,
    all_feats=starting_features,
    target='Loan_Status',
    model_name='Random Forest Classifier',
)

Based on 'results.csv', the Random Forest Classifier performed best with the following features:<br>
- 'Credit_History'
- 'Gender'
- 'Married'
- 'Dependents'
- 'Education'
- 'Loan_Amount_Term_Months'
- 'Property_Area'
- 'Applicant_Income(total yearly)_to_Loan_Amount(total)'

First, let's compare the model's performance on the training data with its test results to make sure the model is not overfit.

In [54]:
# features used by the top performing model
features = [
    'Credit_History',
    'Gender',
    'Married',
    'Dependents',
    'Education',
    'Loan_Amount_Term_Months',
    'Property_Area',
    'Applicant_Income(total yearly)_to_Loan_Amount(total)',
]

In [55]:
# (train, test) pair, split with the same seed and test share that forward_selection uses
test_train_RF_Classifier = set_train_test(
    data=df,
    input_seed=1,
    pct_test=0.2,
)

In [58]:
# fit and score the random forest on the chosen features; keep the fitted model
RF_Class_model = evaluate_model(
    input_name='test_RF_Class',
    input_model=RandomForestClassifier(n_estimators=256),
    training_data=test_train_RF_Classifier[0],
    testing_data=test_train_RF_Classifier[1],
    features=features,
    target='Loan_Status',
)

In [59]:
# feature columns of the training rows (first element of the split)
X_train = test_train_RF_Classifier[0][features]

# actual target values for the training rows
y_actual_train = test_train_RF_Classifier[0]['Loan_Status']

# predictions of the fitted model on the rows it was trained on
y_pred_train = RF_Class_model.predict(X_train)

In [60]:
# ROC AUC on the training rows, computed from the predicted classes
train_roc_auc = roc_auc_score(y_true=y_actual_train, y_score=y_pred_train)
train_roc_auc

1.0

A `train_roc_auc` of 1 suggests our model is overfit to the training data. One way to reduce overfitting in Random Forest models is to prune the decision trees; in terms of hyperparameters, this means reducing `max_depth`.

Additionally, I will reduce the number of variables sampled at each split (`max_features`).

In [63]:
# forward selection again, with shallower trees and fewer features tried per split
forward_selection(
    model=RandomForestClassifier(n_estimators=256, max_depth=5, max_features=.5),
    data=df,
    all_feats=starting_features,
    target='Loan_Status',
    model_name='Random Forest Classifier Hyperparameter',
)

In [66]:
# features used by the top performing tuned model
features = [
    'Credit_History',
    'Applicant_Income(total yearly)_to_Loan_Amount(total)',
    'Gender',
]

# (train, test) pair, split with the same seed and test share that forward_selection uses
test_train_RF_Classifier_Hyperparameter = set_train_test(
    data=df,
    input_seed=1,
    pct_test=0.2,
)

# fit and score the tuned random forest; keep the fitted model
RF_Class__Hyperparameter_model = evaluate_model(
    input_name='test_RF_hyper_Class',
    input_model=RandomForestClassifier(n_estimators=256, max_depth=5, max_features=.5),
    training_data=test_train_RF_Classifier_Hyperparameter[0],
    testing_data=test_train_RF_Classifier_Hyperparameter[1],
    features=features,
    target='Loan_Status',
)

# feature columns and actual target values of the training rows
X_train = test_train_RF_Classifier_Hyperparameter[0][features]
y_actual_train = test_train_RF_Classifier_Hyperparameter[0]['Loan_Status']

# predictions of the fitted model on the rows it was trained on
y_pred_train = RF_Class__Hyperparameter_model.predict(X_train)

# ROC AUC on the training rows
train_roc_auc = roc_auc_score(y_actual_train, y_pred_train)
train_roc_auc

0.7123632458554265