# Introduction

Next Steps
1/5/2022

What am I going to present?
* Underwriting Policies. Market segmentation, credit scoring, differentiated pricing. 
* XGBoost. Hyperparameter tuning, cross validation, performance. Here is the credit score to use. 
* Fair Lending performance. 


In [1]:
import os
import numpy as np
import pandas as pd
import scipy.stats
import seaborn as sns
import matplotlib.pyplot as plt

import multiprocessing

import matplotlib.pyplot as plt

pd.set_option('display.max_rows', 500)

## Data

Let's load up the results of the last notebook. Because we're building an alternative credit score, we'll only consider models that do not use FICO as an input. 

In [2]:
# Load performance results from previous notebook 

# Store everything in one performance dataframe
performance = pd.DataFrame()

# Iterate through feature sets and search numbers
feature_sets = ['no_credit']
search_numbers = ['0001', '0002', '0003']

# For each search
for i in search_numbers:
    # For each feature set
    for name in feature_sets:
        # Store data in a dictionary
        df = pd.read_pickle(f'output_data/04_{name}_performance_{i}.pkl')
        
        # Track the features it used and the grid search it came from
        df['features'] = name
        df['grid_search'] = i
        
        # Append to the overall performance DF
        performance = performance.append(df)

#### Let's rename some of these columns. 

In [3]:
performance.rename({'mean_fit_time': 'training_seconds'},
                   inplace = True)

#### Build some new columns. 

In [4]:
performance = performance.reset_index()
performance = performance.rename({'index': 'model_id'}, axis=1)

performance['framework'] = 'xgboost'

#### Take a look. 

In [5]:
performance.head()

Unnamed: 0,model_id,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_booster,param_eta,param_eval_metric,param_gamma,param_max_depth,...,split1_test_roc_auc,split2_test_roc_auc,split3_test_roc_auc,split4_test_roc_auc,mean_test_roc_auc,std_test_roc_auc,rank_test_roc_auc,features,grid_search,framework
0,0,0.484207,0.033851,0.04241,0.00506,gbtree,0.01,logloss,0,2,...,0.480492,0.382951,0.489836,0.556616,0.494504,0.065054,94,no_credit,1,xgboost
1,1,0.490694,0.061369,0.040287,0.003249,gbtree,0.01,logloss,0,2,...,0.469836,0.388852,0.540656,0.500823,0.498427,0.068356,91,no_credit,1,xgboost
2,2,0.568504,0.037977,0.032832,0.003758,gbtree,0.01,logloss,0,2,...,0.46082,0.421639,0.509508,0.549375,0.500203,0.052468,85,no_credit,1,xgboost
3,3,0.52409,0.039263,0.032482,0.00432,gbtree,0.01,logloss,0,2,...,0.467869,0.379836,0.521803,0.500823,0.492066,0.068896,103,no_credit,1,xgboost
4,4,0.509604,0.026835,0.029784,0.000767,gbtree,0.01,logloss,0,2,...,0.44541,0.385246,0.481311,0.589203,0.489414,0.072114,113,no_credit,1,xgboost


#### Build a new Performance Dictionary

In [6]:
performance_dict = pd.DataFrame(performance.dtypes)
performance_dict = performance_dict.reset_index()
performance_dict.rename({0: 'var_dtype',
                        'index': 'var_name'}, 
                        axis=1, 
                        inplace = True)

performance_dict.head()

Unnamed: 0,var_name,var_dtype
0,model_id,int64
1,mean_fit_time,float64
2,std_fit_time,float64
3,mean_score_time,float64
4,std_score_time,float64


In [7]:
performance_dict.head()

Unnamed: 0,var_name,var_dtype
0,model_id,int64
1,mean_fit_time,float64
2,std_fit_time,float64
3,mean_score_time,float64
4,std_score_time,float64


#### Categorize Variables with NLP Regex Tagging

In [8]:
category_tags = {
    '_time': 'runtime',
    'param_': 'model_parameters',
    '_test_': 'score' 
    # Left is Data (including aggregated data) 
    # Right is eval_metric
}

for k, v in category_tags.items():
    performance_dict.loc[performance_dict['var_name'].str.contains(k),
                        'performance_category'] = v

In [9]:
show_cols = [
    'model_id',
    'framework',
    'features',
    'mean_test_accuracy',
    'mean_test_neg_brier_score',
    'mean_test_roc_auc',
    'mean_test_precision',
    'mean_test_recall',
    'param_booster', 
    'param_objective', 
    'param_eval_metric',
    'param_subsample',
    'param_max_depth', 
    'param_min_child_weight',
    'param_reg_lambda',
    'param_gamma', 
    'param_eta',
]

performance[show_cols].sort_values('mean_test_accuracy', 
                                   ascending = False).round(2).head(200)

Unnamed: 0,model_id,framework,features,mean_test_accuracy,mean_test_neg_brier_score,mean_test_roc_auc,mean_test_precision,mean_test_recall,param_booster,param_objective,param_eval_metric,param_subsample,param_max_depth,param_min_child_weight,param_reg_lambda,param_gamma,param_eta
1241,617,xgboost,no_credit,0.6,-0.25,0.56,0.6,0.82,gbtree,binary:logistic,logloss,1.0,4,20,0.0,0.25,0.0005
1365,741,xgboost,no_credit,0.6,-0.25,0.57,0.6,0.82,gbtree,binary:logistic,logloss,1.0,2,20,1.0,2.0,0.0005
1219,595,xgboost,no_credit,0.6,-0.25,0.57,0.6,0.82,gbtree,binary:logistic,logloss,1.0,2,20,0.5,0.25,0.0005
1217,593,xgboost,no_credit,0.6,-0.25,0.56,0.6,0.82,gbtree,binary:logistic,logloss,1.0,2,20,0.0,0.25,0.0005
1337,713,xgboost,no_credit,0.6,-0.25,0.56,0.6,0.82,gbtree,binary:logistic,logloss,1.0,8,20,0.0,0.75,0.0005
1339,715,xgboost,no_credit,0.6,-0.25,0.57,0.6,0.82,gbtree,binary:logistic,logloss,1.0,8,20,0.5,0.75,0.0005
1341,717,xgboost,no_credit,0.6,-0.25,0.57,0.6,0.82,gbtree,binary:logistic,logloss,1.0,8,20,1.0,0.75,0.0005
1343,719,xgboost,no_credit,0.6,-0.25,0.56,0.6,0.82,gbtree,binary:logistic,logloss,1.0,8,20,1.5,0.75,0.0005
1361,737,xgboost,no_credit,0.6,-0.25,0.56,0.6,0.82,gbtree,binary:logistic,logloss,1.0,2,20,0.0,2.0,0.0005
1363,739,xgboost,no_credit,0.6,-0.25,0.57,0.6,0.82,gbtree,binary:logistic,logloss,1.0,2,20,0.5,2.0,0.0005


# Evaluate Hyperparameter Performance

In [10]:
# Designate Hyperparameters
hypers = ['param_subsample', 
          'param_max_depth', 
          'param_min_child_weight',
          'param_reg_lambda',
          'param_gamma', 
          'param_eta',
          'features']

In [11]:
def hyper_results(performance: pd.DataFrame, hyper: str):
    """
    Summarizes the effects of each hyperparameter on validation set scoring metrics. 
    
    """
    groupby_dict = {
   
    }
    gb = performance.groupby(hyper).agg({'mean_test_accuracy': 'mean',
                                    'mean_test_neg_brier_score': 'mean',
                                    'mean_test_roc_auc': 'mean',
                                    'mean_test_precision': 'mean',
                                    'mean_test_recall': 'mean'})
    
    
    return gb

#### Maximum Depth of a Tree
Increasing the max depth of the tree has little effect on most measures of accuracy. Deeper trees have higher precision and lower recall scores. 

In [12]:
hyper_results(performance, 'param_max_depth')

Unnamed: 0_level_0,mean_test_accuracy,mean_test_neg_brier_score,mean_test_roc_auc,mean_test_precision,mean_test_recall
param_max_depth,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2,0.500253,-0.291303,0.495528,0.515723,0.60274
4,0.497889,-0.297518,0.490185,0.519327,0.602814
8,0.497172,-0.299034,0.490251,0.521527,0.599792
16,0.549625,-0.250335,0.526892,0.544583,0.863717
32,0.549625,-0.250335,0.526892,0.544583,0.863717


#### Minimum Sample Size for a New Node
You can strongly improve performance by increasing the minimum sample size for a new tree leaf. The best models we have use the highest possible min_child_weight value, so we should run more tests to find out if higher weights would be even more performant. 

In [13]:
hyper_results(performance, 'param_min_child_weight')

Unnamed: 0_level_0,mean_test_accuracy,mean_test_neg_brier_score,mean_test_roc_auc,mean_test_precision,mean_test_recall
param_min_child_weight,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,0.429267,-0.382507,0.430664,0.405872,0.416174
4,0.44268,-0.351781,0.441843,0.433259,0.454999
8,0.494558,-0.291438,0.487769,0.490751,0.592666
12,0.511369,-0.282249,0.505722,0.547792,0.625593
16,0.507683,-0.286907,0.493611,0.554317,0.619279
20,0.526037,-0.269906,0.518438,0.573992,0.67525
24,0.556081,-0.249014,0.533624,0.565122,0.904551
48,0.560473,-0.248024,0.535644,0.572934,0.879839


#### Minimum Performance Gain for a New Node
Gamma is good! Raise to 10, 20, and 50 to compare. 

In [14]:
hyper_results(performance, 'param_gamma')

Unnamed: 0_level_0,mean_test_accuracy,mean_test_neg_brier_score,mean_test_roc_auc,mean_test_precision,mean_test_recall
param_gamma,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0.0,0.451364,-0.363815,0.445544,0.460664,0.470129
0.25,0.492492,-0.302007,0.485656,0.534111,0.551198
0.5,0.451564,-0.358777,0.446115,0.464597,0.471103
0.75,0.492924,-0.298105,0.486154,0.533201,0.553404
1.0,0.547485,-0.255592,0.545881,0.543027,0.650637
2.0,0.504774,-0.281667,0.496418,0.534007,0.585559
5.0,0.476702,-0.298604,0.474459,0.468187,0.54922
8.0,0.543906,-0.251355,0.542439,0.547534,0.786655
10.0,0.549099,-0.248358,0.561583,0.533178,0.80594
50.0,0.551351,-0.248643,0.5,0.551351,1.0


#### Lambda (L2) Regularization Parameter
To prevent overfitting, you can use an L1 regularization parameter. But in this test, it has no visible effect on performance. 

In [15]:
hyper_results(performance, 'param_reg_lambda')

Unnamed: 0_level_0,mean_test_accuracy,mean_test_neg_brier_score,mean_test_roc_auc,mean_test_precision,mean_test_recall
param_reg_lambda,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0.0,0.486401,-0.310928,0.480624,0.506216,0.563203
0.5,0.508509,-0.283366,0.503273,0.537688,0.616678
1.0,0.488217,-0.305266,0.483151,0.505669,0.568439
1.5,0.509822,-0.282029,0.503848,0.537091,0.624992


#### Learning Rate (Eta)

Prevents overfitting by minimizing the impact of new trees on the overall prediction. Over time, adjustments become fine-grained. 

'eta': [0.05, 0.005, 0.001, 0.00001], 

In [16]:
hyper_results(performance, 'param_eta')

Unnamed: 0_level_0,mean_test_accuracy,mean_test_neg_brier_score,mean_test_roc_auc,mean_test_precision,mean_test_recall
param_eta,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0.0001,0.556569,-0.249907,0.530433,0.55531,0.828889
0.0005,0.561781,-0.249342,0.553646,0.568668,0.714956
0.001,0.553416,-0.24925,0.529476,0.551006,0.838452
0.01,0.507946,-0.264347,0.503184,0.489289,0.666013
0.05,0.540465,-0.253594,0.519526,0.532979,0.887753
0.08,0.500188,-0.277914,0.494654,0.525834,0.625792
0.1,0.455506,-0.335219,0.448197,0.447722,0.492462
0.12,0.491441,-0.287282,0.489354,0.527993,0.594151
0.25,0.446463,-0.367235,0.44113,0.458368,0.468289
0.4,0.480687,-0.318597,0.473013,0.526356,0.541917


#### Subsample %
You can also fiddle with dropout layers / subsampling within the model, but no big effects here. 

In [17]:
hyper_results(performance, 'param_subsample')

Unnamed: 0_level_0,mean_test_accuracy,mean_test_neg_brier_score,mean_test_roc_auc,mean_test_precision,mean_test_recall
param_subsample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0.7,0.495652,-0.298707,0.490898,0.532347,0.579158
1.0,0.494861,-0.299014,0.488639,0.502408,0.592493


# Selecting One Model

In [18]:
select_cols = [
    'model_id',
    'param_booster',
    'param_max_depth',
    'param_min_child_weight',
    'param_reg_lambda',
    'param_gamma',
    'param_eta',
    'param_subsample',
    'mean_test_accuracy',
    'mean_test_neg_brier_score',
    'mean_test_roc_auc',
    'mean_test_precision',
    'mean_test_recall',
    'features'
]

In [19]:
top_100 = performance[select_cols].sort_values('mean_test_neg_brier_score', ascending=False).head(100)

In [20]:
for col in hypers:
    print(top_100[col].value_counts())

1.0    72
0.7     8
Name: param_subsample, dtype: int64
8     31
4     31
2     28
16     5
32     5
Name: param_max_depth, dtype: int64
20    36
12    26
16    18
48    12
24     8
Name: param_min_child_weight, dtype: int64
1.0    25
0.0    23
1.5    17
0.5    15
Name: param_reg_lambda, dtype: int64
8.0     62
1.0     12
10.0     8
5.0      6
0.5      6
0.0      6
Name: param_gamma, dtype: int64
0.12    24
0.01    24
0.08    24
0.40    14
0.05    12
0.10     2
Name: param_eta, dtype: int64
no_credit    100
Name: features, dtype: int64


In [21]:
# Choose by hand the most effective hyperparameters for the model
best_hypers = {
    'param_max_depth': 4,
    'param_min_child_weight': 16,
    'param_reg_lambda': 0,
    'param_gamma': 10,
    'param_eta': 0.10,
    'param_subsample': 0.7,
}

# Rebuild the Best Model

In [22]:
# Import data
data = pd.read_pickle('output_data/02_data.pkl')
data_dict = pd.read_pickle('output_data/02_data_dict.pkl')

X_train = pd.read_pickle('output_data/04_X_train.pkl')
X_test = pd.read_pickle('output_data/04_X_test.pkl')

y_train = pd.read_pickle('output_data/04_y_train.pkl')
y_test = pd.read_pickle('output_data/04_y_test.pkl')