In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import pickle
import time

import random
from sklearn.ensemble import HistGradientBoostingRegressor, RandomForestRegressor
from sklearn.metrics import confusion_matrix
from sklearn.metrics import mean_absolute_error as mae 

In [2]:
#Load (unpickle) the dictionary of DataFrames from the file
with open('../Data/test_train_datasets.pkl', 'rb') as f:
    sample_dfs = pickle.load(f)  # 'rb' for reading in binary mode

sample_dfs.keys()

dict_keys(['metro_samp_1', 'metro_samp_1_train', 'metro_samp_1_test', 'metro_samp_2', 'metro_samp_2_train', 'metro_samp_2_test', 'metro_samp_3', 'metro_samp_3_train', 'metro_samp_3_test', 'metro_samp_val', 'metro_samp_val_train', 'metro_samp_val_test', 'metro_samp_1_train_normalized', 'metro_samp_1_test_normalized', 'metro_samp_2_train_normalized', 'metro_samp_2_test_normalized', 'metro_samp_3_train_normalized', 'metro_samp_3_test_normalized', 'metro_samp_val_train_normalized', 'metro_samp_val_test_normalized'])

In [3]:
X_cols = [col for col in sample_dfs['metro_samp_val_train_normalized'].columns if "frwd" not in col]
y_cols = [col for col in sample_dfs['metro_samp_val_train_normalized'].columns if "frwd" in col]

In [4]:
#set folds for cross validation testing
cv_folds = pd.DataFrame({'fold':['samp_1','samp_2','samp_3'],
                         'train_df':['metro_samp_1_train_normalized','metro_samp_2_train_normalized','metro_samp_3_train_normalized'],
                         'test_df':['metro_samp_1_test_normalized','metro_samp_2_test_normalized','metro_samp_3_test_normalized']})

val = pd.DataFrame({'fold':['val'],
                    'train_df':['metro_samp_val_train_normalized'],
                    'test_df':['metro_samp_val_test_normalized']})

In [5]:
#create function to produce confusion matrix from test results
def conf_matrix(test_results):
    pos = test_results[test_results.y_test < 0]
    true_pos = pos[pos.y_pred < 0]
    false_neg = pos[pos.y_pred >= 0]

    neg = test_results[test_results.y_test >= 0]
    true_neg = neg[neg.y_pred >= 0]
    false_pos = neg[neg.y_pred < 0]

    confusion_matrix = np.array([[len(true_pos), len(false_pos)],
                                [len(false_neg), len(true_neg)]])

    confusion_matrix_pct = confusion_matrix / len(test_results)
    
    return(confusion_matrix, confusion_matrix_pct)

In [6]:
#create class to generate key statistics from confusion matrix
class confusion_matrix_class:
    def __init__(self, cm):
        self.cm = cm
        self.tp = cm[0,0]
        self.fp = cm[0,1]
        self.fn = cm[1,0]
        self.tn = cm[1,1]
        self.accuracy = ((self.tp+self.tn)/(self.tp+self.fp+self.fn+self.tn))
        self.precision = (self.tp/(self.tp+self.fp))
        self.recall = (self.tp/(self.tp+self.fn))
        self.F1 = (2*self.precision*self.recall)/(self.precision+self.recall)
        self.specificity = (self.tn/(self.tn+self.fp))

In [7]:
compare_df = pd.DataFrame()

#create function to produce consistent results for comparison across models
def create_results_record(model, cv_fold, y_var, dataset, cm, error):                
    new_record_dict = {'model': model,
                      'cv_fold': cv_fold,
                      'y': y_var,
                      'dataset': dataset,
                      'mae': error,
                      'accuracy': cm.accuracy,
                      'precision': cm.precision,
                      'recall': cm.recall,
                      'F1': cm.F1,
                      'specificity': cm.specificity,
                      'tp': cm.tp,
                      'fp': cm.fp,
                      'fn': cm.fn,
                      'tn': cm.tn}
    return(new_record_dict)

In [8]:
HGBR_results = pd.DataFrame()

#create function to test Gradient Boosting model
def HGBR(cv_fold, 
         X_vars=X_cols,
         y_var='frwd01_mon_metro_hvi_pct_chg',
         learning_rate=.1,
         min_samples_leaf=20,
         max_depth=None,
         seed=42, 
         HGBR_results_df=HGBR_results):

    fold = cv_fold['fold']
    train_df = sample_dfs[cv_fold['train_df']]
    test_df = sample_dfs[cv_fold['test_df']]
    
    #define training set
    train_df = train_df.dropna(subset=[y_var])
    X = train_df[X_vars[7:]]
    y = train_df[y_var]

    #train and score model with training data
    hgbr = HistGradientBoostingRegressor(learning_rate=learning_rate,
                                         min_samples_leaf=min_samples_leaf,
                                         max_depth=max_depth,
                                         random_state=seed)
    hgbr.fit(X, y)
    train_score = hgbr.score(X, y)

    #predict and score with test data
    test_df = test_df.dropna(subset=[y_var])
    X_test = test_df[X_vars[7:]]
    y_test = test_df[y_var]
    test_score = hgbr.score(X_test, y_test)
    y_pred = hgbr.predict(X_test)
    error = mae(y_test, y_pred)
    
    #create categorization of prediction
    results_index = test_df[X_vars[:7]].reset_index()
    results = pd.DataFrame(zip(y_test,y_pred), columns=['y_test','y_pred']).reset_index()
    test_results = pd.concat([results_index, results], axis=1)
    test_results['y_diff'] = test_results['y_pred'] - test_results['y_test']
    test_results['direction'] = ((test_results['y_test']*test_results['y_pred'])/
                                  abs(test_results['y_test']*test_results['y_pred']))
    test_results['direction'] = np.where(test_results['direction'] == 1, "same", "diff")
    cm, cm_pct = conf_matrix(test_results)
    hgbr_cm = confusion_matrix_class(cm)

    #add record to test results df
    new_record = create_results_record(model='HGBR', 
                                       cv_fold=fold, 
                                       y_var=y_var, 
                                       dataset='test', 
                                       cm=hgbr_cm,
                                       error=error)
                  
    HGBR_results_df = pd.concat([HGBR_results_df, pd.DataFrame([new_record])], ignore_index=True).drop_duplicates()
    HGBR_results_df[['tp','fp','fn','tn']] = HGBR_results_df[['tp','fp','fn','tn']].astype(int)
    
    return(HGBR_results_df, test_results)

In [9]:
y_focus = [col for col in y_cols if "mon_metro_hvi_pct_chg" in col]

In [10]:
for y in y_focus:

    for i in range(len(cv_folds)):

        cv_fold = cv_folds.loc[i]
        HGBR_results, test_results = HGBR(cv_fold,
                                          y_var=y,
                                          HGBR_results_df=HGBR_results)

In [11]:
mean_df = HGBR_results.groupby(['model','y','dataset'])[HGBR_results.select_dtypes(include=np.number).columns.tolist()].mean().reset_index()
mean_df['cv_fold'] = 'mean'

HGBR_results = pd.concat([HGBR_results, mean_df], ignore_index=True)
HGBR_results = HGBR_results.sort_values(['y','cv_fold']).drop_duplicates(subset=['cv_fold','y','dataset']).reset_index().drop(columns='index')
HGBR_results.sort_values(by=['recall'], ascending=False)

Unnamed: 0,model,cv_fold,y,dataset,mae,accuracy,precision,recall,F1,specificity,tp,fp,fn,tn
3,HGBR,samp_3,frwd01_mon_metro_hvi_pct_chg,test,0.002582,0.807801,0.436269,0.756462,0.553387,0.817392,3190.0,4122.0,1027.0,18451.0
1,HGBR,samp_1,frwd01_mon_metro_hvi_pct_chg,test,0.002732,0.766323,0.522402,0.714474,0.603525,0.783507,2705.0,2473.0,1081.0,8950.0
7,HGBR,samp_3,frwd02_mon_metro_hvi_pct_chg,test,0.005648,0.818029,0.425949,0.697935,0.529031,0.838632,2738.0,3690.0,1185.0,19177.0
0,HGBR,mean,frwd01_mon_metro_hvi_pct_chg,test,0.002453,0.81142,0.505487,0.696449,0.58115,0.835741,2473.333333,2601.333333,1016.333333,13032.666667
2,HGBR,samp_2,frwd01_mon_metro_hvi_pct_chg,test,0.002045,0.860135,0.557791,0.61841,0.586538,0.906323,1525.0,1209.0,941.0,11697.0
4,HGBR,mean,frwd02_mon_metro_hvi_pct_chg,test,0.005144,0.828043,0.534455,0.566441,0.536306,0.889929,1910.333333,1914.333333,1410.666667,13888.333333
6,HGBR,samp_2,frwd02_mon_metro_hvi_pct_chg,test,0.004582,0.866706,0.55855,0.522382,0.539861,0.92732,1202.0,950.0,1099.0,12121.0
11,HGBR,samp_3,frwd03_mon_metro_hvi_pct_chg,test,0.008235,0.861217,0.496746,0.516653,0.506504,0.916309,1908.0,1933.0,1785.0,21164.0
5,HGBR,samp_1,frwd02_mon_metro_hvi_pct_chg,test,0.005202,0.799395,0.618867,0.479005,0.540027,0.903836,1791.0,1103.0,1948.0,10367.0
15,HGBR,samp_3,frwd04_mon_metro_hvi_pct_chg,test,0.01067,0.870698,0.493141,0.419977,0.453628,0.936746,1438.0,1478.0,1986.0,21888.0


Will focus on predicting whether price will increase or decrease in the following month

# Tuning

Set y variable for focus of analysis

In [12]:
y_var = 'frwd01_mon_metro_hvi_pct_chg'

Identify variations of features to use for tuning

In [13]:
#set X features (all pct change lag from 1 month, selective other pct lag, clusters, and normalized amounts for current period)
X_cols_1 = (X_cols[:7]+
            X_cols[X_cols.index('metro_for_sale_inventory_normalized'):
                   X_cols.index('prev01_mon_metro_for_sale_inventory')]+
            X_cols[X_cols.index('prev01_mon_metro_for_sale_inventory_pct_chg'):
                   X_cols.index('prev01_mon_state_job_openings_szn_adjd_pct_chg')]+
            ['prev04_mon_metro_hvi']+
            ['prev04_mon_metro_rent']+
            ['prev04_mon_state_job_openings_pct_chg']+
            ['prev04_mon_state_population_pct_chg']+
            ['prev04_mon_state_personal_income_pct_chg']+
            ['prev07_mon_metro_hvi']+
            ['prev07_mon_metro_rent']+
            ['prev07_mon_state_job_openings_pct_chg']+
            ['prev07_mon_state_population_pct_chg']+
            ['prev07_mon_state_personal_income_pct_chg']+
            ['prev12_mon_metro_hvi']+
            ['prev12_mon_metro_rent']+
            ['prev12_mon_state_job_openings_pct_chg']+
            ['prev12_mon_state_population_pct_chg']+
            ['prev12_mon_state_personal_income_pct_chg']+
            X_cols[-5:])
X_cols_1.remove('state_job_openings_szn_adjd_normalized')
X_cols_1.remove('state_personal_income_per_capita_normalized')
X_cols_1.remove('prev01_mon_state_personal_income_per_capita_pct_chg')

In [14]:
#set X features (all pct change lag and normalized amounts for current period)
X_cols_2 = (X_cols[:7]+
            X_cols[X_cols.index('metro_for_sale_inventory_normalized'):
                   X_cols.index('prev01_mon_metro_for_sale_inventory')]+
            X_cols[X_cols.index('prev01_mon_metro_for_sale_inventory_pct_chg'):
                   X_cols.index('prev01_mon_state_job_openings_szn_adjd_pct_chg')]+
            X_cols[X_cols.index('prev04_mon_metro_for_sale_inventory_pct_chg'):
                   X_cols.index('prev04_mon_state_job_openings_szn_adjd_pct_chg')]+
            X_cols[X_cols.index('prev07_mon_metro_for_sale_inventory_pct_chg'):
                   X_cols.index('prev07_mon_state_job_openings_szn_adjd_pct_chg')]+
            X_cols[X_cols.index('prev12_mon_metro_for_sale_inventory_pct_chg'):
                   X_cols.index('prev12_mon_state_job_openings_szn_adjd_pct_chg')])

*****
**Baseline model**
*****
Create baseline model (prediction constant as average % change for each metro area)

In [15]:
start_time = time.time()
baseline_results_df = pd.DataFrame()

for i in range(len(cv_folds)):
    #set datasets for cross validation fold
    cv_fold = cv_folds.loc[i]
    fold = cv_fold['fold']
    train_df = sample_dfs[cv_fold['train_df']].dropna(subset=y_var)
    test_df = sample_dfs[cv_fold['test_df']].dropna(subset=y_var)

    #train model
    baseline_y_pred = train_df.groupby('metro_id')[y_var].agg(['mean','median']).reset_index()
    train_df = train_df.merge(baseline_y_pred, on='metro_id')

    #use mean for baseline
    train_df['y_true'] = np.where(train_df[y_var] >= 0, 0, 1)
    train_df['y_pred'] = np.where(train_df['mean'] >= 0, 0, 1)
    baseline_cm = confusion_matrix_class(confusion_matrix(train_df['y_true'], train_df['y_pred']))
    train_error = mae(train_df[y_var],train_df['mean'])

    new_record = create_results_record(model="baseline_mean", 
                                       cv_fold=fold,
                                       y_var=y_var, 
                                       dataset="train", 
                                       cm=baseline_cm,
                                       error=train_error)
    baseline_results_df = pd.concat([baseline_results_df, pd.DataFrame([new_record])], ignore_index=True)

    #test model
    test_df = test_df.merge(baseline_y_pred, on='metro_id')
    test_df['y_true'] = np.where(test_df[y_var] >= 0, 1, 0)
    test_df['y_pred'] = np.where(test_df['mean'] >= 0, 1, 0)
    baseline_cm = confusion_matrix_class(confusion_matrix(test_df['y_true'], test_df['y_pred']))
    test_error = mae(test_df[y_var],test_df['mean'])

    new_record = create_results_record(model="baseline_mean", 
                                       cv_fold=fold,
                                       y_var=y_var, 
                                       dataset="test", 
                                       cm=baseline_cm,
                                       error=test_error)
    baseline_results_df = pd.concat([baseline_results_df, pd.DataFrame([new_record])], ignore_index=True)
    
    
mean_df = baseline_results_df.groupby(['model','y','dataset'])[baseline_results_df.select_dtypes(include=np.number).columns.tolist()].mean().reset_index()
mean_df['cv_fold'] = 'mean'

baseline_results_df = pd.concat([baseline_results_df, mean_df], ignore_index=True)
baseline_results_df = baseline_results_df.sort_values(['dataset','y','cv_fold']).drop_duplicates(subset=['cv_fold','y','dataset']).reset_index().drop(columns='index')

compare_df = pd.concat([compare_df, 
                        baseline_results_df[(baseline_results_df['cv_fold'] == 'mean') & 
                                            (baseline_results_df['dataset'] == 'test')]],
                       axis=0)
compare_df[['tp','fp','fn','tn']] = compare_df[['tp','fp','fn','tn']].astype(int)
compare_df = compare_df.drop_duplicates()

end_time = time.time()
execution_duration = end_time - start_time
print(f"Execution time: {execution_duration:.2f} seconds")

baseline_results_df

Execution time: 1.02 seconds


Unnamed: 0,model,cv_fold,y,dataset,mae,accuracy,precision,recall,F1,specificity,tp,fp,fn,tn
0,baseline_mean,mean,frwd01_mon_metro_hvi_pct_chg,test,0.004159,0.724844,0.218602,0.241648,0.228088,0.821684,758.333333,2731.333333,2307.666667,13326.333333
1,baseline_mean,samp_1,frwd01_mon_metro_hvi_pct_chg,test,0.004305,0.653758,0.284733,0.29648,0.290488,0.766007,1078.0,2708.0,2558.0,8865.0
2,baseline_mean,samp_2,frwd01_mon_metro_hvi_pct_chg,test,0.004216,0.738355,0.210057,0.199846,0.204824,0.847574,518.0,1948.0,2074.0,10832.0
3,baseline_mean,samp_3,frwd01_mon_metro_hvi_pct_chg,test,0.003955,0.782419,0.161015,0.22862,0.188952,0.851469,679.0,3538.0,2291.0,20282.0
4,baseline_mean,mean,frwd01_mon_metro_hvi_pct_chg,train,0.00459,0.660696,0.933031,0.670141,0.780023,0.56791,68030.333333,4694.333333,33309.0,6267.0
5,baseline_mean,samp_1,frwd01_mon_metro_hvi_pct_chg,train,0.004681,0.653099,0.913555,0.661673,0.767476,0.598055,55472.0,5249.0,28364.0,7810.0
6,baseline_mean,samp_2,frwd01_mon_metro_hvi_pct_chg,train,0.00459,0.656388,0.929178,0.66738,0.776814,0.56167,67095.0,5114.0,33440.0,6553.0
7,baseline_mean,samp_3,frwd01_mon_metro_hvi_pct_chg,train,0.0045,0.672603,0.956361,0.681371,0.795779,0.544006,81524.0,3720.0,38123.0,4438.0


*****
**Historical Gradient Boosting Model (HGBM)**
*****
Tune historical gradient boosting model and assess results 

In [16]:
#set parameter ranges for tuning HGBR model
random.seed(42)
learning_rate_rdm = np.array(random.choices(range(0,15),k=10)) / 100 + .01
min_samples_leaf_rdm = np.array(random.choices(range(100,201),k=10))
max_depth_rdm = np.array(random.choices(range(5,21),k=10))

params_df = pd.DataFrame({'learning_rate':learning_rate_rdm,
                          'min_samples_leaf':min_samples_leaf_rdm,
                          'max_depth':max_depth_rdm})

In [17]:
#test with all X columns
start_time = time.time()
HGBR_cv_results = pd.DataFrame(columns=['idx'])
X_vars = X_cols
m_name = 'HGBR|x0|'

for idx in range(10):
    for i in range(len(cv_folds)):

        cv_fold = cv_folds.loc[i]
        HGBR_cv_results, test_results = HGBR(cv_fold,
                                             X_vars=X_vars,
                                             y_var=y_var,
                                             learning_rate=learning_rate_rdm[idx],
                                             min_samples_leaf=min_samples_leaf_rdm[idx],
                                             max_depth=max_depth_rdm[idx],
                                             HGBR_results_df=HGBR_cv_results)
    
    HGBR_cv_results['idx'] = HGBR_cv_results['idx'].fillna(str(idx).zfill(2))
    mean_df = HGBR_cv_results.groupby(['model','y','dataset','idx'])[HGBR_cv_results.select_dtypes(include=np.number).columns.tolist()].mean().reset_index()
    mean_df['cv_fold'] = 'mean'
    

    HGBR_cv_results = pd.concat([HGBR_cv_results, mean_df], ignore_index=True)
    HGBR_cv_results = HGBR_cv_results.sort_values(['y','cv_fold','dataset']).drop_duplicates(subset=['cv_fold','y','idx','dataset']).reset_index().drop(columns='index')


cv_results = pd.concat([params_df,HGBR_cv_results[HGBR_cv_results.cv_fold == 'mean']], axis=1).drop(columns=['idx'])
cv_results['model'] = m_name+cv_results['learning_rate'].round(2).astype(str)+"-"+cv_results['min_samples_leaf'].astype(str)+"-"+cv_results['max_depth'].astype(str)
cv_results[['tp','fp','fn','tn']] = cv_results[['tp','fp','fn','tn']].astype(int)

compare_df = pd.concat([compare_df, 
                        cv_results.drop(columns=['learning_rate','min_samples_leaf','max_depth'])],
                       axis=0).drop_duplicates()

end_time = time.time()
execution_duration = end_time - start_time
print(f"Execution time: {execution_duration:.2f} seconds")

cv_results.sort_values('recall', ascending=False)

Execution time: 409.10 seconds


Unnamed: 0,learning_rate,min_samples_leaf,max_depth,model,cv_fold,y,dataset,mae,accuracy,precision,recall,F1,specificity,tp,fp,fn,tn
7,0.02,159,6,HGBR|x0|0.02-159-6,mean,frwd01_mon_metro_hvi_pct_chg,test,0.002475,0.794638,0.473873,0.734303,0.572662,0.808568,2556,2796,933,12838
8,0.07,181,18,HGBR|x0|0.07-181-18,mean,frwd01_mon_metro_hvi_pct_chg,test,0.002432,0.800359,0.477358,0.733019,0.576322,0.815277,2580,2899,909,12734
3,0.04,120,7,HGBR|x0|0.04-120-7,mean,frwd01_mon_metro_hvi_pct_chg,test,0.002461,0.803003,0.481359,0.721689,0.577099,0.82029,2534,2720,955,12913
2,0.05,102,10,HGBR|x0|0.05-102-10,mean,frwd01_mon_metro_hvi_pct_chg,test,0.002409,0.817962,0.511574,0.71594,0.595756,0.83988,2519,2456,970,13178
5,0.11,155,10,HGBR|x0|0.11-155-10,mean,frwd01_mon_metro_hvi_pct_chg,test,0.002412,0.814739,0.512255,0.710716,0.589789,0.837219,2525,2621,964,13013
4,0.12,165,20,HGBR|x0|0.12-165-20,mean,frwd01_mon_metro_hvi_pct_chg,test,0.00238,0.82019,0.521712,0.698989,0.592076,0.845946,2494,2446,995,13187
0,0.1,122,17,HGBR|x0|0.1-122-17,mean,frwd01_mon_metro_hvi_pct_chg,test,0.002381,0.8187,0.514717,0.69368,0.588103,0.84558,2463,2413,1026,13220
6,0.14,122,6,HGBR|x0|0.14-122-6,mean,frwd01_mon_metro_hvi_pct_chg,test,0.002373,0.833986,0.547755,0.693563,0.610869,0.864878,2456,2050,1033,13584
1,0.01,151,16,HGBR|x0|0.01-151-16,mean,frwd01_mon_metro_hvi_pct_chg,test,0.002569,0.835024,0.601629,0.599109,0.586291,0.885107,2086,1470,1403,14163
9,0.01,100,14,HGBR|x0|0.01-100-14,mean,frwd01_mon_metro_hvi_pct_chg,test,0.002588,0.830657,0.59521,0.589586,0.576825,0.881427,2055,1509,1434,14125


In [18]:
#test results for each fold
HGBR_cv_results.sort_values(by=['idx','cv_fold'])

Unnamed: 0,idx,model,cv_fold,y,dataset,mae,accuracy,precision,recall,F1,specificity,tp,fp,fn,tn
0,0,HGBR,mean,frwd01_mon_metro_hvi_pct_chg,test,0.002381,0.8187,0.514717,0.69368,0.588103,0.84558,2463.0,2413.0,1026.0,13220.0
10,0,HGBR,samp_1,frwd01_mon_metro_hvi_pct_chg,test,0.002517,0.777171,0.539803,0.711041,0.613701,0.79909,2692.0,2295.0,1094.0,9128.0
20,0,HGBR,samp_2,frwd01_mon_metro_hvi_pct_chg,test,0.00212,0.853695,0.538434,0.616383,0.574778,0.899039,1520.0,1303.0,946.0,11603.0
30,0,HGBR,samp_3,frwd01_mon_metro_hvi_pct_chg,test,0.002506,0.825233,0.465914,0.753616,0.575829,0.838613,3178.0,3643.0,1039.0,18930.0
1,1,HGBR,mean,frwd01_mon_metro_hvi_pct_chg,test,0.002569,0.835024,0.601629,0.599109,0.586291,0.885107,2086.0,1470.0,1403.0,14163.0
11,1,HGBR,samp_1,frwd01_mon_metro_hvi_pct_chg,test,0.002576,0.782103,0.5518,0.664025,0.602733,0.821238,2514.0,2042.0,1272.0,9381.0
21,1,HGBR,samp_2,frwd01_mon_metro_hvi_pct_chg,test,0.002564,0.814793,0.442115,0.590024,0.505472,0.857741,1455.0,1836.0,1011.0,11070.0
31,1,HGBR,samp_3,frwd01_mon_metro_hvi_pct_chg,test,0.002569,0.908175,0.810973,0.543277,0.650667,0.976343,2291.0,534.0,1926.0,22039.0
2,2,HGBR,mean,frwd01_mon_metro_hvi_pct_chg,test,0.002409,0.817962,0.511574,0.71594,0.595756,0.83988,2519.0,2456.0,970.0,13178.0
12,2,HGBR,samp_1,frwd01_mon_metro_hvi_pct_chg,test,0.002533,0.771911,0.530975,0.717644,0.610356,0.789898,2717.0,2400.0,1069.0,9023.0


In [19]:
#test with first subset of X columns
start_time = time.time()
HGBR_cv_results = pd.DataFrame(columns=['idx'])
X_vars = X_cols_1
m_name = 'HGBR|x1|'

for idx in range(10):
    for i in range(len(cv_folds)):

        cv_fold = cv_folds.loc[i]
        HGBR_cv_results, test_results = HGBR(cv_fold,
                                             X_vars=X_vars,
                                             y_var=y_var,
                                             learning_rate=learning_rate_rdm[idx],
                                             min_samples_leaf=min_samples_leaf_rdm[idx],
                                             max_depth=max_depth_rdm[idx],
                                             HGBR_results_df=HGBR_cv_results)
    
    HGBR_cv_results['idx'] = HGBR_cv_results['idx'].fillna(str(idx).zfill(2))
    mean_df = HGBR_cv_results.groupby(['model','y','dataset','idx'])[HGBR_cv_results.select_dtypes(include=np.number).columns.tolist()].mean().reset_index()
    mean_df['cv_fold'] = 'mean'
    

    HGBR_cv_results = pd.concat([HGBR_cv_results, mean_df], ignore_index=True)
    HGBR_cv_results = HGBR_cv_results.sort_values(['y','cv_fold','dataset']).drop_duplicates(subset=['cv_fold','y','idx','dataset']).reset_index().drop(columns='index')


cv_results = pd.concat([params_df,HGBR_cv_results[HGBR_cv_results.cv_fold == 'mean']], axis=1).drop(columns=['idx'])
cv_results['model'] = m_name+cv_results['learning_rate'].round(2).astype(str)+"-"+cv_results['min_samples_leaf'].astype(str)+"-"+cv_results['max_depth'].astype(str)
cv_results[['tp','fp','fn','tn']] = cv_results[['tp','fp','fn','tn']].astype(int)

compare_df = pd.concat([compare_df, 
                        cv_results.drop(columns=['learning_rate','min_samples_leaf','max_depth'])],
                       axis=0).drop_duplicates()

end_time = time.time()
execution_duration = end_time - start_time
print(f"Execution time: {execution_duration:.2f} seconds")

cv_results.sort_values('recall', ascending=False)

Execution time: 131.20 seconds


Unnamed: 0,learning_rate,min_samples_leaf,max_depth,model,cv_fold,y,dataset,mae,accuracy,precision,recall,F1,specificity,tp,fp,fn,tn
7,0.02,159,6,HGBR|x1|0.02-159-6,mean,frwd01_mon_metro_hvi_pct_chg,test,0.002519,0.790946,0.462651,0.734987,0.5656,0.804238,2562,2965,927,12669
2,0.05,102,10,HGBR|x1|0.05-102-10,mean,frwd01_mon_metro_hvi_pct_chg,test,0.002425,0.821525,0.520265,0.726953,0.606013,0.841318,2553,2385,936,13248
8,0.07,181,18,HGBR|x1|0.07-181-18,mean,frwd01_mon_metro_hvi_pct_chg,test,0.002492,0.814111,0.502865,0.724706,0.593537,0.833376,2541,2478,948,13156
3,0.04,120,7,HGBR|x1|0.04-120-7,mean,frwd01_mon_metro_hvi_pct_chg,test,0.002459,0.818082,0.513262,0.722239,0.599951,0.838714,2525,2360,964,13273
4,0.12,165,20,HGBR|x1|0.12-165-20,mean,frwd01_mon_metro_hvi_pct_chg,test,0.002403,0.821003,0.516059,0.705228,0.595268,0.846788,2486,2334,1003,13299
0,0.1,122,17,HGBR|x1|0.1-122-17,mean,frwd01_mon_metro_hvi_pct_chg,test,0.002426,0.821623,0.521923,0.703413,0.598962,0.847129,2472,2273,1017,13361
5,0.11,155,10,HGBR|x1|0.11-155-10,mean,frwd01_mon_metro_hvi_pct_chg,test,0.002415,0.830279,0.540665,0.701046,0.609433,0.858561,2467,2150,1022,13484
6,0.14,122,6,HGBR|x1|0.14-122-6,mean,frwd01_mon_metro_hvi_pct_chg,test,0.00236,0.833591,0.558164,0.676151,0.609323,0.866282,2398,1946,1091,13687
1,0.01,151,16,HGBR|x1|0.01-151-16,mean,frwd01_mon_metro_hvi_pct_chg,test,0.002602,0.833657,0.595026,0.61087,0.588151,0.880401,2123,1536,1366,14097
9,0.01,100,14,HGBR|x1|0.01-100-14,mean,frwd01_mon_metro_hvi_pct_chg,test,0.002621,0.831488,0.593792,0.605165,0.58388,0.878639,2105,1549,1384,14085


In [20]:
#test results for each fold
HGBR_cv_results.sort_values(by=['idx','cv_fold'])

Unnamed: 0,idx,model,cv_fold,y,dataset,mae,accuracy,precision,recall,F1,specificity,tp,fp,fn,tn
0,0,HGBR,mean,frwd01_mon_metro_hvi_pct_chg,test,0.002426,0.821623,0.521923,0.703413,0.598962,0.847129,2472.0,2273.0,1017.0,13361.0
10,0,HGBR,samp_1,frwd01_mon_metro_hvi_pct_chg,test,0.002643,0.760997,0.514766,0.695193,0.591527,0.782807,2632.0,2481.0,1154.0,8942.0
20,0,HGBR,samp_2,frwd01_mon_metro_hvi_pct_chg,test,0.00222,0.851353,0.528739,0.675182,0.593054,0.885015,1665.0,1484.0,801.0,11422.0
30,0,HGBR,samp_3,frwd01_mon_metro_hvi_pct_chg,test,0.002416,0.85252,0.522263,0.739862,0.612305,0.873566,3120.0,2854.0,1097.0,19719.0
1,1,HGBR,mean,frwd01_mon_metro_hvi_pct_chg,test,0.002602,0.833657,0.595026,0.61087,0.588151,0.880401,2123.0,1536.0,1366.0,14097.0
11,1,HGBR,samp_1,frwd01_mon_metro_hvi_pct_chg,test,0.002596,0.78276,0.55143,0.682515,0.610009,0.815985,2584.0,2102.0,1202.0,9321.0
21,1,HGBR,samp_2,frwd01_mon_metro_hvi_pct_chg,test,0.002592,0.812126,0.438232,0.607056,0.509011,0.851309,1497.0,1919.0,969.0,10987.0
31,1,HGBR,samp_3,frwd01_mon_metro_hvi_pct_chg,test,0.002619,0.906084,0.795415,0.54304,0.645434,0.973907,2290.0,589.0,1927.0,21984.0
2,2,HGBR,mean,frwd01_mon_metro_hvi_pct_chg,test,0.002425,0.821525,0.520265,0.726953,0.606013,0.841318,2553.0,2385.0,936.0,13248.0
12,2,HGBR,samp_1,frwd01_mon_metro_hvi_pct_chg,test,0.002619,0.765336,0.520553,0.725832,0.606288,0.778429,2748.0,2531.0,1038.0,8892.0


In [21]:
#test with second subset of X columns
start_time = time.time()
HGBR_cv_results = pd.DataFrame(columns=['idx'])
X_vars = X_cols_2
m_name = 'HGBR|x2|'

for idx in range(10):
    for i in range(len(cv_folds)):

        cv_fold = cv_folds.loc[i]
        HGBR_cv_results, test_results = HGBR(cv_fold,
                                             X_vars=X_vars,
                                             y_var=y_var,
                                             learning_rate=learning_rate_rdm[idx],
                                             min_samples_leaf=min_samples_leaf_rdm[idx],
                                             max_depth=max_depth_rdm[idx],
                                             HGBR_results_df=HGBR_cv_results)
    
    HGBR_cv_results['idx'] = HGBR_cv_results['idx'].fillna(str(idx).zfill(2))
    mean_df = HGBR_cv_results.groupby(['model','y','dataset','idx'])[HGBR_cv_results.select_dtypes(include=np.number).columns.tolist()].mean().reset_index()
    mean_df['cv_fold'] = 'mean'
    

    HGBR_cv_results = pd.concat([HGBR_cv_results, mean_df], ignore_index=True)
    HGBR_cv_results = HGBR_cv_results.sort_values(['y','cv_fold','dataset']).drop_duplicates(subset=['cv_fold','y','idx','dataset']).reset_index().drop(columns='index')


cv_results = pd.concat([params_df,HGBR_cv_results[HGBR_cv_results.cv_fold == 'mean']], axis=1).drop(columns=['idx'])
cv_results['model'] = m_name+cv_results['learning_rate'].round(2).astype(str)+"-"+cv_results['min_samples_leaf'].astype(str)+"-"+cv_results['max_depth'].astype(str)
cv_results[['tp','fp','fn','tn']] = cv_results[['tp','fp','fn','tn']].astype(int)

compare_df = pd.concat([compare_df, 
                        cv_results.drop(columns=['learning_rate','min_samples_leaf','max_depth'])],
                       axis=0).drop_duplicates()

end_time = time.time()
execution_duration = end_time - start_time
print(f"Execution time: {execution_duration:.2f} seconds")

cv_results.sort_values('recall', ascending=False)

Execution time: 188.52 seconds


Unnamed: 0,learning_rate,min_samples_leaf,max_depth,model,cv_fold,y,dataset,mae,accuracy,precision,recall,F1,specificity,tp,fp,fn,tn
7,0.02,159,6,HGBR|x2|0.02-159-6,mean,frwd01_mon_metro_hvi_pct_chg,test,0.002478,0.79402,0.472557,0.734321,0.571796,0.807743,2558,2812,931,12822
3,0.04,120,7,HGBR|x2|0.04-120-7,mean,frwd01_mon_metro_hvi_pct_chg,test,0.00246,0.799319,0.474738,0.732812,0.575205,0.813957,2571,2851,918,12782
8,0.07,181,18,HGBR|x2|0.07-181-18,mean,frwd01_mon_metro_hvi_pct_chg,test,0.00248,0.785902,0.454628,0.730101,0.557348,0.798742,2569,3213,920,12420
2,0.05,102,10,HGBR|x2|0.05-102-10,mean,frwd01_mon_metro_hvi_pct_chg,test,0.002374,0.819253,0.515471,0.715767,0.597638,0.841521,2522,2457,967,13177
5,0.11,155,10,HGBR|x2|0.11-155-10,mean,frwd01_mon_metro_hvi_pct_chg,test,0.002431,0.817366,0.518918,0.701974,0.592046,0.842471,2488,2490,1001,13143
0,0.1,122,17,HGBR|x2|0.1-122-17,mean,frwd01_mon_metro_hvi_pct_chg,test,0.00242,0.810436,0.499407,0.697548,0.577935,0.834576,2479,2643,1010,12990
6,0.14,122,6,HGBR|x2|0.14-122-6,mean,frwd01_mon_metro_hvi_pct_chg,test,0.002354,0.830881,0.538165,0.691659,0.604852,0.861594,2449,2077,1040,13556
4,0.12,165,20,HGBR|x2|0.12-165-20,mean,frwd01_mon_metro_hvi_pct_chg,test,0.002395,0.816215,0.520806,0.688622,0.585437,0.843792,2447,2550,1042,13083
1,0.01,151,16,HGBR|x2|0.01-151-16,mean,frwd01_mon_metro_hvi_pct_chg,test,0.002569,0.835732,0.603402,0.598434,0.58693,0.886122,2084,1457,1405,14176
9,0.01,100,14,HGBR|x2|0.01-100-14,mean,frwd01_mon_metro_hvi_pct_chg,test,0.00259,0.832526,0.599066,0.591531,0.580178,0.883339,2060,1485,1429,14149


In [22]:
#test results for each fold
HGBR_cv_results.sort_values(by=['idx','cv_fold'])

Unnamed: 0,idx,model,cv_fold,y,dataset,mae,accuracy,precision,recall,F1,specificity,tp,fp,fn,tn
0,0,HGBR,mean,frwd01_mon_metro_hvi_pct_chg,test,0.00242,0.810436,0.499407,0.697548,0.577935,0.834576,2479.0,2643.0,1010.0,12990.0
10,0,HGBR,samp_1,frwd01_mon_metro_hvi_pct_chg,test,0.002494,0.778486,0.541279,0.722134,0.618762,0.797164,2734.0,2317.0,1052.0,9106.0
20,0,HGBR,samp_2,frwd01_mon_metro_hvi_pct_chg,test,0.002139,0.850507,0.52933,0.614761,0.568856,0.895552,1516.0,1348.0,950.0,11558.0
30,0,HGBR,samp_3,frwd01_mon_metro_hvi_pct_chg,test,0.002626,0.802314,0.427613,0.755751,0.546187,0.811013,3187.0,4266.0,1030.0,18307.0
1,1,HGBR,mean,frwd01_mon_metro_hvi_pct_chg,test,0.002569,0.835732,0.603402,0.598434,0.58693,0.886122,2084.0,1457.0,1405.0,14176.0
11,1,HGBR,samp_1,frwd01_mon_metro_hvi_pct_chg,test,0.002574,0.78276,0.553037,0.663761,0.603361,0.822201,2513.0,2031.0,1273.0,9392.0
21,1,HGBR,samp_2,frwd01_mon_metro_hvi_pct_chg,test,0.002559,0.816224,0.445023,0.589213,0.507067,0.8596,1453.0,1812.0,1013.0,11094.0
31,1,HGBR,samp_3,frwd01_mon_metro_hvi_pct_chg,test,0.002573,0.908212,0.812145,0.542329,0.650363,0.976565,2287.0,529.0,1930.0,22044.0
2,2,HGBR,mean,frwd01_mon_metro_hvi_pct_chg,test,0.002374,0.819253,0.515471,0.715767,0.597638,0.841521,2522.0,2457.0,967.0,13177.0
12,2,HGBR,samp_1,frwd01_mon_metro_hvi_pct_chg,test,0.002508,0.774213,0.534578,0.7187,0.613114,0.792611,2721.0,2369.0,1065.0,9054.0


Summary of cross validation results for Historical Gradient Boosting Model

In [23]:
compare_df[['tp','fp','fn','tn']] = compare_df[['tp','fp','fn','tn']].astype(int)
compare_df[compare_df.model.str.contains("HGBR")].sort_values('recall', ascending=False).drop(columns=['y'])

Unnamed: 0,model,cv_fold,dataset,mae,accuracy,precision,recall,F1,specificity,tp,fp,fn,tn
7,HGBR|x1|0.02-159-6,mean,test,0.002519,0.790946,0.462651,0.734987,0.5656,0.804238,2562,2965,927,12669
7,HGBR|x2|0.02-159-6,mean,test,0.002478,0.79402,0.472557,0.734321,0.571796,0.807743,2558,2812,931,12822
7,HGBR|x0|0.02-159-6,mean,test,0.002475,0.794638,0.473873,0.734303,0.572662,0.808568,2556,2796,933,12838
8,HGBR|x0|0.07-181-18,mean,test,0.002432,0.800359,0.477358,0.733019,0.576322,0.815277,2580,2899,909,12734
3,HGBR|x2|0.04-120-7,mean,test,0.00246,0.799319,0.474738,0.732812,0.575205,0.813957,2571,2851,918,12782
8,HGBR|x2|0.07-181-18,mean,test,0.00248,0.785902,0.454628,0.730101,0.557348,0.798742,2569,3213,920,12420
2,HGBR|x1|0.05-102-10,mean,test,0.002425,0.821525,0.520265,0.726953,0.606013,0.841318,2553,2385,936,13248
8,HGBR|x1|0.07-181-18,mean,test,0.002492,0.814111,0.502865,0.724706,0.593537,0.833376,2541,2478,948,13156
3,HGBR|x1|0.04-120-7,mean,test,0.002459,0.818082,0.513262,0.722239,0.599951,0.838714,2525,2360,964,13273
3,HGBR|x0|0.04-120-7,mean,test,0.002461,0.803003,0.481359,0.721689,0.577099,0.82029,2534,2720,955,12913


*****
**Random forest model**
*****
Tune random forest regressor model and assess results 

In [24]:
#create function to generate prediction and assess results from fitted model
def model_predict(model, results_df, df, set_name, pred_cols, y_var=y_var, m_name=m_name, fold=fold):
    X = df[pred_cols]
    
    df['y_pred_pct'] = model.predict(X)
    df['y_true'] = np.where(df[y_var] >= 0, 1, 0)
    df['y_pred'] = np.where(df['y_pred_pct'] >= 0, 1, 0)
    cm = confusion_matrix_class(confusion_matrix(df['y_true'], df['y_pred']))
    error = mae(df[y_var], df['y_pred_pct'])

    new_record = create_results_record(model=m_name, 
                                       cv_fold=fold,
                                       y_var=y_var, 
                                       dataset=set_name, 
                                       cm=cm,
                                       error=error)
    results_df = pd.concat([results_df, pd.DataFrame([new_record])], ignore_index=True)
    
    return(results_df)

In [25]:
#create function to train and test random forest model 
def RF(model_name, X_vars, y_var, cv_folds, RF_results_df, params):
    
    for idx in range(len(params)):
    
        n_est = params['n_estimators'][idx]
        min_samp_lf = params['min_samples_leaf'][idx]
        max_f = params['max_features'][idx]
        m_name = model_name+"|"+str(n_est)+"-"+str(min_samp_lf)+"-"+str(max_f)
        
        for i in range(len(cv_folds)):

            cv_fold = cv_folds.loc[i]
            fold = cv_fold['fold']
            train_df = sample_dfs[cv_fold['train_df']]
            test_df = sample_dfs[cv_fold['test_df']]

            #identify and drop columns with over 75% of data missing in training dataset
            drop_cols = train_df.isna().sum().divide(len(train_df)).sort_values(ascending=False).reset_index().rename(columns={'index':'col',0:'missing'})
            #print(drop_cols.to_string())
            drop_cols['drop'] = np.where(drop_cols['missing'] > .70, "drop", "keep")
            drop_cols = list(drop_cols[drop_cols['drop'] == 'drop'].col)
            train_df = train_df.drop(columns=drop_cols)
            test_df = test_df.drop(columns=drop_cols)
            X_cols_rf = [col for col in X_vars if col not in drop_cols]

            #define training set
            train_df = train_df.dropna()
            X = train_df[X_cols_rf[7:]]
            y = train_df[y_var]

            #define test data
            test_df = test_df.dropna()
            X_test = test_df[X_cols_rf[7:]]
            y_test = test_df[y_var]

            rf_model = RandomForestRegressor(n_estimators=n_est, 
                                             min_samples_leaf=params['min_samples_leaf'][idx],
                                             max_features=params['max_features'][idx],
                                             n_jobs=-1, 
                                             random_state=42)
            rf_model.fit(X, y)
            
            RF_results_df = model_predict(rf_model, RF_results_df, train_df, "train", X_cols_rf[7:], y_var, m_name, fold)
            RF_results_df = model_predict(rf_model, RF_results_df, test_df, "test", X_cols_rf[7:], y_var, m_name, fold)

    return(RF_results_df)

In [26]:
#set parameter ranges for tuning RF model
random.seed(42)

k=10
n_estimators_rdm = np.array(random.choices(range(10,101),k=k))
min_samples_leaf_rdm = np.array(random.choices(range(20,101),k=k))
max_features_rdm = np.array(random.choices(['sqrt','log2',None],k=k))

RF_params_df = pd.DataFrame({'n_estimators':n_estimators_rdm,
                             'min_samples_leaf':min_samples_leaf_rdm,
                             'max_features':max_features_rdm})

In [28]:
#test with all X columns
start_time = time.time()
RF_results_df = pd.DataFrame()

RF_results_df = RF(model_name='RF|x0', X_vars=X_cols, y_var=y_var, cv_folds=cv_folds, RF_results_df=RF_results_df, params=RF_params_df)

mean_df = RF_results_df.groupby(['model','y','dataset'])[RF_results_df.select_dtypes(include=np.number).columns.tolist()].mean().reset_index()
mean_df['cv_fold'] = 'mean'
RF_results_df = pd.concat([RF_results_df, mean_df], ignore_index=True)
RF_results_df = RF_results_df.sort_values(['y','cv_fold','dataset']).drop_duplicates(subset=['cv_fold','y','dataset','model']).reset_index().drop(columns='index')

compare_df = pd.concat([compare_df, 
                        RF_results_df[(RF_results_df['cv_fold'] == 'mean') & 
                                      (RF_results_df['dataset'] == 'test')]],
                       axis=0)

end_time = time.time()
execution_duration = end_time - start_time
print(f"Execution time: {execution_duration:.2f} seconds")

RF_results_df.head(k)

Execution time: 401.45 seconds


Unnamed: 0,model,cv_fold,y,dataset,mae,accuracy,precision,recall,F1,specificity,tp,fp,fn,tn
0,RF|x0|12-20-log2,mean,frwd01_mon_metro_hvi_pct_chg,test,0.002407,0.864504,0.498854,0.702542,0.579672,0.889198,1672.333333,1769.333333,702.0,14865.0
1,RF|x0|12-60-None,mean,frwd01_mon_metro_hvi_pct_chg,test,0.00203,0.865088,0.667235,0.668596,0.661811,0.922164,2313.333333,1128.333333,1270.666667,14296.333333
2,RF|x0|17-67-sqrt,mean,frwd01_mon_metro_hvi_pct_chg,test,0.002319,0.872005,0.517684,0.718851,0.601729,0.894786,1790.0,1651.666667,676.0,14891.0
3,RF|x0|30-36-sqrt,mean,frwd01_mon_metro_hvi_pct_chg,test,0.002188,0.878411,0.54456,0.739178,0.627091,0.899915,1884.333333,1557.333333,654.333333,14912.666667
4,RF|x0|35-22-log2,mean,frwd01_mon_metro_hvi_pct_chg,test,0.002299,0.874514,0.511035,0.733188,0.601541,0.895035,1765.666667,1676.0,628.666667,14938.333333
5,RF|x0|48-85-None,mean,frwd01_mon_metro_hvi_pct_chg,test,0.001962,0.872909,0.676513,0.687774,0.676805,0.92486,2347.333333,1094.333333,1168.0,14399.0
6,RF|x0|68-37-None,mean,frwd01_mon_metro_hvi_pct_chg,test,0.002086,0.857395,0.672716,0.638258,0.650102,0.921777,2337.0,1104.666667,1450.0,14117.0
7,RF|x0|71-64-log2,mean,frwd01_mon_metro_hvi_pct_chg,test,0.00237,0.867645,0.444269,0.748263,0.556616,0.881904,1522.333333,1919.333333,496.0,15071.0
8,RF|x0|77-72-None,mean,frwd01_mon_metro_hvi_pct_chg,test,0.001986,0.868135,0.67034,0.679747,0.668378,0.923176,2328.666667,1113.0,1229.333333,14337.666667
9,RF|x0|91-37-sqrt,mean,frwd01_mon_metro_hvi_pct_chg,test,0.002198,0.880479,0.551304,0.745618,0.633754,0.901621,1909.0,1532.666667,643.666667,14923.333333


In [29]:
RF_results_df.sort_values(by=['dataset','model','cv_fold']).head(k*4)

Unnamed: 0,model,cv_fold,y,dataset,mae,accuracy,precision,recall,F1,specificity,tp,fp,fn,tn
0,RF|x0|12-20-log2,mean,frwd01_mon_metro_hvi_pct_chg,test,0.002407,0.864504,0.498854,0.702542,0.579672,0.889198,1672.333333,1769.333333,702.0,14865.0
29,RF|x0|12-20-log2,samp_1,frwd01_mon_metro_hvi_pct_chg,test,0.002621,0.814173,0.447006,0.697396,0.544809,0.836327,1687.0,2087.0,732.0,10664.0
49,RF|x0|12-20-log2,samp_2,frwd01_mon_metro_hvi_pct_chg,test,0.002274,0.887608,0.590724,0.668816,0.627349,0.923655,1452.0,1006.0,719.0,12171.0
69,RF|x0|12-20-log2,samp_3,frwd01_mon_metro_hvi_pct_chg,test,0.002327,0.891731,0.458832,0.741413,0.566858,0.907612,1878.0,2215.0,655.0,21760.0
1,RF|x0|12-60-None,mean,frwd01_mon_metro_hvi_pct_chg,test,0.00203,0.865088,0.667235,0.668596,0.661811,0.922164,2313.333333,1128.333333,1270.666667,14296.333333
21,RF|x0|12-60-None,samp_1,frwd01_mon_metro_hvi_pct_chg,test,0.002464,0.784443,0.72708,0.550562,0.626627,0.898881,2744.0,1030.0,2240.0,9156.0
41,RF|x0|12-60-None,samp_2,frwd01_mon_metro_hvi_pct_chg,test,0.001797,0.902528,0.624491,0.728178,0.672361,0.930287,1535.0,923.0,573.0,12317.0
61,RF|x0|12-60-None,samp_3,frwd01_mon_metro_hvi_pct_chg,test,0.001829,0.908292,0.650134,0.727049,0.686444,0.937325,2661.0,1432.0,999.0,21416.0
2,RF|x0|17-67-sqrt,mean,frwd01_mon_metro_hvi_pct_chg,test,0.002319,0.872005,0.517684,0.718851,0.601729,0.894786,1790.0,1651.666667,676.0,14891.0
27,RF|x0|17-67-sqrt,samp_1,frwd01_mon_metro_hvi_pct_chg,test,0.002366,0.836981,0.532326,0.739418,0.619011,0.858267,2009.0,1765.0,708.0,10688.0


In [30]:
#test with first subset of X columns
start_time = time.time()
RF_results_df = pd.DataFrame()

RF_results_df = RF(model_name='RF|x1', X_vars=X_cols_1, y_var=y_var, cv_folds=cv_folds, RF_results_df=RF_results_df, params=RF_params_df)

mean_df = RF_results_df.groupby(['model','y','dataset'])[RF_results_df.select_dtypes(include=np.number).columns.tolist()].mean().reset_index()
mean_df['cv_fold'] = 'mean'
RF_results_df = pd.concat([RF_results_df, mean_df], ignore_index=True)
RF_results_df = RF_results_df.sort_values(['y','cv_fold','dataset']).drop_duplicates(subset=['cv_fold','y','dataset','model']).reset_index().drop(columns='index')

compare_df = pd.concat([compare_df, 
                        RF_results_df[(RF_results_df['cv_fold'] == 'mean') & 
                                      (RF_results_df['dataset'] == 'test')]],
                       axis=0)

end_time = time.time()
execution_duration = end_time - start_time
print(f"Execution time: {execution_duration:.2f} seconds")

RF_results_df.head(k)

Execution time: 169.09 seconds


Unnamed: 0,model,cv_fold,y,dataset,mae,accuracy,precision,recall,F1,specificity,tp,fp,fn,tn
0,RF|x1|12-20-log2,mean,frwd01_mon_metro_hvi_pct_chg,test,0.002146,0.877048,0.636085,0.692261,0.661123,0.917367,2212.0,1229.666667,1022.0,14545.0
1,RF|x1|12-60-None,mean,frwd01_mon_metro_hvi_pct_chg,test,0.002191,0.831251,0.707301,0.546618,0.612448,0.925612,2461.0,980.666667,2219.333333,13347.666667
2,RF|x1|17-67-sqrt,mean,frwd01_mon_metro_hvi_pct_chg,test,0.002189,0.879782,0.578141,0.724549,0.641843,0.907181,2017.333333,1424.333333,774.0,14793.0
3,RF|x1|30-36-sqrt,mean,frwd01_mon_metro_hvi_pct_chg,test,0.00214,0.883828,0.61131,0.728866,0.662669,0.912998,2137.0,1304.666667,830.666667,14736.333333
4,RF|x1|35-22-log2,mean,frwd01_mon_metro_hvi_pct_chg,test,0.002168,0.88316,0.5985,0.731698,0.656669,0.91103,2088.666667,1353.0,791.0,14776.0
5,RF|x1|48-85-None,mean,frwd01_mon_metro_hvi_pct_chg,test,0.002128,0.859263,0.694038,0.630542,0.654909,0.925955,2424.333333,1017.333333,1580.0,13987.0
6,RF|x1|68-37-None,mean,frwd01_mon_metro_hvi_pct_chg,test,0.002203,0.836741,0.70655,0.558322,0.622489,0.925153,2443.0,998.666667,2017.0,13550.0
7,RF|x1|71-64-log2,mean,frwd01_mon_metro_hvi_pct_chg,test,0.002191,0.882644,0.558208,0.753879,0.637809,0.905119,1958.333333,1483.333333,655.333333,14911.666667
8,RF|x1|77-72-None,mean,frwd01_mon_metro_hvi_pct_chg,test,0.002162,0.84399,0.694705,0.594419,0.630962,0.924723,2432.333333,1009.333333,1938.0,13629.0
9,RF|x1|91-37-sqrt,mean,frwd01_mon_metro_hvi_pct_chg,test,0.002113,0.885685,0.621749,0.7312,0.669675,0.915481,2177.666667,1264.0,828.666667,14738.333333


In [31]:
RF_results_df.sort_values(by=['dataset','model','cv_fold']).head(k*4)

Unnamed: 0,model,cv_fold,y,dataset,mae,accuracy,precision,recall,F1,specificity,tp,fp,fn,tn
0,RF|x1|12-20-log2,mean,frwd01_mon_metro_hvi_pct_chg,test,0.002146,0.877048,0.636085,0.692261,0.661123,0.917367,2212.0,1229.666667,1022.0,14545.0
29,RF|x1|12-20-log2,samp_1,frwd01_mon_metro_hvi_pct_chg,test,0.002336,0.832169,0.67912,0.657517,0.668144,0.892566,2563.0,1211.0,1335.0,10061.0
49,RF|x1|12-20-log2,samp_2,frwd01_mon_metro_hvi_pct_chg,test,0.002048,0.899661,0.585842,0.733945,0.651584,0.92395,1440.0,1018.0,522.0,12368.0
69,RF|x1|12-20-log2,samp_3,frwd01_mon_metro_hvi_pct_chg,test,0.002053,0.899313,0.643293,0.68532,0.663642,0.935586,2633.0,1460.0,1209.0,21206.0
1,RF|x1|12-60-None,mean,frwd01_mon_metro_hvi_pct_chg,test,0.002191,0.831251,0.707301,0.546618,0.612448,0.925612,2461.0,980.666667,2219.333333,13347.666667
21,RF|x1|12-60-None,samp_1,frwd01_mon_metro_hvi_pct_chg,test,0.00252,0.780092,0.709592,0.54453,0.616199,0.893094,2678.0,1096.0,2240.0,9156.0
41,RF|x1|12-60-None,samp_2,frwd01_mon_metro_hvi_pct_chg,test,0.001906,0.881157,0.657852,0.621923,0.639383,0.934029,1617.0,841.0,983.0,11907.0
61,RF|x1|12-60-None,samp_3,frwd01_mon_metro_hvi_pct_chg,test,0.002147,0.832503,0.754459,0.473402,0.581763,0.949712,3088.0,1005.0,3435.0,18980.0
2,RF|x1|17-67-sqrt,mean,frwd01_mon_metro_hvi_pct_chg,test,0.002189,0.879782,0.578141,0.724549,0.641843,0.907181,2017.333333,1424.333333,774.0,14793.0
27,RF|x1|17-67-sqrt,samp_1,frwd01_mon_metro_hvi_pct_chg,test,0.002304,0.843968,0.625596,0.712217,0.666102,0.88081,2361.0,1413.0,954.0,10442.0


In [32]:
#test with second subset of X columns
start_time = time.time()
RF_results_df = pd.DataFrame()

RF_results_df = RF(model_name='RF|x2', X_vars=X_cols_2, y_var=y_var, cv_folds=cv_folds, RF_results_df=RF_results_df, params=RF_params_df)

mean_df = RF_results_df.groupby(['model','y','dataset'])[RF_results_df.select_dtypes(include=np.number).columns.tolist()].mean().reset_index()
mean_df['cv_fold'] = 'mean'
RF_results_df = pd.concat([RF_results_df, mean_df], ignore_index=True)
RF_results_df = RF_results_df.sort_values(['y','cv_fold','dataset']).drop_duplicates(subset=['cv_fold','y','dataset','model']).reset_index().drop(columns='index')

compare_df = pd.concat([compare_df, 
                        RF_results_df[(RF_results_df['cv_fold'] == 'mean') & 
                                      (RF_results_df['dataset'] == 'test')]],
                       axis=0)

end_time = time.time()
execution_duration = end_time - start_time
print(f"Execution time: {execution_duration:.2f} seconds")

RF_results_df.head(k)

Execution time: 188.84 seconds


Unnamed: 0,model,cv_fold,y,dataset,mae,accuracy,precision,recall,F1,specificity,tp,fp,fn,tn
0,RF|x2|12-20-log2,mean,frwd01_mon_metro_hvi_pct_chg,test,0.002191,0.872054,0.605586,0.68947,0.641932,0.911695,2105.333333,1336.333333,976.333333,14590.666667
1,RF|x2|12-60-None,mean,frwd01_mon_metro_hvi_pct_chg,test,0.002029,0.864882,0.669876,0.666969,0.662276,0.922771,2322.333333,1119.333333,1282.666667,14284.333333
2,RF|x2|17-67-sqrt,mean,frwd01_mon_metro_hvi_pct_chg,test,0.002113,0.880863,0.599633,0.723224,0.653692,0.911684,2073.666667,1368.0,807.666667,14759.333333
3,RF|x2|30-36-sqrt,mean,frwd01_mon_metro_hvi_pct_chg,test,0.00209,0.881252,0.6143,0.720897,0.660236,0.913657,2154.0,1287.666667,871.333333,14695.666667
4,RF|x2|35-22-log2,mean,frwd01_mon_metro_hvi_pct_chg,test,0.002127,0.879135,0.606569,0.71283,0.652316,0.912462,2128.666667,1313.0,887.333333,14679.666667
5,RF|x2|48-85-None,mean,frwd01_mon_metro_hvi_pct_chg,test,0.001969,0.873211,0.677756,0.688601,0.677647,0.925274,2352.0,1089.666667,1169.333333,14397.666667
6,RF|x2|68-37-None,mean,frwd01_mon_metro_hvi_pct_chg,test,0.002087,0.858681,0.673396,0.64189,0.652183,0.922148,2340.666667,1101.0,1432.666667,14134.333333
7,RF|x2|71-64-log2,mean,frwd01_mon_metro_hvi_pct_chg,test,0.002153,0.880939,0.576952,0.733981,0.644014,0.907672,2008.333333,1433.333333,737.666667,14829.333333
8,RF|x2|77-72-None,mean,frwd01_mon_metro_hvi_pct_chg,test,0.001993,0.869143,0.669968,0.680951,0.669009,0.923344,2328.333333,1113.333333,1216.0,14351.0
9,RF|x2|91-37-sqrt,mean,frwd01_mon_metro_hvi_pct_chg,test,0.002094,0.883223,0.611686,0.728453,0.662817,0.913618,2137.333333,1304.333333,820.333333,14746.666667


In [33]:
RF_results_df.sort_values(by=['dataset','model','cv_fold']).head(k*4)

Unnamed: 0,model,cv_fold,y,dataset,mae,accuracy,precision,recall,F1,specificity,tp,fp,fn,tn
0,RF|x2|12-20-log2,mean,frwd01_mon_metro_hvi_pct_chg,test,0.002191,0.872054,0.605586,0.68947,0.641932,0.911695,2105.333333,1336.333333,976.333333,14590.666667
29,RF|x2|12-20-log2,samp_1,frwd01_mon_metro_hvi_pct_chg,test,0.002426,0.821556,0.670906,0.633475,0.651654,0.888839,2532.0,1242.0,1465.0,9931.0
49,RF|x2|12-20-log2,samp_2,frwd01_mon_metro_hvi_pct_chg,test,0.002052,0.891256,0.554109,0.703876,0.620077,0.918288,1362.0,1096.0,573.0,12317.0
69,RF|x2|12-20-log2,samp_3,frwd01_mon_metro_hvi_pct_chg,test,0.002096,0.90335,0.591742,0.731059,0.654064,0.927959,2422.0,1671.0,891.0,21524.0
1,RF|x2|12-60-None,mean,frwd01_mon_metro_hvi_pct_chg,test,0.002029,0.864882,0.669876,0.666969,0.662276,0.922771,2322.333333,1119.333333,1282.666667,14284.333333
21,RF|x2|12-60-None,samp_1,frwd01_mon_metro_hvi_pct_chg,test,0.002458,0.784311,0.731585,0.55,0.627928,0.900197,2761.0,1013.0,2259.0,9137.0
41,RF|x2|12-60-None,samp_2,frwd01_mon_metro_hvi_pct_chg,test,0.001803,0.902007,0.626932,0.724154,0.672045,0.930635,1541.0,917.0,587.0,12303.0
61,RF|x2|12-60-None,samp_3,frwd01_mon_metro_hvi_pct_chg,test,0.001826,0.90833,0.651112,0.726752,0.686856,0.937481,2665.0,1428.0,1002.0,21413.0
2,RF|x2|17-67-sqrt,mean,frwd01_mon_metro_hvi_pct_chg,test,0.002113,0.880863,0.599633,0.723224,0.653692,0.911684,2073.666667,1368.0,807.666667,14759.333333
27,RF|x2|17-67-sqrt,samp_1,frwd01_mon_metro_hvi_pct_chg,test,0.002277,0.840804,0.662427,0.686625,0.674309,0.889496,2500.0,1274.0,1141.0,10255.0


Summary of cross validation results for Random Forest Model

In [34]:
compare_df[['tp','fp','fn','tn']] = compare_df[['tp','fp','fn','tn']].astype(int)
compare_df[compare_df.model.str.contains("RF")].sort_values('recall', ascending=False).drop(columns=['y'])

Unnamed: 0,model,cv_fold,dataset,mae,accuracy,precision,recall,F1,specificity,tp,fp,fn,tn
7,RF|x1|71-64-log2,mean,test,0.002191,0.882644,0.558208,0.753879,0.637809,0.905119,1958,1483,655,14911
7,RF|x0|71-64-log2,mean,test,0.00237,0.867645,0.444269,0.748263,0.556616,0.881904,1522,1919,496,15071
9,RF|x0|91-37-sqrt,mean,test,0.002198,0.880479,0.551304,0.745618,0.633754,0.901621,1909,1532,643,14923
3,RF|x0|30-36-sqrt,mean,test,0.002188,0.878411,0.54456,0.739178,0.627091,0.899915,1884,1557,654,14912
7,RF|x2|71-64-log2,mean,test,0.002153,0.880939,0.576952,0.733981,0.644014,0.907672,2008,1433,737,14829
4,RF|x0|35-22-log2,mean,test,0.002299,0.874514,0.511035,0.733188,0.601541,0.895035,1765,1676,628,14938
4,RF|x1|35-22-log2,mean,test,0.002168,0.88316,0.5985,0.731698,0.656669,0.91103,2088,1353,791,14776
9,RF|x1|91-37-sqrt,mean,test,0.002113,0.885685,0.621749,0.7312,0.669675,0.915481,2177,1264,828,14738
3,RF|x1|30-36-sqrt,mean,test,0.00214,0.883828,0.61131,0.728866,0.662669,0.912998,2137,1304,830,14736
9,RF|x2|91-37-sqrt,mean,test,0.002094,0.883223,0.611686,0.728453,0.662817,0.913618,2137,1304,820,14746


*****
**Final comparison**
*****
Identify which model optimizes for recall

In [35]:
compare_df = compare_df.sort_values('recall', ascending=False).reset_index().drop(columns='index')
compare_df[['tp','fp','fn','tn']] = compare_df[['tp','fp','fn','tn']].astype(int)
compare_df.head(25)

Unnamed: 0,model,cv_fold,y,dataset,mae,accuracy,precision,recall,F1,specificity,tp,fp,fn,tn
0,RF|x1|71-64-log2,mean,frwd01_mon_metro_hvi_pct_chg,test,0.002191,0.882644,0.558208,0.753879,0.637809,0.905119,1958,1483,655,14911
1,RF|x0|71-64-log2,mean,frwd01_mon_metro_hvi_pct_chg,test,0.00237,0.867645,0.444269,0.748263,0.556616,0.881904,1522,1919,496,15071
2,RF|x0|91-37-sqrt,mean,frwd01_mon_metro_hvi_pct_chg,test,0.002198,0.880479,0.551304,0.745618,0.633754,0.901621,1909,1532,643,14923
3,RF|x0|30-36-sqrt,mean,frwd01_mon_metro_hvi_pct_chg,test,0.002188,0.878411,0.54456,0.739178,0.627091,0.899915,1884,1557,654,14912
4,HGBR|x1|0.02-159-6,mean,frwd01_mon_metro_hvi_pct_chg,test,0.002519,0.790946,0.462651,0.734987,0.5656,0.804238,2562,2965,927,12669
5,HGBR|x2|0.02-159-6,mean,frwd01_mon_metro_hvi_pct_chg,test,0.002478,0.79402,0.472557,0.734321,0.571796,0.807743,2558,2812,931,12822
6,HGBR|x0|0.02-159-6,mean,frwd01_mon_metro_hvi_pct_chg,test,0.002475,0.794638,0.473873,0.734303,0.572662,0.808568,2556,2796,933,12838
7,RF|x2|71-64-log2,mean,frwd01_mon_metro_hvi_pct_chg,test,0.002153,0.880939,0.576952,0.733981,0.644014,0.907672,2008,1433,737,14829
8,RF|x0|35-22-log2,mean,frwd01_mon_metro_hvi_pct_chg,test,0.002299,0.874514,0.511035,0.733188,0.601541,0.895035,1765,1676,628,14938
9,HGBR|x0|0.07-181-18,mean,frwd01_mon_metro_hvi_pct_chg,test,0.002432,0.800359,0.477358,0.733019,0.576322,0.815277,2580,2899,909,12734


In [36]:
compare_df.head(1)

Unnamed: 0,model,cv_fold,y,dataset,mae,accuracy,precision,recall,F1,specificity,tp,fp,fn,tn
0,RF|x1|71-64-log2,mean,frwd01_mon_metro_hvi_pct_chg,test,0.002191,0.882644,0.558208,0.753879,0.637809,0.905119,1958,1483,655,14911


Identify which model optimizes for mean absolute error

In [37]:
compare_df.sort_values('mae', ascending=False).head(5)

Unnamed: 0,model,cv_fold,y,dataset,mae,accuracy,precision,recall,F1,specificity,tp,fp,fn,tn
60,baseline_mean,mean,frwd01_mon_metro_hvi_pct_chg,test,0.004159,0.724844,0.218602,0.241648,0.228088,0.821684,758,2731,2307,13326
52,HGBR|x1|0.01-100-14,mean,frwd01_mon_metro_hvi_pct_chg,test,0.002621,0.831488,0.593792,0.605165,0.58388,0.878639,2105,1549,1384,14085
51,HGBR|x1|0.01-151-16,mean,frwd01_mon_metro_hvi_pct_chg,test,0.002602,0.833657,0.595026,0.61087,0.588151,0.880401,2123,1536,1366,14097
56,HGBR|x2|0.01-100-14,mean,frwd01_mon_metro_hvi_pct_chg,test,0.00259,0.832526,0.599066,0.591531,0.580178,0.883339,2060,1485,1429,14149
57,HGBR|x0|0.01-100-14,mean,frwd01_mon_metro_hvi_pct_chg,test,0.002588,0.830657,0.59521,0.589586,0.576825,0.881427,2055,1509,1434,14125


Identify which model optimizes for F1 score

In [38]:
compare_df = compare_df.sort_values('F1', ascending=False)
compare_df.head(25)

Unnamed: 0,model,cv_fold,y,dataset,mae,accuracy,precision,recall,F1,specificity,tp,fp,fn,tn
41,RF|x2|48-85-None,mean,frwd01_mon_metro_hvi_pct_chg,test,0.001969,0.873211,0.677756,0.688601,0.677647,0.925274,2352,1089,1169,14397
42,RF|x0|48-85-None,mean,frwd01_mon_metro_hvi_pct_chg,test,0.001962,0.872909,0.676513,0.687774,0.676805,0.92486,2347,1094,1168,14399
12,RF|x1|91-37-sqrt,mean,frwd01_mon_metro_hvi_pct_chg,test,0.002113,0.885685,0.621749,0.7312,0.669675,0.915481,2177,1264,828,14738
43,RF|x2|77-72-None,mean,frwd01_mon_metro_hvi_pct_chg,test,0.001993,0.869143,0.669968,0.680951,0.669009,0.923344,2328,1113,1216,14351
44,RF|x0|77-72-None,mean,frwd01_mon_metro_hvi_pct_chg,test,0.001986,0.868135,0.67034,0.679747,0.668378,0.923176,2328,1113,1229,14337
15,RF|x2|91-37-sqrt,mean,frwd01_mon_metro_hvi_pct_chg,test,0.002094,0.883223,0.611686,0.728453,0.662817,0.913618,2137,1304,820,14746
14,RF|x1|30-36-sqrt,mean,frwd01_mon_metro_hvi_pct_chg,test,0.00214,0.883828,0.61131,0.728866,0.662669,0.912998,2137,1304,830,14736
47,RF|x2|12-60-None,mean,frwd01_mon_metro_hvi_pct_chg,test,0.002029,0.864882,0.669876,0.666969,0.662276,0.922771,2322,1119,1282,14284
46,RF|x0|12-60-None,mean,frwd01_mon_metro_hvi_pct_chg,test,0.00203,0.865088,0.667235,0.668596,0.661811,0.922164,2313,1128,1270,14296
37,RF|x1|12-20-log2,mean,frwd01_mon_metro_hvi_pct_chg,test,0.002146,0.877048,0.636085,0.692261,0.661123,0.917367,2212,1229,1022,14545


In [39]:
#model with best F1 score will be used

In [40]:
compare_df.head(1)

Unnamed: 0,model,cv_fold,y,dataset,mae,accuracy,precision,recall,F1,specificity,tp,fp,fn,tn
41,RF|x2|48-85-None,mean,frwd01_mon_metro_hvi_pct_chg,test,0.001969,0.873211,0.677756,0.688601,0.677647,0.925274,2352,1089,1169,14397


*****
**Train and test selected model with validation set**
*****
Selected model details

Estimator: Random Forest

Set of predictor variables: Set 2 (all pct change lag and normalized amounts for current period)

Parameters:

    n_estimators = 48
    
    min_samples_leaf = 85
    
    max_features = None

In [41]:
#test best model with validation dataset
start_time = time.time()
val_results_df = pd.DataFrame()
val_params_df = pd.DataFrame({'n_estimators':[48],
                             'min_samples_leaf':[85],
                             'max_features':[None]})

val_results_df = RF(model_name='RF|x2', X_vars=X_cols_2, y_var=y_var, cv_folds=val, RF_results_df=val_results_df, params=val_params_df)

end_time = time.time()
execution_duration = end_time - start_time
print(f"Execution time: {execution_duration:.2f} seconds")

val_results_df

Execution time: 23.55 seconds


Unnamed: 0,model,cv_fold,y,dataset,mae,accuracy,precision,recall,F1,specificity,tp,fp,fn,tn
0,RF|x2|48-85-None,val,frwd01_mon_metro_hvi_pct_chg,train,0.001657,0.89586,0.744298,0.822133,0.781282,0.917417,14717,5056,3184,56167
1,RF|x2|48-85-None,val,frwd01_mon_metro_hvi_pct_chg,test,0.002568,0.919113,0.643914,0.805758,0.715801,0.935517,5094,2817,1228,40869


In [76]:
print("training y | mean:",format(sample_dfs[val['train_df'][0]][y_var].mean(),".5%"),
      "  std:",format(sample_dfs[val['train_df'][0]][y_var].std(),".5%"),
      "  mae:",format(val_results_df['mae'][0]/(sample_dfs[val['test_df'][0]][y_var].std()),".0%"),"of std")
print("test y | mean:",format(sample_dfs[val['test_df'][0]][y_var].mean(),".5%"),
      "  std:",format(sample_dfs[val['test_df'][0]][y_var].std(),".5%"),
      "  mae:",format(val_results_df['mae'][1]/(sample_dfs[val['test_df'][0]][y_var].std()),".0%"),"of std")

training y | mean: 0.22401%   std: 0.65472%   mae: 21% of std
test y | mean: 0.62979%   std: 0.77989%   mae: 33% of std


**Validation test results**
 - Out of the 6,322 periods where home value declined, the model correctly predicted a decline 80.5% of the time (recall).
 - Out of the 7,911 periods where the model predicted a decline, only 64.4% of the time did a decline actually occur (precision).
 - The model was able to predict the percentage of home value change within 1/3 of the standard deviation of the actual home value change for the test period.
