In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import pickle

import time
import warnings
warnings.filterwarnings("ignore", category=RuntimeWarning) #suppress runtime warnings

import random
from sklearn.ensemble import HistGradientBoostingRegressor, RandomForestRegressor
from sklearn.metrics import confusion_matrix
from sklearn.metrics import mean_absolute_error as mae 

In [2]:
#Load (unpickle) the dictionary of DataFrames from the file
with open('../Data/test_train_datasets.pkl', 'rb') as f:
    sample_dfs = pickle.load(f)  # 'rb' for reading in binary mode

sample_dfs.keys()

dict_keys(['metro_samp_1', 'metro_samp_1_train', 'metro_samp_1_test', 'metro_samp_2', 'metro_samp_2_train', 'metro_samp_2_test', 'metro_samp_3', 'metro_samp_3_train', 'metro_samp_3_test', 'metro_samp_val', 'metro_samp_val_train', 'metro_samp_val_test', 'metro_samp_1_train_normalized', 'metro_samp_1_test_normalized', 'metro_samp_2_train_normalized', 'metro_samp_2_test_normalized', 'metro_samp_3_train_normalized', 'metro_samp_3_test_normalized', 'metro_samp_val_train_normalized', 'metro_samp_val_test_normalized'])

In [3]:
X_cols = [col for col in sample_dfs['metro_samp_val_train_normalized'].columns if "frwd" not in col]
y_cols = [col for col in sample_dfs['metro_samp_val_train_normalized'].columns if "frwd" in col]

In [4]:
#set folds for cross validation testing
cv_folds = pd.DataFrame({'fold':['samp_1','samp_2','samp_3'],
                         'train_df':['metro_samp_1_train_normalized','metro_samp_2_train_normalized','metro_samp_3_train_normalized'],
                         'test_df':['metro_samp_1_test_normalized','metro_samp_2_test_normalized','metro_samp_3_test_normalized']})

val = pd.DataFrame({'fold':['val'],
                    'train_df':['metro_samp_val_train_normalized'],
                    'test_df':['metro_samp_val_test_normalized']})

In [5]:
#create function to produce confusion matrix from test results
def conf_matrix(test_results):
    pos = test_results[test_results.y_test < 0]
    true_pos = pos[pos.y_pred < 0]
    false_neg = pos[pos.y_pred >= 0]

    neg = test_results[test_results.y_test >= 0]
    true_neg = neg[neg.y_pred >= 0]
    false_pos = neg[neg.y_pred < 0]

    confusion_matrix = np.array([[len(true_pos), len(false_pos)],
                                [len(false_neg), len(true_neg)]])

    confusion_matrix_pct = confusion_matrix / len(test_results)
    
    return(confusion_matrix, confusion_matrix_pct)

In [6]:
#create class to generate key statistics from confusion matrix
class confusion_matrix_class:
    def __init__(self, cm):
        self.cm = cm
        self.tp = cm[0,0]
        self.fp = cm[0,1]
        self.fn = cm[1,0]
        self.tn = cm[1,1]
        self.accuracy = ((self.tp+self.tn)/(self.tp+self.fp+self.fn+self.tn))
        self.precision = (self.tp/(self.tp+self.fp))
        self.recall = (self.tp/(self.tp+self.fn))
        self.F1 = (2*self.precision*self.recall)/(self.precision+self.recall)
        self.specificity = (self.tn/(self.tn+self.fp))

In [7]:
compare_df = pd.DataFrame()

#create function to produce consistent results for comparison across models
def create_results_record(model, cv_fold, y_var, dataset, cm, error):                
    new_record_dict = {'model': model,
                      'cv_fold': cv_fold,
                      'y': y_var,
                      'dataset': dataset,
                      'mae': error,
                      'accuracy': cm.accuracy,
                      'precision': cm.precision,
                      'recall': cm.recall,
                      'F1': cm.F1,
                      'specificity': cm.specificity,
                      'tp': cm.tp,
                      'fp': cm.fp,
                      'fn': cm.fn,
                      'tn': cm.tn}
    return(new_record_dict)

In [8]:
HGBR_results = pd.DataFrame()

#create function to test Gradient Boosting model
def HGBR(cv_fold, 
         X_vars=X_cols,
         y_var='frwd01_mon_metro_hvi_pct_chg',
         learning_rate=.1,
         min_samples_leaf=20,
         max_depth=None,
         seed=42, 
         HGBR_results_df=HGBR_results):

    fold = cv_fold['fold']
    train_df = sample_dfs[cv_fold['train_df']]
    test_df = sample_dfs[cv_fold['test_df']]
    
    #identify and drop columns with over 75% of data missing in training dataset
    drop_cols = train_df.isna().sum().divide(len(train_df)).sort_values(ascending=False).reset_index().rename(columns={'index':'col',0:'missing'})
    drop_cols['drop'] = np.where(drop_cols['missing'] > .70, "drop", "keep")
    drop_cols = list(drop_cols[drop_cols['drop'] == 'drop'].col)
    train_df = train_df.drop(columns=drop_cols)
    test_df = test_df.drop(columns=drop_cols)
    X_cols_rf = [col for col in X_vars if col not in drop_cols]

    #define training set
    train_df = train_df.replace(np.inf, np.nan)
    train_df = train_df.dropna(subset=[y_var])
    X = train_df[X_cols_rf[7:]]
    y = train_df[y_var]

    #train and score model with training data
    hgbr = HistGradientBoostingRegressor(learning_rate=learning_rate,
                                         min_samples_leaf=min_samples_leaf,
                                         max_depth=max_depth,
                                         random_state=seed)
    hgbr.fit(X, y)
    train_score = hgbr.score(X, y)

    #predict and score with test data
    test_df = test_df.replace(np.inf, np.nan)
    test_df = test_df.dropna(subset=[y_var])
    X_test = test_df[X_cols_rf[7:]]
    y_test = test_df[y_var]
    test_score = hgbr.score(X_test, y_test)
    y_pred = hgbr.predict(X_test)
    error = mae(y_test, y_pred)
    
    #create categorization of prediction
    results_index = test_df[X_vars[:7]].reset_index()
    results = pd.DataFrame(zip(y_test,y_pred), columns=['y_test','y_pred']).reset_index()
    test_results = pd.concat([results_index, results], axis=1)
    test_results['y_diff'] = test_results['y_pred'] - test_results['y_test']
    test_results['direction'] = ((test_results['y_test']*test_results['y_pred'])/
                                  abs(test_results['y_test']*test_results['y_pred']))
    test_results['direction'] = np.where(test_results['direction'] == 1, "same", "diff")
    cm, cm_pct = conf_matrix(test_results)
    hgbr_cm = confusion_matrix_class(cm)

    #add record to test results df
    new_record = create_results_record(model='HGBR', 
                                       cv_fold=fold, 
                                       y_var=y_var, 
                                       dataset='test', 
                                       cm=hgbr_cm,
                                       error=error)
                  
    HGBR_results_df = pd.concat([HGBR_results_df, pd.DataFrame([new_record])], ignore_index=True).drop_duplicates()
    HGBR_results_df[['tp','fp','fn','tn']] = HGBR_results_df[['tp','fp','fn','tn']].astype(int)
    
    return(HGBR_results_df, test_results)

In [9]:
y_focus = [col for col in y_cols if "mon_metro_hvi_pct_chg" in col]

In [10]:
for y in y_focus:

    for i in range(len(cv_folds)):

        cv_fold = cv_folds.loc[i]
        HGBR_results, test_results = HGBR(cv_fold,
                                          y_var=y,
                                          HGBR_results_df=HGBR_results)

In [11]:
mean_df = HGBR_results.groupby(['model','y','dataset'])[HGBR_results.select_dtypes(include=np.number).columns.tolist()].mean().reset_index()
mean_df['cv_fold'] = 'mean'

HGBR_results = pd.concat([HGBR_results, mean_df], ignore_index=True)
HGBR_results = HGBR_results.sort_values(['y','cv_fold']).drop_duplicates(subset=['cv_fold','y','dataset']).reset_index().drop(columns='index')
HGBR_results.sort_values(by=['recall'], ascending=False)

Unnamed: 0,model,cv_fold,y,dataset,mae,accuracy,precision,recall,F1,specificity,tp,fp,fn,tn
3,HGBR,samp_3,frwd01_mon_metro_hvi_pct_chg,test,0.003025,0.930649,0.545611,0.745882,0.630219,0.946548,634.0,528.0,216.0,9350.0
2,HGBR,samp_2,frwd01_mon_metro_hvi_pct_chg,test,0.004635,0.967002,0.494845,0.687679,0.57554,0.976395,240.0,245.0,109.0,10134.0
0,HGBR,mean,frwd01_mon_metro_hvi_pct_chg,test,0.003167,0.941462,0.567282,0.650071,0.595295,0.964738,472.333333,350.333333,277.666667,9627.666667
7,HGBR,samp_3,frwd02_mon_metro_hvi_pct_chg,test,0.008934,0.889914,0.398561,0.61624,0.484054,0.914946,554.0,836.0,345.0,8993.0
6,HGBR,samp_2,frwd02_mon_metro_hvi_pct_chg,test,0.012409,0.962528,0.430288,0.520349,0.471053,0.977176,179.0,237.0,165.0,10147.0
1,HGBR,samp_1,frwd01_mon_metro_hvi_pct_chg,test,0.00184,0.926734,0.661389,0.516651,0.580128,0.971272,543.0,278.0,508.0,9399.0
4,HGBR,mean,frwd02_mon_metro_hvi_pct_chg,test,0.008707,0.922912,0.437155,0.511356,0.463679,0.951458,360.0,481.666667,345.333333,9541.0
5,HGBR,samp_1,frwd02_mon_metro_hvi_pct_chg,test,0.004777,0.916294,0.482615,0.39748,0.43593,0.962253,347.0,372.0,526.0,9483.0
11,HGBR,samp_3,frwd03_mon_metro_hvi_pct_chg,test,0.016925,0.877983,0.343721,0.387397,0.364254,0.926639,375.0,716.0,593.0,9044.0
9,HGBR,samp_1,frwd03_mon_metro_hvi_pct_chg,test,0.008768,0.915362,0.314241,0.303892,0.30898,0.955964,203.0,443.0,465.0,9617.0


Will focus on predicting whether price will increase or decrease in the following month

# Tuning

Set y variable for focus of analysis

In [12]:
y_var = 'frwd01_mon_metro_hvi_pct_chg'

Identify variations of features to use for tuning

In [13]:
#set X features (all pct change lag from 1 month, selective other pct lag, clusters, and normalized amounts for current period)
X_cols_1 = (X_cols[:7]+
            X_cols[X_cols.index('metro_for_sale_inventory_normalized'):
                   X_cols.index('prev01_mon_metro_for_sale_inventory')]+
            X_cols[X_cols.index('prev01_mon_metro_for_sale_inventory_pct_chg'):
                   X_cols.index('prev01_mon_state_job_openings_szn_adjd_pct_chg')]+
            ['prev02_mon_metro_hvi']+
            ['prev02_mon_metro_rent']+
            ['prev02_mon_state_job_openings_pct_chg']+
            ['prev02_mon_state_population_pct_chg']+
            ['prev02_mon_state_personal_income_pct_chg']+
            ['prev03_mon_metro_hvi']+
            ['prev03_mon_metro_rent']+
            ['prev03_mon_state_job_openings_pct_chg']+
            ['prev03_mon_state_population_pct_chg']+
            ['prev03_mon_state_personal_income_pct_chg']+
            ['prev06_mon_metro_hvi']+
            ['prev06_mon_metro_rent']+
            ['prev06_mon_state_job_openings_pct_chg']+
            ['prev06_mon_state_population_pct_chg']+
            ['prev06_mon_state_personal_income_pct_chg']+
            X_cols[-5:])
X_cols_1.remove('state_job_openings_szn_adjd_normalized')
X_cols_1.remove('state_personal_income_per_capita_normalized')
X_cols_1.remove('prev01_mon_state_personal_income_per_capita_pct_chg')

In [14]:
#set X features (all pct change lag and normalized amounts for current period)
X_cols_2 = (X_cols[:7]+
            X_cols[X_cols.index('metro_for_sale_inventory_normalized'):
                   X_cols.index('prev01_mon_metro_for_sale_inventory')]+
            X_cols[X_cols.index('prev01_mon_metro_for_sale_inventory_pct_chg'):
                   X_cols.index('prev01_mon_state_job_openings_szn_adjd_pct_chg')]+
            X_cols[X_cols.index('prev02_mon_metro_for_sale_inventory_pct_chg'):
                   X_cols.index('prev02_mon_state_job_openings_szn_adjd_pct_chg')]+
            X_cols[X_cols.index('prev03_mon_metro_for_sale_inventory_pct_chg'):
                   X_cols.index('prev03_mon_state_job_openings_szn_adjd_pct_chg')]+
            X_cols[X_cols.index('prev06_mon_metro_for_sale_inventory_pct_chg'):
                   X_cols.index('prev06_mon_state_job_openings_szn_adjd_pct_chg')])

*****
**Baseline model**
*****
Create baseline model (prediction constant as average % change for each metro area)

In [15]:
start_time = time.time()
baseline_results_df = pd.DataFrame()

for i in range(len(cv_folds)):
    #set datasets for cross validation fold
    cv_fold = cv_folds.loc[i]
    fold = cv_fold['fold']
    train_df = sample_dfs[cv_fold['train_df']].dropna(subset=y_var)
    train_df = train_df.replace(np.inf, np.nan)
    test_df = sample_dfs[cv_fold['test_df']].dropna(subset=y_var)
    test_df = test_df.replace(np.inf, np.nan)
    
    #train model
    baseline_y_pred = train_df.groupby('metro_id')[y_var].agg(['mean','median']).reset_index()
    train_df = train_df.merge(baseline_y_pred, on='metro_id')

    #use mean for baseline
    train_df['y_true'] = np.where(train_df[y_var] >= 0, 0, 1)
    train_df['y_pred'] = np.where(train_df['mean'] >= 0, 0, 1)
    baseline_cm = confusion_matrix_class(confusion_matrix(train_df['y_true'], train_df['y_pred']))
    train_error = mae(train_df[y_var],train_df['mean'])

    new_record = create_results_record(model="baseline_mean", 
                                       cv_fold=fold,
                                       y_var=y_var, 
                                       dataset="train", 
                                       cm=baseline_cm,
                                       error=train_error)
    baseline_results_df = pd.concat([baseline_results_df, pd.DataFrame([new_record])], ignore_index=True)

    #test model
    test_df = test_df.merge(baseline_y_pred, on='metro_id')
    test_df['y_true'] = np.where(test_df[y_var] >= 0, 1, 0)
    test_df['y_pred'] = np.where(test_df['mean'] >= 0, 1, 0)
    baseline_cm = confusion_matrix_class(confusion_matrix(test_df['y_true'], test_df['y_pred']))
    test_error = mae(test_df[y_var],test_df['mean'])

    new_record = create_results_record(model="baseline_mean", 
                                       cv_fold=fold,
                                       y_var=y_var, 
                                       dataset="test", 
                                       cm=baseline_cm,
                                       error=test_error)
    baseline_results_df = pd.concat([baseline_results_df, pd.DataFrame([new_record])], ignore_index=True)
    
    
mean_df = baseline_results_df.groupby(['model','y','dataset'])[baseline_results_df.select_dtypes(include=np.number).columns.tolist()].mean().reset_index()
mean_df['cv_fold'] = 'mean'

baseline_results_df = pd.concat([baseline_results_df, mean_df], ignore_index=True)
baseline_results_df = baseline_results_df.sort_values(['dataset','y','cv_fold']).drop_duplicates(subset=['cv_fold','y','dataset']).reset_index().drop(columns='index')

compare_df = pd.concat([compare_df, 
                        baseline_results_df[(baseline_results_df['cv_fold'] == 'mean') & 
                                            (baseline_results_df['dataset'] == 'test')]],
                       axis=0)
compare_df[['tp','fp','fn','tn']] = compare_df[['tp','fp','fn','tn']].astype(int)
compare_df = compare_df.drop_duplicates()

end_time = time.time()
execution_duration = end_time - start_time
print(f"Execution time: {execution_duration:.2f} seconds")

baseline_results_df

Execution time: 0.25 seconds


Unnamed: 0,model,cv_fold,y,dataset,mae,accuracy,precision,recall,F1,specificity,tp,fp,fn,tn
0,baseline_mean,mean,frwd01_mon_metro_hvi_pct_chg,test,0.006199,0.923254,0.06396,0.29698,0.096363,0.933224,45.333333,704.666667,118.666667,9859.333333
1,baseline_mean,samp_1,frwd01_mon_metro_hvi_pct_chg,test,0.002956,0.891033,0.086584,0.303333,0.134715,0.90794,91.0,960.0,209.0,9468.0
2,baseline_mean,samp_2,frwd01_mon_metro_hvi_pct_chg,test,0.009248,0.958706,0.088825,0.198718,0.122772,0.969921,31.0,318.0,125.0,10254.0
3,baseline_mean,samp_3,frwd01_mon_metro_hvi_pct_chg,test,0.006392,0.920022,0.016471,0.388889,0.031603,0.921811,14.0,836.0,22.0,9856.0
4,baseline_mean,mean,frwd01_mon_metro_hvi_pct_chg,train,0.003374,0.894469,0.992705,0.899377,0.943732,0.540653,24693.666667,143.333333,2684.666667,175.333333
5,baseline_mean,samp_1,frwd01_mon_metro_hvi_pct_chg,train,0.002753,0.883965,0.987137,0.892375,0.937367,0.580786,14734.0,192.0,1777.0,266.0
6,baseline_mean,samp_2,frwd01_mon_metro_hvi_pct_chg,train,0.002744,0.888941,0.992521,0.894109,0.940748,0.523316,24419.0,184.0,2892.0,202.0
7,baseline_mean,samp_3,frwd01_mon_metro_hvi_pct_chg,train,0.004623,0.910501,0.998456,0.911649,0.95308,0.517857,34928.0,54.0,3385.0,58.0


In [16]:
baseline_results_df[baseline_results_df.cv_fold=='mean']

Unnamed: 0,model,cv_fold,y,dataset,mae,accuracy,precision,recall,F1,specificity,tp,fp,fn,tn
0,baseline_mean,mean,frwd01_mon_metro_hvi_pct_chg,test,0.006199,0.923254,0.06396,0.29698,0.096363,0.933224,45.333333,704.666667,118.666667,9859.333333
4,baseline_mean,mean,frwd01_mon_metro_hvi_pct_chg,train,0.003374,0.894469,0.992705,0.899377,0.943732,0.540653,24693.666667,143.333333,2684.666667,175.333333


*****
**Historical Gradient Boosting Model (HGBM)**
*****
Tune historical gradient boosting model and assess results 

In [17]:
#set parameter ranges for tuning HGBR model
random.seed(42)
k=15

learning_rate_rdm = np.array(random.choices(range(0,15),k=k)) / 100 + .01
min_samples_leaf_rdm = np.array(random.choices(range(100,201),k=k))
max_depth_rdm = np.array(random.choices(range(5,21),k=k))

params_df = pd.DataFrame({'learning_rate':learning_rate_rdm,
                          'min_samples_leaf':min_samples_leaf_rdm,
                          'max_depth':max_depth_rdm})

In [18]:
#test with all X columns
start_time = time.time()
HGBR_cv_results = pd.DataFrame(columns=['idx'])
X_vars = X_cols
m_name = 'HGBR|x0|'

for idx in range(k):
    for i in range(len(cv_folds)):

        cv_fold = cv_folds.loc[i]
        HGBR_cv_results, test_results = HGBR(cv_fold,
                                             X_vars=X_vars,
                                             y_var=y_var,
                                             learning_rate=learning_rate_rdm[idx],
                                             min_samples_leaf=min_samples_leaf_rdm[idx],
                                             max_depth=max_depth_rdm[idx],
                                             HGBR_results_df=HGBR_cv_results)
    
    HGBR_cv_results['idx'] = HGBR_cv_results['idx'].fillna(str(idx).zfill(2))
    mean_df = HGBR_cv_results.groupby(['model','y','dataset','idx'])[HGBR_cv_results.select_dtypes(include=np.number).columns.tolist()].mean().reset_index()
    mean_df['cv_fold'] = 'mean'
    

    HGBR_cv_results = pd.concat([HGBR_cv_results, mean_df], ignore_index=True)
    HGBR_cv_results = HGBR_cv_results.sort_values(['y','cv_fold','dataset']).drop_duplicates(subset=['cv_fold','y','idx','dataset']).reset_index().drop(columns='index')


cv_results = pd.concat([params_df,HGBR_cv_results[HGBR_cv_results.cv_fold == 'mean']], axis=1).drop(columns=['idx'])
cv_results['model'] = m_name+cv_results['learning_rate'].round(2).astype(str)+"-"+cv_results['min_samples_leaf'].astype(str)+"-"+cv_results['max_depth'].astype(str)
cv_results[['tp','fp','fn','tn']] = cv_results[['tp','fp','fn','tn']].astype(int)

compare_df = pd.concat([compare_df, 
                        cv_results.drop(columns=['learning_rate','min_samples_leaf','max_depth'])],
                       axis=0).drop_duplicates()

end_time = time.time()
execution_duration = end_time - start_time
print(f"Execution time: {execution_duration:.2f} seconds")

cv_results.sort_values('recall', ascending=False)

Execution time: 135.16 seconds


Unnamed: 0,learning_rate,min_samples_leaf,max_depth,model,cv_fold,y,dataset,mae,accuracy,precision,recall,F1,specificity,tp,fp,fn,tn
0,0.1,102,20,HGBR|x0|0.1-102-20,mean,frwd01_mon_metro_hvi_pct_chg,test,0.003309,0.939411,0.546813,0.649848,0.58218,0.96297,468,368,281,9609
6,0.14,181,17,HGBR|x0|0.14-181-17,mean,frwd01_mon_metro_hvi_pct_chg,test,0.003239,0.940654,0.558574,0.649626,0.590403,0.963636,474,361,275,9616
4,0.12,122,18,HGBR|x0|0.12-122-18,mean,frwd01_mon_metro_hvi_pct_chg,test,0.003347,0.941151,0.556751,0.649514,0.588403,0.964934,468,349,282,9628
8,0.07,181,13,HGBR|x0|0.07-181-13,mean,frwd01_mon_metro_hvi_pct_chg,test,0.003324,0.944879,0.590451,0.644869,0.60605,0.968714,469,311,280,9667
5,0.11,159,14,HGBR|x0|0.11-159-14,mean,frwd01_mon_metro_hvi_pct_chg,test,0.003252,0.944848,0.587991,0.639749,0.604945,0.969382,462,304,287,9673
11,0.08,115,13,HGBR|x0|0.08-115-13,mean,frwd01_mon_metro_hvi_pct_chg,test,0.003264,0.944041,0.578704,0.639617,0.599886,0.968169,466,316,284,9661
2,0.05,165,6,HGBR|x0|0.05-165-6,mean,frwd01_mon_metro_hvi_pct_chg,test,0.003199,0.945439,0.603217,0.60925,0.602745,0.970748,454,289,296,9688
10,0.04,134,11,HGBR|x0|0.04-134-11,mean,frwd01_mon_metro_hvi_pct_chg,test,0.003224,0.945004,0.605621,0.602765,0.598458,0.970786,449,289,301,9689
3,0.04,155,6,HGBR|x0|0.04-155-6,mean,frwd01_mon_metro_hvi_pct_chg,test,0.003201,0.94634,0.621013,0.588611,0.601612,0.973423,437,262,313,9715
7,0.02,100,16,HGBR|x0|0.02-100-16,mean,frwd01_mon_metro_hvi_pct_chg,test,0.003369,0.949167,0.701353,0.473958,0.564416,0.985575,347,142,402,9835


In [19]:
#test results for each fold
HGBR_cv_results.sort_values(by=['idx','cv_fold'])

Unnamed: 0,idx,model,cv_fold,y,dataset,mae,accuracy,precision,recall,F1,specificity,tp,fp,fn,tn
0,0,HGBR,mean,frwd01_mon_metro_hvi_pct_chg,test,0.003309,0.939411,0.546813,0.649848,0.58218,0.96297,468.0,368.0,281.0,9609.0
12,0,HGBR,samp_1,frwd01_mon_metro_hvi_pct_chg,test,0.001842,0.924124,0.640237,0.514748,0.570675,0.968585,541.0,304.0,510.0,9373.0
24,0,HGBR,samp_2,frwd01_mon_metro_hvi_pct_chg,test,0.00505,0.961596,0.443447,0.707736,0.545254,0.970132,247.0,310.0,102.0,10069.0
36,0,HGBR,samp_3,frwd01_mon_metro_hvi_pct_chg,test,0.003034,0.932513,0.556757,0.727059,0.630612,0.950192,618.0,492.0,232.0,9386.0
1,1,HGBR,mean,frwd01_mon_metro_hvi_pct_chg,test,0.003868,0.942953,0.841731,0.260535,0.386439,0.996903,169.0,31.0,581.0,9947.0
13,1,HGBR,samp_1,frwd01_mon_metro_hvi_pct_chg,test,0.002018,0.917506,0.86087,0.188392,0.309133,0.996693,198.0,32.0,853.0,9645.0
25,1,HGBR,samp_2,frwd01_mon_metro_hvi_pct_chg,test,0.006024,0.97651,0.777143,0.389685,0.519084,0.996242,136.0,39.0,213.0,10340.0
37,1,HGBR,samp_3,frwd01_mon_metro_hvi_pct_chg,test,0.003563,0.934843,0.887179,0.203529,0.3311,0.997773,173.0,22.0,677.0,9856.0
2,2,HGBR,mean,frwd01_mon_metro_hvi_pct_chg,test,0.003199,0.945439,0.603217,0.60925,0.602745,0.970748,454.0,289.0,296.0,9688.0
14,2,HGBR,samp_1,frwd01_mon_metro_hvi_pct_chg,test,0.001876,0.92459,0.640371,0.525214,0.577104,0.967965,552.0,310.0,499.0,9367.0


In [20]:
#test with first subset of X columns
start_time = time.time()
HGBR_cv_results = pd.DataFrame(columns=['idx'])
X_vars = X_cols_1
m_name = 'HGBR|x1|'

for idx in range(k):
    for i in range(len(cv_folds)):

        cv_fold = cv_folds.loc[i]
        HGBR_cv_results, test_results = HGBR(cv_fold,
                                             X_vars=X_vars,
                                             y_var=y_var,
                                             learning_rate=learning_rate_rdm[idx],
                                             min_samples_leaf=min_samples_leaf_rdm[idx],
                                             max_depth=max_depth_rdm[idx],
                                             HGBR_results_df=HGBR_cv_results)
    
    HGBR_cv_results['idx'] = HGBR_cv_results['idx'].fillna(str(idx).zfill(2))
    mean_df = HGBR_cv_results.groupby(['model','y','dataset','idx'])[HGBR_cv_results.select_dtypes(include=np.number).columns.tolist()].mean().reset_index()
    mean_df['cv_fold'] = 'mean'
    

    HGBR_cv_results = pd.concat([HGBR_cv_results, mean_df], ignore_index=True)
    HGBR_cv_results = HGBR_cv_results.sort_values(['y','cv_fold','dataset']).drop_duplicates(subset=['cv_fold','y','idx','dataset']).reset_index().drop(columns='index')


cv_results = pd.concat([params_df,HGBR_cv_results[HGBR_cv_results.cv_fold == 'mean']], axis=1).drop(columns=['idx'])
cv_results['model'] = m_name+cv_results['learning_rate'].round(2).astype(str)+"-"+cv_results['min_samples_leaf'].astype(str)+"-"+cv_results['max_depth'].astype(str)
cv_results[['tp','fp','fn','tn']] = cv_results[['tp','fp','fn','tn']].astype(int)

compare_df = pd.concat([compare_df, 
                        cv_results.drop(columns=['learning_rate','min_samples_leaf','max_depth'])],
                       axis=0).drop_duplicates()

end_time = time.time()
execution_duration = end_time - start_time
print(f"Execution time: {execution_duration:.2f} seconds")

cv_results.sort_values('recall', ascending=False)

Execution time: 53.60 seconds


Unnamed: 0,learning_rate,min_samples_leaf,max_depth,model,cv_fold,y,dataset,mae,accuracy,precision,recall,F1,specificity,tp,fp,fn,tn
0,0.1,102,20,HGBR|x1|0.1-102-20,mean,frwd01_mon_metro_hvi_pct_chg,test,0.003676,0.928132,0.484059,0.659046,0.538951,0.950359,474,495,275,9482
4,0.12,122,18,HGBR|x1|0.12-122-18,mean,frwd01_mon_metro_hvi_pct_chg,test,0.003726,0.929623,0.488598,0.651575,0.538822,0.952645,468,473,281,9504
5,0.11,159,14,HGBR|x1|0.11-159-14,mean,frwd01_mon_metro_hvi_pct_chg,test,0.003706,0.928909,0.484103,0.649582,0.535163,0.951836,469,482,280,9496
6,0.14,181,17,HGBR|x1|0.14-181-17,mean,frwd01_mon_metro_hvi_pct_chg,test,0.003527,0.932606,0.500445,0.642789,0.547652,0.956041,466,439,283,9538
11,0.08,115,13,HGBR|x1|0.08-115-13,mean,frwd01_mon_metro_hvi_pct_chg,test,0.003638,0.927635,0.481653,0.639421,0.534389,0.950744,464,490,285,9487
8,0.07,181,13,HGBR|x1|0.07-181-13,mean,frwd01_mon_metro_hvi_pct_chg,test,0.003596,0.929654,0.489763,0.631816,0.533788,0.953614,458,463,291,9514
10,0.04,134,11,HGBR|x1|0.04-134-11,mean,frwd01_mon_metro_hvi_pct_chg,test,0.003581,0.934377,0.536074,0.60845,0.560621,0.959281,449,403,300,9574
2,0.05,165,6,HGBR|x1|0.05-165-6,mean,frwd01_mon_metro_hvi_pct_chg,test,0.003463,0.938821,0.545443,0.587906,0.55957,0.966258,429,335,321,9642
3,0.04,155,6,HGBR|x1|0.04-155-6,mean,frwd01_mon_metro_hvi_pct_chg,test,0.003469,0.939597,0.555889,0.574349,0.558608,0.967927,420,318,329,9659
7,0.02,100,16,HGBR|x1|0.02-100-16,mean,frwd01_mon_metro_hvi_pct_chg,test,0.003578,0.945532,0.634524,0.502111,0.558089,0.979414,369,204,380,9774


In [21]:
#test results for each fold
HGBR_cv_results.sort_values(by=['idx','cv_fold'])

Unnamed: 0,idx,model,cv_fold,y,dataset,mae,accuracy,precision,recall,F1,specificity,tp,fp,fn,tn
0,0,HGBR,mean,frwd01_mon_metro_hvi_pct_chg,test,0.003676,0.928132,0.484059,0.659046,0.538951,0.950359,474.0,495.0,275.0,9482.0
12,0,HGBR,samp_1,frwd01_mon_metro_hvi_pct_chg,test,0.001938,0.923285,0.630734,0.523311,0.572023,0.966725,550.0,322.0,501.0,9355.0
24,0,HGBR,samp_2,frwd01_mon_metro_hvi_pct_chg,test,0.005433,0.947241,0.349515,0.722063,0.471028,0.954813,252.0,469.0,97.0,9910.0
36,0,HGBR,samp_3,frwd01_mon_metro_hvi_pct_chg,test,0.003658,0.91387,0.471927,0.731765,0.573801,0.92954,622.0,696.0,228.0,9182.0
1,1,HGBR,mean,frwd01_mon_metro_hvi_pct_chg,test,0.003975,0.941368,0.817418,0.240593,0.359798,0.996645,154.0,33.0,595.0,9944.0
13,1,HGBR,samp_1,frwd01_mon_metro_hvi_pct_chg,test,0.002109,0.914616,0.849741,0.156042,0.263666,0.997003,164.0,29.0,887.0,9648.0
25,1,HGBR,samp_2,frwd01_mon_metro_hvi_pct_chg,test,0.006139,0.975019,0.736842,0.361032,0.484615,0.995664,126.0,45.0,223.0,10334.0
37,1,HGBR,samp_3,frwd01_mon_metro_hvi_pct_chg,test,0.003678,0.934471,0.865672,0.204706,0.331113,0.997267,174.0,27.0,676.0,9851.0
2,2,HGBR,mean,frwd01_mon_metro_hvi_pct_chg,test,0.003463,0.938821,0.545443,0.587906,0.55957,0.966258,429.0,335.0,321.0,9642.0
14,2,HGBR,samp_1,frwd01_mon_metro_hvi_pct_chg,test,0.00194,0.92226,0.625144,0.515699,0.565172,0.966415,542.0,325.0,509.0,9352.0


In [22]:
#test with second subset of X columns
start_time = time.time()
HGBR_cv_results = pd.DataFrame(columns=['idx'])
X_vars = X_cols_2
m_name = 'HGBR|x2|'

for idx in range(k):
    for i in range(len(cv_folds)):

        cv_fold = cv_folds.loc[i]
        HGBR_cv_results, test_results = HGBR(cv_fold,
                                             X_vars=X_vars,
                                             y_var=y_var,
                                             learning_rate=learning_rate_rdm[idx],
                                             min_samples_leaf=min_samples_leaf_rdm[idx],
                                             max_depth=max_depth_rdm[idx],
                                             HGBR_results_df=HGBR_cv_results)
    
    HGBR_cv_results['idx'] = HGBR_cv_results['idx'].fillna(str(idx).zfill(2))
    mean_df = HGBR_cv_results.groupby(['model','y','dataset','idx'])[HGBR_cv_results.select_dtypes(include=np.number).columns.tolist()].mean().reset_index()
    mean_df['cv_fold'] = 'mean'
    

    HGBR_cv_results = pd.concat([HGBR_cv_results, mean_df], ignore_index=True)
    HGBR_cv_results = HGBR_cv_results.sort_values(['y','cv_fold','dataset']).drop_duplicates(subset=['cv_fold','y','idx','dataset']).reset_index().drop(columns='index')


cv_results = pd.concat([params_df,HGBR_cv_results[HGBR_cv_results.cv_fold == 'mean']], axis=1).drop(columns=['idx'])
cv_results['model'] = m_name+cv_results['learning_rate'].round(2).astype(str)+"-"+cv_results['min_samples_leaf'].astype(str)+"-"+cv_results['max_depth'].astype(str)
cv_results[['tp','fp','fn','tn']] = cv_results[['tp','fp','fn','tn']].astype(int)

compare_df = pd.concat([compare_df, 
                        cv_results.drop(columns=['learning_rate','min_samples_leaf','max_depth'])],
                       axis=0).drop_duplicates()

end_time = time.time()
execution_duration = end_time - start_time
print(f"Execution time: {execution_duration:.2f} seconds")

cv_results.sort_values('recall', ascending=False)

Execution time: 69.76 seconds


Unnamed: 0,learning_rate,min_samples_leaf,max_depth,model,cv_fold,y,dataset,mae,accuracy,precision,recall,F1,specificity,tp,fp,fn,tn
4,0.12,122,18,HGBR|x2|0.12-122-18,mean,frwd01_mon_metro_hvi_pct_chg,test,0.003284,0.941337,0.560742,0.656879,0.58893,0.964717,472,352,277,9626
0,0.1,102,20,HGBR|x2|0.1-102-20,mean,frwd01_mon_metro_hvi_pct_chg,test,0.003236,0.941586,0.562446,0.653103,0.589544,0.965546,467,343,283,9634
8,0.07,181,13,HGBR|x2|0.07-181-13,mean,frwd01_mon_metro_hvi_pct_chg,test,0.003254,0.942332,0.565752,0.647021,0.591769,0.965926,470,339,279,9638
5,0.11,159,14,HGBR|x2|0.11-159-14,mean,frwd01_mon_metro_hvi_pct_chg,test,0.003278,0.944103,0.581804,0.643229,0.600574,0.968096,467,317,282,9660
6,0.14,181,17,HGBR|x2|0.14-181-17,mean,frwd01_mon_metro_hvi_pct_chg,test,0.003232,0.942549,0.56684,0.64237,0.59249,0.966511,467,333,283,9644
11,0.08,115,13,HGBR|x2|0.08-115-13,mean,frwd01_mon_metro_hvi_pct_chg,test,0.003245,0.944942,0.58615,0.637399,0.597818,0.969871,460,300,290,9677
2,0.05,165,6,HGBR|x2|0.05-165-6,mean,frwd01_mon_metro_hvi_pct_chg,test,0.003217,0.946651,0.619766,0.597783,0.605571,0.972674,447,270,302,9708
10,0.04,134,11,HGBR|x2|0.04-134-11,mean,frwd01_mon_metro_hvi_pct_chg,test,0.00319,0.946122,0.612578,0.595423,0.599173,0.97258,443,271,306,9706
3,0.04,155,6,HGBR|x2|0.04-155-6,mean,frwd01_mon_metro_hvi_pct_chg,test,0.00321,0.947179,0.628219,0.581083,0.6018,0.974592,434,251,315,9727
7,0.02,100,16,HGBR|x2|0.02-100-16,mean,frwd01_mon_metro_hvi_pct_chg,test,0.003307,0.948359,0.688272,0.472511,0.55901,0.984671,347,151,402,9826


In [23]:
#test results for each fold
HGBR_cv_results.sort_values(by=['idx','cv_fold'])

Unnamed: 0,idx,model,cv_fold,y,dataset,mae,accuracy,precision,recall,F1,specificity,tp,fp,fn,tn
0,0,HGBR,mean,frwd01_mon_metro_hvi_pct_chg,test,0.003236,0.941586,0.562446,0.653103,0.589544,0.965546,467.0,343.0,283.0,9634.0
12,0,HGBR,samp_1,frwd01_mon_metro_hvi_pct_chg,test,0.001835,0.925708,0.65995,0.498573,0.568022,0.972099,524.0,270.0,527.0,9407.0
24,0,HGBR,samp_2,frwd01_mon_metro_hvi_pct_chg,test,0.004945,0.961037,0.440208,0.727794,0.548596,0.968879,254.0,323.0,95.0,10056.0
36,0,HGBR,samp_3,frwd01_mon_metro_hvi_pct_chg,test,0.00293,0.938013,0.587182,0.732941,0.652015,0.955659,623.0,438.0,227.0,9440.0
1,1,HGBR,mean,frwd01_mon_metro_hvi_pct_chg,test,0.003871,0.942953,0.837631,0.263718,0.388613,0.996772,170.0,32.0,579.0,9945.0
13,1,HGBR,samp_1,frwd01_mon_metro_hvi_pct_chg,test,0.002021,0.917506,0.857759,0.189343,0.31021,0.99659,199.0,33.0,852.0,9644.0
25,1,HGBR,samp_2,frwd01_mon_metro_hvi_pct_chg,test,0.006024,0.97651,0.767956,0.398281,0.524528,0.995953,139.0,42.0,210.0,10337.0
37,1,HGBR,samp_3,frwd01_mon_metro_hvi_pct_chg,test,0.00357,0.934843,0.887179,0.203529,0.3311,0.997773,173.0,22.0,677.0,9856.0
2,2,HGBR,mean,frwd01_mon_metro_hvi_pct_chg,test,0.003217,0.946651,0.619766,0.597783,0.605571,0.972674,447.0,270.0,302.0,9708.0
14,2,HGBR,samp_1,frwd01_mon_metro_hvi_pct_chg,test,0.001856,0.925802,0.649824,0.526166,0.581493,0.969205,553.0,298.0,498.0,9379.0


Summary of cross validation results for Historical Gradient Boosting Model

In [24]:
compare_df[['tp','fp','fn','tn']] = compare_df[['tp','fp','fn','tn']].astype(int)
compare_df[compare_df.model.str.contains("HGBR")].sort_values('recall', ascending=False).drop(columns=['y'])

Unnamed: 0,model,cv_fold,dataset,mae,accuracy,precision,recall,F1,specificity,tp,fp,fn,tn
0,HGBR|x1|0.1-102-20,mean,test,0.003676,0.928132,0.484059,0.659046,0.538951,0.950359,474,495,275,9482
4,HGBR|x2|0.12-122-18,mean,test,0.003284,0.941337,0.560742,0.656879,0.58893,0.964717,472,352,277,9626
0,HGBR|x2|0.1-102-20,mean,test,0.003236,0.941586,0.562446,0.653103,0.589544,0.965546,467,343,283,9634
4,HGBR|x1|0.12-122-18,mean,test,0.003726,0.929623,0.488598,0.651575,0.538822,0.952645,468,473,281,9504
0,HGBR|x0|0.1-102-20,mean,test,0.003309,0.939411,0.546813,0.649848,0.58218,0.96297,468,368,281,9609
6,HGBR|x0|0.14-181-17,mean,test,0.003239,0.940654,0.558574,0.649626,0.590403,0.963636,474,361,275,9616
5,HGBR|x1|0.11-159-14,mean,test,0.003706,0.928909,0.484103,0.649582,0.535163,0.951836,469,482,280,9496
4,HGBR|x0|0.12-122-18,mean,test,0.003347,0.941151,0.556751,0.649514,0.588403,0.964934,468,349,282,9628
8,HGBR|x2|0.07-181-13,mean,test,0.003254,0.942332,0.565752,0.647021,0.591769,0.965926,470,339,279,9638
8,HGBR|x0|0.07-181-13,mean,test,0.003324,0.944879,0.590451,0.644869,0.60605,0.968714,469,311,280,9667


*****
**Random forest model**
*****
Tune random forest regressor model and assess results 

In [25]:
#create function to generate prediction and assess results from fitted model
def model_predict(model, results_df, df, set_name, pred_cols, y_var=y_var, m_name=m_name, fold=fold):
    X = df[pred_cols]
    
    df['y_pred_pct'] = model.predict(X)
    df['y_true'] = np.where(df[y_var] >= 0, 1, 0)
    df['y_pred'] = np.where(df['y_pred_pct'] >= 0, 1, 0)
    cm = confusion_matrix_class(confusion_matrix(df['y_true'], df['y_pred']))
    error = mae(df[y_var], df['y_pred_pct'])

    new_record = create_results_record(model=m_name, 
                                       cv_fold=fold,
                                       y_var=y_var, 
                                       dataset=set_name, 
                                       cm=cm,
                                       error=error)
    results_df = pd.concat([results_df, pd.DataFrame([new_record])], ignore_index=True)
    
    return(results_df)

In [26]:
#create function to train and test random forest model 
def RF(model_name, X_vars, y_var, cv_folds, RF_results_df, params):
    
    for idx in range(len(params)):
    
        n_est = params['n_estimators'][idx]
        min_samp_lf = params['min_samples_leaf'][idx]
        max_f = params['max_features'][idx]
        m_name = model_name+"|"+str(n_est)+"-"+str(min_samp_lf)+"-"+str(max_f)
        
        for i in range(len(cv_folds)):

            cv_fold = cv_folds.loc[i]
            fold = cv_fold['fold']
            train_df = sample_dfs[cv_fold['train_df']]
            test_df = sample_dfs[cv_fold['test_df']]

            #identify and drop columns with over 75% of data missing in training dataset
            drop_cols = train_df.isna().sum().divide(len(train_df)).sort_values(ascending=False).reset_index().rename(columns={'index':'col',0:'missing'})
            #print(drop_cols.to_string())
            drop_cols['drop'] = np.where(drop_cols['missing'] > .70, "drop", "keep")
            drop_cols = list(drop_cols[drop_cols['drop'] == 'drop'].col)
            train_df = train_df.drop(columns=drop_cols)
            test_df = test_df.drop(columns=drop_cols)
            X_cols_rf = [col for col in X_vars if col not in drop_cols]

            #define training set
            train_df = train_df.replace(np.inf, np.nan)
            train_df = train_df.dropna()
            X = train_df[X_cols_rf[7:]]
            y = train_df[y_var]

            #define test data
            test_df = test_df.replace(np.inf, np.nan)
            test_df = test_df.dropna()
            X_test = test_df[X_cols_rf[7:]]
            y_test = test_df[y_var]

            rf_model = RandomForestRegressor(n_estimators=n_est, 
                                             min_samples_leaf=params['min_samples_leaf'][idx],
                                             max_features=params['max_features'][idx],
                                             n_jobs=-1, 
                                             random_state=42)
            rf_model.fit(X, y)
            
            RF_results_df = model_predict(rf_model, RF_results_df, train_df, "train", X_cols_rf[7:], y_var, m_name, fold)
            RF_results_df = model_predict(rf_model, RF_results_df, test_df, "test", X_cols_rf[7:], y_var, m_name, fold)

    return(RF_results_df)

In [27]:
#set parameter ranges for tuning RF model
random.seed(42)

k=15
n_estimators_rdm = np.array(random.choices(range(10,101),k=k))
min_samples_leaf_rdm = np.array(random.choices(range(20,101),k=k))
max_features_rdm = np.array(random.choices(['sqrt','log2',None],k=k))

RF_params_df = pd.DataFrame({'n_estimators':n_estimators_rdm,
                             'min_samples_leaf':min_samples_leaf_rdm,
                             'max_features':max_features_rdm})

In [28]:
#test with all X columns
start_time = time.time()
RF_results_df = pd.DataFrame()

RF_results_df = RF(model_name='RF|x0', X_vars=X_cols, y_var=y_var, cv_folds=cv_folds, RF_results_df=RF_results_df, params=RF_params_df)

mean_df = RF_results_df.groupby(['model','y','dataset'])[RF_results_df.select_dtypes(include=np.number).columns.tolist()].mean().reset_index()
mean_df['cv_fold'] = 'mean'
RF_results_df = pd.concat([RF_results_df, mean_df], ignore_index=True)
RF_results_df = RF_results_df.sort_values(['y','cv_fold','dataset']).drop_duplicates(subset=['cv_fold','y','dataset','model']).reset_index().drop(columns='index')

compare_df = pd.concat([compare_df, 
                        RF_results_df[(RF_results_df['cv_fold'] == 'mean') & 
                                      (RF_results_df['dataset'] == 'test')]],
                       axis=0)

end_time = time.time()
execution_duration = end_time - start_time
print(f"Execution time: {execution_duration:.2f} seconds")

RF_results_df.head(k)

Execution time: 70.03 seconds


Unnamed: 0,model,cv_fold,y,dataset,mae,accuracy,precision,recall,F1,specificity,tp,fp,fn,tn
0,RF|x0|12-27-sqrt,mean,frwd01_mon_metro_hvi_pct_chg,test,0.004699,0.960557,0.064985,0.320635,0.147737,0.963654,14.0,179.666667,15.333333,4723.666667
1,RF|x0|12-37-None,mean,frwd01_mon_metro_hvi_pct_chg,test,0.002922,0.96334,0.44829,0.48711,0.444424,0.9775,85.0,108.666667,71.0,4668.0
2,RF|x0|12-97-log2,mean,frwd01_mon_metro_hvi_pct_chg,test,0.005425,0.960734,0.0,,,0.960734,0.0,193.666667,0.0,4739.0
3,RF|x0|17-47-log2,mean,frwd01_mon_metro_hvi_pct_chg,test,0.005249,0.959832,0.001323,0.125,0.007812,0.960788,0.333333,193.333333,4.666667,4734.333333
4,RF|x0|28-88-sqrt,mean,frwd01_mon_metro_hvi_pct_chg,test,0.004944,0.959916,0.0,0.0,,0.960729,0.0,193.666667,4.0,4735.0
5,RF|x0|29-47-None,mean,frwd01_mon_metro_hvi_pct_chg,test,0.002955,0.966384,0.419671,0.551055,0.438684,0.97549,75.0,118.666667,46.666667,4692.333333
6,RF|x0|30-85-None,mean,frwd01_mon_metro_hvi_pct_chg,test,0.002991,0.967292,0.394694,0.554894,0.437985,0.975101,74.333333,119.333333,42.333333,4696.666667
7,RF|x0|35-67-log2,mean,frwd01_mon_metro_hvi_pct_chg,test,0.005163,0.960734,0.0,,,0.960734,0.0,193.666667,0.0,4739.0
8,RF|x0|48-32-None,mean,frwd01_mon_metro_hvi_pct_chg,test,0.002919,0.963461,0.470024,0.483248,0.454103,0.97814,88.666667,105.0,74.0,4665.0
9,RF|x0|55-27-sqrt,mean,frwd01_mon_metro_hvi_pct_chg,test,0.004691,0.960153,0.047619,0.264706,0.225,0.963209,12.0,181.666667,15.0,4724.0


In [29]:
RF_results_df.sort_values(by=['dataset','model','cv_fold']).head(k*4)

Unnamed: 0,model,cv_fold,y,dataset,mae,accuracy,precision,recall,F1,specificity,tp,fp,fn,tn
0,RF|x0|12-27-sqrt,mean,frwd01_mon_metro_hvi_pct_chg,test,0.004699,0.960557,0.064985,0.320635,0.147737,0.963654,14.0,179.666667,15.333333,4723.666667
42,RF|x0|12-27-sqrt,samp_1,frwd01_mon_metro_hvi_pct_chg,test,0.001655,0.945205,0.162698,0.585714,0.254658,0.951044,41.0,211.0,29.0,4099.0
72,RF|x0|12-27-sqrt,samp_2,frwd01_mon_metro_hvi_pct_chg,test,0.007865,0.990392,0.032258,0.055556,0.040816,0.993845,1.0,30.0,17.0,4844.0
102,RF|x0|12-27-sqrt,samp_3,frwd01_mon_metro_hvi_pct_chg,test,0.004578,0.946073,0.0,,,0.946073,0.0,298.0,0.0,5228.0
1,RF|x0|12-37-None,mean,frwd01_mon_metro_hvi_pct_chg,test,0.002922,0.96334,0.44829,0.48711,0.444424,0.9775,85.0,108.666667,71.0,4668.0
31,RF|x0|12-37-None,samp_1,frwd01_mon_metro_hvi_pct_chg,test,0.001399,0.946119,0.547619,0.530769,0.539062,0.97233,138.0,114.0,122.0,4006.0
61,RF|x0|12-37-None,samp_2,frwd01_mon_metro_hvi_pct_chg,test,0.004187,0.988962,0.451613,0.27451,0.341463,0.996488,14.0,17.0,37.0,4824.0
91,RF|x0|12-37-None,samp_3,frwd01_mon_metro_hvi_pct_chg,test,0.003179,0.95494,0.345638,0.656051,0.452747,0.96368,103.0,195.0,54.0,5174.0
2,RF|x0|12-97-log2,mean,frwd01_mon_metro_hvi_pct_chg,test,0.005425,0.960734,0.0,,,0.960734,0.0,193.666667,0.0,4739.0
39,RF|x0|12-97-log2,samp_1,frwd01_mon_metro_hvi_pct_chg,test,0.002049,0.942466,0.0,,,0.942466,0.0,252.0,0.0,4128.0


In [30]:
#test with first subset of X columns
start_time = time.time()
RF_results_df = pd.DataFrame()

RF_results_df = RF(model_name='RF|x1', X_vars=X_cols_1, y_var=y_var, cv_folds=cv_folds, RF_results_df=RF_results_df, params=RF_params_df)

mean_df = RF_results_df.groupby(['model','y','dataset'])[RF_results_df.select_dtypes(include=np.number).columns.tolist()].mean().reset_index()
mean_df['cv_fold'] = 'mean'
RF_results_df = pd.concat([RF_results_df, mean_df], ignore_index=True)
RF_results_df = RF_results_df.sort_values(['y','cv_fold','dataset']).drop_duplicates(subset=['cv_fold','y','dataset','model']).reset_index().drop(columns='index')

compare_df = pd.concat([compare_df, 
                        RF_results_df[(RF_results_df['cv_fold'] == 'mean') & 
                                      (RF_results_df['dataset'] == 'test')]],
                       axis=0)

end_time = time.time()
execution_duration = end_time - start_time
print(f"Execution time: {execution_duration:.2f} seconds")

RF_results_df.head(k)

Execution time: 25.01 seconds


Unnamed: 0,model,cv_fold,y,dataset,mae,accuracy,precision,recall,F1,specificity,tp,fp,fn,tn
0,RF|x1|12-27-sqrt,mean,frwd01_mon_metro_hvi_pct_chg,test,0.005147,0.960053,0.027948,0.291667,0.070018,0.961678,4.666667,189.0,8.0,4731.0
1,RF|x1|12-37-None,mean,frwd01_mon_metro_hvi_pct_chg,test,0.003015,0.962894,0.421444,0.496381,0.417634,0.976119,77.0,116.666667,65.333333,4673.666667
2,RF|x1|12-97-log2,mean,frwd01_mon_metro_hvi_pct_chg,test,0.005238,0.960734,0.0,,,0.960734,0.0,193.666667,0.0,4739.0
3,RF|x1|17-47-log2,mean,frwd01_mon_metro_hvi_pct_chg,test,0.005317,0.960189,0.0,0.0,,0.960731,0.0,193.666667,2.666667,4736.333333
4,RF|x1|28-88-sqrt,mean,frwd01_mon_metro_hvi_pct_chg,test,0.005298,0.960734,0.0,,,0.960734,0.0,193.666667,0.0,4739.0
5,RF|x1|29-47-None,mean,frwd01_mon_metro_hvi_pct_chg,test,0.003057,0.962835,0.426293,0.493962,0.415685,0.975835,76.0,117.666667,64.666667,4674.333333
6,RF|x1|30-85-None,mean,frwd01_mon_metro_hvi_pct_chg,test,0.003056,0.966786,0.391543,0.544725,0.428489,0.974935,73.333333,120.333333,44.0,4695.0
7,RF|x1|35-67-log2,mean,frwd01_mon_metro_hvi_pct_chg,test,0.005194,0.960734,0.0,,,0.960734,0.0,193.666667,0.0,4739.0
8,RF|x1|48-32-None,mean,frwd01_mon_metro_hvi_pct_chg,test,0.003045,0.962971,0.449408,0.474769,0.436668,0.977558,85.333333,108.333333,73.333333,4665.666667
9,RF|x1|55-27-sqrt,mean,frwd01_mon_metro_hvi_pct_chg,test,0.005117,0.960357,0.025303,0.347222,0.063997,0.961559,4.0,189.666667,6.0,4733.0


In [31]:
RF_results_df.sort_values(by=['dataset','model','cv_fold']).head(k*4)

Unnamed: 0,model,cv_fold,y,dataset,mae,accuracy,precision,recall,F1,specificity,tp,fp,fn,tn
0,RF|x1|12-27-sqrt,mean,frwd01_mon_metro_hvi_pct_chg,test,0.005147,0.960053,0.027948,0.291667,0.070018,0.961678,4.666667,189.0,8.0,4731.0
42,RF|x1|12-27-sqrt,samp_1,frwd01_mon_metro_hvi_pct_chg,test,0.001821,0.942466,0.051587,0.5,0.093525,0.945108,13.0,239.0,13.0,4115.0
72,RF|x1|12-27-sqrt,samp_2,frwd01_mon_metro_hvi_pct_chg,test,0.009473,0.991619,0.032258,0.083333,0.046512,0.993852,1.0,30.0,11.0,4850.0
102,RF|x1|12-27-sqrt,samp_3,frwd01_mon_metro_hvi_pct_chg,test,0.004147,0.946073,0.0,,,0.946073,0.0,298.0,0.0,5228.0
1,RF|x1|12-37-None,mean,frwd01_mon_metro_hvi_pct_chg,test,0.003015,0.962894,0.421444,0.496381,0.417634,0.976119,77.0,116.666667,65.333333,4673.666667
31,RF|x1|12-37-None,samp_1,frwd01_mon_metro_hvi_pct_chg,test,0.001439,0.946119,0.547619,0.530769,0.539062,0.97233,138.0,114.0,122.0,4006.0
61,RF|x1|12-37-None,samp_2,frwd01_mon_metro_hvi_pct_chg,test,0.004309,0.988348,0.451613,0.259259,0.329412,0.996486,14.0,17.0,40.0,4821.0
91,RF|x1|12-37-None,samp_3,frwd01_mon_metro_hvi_pct_chg,test,0.003296,0.954216,0.265101,0.699115,0.384428,0.959542,79.0,219.0,34.0,5194.0
2,RF|x1|12-97-log2,mean,frwd01_mon_metro_hvi_pct_chg,test,0.005238,0.960734,0.0,,,0.960734,0.0,193.666667,0.0,4739.0
39,RF|x1|12-97-log2,samp_1,frwd01_mon_metro_hvi_pct_chg,test,0.002001,0.942466,0.0,,,0.942466,0.0,252.0,0.0,4128.0


In [32]:
#test with second subset of X columns
start_time = time.time()
RF_results_df = pd.DataFrame()

RF_results_df = RF(model_name='RF|x2', X_vars=X_cols_2, y_var=y_var, cv_folds=cv_folds, RF_results_df=RF_results_df, params=RF_params_df)

mean_df = RF_results_df.groupby(['model','y','dataset'])[RF_results_df.select_dtypes(include=np.number).columns.tolist()].mean().reset_index()
mean_df['cv_fold'] = 'mean'
RF_results_df = pd.concat([RF_results_df, mean_df], ignore_index=True)
RF_results_df = RF_results_df.sort_values(['y','cv_fold','dataset']).drop_duplicates(subset=['cv_fold','y','dataset','model']).reset_index().drop(columns='index')

compare_df = pd.concat([compare_df, 
                        RF_results_df[(RF_results_df['cv_fold'] == 'mean') & 
                                      (RF_results_df['dataset'] == 'test')]],
                       axis=0)

end_time = time.time()
execution_duration = end_time - start_time
print(f"Execution time: {execution_duration:.2f} seconds")

RF_results_df.head(k)

Execution time: 44.03 seconds


Unnamed: 0,model,cv_fold,y,dataset,mae,accuracy,precision,recall,F1,specificity,tp,fp,fn,tn
0,RF|x2|12-27-sqrt,mean,frwd01_mon_metro_hvi_pct_chg,test,0.00461,0.960345,0.048942,0.293651,0.234921,0.963303,12.333333,181.333333,14.666667,4724.333333
1,RF|x2|12-37-None,mean,frwd01_mon_metro_hvi_pct_chg,test,0.002915,0.963031,0.438223,0.484683,0.432267,0.976978,82.0,111.666667,69.666667,4669.333333
2,RF|x2|12-97-log2,mean,frwd01_mon_metro_hvi_pct_chg,test,0.005124,0.960734,0.0,,,0.960734,0.0,193.666667,0.0,4739.0
3,RF|x2|17-47-log2,mean,frwd01_mon_metro_hvi_pct_chg,test,0.005066,0.961715,0.046467,0.821429,0.123087,0.962691,9.333333,184.333333,5.0,4734.0
4,RF|x2|28-88-sqrt,mean,frwd01_mon_metro_hvi_pct_chg,test,0.004772,0.959576,0.0,0.0,,0.960727,0.0,193.666667,5.666667,4733.333333
5,RF|x2|29-47-None,mean,frwd01_mon_metro_hvi_pct_chg,test,0.002947,0.965358,0.414617,0.532042,0.428189,0.975726,76.0,117.666667,52.333333,4686.666667
6,RF|x2|30-85-None,mean,frwd01_mon_metro_hvi_pct_chg,test,0.00299,0.966914,0.39022,0.548766,0.430939,0.97486,73.0,120.666667,43.0,4696.0
7,RF|x2|35-67-log2,mean,frwd01_mon_metro_hvi_pct_chg,test,0.004882,0.960734,0.0,,,0.960734,0.0,193.666667,0.0,4739.0
8,RF|x2|48-32-None,mean,frwd01_mon_metro_hvi_pct_chg,test,0.002908,0.963173,0.457034,0.476395,0.442685,0.977954,87.666667,106.0,74.333333,4664.666667
9,RF|x2|55-27-sqrt,mean,frwd01_mon_metro_hvi_pct_chg,test,0.004428,0.960697,0.079706,0.364072,0.171829,0.963962,15.333333,178.333333,16.333333,4722.666667


In [33]:
RF_results_df.sort_values(by=['dataset','model','cv_fold']).head(k*4)

Unnamed: 0,model,cv_fold,y,dataset,mae,accuracy,precision,recall,F1,specificity,tp,fp,fn,tn
0,RF|x2|12-27-sqrt,mean,frwd01_mon_metro_hvi_pct_chg,test,0.00461,0.960345,0.048942,0.293651,0.234921,0.963303,12.333333,181.333333,14.666667,4724.333333
42,RF|x2|12-27-sqrt,samp_1,frwd01_mon_metro_hvi_pct_chg,test,0.001642,0.944977,0.146825,0.587302,0.234921,0.950197,37.0,215.0,26.0,4102.0
72,RF|x2|12-27-sqrt,samp_2,frwd01_mon_metro_hvi_pct_chg,test,0.008132,0.989984,0.0,0.0,,0.99364,0.0,31.0,18.0,4843.0
102,RF|x2|12-27-sqrt,samp_3,frwd01_mon_metro_hvi_pct_chg,test,0.004054,0.946073,0.0,,,0.946073,0.0,298.0,0.0,5228.0
1,RF|x2|12-37-None,mean,frwd01_mon_metro_hvi_pct_chg,test,0.002915,0.963031,0.438223,0.484683,0.432267,0.976978,82.0,111.666667,69.666667,4669.333333
31,RF|x2|12-37-None,samp_1,frwd01_mon_metro_hvi_pct_chg,test,0.001396,0.946347,0.547619,0.532819,0.540117,0.972337,138.0,114.0,121.0,4007.0
61,RF|x2|12-37-None,samp_2,frwd01_mon_metro_hvi_pct_chg,test,0.004189,0.988348,0.451613,0.259259,0.329412,0.996486,14.0,17.0,40.0,4821.0
91,RF|x2|12-37-None,samp_3,frwd01_mon_metro_hvi_pct_chg,test,0.003159,0.954397,0.315436,0.661972,0.427273,0.96211,94.0,204.0,48.0,5180.0
2,RF|x2|12-97-log2,mean,frwd01_mon_metro_hvi_pct_chg,test,0.005124,0.960734,0.0,,,0.960734,0.0,193.666667,0.0,4739.0
39,RF|x2|12-97-log2,samp_1,frwd01_mon_metro_hvi_pct_chg,test,0.001868,0.942466,0.0,,,0.942466,0.0,252.0,0.0,4128.0


Summary of cross validation results for Random Forest Model

In [34]:
compare_df[['tp','fp','fn','tn']] = compare_df[['tp','fp','fn','tn']].astype(int)
compare_df[compare_df.model.str.contains("RF")].sort_values('recall', ascending=False).drop(columns=['y'])

Unnamed: 0,model,cv_fold,dataset,mae,accuracy,precision,recall,F1,specificity,tp,fp,fn,tn
3,RF|x2|17-47-log2,mean,test,0.005066,0.961715,0.046467,0.821429,0.123087,0.962691,9,184,5,4734
6,RF|x0|30-85-None,mean,test,0.002991,0.967292,0.394694,0.554894,0.437985,0.975101,74,119,42,4696
5,RF|x0|29-47-None,mean,test,0.002955,0.966384,0.419671,0.551055,0.438684,0.97549,75,118,46,4692
6,RF|x2|30-85-None,mean,test,0.00299,0.966914,0.39022,0.548766,0.430939,0.97486,73,120,43,4696
14,RF|x2|91-76-None,mean,test,0.002971,0.966823,0.389812,0.547351,0.430224,0.974827,73,120,43,4695
6,RF|x1|30-85-None,mean,test,0.003056,0.966786,0.391543,0.544725,0.428489,0.974935,73,120,44,4695
10,RF|x2|68-64-None,mean,test,0.002958,0.966678,0.39093,0.543295,0.427591,0.974886,73,120,44,4694
14,RF|x0|91-76-None,mean,test,0.002974,0.96671,0.394286,0.542615,0.433476,0.975052,74,119,45,4693
10,RF|x0|68-64-None,mean,test,0.002961,0.966513,0.402801,0.54228,0.434472,0.975002,74,119,46,4693
10,RF|x1|68-64-None,mean,test,0.003065,0.966324,0.399446,0.539924,0.430267,0.974824,73,120,46,4693


*****
**Final comparison**
*****
Identify which model optimizes for recall

In [35]:
compare_df = compare_df.sort_values('recall', ascending=False).reset_index().drop(columns='index').drop_duplicates()
compare_df[['tp','fp','fn','tn']] = compare_df[['tp','fp','fn','tn']].astype(int)
compare_df.head(25)

Unnamed: 0,model,cv_fold,y,dataset,mae,accuracy,precision,recall,F1,specificity,tp,fp,fn,tn
0,RF|x2|17-47-log2,mean,frwd01_mon_metro_hvi_pct_chg,test,0.005066,0.961715,0.046467,0.821429,0.123087,0.962691,9,184,5,4734
1,HGBR|x1|0.1-102-20,mean,frwd01_mon_metro_hvi_pct_chg,test,0.003676,0.928132,0.484059,0.659046,0.538951,0.950359,474,495,275,9482
2,HGBR|x2|0.12-122-18,mean,frwd01_mon_metro_hvi_pct_chg,test,0.003284,0.941337,0.560742,0.656879,0.58893,0.964717,472,352,277,9626
3,HGBR|x2|0.1-102-20,mean,frwd01_mon_metro_hvi_pct_chg,test,0.003236,0.941586,0.562446,0.653103,0.589544,0.965546,467,343,283,9634
4,HGBR|x1|0.12-122-18,mean,frwd01_mon_metro_hvi_pct_chg,test,0.003726,0.929623,0.488598,0.651575,0.538822,0.952645,468,473,281,9504
5,HGBR|x0|0.1-102-20,mean,frwd01_mon_metro_hvi_pct_chg,test,0.003309,0.939411,0.546813,0.649848,0.58218,0.96297,468,368,281,9609
6,HGBR|x0|0.14-181-17,mean,frwd01_mon_metro_hvi_pct_chg,test,0.003239,0.940654,0.558574,0.649626,0.590403,0.963636,474,361,275,9616
7,HGBR|x1|0.11-159-14,mean,frwd01_mon_metro_hvi_pct_chg,test,0.003706,0.928909,0.484103,0.649582,0.535163,0.951836,469,482,280,9496
8,HGBR|x0|0.12-122-18,mean,frwd01_mon_metro_hvi_pct_chg,test,0.003347,0.941151,0.556751,0.649514,0.588403,0.964934,468,349,282,9628
9,HGBR|x2|0.07-181-13,mean,frwd01_mon_metro_hvi_pct_chg,test,0.003254,0.942332,0.565752,0.647021,0.591769,0.965926,470,339,279,9638


In [36]:
compare_df.head(1)

Unnamed: 0,model,cv_fold,y,dataset,mae,accuracy,precision,recall,F1,specificity,tp,fp,fn,tn
0,RF|x2|17-47-log2,mean,frwd01_mon_metro_hvi_pct_chg,test,0.005066,0.961715,0.046467,0.821429,0.123087,0.962691,9,184,5,4734


Identify which model optimizes for F1 score

In [37]:
compare_df = compare_df.sort_values('F1', ascending=False)
compare_df.head(25)

Unnamed: 0,model,cv_fold,y,dataset,mae,accuracy,precision,recall,F1,specificity,tp,fp,fn,tn
10,HGBR|x0|0.07-181-13,mean,frwd01_mon_metro_hvi_pct_chg,test,0.003324,0.944879,0.590451,0.644869,0.60605,0.968714,469,311,280,9667
22,HGBR|x2|0.05-165-6,mean,frwd01_mon_metro_hvi_pct_chg,test,0.003217,0.946651,0.619766,0.597783,0.605571,0.972674,447,270,302,9708
14,HGBR|x0|0.11-159-14,mean,frwd01_mon_metro_hvi_pct_chg,test,0.003252,0.944848,0.587991,0.639749,0.604945,0.969382,462,304,287,9673
19,HGBR|x0|0.05-165-6,mean,frwd01_mon_metro_hvi_pct_chg,test,0.003199,0.945439,0.603217,0.60925,0.602745,0.970748,454,289,296,9688
26,HGBR|x2|0.04-155-6,mean,frwd01_mon_metro_hvi_pct_chg,test,0.00321,0.947179,0.628219,0.581083,0.6018,0.974592,434,251,315,9727
24,HGBR|x0|0.04-155-6,mean,frwd01_mon_metro_hvi_pct_chg,test,0.003201,0.94634,0.621013,0.588611,0.601612,0.973423,437,262,313,9715
11,HGBR|x2|0.11-159-14,mean,frwd01_mon_metro_hvi_pct_chg,test,0.003278,0.944103,0.581804,0.643229,0.600574,0.968096,467,317,282,9660
15,HGBR|x0|0.08-115-13,mean,frwd01_mon_metro_hvi_pct_chg,test,0.003264,0.944041,0.578704,0.639617,0.599886,0.968169,466,316,284,9661
23,HGBR|x2|0.04-134-11,mean,frwd01_mon_metro_hvi_pct_chg,test,0.00319,0.946122,0.612578,0.595423,0.599173,0.97258,443,271,306,9706
21,HGBR|x0|0.04-134-11,mean,frwd01_mon_metro_hvi_pct_chg,test,0.003224,0.945004,0.605621,0.602765,0.598458,0.970786,449,289,301,9689


In [38]:
compare_df.head(1)

Unnamed: 0,model,cv_fold,y,dataset,mae,accuracy,precision,recall,F1,specificity,tp,fp,fn,tn
10,HGBR|x0|0.07-181-13,mean,frwd01_mon_metro_hvi_pct_chg,test,0.003324,0.944879,0.590451,0.644869,0.60605,0.968714,469,311,280,9667


Identify which model optimizes for mean absolute error

In [48]:
compare_df = compare_df.sort_values('mae', ascending=True)
print(compare_df.to_string())

                  model cv_fold                             y dataset       mae  accuracy  precision    recall        F1  specificity   tp   fp   fn    tn
45     RF|x2|48-32-None    mean  frwd01_mon_metro_hvi_pct_chg    test  0.002908  0.963173   0.457034  0.476395  0.442685     0.977954   87  106   74  4664
43     RF|x2|12-37-None    mean  frwd01_mon_metro_hvi_pct_chg    test  0.002915  0.963031   0.438223  0.484683  0.432267     0.976978   82  111   69  4669
44     RF|x0|48-32-None    mean  frwd01_mon_metro_hvi_pct_chg    test  0.002919  0.963461   0.470024  0.483248  0.454103     0.978140   88  105   74  4665
42     RF|x0|12-37-None    mean  frwd01_mon_metro_hvi_pct_chg    test  0.002922  0.963340   0.448290  0.487110  0.444424     0.977500   85  108   71  4668
38     RF|x2|29-47-None    mean  frwd01_mon_metro_hvi_pct_chg    test  0.002947  0.965358   0.414617  0.532042  0.428189     0.975726   76  117   52  4686
29     RF|x0|29-47-None    mean  frwd01_mon_metro_hvi_pct_chg    test 

In [40]:
compare_df[compare_df.tp > 5].head(25)

Unnamed: 0,model,cv_fold,y,dataset,mae,accuracy,precision,recall,F1,specificity,tp,fp,fn,tn
45,RF|x2|48-32-None,mean,frwd01_mon_metro_hvi_pct_chg,test,0.002908,0.963173,0.457034,0.476395,0.442685,0.977954,87,106,74,4664
43,RF|x2|12-37-None,mean,frwd01_mon_metro_hvi_pct_chg,test,0.002915,0.963031,0.438223,0.484683,0.432267,0.976978,82,111,69,4669
44,RF|x0|48-32-None,mean,frwd01_mon_metro_hvi_pct_chg,test,0.002919,0.963461,0.470024,0.483248,0.454103,0.97814,88,105,74,4665
42,RF|x0|12-37-None,mean,frwd01_mon_metro_hvi_pct_chg,test,0.002922,0.96334,0.44829,0.48711,0.444424,0.9775,85,108,71,4668
38,RF|x2|29-47-None,mean,frwd01_mon_metro_hvi_pct_chg,test,0.002947,0.965358,0.414617,0.532042,0.428189,0.975726,76,117,52,4686
29,RF|x0|29-47-None,mean,frwd01_mon_metro_hvi_pct_chg,test,0.002955,0.966384,0.419671,0.551055,0.438684,0.97549,75,118,46,4692
33,RF|x2|68-64-None,mean,frwd01_mon_metro_hvi_pct_chg,test,0.002958,0.966678,0.39093,0.543295,0.427591,0.974886,73,120,44,4694
35,RF|x0|68-64-None,mean,frwd01_mon_metro_hvi_pct_chg,test,0.002961,0.966513,0.402801,0.54228,0.434472,0.975002,74,119,46,4693
31,RF|x2|91-76-None,mean,frwd01_mon_metro_hvi_pct_chg,test,0.002971,0.966823,0.389812,0.547351,0.430224,0.974827,73,120,43,4695
34,RF|x0|91-76-None,mean,frwd01_mon_metro_hvi_pct_chg,test,0.002974,0.96671,0.394286,0.542615,0.433476,0.975052,74,119,45,4693


In [41]:
#model with lowest mae will be used

In [42]:
compare_df.head(1)

Unnamed: 0,model,cv_fold,y,dataset,mae,accuracy,precision,recall,F1,specificity,tp,fp,fn,tn
45,RF|x2|48-32-None,mean,frwd01_mon_metro_hvi_pct_chg,test,0.002908,0.963173,0.457034,0.476395,0.442685,0.977954,87,106,74,4664


*****
**Train and test selected model with validation set**
*****
Selected model details

Estimator: Random Forest

Set of predictor variables: Set 2 (all pct change lag and normalized amounts for current period)

Parameters:

    n_estimators = 48
    
    min_samples_leaf = 32
    
    max_features = None

In [49]:
#test best model with validation dataset
start_time = time.time()
val_results_df = pd.DataFrame()
val_params_df = pd.DataFrame({'n_estimators':[48],
                             'min_samples_leaf':[32],
                             'max_features':[None]})

val_results_df = RF(model_name='RF|x2', X_vars=X_cols_2, y_var=y_var, cv_folds=val, RF_results_df=val_results_df, params=val_params_df)

end_time = time.time()
execution_duration = end_time - start_time
print(f"Execution time: {execution_duration:.2f} seconds")

val_results_df.sort_values('dataset')

Execution time: 5.98 seconds


Unnamed: 0,model,cv_fold,y,dataset,mae,accuracy,precision,recall,F1,specificity,tp,fp,fn,tn
1,RF|x2|48-32-None,val,frwd01_mon_metro_hvi_pct_chg,test,0.002576,0.849621,0.660153,0.943076,0.776651,0.813773,1723,887,104,3876
0,RF|x2|48-32-None,val,frwd01_mon_metro_hvi_pct_chg,train,0.001225,0.972988,0.518029,0.833656,0.638992,0.977101,431,401,86,17111


In [44]:
print("training y | mean:",format(sample_dfs[val['train_df'][0]][y_var].mean(),".5%"),
      "  std:",format(sample_dfs[val['train_df'][0]][y_var].std(),".5%"),
      "  mae:",format(val_results_df['mae'][0]/(sample_dfs[val['test_df'][0]][y_var].std()),".0%"),"of std")
print("test y | mean:",format(sample_dfs[val['test_df'][0]][y_var].mean(),".5%"),
      "  std:",format(sample_dfs[val['test_df'][0]][y_var].std(),".5%"),
      "  mae:",format(val_results_df['mae'][1]/(sample_dfs[val['test_df'][0]][y_var].std()),".0%"),"of std")

training y | mean: 0.75662%   std: 0.69519%   mae: 19% of std
test y | mean: 0.08406%   std: 0.63287%   mae: 41% of std


**Validation test results**
 - The model was able to predict the percentage of home value change within 41% of one standard deviation of the actual home value change for the test period.
 - Out of the 1,827 periods where home value declined, the model correctly predicted a decline 94.3% of the time (recall).
 - Out of the 2,610 periods where the model predicted a decline, only 66% of the time did a decline actually occur (precision).
 - Out of the 4,763 periods where home value increased, the model correctly predicted an increase 81.4% of the time (specificity).