In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import pickle
import time

import random
from sklearn.ensemble import HistGradientBoostingRegressor, RandomForestRegressor
from sklearn.metrics import confusion_matrix

In [2]:
#Load (unpickle) the dictionary of DataFrames from the file
with open('../Data/test_train_datasets.pkl', 'rb') as f:
    sample_dfs = pickle.load(f)  # 'rb' for reading in binary mode

sample_dfs.keys()

dict_keys(['metro_samp_1', 'metro_samp_1_train', 'metro_samp_1_test', 'metro_samp_2', 'metro_samp_2_train', 'metro_samp_2_test', 'metro_samp_3', 'metro_samp_3_train', 'metro_samp_3_test', 'metro_samp_val', 'metro_samp_val_train', 'metro_samp_val_test', 'metro_samp_1_train_normalized', 'metro_samp_1_test_normalized', 'metro_samp_2_train_normalized', 'metro_samp_2_test_normalized', 'metro_samp_3_train_normalized', 'metro_samp_3_test_normalized', 'metro_samp_val_train_normalized', 'metro_samp_val_test_normalized'])

In [3]:
X_cols = [col for col in sample_dfs['metro_samp_val_train_normalized'].columns if "frwd" not in col]
y_cols = [col for col in sample_dfs['metro_samp_val_train_normalized'].columns if "frwd" in col]

In [50]:
#set folds for cross validation testing
cv_folds = pd.DataFrame({'fold':['samp_1','samp_2','samp_3'],
                         'train_df':['metro_samp_1_train_normalized','metro_samp_2_train_normalized','metro_samp_3_train_normalized'],
                         'test_df':['metro_samp_1_test_normalized','metro_samp_2_test_normalized','metro_samp_3_test_normalized']})

val = pd.DataFrame({'fold':['val'],
                    'train_df':['metro_samp_val_train_normalized'],
                    'test_df':['metro_samp_val_test_normalized']})

In [5]:
#create function to produce confusion matrix from test results
def conf_matrix(test_results):
    pos = test_results[test_results.y_test < 0]
    true_pos = pos[pos.y_pred < 0]
    false_neg = pos[pos.y_pred >= 0]

    neg = test_results[test_results.y_test >= 0]
    true_neg = neg[neg.y_pred >= 0]
    false_pos = neg[neg.y_pred < 0]

    confusion_matrix = np.array([[len(true_pos), len(false_pos)],
                                [len(false_neg), len(true_neg)]])

    confusion_matrix_pct = confusion_matrix / len(test_results)
    
    return(confusion_matrix, confusion_matrix_pct)

In [6]:
#create class to generate key statistics from confusion matrix
class confusion_matrix_class:
    def __init__(self, cm):
        self.cm = cm
        self.tp = cm[0,0]
        self.fp = cm[0,1]
        self.fn = cm[1,0]
        self.tn = cm[1,1]
        self.accuracy = ((self.tp+self.tn)/(self.tp+self.fp+self.fn+self.tn))
        self.precision = (self.tp/(self.tp+self.fp))
        self.recall = (self.tp/(self.tp+self.fn))
        self.F1 = (2*self.precision*self.recall)/(self.precision+self.recall)
        self.specificity = (self.tn/(self.tn+self.fp))

In [7]:
compare_df = pd.DataFrame()

#create function to produce consistent results for comparison across models
def create_results_record(model, cv_fold, y_var, dataset, cm):                
    new_record_dict = {'model': model,
                      'cv_fold': cv_fold,
                      'y': y_var,
                      'dataset': dataset,
                      'accuracy': cm.accuracy,
                      'precision': cm.precision,
                      'recall': cm.recall,
                      'F1': cm.F1,
                      'specificity': cm.specificity,
                      'tp': cm.tp,
                      'fp': cm.fp,
                      'fn': cm.fn,
                      'tn': cm.tn}
    return(new_record_dict)

In [8]:
HGBR_results = pd.DataFrame()

#create function to test Gradient Boosting model
def HGBR(cv_fold, 
         X_vars=X_cols,
         y_var='frwd01_mon_metro_hvi_pct_chg',
         learning_rate=.1,
         min_samples_leaf=20,
         max_depth=None,
         seed=42, 
         HGBR_results_df=HGBR_results):

    fold = cv_fold['fold']
    train_df = sample_dfs[cv_fold['train_df']]
    test_df = sample_dfs[cv_fold['test_df']]
    
    #define training set
    train_df = train_df.dropna(subset=[y_var])
    X = train_df[X_vars[7:]]
    y = train_df[y_var]

    #train and score model with training data
    hgbr = HistGradientBoostingRegressor(learning_rate=learning_rate,
                                         min_samples_leaf=min_samples_leaf,
                                         max_depth=max_depth,
                                         random_state=seed)
    hgbr.fit(X, y)
    train_score = hgbr.score(X, y)

    #predict and score with test data
    test_df = test_df.dropna(subset=[y_var])
    X_test = test_df[X_vars[7:]]
    y_test = test_df[y_var]
    test_score = hgbr.score(X_test, y_test)
    y_pred = hgbr.predict(X_test)
    
    #create categorization of prediction
    results_index = test_df[X_vars[:7]].reset_index()
    results = pd.DataFrame(zip(y_test,y_pred), columns=['y_test','y_pred']).reset_index()
    test_results = pd.concat([results_index, results], axis=1)
    test_results['y_diff'] = test_results['y_pred'] - test_results['y_test']
    test_results['direction'] = ((test_results['y_test']*test_results['y_pred'])/
                                  abs(test_results['y_test']*test_results['y_pred']))
    test_results['direction'] = np.where(test_results['direction'] == 1, "same", "diff")
    cm, cm_pct = conf_matrix(test_results)
    hgbr_cm = confusion_matrix_class(cm)

    #add record to test results df
    new_record = create_results_record(model='HGBR', 
                                       cv_fold=fold, 
                                       y_var=y_var, 
                                       dataset='test', 
                                       cm=hgbr_cm)
                  
    HGBR_results_df = pd.concat([HGBR_results_df, pd.DataFrame([new_record])], ignore_index=True).drop_duplicates()
    HGBR_results_df[['tp','fp','fn','tn']] = HGBR_results_df[['tp','fp','fn','tn']].astype(int)
    
    return(HGBR_results_df, test_results)

In [9]:
y_focus = [col for col in y_cols if "mon_metro_hvi_pct_chg" in col]

In [10]:
for y in y_focus:

    for i in range(len(cv_folds)):

        cv_fold = cv_folds.loc[i]
        HGBR_results, test_results = HGBR(cv_fold,
                                          y_var=y,
                                          HGBR_results_df=HGBR_results)

In [11]:
mean_df = HGBR_results.groupby(['model','y','dataset'])[HGBR_results.select_dtypes(include=np.number).columns.tolist()].mean().reset_index()
mean_df['cv_fold'] = 'mean'

HGBR_results = pd.concat([HGBR_results, mean_df], ignore_index=True)
HGBR_results = HGBR_results.sort_values(['y','cv_fold']).drop_duplicates(subset=['cv_fold','y','dataset']).reset_index().drop(columns='index')
HGBR_results.sort_values(by=['recall'])

Unnamed: 0,model,cv_fold,y,dataset,accuracy,precision,recall,F1,specificity,tp,fp,fn,tn
22,HGBR,samp_2,frwd07_mon_metro_hvi_pct_chg,test,0.881213,0.481544,0.159091,0.239167,0.977226,287.0,309.0,1517.0,13259.0
21,HGBR,samp_1,frwd07_mon_metro_hvi_pct_chg,test,0.812808,0.455056,0.176663,0.254517,0.953283,486.0,582.0,2265.0,11876.0
17,HGBR,samp_1,frwd06_mon_metro_hvi_pct_chg,test,0.802617,0.498615,0.18006,0.264576,0.955528,540.0,543.0,2459.0,11667.0
20,HGBR,mean,frwd07_mon_metro_hvi_pct_chg,test,0.861414,0.462008,0.206192,0.280284,0.963545,518.333333,616.333333,1921.666667,16067.333333
18,HGBR,samp_2,frwd06_mon_metro_hvi_pct_chg,test,0.866445,0.410329,0.234694,0.298599,0.953516,437.0,628.0,1425.0,12882.0
16,HGBR,mean,frwd06_mon_metro_hvi_pct_chg,test,0.852883,0.469287,0.242178,0.315642,0.956746,632.666667,698.666667,1972.333333,15820.0
13,HGBR,samp_1,frwd04_mon_metro_hvi_pct_chg,test,0.789007,0.58362,0.245655,0.34577,0.948541,848.0,605.0,2604.0,11152.0
23,HGBR,samp_3,frwd07_mon_metro_hvi_pct_chg,test,0.89022,0.449425,0.282821,0.34717,0.960125,782.0,958.0,1983.0,23067.0
19,HGBR,samp_3,frwd06_mon_metro_hvi_pct_chg,test,0.889586,0.498917,0.311781,0.38375,0.961193,921.0,925.0,2033.0,22911.0
14,HGBR,samp_2,frwd04_mon_metro_hvi_pct_chg,test,0.86534,0.483565,0.326914,0.3901,0.947029,662.0,707.0,1363.0,12640.0


Will focus on predicting whether price will increase or decrease in the following month

# Tuning

Set y variable for focus of analysis

In [12]:
y_var = 'frwd01_mon_metro_hvi_pct_chg'

Identify variations of features to use for tuning

In [13]:
#set X features (all pct change lag from 1 month, selective other pct lag, clusters, and normalized amounts for current period)
X_cols_1 = (X_cols[:7]+
            X_cols[X_cols.index('metro_for_sale_inventory_normalized'):
                   X_cols.index('prev01_mon_metro_for_sale_inventory')]+
            X_cols[X_cols.index('prev01_mon_metro_for_sale_inventory_pct_chg'):
                   X_cols.index('prev01_mon_state_job_openings_szn_adjd_pct_chg')]+
            ['prev04_mon_metro_hvi']+
            ['prev04_mon_metro_rent']+
            ['prev04_mon_state_job_openings_pct_chg']+
            ['prev04_mon_state_population_pct_chg']+
            ['prev04_mon_state_personal_income_pct_chg']+
            ['prev07_mon_metro_hvi']+
            ['prev07_mon_metro_rent']+
            ['prev07_mon_state_job_openings_pct_chg']+
            ['prev07_mon_state_population_pct_chg']+
            ['prev07_mon_state_personal_income_pct_chg']+
            ['prev12_mon_metro_hvi']+
            ['prev12_mon_metro_rent']+
            ['prev12_mon_state_job_openings_pct_chg']+
            ['prev12_mon_state_population_pct_chg']+
            ['prev12_mon_state_personal_income_pct_chg']+
            X_cols[-5:])
X_cols_1.remove('state_job_openings_szn_adjd_normalized')
X_cols_1.remove('state_personal_income_per_capita_normalized')
X_cols_1.remove('prev01_mon_state_personal_income_per_capita_pct_chg')

In [14]:
#set X features (all pct change lag and normalized amounts for current period)
X_cols_2 = (X_cols[:7]+
            X_cols[X_cols.index('metro_for_sale_inventory_normalized'):
                   X_cols.index('prev01_mon_metro_for_sale_inventory')]+
            X_cols[X_cols.index('prev01_mon_metro_for_sale_inventory_pct_chg'):
                   X_cols.index('prev01_mon_state_job_openings_szn_adjd_pct_chg')]+
            X_cols[X_cols.index('prev04_mon_metro_for_sale_inventory_pct_chg'):
                   X_cols.index('prev04_mon_state_job_openings_szn_adjd_pct_chg')]+
            X_cols[X_cols.index('prev07_mon_metro_for_sale_inventory_pct_chg'):
                   X_cols.index('prev07_mon_state_job_openings_szn_adjd_pct_chg')]+
            X_cols[X_cols.index('prev12_mon_metro_for_sale_inventory_pct_chg'):
                   X_cols.index('prev12_mon_state_job_openings_szn_adjd_pct_chg')])

In [15]:
#X_cols

*****
**Baseline model**
*****
Create baseline model (prediction constant as average % change for each metro area)

In [16]:
start_time = time.time()
baseline_results_df = pd.DataFrame()

for i in range(len(cv_folds)):
    #set datasets for cross validation fold
    cv_fold = cv_folds.loc[i]
    fold = cv_fold['fold']
    train_df = sample_dfs[cv_fold['train_df']]
    test_df = sample_dfs[cv_fold['test_df']]

    #train model
    baseline_y_pred = train_df.groupby('metro_id')[y_var].agg(['mean','median']).reset_index()
    train_df = train_df.merge(baseline_y_pred, on='metro_id')

    #use mean for baseline
    train_df['y_true'] = np.where(train_df[y_var] >= 0, 0, 1)
    train_df['y_pred'] = np.where(train_df['mean'] >= 0, 0, 1)
    baseline_cm = confusion_matrix_class(confusion_matrix(train_df['y_true'], train_df['y_pred']))

    new_record = create_results_record(model="baseline_mean", 
                                       cv_fold=fold,
                                       y_var=y_var, 
                                       dataset="train", 
                                       cm=baseline_cm)
    baseline_results_df = pd.concat([baseline_results_df, pd.DataFrame([new_record])], ignore_index=True)

    #test model
    test_df = test_df.merge(baseline_y_pred, on='metro_id')
    test_df['y_true'] = np.where(test_df[y_var] >= 0, 1, 0)
    test_df['y_pred'] = np.where(test_df['mean'] >= 0, 1, 0)
    baseline_cm = confusion_matrix_class(confusion_matrix(test_df['y_true'], test_df['y_pred']))

    new_record = create_results_record(model="baseline_mean", 
                                       cv_fold=fold,
                                       y_var=y_var, 
                                       dataset="test", 
                                       cm=baseline_cm)
    baseline_results_df = pd.concat([baseline_results_df, pd.DataFrame([new_record])], ignore_index=True)
    
    
mean_df = baseline_results_df.groupby(['model','y','dataset'])[baseline_results_df.select_dtypes(include=np.number).columns.tolist()].mean().reset_index()
mean_df['cv_fold'] = 'mean'

baseline_results_df = pd.concat([baseline_results_df, mean_df], ignore_index=True)
baseline_results_df = baseline_results_df.sort_values(['dataset','y','cv_fold']).drop_duplicates(subset=['cv_fold','y','dataset']).reset_index().drop(columns='index')

compare_df = pd.concat([compare_df, 
                        baseline_results_df[(baseline_results_df['cv_fold'] == 'mean') & 
                                            (baseline_results_df['dataset'] == 'test')]],
                       axis=0)
compare_df[['tp','fp','fn','tn']] = compare_df[['tp','fp','fn','tn']].astype(int)
compare_df = compare_df.drop_duplicates()

end_time = time.time()
execution_duration = end_time - start_time
print(f"Execution time: {execution_duration:.2f} seconds")

baseline_results_df

Execution time: 0.63 seconds


Unnamed: 0,model,cv_fold,y,dataset,accuracy,precision,recall,F1,specificity,tp,fp,fn,tn
0,baseline_mean,mean,frwd01_mon_metro_hvi_pct_chg,test,0.725072,0.22442,0.249211,0.23468,0.821662,788.333333,2731.666667,2307.666667,13326.333333
1,baseline_mean,samp_1,frwd01_mon_metro_hvi_pct_chg,test,0.653715,0.284658,0.29648,0.290449,0.765941,1078.0,2709.0,2558.0,8865.0
2,baseline_mean,samp_2,frwd01_mon_metro_hvi_pct_chg,test,0.738355,0.210057,0.199846,0.204824,0.847574,518.0,1948.0,2074.0,10832.0
3,baseline_mean,samp_3,frwd01_mon_metro_hvi_pct_chg,test,0.783147,0.178547,0.251307,0.208769,0.851469,769.0,3538.0,2291.0,20282.0
4,baseline_mean,mean,frwd01_mon_metro_hvi_pct_chg,train,0.576721,0.933031,0.525365,0.672084,0.819368,68030.333333,4694.333333,61695.666667,21585.666667
5,baseline_mean,samp_1,frwd01_mon_metro_hvi_pct_chg,train,0.605939,0.913555,0.532535,0.672849,0.839598,55472.0,5249.0,48694.0,27475.0
6,baseline_mean,samp_2,frwd01_mon_metro_hvi_pct_chg,train,0.571825,0.929178,0.525,0.67092,0.802701,67095.0,5114.0,60705.0,20806.0
7,baseline_mean,samp_3,frwd01_mon_metro_hvi_pct_chg,train,0.552399,0.956361,0.518561,0.672485,0.815805,81524.0,3720.0,75688.0,16476.0


*****
**Historical Gradient Boosting Model (HGBM)**
*****
Tune historical gradient boosting model and assess results 

In [17]:
#set parameter ranges for tuning HGBR model
random.seed(42)
learning_rate_rdm = np.array(random.choices(range(0,15),k=10)) / 100 + .01
min_samples_leaf_rdm = np.array(random.choices(range(100,201),k=10))
max_depth_rdm = np.array(random.choices(range(5,21),k=10))

params_df = pd.DataFrame({'learning_rate':learning_rate_rdm,
                          'min_samples_leaf':min_samples_leaf_rdm,
                          'max_depth':max_depth_rdm})

In [18]:
#test with all X columns
start_time = time.time()
HGBR_cv_results = pd.DataFrame(columns=['idx'])
X_vars = X_cols
m_name = 'HGBR|x0|'

for idx in range(10):
    for i in range(len(cv_folds)):

        cv_fold = cv_folds.loc[i]
        HGBR_cv_results, test_results = HGBR(cv_fold,
                                             X_vars=X_vars,
                                             y_var=y_var,
                                             learning_rate=learning_rate_rdm[idx],
                                             min_samples_leaf=min_samples_leaf_rdm[idx],
                                             max_depth=max_depth_rdm[idx],
                                             HGBR_results_df=HGBR_cv_results)
    
    HGBR_cv_results['idx'] = HGBR_cv_results['idx'].fillna(str(idx).zfill(2))
    mean_df = HGBR_cv_results.groupby(['model','y','dataset','idx'])[HGBR_cv_results.select_dtypes(include=np.number).columns.tolist()].mean().reset_index()
    mean_df['cv_fold'] = 'mean'
    

    HGBR_cv_results = pd.concat([HGBR_cv_results, mean_df], ignore_index=True)
    HGBR_cv_results = HGBR_cv_results.sort_values(['y','cv_fold','dataset']).drop_duplicates(subset=['cv_fold','y','idx','dataset']).reset_index().drop(columns='index')


cv_results = pd.concat([params_df,HGBR_cv_results[HGBR_cv_results.cv_fold == 'mean']], axis=1).drop(columns=['idx'])
cv_results['model'] = m_name+cv_results['learning_rate'].round(2).astype(str)+"-"+cv_results['min_samples_leaf'].astype(str)+"-"+cv_results['max_depth'].astype(str)
cv_results[['tp','fp','fn','tn']] = cv_results[['tp','fp','fn','tn']].astype(int)

compare_df = pd.concat([compare_df, 
                        cv_results.drop(columns=['learning_rate','min_samples_leaf','max_depth'])],
                       axis=0).drop_duplicates()

end_time = time.time()
execution_duration = end_time - start_time
print(f"Execution time: {execution_duration:.2f} seconds")

cv_results.sort_values('recall', ascending=False)

Execution time: 368.73 seconds


Unnamed: 0,learning_rate,min_samples_leaf,max_depth,model,cv_fold,y,dataset,accuracy,precision,recall,F1,specificity,tp,fp,fn,tn
7,0.02,159,6,HGBR|x0|0.02-159-6,mean,frwd01_mon_metro_hvi_pct_chg,test,0.794638,0.473873,0.734303,0.572662,0.808568,2556,2796,933,12838
8,0.07,181,18,HGBR|x0|0.07-181-18,mean,frwd01_mon_metro_hvi_pct_chg,test,0.800359,0.477358,0.733019,0.576322,0.815277,2580,2899,909,12734
3,0.04,120,7,HGBR|x0|0.04-120-7,mean,frwd01_mon_metro_hvi_pct_chg,test,0.803003,0.481359,0.721689,0.577099,0.82029,2534,2720,955,12913
2,0.05,102,10,HGBR|x0|0.05-102-10,mean,frwd01_mon_metro_hvi_pct_chg,test,0.817962,0.511574,0.71594,0.595756,0.83988,2519,2456,970,13178
5,0.11,155,10,HGBR|x0|0.11-155-10,mean,frwd01_mon_metro_hvi_pct_chg,test,0.814739,0.512255,0.710716,0.589789,0.837219,2525,2621,964,13013
4,0.12,165,20,HGBR|x0|0.12-165-20,mean,frwd01_mon_metro_hvi_pct_chg,test,0.82019,0.521712,0.698989,0.592076,0.845946,2494,2446,995,13187
0,0.1,122,17,HGBR|x0|0.1-122-17,mean,frwd01_mon_metro_hvi_pct_chg,test,0.8187,0.514717,0.69368,0.588103,0.84558,2463,2413,1026,13220
6,0.14,122,6,HGBR|x0|0.14-122-6,mean,frwd01_mon_metro_hvi_pct_chg,test,0.833986,0.547755,0.693563,0.610869,0.864878,2456,2050,1033,13584
1,0.01,151,16,HGBR|x0|0.01-151-16,mean,frwd01_mon_metro_hvi_pct_chg,test,0.835024,0.601629,0.599109,0.586291,0.885107,2086,1470,1403,14163
9,0.01,100,14,HGBR|x0|0.01-100-14,mean,frwd01_mon_metro_hvi_pct_chg,test,0.830657,0.59521,0.589586,0.576825,0.881427,2055,1509,1434,14125


In [19]:
#test with first subset of X columns
start_time = time.time()
HGBR_cv_results = pd.DataFrame(columns=['idx'])
X_vars = X_cols_1
m_name = 'HGBR|x1|'

for idx in range(10):
    for i in range(len(cv_folds)):

        cv_fold = cv_folds.loc[i]
        HGBR_cv_results, test_results = HGBR(cv_fold,
                                             X_vars=X_vars,
                                             y_var=y_var,
                                             learning_rate=learning_rate_rdm[idx],
                                             min_samples_leaf=min_samples_leaf_rdm[idx],
                                             max_depth=max_depth_rdm[idx],
                                             HGBR_results_df=HGBR_cv_results)
    
    HGBR_cv_results['idx'] = HGBR_cv_results['idx'].fillna(str(idx).zfill(2))
    mean_df = HGBR_cv_results.groupby(['model','y','dataset','idx'])[HGBR_cv_results.select_dtypes(include=np.number).columns.tolist()].mean().reset_index()
    mean_df['cv_fold'] = 'mean'
    

    HGBR_cv_results = pd.concat([HGBR_cv_results, mean_df], ignore_index=True)
    HGBR_cv_results = HGBR_cv_results.sort_values(['y','cv_fold','dataset']).drop_duplicates(subset=['cv_fold','y','idx','dataset']).reset_index().drop(columns='index')


cv_results = pd.concat([params_df,HGBR_cv_results[HGBR_cv_results.cv_fold == 'mean']], axis=1).drop(columns=['idx'])
cv_results['model'] = m_name+cv_results['learning_rate'].round(2).astype(str)+"-"+cv_results['min_samples_leaf'].astype(str)+"-"+cv_results['max_depth'].astype(str)
cv_results[['tp','fp','fn','tn']] = cv_results[['tp','fp','fn','tn']].astype(int)

compare_df = pd.concat([compare_df, 
                        cv_results.drop(columns=['learning_rate','min_samples_leaf','max_depth'])],
                       axis=0).drop_duplicates()

end_time = time.time()
execution_duration = end_time - start_time
print(f"Execution time: {execution_duration:.2f} seconds")

cv_results.sort_values('recall', ascending=False)

Execution time: 118.43 seconds


Unnamed: 0,learning_rate,min_samples_leaf,max_depth,model,cv_fold,y,dataset,accuracy,precision,recall,F1,specificity,tp,fp,fn,tn
7,0.02,159,6,HGBR|x1|0.02-159-6,mean,frwd01_mon_metro_hvi_pct_chg,test,0.790946,0.462651,0.734987,0.5656,0.804238,2562,2965,927,12669
2,0.05,102,10,HGBR|x1|0.05-102-10,mean,frwd01_mon_metro_hvi_pct_chg,test,0.821525,0.520265,0.726953,0.606013,0.841318,2553,2385,936,13248
8,0.07,181,18,HGBR|x1|0.07-181-18,mean,frwd01_mon_metro_hvi_pct_chg,test,0.814111,0.502865,0.724706,0.593537,0.833376,2541,2478,948,13156
3,0.04,120,7,HGBR|x1|0.04-120-7,mean,frwd01_mon_metro_hvi_pct_chg,test,0.818082,0.513262,0.722239,0.599951,0.838714,2525,2360,964,13273
4,0.12,165,20,HGBR|x1|0.12-165-20,mean,frwd01_mon_metro_hvi_pct_chg,test,0.821003,0.516059,0.705228,0.595268,0.846788,2486,2334,1003,13299
0,0.1,122,17,HGBR|x1|0.1-122-17,mean,frwd01_mon_metro_hvi_pct_chg,test,0.821623,0.521923,0.703413,0.598962,0.847129,2472,2273,1017,13361
5,0.11,155,10,HGBR|x1|0.11-155-10,mean,frwd01_mon_metro_hvi_pct_chg,test,0.830279,0.540665,0.701046,0.609433,0.858561,2467,2150,1022,13484
6,0.14,122,6,HGBR|x1|0.14-122-6,mean,frwd01_mon_metro_hvi_pct_chg,test,0.833591,0.558164,0.676151,0.609323,0.866282,2398,1946,1091,13687
1,0.01,151,16,HGBR|x1|0.01-151-16,mean,frwd01_mon_metro_hvi_pct_chg,test,0.833657,0.595026,0.61087,0.588151,0.880401,2123,1536,1366,14097
9,0.01,100,14,HGBR|x1|0.01-100-14,mean,frwd01_mon_metro_hvi_pct_chg,test,0.831488,0.593792,0.605165,0.58388,0.878639,2105,1549,1384,14085


In [20]:
#test with second subset of X columns
start_time = time.time()
HGBR_cv_results = pd.DataFrame(columns=['idx'])
X_vars = X_cols_2
m_name = 'HGBR|x2|'

for idx in range(10):
    for i in range(len(cv_folds)):

        cv_fold = cv_folds.loc[i]
        HGBR_cv_results, test_results = HGBR(cv_fold,
                                             X_vars=X_vars,
                                             y_var=y_var,
                                             learning_rate=learning_rate_rdm[idx],
                                             min_samples_leaf=min_samples_leaf_rdm[idx],
                                             max_depth=max_depth_rdm[idx],
                                             HGBR_results_df=HGBR_cv_results)
    
    HGBR_cv_results['idx'] = HGBR_cv_results['idx'].fillna(str(idx).zfill(2))
    mean_df = HGBR_cv_results.groupby(['model','y','dataset','idx'])[HGBR_cv_results.select_dtypes(include=np.number).columns.tolist()].mean().reset_index()
    mean_df['cv_fold'] = 'mean'
    

    HGBR_cv_results = pd.concat([HGBR_cv_results, mean_df], ignore_index=True)
    HGBR_cv_results = HGBR_cv_results.sort_values(['y','cv_fold','dataset']).drop_duplicates(subset=['cv_fold','y','idx','dataset']).reset_index().drop(columns='index')


cv_results = pd.concat([params_df,HGBR_cv_results[HGBR_cv_results.cv_fold == 'mean']], axis=1).drop(columns=['idx'])
cv_results['model'] = m_name+cv_results['learning_rate'].round(2).astype(str)+"-"+cv_results['min_samples_leaf'].astype(str)+"-"+cv_results['max_depth'].astype(str)
cv_results[['tp','fp','fn','tn']] = cv_results[['tp','fp','fn','tn']].astype(int)

compare_df = pd.concat([compare_df, 
                        cv_results.drop(columns=['learning_rate','min_samples_leaf','max_depth'])],
                       axis=0).drop_duplicates()

end_time = time.time()
execution_duration = end_time - start_time
print(f"Execution time: {execution_duration:.2f} seconds")

cv_results.sort_values('recall', ascending=False)

Execution time: 173.73 seconds


Unnamed: 0,learning_rate,min_samples_leaf,max_depth,model,cv_fold,y,dataset,accuracy,precision,recall,F1,specificity,tp,fp,fn,tn
7,0.02,159,6,HGBR|x2|0.02-159-6,mean,frwd01_mon_metro_hvi_pct_chg,test,0.79402,0.472557,0.734321,0.571796,0.807743,2558,2812,931,12822
3,0.04,120,7,HGBR|x2|0.04-120-7,mean,frwd01_mon_metro_hvi_pct_chg,test,0.799319,0.474738,0.732812,0.575205,0.813957,2571,2851,918,12782
8,0.07,181,18,HGBR|x2|0.07-181-18,mean,frwd01_mon_metro_hvi_pct_chg,test,0.785902,0.454628,0.730101,0.557348,0.798742,2569,3213,920,12420
2,0.05,102,10,HGBR|x2|0.05-102-10,mean,frwd01_mon_metro_hvi_pct_chg,test,0.819253,0.515471,0.715767,0.597638,0.841521,2522,2457,967,13177
5,0.11,155,10,HGBR|x2|0.11-155-10,mean,frwd01_mon_metro_hvi_pct_chg,test,0.817366,0.518918,0.701974,0.592046,0.842471,2488,2490,1001,13143
0,0.1,122,17,HGBR|x2|0.1-122-17,mean,frwd01_mon_metro_hvi_pct_chg,test,0.810436,0.499407,0.697548,0.577935,0.834576,2479,2643,1010,12990
6,0.14,122,6,HGBR|x2|0.14-122-6,mean,frwd01_mon_metro_hvi_pct_chg,test,0.830881,0.538165,0.691659,0.604852,0.861594,2449,2077,1040,13556
4,0.12,165,20,HGBR|x2|0.12-165-20,mean,frwd01_mon_metro_hvi_pct_chg,test,0.816215,0.520806,0.688622,0.585437,0.843792,2447,2550,1042,13083
1,0.01,151,16,HGBR|x2|0.01-151-16,mean,frwd01_mon_metro_hvi_pct_chg,test,0.835732,0.603402,0.598434,0.58693,0.886122,2084,1457,1405,14176
9,0.01,100,14,HGBR|x2|0.01-100-14,mean,frwd01_mon_metro_hvi_pct_chg,test,0.832526,0.599066,0.591531,0.580178,0.883339,2060,1485,1429,14149


In [21]:
compare_df.sort_values(by=['recall'], ascending=False).head(15)

Unnamed: 0,model,cv_fold,y,dataset,accuracy,precision,recall,F1,specificity,tp,fp,fn,tn
7,HGBR|x1|0.02-159-6,mean,frwd01_mon_metro_hvi_pct_chg,test,0.790946,0.462651,0.734987,0.5656,0.804238,2562,2965,927,12669
7,HGBR|x2|0.02-159-6,mean,frwd01_mon_metro_hvi_pct_chg,test,0.79402,0.472557,0.734321,0.571796,0.807743,2558,2812,931,12822
7,HGBR|x0|0.02-159-6,mean,frwd01_mon_metro_hvi_pct_chg,test,0.794638,0.473873,0.734303,0.572662,0.808568,2556,2796,933,12838
8,HGBR|x0|0.07-181-18,mean,frwd01_mon_metro_hvi_pct_chg,test,0.800359,0.477358,0.733019,0.576322,0.815277,2580,2899,909,12734
3,HGBR|x2|0.04-120-7,mean,frwd01_mon_metro_hvi_pct_chg,test,0.799319,0.474738,0.732812,0.575205,0.813957,2571,2851,918,12782
8,HGBR|x2|0.07-181-18,mean,frwd01_mon_metro_hvi_pct_chg,test,0.785902,0.454628,0.730101,0.557348,0.798742,2569,3213,920,12420
2,HGBR|x1|0.05-102-10,mean,frwd01_mon_metro_hvi_pct_chg,test,0.821525,0.520265,0.726953,0.606013,0.841318,2553,2385,936,13248
8,HGBR|x1|0.07-181-18,mean,frwd01_mon_metro_hvi_pct_chg,test,0.814111,0.502865,0.724706,0.593537,0.833376,2541,2478,948,13156
3,HGBR|x1|0.04-120-7,mean,frwd01_mon_metro_hvi_pct_chg,test,0.818082,0.513262,0.722239,0.599951,0.838714,2525,2360,964,13273
3,HGBR|x0|0.04-120-7,mean,frwd01_mon_metro_hvi_pct_chg,test,0.803003,0.481359,0.721689,0.577099,0.82029,2534,2720,955,12913


*****
**Random forest model**
*****
Tune random forest regressor model and assess results 

In [23]:
#create function to generate prediction and assess results from fitted model
def model_predict(model, results_df, df, set_name, pred_cols, y_var=y_var, m_name=m_name, fold=fold):
    X = df[pred_cols]
    
    df['y_pred_pct'] = model.predict(X)
    df['y_true'] = np.where(df[y_var] >= 0, 1, 0)
    df['y_pred'] = np.where(df['y_pred_pct'] >= 0, 1, 0)
    cm = confusion_matrix_class(confusion_matrix(df['y_true'], df['y_pred']))

    new_record = create_results_record(model=m_name, 
                                       cv_fold=fold,
                                       y_var=y_var, 
                                       dataset=set_name, 
                                       cm=cm)
    results_df = pd.concat([results_df, pd.DataFrame([new_record])], ignore_index=True)
    
    return(results_df)

In [26]:
#create function to train and test random forest model 
def RF(model_name, X_vars, y_var, cv_folds, RF_results_df, params):
    
    for idx in range(len(params)):
    
        n_est = params['n_estimators'][idx]
        min_samp_lf = params['min_samples_leaf'][idx]
        max_f = params['max_features'][idx]
        m_name = model_name+"|"+str(n_est)+"-"+str(min_samp_lf)+"-"+str(max_f)
        
        for i in range(len(cv_folds)):

            cv_fold = cv_folds.loc[i]
            fold = cv_fold['fold']
            train_df = sample_dfs[cv_fold['train_df']]
            test_df = sample_dfs[cv_fold['test_df']]

            #identify and drop columns with over 75% of data missing in training dataset
            drop_cols = train_df.isna().sum().divide(len(train_df)).sort_values(ascending=False).reset_index().rename(columns={'index':'col',0:'missing'})
            #print(drop_cols.to_string())
            drop_cols['drop'] = np.where(drop_cols['missing'] > .70, "drop", "keep")
            drop_cols = list(drop_cols[drop_cols['drop'] == 'drop'].col)
            train_df = train_df.drop(columns=drop_cols)
            test_df = test_df.drop(columns=drop_cols)
            X_cols_rf = [col for col in X_vars if col not in drop_cols]

            #define training set
            train_df = train_df.dropna()
            X = train_df[X_cols_rf[7:]]
            y = train_df[y_var]

            #define test data
            test_df = test_df.dropna()
            X_test = test_df[X_cols_rf[7:]]
            y_test = test_df[y_var]

            rf_model = RandomForestRegressor(n_estimators=n_est, 
                                             min_samples_leaf=params['min_samples_leaf'][idx],
                                             max_features=params['max_features'][idx],
                                             n_jobs=-1, 
                                             random_state=42)
            rf_model.fit(X, y)
            
            RF_results_df = model_predict(rf_model, RF_results_df, train_df, "train", X_cols_rf[7:], y_var, m_name, fold)
            RF_results_df = model_predict(rf_model, RF_results_df, test_df, "test", X_cols_rf[7:], y_var, m_name, fold)

    return(RF_results_df)

In [40]:
#set parameter ranges for tuning RF model
random.seed(42)

k=10
n_estimators_rdm = np.array(random.choices(range(10,101),k=k))
min_samples_leaf_rdm = np.array(random.choices(range(20,101),k=k))
max_features_rdm = np.array(random.choices(['sqrt','log2'],k=k))

RF_params_df = pd.DataFrame({'n_estimators':n_estimators_rdm,
                             'min_samples_leaf':min_samples_leaf_rdm,
                             'max_features':max_features_rdm})

In [41]:
#test with all X columns
start_time = time.time()
RF_results_df = pd.DataFrame()

RF_results_df = RF(model_name='RF|x0', X_vars=X_cols, y_var=y_var, cv_folds=cv_folds, RF_results_df=RF_results_df, params=RF_params_df)

mean_df = RF_results_df.groupby(['model','y','dataset'])[RF_results_df.select_dtypes(include=np.number).columns.tolist()].mean().reset_index()
mean_df['cv_fold'] = 'mean'
RF_results_df = pd.concat([RF_results_df, mean_df], ignore_index=True)
RF_results_df = RF_results_df.sort_values(['y','cv_fold','dataset']).drop_duplicates(subset=['cv_fold','y','dataset','model']).reset_index().drop(columns='index')

compare_df = pd.concat([compare_df, 
                        RF_results_df[(RF_results_df['cv_fold'] == 'mean') & 
                                      (RF_results_df['dataset'] == 'test')]],
                       axis=0)

end_time = time.time()
execution_duration = end_time - start_time
print(f"Execution time: {execution_duration:.2f} seconds")

RF_results_df.head(k)

Execution time: 90.62 seconds


Unnamed: 0,model,cv_fold,y,dataset,accuracy,precision,recall,F1,specificity,tp,fp,fn,tn
0,RF|x0|12-20-log2,mean,frwd01_mon_metro_hvi_pct_chg,test,0.864504,0.498854,0.702542,0.579672,0.889198,1672.333333,1769.333333,702.0,14865.0
1,RF|x0|12-60-log2,mean,frwd01_mon_metro_hvi_pct_chg,test,0.867137,0.476715,0.721192,0.572161,0.887311,1623.333333,1818.333333,613.666667,14953.333333
2,RF|x0|17-67-sqrt,mean,frwd01_mon_metro_hvi_pct_chg,test,0.872005,0.517684,0.718851,0.601729,0.894786,1790.0,1651.666667,676.0,14891.0
3,RF|x0|30-36-sqrt,mean,frwd01_mon_metro_hvi_pct_chg,test,0.878411,0.54456,0.739178,0.627091,0.899915,1884.333333,1557.333333,654.333333,14912.666667
4,RF|x0|35-22-sqrt,mean,frwd01_mon_metro_hvi_pct_chg,test,0.879191,0.560553,0.732869,0.634642,0.903509,1949.333333,1492.333333,707.333333,14859.666667
5,RF|x0|48-85-log2,mean,frwd01_mon_metro_hvi_pct_chg,test,0.869082,0.447808,0.755967,0.561573,0.882499,1533.0,1908.666667,480.0,15087.0
6,RF|x0|68-37-log2,mean,frwd01_mon_metro_hvi_pct_chg,test,0.870459,0.47409,0.743022,0.578701,0.886986,1633.666667,1808.0,551.666667,15015.333333
7,RF|x0|71-64-sqrt,mean,frwd01_mon_metro_hvi_pct_chg,test,0.878678,0.524466,0.752056,0.617451,0.897259,1823.666667,1618.0,595.666667,14971.333333
8,RF|x0|77-72-log2,mean,frwd01_mon_metro_hvi_pct_chg,test,0.868794,0.455405,0.749743,0.565519,0.883515,1554.666667,1887.0,505.666667,15061.333333
9,RF|x0|91-37-sqrt,mean,frwd01_mon_metro_hvi_pct_chg,test,0.880479,0.551304,0.745618,0.633754,0.901621,1909.0,1532.666667,643.666667,14923.333333


In [42]:
#test with first subset of X columns
start_time = time.time()
RF_results_df = pd.DataFrame()

RF_results_df = RF(model_name='RF|x1', X_vars=X_cols_1, y_var=y_var, cv_folds=cv_folds, RF_results_df=RF_results_df, params=RF_params_df)

mean_df = RF_results_df.groupby(['model','y','dataset'])[RF_results_df.select_dtypes(include=np.number).columns.tolist()].mean().reset_index()
mean_df['cv_fold'] = 'mean'
RF_results_df = pd.concat([RF_results_df, mean_df], ignore_index=True)
RF_results_df = RF_results_df.sort_values(['y','cv_fold','dataset']).drop_duplicates(subset=['cv_fold','y','dataset','model']).reset_index().drop(columns='index')

compare_df = pd.concat([compare_df, 
                        RF_results_df[(RF_results_df['cv_fold'] == 'mean') & 
                                      (RF_results_df['dataset'] == 'test')]],
                       axis=0)

end_time = time.time()
execution_duration = end_time - start_time
print(f"Execution time: {execution_duration:.2f} seconds")

RF_results_df.head(k)

Execution time: 66.30 seconds


Unnamed: 0,model,cv_fold,y,dataset,accuracy,precision,recall,F1,specificity,tp,fp,fn,tn
0,RF|x1|12-20-log2,mean,frwd01_mon_metro_hvi_pct_chg,test,0.877048,0.636085,0.692261,0.661123,0.917367,2212.0,1229.666667,1022.0,14545.0
1,RF|x1|12-60-log2,mean,frwd01_mon_metro_hvi_pct_chg,test,0.87533,0.568277,0.713415,0.627196,0.906267,1939.333333,1502.333333,782.666667,14784.333333
2,RF|x1|17-67-sqrt,mean,frwd01_mon_metro_hvi_pct_chg,test,0.879782,0.578141,0.724549,0.641843,0.907181,2017.333333,1424.333333,774.0,14793.0
3,RF|x1|30-36-sqrt,mean,frwd01_mon_metro_hvi_pct_chg,test,0.883828,0.61131,0.728866,0.662669,0.912998,2137.0,1304.666667,830.666667,14736.333333
4,RF|x1|35-22-sqrt,mean,frwd01_mon_metro_hvi_pct_chg,test,0.88237,0.625426,0.714688,0.664264,0.915193,2192.666667,1249.0,911.333333,14655.666667
5,RF|x1|48-85-log2,mean,frwd01_mon_metro_hvi_pct_chg,test,0.880615,0.516001,0.767767,0.614366,0.897554,1780.666667,1661.0,541.333333,15025.666667
6,RF|x1|68-37-log2,mean,frwd01_mon_metro_hvi_pct_chg,test,0.883437,0.588605,0.735701,0.651616,0.910289,2057.333333,1384.333333,758.333333,14808.666667
7,RF|x1|71-64-sqrt,mean,frwd01_mon_metro_hvi_pct_chg,test,0.885932,0.59966,0.745334,0.663627,0.911599,2077.0,1364.666667,718.333333,14848.666667
8,RF|x1|77-72-log2,mean,frwd01_mon_metro_hvi_pct_chg,test,0.884482,0.554971,0.764257,0.641458,0.90397,1929.666667,1512.0,601.666667,14965.333333
9,RF|x1|91-37-sqrt,mean,frwd01_mon_metro_hvi_pct_chg,test,0.885685,0.621749,0.7312,0.669675,0.915481,2177.666667,1264.0,828.666667,14738.333333


In [43]:
#test with second subset of X columns
start_time = time.time()
RF_results_df = pd.DataFrame()

RF_results_df = RF(model_name='RF|x2', X_vars=X_cols_2, y_var=y_var, cv_folds=cv_folds, RF_results_df=RF_results_df, params=RF_params_df)

mean_df = RF_results_df.groupby(['model','y','dataset'])[RF_results_df.select_dtypes(include=np.number).columns.tolist()].mean().reset_index()
mean_df['cv_fold'] = 'mean'
RF_results_df = pd.concat([RF_results_df, mean_df], ignore_index=True)
RF_results_df = RF_results_df.sort_values(['y','cv_fold','dataset']).drop_duplicates(subset=['cv_fold','y','dataset','model']).reset_index().drop(columns='index')

compare_df = pd.concat([compare_df, 
                        RF_results_df[(RF_results_df['cv_fold'] == 'mean') & 
                                      (RF_results_df['dataset'] == 'test')]],
                       axis=0)

end_time = time.time()
execution_duration = end_time - start_time
print(f"Execution time: {execution_duration:.2f} seconds")

RF_results_df.head(k)

Execution time: 75.40 seconds


Unnamed: 0,model,cv_fold,y,dataset,accuracy,precision,recall,F1,specificity,tp,fp,fn,tn
0,RF|x2|12-20-log2,mean,frwd01_mon_metro_hvi_pct_chg,test,0.872054,0.605586,0.68947,0.641932,0.911695,2105.333333,1336.333333,976.333333,14590.666667
1,RF|x2|12-60-log2,mean,frwd01_mon_metro_hvi_pct_chg,test,0.880144,0.585383,0.72543,0.64456,0.909844,2048.666667,1393.0,791.333333,14775.666667
2,RF|x2|17-67-sqrt,mean,frwd01_mon_metro_hvi_pct_chg,test,0.880863,0.599633,0.723224,0.653692,0.911684,2073.666667,1368.0,807.666667,14759.333333
3,RF|x2|30-36-sqrt,mean,frwd01_mon_metro_hvi_pct_chg,test,0.881252,0.6143,0.720897,0.660236,0.913657,2154.0,1287.666667,871.333333,14695.666667
4,RF|x2|35-22-sqrt,mean,frwd01_mon_metro_hvi_pct_chg,test,0.881138,0.63134,0.710327,0.666667,0.916445,2203.666667,1238.0,927.666667,14639.333333
5,RF|x2|48-85-log2,mean,frwd01_mon_metro_hvi_pct_chg,test,0.880373,0.56643,0.738879,0.638502,0.905979,1973.666667,1468.0,712.333333,14854.666667
6,RF|x2|68-37-log2,mean,frwd01_mon_metro_hvi_pct_chg,test,0.881142,0.597775,0.72373,0.652308,0.911401,2093.666667,1348.0,812.0,14755.0
7,RF|x2|71-64-sqrt,mean,frwd01_mon_metro_hvi_pct_chg,test,0.883178,0.605229,0.729645,0.660595,0.91213,2105.0,1336.666667,788.333333,14778.666667
8,RF|x2|77-72-log2,mean,frwd01_mon_metro_hvi_pct_chg,test,0.88096,0.580865,0.732097,0.645474,0.908319,2029.666667,1412.0,756.0,14811.0
9,RF|x2|91-37-sqrt,mean,frwd01_mon_metro_hvi_pct_chg,test,0.883223,0.611686,0.728453,0.662817,0.913618,2137.333333,1304.333333,820.333333,14746.666667


In [44]:
compare_df[compare_df.model.str.contains("RF")].sort_values('recall', ascending=False)

Unnamed: 0,model,cv_fold,y,dataset,accuracy,precision,recall,F1,specificity,tp,fp,fn,tn
5,RF|x1|48-85-log2,mean,frwd01_mon_metro_hvi_pct_chg,test,0.880615,0.516001,0.767767,0.614366,0.897554,1780.666667,1661.000000,541.333333,15025.666667
8,RF|x1|77-72-log2,mean,frwd01_mon_metro_hvi_pct_chg,test,0.884482,0.554971,0.764257,0.641458,0.903970,1929.666667,1512.000000,601.666667,14965.333333
5,RF|x0|48-85-log2,mean,frwd01_mon_metro_hvi_pct_chg,test,0.869082,0.447808,0.755967,0.561573,0.882499,1533.000000,1908.666667,480.000000,15087.000000
0,RF|x1|71-64-log2,mean,frwd01_mon_metro_hvi_pct_chg,test,0.882644,0.558208,0.753879,0.637809,0.905119,1958.000000,1483.000000,655.000000,14911.000000
7,RF|x0|71-64-sqrt,mean,frwd01_mon_metro_hvi_pct_chg,test,0.878678,0.524466,0.752056,0.617451,0.897259,1823.666667,1618.000000,595.666667,14971.333333
...,...,...,...,...,...,...,...,...,...,...,...,...,...
60,RF|x0|35-22-None,mean,frwd01_mon_metro_hvi_pct_chg,test,0.851431,0.681499,0.606391,0.639588,0.922122,2365.000000,1076.000000,1610.000000,13956.000000
62,RF|x0|12-20-None,mean,frwd01_mon_metro_hvi_pct_chg,test,0.850441,0.677840,0.600797,0.635550,0.921438,2348.000000,1093.000000,1611.000000,13956.000000
65,RF|x1|77-72-None,mean,frwd01_mon_metro_hvi_pct_chg,test,0.843990,0.694705,0.594419,0.630962,0.924723,2432.000000,1009.000000,1938.000000,13629.000000
68,RF|x1|68-37-None,mean,frwd01_mon_metro_hvi_pct_chg,test,0.836741,0.706550,0.558322,0.622489,0.925153,2443.000000,998.000000,2017.000000,13550.000000


*****
**Final comparison**
*****
Identify which model optimizes for recall

In [45]:
compare_df = compare_df.sort_values('recall', ascending=False).reset_index().drop(columns='index')
compare_df[['tp','fp','fn','tn']] = compare_df[['tp','fp','fn','tn']].astype(int)
compare_df.head(25)

Unnamed: 0,model,cv_fold,y,dataset,accuracy,precision,recall,F1,specificity,tp,fp,fn,tn
0,RF|x1|48-85-log2,mean,frwd01_mon_metro_hvi_pct_chg,test,0.880615,0.516001,0.767767,0.614366,0.897554,1780,1661,541,15025
1,RF|x1|77-72-log2,mean,frwd01_mon_metro_hvi_pct_chg,test,0.884482,0.554971,0.764257,0.641458,0.90397,1929,1512,601,14965
2,RF|x0|48-85-log2,mean,frwd01_mon_metro_hvi_pct_chg,test,0.869082,0.447808,0.755967,0.561573,0.882499,1533,1908,480,15087
3,RF|x1|71-64-log2,mean,frwd01_mon_metro_hvi_pct_chg,test,0.882644,0.558208,0.753879,0.637809,0.905119,1958,1483,655,14911
4,RF|x0|71-64-sqrt,mean,frwd01_mon_metro_hvi_pct_chg,test,0.878678,0.524466,0.752056,0.617451,0.897259,1823,1618,595,14971
5,RF|x0|77-72-log2,mean,frwd01_mon_metro_hvi_pct_chg,test,0.868794,0.455405,0.749743,0.565519,0.883515,1554,1887,505,15061
6,RF|x0|71-64-log2,mean,frwd01_mon_metro_hvi_pct_chg,test,0.867645,0.444269,0.748263,0.556616,0.881904,1522,1919,496,15071
7,RF|x0|91-37-sqrt,mean,frwd01_mon_metro_hvi_pct_chg,test,0.880479,0.551304,0.745618,0.633754,0.901621,1909,1532,643,14923
8,RF|x0|91-37-sqrt,mean,frwd01_mon_metro_hvi_pct_chg,test,0.880479,0.551304,0.745618,0.633754,0.901621,1909,1532,643,14923
9,RF|x1|71-64-sqrt,mean,frwd01_mon_metro_hvi_pct_chg,test,0.885932,0.59966,0.745334,0.663627,0.911599,2077,1364,718,14848


In [46]:
compare_df.head(1)

Unnamed: 0,model,cv_fold,y,dataset,accuracy,precision,recall,F1,specificity,tp,fp,fn,tn
0,RF|x1|48-85-log2,mean,frwd01_mon_metro_hvi_pct_chg,test,0.880615,0.516001,0.767767,0.614366,0.897554,1780,1661,541,15025


Identify which model optimizes for F1 score

In [52]:
compare_df = compare_df.sort_values('F1', ascending=False)
compare_df.head(25)

Unnamed: 0,model,cv_fold,y,dataset,accuracy,precision,recall,F1,specificity,tp,fp,fn,tn
71,RF|x2|48-85-None,mean,frwd01_mon_metro_hvi_pct_chg,test,0.873211,0.677756,0.688601,0.677647,0.925274,2352,1089,1169,14397
72,RF|x0|48-85-None,mean,frwd01_mon_metro_hvi_pct_chg,test,0.872909,0.676513,0.687774,0.676805,0.92486,2347,1094,1168,14399
73,RF|x0|48-85-None,mean,frwd01_mon_metro_hvi_pct_chg,test,0.872909,0.676513,0.687774,0.676805,0.92486,2347,1094,1168,14399
25,RF|x1|91-37-sqrt,mean,frwd01_mon_metro_hvi_pct_chg,test,0.885685,0.621749,0.7312,0.669675,0.915481,2177,1264,828,14738
26,RF|x1|91-37-sqrt,mean,frwd01_mon_metro_hvi_pct_chg,test,0.885685,0.621749,0.7312,0.669675,0.915481,2177,1264,828,14738
74,RF|x2|77-72-None,mean,frwd01_mon_metro_hvi_pct_chg,test,0.869143,0.669968,0.680951,0.669009,0.923344,2328,1113,1216,14351
75,RF|x0|77-72-None,mean,frwd01_mon_metro_hvi_pct_chg,test,0.868135,0.67034,0.679747,0.668378,0.923176,2328,1113,1229,14337
76,RF|x0|77-72-None,mean,frwd01_mon_metro_hvi_pct_chg,test,0.868135,0.67034,0.679747,0.668378,0.923176,2328,1113,1229,14337
54,RF|x2|35-22-sqrt,mean,frwd01_mon_metro_hvi_pct_chg,test,0.881138,0.63134,0.710327,0.666667,0.916445,2203,1238,927,14639
78,RF|x0|71-64-None,mean,frwd01_mon_metro_hvi_pct_chg,test,0.8665,0.66827,0.674159,0.664653,0.922572,2321,1120,1256,14310


In [53]:
compare_df.head(1)

Unnamed: 0,model,cv_fold,y,dataset,accuracy,precision,recall,F1,specificity,tp,fp,fn,tn
71,RF|x2|48-85-None,mean,frwd01_mon_metro_hvi_pct_chg,test,0.873211,0.677756,0.688601,0.677647,0.925274,2352,1089,1169,14397


In [None]:
#model with best F1 score will be used

*****
**Train and test selected model with validation set**
*****
Selected model details

Estimator: Random Forest

Set of predictor variables: Set 2 (all pct change lag and normalized amounts for current period)

Parameters:

    n_estimators = 48
    
    min_samples_leaf = 85
    
    max_features = None

In [54]:
#test best model with validation dataset
start_time = time.time()
val_results_df = pd.DataFrame()
val_params_df = pd.DataFrame({'n_estimators':[48],
                             'min_samples_leaf':[85],
                             'max_features':[None]})

val_results_df = RF(model_name='RF|x2', X_vars=X_cols_2, y_var=y_var, cv_folds=val, RF_results_df=val_results_df, params=val_params_df)

end_time = time.time()
execution_duration = end_time - start_time
print(f"Execution time: {execution_duration:.2f} seconds")

val_results_df

Execution time: 22.94 seconds


Unnamed: 0,model,cv_fold,y,dataset,accuracy,precision,recall,F1,specificity,tp,fp,fn,tn
0,RF|x2|48-85-None,val,frwd01_mon_metro_hvi_pct_chg,train,0.89586,0.744298,0.822133,0.781282,0.917417,14717,5056,3184,56167
1,RF|x2|48-85-None,val,frwd01_mon_metro_hvi_pct_chg,test,0.919113,0.643914,0.805758,0.715801,0.935517,5094,2817,1228,40869


**Validation test results**
 - Out of the 6,322 periods where home value declined, the model correctly predicted a decline 80.5% of the time (recall).
 - Out of the 7,911 periods where the model predicted a decline, only 64.4% of the time did a decline actually occur (precision).
