In [105]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [106]:
df = pd.read_csv('/Users/emilydanielbowser/Documents/Iowa Food Coop/Data/Intermediate Data/EDA')

In [107]:
df.drop('Unnamed: 0',axis=1, inplace=True)

In [108]:
df.head()

Unnamed: 0,IDCyc,IDMemb,WhenReg,CDLocLast,CkFounder,HowHear,CDRegMemb,SaleNom,latitude,longitude,...,Non-Food Items_Cum,Nuts_Cum,Other Protein Sources_Cum,Personal Care_Cum,Pet + Animal Care_Cum,Prepared Foods_Cum,Snacks_Cum,The Garden Center_Cum,Cycles_as_member,order_per_cycle
0,285,1016,2008-09-10,ANK,1,Other,Approv,0.0,42.032577,-93.685976,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0.0
1,286,1016,2008-09-10,ANK,1,Other,Approv,0.0,42.032577,-93.685976,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2,0.0
2,287,1016,2008-09-10,ANK,1,Other,Approv,0.0,42.032577,-93.685976,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3,0.0
3,288,1016,2008-09-10,ANK,1,Other,Approv,0.0,42.032577,-93.685976,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4,0.0
4,289,1016,2008-09-10,ANK,1,Other,Approv,0.0,42.032577,-93.685976,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5,0.0


In [109]:
df.columns

Index(['IDCyc', 'IDMemb', 'WhenReg', 'CDLocLast', 'CkFounder', 'HowHear',
       'CDRegMemb', 'SaleNom', 'latitude', 'longitude', 'Pickup_lat',
       'Pickup_long', 'Distance_to_pickup', 'Years_member', 'quarter', 'month',
       'holiday', 'Cumulative_Sum', 'Ordered', 'Cumulative_Number_of_Orders',
       'order_per_cycle_when_ordering', 'Baked Goods_Cum', 'Beverages_Cum',
       'Classes and Events_Cum', 'Condiments + Sauces_Cum', 'Dairy_Cum',
       'Dried Herbs + Spices_Cum', 'Eggs_Cum',
       'Grains, Flours, Cereal + Pastas_Cum',
       'Handmade Home Goods + Gifts_Cum', 'Honey, Syrups, Jams + Jellies_Cum',
       'Iowa Food Co-op Shop_Cum', 'Local Produce_Cum', 'Meat - Beef_Cum',
       'Meat - Chicken + Capon_Cum', 'Meat - Pork_Cum', 'Meats - Other_Cum',
       'Non-Food Items_Cum', 'Nuts_Cum', 'Other Protein Sources_Cum',
       'Personal Care_Cum', 'Pet + Animal Care_Cum', 'Prepared Foods_Cum',
       'Snacks_Cum', 'The Garden Center_Cum', 'Cycles_as_member',
       'order_

In [110]:
df['Pickup_lat'].fillna(df['latitude'], inplace=True)
df['Pickup_long'].fillna(df['longitude'],inplace=True)

In [111]:
df['latitude'] = df['latitude'].astype('float')
df['longitude'] = df['longitude'].astype('float')
df['Pickup_lat'] = df['Pickup_lat'].astype('float')
df['Pickup_long'] = df['Pickup_long'].astype('float')

In [112]:
#function to convert the date time quarters to float
def convert_to_float(date_time_quarter):
    year, quarter = date_time_quarter.split('Q')
    quarter_value = int(quarter)
    float_value = float(year) + quarter_value / 10
    return float_value
    
#Converting the datetime quarter column to float
df['Float_Year_Quarter'] = df['quarter'].apply(convert_to_float)

In [113]:
df.sort_values(['IDMemb','IDCyc'], inplace=True)

In [114]:
#These people have been mmebers for 3 cycles or fewer. I am choosing to drop them because I need at least 2 ordering cycles for training data, 1 for validation
#and 1 for testing data, so I need at least 4 data points per person.
members_to_drop = df[(df['IDCyc']==df['IDCyc'].max()) & (df['Cycles_as_member']<4)]
ids_to_drop = members_to_drop['IDMemb'].unique()

In [115]:
df = df[~df['IDMemb'].isin(ids_to_drop)]

In [116]:
df.shape

(45037, 48)

In [117]:
from sklearn.model_selection import train_test_split

In [118]:
#First, I will build a dummy model to use as a test to see how good my model is.
#Adding rolling averages of window size 4, 6 and 8 to be used as a dumb model baseline.
df['dumb_pred_4'] = df.groupby('IDMemb')['SaleNom'].transform(lambda x: x.rolling(window=4, min_periods= 0).mean().shift(1))
df['dumb_pred_6']= df.groupby('IDMemb')['SaleNom'].transform(lambda x: x.rolling(window=6, min_periods= 0).mean().shift(1))
df['dumb_pred_8'] = df.groupby('IDMemb')['SaleNom'].transform(lambda x: x.rolling(window=8, min_periods= 0).mean().shift(1))

In [119]:
df.columns

Index(['IDCyc', 'IDMemb', 'WhenReg', 'CDLocLast', 'CkFounder', 'HowHear',
       'CDRegMemb', 'SaleNom', 'latitude', 'longitude', 'Pickup_lat',
       'Pickup_long', 'Distance_to_pickup', 'Years_member', 'quarter', 'month',
       'holiday', 'Cumulative_Sum', 'Ordered', 'Cumulative_Number_of_Orders',
       'order_per_cycle_when_ordering', 'Baked Goods_Cum', 'Beverages_Cum',
       'Classes and Events_Cum', 'Condiments + Sauces_Cum', 'Dairy_Cum',
       'Dried Herbs + Spices_Cum', 'Eggs_Cum',
       'Grains, Flours, Cereal + Pastas_Cum',
       'Handmade Home Goods + Gifts_Cum', 'Honey, Syrups, Jams + Jellies_Cum',
       'Iowa Food Co-op Shop_Cum', 'Local Produce_Cum', 'Meat - Beef_Cum',
       'Meat - Chicken + Capon_Cum', 'Meat - Pork_Cum', 'Meats - Other_Cum',
       'Non-Food Items_Cum', 'Nuts_Cum', 'Other Protein Sources_Cum',
       'Personal Care_Cum', 'Pet + Animal Care_Cum', 'Prepared Foods_Cum',
       'Snacks_Cum', 'The Garden Center_Cum', 'Cycles_as_member',
       'order_

In [120]:
df.head(55)

Unnamed: 0,IDCyc,IDMemb,WhenReg,CDLocLast,CkFounder,HowHear,CDRegMemb,SaleNom,latitude,longitude,...,Pet + Animal Care_Cum,Prepared Foods_Cum,Snacks_Cum,The Garden Center_Cum,Cycles_as_member,order_per_cycle,Float_Year_Quarter,dumb_pred_4,dumb_pred_6,dumb_pred_8
0,285,1016,2008-09-10,ANK,1,Other,Approv,0.0,42.032577,-93.685976,...,0.0,0.0,0.0,0.0,1,0.0,2021.3,,,
1,286,1016,2008-09-10,ANK,1,Other,Approv,0.0,42.032577,-93.685976,...,0.0,0.0,0.0,0.0,2,0.0,2021.3,0.0,0.0,0.0
2,287,1016,2008-09-10,ANK,1,Other,Approv,0.0,42.032577,-93.685976,...,0.0,0.0,0.0,0.0,3,0.0,2021.3,0.0,0.0,0.0
3,288,1016,2008-09-10,ANK,1,Other,Approv,0.0,42.032577,-93.685976,...,0.0,0.0,0.0,0.0,4,0.0,2021.3,0.0,0.0,0.0
4,289,1016,2008-09-10,ANK,1,Other,Approv,0.0,42.032577,-93.685976,...,0.0,0.0,0.0,0.0,5,0.0,2021.3,0.0,0.0,0.0
5,290,1016,2008-09-10,ANK,1,Other,Approv,0.0,42.032577,-93.685976,...,0.0,0.0,0.0,0.0,6,0.0,2021.3,0.0,0.0,0.0
6,291,1016,2008-09-10,ANK,1,Other,Approv,0.0,42.032577,-93.685976,...,0.0,0.0,0.0,0.0,7,0.0,2021.3,0.0,0.0,0.0
7,292,1016,2008-09-10,ANK,1,Other,Approv,0.0,42.032577,-93.685976,...,0.0,0.0,0.0,0.0,8,0.0,2021.4,0.0,0.0,0.0
8,293,1016,2008-09-10,ANK,1,Other,Approv,0.0,42.032577,-93.685976,...,0.0,0.0,0.0,0.0,9,0.0,2021.4,0.0,0.0,0.0
9,294,1016,2008-09-10,ANK,1,Other,Approv,0.0,42.032577,-93.685976,...,0.0,0.0,0.0,0.0,10,0.0,2021.4,0.0,0.0,0.0


In [121]:
df['dumb_pred_4'].fillna(0, inplace=True)
df['dumb_pred_6'].fillna(0, inplace=True)
df['dumb_pred_8'].fillna(0, inplace=True)

In [122]:
grouped_data = df.groupby('IDMemb')
grouped_data.head()

Unnamed: 0,IDCyc,IDMemb,WhenReg,CDLocLast,CkFounder,HowHear,CDRegMemb,SaleNom,latitude,longitude,...,Pet + Animal Care_Cum,Prepared Foods_Cum,Snacks_Cum,The Garden Center_Cum,Cycles_as_member,order_per_cycle,Float_Year_Quarter,dumb_pred_4,dumb_pred_6,dumb_pred_8
0,285,1016,2008-09-10,ANK,1,Other,Approv,0.00,42.032577,-93.685976,...,0.0,0.0,0.0,0.0,1,0.000000,2021.3,0.000000,0.000000,0.000000
1,286,1016,2008-09-10,ANK,1,Other,Approv,0.00,42.032577,-93.685976,...,0.0,0.0,0.0,0.0,2,0.000000,2021.3,0.000000,0.000000,0.000000
2,287,1016,2008-09-10,ANK,1,Other,Approv,0.00,42.032577,-93.685976,...,0.0,0.0,0.0,0.0,3,0.000000,2021.3,0.000000,0.000000,0.000000
3,288,1016,2008-09-10,ANK,1,Other,Approv,0.00,42.032577,-93.685976,...,0.0,0.0,0.0,0.0,4,0.000000,2021.3,0.000000,0.000000,0.000000
4,289,1016,2008-09-10,ANK,1,Other,Approv,0.00,42.032577,-93.685976,...,0.0,0.0,0.0,0.0,5,0.000000,2021.3,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45032,334,6753,2023-04-04 02:14:40,FRAN,0,Online,Pend,28.75,41.622999,-93.567472,...,0.0,0.0,0.0,0.0,4,11.366667,2023.2,11.366667,11.366667,11.366667
45033,331,6755,2023-04-05 01:58:05,ANK,0,Other,Pend,0.00,41.772040,-93.599026,...,0.0,0.0,0.0,0.0,1,0.000000,2023.2,0.000000,0.000000,0.000000
45034,332,6755,2023-04-05 01:58:05,ANK,0,Other,Pend,201.28,41.772040,-93.599026,...,0.0,0.0,0.0,0.0,2,0.000000,2023.2,0.000000,0.000000,0.000000
45035,333,6755,2023-04-05 01:58:05,ANK,0,Other,Pend,0.00,41.772040,-93.599026,...,0.0,0.0,0.0,0.0,3,100.640000,2023.2,100.640000,100.640000,100.640000


In [123]:
grouped_data.head()

Unnamed: 0,IDCyc,IDMemb,WhenReg,CDLocLast,CkFounder,HowHear,CDRegMemb,SaleNom,latitude,longitude,...,Pet + Animal Care_Cum,Prepared Foods_Cum,Snacks_Cum,The Garden Center_Cum,Cycles_as_member,order_per_cycle,Float_Year_Quarter,dumb_pred_4,dumb_pred_6,dumb_pred_8
0,285,1016,2008-09-10,ANK,1,Other,Approv,0.00,42.032577,-93.685976,...,0.0,0.0,0.0,0.0,1,0.000000,2021.3,0.000000,0.000000,0.000000
1,286,1016,2008-09-10,ANK,1,Other,Approv,0.00,42.032577,-93.685976,...,0.0,0.0,0.0,0.0,2,0.000000,2021.3,0.000000,0.000000,0.000000
2,287,1016,2008-09-10,ANK,1,Other,Approv,0.00,42.032577,-93.685976,...,0.0,0.0,0.0,0.0,3,0.000000,2021.3,0.000000,0.000000,0.000000
3,288,1016,2008-09-10,ANK,1,Other,Approv,0.00,42.032577,-93.685976,...,0.0,0.0,0.0,0.0,4,0.000000,2021.3,0.000000,0.000000,0.000000
4,289,1016,2008-09-10,ANK,1,Other,Approv,0.00,42.032577,-93.685976,...,0.0,0.0,0.0,0.0,5,0.000000,2021.3,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45032,334,6753,2023-04-04 02:14:40,FRAN,0,Online,Pend,28.75,41.622999,-93.567472,...,0.0,0.0,0.0,0.0,4,11.366667,2023.2,11.366667,11.366667,11.366667
45033,331,6755,2023-04-05 01:58:05,ANK,0,Other,Pend,0.00,41.772040,-93.599026,...,0.0,0.0,0.0,0.0,1,0.000000,2023.2,0.000000,0.000000,0.000000
45034,332,6755,2023-04-05 01:58:05,ANK,0,Other,Pend,201.28,41.772040,-93.599026,...,0.0,0.0,0.0,0.0,2,0.000000,2023.2,0.000000,0.000000,0.000000
45035,333,6755,2023-04-05 01:58:05,ANK,0,Other,Pend,0.00,41.772040,-93.599026,...,0.0,0.0,0.0,0.0,3,100.640000,2023.2,100.640000,100.640000,100.640000


In [124]:
#Splitting data into training, validation and test sets

from sklearn.model_selection import train_test_split

#Creating empty data frames for each set
train_data = pd.DataFrame()
val_data = pd.DataFrame()
test_data = pd.DataFrame()

#Want my training data to contain 70% of the data.

for _, group in grouped_data:
    total_length = len(group)
    train_length = int(total_length * 0.7)
    val_test_length = total_length - train_length

    if val_test_length < 2:
        train = group
        val = pd.DataFrame()
        test = pd.DataFrame()
    #Now, using train_test_split to divide the 30% of data into two sets with 15% in each.
    else:
        train, val_test = train_test_split(
            group,
            train_size=train_length,
            shuffle=False
        )
        val, test = train_test_split(
            val_test,
            test_size=0.5,
            shuffle=False
        )

    train_data = train_data.append(train)
    val_data = val_data.append(val)
    test_data = test_data.append(test)


  train_data = train_data.append(train)
  val_data = val_data.append(val)
  test_data = test_data.append(test)
  train_data = train_data.append(train)
  val_data = val_data.append(val)
  test_data = test_data.append(test)
  train_data = train_data.append(train)
  val_data = val_data.append(val)
  test_data = test_data.append(test)
  train_data = train_data.append(train)
  val_data = val_data.append(val)
  test_data = test_data.append(test)
  train_data = train_data.append(train)
  val_data = val_data.append(val)
  test_data = test_data.append(test)
  train_data = train_data.append(train)
  val_data = val_data.append(val)
  test_data = test_data.append(test)
  train_data = train_data.append(train)
  val_data = val_data.append(val)
  test_data = test_data.append(test)
  train_data = train_data.append(train)
  val_data = val_data.append(val)
  test_data = test_data.append(test)
  train_data = train_data.append(train)
  val_data = val_data.append(val)
  test_data = test_data.append(test)
 

In [125]:
train_data.head(-5)

Unnamed: 0,IDCyc,IDMemb,WhenReg,CDLocLast,CkFounder,HowHear,CDRegMemb,SaleNom,latitude,longitude,...,Pet + Animal Care_Cum,Prepared Foods_Cum,Snacks_Cum,The Garden Center_Cum,Cycles_as_member,order_per_cycle,Float_Year_Quarter,dumb_pred_4,dumb_pred_6,dumb_pred_8
0,285,1016,2008-09-10,ANK,1,Other,Approv,0.00,42.032577,-93.685976,...,0.0,0.0,0.0,0.0,1,0.0,2021.3,0.0,0.0,0.0
1,286,1016,2008-09-10,ANK,1,Other,Approv,0.00,42.032577,-93.685976,...,0.0,0.0,0.0,0.0,2,0.0,2021.3,0.0,0.0,0.0
2,287,1016,2008-09-10,ANK,1,Other,Approv,0.00,42.032577,-93.685976,...,0.0,0.0,0.0,0.0,3,0.0,2021.3,0.0,0.0,0.0
3,288,1016,2008-09-10,ANK,1,Other,Approv,0.00,42.032577,-93.685976,...,0.0,0.0,0.0,0.0,4,0.0,2021.3,0.0,0.0,0.0
4,289,1016,2008-09-10,ANK,1,Other,Approv,0.00,42.032577,-93.685976,...,0.0,0.0,0.0,0.0,5,0.0,2021.3,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45017,331,6749,2023-04-02 14:24:43,FRAN,0,Chiropractor's office,Pend,279.10,41.560703,-93.770436,...,0.0,0.0,0.0,0.0,1,0.0,2023.2,0.0,0.0,0.0
45018,332,6749,2023-04-02 14:24:43,FRAN,0,Chiropractor's office,Pend,0.00,41.560703,-93.770436,...,0.0,0.0,0.0,0.0,2,279.1,2023.2,279.1,279.1,279.1
45021,331,6751,2023-04-03 16:59:06,FRAN,0,Online,Pend,0.00,41.645765,-93.745929,...,0.0,0.0,0.0,0.0,1,0.0,2023.2,0.0,0.0,0.0
45022,332,6751,2023-04-03 16:59:06,FRAN,0,Online,Pend,50.73,41.645765,-93.745929,...,0.0,0.0,0.0,0.0,2,0.0,2023.2,0.0,0.0,0.0


In [126]:
print(train_data.shape,val_data.shape,test_data.shape)

(31392, 51) (6376, 51) (7269, 51)


In [127]:
print(train_data['IDMemb'].nunique(), val_data['IDMemb'].nunique(), test_data['IDMemb'].nunique())

1031 1031 1031


In [128]:
print(train_data['HowHear'].nunique(), val_data['HowHear'].nunique(), test_data['HowHear'].nunique())

16 16 16


In [129]:
train_data.columns

Index(['IDCyc', 'IDMemb', 'WhenReg', 'CDLocLast', 'CkFounder', 'HowHear',
       'CDRegMemb', 'SaleNom', 'latitude', 'longitude', 'Pickup_lat',
       'Pickup_long', 'Distance_to_pickup', 'Years_member', 'quarter', 'month',
       'holiday', 'Cumulative_Sum', 'Ordered', 'Cumulative_Number_of_Orders',
       'order_per_cycle_when_ordering', 'Baked Goods_Cum', 'Beverages_Cum',
       'Classes and Events_Cum', 'Condiments + Sauces_Cum', 'Dairy_Cum',
       'Dried Herbs + Spices_Cum', 'Eggs_Cum',
       'Grains, Flours, Cereal + Pastas_Cum',
       'Handmade Home Goods + Gifts_Cum', 'Honey, Syrups, Jams + Jellies_Cum',
       'Iowa Food Co-op Shop_Cum', 'Local Produce_Cum', 'Meat - Beef_Cum',
       'Meat - Chicken + Capon_Cum', 'Meat - Pork_Cum', 'Meats - Other_Cum',
       'Non-Food Items_Cum', 'Nuts_Cum', 'Other Protein Sources_Cum',
       'Personal Care_Cum', 'Pet + Animal Care_Cum', 'Prepared Foods_Cum',
       'Snacks_Cum', 'The Garden Center_Cum', 'Cycles_as_member',
       'order_

In [130]:
#Getting dummy variables for each of our the categorical variables.
ohe_train = pd.get_dummies(train_data, columns = ['CDLocLast', 'HowHear','CDRegMemb'],drop_first=True)
ohe_val = pd.get_dummies(val_data, columns = ['CDLocLast', 'HowHear','CDRegMemb'],drop_first=True)
ohe_test = pd.get_dummies(test_data, columns = ['CDLocLast', 'HowHear','CDRegMemb'],drop_first=True)


In [131]:
#Dropping columns
X_train=ohe_train.drop(['WhenReg','SaleNom','Ordered','quarter'],axis=1)
y_train=ohe_train['SaleNom']
X_val=ohe_val.drop(['WhenReg','SaleNom','Ordered','quarter'],axis=1)
y_val=ohe_val['SaleNom']
X_test=ohe_test.drop(['WhenReg','SaleNom','Ordered','quarter'],axis=1)
y_test=ohe_test['SaleNom']

In [132]:
X_train.to_csv('/Users/emilydanielbowser/Documents/Iowa Food Coop/Data/Final Data/X_train', index=False)
y_train.to_csv('/Users/emilydanielbowser/Documents/Iowa Food Coop/Data/Final Data/y_train', index=False)
X_val.to_csv('/Users/emilydanielbowser/Documents/Iowa Food Coop/Data/Final Data/X_val', index=False)
y_val.to_csv('/Users/emilydanielbowser/Documents/Iowa Food Coop/Data/Final Data/y_val', index=False)
X_test.to_csv('/Users/emilydanielbowser/Documents/Iowa Food Coop/Data/Final Data/X_test', index=False)
y_test.to_csv('/Users/emilydanielbowser/Documents/Iowa Food Coop/Data/Final Data/y_test', index=False)


In [147]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_validate, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import cross_val_score
from sklearn import metrics
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import KFold
from sklearn.model_selection import PredefinedSplit

In [134]:
dummy_predictions_y_train = ohe_train[['IDCyc','IDMemb','SaleNom','dumb_pred_4', 'dumb_pred_6', 'dumb_pred_8']]
dummy_predictions_y_val = ohe_val[['IDCyc','IDMemb','SaleNom','dumb_pred_4', 'dumb_pred_6', 'dumb_pred_8']]
dummy_predictions_y_test = ohe_test[['IDCyc','IDMemb','SaleNom','dumb_pred_4', 'dumb_pred_6', 'dumb_pred_8']]

In [135]:
dummy_predictions_y_train

Unnamed: 0,IDCyc,IDMemb,SaleNom,dumb_pred_4,dumb_pred_6,dumb_pred_8
0,285,1016,0.00,0.0,0.0,0.0
1,286,1016,0.00,0.0,0.0,0.0
2,287,1016,0.00,0.0,0.0,0.0
3,288,1016,0.00,0.0,0.0,0.0
4,289,1016,0.00,0.0,0.0,0.0
...,...,...,...,...,...,...
45026,332,6752,0.00,0.0,0.0,0.0
45029,331,6753,0.00,0.0,0.0,0.0
45030,332,6753,34.10,0.0,0.0,0.0
45033,331,6755,0.00,0.0,0.0,0.0


In [136]:
mae_4 = mean_absolute_error(dummy_predictions_y_train['SaleNom'], dummy_predictions_y_train['dumb_pred_4'])
mae_6 = mean_absolute_error(dummy_predictions_y_train['SaleNom'], dummy_predictions_y_train['dumb_pred_6'])
mae_8 = mean_absolute_error(dummy_predictions_y_train['SaleNom'], dummy_predictions_y_train['dumb_pred_8'])
mse_4 = mean_squared_error(dummy_predictions_y_train['SaleNom'], dummy_predictions_y_train['dumb_pred_4'])
mse_6 = mean_squared_error(dummy_predictions_y_train['SaleNom'], dummy_predictions_y_train['dumb_pred_6'])
mse_8 = mean_squared_error(dummy_predictions_y_train['SaleNom'], dummy_predictions_y_train['dumb_pred_8'])

print(f'The mean absolute error for a rolling window of 4,6 and 8 are {mae_4}, {mae_6}, {mae_8}')
print(f'The root mean squared error for a rolling window of 4,6 and 8 are {np.sqrt(mse_4)}, {np.sqrt(mse_6)}, {np.sqrt(mse_8)}')


The mean absolute error for a rolling window of 4,6 and 8 are 19.567185907237512, 19.716207542261298, 20.003170804056843
The root mean squared error for a rolling window of 4,6 and 8 are 39.3461915675577, 38.63243090835238, 38.4863944937623


In [137]:
rf_base_model = RandomForestRegressor(n_estimators = 10, random_state=47, n_jobs=-1)

In [138]:
rf_base_model.fit(X_train,y_train)

In [142]:
y_train_base_preds = rf_base_model.predict(X_train)

In [145]:
train_mae = mean_absolute_error(y_train,y_train_base_preds)
train_rmse = np.sqrt(mean_squared_error(y_train,y_train_base_preds))
print(train_mae,train_rmse)

7.453211996687053 15.839438128645016


In [139]:
y_val_basepreds = rf_base_model.predict(X_val)

In [140]:
mae = mean_absolute_error(y_val,y_val_basepreds)
rmse = np.sqrt(mean_squared_error(y_val,y_val_basepreds))

print(mae,rmse)

20.927422835633628 38.189387848314404


In [153]:
from sklearn.model_selection import RandomizedSearchCV
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 1000, num = 10)]

# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(3, 100, num = 10)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
print(random_grid)

{'n_estimators': [200, 288, 377, 466, 555, 644, 733, 822, 911, 1000], 'max_features': ['auto', 'sqrt'], 'max_depth': [3, 13, 24, 35, 46, 56, 67, 78, 89, 100, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]}


In [149]:
rf=RandomForestRegressor()

In [148]:
# Concatenate the training and validation sets
X_train_val = np.concatenate((X_train, X_val))
y_train_val = np.concatenate((y_train, y_val))

# Create the predefined validation set indices
val_indices = [-1] * len(X_train) + [0] * len(X_val)
ps = PredefinedSplit(test_fold=val_indices)

In [154]:
from sklearn.model_selection import RandomizedSearchCV
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 50, cv = ps, scoring='neg_mean_squared_error', verbose=2, random_state=42, n_jobs = -1)

In [155]:
rf_random.fit(X_train_val, y_train_val)

Fitting 1 folds for each of 50 candidates, totalling 50 fits


  warn(
  warn(


[CV] END bootstrap=True, max_depth=3, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=1000; total time= 1.1min
[CV] END bootstrap=True, max_depth=56, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=644; total time= 3.6min
[CV] END bootstrap=True, max_depth=13, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=644; total time= 2.6min
[CV] END bootstrap=False, max_depth=67, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=466; total time= 3.2min


  warn(


[CV] END bootstrap=True, max_depth=46, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=822; total time= 3.5min
[CV] END bootstrap=True, max_depth=46, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time= 8.2min
[CV] END bootstrap=False, max_depth=46, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=377; total time= 2.3min


  warn(


[CV] END bootstrap=False, max_depth=89, max_features=sqrt, min_samples_leaf=4, min_samples_split=5, n_estimators=555; total time= 2.7min


  warn(


[CV] END bootstrap=True, max_depth=None, max_features=auto, min_samples_leaf=2, min_samples_split=5, n_estimators=733; total time=21.6min


  warn(


[CV] END bootstrap=True, max_depth=24, max_features=auto, min_samples_leaf=2, min_samples_split=5, n_estimators=911; total time=21.7min
[CV] END bootstrap=True, max_depth=13, max_features=sqrt, min_samples_leaf=4, min_samples_split=10, n_estimators=555; total time=  55.3s
[CV] END bootstrap=False, max_depth=89, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=288; total time= 1.1min
[CV] END bootstrap=False, max_depth=3, max_features=sqrt, min_samples_leaf=4, min_samples_split=10, n_estimators=1000; total time=  43.4s
[CV] END bootstrap=False, max_depth=35, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=200; total time=  42.0s


  warn(


[CV] END bootstrap=False, max_depth=89, max_features=auto, min_samples_leaf=4, min_samples_split=2, n_estimators=1000; total time=27.7min
[CV] END bootstrap=False, max_depth=78, max_features=auto, min_samples_leaf=1, min_samples_split=10, n_estimators=911; total time=30.2min
[CV] END bootstrap=False, max_depth=24, max_features=sqrt, min_samples_leaf=4, min_samples_split=5, n_estimators=733; total time= 2.4min


  warn(


[CV] END bootstrap=True, max_depth=67, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=1000; total time= 3.0min


  warn(


[CV] END bootstrap=False, max_depth=24, max_features=auto, min_samples_leaf=4, min_samples_split=2, n_estimators=200; total time= 6.5min
[CV] END bootstrap=True, max_depth=67, max_features=sqrt, min_samples_leaf=4, min_samples_split=2, n_estimators=288; total time=  49.3s
[CV] END bootstrap=False, max_depth=89, max_features=auto, min_samples_leaf=2, min_samples_split=10, n_estimators=1000; total time=29.9min
[CV] END bootstrap=False, max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time= 1.0min
[CV] END bootstrap=False, max_depth=67, max_features=auto, min_samples_leaf=2, min_samples_split=2, n_estimators=644; total time=20.5min


  warn(


[CV] END bootstrap=True, max_depth=13, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=  28.8s
[CV] END bootstrap=True, max_depth=89, max_features=sqrt, min_samples_leaf=4, min_samples_split=10, n_estimators=200; total time=  34.6s
[CV] END bootstrap=False, max_depth=89, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=822; total time= 3.8min
[CV] END bootstrap=True, max_depth=None, max_features=auto, min_samples_leaf=4, min_samples_split=5, n_estimators=555; total time=13.2min


  warn(


[CV] END bootstrap=True, max_depth=100, max_features=sqrt, min_samples_leaf=4, min_samples_split=10, n_estimators=644; total time= 1.8min
[CV] END bootstrap=True, max_depth=100, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=644; total time= 2.0min


  warn(


[CV] END bootstrap=True, max_depth=13, max_features=auto, min_samples_leaf=2, min_samples_split=2, n_estimators=288; total time= 4.8min


  warn(


[CV] END bootstrap=False, max_depth=78, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=822; total time= 3.6min


  warn(


[CV] END bootstrap=True, max_depth=3, max_features=auto, min_samples_leaf=1, min_samples_split=5, n_estimators=555; total time= 2.0min
[CV] END bootstrap=False, max_depth=46, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time=  42.0s
[CV] END bootstrap=True, max_depth=100, max_features=auto, min_samples_leaf=1, min_samples_split=10, n_estimators=288; total time= 6.5min
[CV] END bootstrap=False, max_depth=24, max_features=sqrt, min_samples_leaf=4, min_samples_split=5, n_estimators=200; total time=  37.4s
[CV] END bootstrap=True, max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=10, n_estimators=377; total time=  54.1s
[CV] END bootstrap=False, max_depth=78, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=288; total time= 1.0min


  warn(


[CV] END bootstrap=False, max_depth=13, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=911; total time= 2.2min


  warn(


[CV] END bootstrap=False, max_depth=3, max_features=auto, min_samples_leaf=1, min_samples_split=10, n_estimators=466; total time= 2.4min


  warn(


[CV] END bootstrap=False, max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time= 7.0min


  warn(


[CV] END bootstrap=False, max_depth=13, max_features=auto, min_samples_leaf=1, min_samples_split=5, n_estimators=911; total time=19.8min
[CV] END bootstrap=True, max_depth=13, max_features=auto, min_samples_leaf=4, min_samples_split=2, n_estimators=200; total time= 3.3min
[CV] END bootstrap=True, max_depth=46, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=911; total time= 3.2min


  warn(


[CV] END bootstrap=True, max_depth=56, max_features=auto, min_samples_leaf=1, min_samples_split=5, n_estimators=377; total time= 9.6min


  warn(


[CV] END bootstrap=True, max_depth=3, max_features=auto, min_samples_leaf=4, min_samples_split=5, n_estimators=200; total time=  58.2s
[CV] END bootstrap=True, max_depth=46, max_features=sqrt, min_samples_leaf=4, min_samples_split=10, n_estimators=822; total time= 2.4min


  warn(


[CV] END bootstrap=True, max_depth=46, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=1000; total time= 2.9min
[CV] END bootstrap=False, max_depth=67, max_features=auto, min_samples_leaf=4, min_samples_split=2, n_estimators=1000; total time=27.5min
[CV] END bootstrap=True, max_depth=46, max_features=auto, min_samples_leaf=1, min_samples_split=10, n_estimators=555; total time=12.1min
[CV] END bootstrap=False, max_depth=78, max_features=auto, min_samples_leaf=2, min_samples_split=5, n_estimators=911; total time=20.2min


In [156]:
best_params = rf_random.best_params_
best_model = rf_random.best_estimator_

In [157]:
rf=RandomForestRegressor(**best_params)

In [158]:
print(rf)

RandomForestRegressor(max_depth=13, max_features='sqrt', min_samples_leaf=4,
                      min_samples_split=10, n_estimators=555)


In [159]:
rf.fit(X_train,y_train)

In [160]:
train_preds = rf.predict(X_train)

In [161]:
train_rf_mae = mean_absolute_error(y_train,train_preds)
train_rf_rmse = np.sqrt(mean_squared_error(y_train,train_preds))

print(train_rf_mae,train_rf_rmse)

17.024911952023473 30.682196915966674


In [162]:
val_preds = rf.predict(X_val)

In [163]:
best_rf_mae = mean_absolute_error(y_val,val_preds)
best_rf_rmse = np.sqrt(mean_squared_error(y_val,val_preds))

print(mae,rmse)

20.927422835633628 38.189387848314404


In [166]:
X_train_val = pd.concat([X_train,X_val])
y_train_val = pd.concat([y_train,y_val])

In [169]:
rf.fit(X_train_val,y_train_val)

In [171]:
y_test_preds = rf.predict(X_test)

In [62]:
mae_4 = mean_absolute_error(dummy_predictions_y_val['SaleNom'], dummy_predictions_y_val['dumb_pred_4'])
mae_6 = mean_absolute_error(dummy_predictions_y_val['SaleNom'], dummy_predictions_y_val['dumb_pred_6'])
mae_8 = mean_absolute_error(dummy_predictions_y_val['SaleNom'], dummy_predictions_y_val['dumb_pred_8'])
mse_4 = mean_squared_error(dummy_predictions_y_val['SaleNom'], dummy_predictions_y_val['dumb_pred_4'])
mse_6 = mean_squared_error(dummy_predictions_y_val['SaleNom'], dummy_predictions_y_val['dumb_pred_6'])
mse_8 = mean_squared_error(dummy_predictions_y_val['SaleNom'], dummy_predictions_y_val['dumb_pred_8'])

print(f'The mean absolute error for a rolling window of 4,6 and 8 are {mae_4}, {mae_6}, {mae_8}')
print(f'The root mean squared error for a rolling window of 4,6 and 8 are {np.sqrt(mse_4)}, {np.sqrt(mse_6)}, {np.sqrt(mse_8)}')

The mean absolute error for a rolling window of 4,6 and 8 are 13.201881142827268, 13.40491847030531, 13.477657713822667
The root mean squared error for a rolling window of 4,6 and 8 are 30.23898554625491, 29.66635572464746, 29.37284026722243


In [173]:
rf_test_mae = mean_absolute_error(y_test, y_test_preds)
rf_test_mse = mean_squared_error(y_test, y_test_preds)
print(rf_test_mae,rf_test_mse)

12.655346091699862 648.2975489519837
