In [1]:
# import libraries
import os
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns 

# for making the output constant across all run
np.random.seed(42)

# display settings & code formatting
pd.options.display.max_columns = 999
%matplotlib inline

# project paths
project_root_dir = os.path.normpath(os.getcwd() + os.sep + os.pardir)

data_path = os.path.join(project_root_dir, "data")
os.makedirs(data_path, exist_ok=True)

# function for loading data
def read_data(filename, date_col=None, data_path=data_path):
    csv_path = os.path.join(data_path, filename)
    return pd.read_csv(csv_path, parse_dates=date_col)

# function for saving data as csv file
def save_dataframe(df, filename, file_path=data_path):
    path = os.path.join(file_path, filename)
    df.to_csv(path, index=False)


In [2]:
train = read_data("train.csv", date_col=["Date"])
test = read_data("test.csv", date_col=["Date"])
stores = read_data("stores.csv")
features = read_data("features.csv", date_col=["Date"])
sample_submission = read_data("sampleSubmission.csv")

In [3]:
# Merge the stores data with train and test
train = pd.merge(train, stores, how="left", on="Store")
test = pd.merge(test, stores, how="left", on="Store")

# Merge the features data with train and test
train = pd.merge(train, features, how="left", on=["Store", "Date"])
test = pd.merge(test, features, how="left", on=["Store", "Date"])

train.drop(["IsHoliday_y"], axis=1, inplace=True)
test.drop(["IsHoliday_y"], axis=1, inplace=True)

# rename column
train.rename(columns={"IsHoliday_x": "IsHoliday"}, inplace=True)
test.rename(columns={"IsHoliday_x": "IsHoliday"}, inplace=True)

## Datetime features
train["Year"] = train["Date"].dt.year
train["Month"] = train["Date"].dt.month
train["Day"] = train["Date"].dt.day
train["WeekOfYear"] = train["Date"].dt.weekofyear
train["DayOfWeek"] = train["Date"].dt.dayofweek
train["Weekend"] = (train["Date"].dt.weekday >= 5).astype(int)

test["Year"] = test["Date"].dt.year
test["Month"] = test["Date"].dt.month
test["Day"] = test["Date"].dt.day
test["WeekOfYear"] = test["Date"].dt.weekofyear
test["DayOfWeek"] = test["Date"].dt.dayofweek
test["Weekend"] = (test["Date"].dt.weekday >= 5).astype(int)

# convert boolean column to categorical column
train["IsHoliday"] = train["IsHoliday"].map({True: "Yes", False: "No"})
test["IsHoliday"] = test["IsHoliday"].map({True: "Yes", False: "No"})
train["IsHoliday"] = train["IsHoliday"].astype("category")
test["IsHoliday"] = test["IsHoliday"].astype("category")

# ordered the categorical store type col
from pandas.api.types import CategoricalDtype

cat_type = CategoricalDtype(categories=["C", "B", "A"], ordered=True)
train["Type"] = train["Type"].astype(cat_type)
test["Type"] = test["Type"].astype(cat_type)

# convert to categorical columns
train["Store"] = train["Store"].astype("category")
train["Dept"] = train["Dept"].astype("category")
train["Year"] = train["Year"].astype("category")
train["Month"] = train["Month"].astype("category")
train["DayOfWeek"] = train["DayOfWeek"].astype("category")
train["Weekend"] = train["Weekend"].astype("category")

# convert to categorical columns
test["Store"] = test["Store"].astype("category")
test["Dept"] = test["Dept"].astype("category")
test["Year"] = test["Year"].astype("category")
test["Month"] = test["Month"].astype("category")
test["DayOfWeek"] = test["DayOfWeek"].astype("category")
test["Weekend"] = test["Weekend"].astype("category")

  train["WeekOfYear"] = train["Date"].dt.weekofyear
  test["WeekOfYear"] = test["Date"].dt.weekofyear


In [4]:
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder

# features and labels of train and test set
# labels of test are not provided as we need to predict them

X_train = train.drop(["Weekly_Sales"], axis=1).copy()
y_train = train["Weekly_Sales"].copy()

X_test = test.copy()

# drop and save the date column in a variable
train_date = X_train.pop("Date")
test_date = X_test.pop("Date")


#### Data preparation pipeline

# select numerical and categorical columns
num_cols = X_train.select_dtypes(exclude=["object", "category"]).columns.tolist()
cat_cols = X_train.select_dtypes(include=["object", "category"]).columns.tolist()

# numerical date preprocessing pipeline
num_pipe = make_pipeline(SimpleImputer(strategy="median"), StandardScaler())

# categorical data preprocessing pipeline
cat_pipe = make_pipeline(
    SimpleImputer(strategy="constant", fill_value="NA"),
    OneHotEncoder(handle_unknown="ignore", sparse=False),
)

# full pipeline
full_pipe = ColumnTransformer(
    [("num", num_pipe, num_cols), ("cat", cat_pipe, cat_cols)]
)

full_pipe

ColumnTransformer(transformers=[('num',
                                 Pipeline(steps=[('simpleimputer',
                                                  SimpleImputer(strategy='median')),
                                                 ('standardscaler',
                                                  StandardScaler())]),
                                 ['Size', 'Temperature', 'Fuel_Price',
                                  'MarkDown1', 'MarkDown2', 'MarkDown3',
                                  'MarkDown4', 'MarkDown5', 'CPI',
                                  'Unemployment', 'Day', 'WeekOfYear']),
                                ('cat',
                                 Pipeline(steps=[('simpleimputer',
                                                  SimpleImputer(fill_value='NA',
                                                                strategy='constant')),
                                                 ('onehotencoder',
                                           

## Build Models

In [5]:
%%time
from sklearn.ensemble import RandomForestRegressor

rf = make_pipeline(full_pipe, RandomForestRegressor(random_state=42, n_jobs=-1))
rf.fit(X_train, y_train)

In [6]:
# make submission
y_pred = rf.predict(X_test)
sample_submission["Weekly_Sales"] = y_pred
save_dataframe(sample_submission, "rf_default_final.csv")

The wmae on the public leaderbaord is `3188` and the ranking is within `260` and the private leaderbaord the wmae is `3326` and the position here is also within `260`.

### Plot Learning Curves

As the data set is bigger and scikit-learn doesn't support gpu, it has been observed before that training the model and finding good hyper-paramters values is getting very difficult,so we will only use fraction of the data for building and evaluting the model to speed up the process. For this, we will plot the learning curve to find the optimal training data we need to build the model.

In [7]:
print("Number of samples in the training set:", X_train.shape[0])

Number of samples in the training set: 421570


In [8]:
def plot_learning_curves(estimator, X, y, cv):

    train_sizes, train_scores, test_scores = learning_curve(
        estimator=estimator,
        X=X,
        y=y,
        train_sizes=np.linspace(0.1, 1.0, 10),
        cv=cv,
        scoring="neg_mean_absolute_error",
        random_state=42
    )
    train_mean = np.mean(-train_scores, axis=1)
    test_mean = np.mean(-test_scores, axis=1)

    fig = go.Figure()

    fig.add_trace(
        go.Scatter(
            x=train_sizes,
            y=train_mean,
            name="Training MAE",
            mode="lines",
            line=dict(color="blue"),
        )
    )

    fig.add_trace(
        go.Scatter(
            x=train_sizes,
            y=test_mean,
            name="Validation MAE",
            mode="lines",
            line=dict(color="green"),
        )
    )

    fig.update_layout(
        title="Learning Curves",
        xaxis_title="Number of training examples",
        yaxis_title="Mean Absolute Error",
    )

    fig.show()

In [9]:
from sklearn.model_selection import learning_curve

plot_learning_curves(rf, X_train, y_train, 3)

After around 84000 (20%) of the samples, the mae stoped decreasing and mostly become flat after that. So, to speed up the model selection and hyper-parameter optimization process we will randomly select only 30% of the data from the training set , and further divide it into 80-20 split between training and validation set. We can also see that the model is ovefitting badly, the error on training set is way less than the error on the validation set, so we also have to take care of that.

In [5]:
from sklearn.model_selection import train_test_split

X_train_full = X_train.copy()
y_train_full = y_train.copy()

# randomly select 30% of the data only
train = pd.concat([X_train, y_train], axis='columns')
train = train.sample(frac=0.3, random_state=42)

X_train = train.drop(["Weekly_Sales"], axis=1).copy()
y_train = train["Weekly_Sales"].copy()

# now divide it to train and validation set
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

In [11]:
%%time
from sklearn.ensemble import RandomForestRegressor

rf = make_pipeline(full_pipe, RandomForestRegressor(random_state=42, n_jobs=-1))
rf.fit(X_train, y_train)

CPU times: user 18min 49s, sys: 4.92 s, total: 18min 54s
Wall time: 5min 9s


Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('simpleimputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('standardscaler',
                                                                   StandardScaler())]),
                                                  ['Size', 'Temperature',
                                                   'Fuel_Price', 'MarkDown1',
                                                   'MarkDown2', 'MarkDown3',
                                                   'MarkDown4', 'MarkDown5',
                                                   'CPI', 'Unemployment', 'Day',
                                                   'WeekOfYear']),
                                                 ('cat',
                                   

In [12]:
# make submission
y_pred = rf.predict(X_test)
sample_submission["Weekly_Sales"] = y_pred
save_dataframe(sample_submission, "rf_default_reduced_data.csv")

Although the wmae on the leaderboard is `3611` which is much higer than before which was `3188` but we can also see that the training time has also reduced significantly from `27 min to 5 min` only, which we need very badly.Once we find good hyper-parameter values for the models, we will train them again using the full data to get better performance.

## Hyper-Parameter Optimization

In [13]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(rf, X_train, y_train, cv=5, scoring="neg_mean_absolute_error")
print("mean absolute errors:", -scores)
print("mean of mae", np.mean(-scores))

mean absolute errors: [1740.07452545 1781.81323016 1733.58122614 1804.84640702 1758.20869911]
mean of mae 1763.7048175742482


This is the score we will try to beat.

In [14]:
# from sklearn.ensemble import RandomForestRegressor
# from skopt.space import Real, Integer, Categorical
# from skopt.utils import use_named_args
# from skopt import gp_minimize
# from sklearn.model_selection import cross_val_score

# # random forest model
# rf = make_pipeline(full_pipe, RandomForestRegressor(random_state=42, n_jobs=-1))

# # The list of hyper-parameters to optimize
# space = [
#     #Integer(low=100, high=1500, name="randomforestregressor__n_estimators"),
#     #Integer(low=5, high=20, name="randomforestregressor__max_depth"),
#     #Categorical(["auto","sqrt","log2"], name="randomforestregressor__max_features")
#     Categorical(["mse","mae"], name="randomforestregressor__criterion"),
#     Integer(low=2, high=10, name="randomforestregressor__min_samples_split"),
#     Integer(low=1, high=10, name="randomforestregressor__min_samples_leaf"),
    
# ]

# @use_named_args(space)
# def objective(**params):
#     rf.set_params(**params)
#     return -np.mean(cross_val_score(rf, X_train, y_train, cv=3,
#                                     scoring="neg_mean_absolute_error"))

# res_gp = gp_minimize(objective, space, n_calls=10, random_state=42)

In [15]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

params = {
    "randomforestregressor__min_samples_split": randint(low=2, high=10),
    "randomforestregressor__min_samples_leaf": randint(low=1, high=10), 
}

rf = make_pipeline(full_pipe, RandomForestRegressor(random_state=42, n_jobs=-1))

rf_rnd_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions= params,
    n_iter=10,
    scoring="neg_mean_absolute_error",
    cv=2,
    verbose=10,
    random_state=42)


rf_rnd_search.fit(X_train, y_train)

Fitting 2 folds for each of 10 candidates, totalling 20 fits
[CV 1/2; 1/10] START randomforestregressor__min_samples_leaf=7, randomforestregressor__min_samples_split=5
[CV 1/2; 1/10] END randomforestregressor__min_samples_leaf=7, randomforestregressor__min_samples_split=5;, score=-2281.560 total time= 2.6min
[CV 2/2; 1/10] START randomforestregressor__min_samples_leaf=7, randomforestregressor__min_samples_split=5
[CV 2/2; 1/10] END randomforestregressor__min_samples_leaf=7, randomforestregressor__min_samples_split=5;, score=-2314.329 total time= 2.8min
[CV 1/2; 2/10] START randomforestregressor__min_samples_leaf=8, randomforestregressor__min_samples_split=6
[CV 1/2; 2/10] END randomforestregressor__min_samples_leaf=8, randomforestregressor__min_samples_split=6;, score=-2377.817 total time= 2.6min
[CV 2/2; 2/10] START randomforestregressor__min_samples_leaf=8, randomforestregressor__min_samples_split=6
[CV 2/2; 2/10] END randomforestregressor__min_samples_leaf=8, randomforestregressor__

RandomizedSearchCV(cv=2,
                   estimator=Pipeline(steps=[('columntransformer',
                                              ColumnTransformer(transformers=[('num',
                                                                               Pipeline(steps=[('simpleimputer',
                                                                                                SimpleImputer(strategy='median')),
                                                                                               ('standardscaler',
                                                                                                StandardScaler())]),
                                                                               ['Size',
                                                                                'Temperature',
                                                                                'Fuel_Price',
                                                                      

In [16]:
rf_rnd_search.best_score_

-2037.6653006070487

In [17]:
rf_rnd_search.best_params_

{'randomforestregressor__min_samples_leaf': 2,
 'randomforestregressor__min_samples_split': 9}

In [23]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

params = {
    "randomforestregressor__max_depth": list(range(5,21)), 
    "randomforestregressor__max_features": ["auto","sqrt","log2"]
}

rf = make_pipeline(full_pipe, RandomForestRegressor(random_state=42, 
                                                    min_samples_leaf= 2, 
                                                    min_samples_split = 9,
                                                    n_jobs=-1))


rf_grid_search = GridSearchCV(
    estimator= rf,
    param_grid= params,
    scoring= "neg_mean_absolute_error",
    cv=2,
    verbose=4,    
)


rf_grid_search.fit(X_train, y_train)

Fitting 2 folds for each of 48 candidates, totalling 96 fits
[CV 1/2] END randomforestregressor__max_depth=5, randomforestregressor__max_features=auto;, score=-11521.008 total time=  34.6s
[CV 2/2] END randomforestregressor__max_depth=5, randomforestregressor__max_features=auto;, score=-11431.183 total time=  32.3s
[CV 1/2] END randomforestregressor__max_depth=5, randomforestregressor__max_features=sqrt;, score=-12946.335 total time=   4.8s
[CV 2/2] END randomforestregressor__max_depth=5, randomforestregressor__max_features=sqrt;, score=-13012.978 total time=   4.8s
[CV 1/2] END randomforestregressor__max_depth=5, randomforestregressor__max_features=log2;, score=-13484.995 total time=   3.4s
[CV 2/2] END randomforestregressor__max_depth=5, randomforestregressor__max_features=log2;, score=-13534.485 total time=   3.4s
[CV 1/2] END randomforestregressor__max_depth=6, randomforestregressor__max_features=auto;, score=-10889.150 total time=  36.9s
[CV 2/2] END randomforestregressor__max_dep

[CV 1/2] END randomforestregressor__max_depth=15, randomforestregressor__max_features=log2;, score=-10723.403 total time=   7.5s
[CV 2/2] END randomforestregressor__max_depth=15, randomforestregressor__max_features=log2;, score=-10672.791 total time=   7.1s
[CV 1/2] END randomforestregressor__max_depth=16, randomforestregressor__max_features=auto;, score=-6522.411 total time= 1.3min
[CV 2/2] END randomforestregressor__max_depth=16, randomforestregressor__max_features=auto;, score=-6581.521 total time= 1.4min
[CV 1/2] END randomforestregressor__max_depth=16, randomforestregressor__max_features=sqrt;, score=-9183.458 total time=   9.7s
[CV 2/2] END randomforestregressor__max_depth=16, randomforestregressor__max_features=sqrt;, score=-9268.748 total time=   9.4s
[CV 1/2] END randomforestregressor__max_depth=16, randomforestregressor__max_features=log2;, score=-10429.448 total time=   9.8s
[CV 2/2] END randomforestregressor__max_depth=16, randomforestregressor__max_features=log2;, score=-1

GridSearchCV(cv=2,
             estimator=Pipeline(steps=[('columntransformer',
                                        ColumnTransformer(transformers=[('num',
                                                                         Pipeline(steps=[('simpleimputer',
                                                                                          SimpleImputer(strategy='median')),
                                                                                         ('standardscaler',
                                                                                          StandardScaler())]),
                                                                         ['Size',
                                                                          'Temperature',
                                                                          'Fuel_Price',
                                                                          'MarkDown1',
                                           

In [24]:
rf_grid_search.best_params_

{'randomforestregressor__max_depth': 20,
 'randomforestregressor__max_features': 'auto'}

In [25]:
rf_grid_search.best_score_

-5569.0428275555805

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

params = {
    "randomforestregressor__n_estimators": list(range(100,1500,100)), 
}

rf = make_pipeline(full_pipe, RandomForestRegressor(random_state=42, 
                                                    min_samples_leaf= 2, 
                                                    min_samples_split = 9,
                                                    max_features="auto",
                                                    n_jobs=-1))


rf_grid_search = GridSearchCV(
    estimator= rf,
    param_grid= params,
    scoring= "neg_mean_absolute_error",
    cv=2,
    verbose=4,    
)

rf_grid_search.fit(X_train, y_train)

Fitting 2 folds for each of 14 candidates, totalling 28 fits
[CV 1/2] END randomforestregressor__n_estimators=100;, score=-2024.379 total time= 2.6min
[CV 2/2] END randomforestregressor__n_estimators=100;, score=-2050.952 total time= 2.5min
[CV 1/2] END randomforestregressor__n_estimators=200;, score=-2018.020 total time= 5.1min
[CV 2/2] END randomforestregressor__n_estimators=200;, score=-2046.180 total time= 5.0min
[CV 1/2] END randomforestregressor__n_estimators=300;, score=-2014.191 total time= 7.5min
[CV 2/2] END randomforestregressor__n_estimators=300;, score=-2046.790 total time= 7.4min
[CV 1/2] END randomforestregressor__n_estimators=400;, score=-2014.730 total time=13.1min
[CV 2/2] END randomforestregressor__n_estimators=400;, score=-2046.517 total time=12.2min
[CV 1/2] END randomforestregressor__n_estimators=500;, score=-2013.453 total time=13.7min
[CV 2/2] END randomforestregressor__n_estimators=500;, score=-2045.022 total time=13.6min
[CV 1/2] END randomforestregressor__n_e

In [6]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score

rf = make_pipeline(full_pipe, RandomForestRegressor(min_samples_leaf=2,
                                                    min_samples_split=9,
                                                    random_state=42,
                                                    n_jobs=-1
                                                   ))

scores = cross_val_score(rf, X_train, y_train, cv=5, scoring="neg_mean_absolute_error")
print("mean absolute errors:", -scores)
print("mean of mae", np.mean(-scores))

mean absolute errors: [1791.46104239 1843.11797992 1782.74458526 1865.49642173 1811.753268  ]
mean of mae 1818.9146594617919


In [7]:
# train the rf tunned model on all of the data 
rf = make_pipeline(full_pipe, RandomForestRegressor(min_samples_leaf=2,
                                                    min_samples_split=9,
                                                    random_state=42,
                                                    n_jobs=-1
                                                   ))
rf.fit(X_train_full, y_train_full)

# make submission
y_pred = rf.predict(X_test)
sample_submission["Weekly_Sales"] = y_pred
save_dataframe(sample_submission, "rf_hp_tunned_final.csv")

The rf model with default hyper paramters is still doing better than this model, so we will stick with that.

## Gradient Boosting

In [8]:
from sklearn.ensemble import GradientBoostingRegressor

gbrt = make_pipeline(full_pipe, GradientBoostingRegressor(random_state=42))
gbrt.fit(X_train_full, y_train_full)

# make submission
y_pred = gbrt.predict(X_test)
sample_submission["Weekly_Sales"] = y_pred
save_dataframe(sample_submission, "gbrt_default.csv")

In [22]:
# with early stopping

from sklearn.metrics import mean_absolute_error

gbrt = make_pipeline(full_pipe, GradientBoostingRegressor(random_state=42, warm_start=True))

min_val_error = float("inf")
error_going_up = 0
for n_estimators in range(100, 3050, 50):
    gbrt.n_estimators = n_estimators
    gbrt.fit(X_train, y_train)
    y_pred = gbrt.predict(X_valid)
    val_error = mean_absolute_error(y_valid, y_pred)
    print(f"n_estimators: {n_estimators}, val_error: {val_error}")
    if val_error < min_val_error:
        min_val_error = val_error
        error_going_up = 0
    else:
        error_going_up += 1
        if error_going_up == 10:
            break # early stopping

n_estimators: 100, val_error: 8016.77935995346
n_estimators: 150, val_error: 8016.77935995346
n_estimators: 200, val_error: 8016.77935995346
n_estimators: 250, val_error: 8016.77935995346
n_estimators: 300, val_error: 8016.77935995346
n_estimators: 350, val_error: 8016.77935995346
n_estimators: 400, val_error: 8016.77935995346
n_estimators: 450, val_error: 8016.77935995346
n_estimators: 500, val_error: 8016.77935995346
n_estimators: 550, val_error: 8016.77935995346
n_estimators: 600, val_error: 8016.77935995346


In [24]:
from sklearn.model_selection import GridSearchCV

gbrt = make_pipeline(full_pipe, GradientBoostingRegressor(random_state=42))

params = {
    "gradientboostingregressor__learning_rate":[0.0001, 0.001, 0.01, 0.015, 0.025, 0.05, 0.1, 0.2, 0.3],
    "gradientboostingregressor__loss": ["ls", "lad", "huber", "quantile"]
}

gbrt_grid = GridSearchCV(
    estimator= gbrt,
    param_grid = params,
    scoring="neg_mean_absolute_error",
    cv=3,
    n_jobs=-1,
    verbose=4
)
gbrt_grid.fit(X_train, y_train)

Fitting 3 folds for each of 36 candidates, totalling 108 fits


GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('columntransformer',
                                        ColumnTransformer(transformers=[('num',
                                                                         Pipeline(steps=[('simpleimputer',
                                                                                          SimpleImputer(strategy='median')),
                                                                                         ('standardscaler',
                                                                                          StandardScaler())]),
                                                                         ['Size',
                                                                          'Temperature',
                                                                          'Fuel_Price',
                                                                          'MarkDown1',
                                           

In [25]:
gbrt_grid.best_score_

-5598.5188403513885

In [27]:
gbrt_grid.best_params_

{'gradientboostingregressor__learning_rate': 0.3,
 'gradientboostingregressor__loss': 'ls'}

In [6]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV

gbrt = make_pipeline(full_pipe, GradientBoostingRegressor(random_state=42, n_estimators=1000))

params = {
    "gradientboostingregressor__learning_rate":[0.3, 0.4, 0.5, 0.6, 0.7, 0.8]
}

gbrt_grid = GridSearchCV(
    estimator= gbrt,
    param_grid = params,
    scoring="neg_mean_absolute_error",
    cv=3,
    n_jobs=-1,
    verbose=4
)
gbrt_grid.fit(X_train, y_train)

Fitting 3 folds for each of 6 candidates, totalling 18 fits


GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('columntransformer',
                                        ColumnTransformer(transformers=[('num',
                                                                         Pipeline(steps=[('simpleimputer',
                                                                                          SimpleImputer(strategy='median')),
                                                                                         ('standardscaler',
                                                                                          StandardScaler())]),
                                                                         ['Size',
                                                                          'Temperature',
                                                                          'Fuel_Price',
                                                                          'MarkDown1',
                                           

In [7]:
gbrt_grid.best_params_

{'gradientboostingregressor__learning_rate': 0.8}

In [8]:
gbrt_grid.best_score_

-2739.6919045060217

In [9]:
from sklearn.ensemble import GradientBoostingRegressor

gbrt = make_pipeline(full_pipe, GradientBoostingRegressor(random_state=42, learning_rate=0.8))
gbrt.fit(X_train_full, y_train_full)

# make submission
y_pred = gbrt.predict(X_test)
sample_submission["Weekly_Sales"] = y_pred
save_dataframe(sample_submission, "gbrt_testing_lr2.csv")

## Bagging and Pasting

In [6]:
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import make_pipeline

bag_reg = make_pipeline(full_pipe, BaggingRegressor(base_estimator= RandomForestRegressor(),
                                                    n_estimators=200,
                                                    max_samples=5000,
                                                    bootstrap= True,
                                                    random_state=42,
                                                    n_jobs=-1))
bag_reg.fit(X_train, y_train)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('simpleimputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('standardscaler',
                                                                   StandardScaler())]),
                                                  ['Size', 'Temperature',
                                                   'Fuel_Price', 'MarkDown1',
                                                   'MarkDown2', 'MarkDown3',
                                                   'MarkDown4', 'MarkDown5',
                                                   'CPI', 'Unemployment', 'Day',
                                                   'WeekOfYear']),
                                                 ('cat',
                                   

In [7]:
# make submission
y_pred = bag_reg.predict(X_test)
sample_submission["Weekly_Sales"] = y_pred
save_dataframe(sample_submission, "bagging_rf_samples5000.csv")



### Hist Gradient Boosting

In [10]:
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingRegressor

hist_gbrt = make_pipeline(full_pipe, HistGradientBoostingRegressor(random_state=42, early_stopping=True))

hist_gbrt.fit(X_train_full, y_train_full)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('simpleimputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('standardscaler',
                                                                   StandardScaler())]),
                                                  ['Size', 'Temperature',
                                                   'Fuel_Price', 'MarkDown1',
                                                   'MarkDown2', 'MarkDown3',
                                                   'MarkDown4', 'MarkDown5',
                                                   'CPI', 'Unemployment', 'Day',
                                                   'WeekOfYear']),
                                                 ('cat',
                                   

In [11]:
# make submission
y_pred = hist_gbrt.predict(X_test)
sample_submission["Weekly_Sales"] = y_pred
save_dataframe(sample_submission, "hist_gbrt_sklearn.csv")

### Extra Trees

In [12]:
from sklearn.ensemble import ExtraTreesRegressor

extra_tree = make_pipeline(full_pipe, ExtraTreesRegressor(random_state=42))
extra_tree.fit(X_train, y_train)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('simpleimputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('standardscaler',
                                                                   StandardScaler())]),
                                                  ['Size', 'Temperature',
                                                   'Fuel_Price', 'MarkDown1',
                                                   'MarkDown2', 'MarkDown3',
                                                   'MarkDown4', 'MarkDown5',
                                                   'CPI', 'Unemployment', 'Day',
                                                   'WeekOfYear']),
                                                 ('cat',
                                   

In [13]:
# make submission
y_pred = extra_tree.predict(X_test)
sample_submission["Weekly_Sales"] = y_pred
save_dataframe(sample_submission, "extra_tree_test.csv")

In [None]:
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.model_selection import GridSearchCV

params = {
    "extratreesregressor__min_samples_split": list(range(2,10)),
}

extra_tree = make_pipeline(full_pipe, ExtraTreesRegressor(random_state=42))

extree_grid = GridSearchCV(
    estimator= extra_tree,
    param_grid = params,
    scoring="neg_mean_absolute_error",
    cv=2,
    verbose=6,
    n_jobs=-1
    
)

extree_grid.fit(X_train, y_train)

Fitting 2 folds for each of 8 candidates, totalling 16 fits
