### Setup

In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import plotly.express as px
import seaborn
%matplotlib inline

# project paths
project_root_dir = os.path.normpath(os.getcwd() + os.sep + os.pardir)

data_path = os.path.join(project_root_dir, "data")
os.makedirs(data_path, exist_ok=True)

# function for reading data
def read_data(filename, date_cols=None, file_path=data_path):
    csv_path = os.path.join(file_path, filename)
    return pd.read_csv(csv_path, parse_dates=date_cols)

# function for saving data as csv file
def save_dataframe(df, filename, file_path=data_path):
    path = os.path.join(file_path, filename)
    df.to_csv(path, index=False)

### Read Data

In [2]:
train = read_data("TRAIN.CSV", date_cols=["Date"])
test = read_data("TEST_FINAL.csv", date_cols=["Date"])
submission = read_data("SAMPLE.csv")

### Prepare Data For ML

In [3]:
from prepare import prepare_data

In [4]:
X_train, y_train, X_test, full_pipe = prepare_data(train, test)

## Hyperparameter Tunning

In [5]:
from sklearn.svm import LinearSVR
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform, reciprocal 
from sklearn.pipeline import make_pipeline


svr = make_pipeline(full_pipe, LinearSVR(random_state=42, max_iter=10000))

param_dist = {"linearsvr__C": uniform(1, 10),
              "linearsvr__tol": reciprocal(0.001, 0.1),
              "linearsvr__loss":["epsilon_insensitive","squared_epsilon_insensitive"]
             }

svr_rnd_search = RandomizedSearchCV(
    svr,
    param_distributions=param_dist,
    n_iter=20,
    cv=2,
    scoring="neg_mean_squared_log_error",
    random_state=42,
    verbose=2
)

svr_rnd_search.fit(X_train, y_train)

Fitting 2 folds for each of 20 candidates, totalling 40 fits
[CV] END linearsvr__C=4.745401188473625, linearsvr__loss=epsilon_insensitive, linearsvr__tol=0.002327392228062871; total time=   1.8s
[CV] END linearsvr__C=4.745401188473625, linearsvr__loss=epsilon_insensitive, linearsvr__tol=0.002327392228062871; total time=   1.9s
[CV] END linearsvr__C=8.796910002727692, linearsvr__loss=epsilon_insensitive, linearsvr__tol=0.0020513382630874496; total time=   3.0s
[CV] END linearsvr__C=8.796910002727692, linearsvr__loss=epsilon_insensitive, linearsvr__tol=0.0020513382630874496; total time=   3.0s
[CV] END linearsvr__C=2.5599452033620267, linearsvr__loss=epsilon_insensitive, linearsvr__tol=0.00828891686688514; total time=   1.0s
[CV] END linearsvr__C=2.5599452033620267, linearsvr__loss=epsilon_insensitive, linearsvr__tol=0.00828891686688514; total time=   1.0s




[CV] END linearsvr__C=4.337086111390218, linearsvr__loss=squared_epsilon_insensitive, linearsvr__tol=0.026070247583707663; total time= 1.8min


Traceback (most recent call last):
  File "C:\Users\BHOLA\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 674, in _score
    scores = scorer(estimator, X_test, y_test)
  File "C:\Users\BHOLA\anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 199, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true,
  File "C:\Users\BHOLA\anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 242, in _score
    return self._sign * self._score_func(y_true, y_pred,
  File "C:\Users\BHOLA\anaconda3\lib\site-packages\sklearn\utils\validation.py", line 63, in inner_f
    return f(*args, **kwargs)
  File "C:\Users\BHOLA\anaconda3\lib\site-packages\sklearn\metrics\_regression.py", line 413, in mean_squared_log_error
    raise ValueError("Mean Squared Logarithmic Error cannot be used when "
ValueError: Mean Squared Logarithmic Error cannot be used when targets contain negative values.



[CV] END linearsvr__C=4.337086111390218, linearsvr__loss=squared_epsilon_insensitive, linearsvr__tol=0.026070247583707663; total time= 1.7min




[CV] END linearsvr__C=1.2058449429580245, linearsvr__loss=squared_epsilon_insensitive, linearsvr__tol=0.027796975515266813; total time= 1.8min


Traceback (most recent call last):
  File "C:\Users\BHOLA\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 674, in _score
    scores = scorer(estimator, X_test, y_test)
  File "C:\Users\BHOLA\anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 199, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true,
  File "C:\Users\BHOLA\anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 242, in _score
    return self._sign * self._score_func(y_true, y_pred,
  File "C:\Users\BHOLA\anaconda3\lib\site-packages\sklearn\utils\validation.py", line 63, in inner_f
    return f(*args, **kwargs)
  File "C:\Users\BHOLA\anaconda3\lib\site-packages\sklearn\metrics\_regression.py", line 413, in mean_squared_log_error
    raise ValueError("Mean Squared Logarithmic Error cannot be used when "
ValueError: Mean Squared Logarithmic Error cannot be used when targets contain negative values.



[CV] END linearsvr__C=1.2058449429580245, linearsvr__loss=squared_epsilon_insensitive, linearsvr__tol=0.027796975515266813; total time= 1.8min




[CV] END linearsvr__C=10.385527090157503, linearsvr__loss=squared_epsilon_insensitive, linearsvr__tol=0.002310201887845293; total time= 1.7min


Traceback (most recent call last):
  File "C:\Users\BHOLA\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 674, in _score
    scores = scorer(estimator, X_test, y_test)
  File "C:\Users\BHOLA\anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 199, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true,
  File "C:\Users\BHOLA\anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 242, in _score
    return self._sign * self._score_func(y_true, y_pred,
  File "C:\Users\BHOLA\anaconda3\lib\site-packages\sklearn\utils\validation.py", line 63, in inner_f
    return f(*args, **kwargs)
  File "C:\Users\BHOLA\anaconda3\lib\site-packages\sklearn\metrics\_regression.py", line 413, in mean_squared_log_error
    raise ValueError("Mean Squared Logarithmic Error cannot be used when "
ValueError: Mean Squared Logarithmic Error cannot be used when targets contain negative values.



[CV] END linearsvr__C=10.385527090157503, linearsvr__loss=squared_epsilon_insensitive, linearsvr__tol=0.002310201887845293; total time= 1.8min




[CV] END linearsvr__C=2.834045098534338, linearsvr__loss=squared_epsilon_insensitive, linearsvr__tol=0.016722697006183673; total time= 1.8min


Traceback (most recent call last):
  File "C:\Users\BHOLA\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 674, in _score
    scores = scorer(estimator, X_test, y_test)
  File "C:\Users\BHOLA\anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 199, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true,
  File "C:\Users\BHOLA\anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 242, in _score
    return self._sign * self._score_func(y_true, y_pred,
  File "C:\Users\BHOLA\anaconda3\lib\site-packages\sklearn\utils\validation.py", line 63, in inner_f
    return f(*args, **kwargs)
  File "C:\Users\BHOLA\anaconda3\lib\site-packages\sklearn\metrics\_regression.py", line 413, in mean_squared_log_error
    raise ValueError("Mean Squared Logarithmic Error cannot be used when "
ValueError: Mean Squared Logarithmic Error cannot be used when targets contain negative values.



[CV] END linearsvr__C=2.834045098534338, linearsvr__loss=squared_epsilon_insensitive, linearsvr__tol=0.016722697006183673; total time= 1.7min
[CV] END linearsvr__C=1.070663052197174, linearsvr__loss=epsilon_insensitive, linearsvr__tol=0.003823475224675185; total time=   0.6s
[CV] END linearsvr__C=1.070663052197174, linearsvr__loss=epsilon_insensitive, linearsvr__tol=0.003823475224675185; total time=   0.6s




[CV] END linearsvr__C=7.118528947223795, linearsvr__loss=squared_epsilon_insensitive, linearsvr__tol=0.001239742034078414; total time= 1.8min


Traceback (most recent call last):
  File "C:\Users\BHOLA\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 674, in _score
    scores = scorer(estimator, X_test, y_test)
  File "C:\Users\BHOLA\anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 199, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true,
  File "C:\Users\BHOLA\anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 242, in _score
    return self._sign * self._score_func(y_true, y_pred,
  File "C:\Users\BHOLA\anaconda3\lib\site-packages\sklearn\utils\validation.py", line 63, in inner_f
    return f(*args, **kwargs)
  File "C:\Users\BHOLA\anaconda3\lib\site-packages\sklearn\metrics\_regression.py", line 413, in mean_squared_log_error
    raise ValueError("Mean Squared Logarithmic Error cannot be used when "
ValueError: Mean Squared Logarithmic Error cannot be used when targets contain negative values.



[CV] END linearsvr__C=7.118528947223795, linearsvr__loss=squared_epsilon_insensitive, linearsvr__tol=0.001239742034078414; total time= 1.8min
[CV] END linearsvr__C=10.737555188414591, linearsvr__loss=epsilon_insensitive, linearsvr__tol=0.008168455894760163; total time=   3.1s
[CV] END linearsvr__C=10.737555188414591, linearsvr__loss=epsilon_insensitive, linearsvr__tol=0.008168455894760163; total time=   3.2s
[CV] END linearsvr__C=8.851759613930136, linearsvr__loss=epsilon_insensitive, linearsvr__tol=0.005820013372709849; total time=   2.8s
[CV] END linearsvr__C=8.851759613930136, linearsvr__loss=epsilon_insensitive, linearsvr__tol=0.005820013372709849; total time=   2.7s
[CV] END linearsvr__C=10.832308858067883, linearsvr__loss=epsilon_insensitive, linearsvr__tol=0.0012385137298860929; total time=   4.4s
[CV] END linearsvr__C=10.832308858067883, linearsvr__loss=epsilon_insensitive, linearsvr__tol=0.0012385137298860929; total time=   4.1s
[CV] END linearsvr__C=7.075448519014383, linears



[CV] END linearsvr__C=3.3089382562214897, linearsvr__loss=squared_epsilon_insensitive, linearsvr__tol=0.007591104805282692; total time= 1.8min


Traceback (most recent call last):
  File "C:\Users\BHOLA\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 674, in _score
    scores = scorer(estimator, X_test, y_test)
  File "C:\Users\BHOLA\anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 199, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true,
  File "C:\Users\BHOLA\anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 242, in _score
    return self._sign * self._score_func(y_true, y_pred,
  File "C:\Users\BHOLA\anaconda3\lib\site-packages\sklearn\utils\validation.py", line 63, in inner_f
    return f(*args, **kwargs)
  File "C:\Users\BHOLA\anaconda3\lib\site-packages\sklearn\metrics\_regression.py", line 413, in mean_squared_log_error
    raise ValueError("Mean Squared Logarithmic Error cannot be used when "
ValueError: Mean Squared Logarithmic Error cannot be used when targets contain negative values.



[CV] END linearsvr__C=3.3089382562214897, linearsvr__loss=squared_epsilon_insensitive, linearsvr__tol=0.007591104805282692; total time= 1.8min
[CV] END linearsvr__C=2.2203823484477883, linearsvr__loss=epsilon_insensitive, linearsvr__tol=0.046386309723972764; total time=   0.6s
[CV] END linearsvr__C=2.2203823484477883, linearsvr__loss=epsilon_insensitive, linearsvr__tol=0.046386309723972764; total time=   0.6s
[CV] END linearsvr__C=2.733646535077721, linearsvr__loss=epsilon_insensitive, linearsvr__tol=0.0032927591344236156; total time=   1.3s
[CV] END linearsvr__C=2.733646535077721, linearsvr__loss=epsilon_insensitive, linearsvr__tol=0.0032927591344236156; total time=   1.2s




[CV] END linearsvr__C=7.62522284353982, linearsvr__loss=squared_epsilon_insensitive, linearsvr__tol=0.007084541505250228; total time= 1.7min


Traceback (most recent call last):
  File "C:\Users\BHOLA\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 674, in _score
    scores = scorer(estimator, X_test, y_test)
  File "C:\Users\BHOLA\anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 199, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true,
  File "C:\Users\BHOLA\anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 242, in _score
    return self._sign * self._score_func(y_true, y_pred,
  File "C:\Users\BHOLA\anaconda3\lib\site-packages\sklearn\utils\validation.py", line 63, in inner_f
    return f(*args, **kwargs)
  File "C:\Users\BHOLA\anaconda3\lib\site-packages\sklearn\metrics\_regression.py", line 413, in mean_squared_log_error
    raise ValueError("Mean Squared Logarithmic Error cannot be used when "
ValueError: Mean Squared Logarithmic Error cannot be used when targets contain negative values.



[CV] END linearsvr__C=7.62522284353982, linearsvr__loss=squared_epsilon_insensitive, linearsvr__tol=0.007084541505250228; total time= 1.7min




[CV] END linearsvr__C=3.079416628681888, linearsvr__loss=squared_epsilon_insensitive, linearsvr__tol=0.0023426581058204037; total time= 1.7min


Traceback (most recent call last):
  File "C:\Users\BHOLA\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 674, in _score
    scores = scorer(estimator, X_test, y_test)
  File "C:\Users\BHOLA\anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 199, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true,
  File "C:\Users\BHOLA\anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 242, in _score
    return self._sign * self._score_func(y_true, y_pred,
  File "C:\Users\BHOLA\anaconda3\lib\site-packages\sklearn\utils\validation.py", line 63, in inner_f
    return f(*args, **kwargs)
  File "C:\Users\BHOLA\anaconda3\lib\site-packages\sklearn\metrics\_regression.py", line 413, in mean_squared_log_error
    raise ValueError("Mean Squared Logarithmic Error cannot be used when "
ValueError: Mean Squared Logarithmic Error cannot be used when targets contain negative values.



[CV] END linearsvr__C=3.079416628681888, linearsvr__loss=squared_epsilon_insensitive, linearsvr__tol=0.0023426581058204037; total time= 1.7min


         nan -0.1446513          nan -0.14978293 -0.1443336  -0.14565593
 -0.15275757 -0.14418458 -0.14527968         nan -0.14185327 -0.14338048
         nan         nan]


RandomizedSearchCV(cv=2,
                   estimator=Pipeline(steps=[('columntransformer',
                                              ColumnTransformer(transformers=[('num',
                                                                               Pipeline(steps=[('simpleimputer',
                                                                                                SimpleImputer())]),
                                                                               ['Store_id']),
                                                                              ('cat',
                                                                               Pipeline(steps=[('simpleimputer',
                                                                                                SimpleImputer(fill_value='NA',
                                                                                                              strategy='constant')),
                                        

In [6]:
svr_rnd_search.best_score_

-0.14158182102872205

In [7]:
svr_rnd_search.best_params_

{'linearsvr__C': 2.5599452033620267,
 'linearsvr__loss': 'epsilon_insensitive',
 'linearsvr__tol': 0.00828891686688514}

In [8]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

params = {
    "randomforestregressor__min_samples_split": randint(low=2, high=10),
    "randomforestregressor__min_samples_leaf": randint(low=1, high=10), 
}

rf = make_pipeline(full_pipe, RandomForestRegressor(random_state=42, n_jobs=-1))

rf_rnd_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions= params,
    n_iter=10,
    scoring="neg_mean_squared_log_error",
    cv=2,
    verbose=2,
    random_state=42)


rf_rnd_search.fit(X_train, y_train)

Fitting 2 folds for each of 10 candidates, totalling 20 fits
[CV] END randomforestregressor__min_samples_leaf=7, randomforestregressor__min_samples_split=5; total time=   9.4s
[CV] END randomforestregressor__min_samples_leaf=7, randomforestregressor__min_samples_split=5; total time=   6.8s
[CV] END randomforestregressor__min_samples_leaf=8, randomforestregressor__min_samples_split=6; total time=   7.3s
[CV] END randomforestregressor__min_samples_leaf=8, randomforestregressor__min_samples_split=6; total time=   6.5s
[CV] END randomforestregressor__min_samples_leaf=5, randomforestregressor__min_samples_split=8; total time=   7.7s
[CV] END randomforestregressor__min_samples_leaf=5, randomforestregressor__min_samples_split=8; total time=   6.7s
[CV] END randomforestregressor__min_samples_leaf=3, randomforestregressor__min_samples_split=8; total time=   7.8s
[CV] END randomforestregressor__min_samples_leaf=3, randomforestregressor__min_samples_split=8; total time=   7.4s
[CV] END randomfore

RandomizedSearchCV(cv=2,
                   estimator=Pipeline(steps=[('columntransformer',
                                              ColumnTransformer(transformers=[('num',
                                                                               Pipeline(steps=[('simpleimputer',
                                                                                                SimpleImputer())]),
                                                                               ['Store_id']),
                                                                              ('cat',
                                                                               Pipeline(steps=[('simpleimputer',
                                                                                                SimpleImputer(fill_value='NA',
                                                                                                              strategy='constant')),
                                        

In [9]:
rf_rnd_search.best_score_

-0.12414510460809519

In [10]:
rf_rnd_search.best_params_

{'randomforestregressor__min_samples_leaf': 8,
 'randomforestregressor__min_samples_split': 6}

In [13]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

params = {
    "randomforestregressor__max_depth": list(range(20,40)), 
    "randomforestregressor__max_features": ["auto","sqrt","log2"]
}

rf = make_pipeline(full_pipe, RandomForestRegressor(random_state=42, 
                                                    min_samples_leaf=8, 
                                                    min_samples_split=6,
                                                    n_jobs=-1))


rf_grid_search = GridSearchCV(
    estimator= rf,
    param_grid= params,
    scoring= "neg_mean_squared_log_error",
    cv=3,
    verbose=2,    
)


rf_grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 60 candidates, totalling 180 fits
[CV] END randomforestregressor__max_depth=20, randomforestregressor__max_features=auto; total time=  14.8s
[CV] END randomforestregressor__max_depth=20, randomforestregressor__max_features=auto; total time=  11.6s
[CV] END randomforestregressor__max_depth=20, randomforestregressor__max_features=auto; total time=  12.0s
[CV] END randomforestregressor__max_depth=20, randomforestregressor__max_features=sqrt; total time=   3.7s
[CV] END randomforestregressor__max_depth=20, randomforestregressor__max_features=sqrt; total time=   3.3s
[CV] END randomforestregressor__max_depth=20, randomforestregressor__max_features=sqrt; total time=   3.9s
[CV] END randomforestregressor__max_depth=20, randomforestregressor__max_features=log2; total time=   3.3s
[CV] END randomforestregressor__max_depth=20, randomforestregressor__max_features=log2; total time=   3.1s
[CV] END randomforestregressor__max_depth=20, randomforestregressor__max_features=

[CV] END randomforestregressor__max_depth=28, randomforestregressor__max_features=sqrt; total time=   3.5s
[CV] END randomforestregressor__max_depth=28, randomforestregressor__max_features=sqrt; total time=   3.9s
[CV] END randomforestregressor__max_depth=28, randomforestregressor__max_features=log2; total time=   3.4s
[CV] END randomforestregressor__max_depth=28, randomforestregressor__max_features=log2; total time=   3.2s
[CV] END randomforestregressor__max_depth=28, randomforestregressor__max_features=log2; total time=   3.5s
[CV] END randomforestregressor__max_depth=29, randomforestregressor__max_features=auto; total time=  14.4s
[CV] END randomforestregressor__max_depth=29, randomforestregressor__max_features=auto; total time=  13.0s
[CV] END randomforestregressor__max_depth=29, randomforestregressor__max_features=auto; total time=  13.0s
[CV] END randomforestregressor__max_depth=29, randomforestregressor__max_features=sqrt; total time=   3.8s
[CV] END randomforestregressor__max_d

[CV] END randomforestregressor__max_depth=37, randomforestregressor__max_features=auto; total time=  13.8s
[CV] END randomforestregressor__max_depth=37, randomforestregressor__max_features=auto; total time=  11.6s
[CV] END randomforestregressor__max_depth=37, randomforestregressor__max_features=auto; total time=  12.3s
[CV] END randomforestregressor__max_depth=37, randomforestregressor__max_features=sqrt; total time=   3.7s
[CV] END randomforestregressor__max_depth=37, randomforestregressor__max_features=sqrt; total time=   3.6s
[CV] END randomforestregressor__max_depth=37, randomforestregressor__max_features=sqrt; total time=   3.7s
[CV] END randomforestregressor__max_depth=37, randomforestregressor__max_features=log2; total time=   3.5s
[CV] END randomforestregressor__max_depth=37, randomforestregressor__max_features=log2; total time=   3.2s
[CV] END randomforestregressor__max_depth=37, randomforestregressor__max_features=log2; total time=   3.5s
[CV] END randomforestregressor__max_d

GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('columntransformer',
                                        ColumnTransformer(transformers=[('num',
                                                                         Pipeline(steps=[('simpleimputer',
                                                                                          SimpleImputer())]),
                                                                         ['Store_id']),
                                                                        ('cat',
                                                                         Pipeline(steps=[('simpleimputer',
                                                                                          SimpleImputer(fill_value='NA',
                                                                                                        strategy='constant')),
                                                                                         ('onehotenc

In [14]:
rf_grid_search.best_score_

-0.11696036790116093

In [15]:
rf_grid_search.best_params_

{'randomforestregressor__max_depth': 30,
 'randomforestregressor__max_features': 'sqrt'}

In [20]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

params = {
    "randomforestregressor__n_estimators": list(range(100,2000,100)), 
}

rf = make_pipeline(full_pipe, RandomForestRegressor(random_state=42, 
                                                    max_depth=30,
                                                    min_samples_leaf= 8, 
                                                    min_samples_split= 6,
                                                    max_features="sqrt",
                                                    n_jobs=-1))


rf_grid_search = GridSearchCV(
    estimator= rf,
    param_grid= params,
    scoring= "neg_mean_squared_log_error",
    cv=3,
    verbose=2,    
)

rf_grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 19 candidates, totalling 57 fits
[CV] END ............randomforestregressor__n_estimators=100; total time=   5.5s
[CV] END ............randomforestregressor__n_estimators=100; total time=   3.2s
[CV] END ............randomforestregressor__n_estimators=100; total time=   3.7s
[CV] END ............randomforestregressor__n_estimators=200; total time=   7.7s
[CV] END ............randomforestregressor__n_estimators=200; total time=   6.7s
[CV] END ............randomforestregressor__n_estimators=200; total time=   7.2s
[CV] END ............randomforestregressor__n_estimators=300; total time=  10.9s
[CV] END ............randomforestregressor__n_estimators=300; total time=   9.4s
[CV] END ............randomforestregressor__n_estimators=300; total time=   9.7s
[CV] END ............randomforestregressor__n_estimators=400; total time=  13.6s
[CV] END ............randomforestregressor__n_estimators=400; total time=  11.8s
[CV] END ............randomforestregressor__n_es

GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('columntransformer',
                                        ColumnTransformer(transformers=[('num',
                                                                         Pipeline(steps=[('simpleimputer',
                                                                                          SimpleImputer())]),
                                                                         ['Store_id']),
                                                                        ('cat',
                                                                         Pipeline(steps=[('simpleimputer',
                                                                                          SimpleImputer(fill_value='NA',
                                                                                                        strategy='constant')),
                                                                                         ('onehotenc

In [21]:
rf_grid_search.best_score_

-0.11696036790116093

In [22]:
rf_grid_search.best_params_

{'randomforestregressor__n_estimators': 100}

In [23]:
submission['Sales'] = rf_grid_search.best_estimator_.predict(X_test)
save_dataframe(submission,"rf_hyper1.csv")

### XGBoost

In [5]:
from sklearn.pipeline import make_pipeline
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV

xgb_reg = make_pipeline(full_pipe, XGBRegressor(objective= "reg:squaredlogerror",
                                                random_state=42, 
                                                tree_method="gpu_hist", 
                                                n_jobs=-1,))

params = {
    "xgbregressor__learning_rate": [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
    "xgbregressor__max_depth": list(range(5,20))
}

xgb_grid = GridSearchCV(
    estimator= xgb_reg,
    param_grid = params,
    scoring="neg_mean_squared_log_error",
    cv=2,
    verbose=2
)
xgb_grid.fit(X_train, y_train)

Fitting 2 folds for each of 135 candidates, totalling 270 fits
[CV] END xgbregressor__learning_rate=0.1, xgbregressor__max_depth=5; total time=   2.4s
[CV] END xgbregressor__learning_rate=0.1, xgbregressor__max_depth=5; total time=   0.5s
[CV] END xgbregressor__learning_rate=0.1, xgbregressor__max_depth=6; total time=   0.5s
[CV] END xgbregressor__learning_rate=0.1, xgbregressor__max_depth=6; total time=   0.5s
[CV] END xgbregressor__learning_rate=0.1, xgbregressor__max_depth=7; total time=   0.5s
[CV] END xgbregressor__learning_rate=0.1, xgbregressor__max_depth=7; total time=   0.5s
[CV] END xgbregressor__learning_rate=0.1, xgbregressor__max_depth=8; total time=   0.5s
[CV] END xgbregressor__learning_rate=0.1, xgbregressor__max_depth=8; total time=   0.5s
[CV] END xgbregressor__learning_rate=0.1, xgbregressor__max_depth=9; total time=   0.5s
[CV] END xgbregressor__learning_rate=0.1, xgbregressor__max_depth=9; total time=   0.5s
[CV] END xgbregressor__learning_rate=0.1, xgbregressor__m

[CV] END xgbregressor__learning_rate=0.4, xgbregressor__max_depth=6; total time=   0.5s
[CV] END xgbregressor__learning_rate=0.4, xgbregressor__max_depth=6; total time=   0.6s
[CV] END xgbregressor__learning_rate=0.4, xgbregressor__max_depth=7; total time=   0.6s
[CV] END xgbregressor__learning_rate=0.4, xgbregressor__max_depth=7; total time=   0.6s
[CV] END xgbregressor__learning_rate=0.4, xgbregressor__max_depth=8; total time=   0.5s
[CV] END xgbregressor__learning_rate=0.4, xgbregressor__max_depth=8; total time=   0.5s
[CV] END xgbregressor__learning_rate=0.4, xgbregressor__max_depth=9; total time=   0.6s
[CV] END xgbregressor__learning_rate=0.4, xgbregressor__max_depth=9; total time=   0.6s
[CV] END xgbregressor__learning_rate=0.4, xgbregressor__max_depth=10; total time=   0.5s
[CV] END xgbregressor__learning_rate=0.4, xgbregressor__max_depth=10; total time=   0.6s
[CV] END xgbregressor__learning_rate=0.4, xgbregressor__max_depth=11; total time=   0.5s
[CV] END xgbregressor__learni

[CV] END xgbregressor__learning_rate=0.7, xgbregressor__max_depth=7; total time=   0.5s
[CV] END xgbregressor__learning_rate=0.7, xgbregressor__max_depth=8; total time=   0.5s
[CV] END xgbregressor__learning_rate=0.7, xgbregressor__max_depth=8; total time=   0.5s
[CV] END xgbregressor__learning_rate=0.7, xgbregressor__max_depth=9; total time=   0.5s
[CV] END xgbregressor__learning_rate=0.7, xgbregressor__max_depth=9; total time=   0.5s
[CV] END xgbregressor__learning_rate=0.7, xgbregressor__max_depth=10; total time=   0.5s
[CV] END xgbregressor__learning_rate=0.7, xgbregressor__max_depth=10; total time=   0.5s
[CV] END xgbregressor__learning_rate=0.7, xgbregressor__max_depth=11; total time=   0.5s
[CV] END xgbregressor__learning_rate=0.7, xgbregressor__max_depth=11; total time=   0.5s
[CV] END xgbregressor__learning_rate=0.7, xgbregressor__max_depth=12; total time=   0.5s
[CV] END xgbregressor__learning_rate=0.7, xgbregressor__max_depth=12; total time=   0.5s
[CV] END xgbregressor__lea

GridSearchCV(cv=2,
             estimator=Pipeline(steps=[('columntransformer',
                                        ColumnTransformer(transformers=[('num',
                                                                         Pipeline(steps=[('simpleimputer',
                                                                                          SimpleImputer())]),
                                                                         ['Store_id']),
                                                                        ('cat',
                                                                         Pipeline(steps=[('simpleimputer',
                                                                                          SimpleImputer(fill_value='NA',
                                                                                                        strategy='constant')),
                                                                                         ('onehotenc

In [6]:
xgb_grid.best_score_

-14.330466206973906

In [7]:
xgb_grid.best_params_

{'xgbregressor__learning_rate': 0.8, 'xgbregressor__max_depth': 5}

In [11]:
xgb_reg = make_pipeline(full_pipe, XGBRegressor(objective= "reg:squaredlogerror",
                                                learning_rate=0.8, 
                                                max_depth= 5,
                                                random_state=42, 
                                                tree_method="gpu_hist", 
                                                n_jobs=-1,))

params = {
    "xgbregressor__colsample_bytree": [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1],
    "xgbregressor__colsample_bylevel": [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]
}

xgb_grid = GridSearchCV(
    estimator= xgb_reg,
    param_grid = params,
    scoring="neg_mean_squared_log_error",
    cv=3,
    verbose=2
)
xgb_grid.fit(X_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits
[CV] END xgbregressor__colsample_bylevel=0.1, xgbregressor__colsample_bytree=0.1; total time=   0.6s
[CV] END xgbregressor__colsample_bylevel=0.1, xgbregressor__colsample_bytree=0.1; total time=   0.6s
[CV] END xgbregressor__colsample_bylevel=0.1, xgbregressor__colsample_bytree=0.1; total time=   0.6s
[CV] END xgbregressor__colsample_bylevel=0.1, xgbregressor__colsample_bytree=0.2; total time=   0.6s
[CV] END xgbregressor__colsample_bylevel=0.1, xgbregressor__colsample_bytree=0.2; total time=   0.6s
[CV] END xgbregressor__colsample_bylevel=0.1, xgbregressor__colsample_bytree=0.2; total time=   0.6s
[CV] END xgbregressor__colsample_bylevel=0.1, xgbregressor__colsample_bytree=0.3; total time=   0.6s
[CV] END xgbregressor__colsample_bylevel=0.1, xgbregressor__colsample_bytree=0.3; total time=   0.6s
[CV] END xgbregressor__colsample_bylevel=0.1, xgbregressor__colsample_bytree=0.3; total time=   0.6s
[CV] END xgbregressor__colsa

[CV] END xgbregressor__colsample_bylevel=0.3, xgbregressor__colsample_bytree=0.8; total time=   0.6s
[CV] END xgbregressor__colsample_bylevel=0.3, xgbregressor__colsample_bytree=0.8; total time=   0.6s
[CV] END xgbregressor__colsample_bylevel=0.3, xgbregressor__colsample_bytree=0.8; total time=   0.6s
[CV] END xgbregressor__colsample_bylevel=0.3, xgbregressor__colsample_bytree=0.9; total time=   0.6s
[CV] END xgbregressor__colsample_bylevel=0.3, xgbregressor__colsample_bytree=0.9; total time=   0.6s
[CV] END xgbregressor__colsample_bylevel=0.3, xgbregressor__colsample_bytree=0.9; total time=   0.6s
[CV] END xgbregressor__colsample_bylevel=0.3, xgbregressor__colsample_bytree=1; total time=   0.6s
[CV] END xgbregressor__colsample_bylevel=0.3, xgbregressor__colsample_bytree=1; total time=   0.6s
[CV] END xgbregressor__colsample_bylevel=0.3, xgbregressor__colsample_bytree=1; total time=   0.6s
[CV] END xgbregressor__colsample_bylevel=0.4, xgbregressor__colsample_bytree=0.1; total time=   0

[CV] END xgbregressor__colsample_bylevel=0.6, xgbregressor__colsample_bytree=0.5; total time=   0.6s
[CV] END xgbregressor__colsample_bylevel=0.6, xgbregressor__colsample_bytree=0.5; total time=   0.6s
[CV] END xgbregressor__colsample_bylevel=0.6, xgbregressor__colsample_bytree=0.6; total time=   0.6s
[CV] END xgbregressor__colsample_bylevel=0.6, xgbregressor__colsample_bytree=0.6; total time=   0.6s
[CV] END xgbregressor__colsample_bylevel=0.6, xgbregressor__colsample_bytree=0.6; total time=   0.6s
[CV] END xgbregressor__colsample_bylevel=0.6, xgbregressor__colsample_bytree=0.7; total time=   0.6s
[CV] END xgbregressor__colsample_bylevel=0.6, xgbregressor__colsample_bytree=0.7; total time=   0.6s
[CV] END xgbregressor__colsample_bylevel=0.6, xgbregressor__colsample_bytree=0.7; total time=   0.6s
[CV] END xgbregressor__colsample_bylevel=0.6, xgbregressor__colsample_bytree=0.8; total time=   0.6s
[CV] END xgbregressor__colsample_bylevel=0.6, xgbregressor__colsample_bytree=0.8; total tim

[CV] END xgbregressor__colsample_bylevel=0.9, xgbregressor__colsample_bytree=0.2; total time=   0.6s
[CV] END xgbregressor__colsample_bylevel=0.9, xgbregressor__colsample_bytree=0.3; total time=   0.6s
[CV] END xgbregressor__colsample_bylevel=0.9, xgbregressor__colsample_bytree=0.3; total time=   0.6s
[CV] END xgbregressor__colsample_bylevel=0.9, xgbregressor__colsample_bytree=0.3; total time=   0.6s
[CV] END xgbregressor__colsample_bylevel=0.9, xgbregressor__colsample_bytree=0.4; total time=   0.6s
[CV] END xgbregressor__colsample_bylevel=0.9, xgbregressor__colsample_bytree=0.4; total time=   0.6s
[CV] END xgbregressor__colsample_bylevel=0.9, xgbregressor__colsample_bytree=0.4; total time=   0.6s
[CV] END xgbregressor__colsample_bylevel=0.9, xgbregressor__colsample_bytree=0.5; total time=   0.6s
[CV] END xgbregressor__colsample_bylevel=0.9, xgbregressor__colsample_bytree=0.5; total time=   0.6s
[CV] END xgbregressor__colsample_bylevel=0.9, xgbregressor__colsample_bytree=0.5; total tim

GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('columntransformer',
                                        ColumnTransformer(transformers=[('num',
                                                                         Pipeline(steps=[('simpleimputer',
                                                                                          SimpleImputer())]),
                                                                         ['Store_id']),
                                                                        ('cat',
                                                                         Pipeline(steps=[('simpleimputer',
                                                                                          SimpleImputer(fill_value='NA',
                                                                                                        strategy='constant')),
                                                                                         ('onehotenc

In [12]:
xgb_grid.best_score_

-13.8486153571469

In [13]:
xgb_grid.best_params_

{'xgbregressor__colsample_bylevel': 0.1, 'xgbregressor__colsample_bytree': 0.1}

In [14]:
xgb_reg = make_pipeline(full_pipe, XGBRegressor(objective= "reg:squaredlogerror",
                                                learning_rate=0.8, 
                                                max_depth= 5,
                                                colsample_bylevel=0.1,
                                                colsample_bytree=0.1,
                                                random_state=42, 
                                                tree_method="gpu_hist", 
                                                n_jobs=-1,))

params = {
    "xgbregressor__tree_method": ["auto", "exact", "approx", "hist", "gpu_hist"],
}

xgb_grid = GridSearchCV(
    estimator= xgb_reg,
    param_grid = params,
    scoring="neg_mean_squared_log_error",
    cv=3,
    verbose=2
)
xgb_grid.fit(X_train, y_train)

Fitting 3 folds for each of 5 candidates, totalling 15 fits
[CV] END .....................xgbregressor__tree_method=auto; total time=   0.9s
[CV] END .....................xgbregressor__tree_method=auto; total time=   0.9s
[CV] END .....................xgbregressor__tree_method=auto; total time=   0.9s
[CV] END ....................xgbregressor__tree_method=exact; total time=   0.9s
[CV] END ....................xgbregressor__tree_method=exact; total time=   0.9s
[CV] END ....................xgbregressor__tree_method=exact; total time=   0.9s
[CV] END ...................xgbregressor__tree_method=approx; total time=   1.2s
[CV] END ...................xgbregressor__tree_method=approx; total time=   1.2s
[CV] END ...................xgbregressor__tree_method=approx; total time=   1.3s
[CV] END .....................xgbregressor__tree_method=hist; total time=   0.7s
[CV] END .....................xgbregressor__tree_method=hist; total time=   0.7s
[CV] END .....................xgbregressor__tree_

GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('columntransformer',
                                        ColumnTransformer(transformers=[('num',
                                                                         Pipeline(steps=[('simpleimputer',
                                                                                          SimpleImputer())]),
                                                                         ['Store_id']),
                                                                        ('cat',
                                                                         Pipeline(steps=[('simpleimputer',
                                                                                          SimpleImputer(fill_value='NA',
                                                                                                        strategy='constant')),
                                                                                         ('onehotenc

In [15]:
xgb_grid.best_score_

-13.8486153571469

In [16]:
xgb_grid.best_params_

{'xgbregressor__tree_method': 'auto'}

In [17]:
xgb_reg = make_pipeline(full_pipe, XGBRegressor(objective= "reg:squaredlogerror",
                                                learning_rate=0.8, 
                                                max_depth= 5,
                                                colsample_bylevel=0.1,
                                                colsample_bytree=0.1,
                                                random_state=42, 
                                                tree_method="auto", 
                                                n_jobs=-1,))

params = {
    "xgbregressor__reg_lambda": [0.1, 0.5, 1, 1.3, 1.5, 1.7, 2, 3, 4],
    "xgbregressor__reg_alpha": [0, 0.3, 0.5, 0.7, 1, 1.5, 2, 2.5, 3]
}

xgb_grid = GridSearchCV(
    estimator= xgb_reg,
    param_grid = params,
    scoring="neg_mean_squared_log_error",
    cv=3,
    verbose=2
)
xgb_grid.fit(X_train, y_train)

Fitting 3 folds for each of 81 candidates, totalling 243 fits
[CV] END xgbregressor__reg_alpha=0, xgbregressor__reg_lambda=0.1; total time=   0.9s
[CV] END xgbregressor__reg_alpha=0, xgbregressor__reg_lambda=0.1; total time=   1.0s
[CV] END xgbregressor__reg_alpha=0, xgbregressor__reg_lambda=0.1; total time=   0.9s
[CV] END xgbregressor__reg_alpha=0, xgbregressor__reg_lambda=0.5; total time=   1.0s
[CV] END xgbregressor__reg_alpha=0, xgbregressor__reg_lambda=0.5; total time=   0.9s
[CV] END xgbregressor__reg_alpha=0, xgbregressor__reg_lambda=0.5; total time=   0.9s
[CV] END xgbregressor__reg_alpha=0, xgbregressor__reg_lambda=1; total time=   0.9s
[CV] END xgbregressor__reg_alpha=0, xgbregressor__reg_lambda=1; total time=   0.9s
[CV] END xgbregressor__reg_alpha=0, xgbregressor__reg_lambda=1; total time=   0.9s
[CV] END xgbregressor__reg_alpha=0, xgbregressor__reg_lambda=1.3; total time=   0.9s
[CV] END xgbregressor__reg_alpha=0, xgbregressor__reg_lambda=1.3; total time=   1.1s
[CV] END 

[CV] END xgbregressor__reg_alpha=0.7, xgbregressor__reg_lambda=1.5; total time=   1.0s
[CV] END xgbregressor__reg_alpha=0.7, xgbregressor__reg_lambda=1.7; total time=   1.0s
[CV] END xgbregressor__reg_alpha=0.7, xgbregressor__reg_lambda=1.7; total time=   1.0s
[CV] END xgbregressor__reg_alpha=0.7, xgbregressor__reg_lambda=1.7; total time=   0.9s
[CV] END xgbregressor__reg_alpha=0.7, xgbregressor__reg_lambda=2; total time=   0.9s
[CV] END xgbregressor__reg_alpha=0.7, xgbregressor__reg_lambda=2; total time=   0.9s
[CV] END xgbregressor__reg_alpha=0.7, xgbregressor__reg_lambda=2; total time=   1.0s
[CV] END xgbregressor__reg_alpha=0.7, xgbregressor__reg_lambda=3; total time=   1.0s
[CV] END xgbregressor__reg_alpha=0.7, xgbregressor__reg_lambda=3; total time=   1.0s
[CV] END xgbregressor__reg_alpha=0.7, xgbregressor__reg_lambda=3; total time=   0.9s
[CV] END xgbregressor__reg_alpha=0.7, xgbregressor__reg_lambda=4; total time=   0.9s
[CV] END xgbregressor__reg_alpha=0.7, xgbregressor__reg_l

[CV] END xgbregressor__reg_alpha=2.5, xgbregressor__reg_lambda=0.5; total time=   0.9s
[CV] END xgbregressor__reg_alpha=2.5, xgbregressor__reg_lambda=0.5; total time=   1.0s
[CV] END xgbregressor__reg_alpha=2.5, xgbregressor__reg_lambda=0.5; total time=   0.9s
[CV] END xgbregressor__reg_alpha=2.5, xgbregressor__reg_lambda=1; total time=   1.0s
[CV] END xgbregressor__reg_alpha=2.5, xgbregressor__reg_lambda=1; total time=   0.9s
[CV] END xgbregressor__reg_alpha=2.5, xgbregressor__reg_lambda=1; total time=   0.9s
[CV] END xgbregressor__reg_alpha=2.5, xgbregressor__reg_lambda=1.3; total time=   0.9s
[CV] END xgbregressor__reg_alpha=2.5, xgbregressor__reg_lambda=1.3; total time=   1.1s
[CV] END xgbregressor__reg_alpha=2.5, xgbregressor__reg_lambda=1.3; total time=   1.2s
[CV] END xgbregressor__reg_alpha=2.5, xgbregressor__reg_lambda=1.5; total time=   1.1s
[CV] END xgbregressor__reg_alpha=2.5, xgbregressor__reg_lambda=1.5; total time=   1.0s
[CV] END xgbregressor__reg_alpha=2.5, xgbregresso

GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('columntransformer',
                                        ColumnTransformer(transformers=[('num',
                                                                         Pipeline(steps=[('simpleimputer',
                                                                                          SimpleImputer())]),
                                                                         ['Store_id']),
                                                                        ('cat',
                                                                         Pipeline(steps=[('simpleimputer',
                                                                                          SimpleImputer(fill_value='NA',
                                                                                                        strategy='constant')),
                                                                                         ('onehotenc

In [18]:
xgb_grid.best_score_

-12.893688727253684

In [20]:
xgb_grid.best_params_

{'xgbregressor__reg_alpha': 0, 'xgbregressor__reg_lambda': 0.5}

In [22]:
xgb_reg = make_pipeline(full_pipe, XGBRegressor(objective= "reg:squaredlogerror",
                                                learning_rate=0.8, 
                                                max_depth= 5,
                                                colsample_bylevel=0.1,
                                                colsample_bytree=0.1,
                                                reg_alpha=0,
                                                random_state=42, 
                                                tree_method="auto", 
                                                n_jobs=-1,))

params = {
    "xgbregressor__reg_lambda": [0.1,0.2,0.3,0.4,0.5, 0.6,0.7,0.8,0.9,1,2,3,5,10]
}

xgb_grid = GridSearchCV(
    estimator= xgb_reg,
    param_grid = params,
    scoring="neg_mean_squared_log_error",
    cv=3,
    verbose=2
)
xgb_grid.fit(X_train, y_train)

Fitting 3 folds for each of 14 candidates, totalling 42 fits
[CV] END .......................xgbregressor__reg_lambda=0.1; total time=   0.9s
[CV] END .......................xgbregressor__reg_lambda=0.1; total time=   0.9s
[CV] END .......................xgbregressor__reg_lambda=0.1; total time=   0.9s
[CV] END .......................xgbregressor__reg_lambda=0.2; total time=   0.9s
[CV] END .......................xgbregressor__reg_lambda=0.2; total time=   0.9s
[CV] END .......................xgbregressor__reg_lambda=0.2; total time=   0.9s
[CV] END .......................xgbregressor__reg_lambda=0.3; total time=   1.1s
[CV] END .......................xgbregressor__reg_lambda=0.3; total time=   0.9s
[CV] END .......................xgbregressor__reg_lambda=0.3; total time=   1.0s
[CV] END .......................xgbregressor__reg_lambda=0.4; total time=   1.0s
[CV] END .......................xgbregressor__reg_lambda=0.4; total time=   0.9s
[CV] END .......................xgbregressor__re

GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('columntransformer',
                                        ColumnTransformer(transformers=[('num',
                                                                         Pipeline(steps=[('simpleimputer',
                                                                                          SimpleImputer())]),
                                                                         ['Store_id']),
                                                                        ('cat',
                                                                         Pipeline(steps=[('simpleimputer',
                                                                                          SimpleImputer(fill_value='NA',
                                                                                                        strategy='constant')),
                                                                                         ('onehotenc

In [23]:
xgb_grid.best_score_

-12.893688727253684

In [24]:
xgb_grid.best_params_

{'xgbregressor__reg_lambda': 0.5}

In [25]:
xgb_reg = make_pipeline(full_pipe, XGBRegressor(objective= "reg:squaredlogerror",
                                                learning_rate=0.8, 
                                                max_depth= 5,
                                                colsample_bylevel=0.1,
                                                colsample_bytree=0.1,
                                                reg_alpha=0,
                                                reg_lambda=0.5,
                                                random_state=42, 
                                                tree_method="auto", 
                                                n_jobs=-1,))
xgb_reg.fit(X_train, y_train)
submission['Sales'] = xgb_reg.predict(X_test)
save_dataframe(submission, "xgb_hyper1.csv")

This model performs very poorly. 

## Extra Trees

In [27]:
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.model_selection import cross_val_score

extra_tree = make_pipeline(full_pipe, ExtraTreesRegressor(random_state=42))
scores = cross_val_score(extra_tree, X_train, y_train, cv=5, scoring="neg_mean_squared_log_error")
print("Scores:", -scores)
print("Average score:", np.mean(-scores))

Scores: [0.11244326 0.09640252 0.13539122 0.22002627 0.16880093]
Average score: 0.14661283864178518


In [12]:
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

params = {
    "extratreesregressor__min_samples_split": randint(low=2, high=10),
    "extratreesregressor__min_samples_leaf": randint(low=1, high=10), 
}

extra_tree = make_pipeline(full_pipe, ExtraTreesRegressor(random_state=42, n_jobs=-1))

extra_tree_search = RandomizedSearchCV(
    estimator=extra_tree,
    param_distributions= params,
    n_iter=10,
    scoring="neg_mean_squared_log_error",
    cv=3,
    verbose=2,
    random_state=42)

extra_tree_search.fit(X_train, y_train)

Fitting 3 folds for each of 10 candidates, totalling 30 fits
[CV] END extratreesregressor__min_samples_leaf=7, extratreesregressor__min_samples_split=5; total time=  20.5s
[CV] END extratreesregressor__min_samples_leaf=7, extratreesregressor__min_samples_split=5; total time=  15.3s
[CV] END extratreesregressor__min_samples_leaf=7, extratreesregressor__min_samples_split=5; total time=  17.3s
[CV] END extratreesregressor__min_samples_leaf=8, extratreesregressor__min_samples_split=6; total time=  19.9s
[CV] END extratreesregressor__min_samples_leaf=8, extratreesregressor__min_samples_split=6; total time=  16.9s
[CV] END extratreesregressor__min_samples_leaf=8, extratreesregressor__min_samples_split=6; total time=  17.6s
[CV] END extratreesregressor__min_samples_leaf=5, extratreesregressor__min_samples_split=8; total time=  19.7s
[CV] END extratreesregressor__min_samples_leaf=5, extratreesregressor__min_samples_split=8; total time=  17.0s
[CV] END extratreesregressor__min_samples_leaf=5, e

RandomizedSearchCV(cv=3,
                   estimator=Pipeline(steps=[('columntransformer',
                                              ColumnTransformer(transformers=[('num',
                                                                               Pipeline(steps=[('simpleimputer',
                                                                                                SimpleImputer())]),
                                                                               ['Store_id']),
                                                                              ('cat',
                                                                               Pipeline(steps=[('simpleimputer',
                                                                                                SimpleImputer(fill_value='NA',
                                                                                                              strategy='constant')),
                                        

In [13]:
extra_tree_search.best_score_

-0.13263071578452862

In [14]:
extra_tree_search.best_params_

{'extratreesregressor__min_samples_leaf': 8,
 'extratreesregressor__min_samples_split': 6}

In [21]:
params = {
    "extratreesregressor__max_depth": list(range(3,30)) 
}

extra_tree = make_pipeline(full_pipe, ExtraTreesRegressor(random_state=42,
                                                          min_samples_leaf=8,
                                                          min_samples_split=6,
                                                          n_jobs=-1))

extra_tree_grid = GridSearchCV(
    estimator= extra_tree,
    param_grid = params,
    scoring="neg_mean_squared_log_error",
    cv=3,
    verbose=2
)
extra_tree_grid.fit(X_train, y_train)

Fitting 3 folds for each of 27 candidates, totalling 81 fits
[CV] END ...................extratreesregressor__max_depth=3; total time=  11.4s
[CV] END ...................extratreesregressor__max_depth=3; total time=  10.1s
[CV] END ...................extratreesregressor__max_depth=3; total time=  10.8s
[CV] END ...................extratreesregressor__max_depth=4; total time=  13.5s
[CV] END ...................extratreesregressor__max_depth=4; total time=  11.6s
[CV] END ...................extratreesregressor__max_depth=4; total time=  12.3s
[CV] END ...................extratreesregressor__max_depth=5; total time=  15.2s
[CV] END ...................extratreesregressor__max_depth=5; total time=  12.9s
[CV] END ...................extratreesregressor__max_depth=5; total time=  13.8s
[CV] END ...................extratreesregressor__max_depth=6; total time=  16.4s
[CV] END ...................extratreesregressor__max_depth=6; total time=  14.5s
[CV] END ...................extratreesregressor_

GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('columntransformer',
                                        ColumnTransformer(transformers=[('num',
                                                                         Pipeline(steps=[('simpleimputer',
                                                                                          SimpleImputer())]),
                                                                         ['Store_id']),
                                                                        ('cat',
                                                                         Pipeline(steps=[('simpleimputer',
                                                                                          SimpleImputer(fill_value='NA',
                                                                                                        strategy='constant')),
                                                                                         ('onehotenc

In [22]:
extra_tree_grid.best_score_

-0.12581450950362952

In [23]:
extra_tree_grid.best_params_

{'extratreesregressor__max_depth': 8}

In [None]:
extra_tree = make_pipeline(full_pipe, ExtraTreesRegressor(random_state=42,
                                                         max_depth=8,
                                                         min_samples_leaf=8,
                                                          min_samples_split=6))

## Hist Gradient Boosting

In [28]:
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingRegressor

hist_gbrt = make_pipeline(full_pipe, HistGradientBoostingRegressor(random_state=42, early_stopping=True))

scores = cross_val_score(hist_gbrt, X_train, y_train, cv=5, scoring="neg_mean_squared_log_error")
print("Scores:", -scores)
print("Average score:", np.mean(-scores))

Scores: [0.09379386 0.07244025 0.1013186  0.20144597 0.11611935]
Average score: 0.11702360851888935


In [29]:
hist_gbrt.fit(X_train, y_train)
submission['Sales'] = hist_gbrt.predict(X_test)
save_dataframe(submission, "hist_gbrt_default.csv")

In [8]:
from sklearn.pipeline import make_pipeline
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.model_selection import GridSearchCV

hist_gbrt = make_pipeline(full_pipe, HistGradientBoostingRegressor(random_state=42,early_stopping=True))

params = {
    "histgradientboostingregressor__max_iter":[100,200,300,400,500]
}

hist_gbrt_grid = GridSearchCV(
    estimator= hist_gbrt,
    param_grid = params,
    scoring="neg_mean_squared_log_error",
    cv=3,
    verbose=2
)
hist_gbrt_grid.fit(X_train, y_train)

Fitting 3 folds for each of 5 candidates, totalling 15 fits
[CV] END ........histgradientboostingregressor__max_iter=100; total time=   2.1s
[CV] END ........histgradientboostingregressor__max_iter=100; total time=   2.0s
[CV] END ........histgradientboostingregressor__max_iter=100; total time=   2.1s
[CV] END ........histgradientboostingregressor__max_iter=200; total time=   3.7s
[CV] END ........histgradientboostingregressor__max_iter=200; total time=   3.6s
[CV] END ........histgradientboostingregressor__max_iter=200; total time=   3.8s
[CV] END ........histgradientboostingregressor__max_iter=300; total time=   5.2s
[CV] END ........histgradientboostingregressor__max_iter=300; total time=   5.0s
[CV] END ........histgradientboostingregressor__max_iter=300; total time=   5.3s
[CV] END ........histgradientboostingregressor__max_iter=400; total time=   6.8s
[CV] END ........histgradientboostingregressor__max_iter=400; total time=   6.6s
[CV] END ........histgradientboostingregressor__m

Traceback (most recent call last):
  File "C:\Users\BHOLA\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 674, in _score
    scores = scorer(estimator, X_test, y_test)
  File "C:\Users\BHOLA\anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 199, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true,
  File "C:\Users\BHOLA\anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 242, in _score
    return self._sign * self._score_func(y_true, y_pred,
  File "C:\Users\BHOLA\anaconda3\lib\site-packages\sklearn\utils\validation.py", line 63, in inner_f
    return f(*args, **kwargs)
  File "C:\Users\BHOLA\anaconda3\lib\site-packages\sklearn\metrics\_regression.py", line 413, in mean_squared_log_error
    raise ValueError("Mean Squared Logarithmic Error cannot be used when "
ValueError: Mean Squared Logarithmic Error cannot be used when targets contain negative values.



[CV] END ........histgradientboostingregressor__max_iter=500; total time=   7.8s
[CV] END ........histgradientboostingregressor__max_iter=500; total time=   8.1s




GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('columntransformer',
                                        ColumnTransformer(transformers=[('num',
                                                                         Pipeline(steps=[('simpleimputer',
                                                                                          SimpleImputer())]),
                                                                         ['Store_id']),
                                                                        ('cat',
                                                                         Pipeline(steps=[('simpleimputer',
                                                                                          SimpleImputer(fill_value='NA',
                                                                                                        strategy='constant')),
                                                                                         ('onehotenc