### Setup

In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import plotly.express as px
import seaborn
%matplotlib inline

# Project directories, resolved relative to the notebook's working directory:
# the project root is the parent of the current directory.
project_root_dir = os.path.normpath(os.sep.join([os.getcwd(), os.pardir]))

# CSV inputs and outputs are kept under <project root>/data; the directory is
# created here if it does not exist yet.
data_path = os.path.join(project_root_dir, "data")
os.makedirs(name=data_path, exist_ok=True)

# function for reading data
def read_data(filename, date_cols=None, file_path=data_path):
    """Load a CSV file into a DataFrame.

    Parameters
    ----------
    filename : str
        Name of the CSV file, joined onto ``file_path``.
    date_cols : optional
        Forwarded unchanged to ``pandas.read_csv`` as ``parse_dates``.
    file_path : str
        Directory containing the file; defaults to ``data_path``.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the file.
    """
    return pd.read_csv(os.path.join(file_path, filename), parse_dates=date_cols)

# function for saving data as csv file
def save_dataframe(df, filename, file_path=data_path):
    """Write a DataFrame to a CSV file without the index column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to persist.
    filename : str
        Name of the output file, joined onto ``file_path``.
    file_path : str
        Target directory; defaults to ``data_path``.
    """
    df.to_csv(os.path.join(file_path, filename), index=False)

### Read Data

In [2]:
# Load the input files from the data directory. The "Date" column is parsed
# as datetime for the train and test sets; the sample submission is read
# without any date parsing.
train = read_data(filename="TRAIN.CSV", date_cols=["Date"])
test = read_data(filename="TEST_FINAL.csv", date_cols=["Date"])
submission = read_data(filename="SAMPLE.csv", date_cols=None)

### Prepare Data For ML

In [3]:
from prepare import prepare_data

In [4]:
# Build the modelling inputs with the project's `prepare_data` helper.
# `full_pipe` is used below as the first step of every model pipeline.
# NOTE(review): whether X_train / X_test are still untransformed at this point
# depends on `prepare_data`, which is not shown in this notebook — confirm.
X_train, y_train, X_test, full_pipe = prepare_data(train, test)

## Hyperparameter Tuning

In [5]:
from sklearn.svm import LinearSVR
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform, reciprocal 
from sklearn.pipeline import make_pipeline


# Linear SVR placed behind the shared preprocessing step.
svr = make_pipeline(
    full_pipe,
    LinearSVR(max_iter=10000, random_state=42),
)

# Sampling distributions for the random search. scipy's uniform(loc, scale)
# spans [loc, loc + scale], so C is drawn from [1, 11]; tol is drawn
# log-uniformly between 1e-4 and 1e-1.
param_dist = dict(
    linearsvr__C=uniform(1, 10),
    linearsvr__tol=reciprocal(0.0001, 0.1),
)

# 20 sampled candidates x 3 folds, scored by negated mean squared log error
# (values closer to zero are better).
svr_rnd_search = RandomizedSearchCV(
    estimator=svr,
    param_distributions=param_dist,
    scoring="neg_mean_squared_log_error",
    n_iter=20,
    cv=3,
    verbose=2,
    random_state=42,
)

svr_rnd_search.fit(X_train, y_train)

Fitting 3 folds for each of 20 candidates, totalling 60 fits
[CV] END linearsvr__C=4.745401188473625, linearsvr__tol=0.07114476009343418; total time=   1.0s
[CV] END linearsvr__C=4.745401188473625, linearsvr__tol=0.07114476009343418; total time=   1.1s
[CV] END linearsvr__C=4.745401188473625, linearsvr__tol=0.07114476009343418; total time=   1.0s
[CV] END linearsvr__C=8.31993941811405, linearsvr__tol=0.006251373574521747; total time=   3.5s
[CV] END linearsvr__C=8.31993941811405, linearsvr__tol=0.006251373574521747; total time=   3.5s
[CV] END linearsvr__C=8.31993941811405, linearsvr__tol=0.006251373574521747; total time=   3.8s
[CV] END linearsvr__C=2.560186404424365, linearsvr__tol=0.00029375384576328287; total time=   2.0s
[CV] END linearsvr__C=2.560186404424365, linearsvr__tol=0.00029375384576328287; total time=   2.1s
[CV] END linearsvr__C=2.560186404424365, linearsvr__tol=0.00029375384576328287; total time=   2.2s
[CV] END linearsvr__C=1.5808361216819946, linearsvr__tol=0.0396760

RandomizedSearchCV(cv=3,
                   estimator=Pipeline(steps=[('columntransformer',
                                              ColumnTransformer(transformers=[('num',
                                                                               Pipeline(steps=[('simpleimputer',
                                                                                                SimpleImputer())]),
                                                                               ['Store_id']),
                                                                              ('cat',
                                                                               Pipeline(steps=[('simpleimputer',
                                                                                                SimpleImputer(fill_value='NA',
                                                                                                              strategy='constant')),
                                        

In [6]:
svr_rnd_search.best_score_

-0.1307540115941013

In [7]:
svr_rnd_search.best_params_

{'linearsvr__C': 1.5808361216819946, 'linearsvr__tol': 0.039676050770529867}

In [8]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

# First random-forest tuning stage: sample the split/leaf size constraints.
# scipy's randint excludes `high`, so min_samples_split is drawn from 2..9
# and min_samples_leaf from 1..9.
params = dict(
    randomforestregressor__min_samples_split=randint(low=2, high=10),
    randomforestregressor__min_samples_leaf=randint(low=1, high=10),
)

# Random forest placed behind the shared preprocessing step.
rf = make_pipeline(
    full_pipe,
    RandomForestRegressor(n_jobs=-1, random_state=42),
)

# 10 sampled candidates x 2 folds, scored by negated mean squared log error.
rf_rnd_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=params,
    scoring="neg_mean_squared_log_error",
    n_iter=10,
    cv=2,
    random_state=42,
    verbose=2,
)

rf_rnd_search.fit(X_train, y_train)

Fitting 2 folds for each of 10 candidates, totalling 20 fits
[CV] END randomforestregressor__min_samples_leaf=7, randomforestregressor__min_samples_split=5; total time=   9.4s
[CV] END randomforestregressor__min_samples_leaf=7, randomforestregressor__min_samples_split=5; total time=   6.8s
[CV] END randomforestregressor__min_samples_leaf=8, randomforestregressor__min_samples_split=6; total time=   7.3s
[CV] END randomforestregressor__min_samples_leaf=8, randomforestregressor__min_samples_split=6; total time=   6.5s
[CV] END randomforestregressor__min_samples_leaf=5, randomforestregressor__min_samples_split=8; total time=   7.7s
[CV] END randomforestregressor__min_samples_leaf=5, randomforestregressor__min_samples_split=8; total time=   6.7s
[CV] END randomforestregressor__min_samples_leaf=3, randomforestregressor__min_samples_split=8; total time=   7.8s
[CV] END randomforestregressor__min_samples_leaf=3, randomforestregressor__min_samples_split=8; total time=   7.4s
[CV] END randomfore

RandomizedSearchCV(cv=2,
                   estimator=Pipeline(steps=[('columntransformer',
                                              ColumnTransformer(transformers=[('num',
                                                                               Pipeline(steps=[('simpleimputer',
                                                                                                SimpleImputer())]),
                                                                               ['Store_id']),
                                                                              ('cat',
                                                                               Pipeline(steps=[('simpleimputer',
                                                                                                SimpleImputer(fill_value='NA',
                                                                                                              strategy='constant')),
                                        

In [9]:
rf_rnd_search.best_score_

-0.12414510460809519

In [10]:
rf_rnd_search.best_params_

{'randomforestregressor__min_samples_leaf': 8,
 'randomforestregressor__min_samples_split': 6}

In [13]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

# Second random-forest tuning stage: min_samples_leaf=8 and min_samples_split=6
# are held fixed while tree depth (20..39) and the number of features
# considered per split are swept exhaustively.
params = {
    "randomforestregressor__max_depth": list(range(20, 40)),
    # 1.0 (all features) is what "auto" meant for RandomForestRegressor; the
    # "auto" string is deprecated since scikit-learn 1.1 and removed in 1.3.
    # It must be the float 1.0: the integer 1 would mean a single feature.
    "randomforestregressor__max_features": [1.0, "sqrt", "log2"],
}

# Random forest placed behind the shared preprocessing step.
rf = make_pipeline(
    full_pipe,
    RandomForestRegressor(
        random_state=42,
        min_samples_leaf=8,
        min_samples_split=6,
        n_jobs=-1,
    ),
)

# 60 candidates x 3 folds, scored by negated mean squared log error.
rf_grid_search = GridSearchCV(
    estimator=rf,
    param_grid=params,
    scoring="neg_mean_squared_log_error",
    cv=3,
    verbose=2,
)

rf_grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 60 candidates, totalling 180 fits
[CV] END randomforestregressor__max_depth=20, randomforestregressor__max_features=auto; total time=  14.8s
[CV] END randomforestregressor__max_depth=20, randomforestregressor__max_features=auto; total time=  11.6s
[CV] END randomforestregressor__max_depth=20, randomforestregressor__max_features=auto; total time=  12.0s
[CV] END randomforestregressor__max_depth=20, randomforestregressor__max_features=sqrt; total time=   3.7s
[CV] END randomforestregressor__max_depth=20, randomforestregressor__max_features=sqrt; total time=   3.3s
[CV] END randomforestregressor__max_depth=20, randomforestregressor__max_features=sqrt; total time=   3.9s
[CV] END randomforestregressor__max_depth=20, randomforestregressor__max_features=log2; total time=   3.3s
[CV] END randomforestregressor__max_depth=20, randomforestregressor__max_features=log2; total time=   3.1s
[CV] END randomforestregressor__max_depth=20, randomforestregressor__max_features=

[CV] END randomforestregressor__max_depth=28, randomforestregressor__max_features=sqrt; total time=   3.5s
[CV] END randomforestregressor__max_depth=28, randomforestregressor__max_features=sqrt; total time=   3.9s
[CV] END randomforestregressor__max_depth=28, randomforestregressor__max_features=log2; total time=   3.4s
[CV] END randomforestregressor__max_depth=28, randomforestregressor__max_features=log2; total time=   3.2s
[CV] END randomforestregressor__max_depth=28, randomforestregressor__max_features=log2; total time=   3.5s
[CV] END randomforestregressor__max_depth=29, randomforestregressor__max_features=auto; total time=  14.4s
[CV] END randomforestregressor__max_depth=29, randomforestregressor__max_features=auto; total time=  13.0s
[CV] END randomforestregressor__max_depth=29, randomforestregressor__max_features=auto; total time=  13.0s
[CV] END randomforestregressor__max_depth=29, randomforestregressor__max_features=sqrt; total time=   3.8s
[CV] END randomforestregressor__max_d

[CV] END randomforestregressor__max_depth=37, randomforestregressor__max_features=auto; total time=  13.8s
[CV] END randomforestregressor__max_depth=37, randomforestregressor__max_features=auto; total time=  11.6s
[CV] END randomforestregressor__max_depth=37, randomforestregressor__max_features=auto; total time=  12.3s
[CV] END randomforestregressor__max_depth=37, randomforestregressor__max_features=sqrt; total time=   3.7s
[CV] END randomforestregressor__max_depth=37, randomforestregressor__max_features=sqrt; total time=   3.6s
[CV] END randomforestregressor__max_depth=37, randomforestregressor__max_features=sqrt; total time=   3.7s
[CV] END randomforestregressor__max_depth=37, randomforestregressor__max_features=log2; total time=   3.5s
[CV] END randomforestregressor__max_depth=37, randomforestregressor__max_features=log2; total time=   3.2s
[CV] END randomforestregressor__max_depth=37, randomforestregressor__max_features=log2; total time=   3.5s
[CV] END randomforestregressor__max_d

GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('columntransformer',
                                        ColumnTransformer(transformers=[('num',
                                                                         Pipeline(steps=[('simpleimputer',
                                                                                          SimpleImputer())]),
                                                                         ['Store_id']),
                                                                        ('cat',
                                                                         Pipeline(steps=[('simpleimputer',
                                                                                          SimpleImputer(fill_value='NA',
                                                                                                        strategy='constant')),
                                                                                         ('onehotenc

In [14]:
rf_grid_search.best_score_

-0.11696036790116093

In [15]:
rf_grid_search.best_params_

{'randomforestregressor__max_depth': 30,
 'randomforestregressor__max_features': 'sqrt'}

In [20]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

# Final random-forest tuning stage: depth, leaf/split sizes and max_features
# are held fixed; only the number of trees is swept (100, 200, ..., 1900).
params = dict(
    randomforestregressor__n_estimators=list(range(100, 2000, 100)),
)

# Random forest placed behind the shared preprocessing step.
rf = make_pipeline(
    full_pipe,
    RandomForestRegressor(
        n_jobs=-1,
        random_state=42,
        max_features="sqrt",
        max_depth=30,
        min_samples_split=6,
        min_samples_leaf=8,
    ),
)

# 19 candidates x 3 folds. This rebinds `rf_grid_search`, so the cells that
# follow read the results of this n_estimators search.
rf_grid_search = GridSearchCV(
    estimator=rf,
    param_grid=params,
    cv=3,
    scoring="neg_mean_squared_log_error",
    verbose=2,
)

rf_grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 19 candidates, totalling 57 fits
[CV] END ............randomforestregressor__n_estimators=100; total time=   5.5s
[CV] END ............randomforestregressor__n_estimators=100; total time=   3.2s
[CV] END ............randomforestregressor__n_estimators=100; total time=   3.7s
[CV] END ............randomforestregressor__n_estimators=200; total time=   7.7s
[CV] END ............randomforestregressor__n_estimators=200; total time=   6.7s
[CV] END ............randomforestregressor__n_estimators=200; total time=   7.2s
[CV] END ............randomforestregressor__n_estimators=300; total time=  10.9s
[CV] END ............randomforestregressor__n_estimators=300; total time=   9.4s
[CV] END ............randomforestregressor__n_estimators=300; total time=   9.7s
[CV] END ............randomforestregressor__n_estimators=400; total time=  13.6s
[CV] END ............randomforestregressor__n_estimators=400; total time=  11.8s
[CV] END ............randomforestregressor__n_es

GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('columntransformer',
                                        ColumnTransformer(transformers=[('num',
                                                                         Pipeline(steps=[('simpleimputer',
                                                                                          SimpleImputer())]),
                                                                         ['Store_id']),
                                                                        ('cat',
                                                                         Pipeline(steps=[('simpleimputer',
                                                                                          SimpleImputer(fill_value='NA',
                                                                                                        strategy='constant')),
                                                                                         ('onehotenc

In [21]:
rf_grid_search.best_score_

-0.11696036790116093

In [22]:
rf_grid_search.best_params_

{'randomforestregressor__n_estimators': 100}

In [23]:
# Fill the submission frame with test-set predictions from the best pipeline
# held by `rf_grid_search`, then write it to the data directory.
submission["Sales"] = (
    rf_grid_search
    .best_estimator_
    .predict(X_test)
)
save_dataframe(df=submission, filename="rf_hyper1.csv")

### XGBoost

In [24]:
from sklearn.pipeline import make_pipeline
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV

# XGBoost regressor placed behind the shared preprocessing step, trained with
# the squared-log-error objective and the "gpu_hist" tree method.
# NOTE(review): the saved output of this cell shows tracebacks passing through
# XGBoost's booster update and the run ending in KeyboardInterrupt; the actual
# error message is cut off, so the cause still needs to be confirmed.
xgb_reg = make_pipeline(
    full_pipe,
    XGBRegressor(
        objective="reg:squaredlogerror",
        tree_method="gpu_hist",
        n_jobs=-1,
        random_state=42,
    ),
)

# Exhaustive grid: learning rates 0.1, 0.2, ..., 0.9 and depths 5..29
# (9 x 25 candidates, 3 folds each).
params = {
    "xgbregressor__learning_rate": [step / 10 for step in range(1, 10)],
    "xgbregressor__max_depth": list(range(5, 30)),
}

xgb_grid = GridSearchCV(
    estimator=xgb_reg,
    param_grid=params,
    cv=3,
    scoring="neg_mean_squared_log_error",
)
xgb_grid.fit(X_train, y_train)

Traceback (most recent call last):
  File "C:\Users\BHOLA\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 593, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\BHOLA\anaconda3\lib\site-packages\sklearn\pipeline.py", line 346, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "C:\Users\BHOLA\anaconda3\lib\site-packages\xgboost\core.py", line 436, in inner_f
    return f(**kwargs)
  File "C:\Users\BHOLA\anaconda3\lib\site-packages\xgboost\sklearn.py", line 736, in fit
    self._Booster = train(
  File "C:\Users\BHOLA\anaconda3\lib\site-packages\xgboost\training.py", line 189, in train
    bst = _train_internal(params, dtrain,
  File "C:\Users\BHOLA\anaconda3\lib\site-packages\xgboost\training.py", line 81, in _train_internal
    bst.update(dtrain, i, obj)
  File "C:\Users\BHOLA\anaconda3\lib\site-packages\xgboost\core.py", line 1499, in update
    _check_call(_LIB.XGBoosterUpdateOneIter(self.handle,
  

Traceback (most recent call last):
  File "C:\Users\BHOLA\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 593, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\BHOLA\anaconda3\lib\site-packages\sklearn\pipeline.py", line 346, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "C:\Users\BHOLA\anaconda3\lib\site-packages\xgboost\core.py", line 436, in inner_f
    return f(**kwargs)
  File "C:\Users\BHOLA\anaconda3\lib\site-packages\xgboost\sklearn.py", line 736, in fit
    self._Booster = train(
  File "C:\Users\BHOLA\anaconda3\lib\site-packages\xgboost\training.py", line 189, in train
    bst = _train_internal(params, dtrain,
  File "C:\Users\BHOLA\anaconda3\lib\site-packages\xgboost\training.py", line 81, in _train_internal
    bst.update(dtrain, i, obj)
  File "C:\Users\BHOLA\anaconda3\lib\site-packages\xgboost\core.py", line 1499, in update
    _check_call(_LIB.XGBoosterUpdateOneIter(self.handle,
  

KeyboardInterrupt: 