## Setup

In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import plotly.express as px
import seaborn
%matplotlib inline

# Project paths.
# The project root is taken to be the parent of the current working directory.
# os.path.join (rather than manual string concatenation with os.sep) leaves the
# separator handling to the standard library; normpath then collapses the "..".
project_root_dir = os.path.normpath(os.path.join(os.getcwd(), os.pardir))

# <project root>/data is the default directory used by read_data / save_dataframe.
data_path = os.path.join(project_root_dir, "data")
# Create it on first run; exist_ok=True makes re-running this cell harmless.
os.makedirs(data_path, exist_ok=True)

# function for reading data
def read_data(filename, date_cols=None, file_path=data_path):
    """Load a CSV file into a DataFrame.

    Parameters
    ----------
    filename : str
        Name of the CSV file, joined onto ``file_path``.
    date_cols : optional
        Forwarded unchanged to ``pandas.read_csv`` as ``parse_dates``.
    file_path : str
        Directory containing the file; defaults to ``data_path``.

    Returns
    -------
    pandas.DataFrame
        The result of ``pandas.read_csv`` for the joined path.
    """
    return pd.read_csv(os.path.join(file_path, filename), parse_dates=date_cols)

# function for saving data as csv file
def save_dataframe(df, filename, file_path=data_path):
    """Write ``df`` as a CSV file named ``filename`` inside ``file_path``.

    The DataFrame index is not written (``index=False``). ``file_path``
    defaults to ``data_path``.
    """
    df.to_csv(os.path.join(file_path, filename), index=False)

### Read Data

In [2]:
# Train and test files: the "Date" column is handed to read_csv's parse_dates.
# NOTE(review): the extension is upper-case here, unlike the other two files;
# confirm it matches the file on disk (matters on case-sensitive filesystems).
train = read_data("TRAIN.CSV", ["Date"])
test = read_data("TEST_FINAL.csv", ["Date"])

# Sample file, loaded with no date parsing requested; its 'Sales' column is
# overwritten with model predictions before each submission is saved.
submission = read_data("SAMPLE.csv")

### Prepare data for ML

In [3]:
# prepare_data comes from the project-local `prepare` module (not shown in this notebook).
# Its four return values are unpacked in this order; full_pipe is the preprocessing
# step placed at the front of every model pipeline built below.
from prepare import prepare_data
X_train, y_train, X_test, full_pipe = prepare_data(train, test)

# ML Models

## Linear SVR

In [4]:
from sklearn.svm import LinearSVR
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform, reciprocal

In [5]:
# Baseline: preprocessing step followed by a LinearSVR with default hyperparameters.
lin_svr = make_pipeline(
    full_pipe,
    LinearSVR(random_state=42),
)

# 5-fold cross-validation. The scorer returns the negated mean squared log error,
# so the sign is flipped before printing.
scores = cross_val_score(
    lin_svr,
    X_train,
    y_train,
    cv=5,
    scoring="neg_mean_squared_log_error",
)
print("Scores:", -scores)
print("Average score:", (-scores).mean())

Scores: [0.10788181 0.07460828 0.08580756 0.20434581 0.13214094]
Average score: 0.12095687881569946


In [6]:
# Preprocessing step + LinearSVR, with max_iter raised to 100000 for the search.
lin_svr = make_pipeline(
    full_pipe,
    LinearSVR(random_state=42, max_iter=100000),
)

# Search space, keyed by "<pipeline step name>__<parameter>".
# scipy's uniform(loc, scale) covers [loc, loc + scale], so C is drawn from [1, 11];
# reciprocal is log-uniform between its two bounds.
param_dist = dict(
    linearsvr__C=uniform(loc=1, scale=10),
    linearsvr__tol=reciprocal(0.0001, 0.1),
)

# 20 sampled candidates, each scored with 3-fold CV on negated MSLE.
svr_rnd_search = RandomizedSearchCV(
    lin_svr,
    param_distributions=param_dist,
    n_iter=20,
    cv=3,
    scoring="neg_mean_squared_log_error",
    random_state=42,
    verbose=2,
)

# Left as the last expression so the fitted search object is displayed.
svr_rnd_search.fit(X_train, y_train)

Fitting 3 folds for each of 20 candidates, totalling 60 fits
[CV] END linearsvr__C=4.745401188473625, linearsvr__tol=0.07114476009343418; total time=   1.0s
[CV] END linearsvr__C=4.745401188473625, linearsvr__tol=0.07114476009343418; total time=   1.0s
[CV] END linearsvr__C=4.745401188473625, linearsvr__tol=0.07114476009343418; total time=   1.0s
[CV] END linearsvr__C=8.31993941811405, linearsvr__tol=0.006251373574521747; total time=   3.2s
[CV] END linearsvr__C=8.31993941811405, linearsvr__tol=0.006251373574521747; total time=   3.2s
[CV] END linearsvr__C=8.31993941811405, linearsvr__tol=0.006251373574521747; total time=   3.4s
[CV] END linearsvr__C=2.560186404424365, linearsvr__tol=0.00029375384576328287; total time=   1.8s
[CV] END linearsvr__C=2.560186404424365, linearsvr__tol=0.00029375384576328287; total time=   1.8s
[CV] END linearsvr__C=2.560186404424365, linearsvr__tol=0.00029375384576328287; total time=   1.9s
[CV] END linearsvr__C=1.5808361216819946, linearsvr__tol=0.0396760

RandomizedSearchCV(cv=3,
                   estimator=Pipeline(steps=[('columntransformer',
                                              ColumnTransformer(transformers=[('num',
                                                                               Pipeline(steps=[('simpleimputer',
                                                                                                SimpleImputer())]),
                                                                               ['Store_id']),
                                                                              ('cat',
                                                                               Pipeline(steps=[('simpleimputer',
                                                                                                SimpleImputer(fill_value='NA',
                                                                                                              strategy='constant')),
                                        

In [7]:
# Mean cross-validated score of the best candidate. The scorer is the negated
# mean squared log error, so values closer to zero are better.
svr_rnd_search.best_score_

-0.1307540115941013

In [8]:
# Sampled hyperparameter values of the best-scoring candidate.
svr_rnd_search.best_params_

{'linearsvr__C': 1.5808361216819946, 'linearsvr__tol': 0.039676050770529867}

In [9]:
# Predict the test set with the best pipeline found by the search,
# store the predictions in the 'Sales' column and write the submission file.
svr_test_pred = svr_rnd_search.best_estimator_.predict(X_test)
submission['Sales'] = svr_test_pred
save_dataframe(submission, "lin_svr_hyper1.csv")

## Random Forest

In [6]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import make_pipeline

# Preprocessing step followed by a random forest that uses all available cores.
rf = make_pipeline(
    full_pipe,
    RandomForestRegressor(random_state=42, n_jobs=-1),
)

# Search space, keyed by "<pipeline step name>__<parameter>".
# scipy's randint excludes the upper bound (e.g. max_depth is drawn from 5..29).
param_dist = dict(
    randomforestregressor__max_depth=randint(low=5, high=30),
    randomforestregressor__min_samples_split=randint(low=2, high=10),
    randomforestregressor__min_samples_leaf=randint(low=1, high=10),
    randomforestregressor__n_estimators=randint(low=100, high=300),
)

# 30 sampled candidates, each scored with 2-fold CV on negated MSLE.
rf_rnd_search = RandomizedSearchCV(
    rf,
    param_distributions=param_dist,
    n_iter=30,
    cv=2,
    scoring="neg_mean_squared_log_error",
    random_state=42,
    verbose=2,
)

# Left as the last expression so the fitted search object is displayed.
rf_rnd_search.fit(X_train, y_train)

Fitting 2 folds for each of 30 candidates, totalling 60 fits
[CV] END randomforestregressor__max_depth=11, randomforestregressor__min_samples_leaf=4, randomforestregressor__min_samples_split=6, randomforestregressor__n_estimators=114; total time=   7.5s
[CV] END randomforestregressor__max_depth=11, randomforestregressor__min_samples_leaf=4, randomforestregressor__min_samples_split=6, randomforestregressor__n_estimators=114; total time=   6.8s
[CV] END randomforestregressor__max_depth=15, randomforestregressor__min_samples_leaf=8, randomforestregressor__min_samples_split=6, randomforestregressor__n_estimators=120; total time=   8.1s
[CV] END randomforestregressor__max_depth=15, randomforestregressor__min_samples_leaf=8, randomforestregressor__min_samples_split=6, randomforestregressor__n_estimators=120; total time=   7.2s
[CV] END randomforestregressor__max_depth=11, randomforestregressor__min_samples_leaf=3, randomforestregressor__min_samples_split=8, randomforestregressor__n_estimator

[CV] END randomforestregressor__max_depth=8, randomforestregressor__min_samples_leaf=8, randomforestregressor__min_samples_split=7, randomforestregressor__n_estimators=289; total time=  16.6s
[CV] END randomforestregressor__max_depth=12, randomforestregressor__min_samples_leaf=5, randomforestregressor__min_samples_split=9, randomforestregressor__n_estimators=181; total time=  13.1s
[CV] END randomforestregressor__max_depth=12, randomforestregressor__min_samples_leaf=5, randomforestregressor__min_samples_split=9, randomforestregressor__n_estimators=181; total time=  10.8s
[CV] END randomforestregressor__max_depth=19, randomforestregressor__min_samples_leaf=5, randomforestregressor__min_samples_split=9, randomforestregressor__n_estimators=253; total time=  18.3s
[CV] END randomforestregressor__max_depth=19, randomforestregressor__min_samples_leaf=5, randomforestregressor__min_samples_split=9, randomforestregressor__n_estimators=253; total time=  15.9s
[CV] END randomforestregressor__max_

RandomizedSearchCV(cv=2,
                   estimator=Pipeline(steps=[('columntransformer',
                                              ColumnTransformer(transformers=[('num',
                                                                               Pipeline(steps=[('simpleimputer',
                                                                                                SimpleImputer())]),
                                                                               ['Store_id']),
                                                                              ('cat',
                                                                               Pipeline(steps=[('simpleimputer',
                                                                                                SimpleImputer(fill_value='NA',
                                                                                                              strategy='constant')),
                                        

In [7]:
# Mean cross-validated score of the best candidate. The scorer is the negated
# mean squared log error, so values closer to zero are better.
rf_rnd_search.best_score_

-0.12344975773349076

In [8]:
# Sampled hyperparameter values of the best-scoring candidate.
rf_rnd_search.best_params_

{'randomforestregressor__max_depth': 7,
 'randomforestregressor__min_samples_leaf': 3,
 'randomforestregressor__min_samples_split': 2,
 'randomforestregressor__n_estimators': 222}

In [9]:
# Predict the test set with the best pipeline found by the search,
# store the predictions in the 'Sales' column and write the submission file.
rf_test_pred = rf_rnd_search.best_estimator_.predict(X_test)
submission['Sales'] = rf_test_pred
save_dataframe(submission, "rf_hyper2.csv")

## Stacking

In [14]:
from sklearn.linear_model import LinearRegression
from sklearn.svm import LinearSVR
from sklearn.ensemble import RandomForestRegressor
from lightgbm import LGBMRegressor
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import StackingRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_log_error

# Random forest base learner with hand-set hyperparameters.
rf_reg = RandomForestRegressor(
    n_jobs=-1,
    random_state=42,
    max_features="sqrt",
    max_depth=30,
    min_samples_split=6,
    min_samples_leaf=8,
)

# Base learners of the stack, as (name, estimator) pairs.
estimators = [
    ("rf", rf_reg),
    ("svr", LinearSVR(random_state=42)),
    ("lgb", LGBMRegressor(random_state=42)),
]

# A LightGBM regressor combines the base learners' predictions.
reg = StackingRegressor(
    estimators=estimators,
    final_estimator=LGBMRegressor(random_state=42),
)

# The preprocessing step goes in front of the stack, as for the other models.
stacked_reg = make_pipeline(full_pipe, reg)

# 3-fold cross-validation. The scorer returns the negated mean squared log error,
# so the sign is flipped before printing.
scores = cross_val_score(
    stacked_reg,
    X_train,
    y_train,
    cv=3,
    scoring="neg_mean_squared_log_error",
)
print("Scores:", -scores)
print("Average score:", (-scores).mean())

Scores: [0.08974682 0.1487915  0.21321321]
Average score: 0.1505838418378722


In [6]:
# Fit the stacked pipeline on the full training set (fit returns the pipeline itself),
# predict the test set into the 'Sales' column and write the submission file.
submission['Sales'] = stacked_reg.fit(X_train, y_train).predict(X_test)
save_dataframe(submission, "stacking1.csv")