In [1]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.2-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m17.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2.2


In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
import multiprocessing as mp
import gc
import datetime
from sklearn.preprocessing import LabelEncoder
import calendar
from scipy.sparse import csr_matrix,hstack
import tensorflow as tf
from sklearn.linear_model import LinearRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import mean_squared_error
from lightgbm import LGBMRegressor
from tqdm import tqdm
import pickle
import sys
from sklearn.model_selection import TimeSeriesSplit
from tqdm import tqdm
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
import plotly.graph_objects as go
import catboost as cb

In [4]:
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)

Mounted at /content/gdrive


In [5]:
!cp /content/gdrive/MyDrive/PredictiveAnalyticsData/utility/generate_submission_file.py .

In [6]:
from generate_submission_file import generate_submission

Mounted at /content/gdrive
Training Data Shape: (58327370, 43)
Testing Data Shape: (853720, 43)
Prediction Data Shape: (853720, 43)


In [7]:
def separate_features_and_parameters():
  df = pd.read_pickle("/content/gdrive/MyDrive/PredictiveAnalyticsData/output_data/final_train.pkl")
  # train_df = df.drop(columns=['id'], axis=1)
  train_df = df.copy()
  print(f"Training Data Shape: {train_df.shape}")
  y_train = train_df['sales']
  X_train = train_df.drop(columns=['sales'], axis=1)
  X_train[['id','sold_lag_7', 'sold_lag_14', 'sold_lag_21', 'sold_lag_28', 'sold_lag_35', 'sold_lag_42', 'sold_lag_49', 'sold_lag_56', 'roll_mean_7', 'roll_mean_14', 'roll_mean_28']] = X_train[['id', 'sold_lag_7', 'sold_lag_14', 'sold_lag_21', 'sold_lag_28', 'sold_lag_35', 'sold_lag_42', 'sold_lag_49', 'sold_lag_56', 'roll_mean_7', 'roll_mean_14', 'roll_mean_28']].fillna(0.0)

  df = pd.read_pickle("/content/gdrive/MyDrive/PredictiveAnalyticsData/output_data/final_test.pkl")
  # test_df = df.drop(columns=['id'], axis=1)
  test_df = df.copy()
  print(f"Testing Data Shape: {test_df.shape}")
  y_test = test_df['sales']
  X_test = test_df.drop(columns=['sales'], axis=1)
  X_test[['id','sold_lag_7', 'sold_lag_14', 'sold_lag_21', 'sold_lag_28', 'sold_lag_35', 'sold_lag_42', 'sold_lag_49', 'sold_lag_56', 'roll_mean_7', 'roll_mean_14', 'roll_mean_28']] = X_test[['id', 'sold_lag_7', 'sold_lag_14', 'sold_lag_21', 'sold_lag_28', 'sold_lag_35', 'sold_lag_42', 'sold_lag_49', 'sold_lag_56', 'roll_mean_7', 'roll_mean_14', 'roll_mean_28']].fillna(0.0)

  df = pd.read_pickle("/content/gdrive/MyDrive/PredictiveAnalyticsData/output_data/final_prediction_data.pkl")
  # prediction_df = df.drop(columns=['id'], axis=1)
  prediction_df = df.copy()
  print(f"Prediction Data Shape: {prediction_df.shape}")
  y_prediction = prediction_df['sales']
  X_prediction = prediction_df.drop(columns=['sales'], axis=1)
  X_prediction[['id','sold_lag_7', 'sold_lag_14', 'sold_lag_21', 'sold_lag_28', 'sold_lag_35', 'sold_lag_42', 'sold_lag_49', 'sold_lag_56', 'roll_mean_7', 'roll_mean_14', 'roll_mean_28']] = X_prediction[['id', 'sold_lag_7', 'sold_lag_14', 'sold_lag_21', 'sold_lag_28', 'sold_lag_35', 'sold_lag_42', 'sold_lag_49', 'sold_lag_56', 'roll_mean_7', 'roll_mean_14', 'roll_mean_28']].fillna(0.0)

  del df

  return X_train, y_train, X_test, y_test, X_prediction, y_prediction

In [8]:
X_train, y_train, X_test, y_test, X_prediction, y_prediction = separate_features_and_parameters()

Training Data Shape: (58327370, 43)
Testing Data Shape: (853720, 43)
Prediction Data Shape: (853720, 43)


In [9]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape, X_prediction.shape, y_prediction.shape

((58327370, 42), (58327370,), (853720, 42), (853720,), (853720, 42), (853720,))

##### Time Series Cross-Validation

In [10]:
tscv = TimeSeriesSplit(n_splits=5).split(pd.read_pickle("/content/gdrive/MyDrive/PredictiveAnalyticsData/output_data/final_train.pkl"))

##### CatBoost Regressor

In [11]:
lr = [(np.round(np.random.rand()/10,3)) for i in range(5)]

In [12]:
lr

[0.031, 0.002, 0.069, 0.093, 0.053]

In [13]:
param_search = {'learning_rate': lr, 'depth': [2,4,7]}

In [14]:
gsearch = GridSearchCV(estimator=cb.CatBoostRegressor(task_type='GPU'), cv=tscv, param_grid=param_search, scoring="neg_mean_squared_error")

In [None]:
gsearch.fit(X_train, y_train)

0:	learn: 3.4414122	total: 141ms	remaining: 2m 20s
1:	learn: 3.2588219	total: 255ms	remaining: 2m 7s
2:	learn: 3.0932894	total: 358ms	remaining: 1m 58s
3:	learn: 2.9441507	total: 449ms	remaining: 1m 51s
4:	learn: 2.8126420	total: 537ms	remaining: 1m 46s
5:	learn: 2.6949566	total: 625ms	remaining: 1m 43s
6:	learn: 2.5878119	total: 716ms	remaining: 1m 41s
7:	learn: 2.4931999	total: 807ms	remaining: 1m 40s
8:	learn: 2.4098331	total: 895ms	remaining: 1m 38s
9:	learn: 2.3351150	total: 984ms	remaining: 1m 37s
10:	learn: 2.2691587	total: 1.07s	remaining: 1m 36s
11:	learn: 2.2124056	total: 1.16s	remaining: 1m 35s
12:	learn: 2.1622091	total: 1.25s	remaining: 1m 35s
13:	learn: 2.1177072	total: 1.34s	remaining: 1m 34s
14:	learn: 2.0784154	total: 1.43s	remaining: 1m 34s
15:	learn: 2.0435588	total: 1.52s	remaining: 1m 33s
16:	learn: 2.0134488	total: 1.61s	remaining: 1m 33s
17:	learn: 1.9879020	total: 1.71s	remaining: 1m 33s
18:	learn: 1.9649615	total: 1.79s	remaining: 1m 32s
19:	learn: 1.9453034	to

In [None]:
gsearch.cv_results_


In [None]:
gsearch.best_params_

learning_rate=0.0207
depth=7
num_leaves=94

In [17]:
cb_regressor_model = cb.CatBoostRegressor(learning_rate=0.0207, depth=7)
cb_regressor_model.fit(X_train, y_train)

0:	learn: 3.8172312	total: 3.24s	remaining: 54m
1:	learn: 3.7628161	total: 5.91s	remaining: 49m 9s
2:	learn: 3.7098654	total: 8.78s	remaining: 48m 38s
3:	learn: 3.6580600	total: 11.6s	remaining: 47m 58s
4:	learn: 3.6075669	total: 14s	remaining: 46m 28s
5:	learn: 3.5584653	total: 16.6s	remaining: 45m 51s
6:	learn: 3.5105140	total: 19.1s	remaining: 45m 9s
7:	learn: 3.4635301	total: 21.8s	remaining: 45m 7s
8:	learn: 3.4178867	total: 24.8s	remaining: 45m 25s
9:	learn: 3.3736493	total: 27.5s	remaining: 45m 24s
10:	learn: 3.3302149	total: 30.1s	remaining: 45m 6s
11:	learn: 3.2877039	total: 32.7s	remaining: 44m 49s
12:	learn: 3.2465540	total: 35.2s	remaining: 44m 30s
13:	learn: 3.2063344	total: 38s	remaining: 44m 35s
14:	learn: 3.1673973	total: 41.1s	remaining: 44m 56s
15:	learn: 3.1293126	total: 43.6s	remaining: 44m 39s
16:	learn: 3.0922715	total: 46s	remaining: 44m 20s
17:	learn: 3.0560685	total: 48.6s	remaining: 44m 11s
18:	learn: 3.0209028	total: 51.2s	remaining: 44m 1s
19:	learn: 2.98675

<catboost.core.CatBoostRegressor at 0x7e28cd936050>

In [None]:
cb_regressor_model.feature_importances_

In [18]:
pickle.dump(cb_regressor_model, open('/content/gdrive/MyDrive/PredictiveAnalyticsData/trained_models/catboost_model_lr_0207_depth_7.pkl', 'wb'))

In [19]:
generate_submission('/content/gdrive/MyDrive/PredictiveAnalyticsData/trained_models/catboost_model_lr_0207_depth_7.pkl', take_imp_cols=False)

Generating submission file for model: 
<catboost.core.CatBoostRegressor object at 0x7e29dcc40340> at location /content/gdrive/MyDrive/PredictiveAnalyticsData/output_data/sample_submission_new_<catboost.core.CatBoostRegressor object at 0x7e29dcc40340>_05122023_234339_.csv



100%|██████████| 28/28 [00:00<00:00, 522.70it/s]


(60980, 29)
