In [29]:
import pandas as pd 
import numpy as np
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
# Necessary imports 
import xgboost as xg 
from sklearn.model_selection import train_test_split 
from sklearn.metrics import mean_squared_error as MSE 

from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor

import lightgbm as lgb 

In [2]:
train_data = pd.read_csv('train_data.csv')

In [3]:
X = train_data.drop(['Customer_ID', 'Expected_Sales'], axis=1)
y = train_data['Expected_Sales']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(720, 11) (720,)
(80, 11) (80,)


## XGboost

In [5]:
params = {
    'eta': [0.1, 0.3, 0.5],
    'max_depth': [3, 6, 9],
    'min_child_weight': [1, 3, 5], 
    'subsample': [0.5, 0.8, 1.0],
    'colsample_bytree': [0.5, 0.8, 1.0]
}

xgb = XGBRegressor()

grid_search = GridSearchCV(xgb, 
                           param_grid=params,
                           scoring='neg_mean_squared_error', 
                           cv=5,
                           verbose=1)

grid_search.fit(X_train, y_train)
print("Best parameters:", grid_search.best_params_)
print("Best cross-validation score:", grid_search.best_score_)

# Refit on whole dataset with best params
xgb_best = XGBRegressor(params=grid_search.best_params_)
xgb_best.fit(X, y)

Fitting 5 folds for each of 243 candidates, totalling 1215 fits
Best parameters: {'colsample_bytree': 0.5, 'eta': 0.1, 'max_depth': 3, 'min_child_weight': 5, 'subsample': 0.8}
Best cross-validation score: -1013880.803690877


Parameters: { "params" } are not used.



In [27]:
y_pred = xgb_best.predict(X_test)
rmse = np.sqrt(MSE(y_test, y_pred))
print("RMSE: %f" % (rmse))

RMSE: 6.190289


In [20]:
test = train_data.loc[train_data['Customer_ID'] == 4]

In [21]:
test

Unnamed: 0,Customer_ID,Total_Order_Value,Total_Order_Count,Time_Sine_Last_Order,Average_Order_Value,Product_Diversity,Total_Products_Purchased,Average_Time_Between_Orders,Recent_Product_Category,Geographic,Marketing,Score,Expected_Sales
3,4,1966.93,11,697,178.811818,11,42,174.181818,54,97219,0.2,25,308.938


In [22]:
X_test = test.drop(['Customer_ID', 'Expected_Sales'], axis=1)
y_test = test['Expected_Sales']

In [23]:
y_pred = xgb_best.predict(X_test)

In [24]:
y_pred

array([302.7477], dtype=float32)

In [25]:
y_test

3    308.938
Name: Expected_Sales, dtype: float64

## Lightgbm

In [30]:
# Create a LightGBM dataset for training with features X_train and labels Y_train 
train_data = lgb.Dataset(X_train, label=y_train) 
test_data = lgb.Dataset(X_test, label=y_test, reference=train_data) 

In [72]:
# defining parameters 
params = {
    'task': 'train', 
    'boosting': 'gbdt',
    'objective': 'regression',
    'num_leaves': 28,
    'learnnig_rage': 0.01,
    'metric': {'l2','l1'},
    'verbose': -1
}

In [73]:
# Set the number of rounds and train the model with early stopping 
# fitting the model
model = lgb.train(params,
                 train_set=train_data,
                 valid_sets=test_data)

In [74]:
# prediction
y_pred = model.predict(X_test)

# accuracy check
mse = mean_squared_error(y_test, y_pred)
rmse = mse**(0.5)
print("MSE: %.2f" % mse)
print("RMSE: %.2f" % rmse) 

MSE: 213.43
RMSE: 14.61


## CatBoost

In [76]:
from catboost import CatBoostRegressor
 
# Initialize the CatBoostRegressor with RMSE as the loss function
model = CatBoostRegressor(loss_function='RMSE')
 
# Fit the model on the training data with verbose logging every 100 iterations
model.fit(X_train, y_train, verbose=100)

Learning rate set to 0.038872
0:	learn: 508.9181223	total: 133ms	remaining: 2m 13s
100:	learn: 413.6714524	total: 329ms	remaining: 2.93s
200:	learn: 350.2720269	total: 466ms	remaining: 1.85s
300:	learn: 297.2394486	total: 613ms	remaining: 1.42s
400:	learn: 255.1044248	total: 759ms	remaining: 1.13s
500:	learn: 220.6431924	total: 925ms	remaining: 921ms
600:	learn: 191.7627494	total: 1.09s	remaining: 728ms
700:	learn: 167.7163553	total: 1.24s	remaining: 529ms
800:	learn: 147.7759792	total: 1.42s	remaining: 352ms
900:	learn: 132.3410446	total: 1.56s	remaining: 172ms
999:	learn: 118.5056310	total: 1.71s	remaining: 0us


<catboost.core.CatBoostRegressor at 0x1a0f33ac280>

In [79]:
y_pred = model.predict(X_test)

# accuracy check
mse = mean_squared_error(y_test, y_pred)
rmse = mse**(0.5)
print("MSE: %.2f" % mse)
print("RMSE: %.2f" % rmse) 

MSE: 15.39
RMSE: 3.92
