In [1]:
import numpy as np
import pandas as pd
import re
from dataset import make_dataset
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error as MSE
from sklearn.model_selection import KFold

<h1> 0. Final Result

In [5]:
# Summary table: one row per experiment in section 2, holding the best
# cross-validated MSE printed by that experiment's grid search.
MSE_sale = pd.DataFrame(
    [
        ('2.1', 137.7401263695513),
        ('2.2', 140.3253148909816),
        ('2.3', 145.85657861830057),
        ('2.4', 140.83137329792353),
        ('2.5', 148.3557983756306),
    ],
    columns=['Model', 'MSE'],
)
# Rank the experiments from lowest (best) to highest MSE; the original row
# labels are kept so the ranking can be traced back to the insertion order.
MSE_sorted = MSE_sale.sort_values(by='MSE')
MSE_sorted

Unnamed: 0,Model,MSE
0,2.1,137.740126
1,2.2,140.325315
3,2.4,140.831373
2,2.3,145.856579
4,2.5,148.355798


Based on the 10-fold cross-validated MSE above (lower is better), the best model is Model 2.1:
- X: all features, keeping the one-hot encoded weekday, Product ID and Brand ID columns
- Y: sales
- Parameters: {'ccp_alpha': 0, 'max_depth': 25, 'max_features': 43, 'n_estimators': 200}

<h1> 1. Data Preprocessing

<h2> 1.1 Import Data

In [2]:
# Load the datasets. make_dataset() (from the local `dataset` module) must
# return exactly four objects, which are unpacked in order.
# NOTE(review): presumably `ss` is the fitted scaler and the two frames are the
# unscaled / scaled versions of the same data - confirm in dataset.make_dataset.
ss, not_scaled_df, scaled_df, data_train = make_dataset()
# The fourth returned object is discarded: data_train is rebound to scaled_df,
# so everything below works on the scaled frame.
data_train = scaled_df
# Preview the first rows of the training frame.
data_train.head()



Unnamed: 0,productID_0,productID_1,productID_2,productID_3,productID_6,productID_7,productID_8,productID_9,productID_10,productID_13,...,ma14SalesVolume,meanAge,gender,meanEducation,maritalStatus,plus,meanPurchasePower,meanUserLevel,meanCityLevel,sales
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.948957,-0.822878,0.524839,-0.176485,1.066933,-1.260993,-0.205651,-0.229769,0.157289,7
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.239057,0.123052,-0.2647,0.885324,0.092044,-0.370702,0.346301,0.205788,-0.711543,14
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.25392,0.231618,0.666149,1.231029,0.776884,1.459339,-0.280918,0.380011,-0.302681,60
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.583414,1.731132,0.968955,1.416228,1.864569,1.187306,1.036241,0.205788,-2.14256,2
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.041857,0.312237,-0.881527,-0.495027,1.066933,-1.260993,1.036241,-0.914216,1.35321,15


<h2> 1.2 Filter Data

In [3]:
# Remove the data points with sales > 200, i.e. keep only rows with sales <= 200,
# and renumber the rows from 0 so the index has no gaps.
data_train = data_train.loc[data_train['sales'].le(200)].reset_index(drop=True)

# Split into predictors (every column except the target) and target.
x_train = data_train.drop(columns='sales')
y_train = data_train['sales']

<h1> 2. Random Forest

<h2> 2.1 Grid Search for RF based on Filtered dataset

In [4]:
# One third of the predictor count (rounded); used as a max_features candidate
# in the grid search below.
features = x_train.shape[1]
third_feature = round(features / 3)
third_feature

43

In [91]:
# Creating the model; random_state is fixed so repeated runs build the same forests
model = RandomForestRegressor(random_state=24)

# Dataset used: all features, categorical_list = ["productID", "brandID", "weekday"]
X = x_train
y = y_train

# Parameter grid to vary
param_grid = {
    'n_estimators': [50, 100, 200, 500],
    # Candidates: sqrt(n), log2(n), one third of the features (43 for this
    # dataset) and all features. 1.0 replaces the former 'auto', which meant
    # "all features" for regressors but was deprecated in scikit-learn 1.1 and
    # removed in 1.3, where it makes every fit using it fail.
    'max_features': ['sqrt', 'log2', round(X.shape[1] / 3), 1.0],
    'max_depth': [25, 30, 35],
    'ccp_alpha': [0, 0.005, 0.007, 0.1]
}


# Creating the object to carry out GridSearchCV, with 10-Fold Cross Validation.
# Scores are negated MSE (higher is better), hence the np.abs() when reporting.
grid_search = GridSearchCV(estimator = model,param_grid = param_grid , scoring='neg_mean_squared_error', n_jobs=-1, cv=10, return_train_score=True)
grid_search.fit(X,y)

# Output the best parameters and its corresponding test MSE
print(f"The best parameter for the current model is {grid_search.best_params_}")
print(f"The corresponding test MSE is {np.abs(grid_search.cv_results_['mean_test_score'][grid_search.best_index_])}")

The best parameter for the current model is {'ccp_alpha': 0, 'max_depth': 25, 'max_features': 43, 'n_estimators': 200}
The corresponding test MSE is 137.7401263695513


<h2> 2.2 Grid Search for RF on Filtered dataset (with Binary weekday categorical)  

In [131]:
# Convert the weekdays to binary, 0 for weekdays, 1 for weekends
data_train_wk = data_train.copy()

# One-hot weekday columns (weekday_1 ... weekday_7)
pattern = re.compile(r'^weekday_\d+$')
weekday = data_train_wk.filter(regex=pattern)

# Collapse the one-hot columns into a single day number: each weekday_<d>
# column contributes d when set. Done with column arithmetic instead of a
# row-wise DataFrame.apply, which calls Python once per row.
data_train_wk["weekday"] = sum(day * data_train_wk[f"weekday_{day}"] for day in range(1, 8))
data_train_wk = data_train_wk.drop(columns=weekday.columns)

# New weekend column: 0 when the day number is 1-5, 1 for anything else
data_train_wk['weekends'] = (~data_train_wk["weekday"].isin([1, 2, 3, 4, 5])).astype("int64")

# Predictors for this experiment: drop the target and the helper day number
x_train_wk = data_train_wk.drop(columns=['sales', 'weekday'])
x_train_wk.head()

Unnamed: 0,productID_0,productID_1,productID_2,productID_3,productID_6,productID_7,productID_8,productID_9,productID_10,productID_13,...,ma14SalesVolume,meanAge,gender,meanEducation,maritalStatus,plus,meanPurchasePower,meanUserLevel,meanCityLevel,weekends
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.948957,-0.822878,0.524839,-0.176485,1.066933,-1.260993,-0.205651,-0.229769,0.157289,1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.239057,0.123052,-0.2647,0.885324,0.092044,-0.370702,0.346301,0.205788,-0.711543,1
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.25392,0.231618,0.666149,1.231029,0.776884,1.459339,-0.280918,0.380011,-0.302681,1
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.583414,1.731132,0.968955,1.416228,1.864569,1.187306,1.036241,0.205788,-2.14256,1
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.041857,0.312237,-0.881527,-0.495027,1.066933,-1.260993,1.036241,-0.914216,1.35321,0


In [139]:
# One third of the predictor count (rounded) for the binary-weekend dataset;
# used as a max_features candidate in the grid search below.
features = x_train_wk.shape[1]
third_feature = round(features / 3)
third_feature

41

In [168]:
# Creating the model; random_state is fixed so repeated runs build the same forests
model = RandomForestRegressor(random_state=24)

# Dataset used: Categorical: "productID", "brandID", Binary "weekday" (weekends flag)
X = x_train_wk
y = y_train

# Parameter grid to vary
param_grid = {
    'n_estimators': [50, 100, 200, 500],
    # Candidates: sqrt(n), log2(n), one third of the features (41 for this
    # dataset) and all features. 1.0 replaces the former 'auto', which meant
    # "all features" for regressors but was deprecated in scikit-learn 1.1 and
    # removed in 1.3, where it makes every fit using it fail.
    'max_features': ['sqrt', 'log2', round(X.shape[1] / 3), 1.0],
    'max_depth': [25, 30, 35],
    'ccp_alpha': [0, 0.005, 0.007, 0.1]
}


# Creating the object to carry out GridSearchCV, with 10-Fold Cross Validation.
# Scores are negated MSE (higher is better), hence the np.abs() when reporting.
grid_search = GridSearchCV(estimator = model,param_grid = param_grid , scoring='neg_mean_squared_error', n_jobs=-1, cv=10, return_train_score=True)
grid_search.fit(X,y)

# Output the best parameters and its corresponding test MSE
print(f"The best parameter for the current model is {grid_search.best_params_}")
print(f"The corresponding test MSE is {np.abs(grid_search.cv_results_['mean_test_score'][grid_search.best_index_])}")

The best parameter for the current model is {'ccp_alpha': 0, 'max_depth': 30, 'max_features': 41, 'n_estimators': 200}
The corresponding test MSE is 140.3253148909816


<h2> 2.3 Grid Search for RF based on Filtered dataset (without Product ID)  

In [154]:
# x_train_BID, dataset without product ID, retain Brand ID
data_train_BID = data_train.copy()
# drop all product IDs (the one-hot productID_<n> columns)
pattern = re.compile(r'^productID_\d+$')
Product_BID = data_train_BID.filter(regex=pattern)
# Use the columns matched in this cell. The previous code dropped
# `Product_ID.columns`, a name that no cell in this notebook defines, so the
# cell raised NameError on a fresh kernel.
data_train_BID = data_train_BID.drop(Product_BID.columns, axis=1)
# drop sales (the target) to obtain the predictors
x_train_BID = data_train_BID.drop(['sales'], axis=1)
x_train_BID.head()

Unnamed: 0,brandID_1,brandID_2,brandID_3,brandID_4,brandID_5,brandID_8,brandID_9,brandID_10,brandID_11,brandID_12,...,avgFinalUnitPrice,ma14SalesVolume,meanAge,gender,meanEducation,maritalStatus,plus,meanPurchasePower,meanUserLevel,meanCityLevel
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.176703,-0.948957,-0.822878,0.524839,-0.176485,1.066933,-1.260993,-0.205651,-0.229769,0.157289
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.79562,0.239057,0.123052,-0.2647,0.885324,0.092044,-0.370702,0.346301,0.205788,-0.711543
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.937825,1.25392,0.231618,0.666149,1.231029,0.776884,1.459339,-0.280918,0.380011,-0.302681
3,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.537158,-0.583414,1.731132,0.968955,1.416228,1.864569,1.187306,1.036241,0.205788,-2.14256
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.47002,0.041857,0.312237,-0.881527,-0.495027,1.066933,-1.260993,1.036241,-0.914216,1.35321


In [161]:
# One third of the predictor count (rounded) for the dataset without product
# IDs; used as a max_features candidate in the grid search below.
features = x_train_BID.shape[1]
third_feature = round(features / 3)
third_feature

18

In [170]:
# Creating the model; random_state is fixed so repeated runs build the same forests
model = RandomForestRegressor(random_state=24)

# Dataset used: Categorical: ("brandID", "weekday")
X = x_train_BID
y = y_train

# Parameter grid to vary
param_grid = {
    'n_estimators': [50, 100, 200, 500],
    # Candidates: sqrt(n), log2(n), one third of the features (18 for this
    # dataset) and all features. 1.0 replaces the former 'auto', which meant
    # "all features" for regressors but was deprecated in scikit-learn 1.1 and
    # removed in 1.3, where it makes every fit using it fail.
    'max_features': ['sqrt', 'log2', round(X.shape[1] / 3), 1.0],
    'max_depth': [25, 30, 35],
    'ccp_alpha': [0, 0.005, 0.007, 0.1]
}


# Creating the object to carry out GridSearchCV, with 10-Fold Cross Validation.
# Scores are negated MSE (higher is better), hence the np.abs() when reporting.
grid_search = GridSearchCV(estimator = model,param_grid = param_grid , scoring='neg_mean_squared_error', n_jobs=-1, cv=10, return_train_score=True)
grid_search.fit(X,y)

# Output the best parameters and its corresponding test MSE
print(f"The best parameter for the current model is {grid_search.best_params_}")
print(f"The corresponding test MSE is {np.abs(grid_search.cv_results_['mean_test_score'][grid_search.best_index_])}")

The best parameter for the current model is {'ccp_alpha': 0, 'max_depth': 25, 'max_features': 18, 'n_estimators': 500}
The corresponding test MSE is 145.85657861830057


<h2> 2.4 Grid Search for RF based on Filtered dataset (without brand ID)  

In [156]:
# x_train_PID: predictors with every brandID_<n> dummy removed; the
# productID_<n> dummies are retained.
pattern = re.compile(r'^brandID_\d+$')
# Brand_ID holds the brand dummy columns that are being removed.
Brand_ID = data_train.filter(regex=pattern)
# drop() returns a new frame, so data_train itself is left untouched.
data_train_PID = data_train.drop(columns=Brand_ID.columns)
# Remove the target to obtain the predictors.
x_train_PID = data_train_PID.drop(columns='sales')
x_train_PID.head()

Unnamed: 0,productID_0,productID_1,productID_2,productID_3,productID_6,productID_7,productID_8,productID_9,productID_10,productID_13,...,avgFinalUnitPrice,ma14SalesVolume,meanAge,gender,meanEducation,maritalStatus,plus,meanPurchasePower,meanUserLevel,meanCityLevel
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.176703,-0.948957,-0.822878,0.524839,-0.176485,1.066933,-1.260993,-0.205651,-0.229769,0.157289
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.79562,0.239057,0.123052,-0.2647,0.885324,0.092044,-0.370702,0.346301,0.205788,-0.711543
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.937825,1.25392,0.231618,0.666149,1.231029,0.776884,1.459339,-0.280918,0.380011,-0.302681
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.537158,-0.583414,1.731132,0.968955,1.416228,1.864569,1.187306,1.036241,0.205788,-2.14256
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.47002,0.041857,0.312237,-0.881527,-0.495027,1.066933,-1.260993,1.036241,-0.914216,1.35321


In [160]:
# One third of the predictor count (rounded) for the dataset without brand
# IDs; used as a max_features candidate in the grid search below.
features = x_train_PID.shape[1]
third_feature = round(features / 3)
third_feature

33

In [173]:
# Creating the model; random_state is fixed so repeated runs build the same forests
model = RandomForestRegressor(random_state=24)

# Dataset used: Categorical: ("ProductID", "weekday")
X = x_train_PID
y = y_train

# Parameter grid to vary
param_grid = {
    'n_estimators': [50, 100, 200, 500],
    # Candidates: sqrt(n), log2(n), one third of the features (33 for this
    # dataset) and all features. 1.0 replaces the former 'auto', which meant
    # "all features" for regressors but was deprecated in scikit-learn 1.1 and
    # removed in 1.3, where it makes every fit using it fail.
    'max_features': ['sqrt', 'log2', round(X.shape[1] / 3), 1.0],
    'max_depth': [25, 30, 35],
    'ccp_alpha': [0, 0.005, 0.007, 0.1]
}


# Creating the object to carry out GridSearchCV, with 10-Fold Cross Validation.
# Scores are negated MSE (higher is better), hence the np.abs() when reporting.
grid_search = GridSearchCV(estimator = model,param_grid = param_grid , scoring='neg_mean_squared_error', n_jobs=-1, cv=10, return_train_score=True)
grid_search.fit(X,y)

# Output the best parameters and its corresponding test MSE
print(f"The best parameter for the current model is {grid_search.best_params_}")
print(f"The corresponding test MSE is {np.abs(grid_search.cv_results_['mean_test_score'][grid_search.best_index_])}")

The best parameter for the current model is {'ccp_alpha': 0, 'max_depth': 25, 'max_features': 33, 'n_estimators': 500}
The corresponding test MSE is 140.83137329792353


<h2> 2.5 Grid Search for RF based on Filtered dataset (without Product & Brand ID)  

In [159]:
# x_train_IDs: predictors with both the productID_<n> and the brandID_<n>
# dummies removed.
# Product_IDs / Brand_IDs hold the dummy columns that are being removed.
pattern = re.compile(r'^productID_\d+$')
Product_IDs = data_train.filter(regex=pattern)
pattern = re.compile(r'^brandID_\d+$')
Brand_IDs = data_train.filter(regex=pattern)
# drop() returns a new frame, so data_train itself is left untouched.
data_train_IDs = data_train.drop(columns=[*Product_IDs.columns, *Brand_IDs.columns])
# Remove the target to obtain the predictors.
x_train_IDs = data_train_IDs.drop(columns='sales')
x_train_IDs.head()

Unnamed: 0,weekday_1,weekday_2,weekday_3,weekday_4,weekday_5,weekday_6,weekday_7,attribute1,attribute2,attribute3,...,avgFinalUnitPrice,ma14SalesVolume,meanAge,gender,meanEducation,maritalStatus,plus,meanPurchasePower,meanUserLevel,meanCityLevel
0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.081797,-0.779161,-0.452709,...,-0.176703,-0.948957,-0.822878,0.524839,-0.176485,1.066933,-1.260993,-0.205651,-0.229769,0.157289
1,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.081797,-0.779161,-0.903785,...,-0.79562,0.239057,0.123052,-0.2647,0.885324,0.092044,-0.370702,0.346301,0.205788,-0.711543
2,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.081797,-0.779161,-0.678247,...,-0.937825,1.25392,0.231618,0.666149,1.231029,0.776884,1.459339,-0.280918,0.380011,-0.302681
3,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.081797,0.734214,-0.22717,...,0.537158,-0.583414,1.731132,0.968955,1.416228,1.864569,1.187306,1.036241,0.205788,-2.14256
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.081797,0.229756,-0.22717,...,-0.47002,0.041857,0.312237,-0.881527,-0.495027,1.066933,-1.260993,1.036241,-0.914216,1.35321


In [162]:
# One third of the predictor count (rounded) for the dataset without product
# and brand IDs; used as a max_features candidate in the grid search below.
features = x_train_IDs.shape[1]
third_feature = round(features / 3)
third_feature

8

In [176]:
# Creating the model; random_state is fixed so repeated runs build the same forests
model = RandomForestRegressor(random_state=24)

# Dataset used: Categorical: ("weekday") only - product and brand IDs removed
X = x_train_IDs
y = y_train

# Parameter grid to vary
param_grid = {
    'n_estimators': [50, 100, 200, 500],
    # Candidates: sqrt(n), log2(n), one third of the features (8 for this
    # dataset) and all features. 1.0 replaces the former 'auto', which meant
    # "all features" for regressors but was deprecated in scikit-learn 1.1 and
    # removed in 1.3, where it makes every fit using it fail.
    'max_features': ['sqrt', 'log2', round(X.shape[1] / 3), 1.0],
    'max_depth': [25, 30, 35],
    'ccp_alpha': [0, 0.005, 0.007, 0.1]
}


# Creating the object to carry out GridSearchCV, with 10-Fold Cross Validation.
# Scores are negated MSE (higher is better), hence the np.abs() when reporting.
grid_search = GridSearchCV(estimator = model,param_grid = param_grid , scoring='neg_mean_squared_error', n_jobs=-1, cv=10, return_train_score=True)
grid_search.fit(X,y)

# Output the best parameters and its corresponding test MSE
print(f"The best parameter for the current model is {grid_search.best_params_}")
print(f"The corresponding test MSE is {np.abs(grid_search.cv_results_['mean_test_score'][grid_search.best_index_])}")

The best parameter for the current model is {'ccp_alpha': 0.005, 'max_depth': 25, 'max_features': 8, 'n_estimators': 500}
The corresponding test MSE is 148.3557983756306
