In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder, StandardScaler, PolynomialFeatures
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from dataset import make_dataset

<h1> 0. Final Result

In [2]:
# Summary of the best 10-fold CV test MSE reported by each experiment in section 2.
# Model labels are kept as strings: as floats, label 2.10 is rendered as 2.1 and the
# section number can no longer be read back from the table.
MSE_sale = pd.DataFrame({
    'Model': ['2.01', '2.02', '2.03', '2.04', '2.05', '2.06',
              '2.07', '2.08', '2.09', '2.10', '2.11', '2.12'],
    'MSE': [137.7801818767695, 136.02916576894353, 135.99448363900683, 134.3224461200231,
            136.19624750541766, 132.50391214319876, 131.3649323422473, 133.04241442728107,
            133.815309206687, 126.75651447602657, 130.5225702866221, 125.18154478043498],
})

# Rank the experiments from lowest (best) to highest MSE.
MSE_sorted = MSE_sale.sort_values(by='MSE')

# Display the ranking with a fresh 0..n-1 index (MSE_sorted itself keeps the original index).
MSE_sorted.reset_index(drop=True)

Unnamed: 0,Model,MSE
0,2.12,125.181545
1,2.1,126.756514
2,2.11,130.52257
3,2.07,131.364932
4,2.06,132.503912
5,2.08,133.042414
6,2.09,133.815309
7,2.04,134.322446
8,2.03,135.994484
9,2.02,136.029166


Based on the results, model 2.12 produces the lowest MSE. Thus, this is the best GradientBoostingRegressor model. <br>

X: scaled_df with interacting terms listed below: <br>
avgOriginalUnitPrice * avgFinalUnitPrice, clickVolume * avgFinalUnitPrice, attribute1 * avgFinalUnitPrice, attribute2 * avgFinalUnitPrice, attribute1 * avgOriginalUnitPrice, plus * meanPurchasePower, plus * meanUserLevel, meanUserLevel * meanPurchasePower <br>

y: sales <br>

Parameters: {'ccp_alpha': 0, 'learning_rate': 0.05, 'max_depth': 3, 'min_samples_split': 8, 'n_estimators': 1500, 'random_state': 24}


<h1> 1. Data Processing

<h3> 1.1 Import Data

In [9]:
# Scaler that later cells re-fit on each hand-built interaction feature.
random_ss = StandardScaler()

# Load the training data; make_dataset() returns four objects that are unpacked here.
(ss, not_scaled_df, scaled_df, data_train) = make_dataset()

# Labels of every scaled_df column from position 106 onwards
# (presumably the non-categorical columns -- TODO confirm the 106 offset against make_dataset).
required_cols = scaled_df.columns[slice(106, None)]

<h3> 1.2 Filter Data

In [10]:
# Discard rows whose sales exceed 200 from both frames, then renumber the rows from 0.
scaled_df = scaled_df[scaled_df['sales'].le(200)].reset_index(drop=True)
not_scaled_df = not_scaled_df[not_scaled_df['sales'].le(200)].reset_index(drop=True)

<h1> 2. Gradient Boosting

<h4> 2.01 Grid Search on scaled_df without productID and brandID

In [5]:
# Experiment 2.01: the required_cols slice of scaled_df, no interaction terms.

# Target and features (the target column is removed from the feature matrix).
y = scaled_df['sales']
X = scaled_df[required_cols].drop(columns='sales')

# Estimator whose hyper-parameters are searched below.
model = GradientBoostingRegressor()

# Hyper-parameter grid; random_state is fixed at 24 for every candidate.
param_grid = dict(
    n_estimators=[500, 1000, 1500],
    max_depth=[1, 3, 5],
    learning_rate=[0.005, 0.01, 0.05, 0.1],
    min_samples_split=[2, 4, 8],
    ccp_alpha=[0, 0.01, 0.1],
    random_state=[24],
)

# Exhaustive search scored by negated MSE with 10-fold cross-validation.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    cv=10,
    return_train_score=True,
)
grid_search.fit(X, y)

# Report the winning parameters and their mean CV test MSE (sign flipped back to positive).
best_idx = grid_search.best_index_
best_cv_mse = np.abs(grid_search.cv_results_['mean_test_score'][best_idx])
print(f"The best parameter for the current model is {grid_search.best_params_}")
print(f"The corresponding test MSE for sales is {best_cv_mse}")

The best parameter for the current model is {'ccp_alpha': 0, 'learning_rate': 0.05, 'max_depth': 3, 'min_samples_split': 2, 'n_estimators': 1500, 'random_state': 24}
The corresponding test MSE for sales is 137.7801818767695


<h4> 2.02 Grid Search on scaled_df without productID and brandID <br>
Interacting terms: avgOriginalUnitPrice * avgFinalUnitPrice

In [6]:
# Experiment 2.02: required_cols features plus avgOriginalUnitPrice * avgFinalUnitPrice.

# Target and base features (the target column is removed from the feature matrix).
y = scaled_df['sales']
X = scaled_df[required_cols].drop(columns='sales')

# New feature name -> the two raw columns whose product forms it.
interaction_pairs = {
    'prod_OGPrice-FinalPrice': ('avgOriginalUnitPrice', 'avgFinalUnitPrice'),
}
for feature_name, (left, right) in interaction_pairs.items():
    # Product is taken on the unscaled values, then standardised on its own.
    raw_product = pd.DataFrame(not_scaled_df[left] * not_scaled_df[right])
    X[feature_name] = random_ss.fit_transform(raw_product)

# Estimator whose hyper-parameters are searched below.
model = GradientBoostingRegressor()

# Hyper-parameter grid; random_state is fixed at 24 for every candidate.
param_grid = dict(
    n_estimators=[500, 1000, 1500],
    max_depth=[1, 3, 5],
    learning_rate=[0.005, 0.01, 0.05, 0.1],
    min_samples_split=[2, 4, 8],
    ccp_alpha=[0, 0.01, 0.1],
    random_state=[24],
)

# Exhaustive search scored by negated MSE with 10-fold cross-validation.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    cv=10,
    return_train_score=True,
)
grid_search.fit(X, y)

# Report the winning parameters and their mean CV test MSE (sign flipped back to positive).
best_idx = grid_search.best_index_
best_cv_mse = np.abs(grid_search.cv_results_['mean_test_score'][best_idx])
print(f"The best parameter for the current model is {grid_search.best_params_}")
print(f"The corresponding test MSE for sales is {best_cv_mse}")

The best parameter for the current model is {'ccp_alpha': 0, 'learning_rate': 0.05, 'max_depth': 3, 'min_samples_split': 8, 'n_estimators': 1500, 'random_state': 24}
The corresponding test MSE for sales is 136.02916576894353


<h4> 2.03 Grid Search on scaled_df without productID and brandID <br>
Interacting terms: attribute1 * attribute2

In [7]:
# Experiment 2.03: required_cols features plus attribute1 * attribute2.

# Target and base features (the target column is removed from the feature matrix).
y = scaled_df['sales']
X = scaled_df[required_cols].drop(columns='sales')

# New feature name -> the two raw columns whose product forms it.
interaction_pairs = {
    'prod_att1-att2': ('attribute1', 'attribute2'),
}
for feature_name, (left, right) in interaction_pairs.items():
    # Product is taken on the unscaled values, then standardised on its own.
    raw_product = pd.DataFrame(not_scaled_df[left] * not_scaled_df[right])
    X[feature_name] = random_ss.fit_transform(raw_product)

# Estimator whose hyper-parameters are searched below.
model = GradientBoostingRegressor()

# Hyper-parameter grid; random_state is fixed at 24 for every candidate.
param_grid = dict(
    n_estimators=[500, 1000, 1500],
    max_depth=[1, 3, 5],
    learning_rate=[0.005, 0.01, 0.05, 0.1],
    min_samples_split=[2, 4, 8],
    ccp_alpha=[0, 0.01, 0.1],
    random_state=[24],
)

# Exhaustive search scored by negated MSE with 10-fold cross-validation.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    cv=10,
    return_train_score=True,
)
grid_search.fit(X, y)

# Report the winning parameters and their mean CV test MSE (sign flipped back to positive).
best_idx = grid_search.best_index_
best_cv_mse = np.abs(grid_search.cv_results_['mean_test_score'][best_idx])
print(f"The best parameter for the current model is {grid_search.best_params_}")
print(f"The corresponding test MSE for sales is {best_cv_mse}")

The best parameter for the current model is {'ccp_alpha': 0, 'learning_rate': 0.05, 'max_depth': 3, 'min_samples_split': 4, 'n_estimators': 1500, 'random_state': 24}
The corresponding test MSE for sales is 135.99448363900683


<h4> 2.04 Grid Search on scaled_df without productID and brandID <br>
Interacting terms: attribute1 * avgOriginalUnitPrice

In [8]:
# Experiment 2.04: required_cols features plus attribute1 * avgOriginalUnitPrice.

# Target and base features (the target column is removed from the feature matrix).
y = scaled_df['sales']
X = scaled_df[required_cols].drop(columns='sales')

# New feature name -> the two raw columns whose product forms it.
interaction_pairs = {
    'prod_att1-OGPrice': ('attribute1', 'avgOriginalUnitPrice'),
}
for feature_name, (left, right) in interaction_pairs.items():
    # Product is taken on the unscaled values, then standardised on its own.
    raw_product = pd.DataFrame(not_scaled_df[left] * not_scaled_df[right])
    X[feature_name] = random_ss.fit_transform(raw_product)

# Estimator whose hyper-parameters are searched below.
model = GradientBoostingRegressor()

# Hyper-parameter grid; random_state is fixed at 24 for every candidate.
param_grid = dict(
    n_estimators=[500, 1000, 1500],
    max_depth=[1, 3, 5],
    learning_rate=[0.005, 0.01, 0.05, 0.1],
    min_samples_split=[2, 4, 8],
    ccp_alpha=[0, 0.01, 0.1],
    random_state=[24],
)

# Exhaustive search scored by negated MSE with 10-fold cross-validation.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    cv=10,
    return_train_score=True,
)
grid_search.fit(X, y)

# Report the winning parameters and their mean CV test MSE (sign flipped back to positive).
best_idx = grid_search.best_index_
best_cv_mse = np.abs(grid_search.cv_results_['mean_test_score'][best_idx])
print(f"The best parameter for the current model is {grid_search.best_params_}")
print(f"The corresponding test MSE for sales is {best_cv_mse}")

The best parameter for the current model is {'ccp_alpha': 0, 'learning_rate': 0.1, 'max_depth': 3, 'min_samples_split': 2, 'n_estimators': 1500, 'random_state': 24}
The corresponding test MSE for sales is 134.3224461200231


<h4> 2.05 Grid Search on scaled_df without productID and brandID <br>
Interacting terms: avgOriginalUnitPrice * avgFinalUnitPrice, attribute1 * attribute2

In [9]:
# Experiment 2.05: required_cols features plus two interaction terms
# (avgOriginalUnitPrice * avgFinalUnitPrice, attribute1 * attribute2).

# Target and base features (the target column is removed from the feature matrix).
y = scaled_df['sales']
X = scaled_df[required_cols].drop(columns='sales')

# New feature name -> the two raw columns whose product forms it (insertion order kept).
interaction_pairs = {
    'prod_OGPrice-FinalPrice': ('avgOriginalUnitPrice', 'avgFinalUnitPrice'),
    'prod_att1-att2': ('attribute1', 'attribute2'),
}
for feature_name, (left, right) in interaction_pairs.items():
    # Product is taken on the unscaled values, then standardised on its own.
    raw_product = pd.DataFrame(not_scaled_df[left] * not_scaled_df[right])
    X[feature_name] = random_ss.fit_transform(raw_product)

# Estimator whose hyper-parameters are searched below.
model = GradientBoostingRegressor()

# Hyper-parameter grid; ccp_alpha is held at 0 and random_state at 24.
param_grid = dict(
    n_estimators=[500, 1000, 1500],
    max_depth=[1, 3, 5],
    learning_rate=[0.005, 0.01, 0.05, 0.1],
    min_samples_split=[2, 4, 8],
    ccp_alpha=[0],
    random_state=[24],
)

# Exhaustive search scored by negated MSE with 10-fold cross-validation.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    cv=10,
    return_train_score=True,
)
grid_search.fit(X, y)

# Report the winning parameters and their mean CV test MSE (sign flipped back to positive).
best_idx = grid_search.best_index_
best_cv_mse = np.abs(grid_search.cv_results_['mean_test_score'][best_idx])
print(f"The best parameter for the current model is {grid_search.best_params_}")
print(f"The corresponding test MSE for sales is {best_cv_mse}")

The best parameter for the current model is {'ccp_alpha': 0, 'learning_rate': 0.1, 'max_depth': 3, 'min_samples_split': 2, 'n_estimators': 1500, 'random_state': 24}
The corresponding test MSE for sales is 136.19624750541766


<h4> 2.06 Grid Search on scaled_df without productID and brandID <br>
Interacting terms: avgOriginalUnitPrice * avgFinalUnitPrice, attribute1 * attribute2 , attribute1 * avgOriginalUnitPrice

In [10]:
# Experiment 2.06: required_cols features plus three interaction terms
# (avgOriginalUnitPrice * avgFinalUnitPrice, attribute1 * attribute2,
#  attribute1 * avgOriginalUnitPrice).

# Target and base features (the target column is removed from the feature matrix).
y = scaled_df['sales']
X = scaled_df[required_cols].drop(columns='sales')

# New feature name -> the two raw columns whose product forms it (insertion order kept).
interaction_pairs = {
    'prod_OGPrice-FinalPrice': ('avgOriginalUnitPrice', 'avgFinalUnitPrice'),
    'prod_att1-att2': ('attribute1', 'attribute2'),
    'prod_att1-OGPrice': ('attribute1', 'avgOriginalUnitPrice'),
}
for feature_name, (left, right) in interaction_pairs.items():
    # Product is taken on the unscaled values, then standardised on its own.
    raw_product = pd.DataFrame(not_scaled_df[left] * not_scaled_df[right])
    X[feature_name] = random_ss.fit_transform(raw_product)

# Estimator whose hyper-parameters are searched below.
model = GradientBoostingRegressor()

# Hyper-parameter grid; ccp_alpha is held at 0 and random_state at 24.
param_grid = dict(
    n_estimators=[500, 1000, 1500],
    max_depth=[1, 3, 5],
    learning_rate=[0.005, 0.01, 0.05, 0.1],
    min_samples_split=[2, 4, 8],
    ccp_alpha=[0],
    random_state=[24],
)

# Exhaustive search scored by negated MSE with 10-fold cross-validation.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    cv=10,
    return_train_score=True,
)
grid_search.fit(X, y)

# Report the winning parameters and their mean CV test MSE (sign flipped back to positive).
best_idx = grid_search.best_index_
best_cv_mse = np.abs(grid_search.cv_results_['mean_test_score'][best_idx])
print(f"The best parameter for the current model is {grid_search.best_params_}")
print(f"The corresponding test MSE for sales is {best_cv_mse}")

The best parameter for the current model is {'ccp_alpha': 0, 'learning_rate': 0.1, 'max_depth': 3, 'min_samples_split': 2, 'n_estimators': 1500, 'random_state': 24}
The corresponding test MSE for sales is 132.50391214319876


<h4> 2.07 Grid Search on scaled_df without productID and brandID <br>
Interacting terms: avgOriginalUnitPrice * avgFinalUnitPrice, attribute1 * attribute2, attribute1 * avgOriginalUnitPrice, attribute1 * avgFinalUnitPrice

In [11]:
# Experiment 2.07: required_cols features plus four interaction terms
# (avgOriginalUnitPrice * avgFinalUnitPrice, attribute1 * attribute2,
#  attribute1 * avgOriginalUnitPrice, attribute1 * avgFinalUnitPrice).

# Target and base features (the target column is removed from the feature matrix).
y = scaled_df['sales']
X = scaled_df[required_cols].drop(columns='sales')

# New feature name -> the two raw columns whose product forms it (insertion order kept).
interaction_pairs = {
    'prod_OGPrice-FinalPrice': ('avgOriginalUnitPrice', 'avgFinalUnitPrice'),
    'prod_att1-att2': ('attribute1', 'attribute2'),
    'prod_att1-OGPrice': ('attribute1', 'avgOriginalUnitPrice'),
    'prod_att1-FinalPrice': ('attribute1', 'avgFinalUnitPrice'),
}
for feature_name, (left, right) in interaction_pairs.items():
    # Product is taken on the unscaled values, then standardised on its own.
    raw_product = pd.DataFrame(not_scaled_df[left] * not_scaled_df[right])
    X[feature_name] = random_ss.fit_transform(raw_product)

# Estimator whose hyper-parameters are searched below.
model = GradientBoostingRegressor()

# Hyper-parameter grid; ccp_alpha is held at 0 and random_state at 24.
param_grid = dict(
    n_estimators=[500, 1000, 1500],
    max_depth=[1, 3, 5],
    learning_rate=[0.005, 0.01, 0.05, 0.1],
    min_samples_split=[2, 4, 8],
    ccp_alpha=[0],
    random_state=[24],
)

# Exhaustive search scored by negated MSE with 10-fold cross-validation.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    cv=10,
    return_train_score=True,
)
grid_search.fit(X, y)

# Report the winning parameters and their mean CV test MSE (sign flipped back to positive).
best_idx = grid_search.best_index_
best_cv_mse = np.abs(grid_search.cv_results_['mean_test_score'][best_idx])
print(f"The best parameter for the current model is {grid_search.best_params_}")
print(f"The corresponding test MSE for sales is {best_cv_mse}")

The best parameter for the current model is {'ccp_alpha': 0, 'learning_rate': 0.05, 'max_depth': 3, 'min_samples_split': 2, 'n_estimators': 1500, 'random_state': 24}
The corresponding test MSE for sales is 131.3649323422473


<h4> 2.08 Grid Search on scaled_df without productID and brandID <br>
Interacting terms: avgOriginalUnitPrice * avgFinalUnitPrice, attribute1 * attribute2, attribute1 * avgOriginalUnitPrice, attribute1 * avgFinalUnitPrice, ma14SalesVolume * avgOriginalUnitPrice

In [12]:
# Experiment 2.08: required_cols features plus five interaction terms
# (avgOriginalUnitPrice * avgFinalUnitPrice, attribute1 * attribute2,
#  attribute1 * avgOriginalUnitPrice, attribute1 * avgFinalUnitPrice,
#  ma14SalesVolume * avgOriginalUnitPrice).

# Target and base features (the target column is removed from the feature matrix).
y = scaled_df['sales']
X = scaled_df[required_cols].drop(columns='sales')

# New feature name -> the two raw columns whose product forms it (insertion order kept).
interaction_pairs = {
    'prod_OGPrice-FinalPrice': ('avgOriginalUnitPrice', 'avgFinalUnitPrice'),
    'prod_att1-att2': ('attribute1', 'attribute2'),
    'prod_att1-OGPrice': ('attribute1', 'avgOriginalUnitPrice'),
    'prod_att1-FinalPrice': ('attribute1', 'avgFinalUnitPrice'),
    'prod_salesVol-OGPrice': ('ma14SalesVolume', 'avgOriginalUnitPrice'),
}
for feature_name, (left, right) in interaction_pairs.items():
    # Product is taken on the unscaled values, then standardised on its own.
    raw_product = pd.DataFrame(not_scaled_df[left] * not_scaled_df[right])
    X[feature_name] = random_ss.fit_transform(raw_product)

# Estimator whose hyper-parameters are searched below.
model = GradientBoostingRegressor()

# Hyper-parameter grid; ccp_alpha is held at 0 and random_state at 24.
param_grid = dict(
    n_estimators=[500, 1000, 1500],
    max_depth=[1, 3, 5],
    learning_rate=[0.005, 0.01, 0.05, 0.1],
    min_samples_split=[2, 4, 8],
    ccp_alpha=[0],
    random_state=[24],
)

# Exhaustive search scored by negated MSE with 10-fold cross-validation.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    cv=10,
    return_train_score=True,
)
grid_search.fit(X, y)

# Report the winning parameters and their mean CV test MSE (sign flipped back to positive).
best_idx = grid_search.best_index_
best_cv_mse = np.abs(grid_search.cv_results_['mean_test_score'][best_idx])
print(f"The best parameter for the current model is {grid_search.best_params_}")
print(f"The corresponding test MSE for sales is {best_cv_mse}")

The best parameter for the current model is {'ccp_alpha': 0, 'learning_rate': 0.1, 'max_depth': 3, 'min_samples_split': 2, 'n_estimators': 1000, 'random_state': 24}
The corresponding test MSE for sales is 133.04241442728107


<h4> 2.09 Grid Search on scaled_df without productID and brandID <br>
Interacting terms: avgOriginalUnitPrice * avgFinalUnitPrice, attribute1 * attribute2, attribute1 * avgOriginalUnitPrice, attribute1 * avgFinalUnitPrice, ma14SalesVolume * avgOriginalUnitPrice, ma14SalesVolume * avgFinalUnitPrice

In [13]:
# Experiment 2.09: required_cols features plus six interaction terms
# (avgOriginalUnitPrice * avgFinalUnitPrice, attribute1 * attribute2,
#  attribute1 * avgOriginalUnitPrice, attribute1 * avgFinalUnitPrice,
#  ma14SalesVolume * avgOriginalUnitPrice, ma14SalesVolume * avgFinalUnitPrice).

# Target and base features (the target column is removed from the feature matrix).
y = scaled_df['sales']
X = scaled_df[required_cols].drop(columns='sales')

# New feature name -> the two raw columns whose product forms it (insertion order kept).
interaction_pairs = {
    'prod_OGPrice-FinalPrice': ('avgOriginalUnitPrice', 'avgFinalUnitPrice'),
    'prod_att1-att2': ('attribute1', 'attribute2'),
    'prod_att1-OGPrice': ('attribute1', 'avgOriginalUnitPrice'),
    'prod_att1-FinalPrice': ('attribute1', 'avgFinalUnitPrice'),
    'prod_salesVol-OGPrice': ('ma14SalesVolume', 'avgOriginalUnitPrice'),
    'prod_salesVol-FinalPrice': ('ma14SalesVolume', 'avgFinalUnitPrice'),
}
for feature_name, (left, right) in interaction_pairs.items():
    # Product is taken on the unscaled values, then standardised on its own.
    raw_product = pd.DataFrame(not_scaled_df[left] * not_scaled_df[right])
    X[feature_name] = random_ss.fit_transform(raw_product)

# Estimator whose hyper-parameters are searched below.
model = GradientBoostingRegressor()

# Hyper-parameter grid; ccp_alpha is held at 0 and random_state at 24.
param_grid = dict(
    n_estimators=[500, 1000, 1500],
    max_depth=[1, 3, 5],
    learning_rate=[0.005, 0.01, 0.05, 0.1],
    min_samples_split=[2, 4, 8],
    ccp_alpha=[0],
    random_state=[24],
)

# Exhaustive search scored by negated MSE with 10-fold cross-validation.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    cv=10,
    return_train_score=True,
)
grid_search.fit(X, y)

# Report the winning parameters and their mean CV test MSE (sign flipped back to positive).
best_idx = grid_search.best_index_
best_cv_mse = np.abs(grid_search.cv_results_['mean_test_score'][best_idx])
print(f"The best parameter for the current model is {grid_search.best_params_}")
print(f"The corresponding test MSE for sales is {best_cv_mse}")

The best parameter for the current model is {'ccp_alpha': 0, 'learning_rate': 0.05, 'max_depth': 3, 'min_samples_split': 2, 'n_estimators': 1500, 'random_state': 24}
The corresponding test MSE for sales is 133.815309206687


<h4> 2.10 Grid Search on scaled_df with product and brandID <br>
Interacting terms: avgOriginalUnitPrice * avgFinalUnitPrice

In [13]:
# Reload everything from make_dataset(), replacing the frames filtered earlier in the notebook.
(ss, not_scaled_df, scaled_df, data_train) = make_dataset()

# Keep only rows with sales of at most 200 and renumber them from 0.
not_scaled_df = not_scaled_df.loc[not_scaled_df['sales'].le(200)].reset_index(drop=True)

# Separate the target from the predictors; not_scaled_df no longer has a 'sales' column afterwards.
y = not_scaled_df['sales']
not_scaled_df = not_scaled_df.drop(columns=['sales'])

In [15]:
# Experiment 2.10: every column of not_scaled_df plus one interaction term.

# New column name -> the two raw columns whose product forms it.
interaction_terms = {
    'avgOriginalUnitPrice * avgFinalUnitPrice': ('avgOriginalUnitPrice', 'avgFinalUnitPrice'),
}
not_scaled_df1 = not_scaled_df.copy()
for term_name, (left, right) in interaction_terms.items():
    not_scaled_df1[term_name] = not_scaled_df[left] * not_scaled_df[right]

# Standardise all columns together, keeping the column labels.
ss = StandardScaler()
feature_names = not_scaled_df1.columns
not_scaled_df1 = pd.DataFrame(ss.fit_transform(not_scaled_df1), columns=feature_names)

X_train, y_train = not_scaled_df1, y

# Estimator and hyper-parameter grid (ccp_alpha held at 0, random_state at 24).
gbr = GradientBoostingRegressor()
param_grid = dict(
    n_estimators=[500, 1000, 1500],
    max_depth=[1, 3, 5],
    learning_rate=[0.005, 0.01, 0.05, 0.1],
    min_samples_split=[2, 4, 8],
    ccp_alpha=[0],
    random_state=[24],
)

# Exhaustive search scored by negated MSE with 10-fold cross-validation.
search1 = GridSearchCV(
    estimator=gbr,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    cv=10,
    return_train_score=True,
)
search1.fit(X_train, y_train)

# Report the winning parameters and their mean CV test MSE (sign flipped back to positive).
best_idx = search1.best_index_
best_cv_mse = np.abs(search1.cv_results_['mean_test_score'][best_idx])
print(f"The best parameter for the current model is {search1.best_params_}")
print(f"The corresponding test MSE is {best_cv_mse}")

The best parameter for the current model is {'ccp_alpha': 0, 'learning_rate': 0.1, 'max_depth': 3, 'min_samples_split': 2, 'n_estimators': 1500, 'random_state': 24}
The corresponding test MSE is 126.75651447602657


<h4> 2.11 Grid Search on scaled_df with product and brandID <br>
Interacting terms: avgOriginalUnitPrice * avgFinalUnitPrice, clickVolume * avgFinalUnitPrice

In [16]:
# Experiment 2.11: every column of not_scaled_df plus two interaction terms.

# New column name -> the two raw columns whose product forms it (insertion order kept).
interaction_terms = {
    'avgOriginalUnitPrice * avgFinalUnitPrice': ('avgOriginalUnitPrice', 'avgFinalUnitPrice'),
    'clickVolume*avgFinalUnitPrice': ('clickVolume', 'avgFinalUnitPrice'),
}
not_scaled_df2 = not_scaled_df.copy()
for term_name, (left, right) in interaction_terms.items():
    not_scaled_df2[term_name] = not_scaled_df[left] * not_scaled_df[right]

# Standardise all columns together, keeping the column labels.
ss = StandardScaler()
feature_names = not_scaled_df2.columns
not_scaled_df2 = pd.DataFrame(ss.fit_transform(not_scaled_df2), columns=feature_names)

X_train, y_train = not_scaled_df2, y

# Estimator and hyper-parameter grid (ccp_alpha held at 0, random_state at 24).
gbr = GradientBoostingRegressor()
param_grid = dict(
    n_estimators=[500, 1000, 1500],
    max_depth=[1, 3, 5],
    learning_rate=[0.005, 0.01, 0.05, 0.1],
    min_samples_split=[2, 4, 8],
    ccp_alpha=[0],
    random_state=[24],
)

# Exhaustive search scored by negated MSE with 10-fold cross-validation.
search2 = GridSearchCV(
    estimator=gbr,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    cv=10,
    return_train_score=True,
)
search2.fit(X_train, y_train)

# Report the winning parameters and their mean CV test MSE (sign flipped back to positive).
best_idx = search2.best_index_
best_cv_mse = np.abs(search2.cv_results_['mean_test_score'][best_idx])
print(f"The best parameter for the current model is {search2.best_params_}")
print(f"The corresponding test MSE is {best_cv_mse}")

The best parameter for the current model is {'ccp_alpha': 0, 'learning_rate': 0.1, 'max_depth': 3, 'min_samples_split': 8, 'n_estimators': 1000, 'random_state': 24}
The corresponding test MSE is 130.5225702866221


<h4> 2.12 Grid Search on scaled_df with product and brandID <br>
Interacting terms: avgOriginalUnitPrice * avgFinalUnitPrice, clickVolume * avgFinalUnitPrice, attribute1 * avgFinalUnitPrice, attribute2 * avgFinalUnitPrice, attribute1 * avgOriginalUnitPrice, plus * meanPurchasePower, plus * meanUserLevel, meanUserLevel * meanPurchasePower

In [17]:
# Experiment 2.12: every column of not_scaled_df plus eight interaction terms.

# New column name -> the two raw columns whose product forms it (insertion order kept).
interaction_terms = {
    'avgOriginalUnitPrice * avgFinalUnitPrice': ('avgOriginalUnitPrice', 'avgFinalUnitPrice'),
    'clickVolume*avgFinalUnitPrice': ('clickVolume', 'avgFinalUnitPrice'),
    'attribute1*avgFinalUnitPrice': ('attribute1', 'avgFinalUnitPrice'),
    'attribute2*avgFinalUnitPrice': ('attribute2', 'avgFinalUnitPrice'),
    'attribute1*avgOriginalUnitPrice': ('attribute1', 'avgOriginalUnitPrice'),
    'plus*meanPurchasePower': ('plus', 'meanPurchasePower'),
    'plus*meanUserLevel': ('plus', 'meanUserLevel'),
    'meanUserLevel*meanPurchasePower': ('meanUserLevel', 'meanPurchasePower'),
}
not_scaled_df3 = not_scaled_df.copy()
for term_name, (left, right) in interaction_terms.items():
    # Every factor is an original column, so it is read from the untouched source frame.
    not_scaled_df3[term_name] = not_scaled_df[left] * not_scaled_df[right]

# Standardise all columns together, keeping the column labels.
ss = StandardScaler()
feature_names = not_scaled_df3.columns
not_scaled_df3 = pd.DataFrame(ss.fit_transform(not_scaled_df3), columns=feature_names)

X_train, y_train = not_scaled_df3, y

# Estimator and hyper-parameter grid (ccp_alpha held at 0, random_state at 24).
gbr = GradientBoostingRegressor()
param_grid = dict(
    n_estimators=[500, 1000, 1500],
    max_depth=[1, 3, 5],
    learning_rate=[0.005, 0.01, 0.05, 0.1],
    min_samples_split=[2, 4, 8],
    ccp_alpha=[0],
    random_state=[24],
)

# Exhaustive search scored by negated MSE with 10-fold cross-validation.
search3 = GridSearchCV(
    estimator=gbr,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    cv=10,
    return_train_score=True,
)
search3.fit(X_train, y_train)

# Report the winning parameters and their mean CV test MSE (sign flipped back to positive).
best_idx = search3.best_index_
best_cv_mse = np.abs(search3.cv_results_['mean_test_score'][best_idx])
print(f"The best parameter for the current model is {search3.best_params_}")
print(f"The corresponding test MSE is {best_cv_mse}")

The best parameter for the current model is {'ccp_alpha': 0, 'learning_rate': 0.05, 'max_depth': 3, 'min_samples_split': 8, 'n_estimators': 1500, 'random_state': 24}
The corresponding test MSE is 125.18154478043498


<h4> Output the 10-Fold CV of the best model

In [16]:
# Re-run 10-fold CV for the chosen configuration only (features of experiment 2.12,
# single-point parameter grid), so the per-fold scores can be read from search3.

# New column name -> the two raw columns whose product forms it (insertion order kept).
interaction_terms = {
    'avgOriginalUnitPrice * avgFinalUnitPrice': ('avgOriginalUnitPrice', 'avgFinalUnitPrice'),
    'clickVolume*avgFinalUnitPrice': ('clickVolume', 'avgFinalUnitPrice'),
    'attribute1*avgFinalUnitPrice': ('attribute1', 'avgFinalUnitPrice'),
    'attribute2*avgFinalUnitPrice': ('attribute2', 'avgFinalUnitPrice'),
    'attribute1*avgOriginalUnitPrice': ('attribute1', 'avgOriginalUnitPrice'),
    'plus*meanPurchasePower': ('plus', 'meanPurchasePower'),
    'plus*meanUserLevel': ('plus', 'meanUserLevel'),
    'meanUserLevel*meanPurchasePower': ('meanUserLevel', 'meanPurchasePower'),
}
not_scaled_df3 = not_scaled_df.copy()
for term_name, (left, right) in interaction_terms.items():
    # Every factor is an original column, so it is read from the untouched source frame.
    not_scaled_df3[term_name] = not_scaled_df[left] * not_scaled_df[right]

# Standardise all columns together, keeping the column labels.
ss = StandardScaler()
feature_names = not_scaled_df3.columns
not_scaled_df3 = pd.DataFrame(ss.fit_transform(not_scaled_df3), columns=feature_names)

X_train, y_train = not_scaled_df3, y

# Estimator and a grid containing exactly one candidate.
gbr = GradientBoostingRegressor()
param_grid = dict(
    n_estimators=[1500],
    max_depth=[3],
    learning_rate=[0.05],
    min_samples_split=[8],
    ccp_alpha=[0],
    random_state=[24],
)

# Search scored by negated MSE with 10-fold cross-validation.
search3 = GridSearchCV(
    estimator=gbr,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    cv=10,
    return_train_score=True,
)
search3.fit(X_train, y_train)

# Report the parameters and their mean CV test MSE (sign flipped back to positive).
best_idx = search3.best_index_
best_cv_mse = np.abs(search3.cv_results_['mean_test_score'][best_idx])
print(f"The best parameter for the current model is {search3.best_params_}")
print(f"The corresponding test MSE is {best_cv_mse}")

The best parameter for the current model is {'ccp_alpha': 0, 'learning_rate': 0.05, 'max_depth': 3, 'min_samples_split': 8, 'n_estimators': 1500, 'random_state': 24}
The corresponding test MSE is 125.18154478043498


In [17]:
# Per-fold test MSE of the selected candidate in search3.
# cv_results_['split<i>_test_score'] is an array with one entry per parameter candidate,
# so the best candidate is indexed explicitly. Assigning the whole array to a scalar slot
# only works for a single-candidate search, relies on a NumPy conversion that is deprecated,
# and raises ValueError when search3 holds more than one candidate.
fold_count = search3.n_splits_  # number of CV folds actually used by the search
res = np.zeros(shape=(fold_count, 1))
for i in range(fold_count):
    string = f'split{i}_test_score'
    # Scores are negated MSE; flip the sign back to a positive MSE.
    res[i][0] = -search3.cv_results_[string][search3.best_index_]

print(f'The 10-Fold CV test results for GBR model is \n{res}')
print(f'The mean CV score is {np.mean(res)}')

The 10-Fold CV test results for GBR model is 
[[148.60576424]
 [ 83.87541483]
 [ 98.86185944]
 [183.87481472]
 [ 97.39857891]
 [117.98422292]
 [153.38190614]
 [114.95826304]
 [ 92.91580774]
 [159.95881583]]
The mean CV score is 125.18154478043498


<h4> Output the 10-Fold CV of the best model with outliers

In [3]:
# Same configuration as the best model, but evaluated on the data WITHOUT the
# sales <= 200 filter, i.e. with the outliers left in.
(ss, not_scaled_df, scaled_df, data_train) = make_dataset()

# Separate the target from the predictors.
y = not_scaled_df['sales']
not_scaled_df = not_scaled_df.drop(columns='sales')

# New column name -> the two raw columns whose product forms it (insertion order kept).
interaction_terms = {
    'avgOriginalUnitPrice * avgFinalUnitPrice': ('avgOriginalUnitPrice', 'avgFinalUnitPrice'),
    'clickVolume*avgFinalUnitPrice': ('clickVolume', 'avgFinalUnitPrice'),
    'attribute1*avgFinalUnitPrice': ('attribute1', 'avgFinalUnitPrice'),
    'attribute2*avgFinalUnitPrice': ('attribute2', 'avgFinalUnitPrice'),
    'attribute1*avgOriginalUnitPrice': ('attribute1', 'avgOriginalUnitPrice'),
    'plus*meanPurchasePower': ('plus', 'meanPurchasePower'),
    'plus*meanUserLevel': ('plus', 'meanUserLevel'),
    'meanUserLevel*meanPurchasePower': ('meanUserLevel', 'meanPurchasePower'),
}
not_scaled_df4 = not_scaled_df.copy()
for term_name, (left, right) in interaction_terms.items():
    # Every factor is an original column, so it is read from the untouched source frame.
    not_scaled_df4[term_name] = not_scaled_df[left] * not_scaled_df[right]

# Standardise all columns together, keeping the column labels.
ss = StandardScaler()
feature_names = not_scaled_df4.columns
not_scaled_df4 = pd.DataFrame(ss.fit_transform(not_scaled_df4), columns=feature_names)

X_train, y_train = not_scaled_df4, y

# Estimator and a grid containing exactly one candidate.
gbr = GradientBoostingRegressor()
param_grid = dict(
    n_estimators=[1500],
    max_depth=[3],
    learning_rate=[0.05],
    min_samples_split=[8],
    ccp_alpha=[0],
    random_state=[24],
)

# Search scored by negated MSE with 10-fold cross-validation.
search4 = GridSearchCV(
    estimator=gbr,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    cv=10,
    return_train_score=True,
)
search4.fit(X_train, y_train)

# Report the parameters and their mean CV test MSE (sign flipped back to positive).
best_idx = search4.best_index_
best_cv_mse = np.abs(search4.cv_results_['mean_test_score'][best_idx])
print(f"The best parameter for the current model is {search4.best_params_}")
print(f"The corresponding test MSE is {best_cv_mse}")

The best parameter for the current model is {'ccp_alpha': 0, 'learning_rate': 0.05, 'max_depth': 3, 'min_samples_split': 8, 'n_estimators': 1500, 'random_state': 24}
The corresponding test MSE is 252.39323561615038


In [4]:
# Per-fold test MSE of the selected candidate in search4 (data with outliers kept).
# cv_results_['split<i>_test_score'] is an array with one entry per parameter candidate,
# so the best candidate is indexed explicitly. Assigning the whole array to a scalar slot
# only works for a single-candidate search, relies on a NumPy conversion that is deprecated,
# and raises ValueError when search4 holds more than one candidate.
fold_count = search4.n_splits_  # number of CV folds actually used by the search
res = np.zeros(shape=(fold_count, 1))
for i in range(fold_count):
    string = f'split{i}_test_score'
    # Scores are negated MSE; flip the sign back to a positive MSE.
    res[i][0] = -search4.cv_results_[string][search4.best_index_]

print(f'The 10-Fold CV test results for GBR model is \n{res}')
print(f'The mean CV score is {np.mean(res)}')

The 10-Fold CV test results for GBR model is 
[[179.09100358]
 [ 89.71699795]
 [246.31021515]
 [187.22086067]
 [191.23905197]
 [123.44685211]
 [371.34105968]
 [133.66190064]
 [812.3279994 ]
 [189.57641501]]
The mean CV score is 252.39323561615038
