In [8]:
import numpy as np
import pandas as pd
from sklearn import svm
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from scipy import stats
import scipy.stats as sct
import re

<h1> Final model

In [27]:
MSE_sale = {'Model': ['2.1', '2.2'],
        'MSE': [142.50591835930155, 159.4901935628937]}
MSE_sale = pd.DataFrame(MSE_sale)
MSE_sorted = MSE_sale.sort_values(by='MSE')
MSE_sorted

Unnamed: 0,Model,MSE
0,2.1,142.505918
1,2.2,159.490194


Based on the result, the best model is Model 2.1 
- X: All features with Categorical (weekdays, Product IDs and Brand IDs)
- Y: sales
- Parameters: {'C': 100, 'degree': 2, 'gamma': 'scale', 'kernel': 'rbf'}


<h1> 1 Data Preprocess

<h2> 1.1 Import Data

In [28]:
from dataset import make_dataset
ss, not_scaled_df, scaled_df, data_train = make_dataset()

In [29]:
scaled_df.head()

Unnamed: 0,productID_0,productID_1,productID_2,productID_3,productID_6,productID_7,productID_8,productID_9,productID_10,productID_13,...,ma14SalesVolume,meanAge,gender,meanEducation,maritalStatus,plus,meanPurchasePower,meanUserLevel,meanCityLevel,sales
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.948957,-0.822878,0.524839,-0.176485,1.066933,-1.260993,-0.205651,-0.229769,0.157289,7
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.239057,0.123052,-0.2647,0.885324,0.092044,-0.370702,0.346301,0.205788,-0.711543,14
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.25392,0.231618,0.666149,1.231029,0.776884,1.459339,-0.280918,0.380011,-0.302681,60
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.583414,1.731132,0.968955,1.416228,1.864569,1.187306,1.036241,0.205788,-2.14256,2
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.041857,0.312237,-0.881527,-0.495027,1.066933,-1.260993,1.036241,-0.914216,1.35321,15


<h2> 1.2 Filter Data

In [30]:
# Dataset used:

X = scaled_df[scaled_df['sales'] <= 200]
X = X.drop('sales', axis=1)
y = scaled_df[scaled_df['sales'] <= 200].sales

<h1> 2. Support Vector Machine

<h2> 2.1 Full dataset

In [6]:
# Kernal: 'linear', 'rbf'

# Creating the model
model = svm.SVR()

# Parameter grid to vary
param_grid = {
    'C': [0.1, 1, 10, 100, 1000],
    'kernel': ['linear', 'rbf'], #['linear', 'rbf', 'poly'],
    'degree': [2], #[2, 3, 4, 5],
    'gamma': ['scale', 'auto']
}

# Creating the object to carry out GridSearchCV, with 10-Fold Cross Validation
grid_search = GridSearchCV(model, param_grid=param_grid, scoring='neg_mean_squared_error', n_jobs=-1, cv=10, return_train_score=True)
grid_search.fit(X,y)

# Output the best parameters and its corresponding test MSE
print(f'The best parameter for the current model is {grid_search.best_params_}')
print("The corresponding test MSE is {res}".format(res = grid_search.cv_results_['mean_test_score'][grid_search.best_index_]))

The best parameter for the current model is {'C': 100, 'degree': 2, 'gamma': 'scale', 'kernel': 'rbf'}
The corresponding test MSE is -142.50591835930155


In [17]:
# Kernal: 'poly'

# Creating the model
model = svm.SVR()

# Parameter grid to vary
param_grid = {
    'C': [0.1, 1, 10, 100, 1000],
    'kernel': ['poly'], #['linear', 'rbf', 'poly'],
    'degree': [2, 3, 4, 5],
    'gamma': ['scale', 'auto']
}

# Creating the object to carry out GridSearchCV, with 10-Fold Cross Validation
grid_search = GridSearchCV(model, param_grid=param_grid, scoring='neg_mean_squared_error', n_jobs=-1, cv=10, return_train_score=True)
grid_search.fit(X,y)

# Output the best parameters and its corresponding test MSE
print(f'The best parameter for the current model is {grid_search.best_params_}')
print("The corresponding test MSE is {res}".format(res = grid_search.cv_results_['mean_test_score'][grid_search.best_index_]))

The best parameter for the current model is {'C': 100, 'degree': 2, 'gamma': 'scale', 'kernel': 'poly'}
The corresponding test MSE is -179.13539939008317


By comparing the MSE, the best model for this dataset has the parameters {'C': 100, 'degree': 2, 'gamma': 'scale', 'kernel': 'rbf'} and its MSE is 142.50591835930155

<h2> 2.2 Dataset with removal of “productID" and "brandID"

In [22]:
temp_X = not_scaled_df[not_scaled_df['sales'] <= 200]
temp_X = temp_X.drop('sales', axis=1)

all_col = not_scaled_df.columns.values
to_remove_col = []
for col_name in all_col: 
    if "productID" in col_name or "brandID" in col_name: 
        to_remove_col.append(col_name)
temp_X = temp_X.drop(to_remove_col, axis=1)
temp_X = pd.DataFrame(ss.fit_transform(temp_X), columns=temp_X.columns)
temp_X.head()

Unnamed: 0,weekday_1,weekday_2,weekday_3,weekday_4,weekday_5,weekday_6,weekday_7,attribute1,attribute2,attribute3,...,avgFinalUnitPrice,ma14SalesVolume,meanAge,gender,meanEducation,maritalStatus,plus,meanPurchasePower,meanUserLevel,meanCityLevel
0,-0.380703,-0.378356,-0.400763,-0.444044,-0.439644,2.321294,-0.380703,0.083784,-0.776838,-0.45268,...,-0.178696,-0.94855,-0.822744,0.523442,-0.175039,1.066339,-1.258411,-0.205973,-0.229006,0.157147
1,-0.380703,-0.378356,-0.400763,-0.444044,-0.439644,-0.430794,2.626717,0.083784,-0.776838,-0.903213,...,-0.797647,0.239192,0.122138,-0.265167,0.885464,0.092679,-0.368471,0.345224,0.20604,-0.710447
2,-0.380703,-0.378356,-0.400763,-0.444044,-0.439644,2.321294,-0.380703,0.083784,-0.776838,-0.677946,...,-0.93986,1.253823,0.230584,0.664585,1.230744,0.776655,1.46085,-0.281137,0.380059,-0.302168
3,-0.380703,-0.378356,-0.400763,-0.444044,-0.439644,2.321294,-0.380703,0.083784,0.737,-0.227413,...,0.535205,-0.583091,1.728437,0.967035,1.415715,1.862969,1.188924,1.03422,0.20604,-2.139426
4,-0.380703,-0.378356,-0.400763,-0.444044,2.27457,-0.430794,-0.380703,0.083784,0.232387,-0.227413,...,-0.472029,0.042037,0.311114,-0.881268,-0.49319,1.066339,-1.258411,1.03422,-0.91265,1.351365


In [24]:
X = temp_X

In [25]:
# Kernal: 'linear', 'rbf'

# Creating the model
model = svm.SVR()

# Parameter grid to vary
param_grid = {
    'C': [0.1, 1, 10, 100, 1000],
    'kernel': ['linear', 'rbf'], #['linear', 'rbf', 'poly'],
    'degree': [2], #[2, 3, 4, 5],
    'gamma': ['scale', 'auto']
}

# Creating the object to carry out GridSearchCV, with 10-Fold Cross Validation
grid_search = GridSearchCV(model, param_grid=param_grid, scoring='neg_mean_squared_error', n_jobs=-1, cv=10, return_train_score=True)
grid_search.fit(X,y)

# Output the best parameters and its corresponding test MSE
print(f'The best parameter for the current model is {grid_search.best_params_}')
print("The corresponding test MSE is {res}".format(res = grid_search.cv_results_['mean_test_score'][grid_search.best_index_]))

The best parameter for the current model is {'C': 100, 'degree': 2, 'gamma': 'scale', 'kernel': 'rbf'}
The corresponding test MSE is -159.4901935628937


In [26]:
# Kernal: 'poly'

# Creating the model
model = svm.SVR()

# Parameter grid to vary
param_grid = {
    'C': [0.1, 1, 10, 100, 1000],
    'kernel': ['poly'], #['linear', 'rbf', 'poly'],
    'degree': [2, 3, 4, 5],
    'gamma': ['scale', 'auto']
}

# Creating the object to carry out GridSearchCV, with 10-Fold Cross Validation
grid_search = GridSearchCV(model, param_grid=param_grid, scoring='neg_mean_squared_error', n_jobs=-1, cv=10, return_train_score=True)
grid_search.fit(X,y)

# Output the best parameters and its corresponding test MSE
print(f'The best parameter for the current model is {grid_search.best_params_}')
print("The corresponding test MSE is {res}".format(res = grid_search.cv_results_['mean_test_score'][grid_search.best_index_]))

The best parameter for the current model is {'C': 10, 'degree': 3, 'gamma': 'auto', 'kernel': 'poly'}
The corresponding test MSE is -255.22431263592966


By comparing the MSE, the best model for this dataset has the parameters {'C': 100, 'degree': 2, 'gamma': 'scale', 'kernel': 'rbf'} and its MSE is 159.4901935628937