In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import PolynomialFeatures
from dataset import make_dataset
from sklearn.metrics import mean_squared_error as MSE
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from scipy.stats import *

<h1> 0. Final Result


In [15]:
# Cross-validated MSE of every model, one (section label, MSE) row per model.
# The values are copied from the outputs of the corresponding cells in section 2.
MSE_sale = pd.DataFrame(
    [
        ('2.1.1', 1.067466099217631e+24),
        ('2.1.2', 2.4270306210874856e+24),
        ('2.1.3', 209.89150465202323),
        ('2.1.4', 209.56477778153803),
        ('2.2.1.Ridge', 202.01896754492043),
        ('2.2.1.Lasso', 202.3860511786856),
        ('2.2.2.Ridge', 167.89975376346064),
        ('2.2.2.Lasso', 167.97370141630745),
    ],
    columns=['Model', 'MSE'],
)
# Rank the models from lowest to highest MSE; the displayed copy gets a fresh 0..n-1 index.
MSE_sorted = MSE_sale.sort_values(by='MSE')
MSE_sorted.reset_index(drop=True)

Unnamed: 0,Model,MSE
0,2.2.2.Ridge,167.8998
1,2.2.2.Lasso,167.9737
2,2.2.1.Ridge,202.019
3,2.2.1.Lasso,202.3861
4,2.1.4,209.5648
5,2.1.3,209.8915
6,2.1.1,1.067466e+24
7,2.1.2,2.427031e+24


Based on these results, the best model is Model 2.2.2 with Ridge regression.
Its details are:
- X: Categorical attributes:
         weekdays, Product IDs and Brand IDs,
     Interacting terms: 
        'avgOriginalUnitPrice * avgFinalUnitPrice'
        'clickVolume * avgFinalUnitPrice'
- Y: sales
- alpha_value = 0.1

<h1> 1. Data Preprocess


<h2> 1.1 Import Data

In [3]:
# importing training data set
# make_dataset() is a project-local helper imported from dataset.py; its four
# return values are unpacked here.
# NOTE(review): judging only from how they are used later in this notebook,
# `ss` appears to be a scaler-like object exposing fit_transform, and
# `scaled_df` / `not_scaled_df` appear to be DataFrames that each contain a
# 'sales' column — confirm against dataset.py.
# `data_train` is not referenced in the visible part of this notebook.
ss, not_scaled_df, scaled_df, data_train = make_dataset()



<h2> 1.2 Filter Data

In [4]:
# Rows whose sales exceed this value are treated as outliers and removed.
SALES_OUTLIER_THRESHOLD = 200

# NOTE: this cell consumes the 'sales' column of both frames, so re-running it
# requires re-running the import cell first.

# Drop outliers for scaled_df
scaled_df = scaled_df[scaled_df['sales'] <= SALES_OUTLIER_THRESHOLD]
scaled_df = scaled_df.reset_index(drop=True)

# Separate the regression target from the predictors
y = scaled_df.sales
scaled_df = scaled_df.drop(['sales'], axis = 1)

# Drop outliers for not_scaled_df
not_scaled_df = not_scaled_df[not_scaled_df['sales'] <= SALES_OUTLIER_THRESHOLD]
not_scaled_df = not_scaled_df.reset_index(drop=True)

not_scaled_df = not_scaled_df.drop(['sales'], axis = 1)

# `y` is taken from scaled_df but is later paired, row by position, with frames
# built from not_scaled_df, so both filtered frames must keep the same rows.
assert len(not_scaled_df) == len(y), "scaled_df and not_scaled_df kept a different number of rows"

<h1> 2. Linear Models

<h2> 2.1 Preliminary Model Selection

<h4> 2.1.1 Multiple Linear Regression without any interaction terms

In [6]:
# 10-fold cross-validation for plain multiple linear regression
kf = KFold(n_splits=10, random_state = 24, shuffle = True)
def LR_KFold_MSE(dataset):
    """Return the per-fold test MSE of a LinearRegression fitted on `dataset`.

    The folds come from the module-level `kf` splitter and the target from the
    module-level `y`; rows of `dataset` are matched to `y` by position.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Predictor columns (rows are selected with .iloc).

    Returns
    -------
    list of float
        One MSE per fold, in the order the folds are generated.
    """
    fold_errors = []
    for fit_rows, holdout_rows in kf.split(dataset):
        model = LinearRegression()
        model.fit(dataset.iloc[fit_rows], y.iloc[fit_rows])
        holdout_pred = model.predict(dataset.iloc[holdout_rows])
        fold_errors.append(MSE(y.iloc[holdout_rows], holdout_pred))
    return fold_errors

res_MLR = LR_KFold_MSE(scaled_df)

# report the mean across folds, then display the individual fold errors
print("the average mse is ", np.mean(res_MLR))
res_MLR

the average mse is  1.067466099217631e+24


[255.60075495793268,
 256.45866670220147,
 191.7911919263693,
 120.28051205458145,
 102.84968910217285,
 140.5332353071733,
 284.0433948863636,
 304.5286385276101,
 202.0505282315341,
 1.067466099217631e+25]

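By observing the 10 per-fold MSE values, we notice that one of them is many orders of magnitude larger than the rest. This indicates potential non-linear relationships in the data set (such an extreme error in a single fold can also be a sign of numerical instability in the unregularized least-squares fit).
Thus, MLR is unlikely to be the best choice.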

<h4> 2.1.2 Polynomial Regression of degree 2 without any interaction terms

In [7]:
# define the function for 10 fold PR
kf = KFold(n_splits=10, random_state = 24, shuffle = True)
def Poly_KFold_MSE(dataset, degree):
    """Cross-validate a polynomial regression and return the test MSE of every fold.

    Each fold expands the predictors with PolynomialFeatures(degree) and fits a
    LinearRegression on the expanded training rows. Folds come from the
    module-level `kf` splitter and the target from the module-level `y`; rows
    of `dataset` are matched to `y` by position.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Predictor columns (rows are selected with .iloc).
    degree : int
        Degree passed to PolynomialFeatures.

    Returns
    -------
    list of float
        One MSE per fold, in the order the folds are generated.
    """
    mse_poly = []
    for train_index, test_index in kf.split(dataset):
        X_train = dataset.iloc[train_index]
        y_train = y.iloc[train_index]
        X_test = dataset.iloc[test_index]
        y_test = y.iloc[test_index]
        # transform polynomial features
        poly = PolynomialFeatures(degree=degree)
        X_poly_train = poly.fit_transform(X_train)
        # reuse the transformer fitted on the training fold; a preprocessing
        # step must never be refitted on the held-out rows
        X_poly_test = poly.transform(X_test)
        # multiple LR on the expanded features
        lm = LinearRegression()
        lm.fit(X_poly_train, y_train)
        mse_poly.append(MSE(y_test, lm.predict(X_poly_test)))
    return mse_poly
res_Poly = Poly_KFold_MSE(scaled_df,2)

print("the average mse is ", np.mean(res_Poly))
res_Poly

the average mse is  2.4270306210874856e+24


[4.196392339498994e+24,
 4.6325738512365183e+24,
 3.3001409405316393e+23,
 6.840952172628255e+23,
 4.5426706356849674e+23,
 1.8674280582746582e+24,
 1.5799997178770547e+24,
 3.315748254522411e+23,
 1.0750833829807135e+24,
 9.118877660670187e+24]

The very high MSE indicates that polynomial regression of degree 2 overfits the data.

<h4> 2.1.3 Ridge Regression without any interaction terms, taking alpha_values = [0.01,0.1,1,5,10,50,100]

In [8]:
# 10-fold cross-validation for Ridge regression at a fixed penalty strength
kf = KFold(n_splits=10, random_state = 24, shuffle = True)
def Ridge_KFold_MSE(dataset, alpha):
    """Return the per-fold test MSE of Ridge(alpha) fitted on `dataset`.

    Folds come from the module-level `kf` splitter and the target from the
    module-level `y`; rows of `dataset` are matched to `y` by position.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Predictor columns (rows are selected with .iloc).
    alpha : float
        Value passed to Ridge as `alpha`.

    Returns
    -------
    list of float
        One MSE per fold, in the order the folds are generated.
    """
    def fold_mse(fit_rows, holdout_rows):
        # fit on the training rows of this fold, score on its held-out rows
        model = Ridge(alpha=alpha)
        model.fit(dataset.iloc[fit_rows], y.iloc[fit_rows])
        return MSE(y.iloc[holdout_rows], model.predict(dataset.iloc[holdout_rows]))

    return [fold_mse(fit_rows, holdout_rows) for fit_rows, holdout_rows in kf.split(dataset)]
#list of alpha_values
alpha_values = [0.01,0.1,1,5,10,50,100]

# select the best alpha and its mse
best_alpha_Ridge = 0
# Start from +inf so that any finite CV error is accepted. A fixed sentinel
# such as 10**20 is smaller than errors this notebook actually produces
# (around 1e24 for the unregularized models), which would silently leave
# best_alpha_Ridge at 0.
best_mse_Ridge = np.inf
best_res_Ridge = None

for alpha in alpha_values:
    res_Ridge = Ridge_KFold_MSE(scaled_df, alpha)
    current_mse = np.mean(res_Ridge)
    # strict '<' keeps the first (smallest) alpha on ties
    if (current_mse < best_mse_Ridge):
        best_alpha_Ridge = alpha
        best_mse_Ridge = current_mse
        best_res_Ridge = res_Ridge

print("the best alpha is ", best_alpha_Ridge)
print("the best mse is ", best_mse_Ridge)
# per-fold MSE of the winning alpha, kept from the search instead of refitting
best_res_Ridge
    

the best alpha is  5
the best mse is  209.89150465202323


[255.9864151182656,
 252.00604344746017,
 190.45437531555424,
 107.67167100184155,
 100.45698355212701,
 126.1434932918734,
 286.1079020581504,
 297.22987825971495,
 199.67405983684532,
 283.1842246383995]

<h4> 2.1.4 Lasso Regression without any interaction terms, taking alpha_values = [0.01,0.1,1,5,10,50,100]

In [10]:
# define the function for 10 fold Lasso Regression
kf = KFold(n_splits=10, random_state = 24, shuffle = True)
def Lasso_KFold_MSE(dataset, alpha, max_iter=1000):
    """Cross-validate a Lasso model and return the test MSE of every fold.

    Folds come from the module-level `kf` splitter and the target from the
    module-level `y`; rows of `dataset` are matched to `y` by position.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Predictor columns (rows are selected with .iloc).
    alpha : float
        Value passed to Lasso as `alpha`.
    max_iter : int, default 1000
        Iteration cap forwarded to Lasso. The default equals scikit-learn's own
        default, so existing two-argument calls behave exactly as before.
        Pass a larger value when the fit reports that coordinate descent did
        not converge.

    Returns
    -------
    list of float
        One MSE per fold, in the order the folds are generated.
    """
    mse_lasso=[]
    for train_index, test_index in kf.split(dataset):
        X_train = dataset.iloc[train_index]
        y_train = y.iloc[train_index]
        X_test = dataset.iloc[test_index]
        y_test = y.iloc[test_index]
        lasso = Lasso(alpha=alpha, max_iter=max_iter)
        lasso.fit(X_train, y_train)
        mse_lasso.append(MSE(y_test,lasso.predict(X_test)))
    return mse_lasso

#list of alpha_values
alpha_values = [0.01,0.1,1,5,10,50,100]

# select the best alpha and its mse
best_alpha_Lasso = 0
# Start from +inf so that any finite CV error is accepted. A fixed sentinel
# such as 10**20 is smaller than errors this notebook actually produces
# (around 1e24 for the unregularized models), which would silently leave
# best_alpha_Lasso at 0.
best_mse_Lasso = np.inf
for alpha in alpha_values:
    res_Lasso = Lasso_KFold_MSE(scaled_df, alpha)
    current_mse = np.mean(res_Lasso)
    # strict '<' keeps the first (smallest) alpha on ties
    if (current_mse < best_mse_Lasso):
        best_alpha_Lasso = alpha
        best_mse_Lasso = current_mse

print("the best alpha is ", best_alpha_Lasso)
print("the best mse is ", best_mse_Lasso)


the best alpha is  0.01
the best mse is  209.56477778153803


In [11]:
# feature selection based on best lasso result, check columns whose coefficients are not zero
kf = KFold(n_splits=10, random_state = 24, shuffle = True)
for train_index, test_index in kf.split(scaled_df):
    # only the training part of each fold is needed here: the loop inspects
    # fitted coefficients and makes no predictions
    X_train = scaled_df.iloc[train_index]
    y_train = y.iloc[train_index]
    # use the alpha chosen by the Lasso search rather than repeating its value by hand
    lasso = Lasso(alpha=best_alpha_Lasso)
    lasso.fit(X_train, y_train)
    # coefficients with magnitude of at most 1e-4 are treated as zero
    print(scaled_df.columns[np.abs(lasso.coef_) > 0.0001])



Index(['productID_2', 'productID_3', 'productID_6', 'productID_7',
       'productID_9', 'productID_13', 'productID_14', 'productID_16',
       'productID_17', 'productID_20', 'productID_24', 'productID_26',
       'productID_27', 'productID_28', 'productID_33', 'productID_37',
       'productID_38', 'productID_40', 'productID_41', 'productID_43',
       'productID_45', 'productID_51', 'productID_52', 'productID_57',
       'productID_58', 'productID_62', 'productID_64', 'productID_65',
       'productID_68', 'productID_71', 'productID_72', 'productID_73',
       'productID_75', 'productID_76', 'productID_78', 'productID_81',
       'productID_82', 'productID_85', 'productID_87', 'productID_88',
       'productID_90', 'productID_91', 'productID_93', 'productID_96',
       'productID_97', 'productID_99', 'brandID_1', 'brandID_3', 'brandID_4',
       'brandID_5', 'brandID_8', 'brandID_9', 'brandID_10', 'brandID_11',
       'brandID_12', 'brandID_13', 'brandID_14', 'brandID_15', 'brandID_

This shows that all attributes are significant, as Lasso regression does not drop any attribute.

By observing the results of Lasso and Ridge, we notice that they are close. Thus, we will proceed by adding interaction terms using both methods. The interaction terms are chosen based on real-life relationships and the correlation matrix.
We will only keep those interaction terms that greatly decrease the MSE.

<h2> 2.2 Tuning Ridge & Lasso Regression with interaction terms, taking alpha_values = [0.01,0.1,1,5,10,50,100]

<h4> 2.2.1 Tuning using interaction terms  
<br>'avgOriginalUnitPrice * avgFinalUnitPrice'



In [12]:
# Feature matrix for model 2.2.1: the unscaled predictors plus one interaction
# column (product of the two price columns), appended as the last column.
not_scaled_df1 = not_scaled_df.assign(
    **{
        'avgOriginalUnitPrice * avgFinalUnitPrice':
            not_scaled_df['avgOriginalUnitPrice'] * not_scaled_df['avgFinalUnitPrice'],
    }
)

# Scale every column and restore the column labels. This refits the shared
# scaler `ss` on the extended frame.
# NOTE(review): the scaler is fitted on all rows before the cross-validation
# split, so held-out folds contribute to the scaling statistics.
not_scaled_df1 = pd.DataFrame(ss.fit_transform(not_scaled_df1), columns = not_scaled_df1.columns)


# Candidate regularisation strengths shared by both searches below
alpha_values = [0.01,0.1,1,5,10,50,100]

# Ridge Regression
best_alpha_Ridge1 = 0
# Start from +inf so that any finite CV error is accepted. A fixed sentinel
# such as 10**20 is smaller than errors this notebook actually produces
# (around 1e24 for the unregularized models).
best_mse_Ridge1 = np.inf
for alpha in alpha_values:
    res_Ridge1 = Ridge_KFold_MSE(not_scaled_df1, alpha)
    current_mse = np.mean(res_Ridge1)
    # strict '<' keeps the first (smallest) alpha on ties
    if (current_mse < best_mse_Ridge1):
        best_alpha_Ridge1 = alpha
        best_mse_Ridge1 = current_mse
print("Ridge: the best alpha is ", best_alpha_Ridge1)
print("Ridge: the best mse is ", best_mse_Ridge1)

# Lasso Regression
best_alpha_Lasso1 = 0
best_mse_Lasso1 = np.inf
for alpha in alpha_values:
    res_Lasso1 = Lasso_KFold_MSE(not_scaled_df1, alpha)
    current_mse = np.mean(res_Lasso1)
    if (current_mse < best_mse_Lasso1):
        best_alpha_Lasso1 = alpha
        best_mse_Lasso1 = current_mse
print("Lasso: the best alpha is ", best_alpha_Lasso1)
print("Lasso: the best mse is ", best_mse_Lasso1)


Ridge: the best alpha is  0.1
Ridge: the best mse is  202.01896754492043


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


Lasso: the best alpha is  0.01
Lasso: the best mse is  202.3860511786856


<h4> 2.2.2 Tuning using interaction terms 
<br>'avgOriginalUnitPrice * avgFinalUnitPrice'
<br>'clickVolume * avgFinalUnitPrice'


In [14]:
# Feature matrix for model 2.2.2: the unscaled predictors plus two interaction
# columns, appended in this order as the last two columns.
not_scaled_df2 = not_scaled_df.assign(
    **{
        'avgOriginalUnitPrice * avgFinalUnitPrice':
            not_scaled_df['avgOriginalUnitPrice'] * not_scaled_df['avgFinalUnitPrice'],
        'clickVolume*avgFinalUnitPrice':
            not_scaled_df['clickVolume'] * not_scaled_df['avgFinalUnitPrice'],
    }
)
# Scale every column and restore the column labels. This refits the shared
# scaler `ss` on the extended frame.
# NOTE(review): the scaler is fitted on all rows before the cross-validation
# split, so held-out folds contribute to the scaling statistics.
not_scaled_df2 = pd.DataFrame(ss.fit_transform(not_scaled_df2), columns = not_scaled_df2.columns)


# Candidate regularisation strengths shared by both searches below
alpha_values = [0.01,0.1,1,5,10,50,100]

# Ridge Regression
best_alpha_Ridge2 = 0
# Start from +inf so that any finite CV error is accepted. A fixed sentinel
# such as 10**20 is smaller than errors this notebook actually produces
# (around 1e24 for the unregularized models).
best_mse_Ridge2 = np.inf
for alpha in alpha_values:
    res_Ridge2 = Ridge_KFold_MSE(not_scaled_df2, alpha)
    current_mse = np.mean(res_Ridge2)
    # strict '<' keeps the first (smallest) alpha on ties
    if (current_mse < best_mse_Ridge2):
        best_alpha_Ridge2 = alpha
        best_mse_Ridge2 = current_mse
print("Ridge: the best alpha is ", best_alpha_Ridge2)
print("Ridge: the best mse is ", best_mse_Ridge2)

# Lasso Regression

best_alpha_Lasso2 = 0
best_mse_Lasso2 = np.inf
for alpha in alpha_values:
    res_Lasso2 = Lasso_KFold_MSE(not_scaled_df2, alpha)
    current_mse = np.mean(res_Lasso2)
    if (current_mse < best_mse_Lasso2):
        best_alpha_Lasso2 = alpha
        best_mse_Lasso2 = current_mse
print("Lasso: the best alpha is ", best_alpha_Lasso2)
print("Lasso: the best mse is ", best_mse_Lasso2)


Ridge: the best alpha is  0.1
Ridge: the best mse is  167.89975376346064


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


Lasso: the best alpha is  0.01
Lasso: the best mse is  167.97370141630745


We also tried more complicated models, but additional interaction terms failed to decrease the MSE significantly (we treat a decrease of more than 5 as significant). Considering the trade-off between model complexity and MSE, we choose these two interaction terms as the best combination.

Between Lasso and Ridge regression, the final MSE is very close. The reason for this might be:

By design, Ridge regression shrinks coefficients but keeps all attributes, while Lasso regression can shrink coefficients exactly to zero and thereby keep only the more significant attributes. However, in our case, Lasso kept all attributes, which leads to similar results.
