In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from dataset import make_dataset
from dataset import Predicted_Profit
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn import svm
from sklearn.metrics import mean_squared_error as MSE
from sklearn.preprocessing import StandardScaler
from scipy import stats
import scipy.stats as sct
from scipy.stats import lognorm

<h1> 0. Comparison of Profit Level

In [26]:
# Run this cell only after other cells have been run
Level_table = {'Model': ['RF_Norm', 'RF_Exp', 'RF_log', \
                         'DT_Norm', 'DT_Exp', 'DT_log', \
                         'SVM_Norm', 'SVM_Exp', 'SVM_log', \
                         'lasso_Norm', 'lasso_Exp', 'lasso_log',\
                         'GBR_Norm', 'GBR_Exp', 'GBR_log'],
               'Profit': [Profit_Norm_RF, Profit_Exp_RF, Profit_log_RF,\
                          Profit_Norm_dt, Profit_Exp_dt, Profit_log_dt,\
                          Profit_Norm_svm, Profit_Exp_svm, Profit_log_svm,\
                          Profit_Norm_L, Profit_Exp_L, Profit_log_L,\
                          Profit_Norm_GBR, Profit_Exp_GBR, Profit_log_GBR],
               'Level': [Level_Norm_RF, Level_Exp_RF,Level_log_RF,\
                         Level_Norm_dt, Level_Exp_dt, Level_log_dt,\
                         Level_Norm_svm, Level_Exp_svm, Level_log_svm,\
                         Level_Norm_L, Level_Exp_L, Level_log_L,\
                         Level_Norm_GBR, Level_Exp_GBR, Level_log_GBR]}

Level_table = pd.DataFrame(Level_table)
Level_table_sorted = Level_table.sort_values(by='Level', ascending=False)
Level_table_sorted = Level_table_sorted.reset_index(drop=True)
Level_table_sorted

Unnamed: 0,Model,Profit,Level
0,GBR_Exp,50228.0,0.758548
1,GBR_Norm,49864.0,0.753051
2,RF_Exp,49784.0,0.751842
3,RF_Norm,48748.0,0.736197
4,SVM_Exp,48588.0,0.73378
5,SVM_Norm,48560.0,0.733357
6,lasso_Exp,47576.0,0.718497
7,lasso_Norm,46572.0,0.703335
8,RF_log,44188.0,0.667331
9,DT_Exp,43804.0,0.661532


<h1> 1. General Processing of data

In [2]:
ss, not_scaled_df, scaled_df, data_train = make_dataset()
data_train = scaled_df
data_train = data_train[data_train['sales'] <= 200]
data_train = data_train.reset_index(drop=True)
y_train = data_train['sales']
x_train = data_train.drop('sales',axis=1)



In [3]:
p=20
c=12
s=8
overage = c-s
underage = p-c
critical = (p-c)/(p-s)
critical

0.6666666666666666

<h1> 2. Random Forest - Model 2.1

In [4]:
X_train_RF,X_test_RF,Y_train_RF,Y_test_RF = train_test_split(x_train,y_train,test_size=0.2,random_state=24)
rf= RandomForestRegressor(ccp_alpha= 0, max_depth =25, max_features=43, n_estimators=200, random_state = 24)
rf.fit(X_train_RF,Y_train_RF)
train_pred_RF = rf.predict(X_train_RF)

In [5]:
# Create table for comparison
results_RF= pd.DataFrame()
results_RF["Test_sales"] = Y_test_RF

# Max profit = True sales * (Profit - Cost)
results_RF["Max_profit"] = results_RF["Test_sales"] * (p-c)
results_RF =results_RF.reset_index(drop=True)

In [6]:
# Stand deviation of the training sales
ndata_RF = len(X_train_RF)
estimator_RF = 200
std_est_RF=np.sqrt(np.sum((train_pred_RF-Y_train_RF)**2)/(ndata_RF-estimator_RF-1))

# Predicted sales for testing sets
Y_pred_RF = rf.predict(X_test_RF).round()
Y_pred_RF[Y_pred_RF<0] = 0
results_RF["Predicted_sales"] = Y_pred_RF

# Normal
results_RF["Norm_I"] =(sct.norm.ppf(critical)*std_est_RF + Y_pred_RF).round()
results_RF["Norm_profit"] =Predicted_Profit(results_RF["Test_sales"],results_RF["Norm_I"])

#exponential
results_RF["Exp_I"] = (-Y_pred_RF * np.log(1 -critical)).round()
results_RF["Exp_profit"] =Predicted_Profit(results_RF["Test_sales"] , results_RF["Exp_I"])

#lognormal
results_RF["lognormal_I"] = (lognorm.ppf(critical,std_est_RF) + Y_pred_RF).round()
results_RF["lognormal_profit"] = Predicted_Profit(results_RF["Test_sales"] , results_RF["lognormal_I"])

results_RF.head()

Unnamed: 0,Test_sales,Max_profit,Predicted_sales,Norm_I,Norm_profit,Exp_I,Exp_profit,lognormal_I,lognormal_profit
0,4,32,5.0,7.0,20.0,5.0,28.0,12.0,0.0
1,7,56,23.0,25.0,-16.0,25.0,-16.0,30.0,-36.0
2,82,656,47.0,49.0,392.0,52.0,416.0,54.0,432.0
3,30,240,32.0,34.0,224.0,35.0,220.0,39.0,204.0
4,146,1168,70.0,72.0,576.0,77.0,616.0,77.0,616.0


In [7]:
# Normal
Profit_Norm_RF = sum(results_RF['Norm_profit'])
Level_Norm_RF = Profit_Norm_RF  / sum(results_RF['Max_profit'])
# Expontial
Profit_Exp_RF = sum(results_RF['Exp_profit'])
Level_Exp_RF = Profit_Exp_RF / sum(results_RF['Max_profit'])
# lognoraml
Profit_log_RF = sum(results_RF['lognormal_profit'])
Level_log_RF = Profit_log_RF / sum(results_RF['Max_profit'])

print (f'The level of profit achieved by Normal is {Level_Norm_RF}')
print (f'The level of profit achieved by Exponential is {Level_Exp_RF}')
print (f'The level of profit achieved by lognormal is {Level_log_RF}')

The level of profit achieved by Normal is 0.7361966896218437
The level of profit achieved by Exponential is 0.7518424549957714
The level of profit achieved by lognormal is 0.6673311586323547


<h1> 2. Decision Tree - Model 2.2

In [8]:
# additional precessing of X_train for Decision tree
x_train = x_train.drop(columns=[col for col in x_train.columns if col.startswith('productID_')], inplace=False)
X_train_dt,X_test_dt,Y_train_dt,Y_test_dt = train_test_split(x_train,y_train,test_size=0.2,random_state=24)
dt= DecisionTreeRegressor(ccp_alpha= 0, max_depth =4, min_samples_leaf=4, min_samples_split=2, random_state = 24)
dt.fit(X_train_dt,Y_train_dt)
train_pred_dt=dt.predict(X_train_dt)

In [9]:
# Create table for comparison
results_dt= pd.DataFrame()
results_dt["Test_sales"] = Y_test_dt

# Max profit = True sales * (Profit - Cost)
results_dt["Max_profit"] = results_dt["Test_sales"] * (p-c)
results_dt =results_dt.reset_index(drop=True)

# Stand deviation of the training sales
ndata_dt = len(X_train_dt)
estimator_dt = 1
std_est_dt=np.sqrt(np.sum((train_pred_dt-Y_train_dt)**2)/(ndata_dt-estimator_dt-1))

# Predicted sales for testing sets
Y_pred_dt = dt.predict(X_test_dt).round()
Y_pred_dt[Y_pred_dt<0] = 0
results_dt["Predicted_sales"] = Y_pred_dt

# Normal
results_dt["Norm_I"] =(sct.norm.ppf(critical)*std_est_dt + Y_pred_dt).round()
results_dt["Norm_profit"] =Predicted_Profit(results_dt["Test_sales"],results_dt["Norm_I"])

#exponential
results_dt["Exp_I"] = (-Y_pred_dt * np.log(1 -critical)).round()
results_dt["Exp_profit"] =Predicted_Profit(results_dt["Test_sales"] , results_dt["Exp_I"])

#lognormal
results_dt["lognormal_I"] = (lognorm.ppf(critical,std_est_dt) + Y_pred_dt).round()
results_dt["lognormal_profit"] = Predicted_Profit(results_dt["Test_sales"] , results_dt["lognormal_I"])


results_dt.head()

Unnamed: 0,Test_sales,Max_profit,Predicted_sales,Norm_I,Norm_profit,Exp_I,Exp_profit,lognormal_I,lognormal_profit
0,4,32,6.0,12.0,0.0,7.0,20.0,262.0,-1000.0
1,7,56,19.0,25.0,-16.0,21.0,0.0,275.0,-1016.0
2,82,656,46.0,52.0,416.0,51.0,408.0,302.0,-224.0
3,30,240,31.0,37.0,212.0,34.0,224.0,287.0,-788.0
4,146,1168,85.0,91.0,728.0,93.0,744.0,341.0,388.0


In [10]:
# Normal
Profit_Norm_dt = sum(results_dt['Norm_profit'])
Level_Norm_dt = Profit_Norm_dt  / sum(results_dt['Max_profit'])
# Expontial
Profit_Exp_dt = sum(results_dt['Exp_profit'])
Level_Exp_dt = Profit_Exp_dt / sum(results_dt['Max_profit'])
# lognoraml
Profit_log_dt = sum(results_dt['lognormal_profit'])
Level_log_dt = Profit_log_dt / sum(results_dt['Max_profit'])

print (f'The level of profit achieved by Normal is {Level_Norm_dt}')
print (f'The level of profit achieved by Exponential is {Level_Exp_dt}')
print (f'The level of profit achieved by lognormal is {Level_log_dt}')

The level of profit achieved by Normal is 0.6252265313519391
The level of profit achieved by Exponential is 0.6615319560227135
The level of profit achieved by lognormal is -5.791047480971367


<h1> 3. Support Vector Machine - Model 2.1

In [11]:
X_train_svm,X_test_svm,Y_train_svm,Y_test_svm = train_test_split(x_train,y_train,test_size=0.2,random_state=24)
svm_model = svm.SVR(C=100, kernel='rbf', degree=2, gamma='scale')
svm_model.fit(X_train_svm,Y_train_svm)
train_pred_svm=svm_model.predict(X_train_svm)

In [12]:
# Create table for comparison
results_svm= pd.DataFrame()
results_svm["Test_sales"] = Y_test_svm

# Max profit = True sales * (Profit - Cost)
results_svm["Max_profit"] = results_svm["Test_sales"] * (p-c)
results_svm =results_svm.reset_index(drop=True)

# Stand deviation of the training sales
ndata_svm = len(X_train_svm)
estimator_svm = 1
std_est_svm=np.sqrt(np.sum((train_pred_svm-Y_train_svm)**2)/(ndata_svm-estimator_svm-1))

# Predicted sales for testing sets
Y_pred_svm = svm_model.predict(X_test_svm).round()
Y_pred_svm[Y_pred_svm<0] = 0
results_svm["Predicted_sales"] = Y_pred_svm


# Normal
results_svm["Norm_I"] =(sct.norm.ppf(critical)*std_est_svm + Y_pred_svm).round()
results_svm["Norm_profit"] =Predicted_Profit(results_svm["Test_sales"],results_svm["Norm_I"])

#exponential
results_svm["Exp_I"] = (-Y_pred_svm * np.log(1 -critical)).round()
results_svm["Exp_profit"] =Predicted_Profit(results_svm["Test_sales"] , results_svm["Exp_I"])

#lognormal
results_svm["lognormal_I"] = (lognorm.ppf(critical,std_est_svm) + Y_pred_svm).round()
results_svm["lognormal_profit"] = Predicted_Profit(results_svm["Test_sales"] , results_svm["lognormal_I"])

results_svm.head()

Unnamed: 0,Test_sales,Max_profit,Predicted_sales,Norm_I,Norm_profit,Exp_I,Exp_profit,lognormal_I,lognormal_profit
0,4,32,6.0,9.0,12.0,7.0,20.0,33.0,-84.0
1,7,56,8.0,11.0,40.0,9.0,48.0,35.0,-56.0
2,82,656,53.0,56.0,448.0,58.0,464.0,80.0,640.0
3,30,240,38.0,41.0,196.0,42.0,192.0,65.0,100.0
4,146,1168,51.0,54.0,432.0,56.0,448.0,78.0,624.0


In [13]:
# Normal
Profit_Norm_svm = sum(results_svm['Norm_profit'])
Level_Norm_svm = Profit_Norm_svm  / sum(results_svm['Max_profit'])
# Expontial
Profit_Exp_svm = sum(results_svm['Exp_profit'])
Level_Exp_svm = Profit_Exp_svm / sum(results_svm['Max_profit'])
# lognoraml
Profit_log_svm = sum(results_svm['lognormal_profit'])
Level_log_svm = Profit_log_svm / sum(results_svm['Max_profit'])

print (f'The level of profit achieved by Normal is {Level_Norm_svm}')
print (f'The level of profit achieved by Exponential is {Level_Exp_svm}')
print (f'The level of profit achieved by lognormal is {Level_log_svm}')

The level of profit achieved by Normal is 0.7333574966775401
The level of profit achieved by Exponential is 0.7337803552011598
The level of profit achieved by lognormal is 0.23933792436873264


<h1> 4. Linear Model - Model 2.2.2 Ridge

In [14]:
# adding interracting terms
ss, not_scaled_df, scaled_df, data_train = make_dataset()
not_scaled_df = not_scaled_df[not_scaled_df['sales'] <= 200]
not_scaled_df = not_scaled_df.reset_index(drop=True)
y = not_scaled_df.sales
not_scaled_df = not_scaled_df.drop(['sales'], axis = 1)
not_scaled_df2 = not_scaled_df.copy()
not_scaled_df2['avgOriginalUnitPrice * avgFinalUnitPrice'] = not_scaled_df['avgOriginalUnitPrice'] * not_scaled_df['avgFinalUnitPrice']
not_scaled_df2['clickVolume*avgFinalUnitPrice'] = not_scaled_df['clickVolume'] * not_scaled_df['avgFinalUnitPrice']

# scaling
scaled_df2 = pd.DataFrame(ss.fit_transform(not_scaled_df2), columns = not_scaled_df2.columns)



In [15]:
y_train_L = y
x_train_L = scaled_df2

X_train_L,X_test_L,Y_train_L,Y_test_L = train_test_split(x_train_L,y_train_L,test_size=0.2,random_state=24)
value = 0.1
ridge=Ridge(alpha=value)
ridge.fit(X_train_L, Y_train_L)
train_pred_L=ridge.predict(X_train_L)

In [16]:
# Create table for comparison
results_L= pd.DataFrame()
results_L["Test_sales"] = Y_test_L

# Max profit = True sales * (Profit - Cost)
results_L["Max_profit"] = results_L["Test_sales"] * (p-c)
results_L =results_L.reset_index(drop=True)

# Stand deviation of the training sales
ndata_L = len(X_train_L)
estimator_L = len(X_train_L.columns)
std_est_L=np.sqrt(np.sum((train_pred_L-Y_train_L)**2)/(ndata_L-estimator_L-1))

# Predicted sales for testing sets
Y_pred_L = ridge.predict(X_test_L).round()
Y_pred_L[Y_pred_L<0] = 0
results_L["Predicted_sales"] = Y_pred_L

# Normal
results_L["Norm_I"] =(sct.norm.ppf(critical)*std_est_L + Y_pred_L).round()
results_L["Norm_profit"] =Predicted_Profit(results_L["Test_sales"],results_L["Norm_I"])

#exponential
results_L["Exp_I"] = (-Y_pred_L * np.log(1 -critical)).round()
results_L["Exp_profit"] =Predicted_Profit(results_L["Test_sales"] , results_L["Exp_I"])

#lognormal
results_L["lognormal_I"] = (lognorm.ppf(critical,std_est_L) + Y_pred_L).round()
results_L["lognormal_profit"] = Predicted_Profit(results_L["Test_sales"] , results_L["lognormal_I"])

results_L.head()

Unnamed: 0,Test_sales,Max_profit,Predicted_sales,Norm_I,Norm_profit,Exp_I,Exp_profit,lognormal_I,lognormal_profit
0,4,32,6.0,11.0,4.0,7.0,20.0,205.0,-772.0
1,7,56,4.0,9.0,48.0,4.0,32.0,203.0,-728.0
2,82,656,53.0,58.0,464.0,58.0,464.0,252.0,-24.0
3,30,240,37.0,42.0,192.0,41.0,196.0,236.0,-584.0
4,146,1168,110.0,115.0,920.0,121.0,968.0,309.0,516.0


In [17]:
# Normal
Profit_Norm_L = sum(results_L['Norm_profit'])
Level_Norm_L = Profit_Norm_L  / sum(results_L['Max_profit'])
# Expontial
Profit_Exp_L = sum(results_L['Exp_profit'])
Level_Exp_L = Profit_Exp_L / sum(results_L['Max_profit'])
# lognoraml
Profit_log_L = sum(results_L['lognormal_profit'])
Level_log_L = Profit_log_L / sum(results_L['Max_profit'])

print (f'The level of profit achieved by Normal is {Level_Norm_L}')
print (f'The level of profit achieved by Exponential is {Level_Exp_L}')
print (f'The level of profit achieved by lognormal is {Level_log_L}')

The level of profit achieved by Normal is 0.7033345415005436
The level of profit achieved by Exponential is 0.7184970399903347
The level of profit achieved by lognormal is -4.289416455237405


<h1> 5. Gradient Boosting - Model 2.12

In [21]:
ss, not_scaled_df, scaled_df, data_train = make_dataset()
not_scaled_df3 = not_scaled_df.copy()



In [22]:
# adding interracting terms
ss, not_scaled_df, scaled_df, data_train = make_dataset()
not_scaled_df3 = not_scaled_df.copy()
not_scaled_df3 = not_scaled_df3[not_scaled_df3['sales'] <= 200]
not_scaled_df3 = not_scaled_df3.reset_index(drop=True)
not_scaled_df3['avgOriginalUnitPrice * avgFinalUnitPrice'] = not_scaled_df['avgOriginalUnitPrice'] * not_scaled_df['avgFinalUnitPrice']
not_scaled_df3['clickVolume*avgFinalUnitPrice'] = not_scaled_df['clickVolume'] * not_scaled_df['avgFinalUnitPrice']
not_scaled_df3['attribute1*avgFinalUnitPrice'] = not_scaled_df3['attribute1'] * not_scaled_df3['avgFinalUnitPrice']
not_scaled_df3['attribute2*avgFinalUnitPrice'] = not_scaled_df3['attribute2'] * not_scaled_df3['avgFinalUnitPrice']
not_scaled_df3['attribute1*avgOriginalUnitPrice'] = not_scaled_df3['attribute1'] * not_scaled_df3['avgOriginalUnitPrice']
not_scaled_df3['plus*meanPurchasePower'] = not_scaled_df3['plus'] * not_scaled_df3['meanPurchasePower']
not_scaled_df3['plus*meanUserLevel'] = not_scaled_df3['plus'] * not_scaled_df3['meanUserLevel']
not_scaled_df3['meanUserLevel*meanPurchasePower'] = not_scaled_df3['meanUserLevel'] * not_scaled_df3['meanPurchasePower']
y = not_scaled_df3.sales
not_scaled_df3 = not_scaled_df3.drop(['sales'], axis = 1)

# scaling
scaled_df3 = pd.DataFrame(ss.fit_transform(not_scaled_df3), columns = not_scaled_df3.columns)



In [23]:
y_train_GBR = y
x_train_GBR = scaled_df3

X_train_GBR,X_test_GBR,Y_train_GBR,Y_test_GBR = train_test_split(x_train_GBR,y_train_GBR,test_size=0.2,random_state=24)
gbr = GradientBoostingRegressor(ccp_alpha= 0, learning_rate= 0.05, max_depth = 3, min_samples_split = 8, n_estimators=1500, random_state = 24)
gbr.fit(X_train_GBR,Y_train_GBR)
train_pred_GBR = gbr.predict(X_train_GBR)

In [24]:
# Create table for comparison
results_GBR= pd.DataFrame()
results_GBR["Test_sales"] = Y_test_GBR


# Max profit = True sales * (Profit - Cost)
results_GBR["Max_profit"] = results_GBR["Test_sales"] * (p-c)
results_GBR =results_GBR.reset_index(drop=True)

# Stand deviation of the training sales
ndata_GBR = len(X_train_GBR)
estimator_GBR = 1500

std_est_GBR=np.sqrt(np.sum((train_pred_GBR-Y_train_GBR)**2)/(ndata_GBR-estimator_GBR-1))

# Predicted sales for testing sets
Y_pred_GBR = gbr.predict(X_test_GBR).round()
Y_pred_GBR[Y_pred_GBR<0] = 0
results_GBR["Predicted_sales"] = Y_pred_GBR

# Normal
results_GBR["Norm_I"] =(sct.norm.ppf(critical)*std_est_GBR + Y_pred_GBR).round()
results_GBR["Norm_profit"] =Predicted_Profit(results_GBR["Test_sales"],results_GBR["Norm_I"])

#exponential
results_GBR["Exp_I"] = (-Y_pred_GBR * np.log(1 -critical)).round()
results_GBR["Exp_profit"] =Predicted_Profit(results_GBR["Test_sales"] , results_GBR["Exp_I"])

#lognormal
results_GBR["lognormal_I"] = (lognorm.ppf(critical,std_est_GBR) + Y_pred_GBR).round()
results_GBR["lognormal_profit"] = Predicted_Profit(results_GBR["Test_sales"] , results_GBR["lognormal_I"])

results_GBR.head()

Unnamed: 0,Test_sales,Max_profit,Predicted_sales,Norm_I,Norm_profit,Exp_I,Exp_profit,lognormal_I,lognormal_profit
0,4,32,2.0,5.0,28.0,2.0,16.0,15.0,-12.0
1,7,56,15.0,18.0,12.0,16.0,20.0,28.0,-28.0
2,82,656,42.0,45.0,360.0,46.0,368.0,55.0,440.0
3,30,240,27.0,30.0,240.0,30.0,240.0,40.0,200.0
4,146,1168,108.0,111.0,888.0,119.0,952.0,121.0,968.0


In [25]:
# Normal
Profit_Norm_GBR = sum(results_GBR['Norm_profit'])
Level_Norm_GBR = Profit_Norm_GBR  / sum(results_GBR['Max_profit'])
# Expontial
Profit_Exp_GBR = sum(results_GBR['Exp_profit'])
Level_Exp_GBR = Profit_Exp_GBR / sum(results_GBR['Max_profit'])
# lognoraml
Profit_log_GBR = sum(results_GBR['lognormal_profit'])
Level_log_GBR = Profit_log_GBR / sum(results_GBR['Max_profit'])

print (f'The level of profit achieved by Normal is {Level_Norm_GBR}')
print (f'The level of profit achieved by Exponential is {Level_Exp_GBR}')
print (f'The level of profit achieved by Lognormal is {Level_log_GBR}')

The level of profit achieved by Normal is 0.7530506222061133
The level of profit achieved by Exponential is 0.758547783013169
The level of profit achieved by Lognormal is 0.5732753413072369
