In [1]:
from clean_tabular import CleanData
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error

pd.set_option('display.max_columns', 30)
pd.set_option('display.max_rows', 50)

In [2]:
tab_class = CleanData.allTables()
explanatory = 'major_category'
tab_class.try_merge(df_list=['products', 'products_2'])
df_comb = tab_class.expand_category(df='combined')
df_comb['price'] = df_comb['price'].astype(np.float32)
df_comb = df_comb[df_comb['price'].astype(int) != 0]
df_comb = df_comb.loc[:, ['price', explanatory]]
df_comb = pd.get_dummies(df_comb, drop_first=True, prefix=None)
df_comb.columns = df_comb.columns.str.removeprefix(f'{explanatory}_')


                                     id  \
0  ac2140ae-f0d5-4fe7-ac08-df0f109fd734   
1  243809c0-9cfc-4486-ad12-3b7a16605ba9   
2  1c58d3f9-8b93-47ea-9415-204fcc2a22e6   
3  860673f1-57f6-47ba-8d2f-13f9e05b8f9a   
4  59948726-29be-4b35-ade5-bb2fd7331856   

                                        product_name  \
0  Second-Hand Sofas, Couches & Armchairs for Sal...   
1  Mirror wall art | in Wokingham, Berkshire | Gu...   
2  Stainless Steel Food Steamer | in Inverness, H...   
3  Sun loungers | in Skegness, Lincolnshire | Gum...   
4  Coffee side table from Ammunition ammo box hai...   

                                            category  \
0                                                N/A   
1  Home & Garden / Dining, Living Room Furniture ...   
2              Home & Garden / Other Household Goods   
3  Home & Garden / Garden & Patio / Outdoor Setti...   
4  Home & Garden / Dining, Living Room Furniture ...   

                                 product_description    price  \
0 

In [3]:
df_comb[df_comb['price'] == 0].count()

price                                0
Baby & Kids Stuff                    0
Clothes, Footwear & Accessories      0
Computers & Software                 0
DIY Tools & Materials                0
Health & Beauty                      0
Home & Garden                        0
Music, Films, Books & Games          0
Office Furniture & Equipment         0
Other Goods                          0
Phones, Mobile Phones & Telecoms     0
Sports, Leisure & Travel             0
Video Games & Consoles               0
dtype: int64

In [4]:
df_comb.head()
model = LinearRegression()
X = df_comb.drop(['price'], axis=1)
y = df_comb['price']
y = df_comb.applymap(np.sqrt)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
model.fit(X_train, y_train)

print('R-Squared of Regression model: ', model.score(X_train, y_train))
print('Mean Squared Error of Training Sample: ', mean_squared_error(y_train, model.predict(X_train)))
print('Mean Absolute Error of Training Sample: ', mean_absolute_error(y_train, model.predict(X_train)))
print('Mean Squared Error of Training Sample: ', mean_squared_error(y_test, model.predict(X_test)))
print('Mean Absolute Error of Training Sample: ', mean_absolute_error(y_test, model.predict(X_test)))

R-Squared of Regression model:  0.9237182331421967
Mean Squared Error of Training Sample:  206.06475528744684
Mean Absolute Error of Training Sample:  0.6682947112566462
Mean Squared Error of Training Sample:  69.12817024655936
Mean Absolute Error of Training Sample:  0.6297657261138472


In [5]:
print(model.coef_)
print(model.feature_names_in_)

[[-6.18263353e+00 -2.13880447e+00  5.76951626e+00 -1.88817395e-01
  -4.24398420e-01 -1.30854921e+00 -6.54392722e+00  7.90343910e+00
  -1.99192210e+00  2.63476734e+00  1.08925690e+01 -1.04642097e+00]
 [ 1.00000000e+00 -3.21133673e-16 -2.27583167e-16 -1.55399962e-16
  -1.88152104e-16 -2.14017782e-16 -1.93302525e-16 -1.97976604e-16
  -1.49187639e-16 -3.06600869e-16 -3.20056761e-16 -1.84565922e-16]
 [ 3.26416858e-16  1.00000000e+00  5.55111512e-17  5.55111512e-17
   0.00000000e+00  5.55111512e-17  0.00000000e+00 -2.77555756e-16
   2.22044605e-16  2.22044605e-16  2.49800181e-16  1.11022302e-16]
 [ 1.36824893e-16  1.66533454e-16  1.00000000e+00 -1.94289029e-16
  -1.38777878e-17  1.94289029e-16 -1.24900090e-16 -1.11022302e-16
   2.77555756e-17 -1.52655666e-16 -2.22044605e-16 -3.19189120e-16]
 [ 2.04669251e-16  1.66533454e-16 -1.24900090e-16  1.00000000e+00
   2.08166817e-16 -8.32667268e-17  3.46944695e-16 -8.32667268e-17
   1.52655666e-16  1.31838984e-16 -2.08166817e-16 -3.60822483e-16]
 [ 6.