In [47]:
from clean_tabular import CleanData
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error

pd.set_option('display.max_columns', 30)
pd.set_option('display.max_rows', 50)

In [48]:
tab_class = CleanData.allTables()
explanatory = 'major_category'
tab_class.try_merge(df_list=['products', 'products_2'])
df_comb = tab_class.expand_category(df='combined')
df_comb['price'] = df_comb['price'].astype(np.float32)
df_comb = df_comb[df_comb['price'].astype(int) != 0]
df_comb = df_comb.loc[:, ['price', explanatory]]
df_comb = pd.get_dummies(df_comb, drop_first=True, prefix=None)
df_comb.columns = df_comb.columns.str.removeprefix(f'{explanatory}_')


                                     id  \
0  ac2140ae-f0d5-4fe7-ac08-df0f109fd734   
1  243809c0-9cfc-4486-ad12-3b7a16605ba9   
2  1c58d3f9-8b93-47ea-9415-204fcc2a22e6   
3  860673f1-57f6-47ba-8d2f-13f9e05b8f9a   
4  59948726-29be-4b35-ade5-bb2fd7331856   

                                        product_name  \
0  Second-Hand Sofas, Couches & Armchairs for Sal...   
1  Mirror wall art | in Wokingham, Berkshire | Gu...   
2  Stainless Steel Food Steamer | in Inverness, H...   
3  Sun loungers | in Skegness, Lincolnshire | Gum...   
4  Coffee side table from Ammunition ammo box hai...   

                                            category  \
0                                                N/A   
1  Home & Garden / Dining, Living Room Furniture ...   
2              Home & Garden / Other Household Goods   
3  Home & Garden / Garden & Patio / Outdoor Setti...   
4  Home & Garden / Dining, Living Room Furniture ...   

                                 product_description    price  \
0 

In [49]:
df_comb[df_comb['price'] == 0].count()

price                                0
Baby & Kids Stuff                    0
Clothes, Footwear & Accessories      0
Computers & Software                 0
DIY Tools & Materials                0
Health & Beauty                      0
Home & Garden                        0
Music, Films, Books & Games          0
Office Furniture & Equipment         0
Other Goods                          0
Phones, Mobile Phones & Telecoms     0
Sports, Leisure & Travel             0
Video Games & Consoles               0
dtype: int64

In [50]:
df_comb.head()
model = LinearRegression()
X = df_comb.drop(['price'], axis=1)
y = df_comb['price']
y = df_comb.applymap(np.sqrt)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
model.fit(X_train, y_train)

print('R-Squared of Regression model: ', model.score(X_train, y_train))
print('Mean Squared Error of Training Sample: ', mean_squared_error(y_train, model.predict(X_train)))
print('Mean Absolute Error of Training Sample: ', mean_absolute_error(y_train, model.predict(X_train)))
print('Mean Squared Error of Training Sample: ', mean_squared_error(y_test, model.predict(X_test)))
print('Mean Absolute Error of Training Sample: ', mean_absolute_error(y_test, model.predict(X_test)))

R-Squared of Regression model:  0.9237442081514224
Mean Squared Error of Training Sample:  212.65039699814665
Mean Absolute Error of Training Sample:  0.6976285539807693
Mean Squared Error of Training Sample:  53.82577581669736
Mean Absolute Error of Training Sample:  0.5862685222266409


In [51]:
print(model.coef_)
print(model.feature_names_in_)

[[-6.48473563e+00 -2.07125492e+00  6.44429649e+00 -5.07911713e-01
  -3.81763668e-01 -1.90098728e+00 -7.28880611e+00  7.87621317e+00
  -2.39704552e+00  1.40736954e+00  1.02726244e+01 -1.65672379e+00]
 [ 1.00000000e+00 -2.04916499e-16 -6.88244916e-17 -1.01753985e-16
   4.25639590e-17 -1.68903667e-16 -7.13260415e-17 -3.54179406e-17
  -8.15178973e-17 -9.07392576e-17 -4.91799034e-17 -9.21155535e-17]
 [-1.41036682e-16  1.00000000e+00  1.66533454e-16 -5.55111512e-17
  -1.66533454e-16 -3.88578059e-16  3.88578059e-16  1.66533454e-16
   1.11022302e-16 -4.71844785e-16 -3.05311332e-16  5.55111512e-17]
 [ 6.24065833e-16  9.43689571e-16  1.00000000e+00  2.77555756e-16
  -1.26287869e-15 -4.02455846e-16  2.91433544e-16  4.44089210e-16
   5.82867088e-16  3.74700271e-16  3.60822483e-16  2.35922393e-16]
 [ 3.23585467e-16  6.10622664e-16  3.88578059e-16  1.00000000e+00
   1.80411242e-16 -6.93889390e-17 -2.77555756e-17  3.33066907e-16
  -4.92661467e-16 -2.08166817e-17 -2.56739074e-16 -1.24900090e-16]
 [ 4.