In [1]:
from clean_tabular import CleanData
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Cap how many columns and rows pandas renders when a frame is displayed.
pd.options.display.max_columns = 30
pd.options.display.max_rows = 50

In [3]:
# Build the modelling frame: price plus indicator columns for one categorical
# explanatory variable.
tab_class = CleanData(tab_names=['Products'])
explanatory = 'major_category'

# NOTE(review): expand_category() is defined in clean_tabular; presumably it
# returns the product table with the category split into columns such as
# `major_category` -- confirm against that module.
products = tab_class.expand_category()
products['price'] = products['price'].astype(np.float32)

# Keep only rows whose price does not truncate to 0 when cast to int
# (this also drops fractional prices strictly between -1 and 1).
has_price = products['price'].astype(int) != 0

# Encode the explanatory column as indicator columns; drop_first removes the
# first level so the remaining indicators are measured against that baseline.
encoded = pd.get_dummies(
    products.loc[has_price, ['price', explanatory]],
    drop_first=True,
    prefix=None,
)

# get_dummies names indicators '<column>_<level>'; keep just the level.
encoded.columns = encoded.columns.str.removeprefix(f'{explanatory}_')
df_comb = encoded


                                     id  \
0  ac2140ae-f0d5-4fe7-ac08-df0f109fd734   
1  243809c0-9cfc-4486-ad12-3b7a16605ba9   
2  1c58d3f9-8b93-47ea-9415-204fcc2a22e6   
3  860673f1-57f6-47ba-8d2f-13f9e05b8f9a   
4  59948726-29be-4b35-ade5-bb2fd7331856   

                                        product_name  \
0  Second-Hand Sofas, Couches & Armchairs for Sal...   
1  Mirror wall art | in Wokingham, Berkshire | Gu...   
2  Stainless Steel Food Steamer | in Inverness, H...   
3  Sun loungers | in Skegness, Lincolnshire | Gum...   
4  Coffee side table from Ammunition ammo box hai...   

                                            category  \
0                                                N/A   
1  Home & Garden / Dining, Living Room Furniture ...   
2              Home & Garden / Other Household Goods   
3  Home & Garden / Garden & Patio / Outdoor Setti...   
4  Home & Garden / Dining, Living Room Furniture ...   

                                 product_description    price  \
0 

In [4]:
# Sanity check: per-column count of rows whose price is exactly zero.
zero_price_mask = df_comb['price'].eq(0)
df_comb.loc[zero_price_mask].count()

price                                0
Baby & Kids Stuff                    0
Clothes, Footwear & Accessories      0
Computers & Software                 0
DIY Tools & Materials                0
Health & Beauty                      0
Home & Garden                        0
Music, Films, Books & Games          0
Office Furniture & Equipment         0
Other Goods                          0
Phones, Mobile Phones & Telecoms     0
Sports, Leisure & Travel             0
Video Games & Consoles               0
dtype: int64

In [5]:
# Fit a linear regression of price on the category indicator columns and
# report error metrics on both the training and the held-out sample.
SPLIT_SEED = 42  # fixed so the split, and every metric below, is reproducible on re-run

model = LinearRegression()
X = df_comb.drop(['price'], axis=1)
y = df_comb['price']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=SPLIT_SEED
)
model.fit(X_train, y_train)

# Predict once per sample and reuse the result for both error metrics.
train_pred = model.predict(X_train)
test_pred = model.predict(X_test)

# model.score is evaluated on the training split here, so this R-squared is in-sample.
print('R-Squared of Regression model: ', model.score(X_train, y_train))
print('Mean Squared Error of Training Sample: ', mean_squared_error(y_train, train_pred))
print('Mean Absolute Error of Training Sample: ', mean_absolute_error(y_train, train_pred))
# The next two metrics use the held-out split, so they are labelled as test metrics.
print('Mean Squared Error of Test Sample: ', mean_squared_error(y_test, test_pred))
print('Mean Absolute Error of Test Sample: ', mean_absolute_error(y_test, test_pred))

R-Squared of Regression model:  0.0034885593599468523
Mean Squared Error of Training Sample:  20934052997.268883
Mean Absolute Error of Training Sample:  5622.612642283437
Mean Squared Error of Training Sample:  666222320.407611
Mean Absolute Error of Training Sample:  3420.972867138972


In [6]:
# Show the fitted coefficients, then the feature names in the same order.
print(model.coef_, model.feature_names_in_, sep='\n')

[ -233.0437974    -52.8287567    916.79269396  3170.84849241
   224.46310299  -117.22842964  -199.89082951  1234.74400352
   -82.30156247   -45.49090427 34770.03342575  -108.76701407]
['Baby & Kids Stuff ' 'Clothes, Footwear & Accessories '
 'Computers & Software ' 'DIY Tools & Materials ' 'Health & Beauty '
 'Home & Garden ' 'Music, Films, Books & Games '
 'Office Furniture & Equipment ' 'Other Goods '
 'Phones, Mobile Phones & Telecoms ' 'Sports, Leisure & Travel '
 'Video Games & Consoles ']
