In [1]:
from clean_tabular import CleanData
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error

pd.set_option('display.max_columns', 30)
pd.set_option('display.max_rows', 50)

In [2]:
tab_class = CleanData(tab_names=['Products'])
explanatory = 'major_category'
df_comb = tab_class.expand_category()
df_comb['price'] = df_comb['price'].astype(np.float32)
df_comb = df_comb[df_comb['price'].astype(int) != 0]
df_comb = df_comb.loc[:, ['price', explanatory]]
df_comb = pd.get_dummies(df_comb, drop_first=True, prefix=None)
df_comb.columns = df_comb.columns.str.removeprefix(f'{explanatory}_')


{0: 'Home & Garden ', 1: 'Baby & Kids Stuff ', 2: 'DIY Tools & Materials ', 3: 'Music, Films, Books & Games ', 4: 'Phones, Mobile Phones & Telecoms ', 5: 'Clothes, Footwear & Accessories ', 6: 'Other Goods ', 7: 'Health & Beauty ', 8: 'Sports, Leisure & Travel ', 9: 'Appliances ', 10: 'Computers & Software ', 11: 'Office Furniture & Equipment ', 12: 'Video Games & Consoles '}
{'Home & Garden ': 0, 'Baby & Kids Stuff ': 1, 'DIY Tools & Materials ': 2, 'Music, Films, Books & Games ': 3, 'Phones, Mobile Phones & Telecoms ': 4, 'Clothes, Footwear & Accessories ': 5, 'Other Goods ': 6, 'Health & Beauty ': 7, 'Sports, Leisure & Travel ': 8, 'Appliances ': 9, 'Computers & Software ': 10, 'Office Furniture & Equipment ': 11, 'Video Games & Consoles ': 12}
Encoder {'Home & Garden ': 0, 'Baby & Kids Stuff ': 1, 'DIY Tools & Materials ': 2, 'Music, Films, Books & Games ': 3, 'Phones, Mobile Phones & Telecoms ': 4, 'Clothes, Footwear & Accessories ': 5, 'Other Goods ': 6, 'Health & Beauty ': 7, 'S

In [3]:
df_comb[df_comb['price'] == 0].count()

price                                0
Baby & Kids Stuff                    0
Clothes, Footwear & Accessories      0
Computers & Software                 0
DIY Tools & Materials                0
Health & Beauty                      0
Home & Garden                        0
Music, Films, Books & Games          0
Office Furniture & Equipment         0
Other Goods                          0
Phones, Mobile Phones & Telecoms     0
Sports, Leisure & Travel             0
Video Games & Consoles               0
dtype: int64

In [6]:
df_comb.columns

Index(['price', 'Baby & Kids Stuff ', 'Clothes, Footwear & Accessories ',
       'Computers & Software ', 'DIY Tools & Materials ', 'Health & Beauty ',
       'Home & Garden ', 'Music, Films, Books & Games ',
       'Office Furniture & Equipment ', 'Other Goods ',
       'Phones, Mobile Phones & Telecoms ', 'Sports, Leisure & Travel ',
       'Video Games & Consoles '],
      dtype='object')

In [5]:
df_comb.head()
model = LinearRegression()
X = df_comb.drop(['price'], axis=1)
y = df_comb['price']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
model.fit(X_train, y_train)

print('R-Squared of Regression model: ', model.score(X_train, y_train))
print('Mean Squared Error of Training Sample: ', mean_squared_error(y_train, model.predict(X_train)))
print('Mean Absolute Error of Training Sample: ', mean_absolute_error(y_train, model.predict(X_train)))
print('Mean Squared Error of Training Sample: ', mean_squared_error(y_test, model.predict(X_test)))
print('Mean Absolute Error of Training Sample: ', mean_absolute_error(y_test, model.predict(X_test)))

R-Squared of Regression model:  0.0034885593599468523
Mean Squared Error of Training Sample:  20934052997.268883
Mean Absolute Error of Training Sample:  5622.612642283437
Mean Squared Error of Training Sample:  666222320.407611
Mean Absolute Error of Training Sample:  3420.972867138972


In [6]:
print(model.coef_)
print(model.feature_names_in_)

[ -233.0437974    -52.8287567    916.79269396  3170.84849241
   224.46310299  -117.22842964  -199.89082951  1234.74400352
   -82.30156247   -45.49090427 34770.03342575  -108.76701407]
['Baby & Kids Stuff ' 'Clothes, Footwear & Accessories '
 'Computers & Software ' 'DIY Tools & Materials ' 'Health & Beauty '
 'Home & Garden ' 'Music, Films, Books & Games '
 'Office Furniture & Equipment ' 'Other Goods '
 'Phones, Mobile Phones & Telecoms ' 'Sports, Leisure & Travel '
 'Video Games & Consoles ']
