In [323]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import statsmodels.api as sm
from sklearn.linear_model import  Ridge
from sklearn.linear_model import  Lasso
from sklearn.linear_model import ElasticNet
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sb
import plotly.express as px
plt.style.use('default')
import warnings
warnings.filterwarnings('ignore')

In [324]:
train = pd.read_csv("D:\\PROGRAMMING\\Datasets\\F\\Big Mart Sales\\train.csv")
test = pd.read_csv("D:\\PROGRAMMING\\Datasets\\F\\Big Mart Sales\\test.csv")

In [325]:
train.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


In [326]:
test.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type
0,FDW58,20.75,Low Fat,0.007565,Snack Foods,107.8622,OUT049,1999,Medium,Tier 1,Supermarket Type1
1,FDW14,8.3,reg,0.038428,Dairy,87.3198,OUT017,2007,,Tier 2,Supermarket Type1
2,NCN55,14.6,Low Fat,0.099575,Others,241.7538,OUT010,1998,,Tier 3,Grocery Store
3,FDQ58,7.315,Low Fat,0.015388,Snack Foods,155.034,OUT017,2007,,Tier 2,Supermarket Type1
4,FDY38,,Regular,0.118599,Dairy,234.23,OUT027,1985,Medium,Tier 3,Supermarket Type3


> Separating the independent and dependent variables

In [327]:
X = train.loc[:, ['Outlet_Establishment_Year', 'Item_MRP']]
y = train[['Item_Outlet_Sales']]

> Significance Level Checking

In [328]:
## Adding constant to the independent
X = sm.add_constant(X)

est = sm.OLS(y, X).fit()
print(est.summary())

                            OLS Regression Results                            
Dep. Variable:      Item_Outlet_Sales   R-squared:                       0.325
Model:                            OLS   Adj. R-squared:                  0.325
Method:                 Least Squares   F-statistic:                     2050.
Date:                Thu, 13 Jul 2023   Prob (F-statistic):               0.00
Time:                        15:19:29   Log-Likelihood:                -73849.
No. Observations:                8523   AIC:                         1.477e+05
Df Residuals:                    8520   BIC:                         1.477e+05
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                                coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------------
const                 

> Find Coefficients | P-Value | Confidence Interval

In [329]:
## Coefficients
est.params

const                        21157.872142
Outlet_Establishment_Year      -10.596715
Item_MRP                        15.560175
dtype: float64

In [330]:
## P-Value
est.pvalues

const                        5.534568e-09
Outlet_Establishment_Year    5.419773e-09
Item_MRP                     0.000000e+00
dtype: float64

In [331]:
## Confidence Interval
est.conf_int()

Unnamed: 0,0,1
const,14051.485825,28264.258458
Outlet_Establishment_Year,-14.153743,-7.039688
Item_MRP,15.081996,16.038353


> Train Test Split

In [332]:
X.head()

Unnamed: 0,const,Outlet_Establishment_Year,Item_MRP
0,1.0,1999,249.8092
1,1.0,2009,48.2692
2,1.0,1999,141.618
3,1.0,1998,182.095
4,1.0,1987,53.8614


In [333]:
X = X.drop(['const'], axis = 1)
X_train, X_cv, y_train, y_cv = train_test_split(X, y)
print(X_train.shape)
print(X_cv.shape)
print(y_train.shape)
print(y_cv.shape)

(6392, 2)
(2131, 2)
(6392, 1)
(2131, 1)


> Training The Model

In [334]:
mlra = LinearRegression()
mlra.fit(X_train, y_train)

y_predict = mlra.predict(X_cv)

dfa = pd.DataFrame({'Act':y_cv.values.flatten(), 'Pred':y_predict.flatten()})

px.scatter(dfa, 'Act', 'Pred', trendline='ols', trendline_color_override='blue')

In [335]:
## Calculating MSE
mse = np.mean((y_predict - y_cv)**2)
mse

1909787.7384689609

In [336]:
## Coefficients
from pandas import Series, DataFrame


coeff = DataFrame(X_train.columns)
coeff['Coefficients estimate'] = Series(mlra.coef_.reshape(-1))
coeff

Unnamed: 0,0,Coefficients estimate
0,Outlet_Establishment_Year,-10.370129
1,Item_MRP,15.445155


In [337]:
## R2
mlra.score(X_cv, y_cv) * 100

34.134251028299744

In [338]:
'''
It is showing the 32% of variance in sales in explained by Establishment and MRP.

In other words if you know Establishment and MRP, you will have 32% information to make accurate
prediction about sales
'''

'\nIt is showing the 32% of variance in sales in explained by Establishment and MRP.\n\nIn other words if you know Establishment and MRP, you will have 32% information to make accurate\nprediction about sales\n'

> Linear Regression with more Variables

In [339]:
train['Item_Weight'].fillna((train['Item_Weight'].mean()), inplace=True)    ## filling nans
X = train.loc[:, ['Outlet_Establishment_Year', 'Item_MRP', 'Item_Weight']]
y = train[['Item_Outlet_Sales']]
## Train Test
X_train, X_cv, y_train, y_cv = train_test_split(X, y)

## Training model
mlrb = LinearRegression()
mlrb.fit(X_train, y_train)

## Predicting
y_predict = mlrb.predict(X_cv)


dfa = pd.DataFrame({'Act':y_cv.values.flatten(), 'Pred':y_predict.flatten()})

px.scatter(dfa, 'Act', 'Pred', trendline='ols', trendline_color_override='blue')

In [340]:
## Calculating mse
mse = np.mean((y_predict - y_cv)**2)
print(mse)

print(mlrb.score(X_cv, y_cv)*100)

coeff = DataFrame(X_train.columns)
coeff['Coeff_est'] = Series(mlrb.coef_.reshape(-1))
coeff

1966261.4674293366
33.4067661185247


Unnamed: 0,0,Coeff_est
0,Outlet_Establishment_Year,-9.247258
1,Item_MRP,15.508262
2,Item_Weight,-0.119287


In [341]:
'''
Hence the --> mse reduced , result in increase in R square
'''

'\nHence the --> mse reduced , result in increase in R square\n'

>> Ridge & Lasso

In [342]:
# X = pd.DataFrame(train)
# X = X.select_dtypes(include='number')
# X.drop(['Item_Outlet_Sales'], axis = 1, inplace=True)
# y = train[['Item_Outlet_Sales']]
# X_train, X_cv, y_train, y_cv = train_test_split(X, y)

> Ridge

In [343]:
ridge_reg = Ridge(alpha=0.05, normalize=True)
ridge_reg.fit(X_train, y_train)

ridge_pred = ridge_reg.predict(X_cv)

## mse
mse = np.mean((ridge_pred - y_cv)**2)
mse


1969958.8003738404

In [344]:
coeff_rr = DataFrame(X_train.columns)
coeff_rr['Coeff_est'] = Series(ridge_reg.coef_.reshape(-1))
coeff_rr

Unnamed: 0,0,Coeff_est
0,Outlet_Establishment_Year,-8.781724
1,Item_MRP,14.768916
2,Item_Weight,0.193118


In [345]:
print("R square is : ", ridge_reg.score(X_cv, y_cv))

R square is :  0.332815450522578


> lasso

In [348]:
lasso_reg = Lasso(alpha=0.3, normalize=True)
lasso_reg.fit(X_train, y_train)
lasso_pred = lasso_reg.predict(X_cv)

mse = mean_squared_error(y_cv, lasso_pred)
mse

1970199.5086723096

In [349]:
r2_score(y_cv, lasso_pred)

0.33273392757009845