In [1]:
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np
%matplotlib inline
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn import metrics

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
## Loading the data
df=pd.read_csv("D:\\ML Data\\Big_Mart_Train.csv")
df.sample(5)

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
3882,FDJ15,11.35,Regular,0.023322,Dairy,182.5608,OUT046,1997,Small,Tier 1,Supermarket Type1,3307.6944
7676,FDR33,7.31,Low Fat,0.026789,Snack Foods,110.357,OUT046,1997,Small,Tier 1,Supermarket Type1,3185.853
3661,FDF46,,Low Fat,0.093218,Snack Foods,116.7834,OUT027,1985,Medium,Tier 3,Supermarket Type3,4952.8862
1625,NCP06,20.7,Low Fat,0.039406,Household,149.8366,OUT018,2009,Medium,Tier 3,Supermarket Type2,2267.049
3190,FDL33,7.235,Low Fat,0.099879,Snack Foods,195.1452,OUT013,1987,High,Tier 3,Supermarket Type1,1761.7068


In [4]:
## Capping the outliers in the columns Outlet_Sales having values greater than 10000
df['Item_Outlet_Sales'].values[df['Item_Outlet_Sales'].values>10000]=10000

In [5]:
## Dealing with bivariate outliers
## We delete all the values which are greater than 8000
df=df[~(df['Item_Outlet_Sales']>8000)]

In [6]:
## Since Distribution is symmetric therefore we will fill values with mean
df['Item_Weight']=df['Item_Weight'].fillna(df['Item_Weight'].mean())

In [7]:
## It is visible that LF,Low Fat and low fat are same, similar to regular and reg and hence have to be replaced by a single
## term
df.Item_Fat_Content=df.Item_Fat_Content.replace({"LF":"Low Fat",
                                                 "low fat":"Low Fat",
                                                 "reg":"Regular"})

In [8]:
## It can be seen that Tier 2 cities just have small sized Outlets and hence its null values can be replaced with small
## outlets
df.loc[df['Outlet_Location_Type'] == 'Tier 2', 'Outlet_Size'] = 'Small'

In [9]:
## It can be seen that Supermarket Type 2 have outlet size as medium only and same is the case with Supermarket Type 3
## It can be also seen that Grocery Store also only have outlet size as small
## Hence they can be replaced to get rid of null values
df.loc[df['Outlet_Type'] == 'Supermarket Type 2', 'Outlet_Size'] = 'Medium'
df.loc[df['Outlet_Type'] == 'Supermarket Type 3', 'Outlet_Size'] = 'Medium'
df.loc[df['Outlet_Type'] == 'Grocery Store', 'Outlet_Size'] = 'Small'

In [10]:
#Determine average visibility of a product
visibility_avg = df.pivot_table(values='Item_Visibility', index='Item_Identifier')

#Impute 0 values with mean visibility of that product
missing_values = (df['Item_Visibility'] == 0)

print ('Number of 0 values initially: %d'%sum(missing_values))

df.loc[missing_values,'Item_Visibility'] = df.loc[missing_values,'Item_Identifier'].apply(lambda x: visibility_avg.at[x, 'Item_Visibility'])

print ('Number of 0 values after modification: %d'%sum(df['Item_Visibility'] == 0))

Number of 0 values initially: 523
Number of 0 values after modification: 1


In [11]:
df['Item_Type'] = df['Item_Identifier'].apply(lambda x: x[0:2])
df['Item_Type'] = df['Item_Type'].map({'FD':'Food','NC':'Non-Consumable',                                                            
              'DR':'Drinks'})
df['Item_Type'].value_counts()

Food              6084
Non-Consumable    1594
Drinks             795
Name: Item_Type, dtype: int64

In [12]:
df['Outlet_Years'] = 2013 - df['Outlet_Establishment_Year']

In [13]:
df

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales,Outlet_Years
0,FDA15,9.300,Low Fat,0.016047,Food,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.1380,14
1,DRC01,5.920,Regular,0.019278,Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228,4
2,FDN15,17.500,Low Fat,0.016760,Food,141.6180,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.2700,14
3,FDX07,19.200,Regular,0.015274,Food,182.0950,OUT010,1998,Small,Tier 3,Grocery Store,732.3800,15
4,NCD19,8.930,Low Fat,0.008082,Non-Consumable,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052,26
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8518,FDF22,6.865,Low Fat,0.056783,Food,214.5218,OUT013,1987,High,Tier 3,Supermarket Type1,2778.3834,26
8519,FDS36,8.380,Regular,0.046982,Food,108.1570,OUT045,2002,Small,Tier 2,Supermarket Type1,549.2850,11
8520,NCJ29,10.600,Low Fat,0.035186,Non-Consumable,85.1224,OUT035,2004,Small,Tier 2,Supermarket Type1,1193.1136,9
8521,FDN46,7.210,Regular,0.145221,Food,103.1332,OUT018,2009,Medium,Tier 3,Supermarket Type2,1845.5976,4


In [14]:
df.loc[df['Item_Type'] == 'Non-Consumable', 'Item_Fat_Content'] = 'Non-Edible'

In [15]:
df

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales,Outlet_Years
0,FDA15,9.300,Low Fat,0.016047,Food,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.1380,14
1,DRC01,5.920,Regular,0.019278,Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228,4
2,FDN15,17.500,Low Fat,0.016760,Food,141.6180,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.2700,14
3,FDX07,19.200,Regular,0.015274,Food,182.0950,OUT010,1998,Small,Tier 3,Grocery Store,732.3800,15
4,NCD19,8.930,Non-Edible,0.008082,Non-Consumable,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052,26
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8518,FDF22,6.865,Low Fat,0.056783,Food,214.5218,OUT013,1987,High,Tier 3,Supermarket Type1,2778.3834,26
8519,FDS36,8.380,Regular,0.046982,Food,108.1570,OUT045,2002,Small,Tier 2,Supermarket Type1,549.2850,11
8520,NCJ29,10.600,Non-Edible,0.035186,Non-Consumable,85.1224,OUT035,2004,Small,Tier 2,Supermarket Type1,1193.1136,9
8521,FDN46,7.210,Regular,0.145221,Food,103.1332,OUT018,2009,Medium,Tier 3,Supermarket Type2,1845.5976,4


In [16]:
df.drop(['Outlet_Establishment_Year'],axis=1,inplace=True)

In [17]:
df

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales,Outlet_Years
0,FDA15,9.300,Low Fat,0.016047,Food,249.8092,OUT049,Medium,Tier 1,Supermarket Type1,3735.1380,14
1,DRC01,5.920,Regular,0.019278,Drinks,48.2692,OUT018,Medium,Tier 3,Supermarket Type2,443.4228,4
2,FDN15,17.500,Low Fat,0.016760,Food,141.6180,OUT049,Medium,Tier 1,Supermarket Type1,2097.2700,14
3,FDX07,19.200,Regular,0.015274,Food,182.0950,OUT010,Small,Tier 3,Grocery Store,732.3800,15
4,NCD19,8.930,Non-Edible,0.008082,Non-Consumable,53.8614,OUT013,High,Tier 3,Supermarket Type1,994.7052,26
...,...,...,...,...,...,...,...,...,...,...,...,...
8518,FDF22,6.865,Low Fat,0.056783,Food,214.5218,OUT013,High,Tier 3,Supermarket Type1,2778.3834,26
8519,FDS36,8.380,Regular,0.046982,Food,108.1570,OUT045,Small,Tier 2,Supermarket Type1,549.2850,11
8520,NCJ29,10.600,Non-Edible,0.035186,Non-Consumable,85.1224,OUT035,Small,Tier 2,Supermarket Type1,1193.1136,9
8521,FDN46,7.210,Regular,0.145221,Food,103.1332,OUT018,Medium,Tier 3,Supermarket Type2,1845.5976,4


In [18]:
df = pd.get_dummies(df, columns=['Item_Fat_Content','Outlet_Location_Type','Outlet_Size','Outlet_Type',
                              'Item_Type','Outlet_Identifier'])

In [19]:
df

Unnamed: 0,Item_Identifier,Item_Weight,Item_Visibility,Item_MRP,Item_Outlet_Sales,Outlet_Years,Item_Fat_Content_Low Fat,Item_Fat_Content_Non-Edible,Item_Fat_Content_Regular,Outlet_Location_Type_Tier 1,...,Outlet_Identifier_OUT010,Outlet_Identifier_OUT013,Outlet_Identifier_OUT017,Outlet_Identifier_OUT018,Outlet_Identifier_OUT019,Outlet_Identifier_OUT027,Outlet_Identifier_OUT035,Outlet_Identifier_OUT045,Outlet_Identifier_OUT046,Outlet_Identifier_OUT049
0,FDA15,9.300,0.016047,249.8092,3735.1380,14,1,0,0,1,...,0,0,0,0,0,0,0,0,0,1
1,DRC01,5.920,0.019278,48.2692,443.4228,4,0,0,1,0,...,0,0,0,1,0,0,0,0,0,0
2,FDN15,17.500,0.016760,141.6180,2097.2700,14,1,0,0,1,...,0,0,0,0,0,0,0,0,0,1
3,FDX07,19.200,0.015274,182.0950,732.3800,15,0,0,1,0,...,1,0,0,0,0,0,0,0,0,0
4,NCD19,8.930,0.008082,53.8614,994.7052,26,0,1,0,0,...,0,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8518,FDF22,6.865,0.056783,214.5218,2778.3834,26,1,0,0,0,...,0,1,0,0,0,0,0,0,0,0
8519,FDS36,8.380,0.046982,108.1570,549.2850,11,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
8520,NCJ29,10.600,0.035186,85.1224,1193.1136,9,0,1,0,0,...,0,0,0,0,0,0,1,0,0,0
8521,FDN46,7.210,0.145221,103.1332,1845.5976,4,0,0,1,0,...,0,0,0,1,0,0,0,0,0,0


In [20]:
df.drop('Item_Identifier', axis=1, inplace=True)

In [21]:
df

Unnamed: 0,Item_Weight,Item_Visibility,Item_MRP,Item_Outlet_Sales,Outlet_Years,Item_Fat_Content_Low Fat,Item_Fat_Content_Non-Edible,Item_Fat_Content_Regular,Outlet_Location_Type_Tier 1,Outlet_Location_Type_Tier 2,...,Outlet_Identifier_OUT010,Outlet_Identifier_OUT013,Outlet_Identifier_OUT017,Outlet_Identifier_OUT018,Outlet_Identifier_OUT019,Outlet_Identifier_OUT027,Outlet_Identifier_OUT035,Outlet_Identifier_OUT045,Outlet_Identifier_OUT046,Outlet_Identifier_OUT049
0,9.300,0.016047,249.8092,3735.1380,14,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
1,5.920,0.019278,48.2692,443.4228,4,0,0,1,0,0,...,0,0,0,1,0,0,0,0,0,0
2,17.500,0.016760,141.6180,2097.2700,14,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
3,19.200,0.015274,182.0950,732.3800,15,0,0,1,0,0,...,1,0,0,0,0,0,0,0,0,0
4,8.930,0.008082,53.8614,994.7052,26,0,1,0,0,0,...,0,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8518,6.865,0.056783,214.5218,2778.3834,26,1,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
8519,8.380,0.046982,108.1570,549.2850,11,0,0,1,0,1,...,0,0,0,0,0,0,0,1,0,0
8520,10.600,0.035186,85.1224,1193.1136,9,0,1,0,0,1,...,0,0,0,0,0,0,1,0,0,0
8521,7.210,0.145221,103.1332,1845.5976,4,0,0,1,0,0,...,0,0,0,1,0,0,0,0,0,0


In [22]:
df.shape

(8473, 31)

In [23]:
cols = list(df.columns.values) 
cols.pop(cols.index('Item_Outlet_Sales')) 
df = df[cols+['Item_Outlet_Sales']] 

In [24]:
df

Unnamed: 0,Item_Weight,Item_Visibility,Item_MRP,Outlet_Years,Item_Fat_Content_Low Fat,Item_Fat_Content_Non-Edible,Item_Fat_Content_Regular,Outlet_Location_Type_Tier 1,Outlet_Location_Type_Tier 2,Outlet_Location_Type_Tier 3,...,Outlet_Identifier_OUT013,Outlet_Identifier_OUT017,Outlet_Identifier_OUT018,Outlet_Identifier_OUT019,Outlet_Identifier_OUT027,Outlet_Identifier_OUT035,Outlet_Identifier_OUT045,Outlet_Identifier_OUT046,Outlet_Identifier_OUT049,Item_Outlet_Sales
0,9.300,0.016047,249.8092,14,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,3735.1380
1,5.920,0.019278,48.2692,4,0,0,1,0,0,1,...,0,0,1,0,0,0,0,0,0,443.4228
2,17.500,0.016760,141.6180,14,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,2097.2700
3,19.200,0.015274,182.0950,15,0,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,732.3800
4,8.930,0.008082,53.8614,26,0,1,0,0,0,1,...,1,0,0,0,0,0,0,0,0,994.7052
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8518,6.865,0.056783,214.5218,26,1,0,0,0,0,1,...,1,0,0,0,0,0,0,0,0,2778.3834
8519,8.380,0.046982,108.1570,11,0,0,1,0,1,0,...,0,0,0,0,0,0,1,0,0,549.2850
8520,10.600,0.035186,85.1224,9,0,1,0,0,1,0,...,0,0,0,0,0,1,0,0,0,1193.1136
8521,7.210,0.145221,103.1332,4,0,0,1,0,0,1,...,0,0,1,0,0,0,0,0,0,1845.5976


In [25]:
X = df.iloc[:, :-1].values
y = df.iloc[:, -1].values

In [26]:
print(X)

[[9.30000000e+00 1.60473010e-02 2.49809200e+02 ... 0.00000000e+00
  0.00000000e+00 1.00000000e+00]
 [5.92000000e+00 1.92782160e-02 4.82692000e+01 ... 0.00000000e+00
  0.00000000e+00 0.00000000e+00]
 [1.75000000e+01 1.67600750e-02 1.41618000e+02 ... 0.00000000e+00
  0.00000000e+00 1.00000000e+00]
 ...
 [1.06000000e+01 3.51862710e-02 8.51224000e+01 ... 0.00000000e+00
  0.00000000e+00 0.00000000e+00]
 [7.21000000e+00 1.45220646e-01 1.03133200e+02 ... 0.00000000e+00
  0.00000000e+00 0.00000000e+00]
 [1.48000000e+01 4.48782800e-02 7.54670000e+01 ... 0.00000000e+00
  1.00000000e+00 0.00000000e+00]]


In [27]:
print(y)

[3735.138   443.4228 2097.27   ... 1193.1136 1845.5976  765.67  ]


In [28]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [29]:
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train, y_train)

In [30]:
y_pred = regressor.predict(X_test)

In [31]:
y_pred

array([2742.70821613, 1158.8722537 , 1852.99657193, ..., 3414.15969172,
       2545.20061427, 2278.30086097])

In [32]:
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

MSE=mean_squared_error(y_test,y_pred)
MAE=mean_absolute_error(y_test,y_pred)
r2=r2_score(y_test,y_pred)
RMSE = np.sqrt(MSE)
print("R squared value: ", r2)
print("Root Mean Squared Error : ", RMSE)
print("Mean Absolute Error : ", MAE)

R squared value:  0.5475680192521344
Root Mean Squared Error :  1064.1099064211712
Mean Absolute Error :  810.907149539269


In [33]:
from sklearn.preprocessing import PolynomialFeatures
poly_reg = PolynomialFeatures(degree = 2)
X_poly = poly_reg.fit_transform(X_train)
lin_reg = LinearRegression()
lin_reg.fit(X_poly, y_train)

In [34]:
y_pred=lin_reg.predict(poly_reg.fit_transform(X_test))

In [35]:
y_pred

array([2460.75010412,  876.30769462, 1895.08766831, ..., 3711.61125434,
       2540.69423741, 2310.83920202])

In [36]:
MSE=mean_squared_error(y_test,y_pred)
MAE=mean_absolute_error(y_test,y_pred)
r2=r2_score(y_test,y_pred)
RMSE = np.sqrt(MSE)
print("R squared value: ", r2)
print("Root Mean Squared Error : ", RMSE)
print("Mean Absolute Error : ", MAE)

R squared value:  0.5777121194542636
Root Mean Squared Error :  1028.0497861729707
Mean Absolute Error :  740.4715241316807


In [37]:
from sklearn.tree import DecisionTreeRegressor
regressor = DecisionTreeRegressor(random_state = 42)
regressor.fit(X_train, y_train)

In [38]:
y_pred=regressor.predict(X_test)

In [39]:
y_pred

array([3139.9128,  485.3682,  759.012 , ..., 2292.3494, 2190.482 ,
       3952.1888])

In [40]:
MSE=mean_squared_error(y_test,y_test)
MAE=mean_absolute_error(y_test,y_test)
r2=r2_score(y_test,y_test)
RMSE = np.sqrt(MSE)
print("R squared value: ", r2)
print("Root Mean Squared Error : ", RMSE)
print("Mean Absolute Error : ", MAE)

R squared value:  1.0
Root Mean Squared Error :  0.0
Mean Absolute Error :  0.0


In [41]:
from sklearn.ensemble import RandomForestRegressor
rfr_reg=RandomForestRegressor(n_estimators=100,max_depth=6, min_samples_leaf=50,n_jobs=4)
rfr_reg.fit(X_train,y_train)
y_rfreg = rfr_reg.predict(X_test)

In [42]:
y_rfreg = rfr_reg.predict(X_test)

In [43]:
MSE=mean_squared_error(y_test,y_rfreg)
MAE=mean_absolute_error(y_test,y_rfreg)
r2=r2_score(y_test,y_rfreg)
RMSE = np.sqrt(MSE)
print("R squared value: ", r2)
print("Root Mean Squared Error : ", RMSE)
print("Mean Absolute Error : ", MAE)

R squared value:  0.5775065376236246
Root Mean Squared Error :  1028.299997758972
Mean Absolute Error :  740.8392620238118
