# Regression Benchmark

**Importing libraries**

In [87]:
# importing liabraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline

# Importing Dataset

In [88]:
data = pd.read_csv('datasets/dataBM.csv')

In [89]:
data.shape

(8523, 12)

In [90]:
data.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


In [91]:
data.isna().sum()

Item_Identifier                 0
Item_Weight                  1463
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                  2410
Outlet_Location_Type            0
Outlet_Type                     0
Item_Outlet_Sales               0
dtype: int64

# Shuffling and Creating Train and Test Set

In [92]:
from sklearn.utils import shuffle

# shuffling the Dataset
data = shuffle(data, random_state=42)

# creating 4 divisions 
div = int(data.shape[0]/4)

# 3 parts to train set and 1 part to test set
train = data.loc[:3*div+1:]
test = data.loc[3*div+1:]

In [93]:
train.shape

(4458, 12)

In [94]:
test.shape

(4066, 12)

In [95]:
train.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
7503,FDI28,14.3,Low Fat,0.0263,Frozen Foods,79.4302,OUT013,1987,High,Tier 3,Supermarket Type1,1743.0644
2957,NCM17,7.93,Low Fat,0.071136,Health and Hygiene,42.7086,OUT046,1997,Small,Tier 1,Supermarket Type1,356.8688
7031,FDC14,14.5,Regular,0.041313,Canned,42.0454,OUT049,1999,Medium,Tier 1,Supermarket Type1,377.5086
1084,DRC36,,Regular,0.044767,Soft Drinks,173.7054,OUT027,1985,Medium,Tier 3,Supermarket Type3,5778.4782
856,FDS27,10.195,Regular,0.012456,Meat,197.511,OUT035,2004,Small,Tier 2,Supermarket Type1,2356.932


In [96]:
test.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
6391,DRM23,16.6,Low Fat,0.135944,Hard Drinks,172.0422,OUT049,1999,Medium,Tier 1,Supermarket Type1,2586.633
2895,FDM27,,Regular,0.277459,Meat,156.3946,OUT019,1985,Small,Tier 1,Grocery Store,473.3838
6814,FDJ57,,Regular,0.0,Seafood,184.3582,OUT019,1985,Small,Tier 1,Grocery Store,185.7582
7537,FDV08,7.35,Low Fat,0.028571,Fruits and Vegetables,40.8454,OUT013,1987,High,Tier 3,Supermarket Type1,587.2356
3425,NCA17,20.6,Low Fat,0.0,Health and Hygiene,148.8392,OUT046,1997,Small,Tier 1,Supermarket Type1,2684.5056


## Simple Mean (mean of Item_Outlet_Sales)

In [97]:
# storing simple mean in a new column in the test set as "simple_mean"
test['simple_mean'] = train['Item_Outlet_Sales'].mean()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['simple_mean'] = train['Item_Outlet_Sales'].mean()


In [98]:
# calculating mean absolute error
from sklearn.metrics import mean_absolute_error as MAE

simple_mean_error = MAE(test['Item_Outlet_Sales'], test['simple_mean'])
simple_mean_error

1355.4481105570335

# Mean Item Outlet Sales with respect to Outlet_Type

In [99]:
out_type = pd.pivot_table(train, values='Item_Outlet_Sales', index=['Outlet_Type'], aggfunc=np.mean)
out_type

Unnamed: 0_level_0,Item_Outlet_Sales
Outlet_Type,Unnamed: 1_level_1
Grocery Store,332.606025
Supermarket Type1,2245.228499
Supermarket Type2,1938.684847
Supermarket Type3,3776.185098


In [100]:
# initializing new column to zero
test['Out_type_mean'] = 0

# for every unique entry in Outlet_Type
for i in train['Outlet_Type'].unique():
    # assign the mean value corresponding to unique entry
    test['Out_type_mean'][test['Outlet_Type']==str(i)] = train['Item_Outlet_Sales'][train['Outlet_Type']==str(i)].mean()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['Out_type_mean'] = 0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['Out_type_mean'][test['Outlet_Type']==str(i)] = train['Item_Outlet_Sales'][train['Outlet_Type']==str(i)].mean()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['Out_type_mean'][test['Outlet_Type']==str(i)] = train['Item_Outlet_Sales'][train['Outlet_Type']==str(i)].mean()
A value is trying to be set on a copy of a slice from a Dat

In [101]:
# calculating mean absolute error 
out_type_error = MAE(test['Item_Outlet_Sales'], test['Out_type_mean'])
out_type_error

1138.8026221064356

# Mean Item Outlet Sales with respect to Outlet_Establishment_Year

In [102]:
out_year = pd.pivot_table(train, values='Item_Outlet_Sales', index=['Outlet_Establishment_Year'], aggfunc=np.mean)
out_year

Unnamed: 0_level_0,Item_Outlet_Sales
Outlet_Establishment_Year,Unnamed: 1_level_1
1985,2547.909733
1987,2160.883029
1997,2200.803276
1998,320.792473
1999,2245.345807
2002,2136.727766
2004,2415.594646
2007,2320.598343
2009,1938.684847


In [106]:
# initializing new column to zero
test['Out_year_mean'] = 0

# for every unique entry in Outlet_Establishment_Year
for i in train['Outlet_Establishment_Year'].unique():
    # assign the mean value corresponding to unique entry
    test['Out_year_mean'][test['Outlet_Establishment_Year'] == i ] = train['Item_Outlet_Sales'][train['Outlet_Establishment_Year'] == i].mean()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['Out_year_mean'] = 0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['Out_year_mean'][test['Outlet_Establishment_Year'] == i ] = train['Item_Outlet_Sales'][train['Outlet_Establishment_Year'] == i].mean()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['Out_year_mean'][test['Outlet_Establishment_Year'] == i ] = train['Item_Outlet_Sales'][train['Outlet_Establishment_Year'] == i].mean()
A value is tr

In [107]:
# calculating mean absolute error
out_year_error = MAE(test['Item_Outlet_Sales'], test['Out_year_mean'])
out_year_error

1267.2633548928097

## Mean Item Outlet Sales with respect to Outlet_Location_Type

In [108]:
out_loc = pd.pivot_table(train, values='Item_Outlet_Sales', index=['Outlet_Location_Type'], aggfunc=np.mean)
out_loc

Unnamed: 0_level_0,Item_Outlet_Sales
Outlet_Location_Type,Unnamed: 1_level_1
Tier 1,1804.051997
Tier 2,2288.475485
Tier 3,2255.793325


In [109]:
# initializing empty column
test['Out_loc_mean'] = 0

# for every unique entry in Outlet_location_type
for i in train['Outlet_Location_Type'].unique():
    # calculate and assign mean corresponding to the unique entries
    test['Out_loc_mean'][test['Outlet_Location_Type']==str(i)] = train['Item_Outlet_Sales'][train['Outlet_Location_Type'] == str(i)].mean()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['Out_loc_mean'] = 0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['Out_loc_mean'][test['Outlet_Location_Type']==str(i)] = train['Item_Outlet_Sales'][train['Outlet_Location_Type'] == str(i)].mean()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['Out_loc_mean'][test['Outlet_Location_Type']==str(i)] = train['Item_Outlet_Sales'][train['Outlet_Location_Type'] == str(i)].mean()
A value is trying to b

In [110]:
# calculating mean absolute error
out_loc_error = MAE(test['Item_Outlet_Sales'], test['Out_loc_mean'])
out_loc_error

1348.8509267072288

## Mean Item_Outlet_Sales with respect to both Outlet_Location_Type and Outlet_Establishment_Year

In [112]:
combo = pd.pivot_table(train, values='Item_Outlet_Sales', index=['Outlet_Location_Type', 'Outlet_Establishment_Year'], aggfunc=np.mean)
combo

Unnamed: 0_level_0,Unnamed: 1_level_0,Item_Outlet_Sales
Outlet_Location_Type,Outlet_Establishment_Year,Unnamed: 2_level_1
Tier 1,1985,344.882031
Tier 1,1997,2200.803276
Tier 1,1999,2245.345807
Tier 2,2002,2136.727766
Tier 2,2004,2415.594646
Tier 2,2007,2320.598343
Tier 3,1985,3776.185098
Tier 3,1987,2160.883029
Tier 3,1998,320.792473
Tier 3,2009,1938.684847


In [114]:
# initialize new empty column
test['Super_mean'] = 0

# Assigning variables to strings (to shorten code lenght)
s2 = 'Outlet_Location_Type'
s1 = 'Outlet_Establishment_Year'

# for every unique value in s1
for i in train[s1].unique():
    # for every unique value in s2
    for j in train[s2].unique():
        test['Super_mean'][(test[s1]==i) & (test[s2] == str(j))] = train['Item_Outlet_Sales'][(train[s1] == i) & (train[s2] == str(j))].mean()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['Super_mean'] = 0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['Super_mean'][(test[s1]==i) & (test[s2] == str(j))] = train['Item_Outlet_Sales'][(train[s1] == i) & (train[s2] == str(j))].mean()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['Super_mean'][(test[s1]==i) & (test[s2] == str(j))] = train['Item_Outlet_Sales'][(train[s1] == i) & (train[s2] == str(j))].mean()
A value is trying to be se

In [115]:
# calculating mean absolute error
super_mean_error = MAE(test['Item_Outlet_Sales'], test['Super_mean'])
super_mean_error

1140.0522313200124