In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

# Reading the Dataset

In [2]:
data = pd.read_csv("Train.csv")

data.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


# Looking at the Shape and Columns

In [3]:
print(data.shape)
print("******************************************************************************")
print(data .columns)

(8523, 12)
******************************************************************************
Index(['Item_Identifier', 'Item_Weight', 'Item_Fat_Content', 'Item_Visibility',
       'Item_Type', 'Item_MRP', 'Outlet_Identifier',
       'Outlet_Establishment_Year', 'Outlet_Size', 'Outlet_Location_Type',
       'Outlet_Type', 'Item_Outlet_Sales'],
      dtype='object')


# Shuffling the data to remove bias during training

In [4]:
data = data.sample(frac=1)

# Checking null values present inside data

In [5]:
data.isna().sum()

Item_Identifier                 0
Item_Weight                  1463
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                  2410
Outlet_Location_Type            0
Outlet_Type                     0
Item_Outlet_Sales               0
dtype: int64

# Looking at data types of Null Value containing Columns

In [6]:
print(data['Item_Weight'].dtype)
print(data['Outlet_Size'].dtype)

float64
object


# Exploring the mean of Item Weight based on Item Type

In [7]:
data.groupby(['Item_Type']).mean()['Item_Weight']

Item_Type
Baking Goods             12.277108
Breads                   11.346936
Breakfast                12.768202
Canned                   12.305705
Dairy                    13.426069
Frozen Foods             12.867061
Fruits and Vegetables    13.224769
Hard Drinks              11.400328
Health and Hygiene       13.142314
Household                13.384736
Meat                     12.817344
Others                   13.853285
Seafood                  12.552843
Snack Foods              12.987880
Soft Drinks              11.847460
Starchy Foods            13.690731
Name: Item_Weight, dtype: float64

# Imputing Null values of Item Weight

In [8]:
for i in data.Item_Type.value_counts().index:
    data.loc[(data['Item_Weight'].isna()) & (data['Item_Type'] == i), ['Item_Weight']] = \
    data.loc[data['Item_Type'] == 'Fruits and Vegetables', ['Item_Weight']].mean()[0]

# Bifurcating data into categorical and numerical data frames

In [9]:
cat_data = data.select_dtypes(object)
num_data = data.select_dtypes(np.number)

# Exploring Null Values in Categorical Data Frame

In [10]:
cat_data.isna().sum()

Item_Identifier            0
Item_Fat_Content           0
Item_Type                  0
Outlet_Identifier          0
Outlet_Size             2410
Outlet_Location_Type       0
Outlet_Type                0
dtype: int64

# Exploring Null Values in Numerical Data Frames

In [11]:
num_data.isna().sum()

Item_Weight                  0
Item_Visibility              0
Item_MRP                     0
Outlet_Establishment_Year    0
Item_Outlet_Sales            0
dtype: int64

# Analyzing Categories of Outlet Size Columns

In [12]:
cat_data.Outlet_Size.value_counts()

Medium    2793
Small     2388
High       932
Name: Outlet_Size, dtype: int64

# Exploring Outlet Size based on Outlet Type

In [13]:
cat_data.groupby(['Outlet_Type','Outlet_Size']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Item_Identifier,Item_Fat_Content,Item_Type,Outlet_Identifier,Outlet_Location_Type
Outlet_Type,Outlet_Size,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Grocery Store,Small,528,528,528,528,528
Supermarket Type1,High,932,932,932,932,932
Supermarket Type1,Medium,930,930,930,930,930
Supermarket Type1,Small,1860,1860,1860,1860,1860
Supermarket Type2,Medium,928,928,928,928,928
Supermarket Type3,Medium,935,935,935,935,935


# Imputing the Null Values

In [14]:
cat_data.loc[(cat_data['Outlet_Size'].isna()) & (cat_data['Outlet_Type'] == 'Grocery Store'), ['Outlet_Size']] = 'Small'
cat_data.loc[(cat_data['Outlet_Size'].isna()) & (cat_data['Outlet_Type'] == 'Supermarket Type1'), ['Outlet_Size']] = 'Small'
cat_data.loc[(cat_data['Outlet_Size'].isna()) & (cat_data['Outlet_Type'] == 'Supermarket Type2'), ['Outlet_Size']] = 'Medium'
cat_data.loc[(cat_data['Outlet_Size'].isna()) & (cat_data['Outlet_Type'] == 'Supermarket Type3'), ['Outlet_Size']] = 'Medium'

# Categories of Item Fat Content

In [15]:
cat_data.Item_Fat_Content.value_counts()

Low Fat    5089
Regular    2889
LF          316
reg         117
low fat     112
Name: Item_Fat_Content, dtype: int64

# Removing Duplicates

In [16]:
cat_data.loc[cat_data['Item_Fat_Content'] == 'LF' , ['Item_Fat_Content']] = 'Low Fat'
cat_data.loc[cat_data['Item_Fat_Content'] == 'reg' , ['Item_Fat_Content']] = 'Regular'
cat_data.loc[cat_data['Item_Fat_Content'] == 'low fat' , ['Item_Fat_Content']] = 'Low Fat'

# Label Encoding of Categorical Data

In [17]:
le = LabelEncoder()
cat_data = cat_data.apply(le.fit_transform)

# Standardization of both dataframes

In [18]:
ss = StandardScaler()

In [19]:
num_data = pd.DataFrame(ss.fit_transform(num_data.drop(['Item_Outlet_Sales'], axis=1)), columns = num_data.drop(['Item_Outlet_Sales'],axis=1).columns)
cat_data = pd.DataFrame(ss.fit_transform(cat_data.drop(['Item_Identifier'], axis=1)), columns = cat_data.drop(['Item_Identifier'], axis=1).columns)

# Creating final Dataset

In [20]:
final_data = pd.concat([num_data,cat_data],axis=1)

In [21]:
final_data.head()

Unnamed: 0,Item_Weight,Item_Visibility,Item_MRP,Outlet_Establishment_Year,Item_Fat_Content,Item_Type,Outlet_Identifier,Outlet_Size,Outlet_Location_Type,Outlet_Type
0,0.870202,0.718355,-0.371698,0.497909,-0.738147,-0.291391,0.802852,0.799954,-0.138882,-0.252658
1,-1.965559,1.105149,0.549961,1.334103,-0.738147,-0.053847,-0.607071,-0.66408,1.091569,1.002972
2,0.491785,0.104355,-0.212884,0.497909,-0.738147,0.183698,0.802852,0.799954,-0.138882,-0.252658
3,-0.881158,-0.039119,-0.874176,0.139541,1.354743,-0.528935,1.507813,-0.66408,-1.369334,-0.252658
4,0.988457,-1.281758,1.085105,0.139541,1.354743,-0.291391,1.507813,-0.66408,-1.369334,-0.252658


# Defining Dependent and Independent Variables

In [22]:
X = final_data
y = data['Item_Outlet_Sales']

In [23]:
X.shape, y.shape

((8523, 10), (8523,))

# Train and Validation Splits

In [24]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.1, random_state=5)