In [107]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
%matplotlib inline

<h3>Data Preprocessing - Training</h3>

In [272]:
data_raw = pd.read_csv('https://datahack-prod.s3.amazonaws.com/train_file/train_XnW6LSF.csv')

In [273]:
data = data_raw

In [274]:
data.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


In [275]:
data.Item_Fat_Content.value_counts()

Low Fat    5089
Regular    2889
LF          316
reg         117
low fat     112
Name: Item_Fat_Content, dtype: int64

In [276]:
data.shape

(8523, 12)

In [277]:
data.isnull().sum()

Item_Identifier                 0
Item_Weight                  1463
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                  2410
Outlet_Location_Type            0
Outlet_Type                     0
Item_Outlet_Sales               0
dtype: int64

In [278]:
data.dtypes

Item_Identifier               object
Item_Weight                  float64
Item_Fat_Content              object
Item_Visibility              float64
Item_Type                     object
Item_MRP                     float64
Outlet_Identifier             object
Outlet_Establishment_Year      int64
Outlet_Size                   object
Outlet_Location_Type          object
Outlet_Type                   object
Item_Outlet_Sales            float64
dtype: object

In [279]:
data['Item_Weight'].fillna(data['Item_Weight'].mean(), inplace=True)
data['Outlet_Size'].fillna(data['Outlet_Size'].mode()[0], inplace=True)

In [280]:
data['Item_Fat_Content'] = data['Item_Fat_Content'].map({'Low Fat': 0, 'LF':0, 'low fat':0, 'Regular': 1, 'reg':1})

data['Item_Type'] = data['Item_Type'].map({'Fruits and Vegetables': 0, 'Snack Foods': 1, 'Household': 2, 
                                          'Frozen Foods': 3, 'Dairy': 4, 'Canned': 5, 'Baking Goods': 6, 
                                           'Health and Hygiene': 7, 'Soft Drinks': 8, 'Meat': 9,
                                           'Breads': 10, 'Hard Drinks': 11, 'Others': 12, 'Starchy Foods': 13,
                                          'Breakfast': 14, 'Seafood': 15})

data['Outlet_Size'] = data['Outlet_Size'].map({'Small':0, 'Medium':1, 'High':2})
data['Outlet_Location_Type'] = data['Outlet_Location_Type'].map({'Tier 1':0, 'Tier 2':1, 'Tier 3':2})
data['Outlet_Type'] = data['Outlet_Type'].map({'Supermarket Type1':0, 'Supermarket Type2':1, 
                                               'Supermarket Type3':2, 'Grocery Store':3})

In [281]:
data.isnull().sum()

Item_Identifier              0
Item_Weight                  0
Item_Fat_Content             0
Item_Visibility              0
Item_Type                    0
Item_MRP                     0
Outlet_Identifier            0
Outlet_Establishment_Year    0
Outlet_Size                  0
Outlet_Location_Type         0
Outlet_Type                  0
Item_Outlet_Sales            0
dtype: int64

In [282]:
data.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,0,0.016047,4,249.8092,OUT049,1999,1,0,0,3735.138
1,DRC01,5.92,1,0.019278,8,48.2692,OUT018,2009,1,2,1,443.4228
2,FDN15,17.5,0,0.01676,9,141.618,OUT049,1999,1,0,0,2097.27
3,FDX07,19.2,1,0.0,0,182.095,OUT010,1998,1,2,3,732.38
4,NCD19,8.93,0,0.0,2,53.8614,OUT013,1987,2,2,0,994.7052


In [283]:
from sklearn import preprocessing

IV = np.array(data['Item_Visibility'])
MRP = np.array(data['Item_MRP'])
IW = np.array(data['Item_Weight'])
year = np.array(data['Outlet_Establishment_Year'])
#sales = np.array(data['Item_Outlet_Sales'])
fat = np.array(data['Item_Fat_Content'])
IT = np.array(data['Item_Type'])
size = np.array(data['Outlet_Size'])
OLT = np.array(data['Outlet_Location_Type'])
OT = np.array(data['Outlet_Type'])

norm_IV = preprocessing.normalize([IV])
norm_IMRP = preprocessing.normalize([MRP])
norm_IW = preprocessing.normalize([IW])
norm_year = preprocessing.normalize([year])
#norm_sales = preprocessing.normalize([sales])
norm_fat = preprocessing.normalize([fat])
norm_IT = preprocessing.normalize([IT])
norm_size = preprocessing.normalize([size])
norm_OLT = preprocessing.normalize([OLT])
norm_OT = preprocessing.normalize([OT])

In [284]:
#data['Item_Visibility'] = scaled_data['Item_Visibility']
#data['Item_MRP'] = scaled_data['Item_MRP']

data['Item_Visibility'] = norm_IV[0]
data['Item_MRP'] = norm_IMRP[0]
data['Item_Weight'] = norm_IW[0]
data['Outlet_Establishment_Year'] = norm_year[0]
#data['Item_Outlet_Sales'] = norm_sales[0]
data['Item_Fat_Content'] = norm_fat[0]
data['Item_Type'] = norm_IT[0]
data['Outlet_Size'] = norm_size[0]
data['Outlet_Location_Type'] = norm_OLT[0]
data['Outlet_Type'] = norm_OT[0]
data.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,0.007443,0.0,0.002072,0.007457,0.017556,OUT049,0.010838,0.010582,0.0,0.0,3735.138
1,DRC01,0.004738,0.018239,0.00249,0.014915,0.003392,OUT018,0.010892,0.010582,0.015721,0.008329,443.4228
2,FDN15,0.014006,0.0,0.002164,0.016779,0.009952,OUT049,0.010838,0.010582,0.0,0.0,2097.27
3,FDX07,0.015366,0.018239,0.0,0.0,0.012797,OUT010,0.010833,0.010582,0.015721,0.024987,732.38
4,NCD19,0.007147,0.0,0.0,0.003729,0.003785,OUT013,0.010773,0.021163,0.015721,0.0,994.7052


<h3>Data Preprocessing - Test</h3>

In [285]:
data_test = pd.read_csv('https://datahack-prod.s3.amazonaws.com/test_file/test_FewQE9B.csv')

In [286]:
data_test.isnull().sum()

Item_Identifier                 0
Item_Weight                   976
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                  1606
Outlet_Location_Type            0
Outlet_Type                     0
dtype: int64

In [287]:
data_test['Item_Fat_Content'].fillna(data_test['Item_Fat_Content'].mode()[0], inplace=True)
data_test['Item_Weight'].fillna(data_test['Item_Weight'].mean(), inplace=True)
data_test['Outlet_Size'].fillna(data_test['Outlet_Size'].mode()[0], inplace=True)

In [288]:
data_test['Item_Fat_Content'] = data_test['Item_Fat_Content'].map({'Low Fat': 0, 'LF':0, 'low fat':0, 'Regular': 1, 'reg':1})

data_test['Item_Type'] = data_test['Item_Type'].map({'Fruits and Vegetables': 0, 'Snack Foods': 1, 'Household': 2, 
                                          'Frozen Foods': 3, 'Dairy': 4, 'Canned': 5, 'Baking Goods': 6, 
                                           'Health and Hygiene': 7, 'Soft Drinks': 8, 'Meat': 9,
                                           'Breads': 10, 'Hard Drinks': 11, 'Others': 12, 'Starchy Foods': 13,
                                          'Breakfast': 14, 'Seafood': 15})

data_test['Outlet_Size'] = data_test['Outlet_Size'].map({'Small':0, 'Medium':1, 'High':2})
data_test['Outlet_Location_Type'] = data_test['Outlet_Location_Type'].map({'Tier 1':0, 'Tier 2':1, 'Tier 3':2})
data_test['Outlet_Type'] = data_test['Outlet_Type'].map({'Supermarket Type1':0, 'Supermarket Type2':1, 
                                               'Supermarket Type3':2, 'Grocery Store':3})

In [289]:
from sklearn import preprocessing

IV_test = np.array(data_test['Item_Visibility'])
MRP_test = np.array(data_test['Item_MRP'])
IW_test = np.array(data_test['Item_Weight'])
year_test = np.array(data_test['Outlet_Establishment_Year'])
fat_test = np.array(data_test['Item_Fat_Content'])
IT_test = np.array(data_test['Item_Type'])
size_test = np.array(data_test['Outlet_Size'])
OLT_test = np.array(data_test['Outlet_Location_Type'])
OT_test = np.array(data_test['Outlet_Type'])

norm_IV_test = preprocessing.normalize([IV_test])
norm_IMRP_test = preprocessing.normalize([MRP_test])
norm_IW_test = preprocessing.normalize([IW_test])
norm_year_test = preprocessing.normalize([year_test])
norm_fat_test = preprocessing.normalize([fat_test])
norm_IT_test = preprocessing.normalize([IT_test])
norm_size_test = preprocessing.normalize([size_test])
norm_OLT_test = preprocessing.normalize([OLT_test])
norm_OT_test = preprocessing.normalize([OT_test])

In [290]:
data_test['Item_Visibility'] = norm_IV_test[0]
data_test['Item_MRP'] = norm_IMRP_test[0]
data_test['Item_Weight'] = norm_IW_test[0]
data_test['Outlet_Establishment_Year'] = norm_year_test[0]
data_test['Item_Fat_Content'] = norm_fat_test[0]
data_test['Item_Type'] = norm_IT_test[0]
data_test['Outlet_Size'] = norm_size_test[0]
data_test['Outlet_Location_Type'] = norm_OLT_test[0]
data_test['Outlet_Type'] = norm_OT_test[0]
data_test.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type
0,FDW58,0.020566,0.0,0.001205,0.002273,0.009294,OUT049,0.013275,0.012962,0.0,0.0
1,FDW14,0.008226,0.022288,0.00612,0.009094,0.007524,OUT017,0.013328,0.012962,0.009628,0.0
2,NCN55,0.01447,0.0,0.015858,0.027281,0.020831,OUT010,0.013268,0.012962,0.019256,0.0306
3,FDQ58,0.00725,0.0,0.002451,0.002273,0.013359,OUT017,0.013328,0.012962,0.009628,0.0
4,FDY38,0.012583,0.022288,0.018887,0.009094,0.020183,OUT027,0.013182,0.012962,0.019256,0.0204


<h3>Model</h3>

In [352]:
features = ['Item_Weight', 'Item_Fat_Content', 'Item_Visibility', 'Item_Type', 'Item_MRP', 
            'Outlet_Establishment_Year', 'Outlet_Size', 'Outlet_Location_Type', 
           'Outlet_Type']

In [353]:
x = data[features]
y = data['Item_Outlet_Sales']
x.shape, y.shape

((8523, 9), (8523,))

In [354]:
x_test = data_test[features]
x_test.shape

(5681, 9)

In [414]:
x_train,x_val,y_train,y_val = train_test_split(x,y,random_state=10,test_size=0.2)

In [415]:
x_train.head()

Unnamed: 0,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type
5470,0.012805,0.018239,0.008437,0.007457,0.005517,0.010892,0.010582,0.015721,0.008329
8431,0.015766,0.0,0.006887,0.0,0.003991,0.010854,0.010582,0.00786,0.0
3557,0.006127,0.0,0.004134,0.009322,0.00824,0.010854,0.010582,0.00786,0.0
7918,0.012165,0.0,0.004346,0.003729,0.00766,0.010838,0.010582,0.0,0.0
4573,0.009844,0.018239,0.007504,0.001864,0.004249,0.010865,0.0,0.00786,0.0


In [416]:
import keras
import tensorflow as tf
from keras.models import Sequential
from keras.layers import InputLayer, Dense 

In [417]:
input_neurons = x.shape[1]
output_neurons = 1
input_neurons

9

In [418]:
hidden_layers = 2
layer_1 = 8
layer_2 = 4

In [431]:
model = Sequential()
model.add(InputLayer(input_shape=(input_neurons,)))
model.add(Dense(units=layer_1, activation='relu'))
model.add(Dense(units=layer_2, activation='relu'))
model.add(Dense(units=output_neurons, activation='linear'))
model.summary()

Model: "sequential_26"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_82 (Dense)             (None, 8)                 80        
_________________________________________________________________
dense_83 (Dense)             (None, 4)                 36        
_________________________________________________________________
dense_84 (Dense)             (None, 1)                 5         
Total params: 121
Trainable params: 121
Non-trainable params: 0
_________________________________________________________________


In [432]:
model.compile(optimizer="Adam", loss="mae",metrics=['mse'])
#model.compile(loss='mean_squared_error', optimizer='adam')

In [451]:
model_train = model.fit(x_train, y_train, validation_data=(x_val, y_val), epochs=30)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [452]:
prediction = model.predict(x_val)

In [453]:
from sklearn.metrics import mean_squared_error
mean_squared_error(y_val, prediction, squared=False)

1728.3375901627387

In [454]:
test_predictions = model.predict(x_test)

In [455]:
test_predictions

array([[1836.4052],
       [1872.0831],
       [1767.3687],
       ...,
       [1846.9103],
       [2009.4125],
       [1843.2767]], dtype=float32)

In [456]:
submission = pd.DataFrame(data_test[['Item_Identifier', 'Outlet_Identifier']])
submission['Item_Outlet_Sales'] = test_predictions
submission.head()

Unnamed: 0,Item_Identifier,Outlet_Identifier,Item_Outlet_Sales
0,FDW58,OUT049,1836.405151
1,FDW14,OUT017,1872.08313
2,NCN55,OUT010,1767.368652
3,FDQ58,OUT017,1885.280518
4,FDY38,OUT027,1857.07019


In [457]:
submission.to_csv('submission.csv', index=False)