In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

In [3]:
import warnings
warnings.filterwarnings('ignore')

In [4]:
#Combine test data and train data into one file
train['source']='train'
test['source']='test'
data = pd.concat([train, test],ignore_index=True)
print(train.shape, test.shape, data.shape)

(8523, 13) (5681, 12) (14204, 13)


In [5]:
data.head()

Unnamed: 0,Item_Fat_Content,Item_Identifier,Item_MRP,Item_Outlet_Sales,Item_Type,Item_Visibility,Item_Weight,Outlet_Establishment_Year,Outlet_Identifier,Outlet_Location_Type,Outlet_Size,Outlet_Type,source
0,Low Fat,FDA15,249.8092,3735.138,Dairy,0.016047,9.3,1999,OUT049,Tier 1,Medium,Supermarket Type1,train
1,Regular,DRC01,48.2692,443.4228,Soft Drinks,0.019278,5.92,2009,OUT018,Tier 3,Medium,Supermarket Type2,train
2,Low Fat,FDN15,141.618,2097.27,Meat,0.01676,17.5,1999,OUT049,Tier 1,Medium,Supermarket Type1,train
3,Regular,FDX07,182.095,732.38,Fruits and Vegetables,0.0,19.2,1998,OUT010,Tier 3,,Grocery Store,train
4,Low Fat,NCD19,53.8614,994.7052,Household,0.0,8.93,1987,OUT013,Tier 3,High,Supermarket Type1,train


In [6]:
data.describe()

Unnamed: 0,Item_MRP,Item_Outlet_Sales,Item_Visibility,Item_Weight,Outlet_Establishment_Year
count,14204.0,8523.0,14204.0,11765.0,14204.0
mean,141.004977,2181.288914,0.065953,12.792854,1997.830681
std,62.086938,1706.499616,0.051459,4.652502,8.371664
min,31.29,33.29,0.0,4.555,1985.0
25%,94.012,834.2474,0.027036,8.71,1987.0
50%,142.247,1794.331,0.054021,12.6,1999.0
75%,185.8556,3101.2964,0.094037,16.75,2004.0
max,266.8884,13086.9648,0.328391,21.35,2009.0


# Data Cleaning


In [7]:
for column_name in data.columns:
    count = (data[column_name] == 0).sum()
    print(column_name,count)

Item_Fat_Content 0
Item_Identifier 0
Item_MRP 0
Item_Outlet_Sales 0
Item_Type 0
Item_Visibility 879
Item_Weight 0
Outlet_Establishment_Year 0
Outlet_Identifier 0
Outlet_Location_Type 0
Outlet_Size 0
Outlet_Type 0
source 0


In [8]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14204 entries, 0 to 14203
Data columns (total 13 columns):
Item_Fat_Content             14204 non-null object
Item_Identifier              14204 non-null object
Item_MRP                     14204 non-null float64
Item_Outlet_Sales            8523 non-null float64
Item_Type                    14204 non-null object
Item_Visibility              14204 non-null float64
Item_Weight                  11765 non-null float64
Outlet_Establishment_Year    14204 non-null int64
Outlet_Identifier            14204 non-null object
Outlet_Location_Type         14204 non-null object
Outlet_Size                  10188 non-null object
Outlet_Type                  14204 non-null object
source                       14204 non-null object
dtypes: float64(4), int64(1), object(8)
memory usage: 1.4+ MB


In [9]:
#filling misssing values
data.Item_Outlet_Sales = data.Item_Outlet_Sales.fillna(data.Item_Outlet_Sales.mean())
data.Item_Weight = data.Item_Weight.fillna(data.Item_Weight.mean())
data['Outlet_Size'].value_counts()

Medium    4655
Small     3980
High      1553
Name: Outlet_Size, dtype: int64

In [10]:
data.Outlet_Size = data.Outlet_Size.fillna('Medium')

In [11]:
data.isnull().sum()

Item_Fat_Content             0
Item_Identifier              0
Item_MRP                     0
Item_Outlet_Sales            0
Item_Type                    0
Item_Visibility              0
Item_Weight                  0
Outlet_Establishment_Year    0
Outlet_Identifier            0
Outlet_Location_Type         0
Outlet_Size                  0
Outlet_Type                  0
source                       0
dtype: int64

In [12]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14204 entries, 0 to 14203
Data columns (total 13 columns):
Item_Fat_Content             14204 non-null object
Item_Identifier              14204 non-null object
Item_MRP                     14204 non-null float64
Item_Outlet_Sales            14204 non-null float64
Item_Type                    14204 non-null object
Item_Visibility              14204 non-null float64
Item_Weight                  14204 non-null float64
Outlet_Establishment_Year    14204 non-null int64
Outlet_Identifier            14204 non-null object
Outlet_Location_Type         14204 non-null object
Outlet_Size                  14204 non-null object
Outlet_Type                  14204 non-null object
source                       14204 non-null object
dtypes: float64(4), int64(1), object(8)
memory usage: 1.4+ MB


In [13]:
#categorize Item_Type into Non consumables, Drinks and Foods according to the Item_Identifier
data['Item_Identifier'].value_counts()
data['Item_Type_Combined'] = data['Item_Identifier'].apply(lambda x: x[0:2])
data['Item_Type_Combined'] = data['Item_Type_Combined'].map({'FD':'Food','NC':'Non-Consumable','DR':'Drinks'})
data['Item_Type_Combined'].value_counts()

Food              10201
Non-Consumable     2686
Drinks             1317
Name: Item_Type_Combined, dtype: int64

# Numerical and One-Hot Coding of Categorical variables


In [14]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
le = LabelEncoder()

data['Outlet'] = le.fit_transform(data['Outlet_Identifier'])
var_mod = ['Item_Fat_Content','Outlet_Location_Type','Outlet_Size','Item_Type_Combined','Outlet_Type','Outlet']
le = LabelEncoder()
for i in var_mod:
    data[i] = le.fit_transform(data[i])

In [15]:
#One Hot Coding:
data = pd.get_dummies(data, columns=['Item_Fat_Content','Outlet_Location_Type','Outlet_Size','Outlet_Type','Item_Type_Combined','Outlet'])

In [16]:
data.head()

Unnamed: 0,Item_Identifier,Item_MRP,Item_Outlet_Sales,Item_Type,Item_Visibility,Item_Weight,Outlet_Establishment_Year,Outlet_Identifier,source,Item_Fat_Content_0,...,Outlet_0,Outlet_1,Outlet_2,Outlet_3,Outlet_4,Outlet_5,Outlet_6,Outlet_7,Outlet_8,Outlet_9
0,FDA15,249.8092,3735.138,Dairy,0.016047,9.3,1999,OUT049,train,0,...,0,0,0,0,0,0,0,0,0,1
1,DRC01,48.2692,443.4228,Soft Drinks,0.019278,5.92,2009,OUT018,train,0,...,0,0,0,1,0,0,0,0,0,0
2,FDN15,141.618,2097.27,Meat,0.01676,17.5,1999,OUT049,train,0,...,0,0,0,0,0,0,0,0,0,1
3,FDX07,182.095,732.38,Fruits and Vegetables,0.0,19.2,1998,OUT010,train,0,...,1,0,0,0,0,0,0,0,0,0
4,NCD19,53.8614,994.7052,Household,0.0,8.93,1987,OUT013,train,0,...,0,1,0,0,0,0,0,0,0,0


In [17]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14204 entries, 0 to 14203
Data columns (total 37 columns):
Item_Identifier              14204 non-null object
Item_MRP                     14204 non-null float64
Item_Outlet_Sales            14204 non-null float64
Item_Type                    14204 non-null object
Item_Visibility              14204 non-null float64
Item_Weight                  14204 non-null float64
Outlet_Establishment_Year    14204 non-null int64
Outlet_Identifier            14204 non-null object
source                       14204 non-null object
Item_Fat_Content_0           14204 non-null uint8
Item_Fat_Content_1           14204 non-null uint8
Item_Fat_Content_2           14204 non-null uint8
Item_Fat_Content_3           14204 non-null uint8
Item_Fat_Content_4           14204 non-null uint8
Outlet_Location_Type_0       14204 non-null uint8
Outlet_Location_Type_1       14204 non-null uint8
Outlet_Location_Type_2       14204 non-null uint8
Outlet_Size_0                

# Exploratory Data Analysis

In [18]:
#Drop the columns 
data.drop(['Item_Type','Outlet_Establishment_Year'],axis=1,inplace=True)


In [19]:
#Split the data back into test and train
train = data.loc[data['source']=="train"]
test = data.loc[data['source']=="test"]

#Drop unnecessary columns
test.drop(['Item_Outlet_Sales','source'],axis=1,inplace=True)
train.drop(['source'],axis=1,inplace=True)

#Export files into csv format
train.to_csv("train_modified.csv",index=False)
test.to_csv("test_modified.csv",index=False)

# Modelling


In [20]:
# Reading modified data
train2 = pd.read_csv("train_modified.csv")
test2 = pd.read_csv("test_modified.csv")

In [21]:
train2.head()

Unnamed: 0,Item_Identifier,Item_MRP,Item_Outlet_Sales,Item_Visibility,Item_Weight,Outlet_Identifier,Item_Fat_Content_0,Item_Fat_Content_1,Item_Fat_Content_2,Item_Fat_Content_3,...,Outlet_0,Outlet_1,Outlet_2,Outlet_3,Outlet_4,Outlet_5,Outlet_6,Outlet_7,Outlet_8,Outlet_9
0,FDA15,249.8092,3735.138,0.016047,9.3,OUT049,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
1,DRC01,48.2692,443.4228,0.019278,5.92,OUT018,0,0,1,0,...,0,0,0,1,0,0,0,0,0,0
2,FDN15,141.618,2097.27,0.01676,17.5,OUT049,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
3,FDX07,182.095,732.38,0.0,19.2,OUT010,0,0,1,0,...,1,0,0,0,0,0,0,0,0,0
4,NCD19,53.8614,994.7052,0.0,8.93,OUT013,0,1,0,0,...,0,1,0,0,0,0,0,0,0,0


In [22]:
X_train = train2.drop(['Item_Outlet_Sales', 'Outlet_Identifier','Item_Identifier'], axis=1)
y_train = train2.Item_Outlet_Sales

In [23]:
X_test = test2.drop(['Outlet_Identifier','Item_Identifier'], axis=1)


In [24]:
X_train.head()

Unnamed: 0,Item_MRP,Item_Visibility,Item_Weight,Item_Fat_Content_0,Item_Fat_Content_1,Item_Fat_Content_2,Item_Fat_Content_3,Item_Fat_Content_4,Outlet_Location_Type_0,Outlet_Location_Type_1,...,Outlet_0,Outlet_1,Outlet_2,Outlet_3,Outlet_4,Outlet_5,Outlet_6,Outlet_7,Outlet_8,Outlet_9
0,249.8092,0.016047,9.3,0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
1,48.2692,0.019278,5.92,0,0,1,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2,141.618,0.01676,17.5,0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
3,182.095,0.0,19.2,0,0,1,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4,53.8614,0.0,8.93,0,1,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0


In [25]:
y_train.head()

0    3735.1380
1     443.4228
2    2097.2700
3     732.3800
4     994.7052
Name: Item_Outlet_Sales, dtype: float64

# Random Forest

In [26]:
from sklearn.ensemble import RandomForestRegressor
model1 = RandomForestRegressor(n_estimators=100,max_depth=6, min_samples_leaf=50,n_jobs=4)
model1.fit(X_train, y_train)

RandomForestRegressor(max_depth=6, min_samples_leaf=50, n_jobs=4)

In [27]:
y_pred = model1.predict(X_test)
y_pred

array([1658.85608693, 1365.7310414 ,  593.67581097, ..., 1941.16656893,
       3732.9685873 , 1300.76237174])

In [28]:
# Measuring Accuracy
from sklearn.metrics import accuracy_score, r2_score, mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn import metrics
from sklearn.model_selection import KFold

In [29]:
rf_accuracy =model1.score(X_train,y_train)
rf_accuracy

0.6125787425223272

In [30]:
kf=KFold(n_splits=10)
score=cross_val_score(model1,X_train,y_train,cv=kf)
print("Cross Validation Scores are {}".format(score))
print("Average Cross Validation score :{}".format(score.mean()))

Cross Validation Scores are [0.6024493  0.61987209 0.56942182 0.60667455 0.53854823 0.61068668
 0.62447836 0.59626746 0.63417304 0.57495785]
Average Cross Validation score :0.5977529380966179


In [31]:
submission = pd.DataFrame({
'Item_Identifier':test2['Item_Identifier'],
'Outlet_Identifier':test2['Outlet_Identifier'],
'Item_Outlet_Sales': y_pred
},columns=['Item_Identifier','Outlet_Identifier','Item_Outlet_Sales'])

In [32]:
submission.to_csv('solution1.csv',index=False)

# Linear Regression

In [33]:
from sklearn.linear_model import  LinearRegression
model2 = LinearRegression()
model2.fit(X_train, y_train)

LinearRegression()

In [34]:
y_pred = model2.predict(X_test)

In [35]:
y_pred

array([1848.53604783, 1472.81670435, 1875.65285894, ..., 1809.18796433,
       3565.6645235 , 1267.46171871])

In [36]:
lr_accuracy =model2.score(X_train,y_train)
lr_accuracy

0.5635892777270479

In [37]:
kf=KFold(n_splits=10)
score=cross_val_score(model2,X_train,y_train,cv=kf)
print("Cross Validation Scores are {}".format(score))
print("Average Cross Validation score :{}".format(score.mean()))

Cross Validation Scores are [0.56007338 0.58321392 0.54262136 0.56768806 0.51084995 0.57705962
 0.57864355 0.55527242 0.57979323 0.54809404]
Average Cross Validation score :0.5603309560979115


In [38]:
submission = pd.DataFrame({
'Item_Identifier':test2['Item_Identifier'],
'Outlet_Identifier':test2['Outlet_Identifier'],
'Item_Outlet_Sales': y_pred
},columns=['Item_Identifier','Outlet_Identifier','Item_Outlet_Sales'])

In [39]:
submission.to_csv('solution2.csv',index=False)

# Decision Tree

In [40]:
from sklearn.tree import DecisionTreeRegressor
model3 = DecisionTreeRegressor(max_depth=15,min_samples_leaf=300)
model3.fit(X_train, y_train)

DecisionTreeRegressor(max_depth=15, min_samples_leaf=300)

In [41]:
y_pred = model2.predict(X_test)

In [42]:
y_pred

array([1848.53604783, 1472.81670435, 1875.65285894, ..., 1809.18796433,
       3565.6645235 , 1267.46171871])

In [43]:
dt_accuracy =model3.score(X_train,y_train)
dt_accuracy

0.5884050821570488

In [44]:
kf=KFold(n_splits=10)
score=cross_val_score(model3,X_train,y_train,cv=kf)
print("Cross Validation Scores are {}".format(score))
print("Average Cross Validation score :{}".format(score.mean()))

Cross Validation Scores are [0.5855529  0.57810651 0.54307334 0.58291815 0.52136587 0.56946275
 0.59113351 0.56350217 0.60017802 0.54875222]
Average Cross Validation score :0.5684045432299738


In [45]:
submission = pd.DataFrame({
'Item_Identifier':test2['Item_Identifier'],
'Outlet_Identifier':test2['Outlet_Identifier'],
'Item_Outlet_Sales': y_pred
},columns=['Item_Identifier','Outlet_Identifier','Item_Outlet_Sales'])

In [46]:
submission.to_csv('solution3.csv',index=False)

# Tuning

In [47]:
from hyperopt import tpe, hp, fmin, STATUS_OK,Trials

In [48]:
space={'n_estimators':hp.uniform("n_estimators",100,500),
       "max_depth":hp.uniform("max_depth",5,20),
       "min_samples_leaf":hp.uniform("min_samples_leaf",1,5),
       "min_samples_split":hp.uniform("min_samples_split", 2,0)}

In [49]:
# define objective function

def hyperparameter_tuning(params):
    clf =RandomForestRegressor(**params)
    acc = cross_val_score(clf, X_train,y_train,scoring="accuracy").mean()
    return {"loss": -acc, "status": STATUS_OK}

In [50]:
# Initialize trials object
trials = Trials()

best = fmin(
    fn=hyperparameter_tuning,
    space = space, 
    algo=tpe.suggest, 
    max_evals=50, 
    trials=trials
)

print("Best: {}".format(best))

100%|██████████████████████████████████████████████████████████████████| 50/50 [00:01<00:00, 33.26trial/s, best loss=?]


AllTrialsFailed: 

In [51]:
trials.results

[{'loss': nan, 'status': 'ok'},
 {'loss': nan, 'status': 'ok'},
 {'loss': nan, 'status': 'ok'},
 {'loss': nan, 'status': 'ok'},
 {'loss': nan, 'status': 'ok'},
 {'loss': nan, 'status': 'ok'},
 {'loss': nan, 'status': 'ok'},
 {'loss': nan, 'status': 'ok'},
 {'loss': nan, 'status': 'ok'},
 {'loss': nan, 'status': 'ok'},
 {'loss': nan, 'status': 'ok'},
 {'loss': nan, 'status': 'ok'},
 {'loss': nan, 'status': 'ok'},
 {'loss': nan, 'status': 'ok'},
 {'loss': nan, 'status': 'ok'},
 {'loss': nan, 'status': 'ok'},
 {'loss': nan, 'status': 'ok'},
 {'loss': nan, 'status': 'ok'},
 {'loss': nan, 'status': 'ok'},
 {'loss': nan, 'status': 'ok'},
 {'loss': nan, 'status': 'ok'},
 {'loss': nan, 'status': 'ok'},
 {'loss': nan, 'status': 'ok'},
 {'loss': nan, 'status': 'ok'},
 {'loss': nan, 'status': 'ok'},
 {'loss': nan, 'status': 'ok'},
 {'loss': nan, 'status': 'ok'},
 {'loss': nan, 'status': 'ok'},
 {'loss': nan, 'status': 'ok'},
 {'loss': nan, 'status': 'ok'},
 {'loss': nan, 'status': 'ok'},
 {'loss'