**Changes Done**

1) Dropped the "Item_Identifier" categorical column from the data.

2) Used Decision Tree Regressor

In [1]:
# Importing libraries 

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

# Reading the datasets

In [2]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

In [3]:
train.shape, test.shape

((8523, 12), (5681, 11))

# Data Preprocessing

In [4]:
train['Item_Fat_Content'] = train['Item_Fat_Content'].replace(["Low Fat", "low fat"], "LF")
train['Item_Fat_Content'] = train['Item_Fat_Content'].replace("Regular", "reg")

test['Item_Fat_Content'] = test['Item_Fat_Content'].replace(["Low Fat", "low fat"], "LF")
test['Item_Fat_Content'] = test['Item_Fat_Content'].replace("Regular", "reg")

**Replacing null values in "Item_Weight" column with the average weight of respective "Item_Type"** 

In [5]:
grouped = train.groupby("Item_Type")["Item_Weight"]

fill_values = {}
for name, group in grouped:
    fill_values[name] = group.mean()
    
fill_func = lambda g:g.fillna(fill_values[g.name])
train["Item_Weight"] = train.groupby("Item_Type")["Item_Weight"].apply(fill_func)
test["Item_Weight"] = test.groupby("Item_Type")["Item_Weight"].apply(fill_func)

In [6]:
print(train.Item_Weight.isna().sum())
print(test.Item_Weight.isna().sum())

0
0


**Create another category for missing values in "Outlet_Size" column**

In [7]:
train['Outlet_Size'] = train['Outlet_Size'].replace(np.NaN, "Unknown")
test['Outlet_Size'] = test['Outlet_Size'].replace(np.NaN, "Unknown")

In [8]:
train.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,LF,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,reg,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,LF,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,reg,0.0,Fruits and Vegetables,182.095,OUT010,1998,Unknown,Tier 3,Grocery Store,732.38
4,NCD19,8.93,LF,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


In [9]:
train.Item_Identifier.nunique()

1559

**Concatenating the train and test data**

In [10]:
labels = train.iloc[:, -1]
train = train.iloc[:, :-1]

In [11]:
data = pd.concat([train, test], ignore_index=True)
data.drop('Item_Identifier', axis=1, inplace=True)

In [12]:
data.shape

(14204, 10)

In [13]:
data.head()

Unnamed: 0,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type
0,9.3,LF,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1
1,5.92,reg,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2
2,17.5,LF,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1
3,19.2,reg,0.0,Fruits and Vegetables,182.095,OUT010,1998,Unknown,Tier 3,Grocery Store
4,8.93,LF,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1


**Applying Categorical Encoding**

In [14]:
categorical_var = np.where(data.dtypes != np.float)[0]

columns_names_encod = data.columns[np.array(categorical_var)]
data_transformed = pd.get_dummies(data, columns=columns_names_encod, drop_first=True)

In [15]:
data_transformed.shape

(14204, 44)

**Splitting the data back**

In [16]:
final_train = data_transformed[:train.shape[0]]
final_test = data_transformed[train.shape[0]:]

In [17]:
final_train.shape, final_test.shape

((8523, 44), (5681, 44))

**Creating validation set from training set**

In [18]:
from sklearn.model_selection import train_test_split

X_train, X_cv, y_train, y_cv = train_test_split(final_train, labels, test_size=0.2, random_state=21, shuffle=False)

In [19]:
final_train.head()

Unnamed: 0,Item_Weight,Item_Visibility,Item_MRP,Item_Fat_Content_reg,Item_Type_Breads,Item_Type_Breakfast,Item_Type_Canned,Item_Type_Dairy,Item_Type_Frozen Foods,Item_Type_Fruits and Vegetables,...,Outlet_Establishment_Year_2007,Outlet_Establishment_Year_2009,Outlet_Size_Medium,Outlet_Size_Small,Outlet_Size_Unknown,Outlet_Location_Type_Tier 2,Outlet_Location_Type_Tier 3,Outlet_Type_Supermarket Type1,Outlet_Type_Supermarket Type2,Outlet_Type_Supermarket Type3
0,9.3,0.016047,249.8092,0,0,0,0,1,0,0,...,0,0,1,0,0,0,0,1,0,0
1,5.92,0.019278,48.2692,1,0,0,0,0,0,0,...,0,1,1,0,0,0,1,0,1,0
2,17.5,0.01676,141.618,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,1,0,0
3,19.2,0.0,182.095,1,0,0,0,0,0,1,...,0,0,0,0,1,0,1,0,0,0
4,8.93,0.0,53.8614,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,1,0,0


In [20]:
X_train.head()

Unnamed: 0,Item_Weight,Item_Visibility,Item_MRP,Item_Fat_Content_reg,Item_Type_Breads,Item_Type_Breakfast,Item_Type_Canned,Item_Type_Dairy,Item_Type_Frozen Foods,Item_Type_Fruits and Vegetables,...,Outlet_Establishment_Year_2007,Outlet_Establishment_Year_2009,Outlet_Size_Medium,Outlet_Size_Small,Outlet_Size_Unknown,Outlet_Location_Type_Tier 2,Outlet_Location_Type_Tier 3,Outlet_Type_Supermarket Type1,Outlet_Type_Supermarket Type2,Outlet_Type_Supermarket Type3
0,9.3,0.016047,249.8092,0,0,0,0,1,0,0,...,0,0,1,0,0,0,0,1,0,0
1,5.92,0.019278,48.2692,1,0,0,0,0,0,0,...,0,1,1,0,0,0,1,0,1,0
2,17.5,0.01676,141.618,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,1,0,0
3,19.2,0.0,182.095,1,0,0,0,0,0,1,...,0,0,0,0,1,0,1,0,0,0
4,8.93,0.0,53.8614,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,1,0,0


In [21]:
X_train.shape, X_cv.shape

((6818, 44), (1705, 44))

**Feature Scaling**

In [22]:
"""
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()

scaled_train_columns = sc.fit_transform(X_train.iloc[:, [0, 1, 2]].values)
scaled_cv_columns = sc.transform(X_cv.iloc[:, [0, 1, 2]].values)
scaled_test_columns = sc.transform(final_test.iloc[:, [0, 1, 2]].values)

"""
# Since it is a decision tree classifier, no need for scaling

'\nfrom sklearn.preprocessing import StandardScaler\nsc = StandardScaler()\n\nscaled_train_columns = sc.fit_transform(X_train.iloc[:, [0, 1, 2]].values)\nscaled_cv_columns = sc.transform(X_cv.iloc[:, [0, 1, 2]].values)\nscaled_test_columns = sc.transform(final_test.iloc[:, [0, 1, 2]].values)\n\n'

In [24]:
"""
X_train.loc[:, ['Item_Weight', 'Item_Visibility', 'Item_MRP']] = scaled_train_columns
X_cv.loc[:, ['Item_Weight', 'Item_Visibility', 'Item_MRP']] = scaled_cv_columns
final_test.loc[:, ['Item_Weight', 'Item_Visibility', 'Item_MRP']] =scaled_test_columns
"""

"\nX_train.loc[:, ['Item_Weight', 'Item_Visibility', 'Item_MRP']] = scaled_train_columns\nX_cv.loc[:, ['Item_Weight', 'Item_Visibility', 'Item_MRP']] = scaled_cv_columns\nfinal_test.loc[:, ['Item_Weight', 'Item_Visibility', 'Item_MRP']] =scaled_test_columns\n"

# Training the data and getting results for cross-validation set

In [25]:
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error

param_grid = [{'max_depth': [ 4, 6, 8, 10, 12, 14, 16, 18, 20],
               'min_samples_split': [10, 8, 6, 4, 2],
               'min_samples_leaf': [1, 5, 10, 20, 50, 100],
    
             }]

dt_reg = DecisionTreeRegressor()

grid_search = GridSearchCV(dt_reg, param_grid, cv=5, scoring='neg_mean_squared_error', return_train_score=True)

grid_search.fit(X_train, y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse',
                                             max_depth=None, max_features=None,
                                             max_leaf_nodes=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             presort='deprecated',
                                             random_state=None,
                                             splitter='best'),
             iid='deprecated', n_jobs=None,
             param_grid=[{'max_depth': [4, 6, 8, 10, 12, 14, 16, 18, 20],
                          'min_samples_leaf': [1, 5, 10, 20, 50, 100],
          

In [26]:
# To get all the evaluation scores
cvres = grid_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-mean_score), params)

1135.0883663075838 {'max_depth': 4, 'min_samples_leaf': 1, 'min_samples_split': 10}
1135.0883663075838 {'max_depth': 4, 'min_samples_leaf': 1, 'min_samples_split': 8}
1135.0883663075838 {'max_depth': 4, 'min_samples_leaf': 1, 'min_samples_split': 6}
1135.0883663075838 {'max_depth': 4, 'min_samples_leaf': 1, 'min_samples_split': 4}
1135.088366307584 {'max_depth': 4, 'min_samples_leaf': 1, 'min_samples_split': 2}
1133.947149883359 {'max_depth': 4, 'min_samples_leaf': 5, 'min_samples_split': 10}
1133.947149883359 {'max_depth': 4, 'min_samples_leaf': 5, 'min_samples_split': 8}
1133.947149883359 {'max_depth': 4, 'min_samples_leaf': 5, 'min_samples_split': 6}
1133.9471498833593 {'max_depth': 4, 'min_samples_leaf': 5, 'min_samples_split': 4}
1133.947149883359 {'max_depth': 4, 'min_samples_leaf': 5, 'min_samples_split': 2}
1134.4940856263902 {'max_depth': 4, 'min_samples_leaf': 10, 'min_samples_split': 10}
1134.4940856263902 {'max_depth': 4, 'min_samples_leaf': 10, 'min_samples_split': 8}
1134

In [31]:
from sklearn.metrics import mean_squared_error

final_model = grid_search.best_estimator_
print(grid_search.best_params_)

y_train_pred = final_model.predict(X_train)
train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
print(train_rmse)

y_cv_pred = final_model.predict(X_cv)
cv_rmse = np.sqrt(mean_squared_error(y_cv, y_cv_pred))
print(cv_rmse)

{'max_depth': 6, 'min_samples_leaf': 50, 'min_samples_split': 4}
1056.8782707256084
1085.092035514753


# Testing the model on Test Dataset

In [32]:
predictions_test = final_model.predict(final_test)

In [33]:
predictions_test = pd.Series(predictions_test)

In [34]:
frame = {'Item_Identifier': test.Item_Identifier, 'Outlet_Identifier':test.Outlet_Identifier, 
         'Item_Outlet_Sales': predictions_test}
submission = pd.DataFrame(frame)
submission.to_csv('Decision_Tree_Submission_8.csv', index=False)