**Changes Done**

1) Dropped the "Item_Identifier" column from the data.

2) Used Random Forest Model with GridSearchCV/RandomizedSearchCV

3) No feature scaling is required: a Random Forest is built from decision trees, and decision trees do not require feature scaling.

In [1]:
# Importing libraries 

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

# Reading the datasets

In [2]:
# Load both splits from CSV files located in the working directory.
train, test = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))

In [3]:
# Display the (rows, columns) shape of each split as a pair of tuples.
train.shape, test.shape

((8523, 12), (5681, 11))

# Data Preprocessing

In [4]:
# Collapse the spelled-out fat-content labels onto the short codes "LF" and
# "reg", applying the same mapping to both splits.
fat_content_codes = {"Low Fat": "LF", "low fat": "LF", "Regular": "reg"}

for frame in (train, test):
    frame['Item_Fat_Content'] = frame['Item_Fat_Content'].replace(fat_content_codes)

**Replacing null values in the "Item_Weight" column with the average weight of the respective "Item_Type"**

In [5]:
# Mean Item_Weight per Item_Type, computed from the training split only; the
# same training means are then used to fill the test split.
grouped = train.groupby("Item_Type")["Item_Weight"]
fill_values = grouped.mean().to_dict()

# Fill each missing weight with the training mean of that row's Item_Type.
# Series.map + fillna keeps the original row index, whereas the result index of
# groupby(...).apply depends on the pandas version (group keys are prepended by
# default in pandas >= 2.0, which breaks assigning the result back as a column).
# An Item_Type that is absent from the training split maps to NaN and is left
# unfilled instead of raising KeyError.
train["Item_Weight"] = train["Item_Weight"].fillna(train["Item_Type"].map(fill_values))
test["Item_Weight"] = test["Item_Weight"].fillna(test["Item_Type"].map(fill_values))

In [6]:
# Print how many Item_Weight values are still missing, train first, then test.
for frame in (train, test):
    print(frame["Item_Weight"].isna().sum())

0
0


**Create another category for missing values in "Outlet_Size" column**

In [7]:
# Treat a missing Outlet_Size as its own "Unknown" category.
# fillna is used instead of replace(np.NaN, ...): the np.NaN alias was removed
# in NumPy 2.0, so the previous spelling raises AttributeError there.
train['Outlet_Size'] = train['Outlet_Size'].fillna("Unknown")
test['Outlet_Size'] = test['Outlet_Size'].fillna("Unknown")

In [8]:
# Preview the first rows of the training split after the cleaning steps above.
train.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,LF,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,reg,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,LF,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,reg,0.0,Fruits and Vegetables,182.095,OUT010,1998,Unknown,Tier 3,Grocery Store,732.38
4,NCD19,8.93,LF,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


In [9]:
# Number of distinct Item_Identifier values in the training split.
train["Item_Identifier"].nunique()

1559

**Concatenating the train and test data**

In [10]:
# Separate the target from the training features by column name.
# Positional slicing (iloc[:, -1]) strips one more feature column every time
# the cell is re-run; selecting by name and guarding on the column's presence
# makes the cell safe to execute more than once.
target_column = "Item_Outlet_Sales"
if target_column in train.columns:
    labels = train[target_column]
    train = train.drop(columns=target_column)

In [11]:
# Stack the training rows on top of the test rows under a fresh 0..n-1 index,
# then discard the Item_Identifier column.
data = pd.concat([train, test], ignore_index=True).drop('Item_Identifier', axis=1)

In [12]:
# Shape of the combined train + test frame.
data.shape

(14204, 10)

In [13]:
# Preview the combined frame; Item_Identifier has been dropped at this point.
data.head()

Unnamed: 0,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type
0,9.3,LF,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1
1,5.92,reg,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2
2,17.5,LF,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1
3,19.2,reg,0.0,Fruits and Vegetables,182.095,OUT010,1998,Unknown,Tier 3,Grocery Store
4,8.93,LF,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1


**Applying Categorical Encoding**

In [14]:
# Every column whose dtype is not float64 is treated as categorical.
# np.float64 replaces the np.float alias, which was removed in NumPy 1.24;
# np.float was only an alias for the builtin float (float64), so the set of
# selected columns is unchanged.
categorical_var = np.where(data.dtypes != np.float64)[0]

columns_names_encod = data.columns[categorical_var]
# drop_first=True omits one dummy per encoded column. dtype=np.uint8 pins the
# integer 0/1 dummies on every pandas version (pandas >= 2.0 defaults to bool).
data_transformed = pd.get_dummies(data, columns=columns_names_encod, drop_first=True, dtype=np.uint8)

In [15]:
# Shape of the frame after one-hot encoding.
data_transformed.shape

(14204, 44)

**Splitting the data back**

In [16]:
# The combined frame was built as train followed by test, so the first
# len(train) rows are the training rows and the remainder are the test rows.
n_train_rows = len(train)
final_train = data_transformed.iloc[:n_train_rows]
final_test = data_transformed.iloc[n_train_rows:]

In [17]:
# Shapes of the encoded training and test frames after splitting them back.
final_train.shape, final_test.shape

((8523, 44), (5681, 44))

**Creating validation set from training set**

In [18]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the training rows as a validation set. shuffle=False keeps
# the rows in their existing order, so the split is a contiguous cut.
X_train, X_cv, y_train, y_cv = train_test_split(
    final_train,
    labels,
    test_size=0.2,
    random_state=21,
    shuffle=False,
)

In [19]:
# Preview the encoded training frame.
final_train.head()

Unnamed: 0,Item_Weight,Item_Visibility,Item_MRP,Item_Fat_Content_reg,Item_Type_Breads,Item_Type_Breakfast,Item_Type_Canned,Item_Type_Dairy,Item_Type_Frozen Foods,Item_Type_Fruits and Vegetables,...,Outlet_Establishment_Year_2007,Outlet_Establishment_Year_2009,Outlet_Size_Medium,Outlet_Size_Small,Outlet_Size_Unknown,Outlet_Location_Type_Tier 2,Outlet_Location_Type_Tier 3,Outlet_Type_Supermarket Type1,Outlet_Type_Supermarket Type2,Outlet_Type_Supermarket Type3
0,9.3,0.016047,249.8092,0,0,0,0,1,0,0,...,0,0,1,0,0,0,0,1,0,0
1,5.92,0.019278,48.2692,1,0,0,0,0,0,0,...,0,1,1,0,0,0,1,0,1,0
2,17.5,0.01676,141.618,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,1,0,0
3,19.2,0.0,182.095,1,0,0,0,0,0,1,...,0,0,0,0,1,0,1,0,0,0
4,8.93,0.0,53.8614,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,1,0,0


In [20]:
# Preview the fitting portion of the split.
X_train.head()

Unnamed: 0,Item_Weight,Item_Visibility,Item_MRP,Item_Fat_Content_reg,Item_Type_Breads,Item_Type_Breakfast,Item_Type_Canned,Item_Type_Dairy,Item_Type_Frozen Foods,Item_Type_Fruits and Vegetables,...,Outlet_Establishment_Year_2007,Outlet_Establishment_Year_2009,Outlet_Size_Medium,Outlet_Size_Small,Outlet_Size_Unknown,Outlet_Location_Type_Tier 2,Outlet_Location_Type_Tier 3,Outlet_Type_Supermarket Type1,Outlet_Type_Supermarket Type2,Outlet_Type_Supermarket Type3
0,9.3,0.016047,249.8092,0,0,0,0,1,0,0,...,0,0,1,0,0,0,0,1,0,0
1,5.92,0.019278,48.2692,1,0,0,0,0,0,0,...,0,1,1,0,0,0,1,0,1,0
2,17.5,0.01676,141.618,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,1,0,0
3,19.2,0.0,182.095,1,0,0,0,0,0,1,...,0,0,0,0,1,0,1,0,0,0
4,8.93,0.0,53.8614,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,1,0,0


In [21]:
# Shapes of the fitting and validation feature frames.
X_train.shape, X_cv.shape

((6818, 44), (1705, 44))

**Feature Scaling**

In [22]:
"""
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()

scaled_train_columns = sc.fit_transform(X_train.iloc[:, [0, 1, 2]].values)
scaled_cv_columns = sc.transform(X_cv.iloc[:, [0, 1, 2]].values)
scaled_test_columns = sc.transform(final_test.iloc[:, [0, 1, 2]].values)

X_train.loc[:, ['Item_Weight', 'Item_Visibility', 'Item_MRP']] = scaled_train_columns
X_cv.loc[:, ['Item_Weight', 'Item_Visibility', 'Item_MRP']] = scaled_cv_columns
final_test.loc[:, ['Item_Weight', 'Item_Visibility', 'Item_MRP']] =scaled_test_columns

"""

"\nfrom sklearn.preprocessing import StandardScaler\nsc = StandardScaler()\n\nscaled_train_columns = sc.fit_transform(X_train.iloc[:, [0, 1, 2]].values)\nscaled_cv_columns = sc.transform(X_cv.iloc[:, [0, 1, 2]].values)\nscaled_test_columns = sc.transform(final_test.iloc[:, [0, 1, 2]].values)\n\nX_train.loc[:, ['Item_Weight', 'Item_Visibility', 'Item_MRP']] = scaled_train_columns\nX_cv.loc[:, ['Item_Weight', 'Item_Visibility', 'Item_MRP']] = scaled_cv_columns\nfinal_test.loc[:, ['Item_Weight', 'Item_Visibility', 'Item_MRP']] =scaled_test_columns\n\n"

# Training the model and evaluating on the validation set

In [26]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Seed shared by the forest and the randomized search. Without it, both the
# sampled hyperparameter candidates and the fitted trees change on every run,
# so the selected parameters and reported RMSE cannot be reproduced.
SEARCH_SEED = 21

# Candidate values the randomized search samples from.
param_grid = [{'n_estimators': [50, 100, 130, 150],
               'max_features': [6, 8, 10, 15, 20, 25, 30],
               'max_depth': [4, 6, 8, 10, 12, 14, 18, 20],
               'min_samples_split': [15, 10, 5],
               'bootstrap': [True, False],
              },]

forest_reg = RandomForestRegressor(random_state=SEARCH_SEED)

# Despite the name, this is a RandomizedSearchCV: 100 sampled candidates, each
# scored with 3-fold cross-validation on negative mean squared error.
# The name `grid_search` is kept because later cells refer to it.
grid_search = RandomizedSearchCV(forest_reg, param_grid, n_iter=100, cv=3,
                                 scoring='neg_mean_squared_error',
                                 return_train_score=True,
                                 random_state=SEARCH_SEED)

grid_search.fit(X_train, y_train)

RandomizedSearchCV(cv=3, error_score=nan,
                   estimator=RandomForestRegressor(bootstrap=True,
                                                   ccp_alpha=0.0,
                                                   criterion='mse',
                                                   max_depth=None,
                                                   max_features='auto',
                                                   max_leaf_nodes=None,
                                                   max_samples=None,
                                                   min_impurity_decrease=0.0,
                                                   min_impurity_split=None,
                                                   min_samples_leaf=1,
                                                   min_samples_split=2,
                                                   min_weight_fraction_leaf=0.0,
                                                   n_estimators=100,
                              

In [29]:
# Take the best estimator found by the search and report the chosen
# hyperparameters together with the per-feature importances.
final_model = grid_search.best_estimator_
print(grid_search.best_params_)
print(final_model.feature_importances_)

# Predict on the rows used for fitting and on the held-out validation rows.
y_train_pred = final_model.predict(X_train)
y_cv_pred = final_model.predict(X_cv)

# RMSE is the square root of the mean squared error.
train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
cv_rmse = np.sqrt(mean_squared_error(y_cv, y_cv_pred))

print(train_rmse)
print(cv_rmse)

{'n_estimators': 100, 'min_samples_split': 10, 'max_features': 25, 'max_depth': 6, 'bootstrap': True}
[5.03166126e-03 8.37247630e-03 5.50462913e-01 7.71759297e-04
 2.07836067e-04 2.28447750e-04 3.01528011e-04 5.63395124e-04
 1.32182911e-04 1.58449086e-03 4.91895942e-04 3.76766942e-04
 8.11923637e-04 2.69350442e-04 6.74648633e-05 1.87061391e-04
 4.74066888e-04 7.01119083e-04 2.56010207e-04 1.33506785e-04
 7.99943973e-05 1.06549504e-02 5.65362891e-02 8.56622084e-02
 8.73450669e-05 3.50643258e-04 7.20633180e-05 8.23966123e-05
 5.32957831e-05 1.20438775e-04 7.19970034e-02 1.73977022e-05
 1.44384700e-04 1.22838064e-04 3.75146179e-05 1.41767571e-02
 1.53637267e-02 1.61422361e-03 3.59284926e-03 9.23081124e-04
 1.30581914e-03 8.23629819e-02 1.18255116e-02 7.13904293e-02]
1046.0665953856223
1086.7903813408136


# Testing the model on Test Dataset

In [30]:
# Predict on the encoded test frame with the selected model.
predictions_test = final_model.predict(final_test)

In [31]:
# Wrap the predictions in a Series (default 0..n-1 index) so they can be placed
# alongside columns of `test` in the submission frame.
predictions_test = pd.Series(predictions_test)

In [32]:
# Assemble the submission table: the two identifier columns from the test split
# next to the predicted sales, then write it to CSV without the index column.
frame = {
    'Item_Identifier': test['Item_Identifier'],
    'Outlet_Identifier': test['Outlet_Identifier'],
    'Item_Outlet_Sales': predictions_test,
}
submission = pd.DataFrame(frame)
submission.to_csv('Random_Forest_Submission_9.csv', index=False)