In [1]:
import os, sys
import pandas as pd 
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt 
%matplotlib inline 

import warnings
warnings.filterwarnings('always')
warnings.filterwarnings('ignore')


from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [2]:
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

In [3]:
train , test = train_test_split(train_df,test_size=0.2,random_state=42)

In [4]:
test.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
7503,FDI28,14.3,Low Fat,0.0263,Frozen Foods,79.4302,OUT013,1987,High,Tier 3,Supermarket Type1,1743.0644
2957,NCM17,7.93,Low Fat,0.071136,Health and Hygiene,42.7086,OUT046,1997,Small,Tier 1,Supermarket Type1,356.8688
7031,FDC14,14.5,Regular,0.041313,Canned,42.0454,OUT049,1999,Medium,Tier 1,Supermarket Type1,377.5086
1084,DRC36,,Regular,0.044767,Soft Drinks,173.7054,OUT027,1985,Medium,Tier 3,Supermarket Type3,5778.4782
856,FDS27,10.195,Regular,0.012456,Meat,197.511,OUT035,2004,Small,Tier 2,Supermarket Type1,2356.932


In [6]:
target_column_name = 'Item_Outlet_Sales'
input_feature_train_df = train.drop(columns=[target_column_name],axis=1)
target_feature_train_df = train[target_column_name]

input_feature_test_df = test.drop(columns=target_column_name,axis=1)
target_feature_test_df = test[target_column_name]

In [7]:
# function to get numerical and categorical columns 
def Numerical_categorical_column(dataframe):
    '''
    This function returns the numerical and categorical column 
    return numerical_columns,categorical_columns
    '''
    numerical_columns = [i for i in dataframe.columns if  dataframe[i].dtype != 'O']
    categorical_columns = [i for i in dataframe.columns if  dataframe[i].dtype == 'O']
    return numerical_columns,categorical_columns

numerical_features,categorical_features = Numerical_categorical_column(input_feature_train_df)
print(f'numerical_features : {numerical_features}')
print(f'categorical_features : {categorical_features}')

numerical_features : ['Item_Weight', 'Item_Visibility', 'Item_MRP', 'Outlet_Establishment_Year']
categorical_features : ['Item_Identifier', 'Item_Fat_Content', 'Item_Type', 'Outlet_Identifier', 'Outlet_Size', 'Outlet_Location_Type', 'Outlet_Type']


In [8]:
# Replacing same values with different names 
input_feature_train_df['Item_Fat_Content'] = input_feature_train_df['Item_Fat_Content'].replace(['LF','low fat','reg'],['Low Fat','Low Fat','Regular'])
input_feature_test_df['Item_Fat_Content'] = input_feature_test_df['Item_Fat_Content'].replace(['LF','low fat','reg'],['Low Fat','Low Fat','Regular'])

In [9]:
#Removing unwanted data from name item_identifier
input_feature_train_df['Item_Identifier'] = input_feature_train_df['Item_Identifier'].apply(lambda x:x[:2])
input_feature_test_df['Item_Identifier'] = input_feature_test_df['Item_Identifier'].apply(lambda x:x[:2])

In [10]:
# Feature Engineering of Outlet Establishment year
input_feature_train_df['Outlet_age'] = 2013 - input_feature_train_df['Outlet_Establishment_Year']
input_feature_train_df.drop(columns=['Outlet_Establishment_Year'],inplace=True)

input_feature_test_df['Outlet_age'] = 2013 - test_df['Outlet_Establishment_Year']
input_feature_test_df.drop(columns=['Outlet_Establishment_Year'],inplace=True)

In [11]:
# # Those product who are non-consumbale but have fat content will replace them with non-ediable fat content

input_feature_train_df.loc[input_feature_train_df['Item_Identifier']=='NC','Item_Fat_Content'] = 'Non Edible'
input_feature_test_df.loc[input_feature_test_df['Item_Identifier']=='NC','Item_Fat_Content'] = 'Non Edible'

In [12]:
# dropping unwanted columns will work on it after some time 
input_feature_train_df.drop(columns=['Item_Type','Outlet_Identifier'],inplace = True)
input_feature_test_df.drop(columns=['Item_Type','Outlet_Identifier'],inplace = True)

In [13]:
numerical_features,categorical_features = Numerical_categorical_column(input_feature_train_df)
numerical_features

['Item_Weight', 'Item_Visibility', 'Item_MRP', 'Outlet_age']

In [14]:
categorical_features

['Item_Identifier',
 'Item_Fat_Content',
 'Outlet_Size',
 'Outlet_Location_Type',
 'Outlet_Type']

In [15]:
input_feature_test_df

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_MRP,Outlet_Size,Outlet_Location_Type,Outlet_Type,Outlet_age
7503,FD,14.300,Low Fat,0.026300,79.4302,High,Tier 3,Supermarket Type1,
2957,NC,7.930,Non Edible,0.071136,42.7086,Small,Tier 1,Supermarket Type1,26.0
7031,FD,14.500,Regular,0.041313,42.0454,Medium,Tier 1,Supermarket Type1,
1084,DR,,Regular,0.044767,173.7054,Medium,Tier 3,Supermarket Type3,11.0
856,FD,10.195,Regular,0.012456,197.5110,Small,Tier 2,Supermarket Type1,9.0
...,...,...,...,...,...,...,...,...,...
7205,FD,11.800,Regular,0.093656,127.1704,Small,Tier 1,Supermarket Type1,
3257,FD,7.020,Low Fat,0.000000,148.1734,Medium,Tier 3,Supermarket Type2,28.0
6346,FD,14.500,Regular,0.041215,42.0454,High,Tier 3,Supermarket Type1,
6318,FD,9.800,Regular,0.141184,50.5008,Medium,Tier 3,Supermarket Type2,


In [16]:
train_num_df = input_feature_train_df[numerical_features]
train_cat_df = input_feature_train_df[categorical_features]
train_cat_df = train_cat_df.apply(LabelEncoder().fit_transform)
train_cat_df['Outlet_Size'].replace(3,np.nan,inplace=True)

test_num_df = input_feature_test_df[numerical_features]
test_cat_df = input_feature_test_df[categorical_features]
test_cat_df = test_cat_df .apply(LabelEncoder().fit_transform)
test_cat_df['Outlet_Size'].replace(3,np.nan,inplace=True)

## Handling Missing Values
### Numerical Features

In [17]:
imputer_num=KNNImputer(n_neighbors=3, weights='uniform',missing_values=np.nan)
new_array=imputer_num.fit_transform(train_num_df) # impute the missing values
train_num_df = pd.DataFrame(data=new_array,columns=train_num_df.columns)


scaler = StandardScaler()
num_array = scaler.fit_transform(train_num_df)
train_num_df = pd.DataFrame(num_array,columns=train_num_df.columns)

train_num_df

Unnamed: 0,Item_Weight,Item_Visibility,Item_MRP,Outlet_age
0,-0.775861,-0.600703,0.470709,-0.136169
1,1.165811,-0.362159,0.457877,-0.493521
2,1.074438,0.194933,-0.482625,0.102066
3,-1.044269,-0.704944,-1.603553,-0.493521
4,-0.010614,1.383177,0.218375,0.102066
...,...,...,...,...
6813,-0.799846,4.282848,-0.043511,-0.017052
6814,0.617574,1.001006,-1.059078,-1.089109
6815,1.074438,-0.916931,1.526207,-0.493521
6816,1.702626,-0.228187,-0.383072,-1.089109


In [18]:
imputer_cat=KNNImputer(n_neighbors=3, weights='uniform',missing_values=np.nan)
new_array=imputer_cat.fit_transform(train_cat_df) # impute the missing values
train_cat_df = pd.DataFrame(data=new_array,columns=train_cat_df.columns)



scaler = StandardScaler()
cat_array = scaler.fit_transform(train_cat_df)
train_cat_df = pd.DataFrame(cat_array,columns=train_cat_df.columns)
train_cat_df

Unnamed: 0,Item_Identifier,Item_Fat_Content,Outlet_Size,Outlet_Location_Type,Outlet_Type
0,-0.182954,1.226883,-0.429160,-1.383482,-0.259489
1,1.727281,0.109479,0.897829,-0.149659,-0.259489
2,-0.182954,1.226883,0.897829,-1.383482,-0.259489
3,-0.182954,-1.007925,0.897829,-0.149659,-0.259489
4,-0.182954,-1.007925,0.897829,-1.383482,-0.259489
...,...,...,...,...,...
6813,-0.182954,1.226883,-1.756149,1.084165,-1.509802
6814,-0.182954,-1.007925,0.897829,-0.149659,-0.259489
6815,1.727281,0.109479,0.897829,-0.149659,-0.259489
6816,-0.182954,-1.007925,0.897829,-0.149659,-0.259489


In [19]:
train_df = pd.concat([train_num_df,train_cat_df],axis=1)

In [20]:
# Piepline for Numerical Columns 
from sklearn.pipeline import Pipeline
num_pipeline = Pipeline(steps=[
    ('imputer',KNNImputer(n_neighbors=3, weights='uniform',missing_values=np.nan)),
    ('scaler',StandardScaler())
])

### Categorical Features

In [21]:
cat_df = train_df[categorical_features]
cat_df

Unnamed: 0,Item_Identifier,Item_Fat_Content,Outlet_Size,Outlet_Location_Type,Outlet_Type
0,-0.182954,1.226883,-0.429160,-1.383482,-0.259489
1,1.727281,0.109479,0.897829,-0.149659,-0.259489
2,-0.182954,1.226883,0.897829,-1.383482,-0.259489
3,-0.182954,-1.007925,0.897829,-0.149659,-0.259489
4,-0.182954,-1.007925,0.897829,-1.383482,-0.259489
...,...,...,...,...,...
6813,-0.182954,1.226883,-1.756149,1.084165,-1.509802
6814,-0.182954,-1.007925,0.897829,-0.149659,-0.259489
6815,1.727281,0.109479,0.897829,-0.149659,-0.259489
6816,-0.182954,-1.007925,0.897829,-0.149659,-0.259489


In [22]:
cat_df = cat_df.apply(LabelEncoder().fit_transform)


In [23]:
# While encoding Nan values of Outlet_size got encoded with 3 so we are replacing it 
cat_df['Outlet_Size'].replace(3,np.nan,inplace=True)

In [25]:
cat_pipeline = Pipeline(steps=[
    ('imputer',KNNImputer(n_neighbors=3, weights='uniform',missing_values=np.nan)),
    ('scaler',StandardScaler())
])

In [26]:
cat_array = cat_pipeline.fit_transform(cat_df)
cat_df = pd.DataFrame(cat_array,columns=cat_df.columns)
cat_df

Unnamed: 0,Item_Identifier,Item_Fat_Content,Outlet_Size,Outlet_Location_Type,Outlet_Type
0,-0.182954,1.226883,-0.429160,-1.383482,-0.259489
1,1.727281,0.109479,0.897829,-0.149659,-0.259489
2,-0.182954,1.226883,0.897829,-1.383482,-0.259489
3,-0.182954,-1.007925,0.897829,-0.149659,-0.259489
4,-0.182954,-1.007925,0.897829,-1.383482,-0.259489
...,...,...,...,...,...
6813,-0.182954,1.226883,-1.756149,1.084165,-1.509802
6814,-0.182954,-1.007925,0.897829,-0.149659,-0.259489
6815,1.727281,0.109479,0.897829,-0.149659,-0.259489
6816,-0.182954,-1.007925,0.897829,-0.149659,-0.259489


#### Creating Pipeline

In [27]:
from sklearn.compose import ColumnTransformer
preprocessing = ColumnTransformer([
                ('num_pipeline', num_pipeline, numerical_features),
                ('cat_pipeline', cat_pipeline, categorical_features),
            ])

In [28]:
num_df = input_feature_train_df[numerical_features]
cat_df = input_feature_train_df[categorical_features]
cat_df = cat_df.apply(LabelEncoder().fit_transform)
cat_df['Outlet_Size'].replace(3,np.nan,inplace=True)

In [29]:
input_feature_train_df = pd.concat([num_df,cat_df],axis=1)
input_feature_train_df

Unnamed: 0,Item_Weight,Item_Visibility,Item_MRP,Outlet_age,Item_Identifier,Item_Fat_Content,Outlet_Size,Outlet_Location_Type,Outlet_Type
549,9.500,0.035206,171.3448,14,1,2,1.0,0,1
7757,18.000,0.047473,170.5422,11,2,1,,1,1
764,17.600,0.076122,111.7202,16,1,2,2.0,0,1
6867,8.325,0.029845,41.6138,11,1,0,,1,1
2716,12.850,0.137228,155.5630,16,1,0,2.0,0,1
...,...,...,...,...,...,...,...,...,...
5734,9.395,0.286345,139.1838,15,1,2,,2,0
5191,15.600,0.117575,75.6670,6,1,0,,1,1
5390,17.600,0.018944,237.3590,11,2,1,,1,1
860,20.350,0.054363,117.9466,6,1,0,,1,1


In [30]:
array = preprocessing.fit_transform(input_feature_train_df)
df = pd.DataFrame(array,columns=input_feature_train_df.columns)
df

Unnamed: 0,Item_Weight,Item_Visibility,Item_MRP,Outlet_age,Item_Identifier,Item_Fat_Content,Outlet_Size,Outlet_Location_Type,Outlet_Type
0,-0.775861,-0.600703,0.470709,-0.136169,-0.182954,1.226883,-0.429160,-1.383482,-0.259489
1,1.165811,-0.362159,0.457877,-0.493521,1.727281,0.109479,0.897829,-0.149659,-0.259489
2,1.074438,0.194933,-0.482625,0.102066,-0.182954,1.226883,0.897829,-1.383482,-0.259489
3,-1.044269,-0.704944,-1.603553,-0.493521,-0.182954,-1.007925,0.897829,-0.149659,-0.259489
4,-0.010614,1.383177,0.218375,0.102066,-0.182954,-1.007925,0.897829,-1.383482,-0.259489
...,...,...,...,...,...,...,...,...,...
6813,-0.799846,4.282848,-0.043511,-0.017052,-0.182954,1.226883,-1.756149,1.084165,-1.509802
6814,0.617574,1.001006,-1.059078,-1.089109,-0.182954,-1.007925,0.897829,-0.149659,-0.259489
6815,1.074438,-0.916931,1.526207,-0.493521,1.727281,0.109479,0.897829,-0.149659,-0.259489
6816,1.702626,-0.228187,-0.383072,-1.089109,-0.182954,-1.007925,0.897829,-0.149659,-0.259489


In [43]:

# All transformation combined
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

train,test = train_test_split(train_df,test_size=0.2,random_state=42)

target_column_name = 'Item_Outlet_Sales'
input_feature_train_df = train_df.drop(columns=[target_column_name],axis=1)
target_feature_train_df = train_df[target_column_name]

#input_feature_test_df = test_df.drop(columns=[target_column_name],axis=1)
#target_feature_test_df = test_df[target_column_name]
input_feature_test_df = test_df

# function to get numerical and categorical columns 
def Numerical_categorical_column(dataframe):
    '''
    This function returns the numerical and categorical column 
    return numerical_columns,categorical_columns
    '''
    numerical_columns = [i for i in dataframe.columns if  dataframe[i].dtype != 'O']
    categorical_columns = [i for i in dataframe.columns if  dataframe[i].dtype == 'O']
    return numerical_columns,categorical_columns



#Replacing same values with different names 
input_feature_train_df['Item_Fat_Content'] = input_feature_train_df['Item_Fat_Content'].replace(['LF','low fat','reg'],['Low Fat','Low Fat','Regular'])
input_feature_test_df['Item_Fat_Content'] = input_feature_test_df['Item_Fat_Content'].replace(['LF','low fat','reg'],['Low Fat','Low Fat','Regular'])

#Removing unwanted data from name item_identifier
input_feature_train_df['Item_Identifier'] = input_feature_train_df['Item_Identifier'].apply(lambda x:x[:2])
input_feature_test_df['Item_Identifier'] = input_feature_test_df['Item_Identifier'].apply(lambda x:x[:2])

# Feature Engineering of Outlet Establishment year
input_feature_train_df['Outlet_age'] = 2013 - input_feature_train_df['Outlet_Establishment_Year']
input_feature_train_df.drop(columns=['Outlet_Establishment_Year'],inplace=True)

input_feature_test_df['Outlet_age'] = 2013 - test_df['Outlet_Establishment_Year']
input_feature_test_df.drop(columns=['Outlet_Establishment_Year'],inplace=True)

#Those product who are non-consumbale but have fat content will replace them with non-ediable fat content

input_feature_train_df.loc[input_feature_train_df['Item_Identifier']=='NC','Item_Fat_Content'] = 'Non Edible'
input_feature_test_df.loc[input_feature_test_df['Item_Identifier']=='NC','Item_Fat_Content'] = 'Non Edible'


# dropping unwanted columns will work on it after some time 
input_feature_train_df.drop(columns=['Item_Type','Outlet_Identifier'],inplace = True)
input_feature_test_df.drop(columns=['Item_Type','Outlet_Identifier'],inplace = True)

numerical_features,categorical_features = Numerical_categorical_column(input_feature_train_df)


# Piepline for Numerical Columns 
from sklearn.pipeline import Pipeline
num_pipeline = Pipeline(steps=[
    ('imputer',KNNImputer(n_neighbors=3, weights='uniform',missing_values=np.nan)),
    ('scaler',StandardScaler())
])


cat_pipeline = Pipeline(steps=[
    ('imputer',KNNImputer(n_neighbors=3, weights='uniform',missing_values=np.nan)),
    ('scaler',StandardScaler())
])



from sklearn.compose import ColumnTransformer
preprocessing = ColumnTransformer([
                ('num_pipeline', num_pipeline, numerical_features),
                ('cat_pipeline', cat_pipeline, categorical_features),
            ])

train_num_df = input_feature_train_df[numerical_features]
train_cat_df = input_feature_train_df[categorical_features]
train_cat_df = train_cat_df.apply(LabelEncoder().fit_transform)
train_cat_df['Outlet_Size'].replace(3,np.nan,inplace=True)

test_num_df = input_feature_test_df[numerical_features]
test_cat_df = input_feature_test_df[categorical_features]
test_cat_df = test_cat_df .apply(LabelEncoder().fit_transform)
test_cat_df['Outlet_Size'].replace(3,np.nan,inplace=True)

input_feature_train_df = pd.concat([train_num_df,train_cat_df],axis=1)
input_feature_test_df = pd.concat([test_num_df,test_cat_df],axis=1)


train_array = preprocessing.fit_transform(input_feature_train_df)
input_feature_train_df = pd.DataFrame(train_array,columns=input_feature_train_df.columns)
input_feature_train_df



test_array = preprocessing.transform(input_feature_test_df)
input_feature_test_df = pd.DataFrame(test_array,columns=input_feature_test_df.columns)
input_feature_test_df

Unnamed: 0,Item_Weight,Item_Visibility,Item_MRP,Outlet_age,Item_Identifier,Item_Fat_Content,Outlet_Size,Outlet_Location_Type,Outlet_Type
0,1.798914,-1.135138,-0.532035,-0.139541,-0.179795,-0.997813,-0.429154,-1.369334,-0.252658
1,-1.047131,-0.536960,-0.861920,-1.095190,-0.179795,1.236942,0.898016,-0.138882,-0.252658
2,0.393036,0.648183,1.618094,-0.020085,1.735696,0.119565,-1.756324,1.091569,-1.508289
3,-1.272300,-0.983503,0.225484,-1.095190,-0.179795,-0.997813,0.898016,-0.138882,-0.252658
4,0.186917,1.016910,1.497272,1.532846,-0.179795,1.236942,-0.429154,1.091569,2.258603
...,...,...,...,...,...,...,...,...,...
5676,-0.544215,-1.020172,0.005181,0.099372,-0.179795,1.236942,0.898016,-1.369334,-0.252658
5677,-1.207149,1.489663,0.452086,-1.334103,-0.179795,1.236942,-0.429154,1.091569,1.002972
5678,-0.658514,0.143358,-0.357287,-0.497909,1.735696,0.119565,0.898016,-0.138882,-0.252658
5679,0.553055,-1.281758,1.182389,-1.095190,-0.179795,1.236942,0.898016,-0.138882,-0.252658


In [44]:
train_array  = pd.concat([input_feature_train_df,target_feature_train_df],axis=1)
#test_array  = pd.concat([input_feature_test_df,target_feature_test_df],axis=1)

In [45]:
X = train_array.drop(columns=['Item_Outlet_Sales'])
y = train_array['Item_Outlet_Sales']
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [46]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error,r2_score
model=RandomForestRegressor(criterion='absolute_error', max_depth = 7, max_features = 0.79, max_samples = 0.7, n_estimators = 75)
model.fit(X_train, y_train)
y_pred=model.predict(X_test)
r2=r2_score(y_test, y_pred)
r2

print(f'r2 score : {r2}')


mse = mean_squared_error(y_test, y_pred)
print("RMSE: %.2f" % (mse**(1/2.0)))

r2 score : 0.6133800628452333
RMSE: 1025.10


In [47]:
from sklearn.model_selection import GridSearchCV
pramrf = {'n_estimators':range(10,100,5),
        'criterion':["squared_error", "absolute_error"],
       'max_depth':range(3,10,1),
        'max_features':[i/100.0 for i in range(70,100,3)],
        'max_samples':[i/100.0 for i in range(70,100,5)]
       }

gridrf  = GridSearchCV(RandomForestRegressor(), pramrf, cv=5, n_jobs=-1)
gridrf.fit(X_train,y_train)
gridrf.best_params_