Importing Dependencies


In [44]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn import metrics

import mlflow
import mlflow.xgboost



In [45]:
mlflow.set_tracking_uri("http://localhost:5000")
mlflow.set_experiment("Sales Prediction")
mlflow.xgboost.autolog()

Data Collection and Analysis

In [46]:
#loading data
big_mart_data=pd.read_csv('Train.csv')
testing_big_mart_data=pd.read_csv('Test.csv')


In [47]:
big_mart_data.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


In [48]:
#number of data pounts and features
big_mart_data.shape

(8523, 12)

In [49]:
big_mart_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8523 entries, 0 to 8522
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Item_Identifier            8523 non-null   object 
 1   Item_Weight                7060 non-null   float64
 2   Item_Fat_Content           8523 non-null   object 
 3   Item_Visibility            8523 non-null   float64
 4   Item_Type                  8523 non-null   object 
 5   Item_MRP                   8523 non-null   float64
 6   Outlet_Identifier          8523 non-null   object 
 7   Outlet_Establishment_Year  8523 non-null   int64  
 8   Outlet_Size                6113 non-null   object 
 9   Outlet_Location_Type       8523 non-null   object 
 10  Outlet_Type                8523 non-null   object 
 11  Item_Outlet_Sales          8523 non-null   float64
dtypes: float64(4), int64(1), object(7)
memory usage: 799.2+ KB


Categorical features:-

-Item_Identifier
-Item_Fat_Content 
-Item_Type 
-Outlet_Identifier
-Outlet_Size  
-Outlet_Location_Type
-Outlet_Type    

In [50]:
#checking missing values
big_mart_data.isnull().sum()

Item_Identifier                 0
Item_Weight                  1463
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                  2410
Outlet_Location_Type            0
Outlet_Type                     0
Item_Outlet_Sales               0
dtype: int64

Handling missing values

Mean-->average value [For weight as numerical value],
Mode--> Most repeated value [For outlet size as object value]

In [51]:
#mean value for weight
big_mart_data['Item_Weight'].mean()

12.857645184135976

In [52]:
#filling the missing values in item weight colun with mean value
big_mart_data['Item_Weight'].fillna(big_mart_data['Item_Weight'].mean(),inplace=True)
testing_big_mart_data['Item_Weight'].fillna(testing_big_mart_data['Item_Weight'].mean(),inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  big_mart_data['Item_Weight'].fillna(big_mart_data['Item_Weight'].mean(),inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  testing_big_mart_data['Item_Weight'].fillna(testing_big_mart_data['Item_Weight'].mean(),inplace=True)


In [53]:
big_mart_data.isnull().sum()

Item_Identifier                 0
Item_Weight                     0
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                  2410
Outlet_Location_Type            0
Outlet_Type                     0
Item_Outlet_Sales               0
dtype: int64

Replacing missing values of outlet size with mode

In [54]:
mode_of_outlet_size = big_mart_data.groupby('Outlet_Type')['Outlet_Size'].agg(lambda x: x.mode()[0])
print(mode_of_outlet_size)

test_mode_of_outlet_size = testing_big_mart_data.groupby('Outlet_Type')['Outlet_Size'].agg(lambda x: x.mode()[0])



Outlet_Type
Grocery Store         Small
Supermarket Type1     Small
Supermarket Type2    Medium
Supermarket Type3    Medium
Name: Outlet_Size, dtype: object


In [55]:
missing_values=big_mart_data['Outlet_Size'].isnull()
print(missing_values)

test_missing_values=testing_big_mart_data['Outlet_Size'].isnull()
print(test_missing_values)

0       False
1       False
2       False
3        True
4       False
        ...  
8518    False
8519     True
8520    False
8521    False
8522    False
Name: Outlet_Size, Length: 8523, dtype: bool
0       False
1        True
2        True
3        True
4       False
        ...  
5676    False
5677    False
5678     True
5679     True
5680     True
Name: Outlet_Size, Length: 5681, dtype: bool


In [56]:
big_mart_data.loc[missing_values,'Outlet_Size']=big_mart_data.loc[missing_values,'Outlet_Type'].map(mode_of_outlet_size)
testing_big_mart_data.loc[test_missing_values,'Outlet_Size']=testing_big_mart_data.loc[test_missing_values,'Outlet_Type'].map(test_mode_of_outlet_size)

In [57]:
big_mart_data.isnull().sum()

Item_Identifier              0
Item_Weight                  0
Item_Fat_Content             0
Item_Visibility              0
Item_Type                    0
Item_MRP                     0
Outlet_Identifier            0
Outlet_Establishment_Year    0
Outlet_Size                  0
Outlet_Location_Type         0
Outlet_Type                  0
Item_Outlet_Sales            0
dtype: int64

In [58]:
testing_big_mart_data.isnull().sum()

Item_Identifier              0
Item_Weight                  0
Item_Fat_Content             0
Item_Visibility              0
Item_Type                    0
Item_MRP                     0
Outlet_Identifier            0
Outlet_Establishment_Year    0
Outlet_Size                  0
Outlet_Location_Type         0
Outlet_Type                  0
dtype: int64

Data Analysis

In [59]:
big_mart_data.describe()

Unnamed: 0,Item_Weight,Item_Visibility,Item_MRP,Outlet_Establishment_Year,Item_Outlet_Sales
count,8523.0,8523.0,8523.0,8523.0,8523.0
mean,12.857645,0.066132,140.992782,1997.831867,2181.288914
std,4.226124,0.051598,62.275067,8.37176,1706.499616
min,4.555,0.0,31.29,1985.0,33.29
25%,9.31,0.026989,93.8265,1987.0,834.2474
50%,12.857645,0.053931,143.0128,1999.0,1794.331
75%,16.0,0.094585,185.6437,2004.0,3101.2964
max,21.35,0.328391,266.8884,2009.0,13086.9648


Data Preprocessing

In [60]:
big_mart_data.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,Small,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


In [61]:
big_mart_data['Item_Fat_Content'].value_counts()

Item_Fat_Content
Low Fat    5089
Regular    2889
LF          316
reg         117
low fat     112
Name: count, dtype: int64

In [62]:
big_mart_data.replace({'Item_Fat_Content': {'low fat':'Low Fat','LF':'Low Fat','reg':'Regular'}},inplace=True)
big_mart_data['Item_Fat_Content'].value_counts()

Item_Fat_Content
Low Fat    5517
Regular    3006
Name: count, dtype: int64

In [63]:
testing_big_mart_data['Item_Fat_Content'].value_counts()

Item_Fat_Content
Low Fat    3396
Regular    1935
LF          206
reg          78
low fat      66
Name: count, dtype: int64

In [64]:
testing_big_mart_data.replace({'Item_Fat_Content': {'low fat':'Low Fat','LF':'Low Fat','reg':'Regular'}},inplace=True)
testing_big_mart_data['Item_Fat_Content'].value_counts()

Item_Fat_Content
Low Fat    3668
Regular    2013
Name: count, dtype: int64

Label Encoding

In [65]:
encoder=LabelEncoder()

In [66]:
big_mart_data['Item_Identifier']=encoder.fit_transform(big_mart_data['Item_Identifier'])
big_mart_data['Item_Fat_Content']=encoder.fit_transform(big_mart_data['Item_Fat_Content'])
big_mart_data['Item_Type']=encoder.fit_transform(big_mart_data['Item_Type'])
big_mart_data['Outlet_Identifier']=encoder.fit_transform(big_mart_data['Outlet_Identifier'])
big_mart_data['Outlet_Size']=encoder.fit_transform(big_mart_data['Outlet_Size'])
big_mart_data['Outlet_Location_Type']=encoder.fit_transform(big_mart_data['Outlet_Location_Type'])
big_mart_data['Outlet_Type']=encoder.fit_transform(big_mart_data['Outlet_Type'])

In [67]:
big_mart_data.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,156,9.3,0,0.016047,4,249.8092,9,1999,1,0,1,3735.138
1,8,5.92,1,0.019278,14,48.2692,3,2009,1,2,2,443.4228
2,662,17.5,0,0.01676,10,141.618,9,1999,1,0,1,2097.27
3,1121,19.2,1,0.0,6,182.095,0,1998,2,2,0,732.38
4,1297,8.93,0,0.0,9,53.8614,1,1987,0,2,1,994.7052


In [68]:
testing_big_mart_data['Item_Identifier']=encoder.fit_transform(testing_big_mart_data['Item_Identifier'])
testing_big_mart_data['Item_Fat_Content']=encoder.fit_transform(testing_big_mart_data['Item_Fat_Content'])
testing_big_mart_data['Item_Type']=encoder.fit_transform(testing_big_mart_data['Item_Type'])
testing_big_mart_data['Outlet_Identifier']=encoder.fit_transform(testing_big_mart_data['Outlet_Identifier'])
testing_big_mart_data['Outlet_Size']=encoder.fit_transform(testing_big_mart_data['Outlet_Size'])
testing_big_mart_data['Outlet_Location_Type']=encoder.fit_transform(testing_big_mart_data['Outlet_Location_Type'])
testing_big_mart_data['Outlet_Type']=encoder.fit_transform(testing_big_mart_data['Outlet_Type'])

In [69]:
testing_big_mart_data.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type
0,1103,20.75,0,0.007565,13,107.8622,9,1999,1,0,1
1,1067,8.3,1,0.038428,4,87.3198,2,2007,2,1,1
2,1406,14.6,0,0.099575,11,241.7538,0,1998,2,2,0
3,809,7.315,0,0.015388,13,155.034,2,2007,2,1,1
4,1184,12.695633,1,0.118599,4,234.23,5,1985,1,2,3


Splitting features and targets

In [70]:
X=big_mart_data.drop(columns='Item_Outlet_Sales',axis=1)
Y=big_mart_data['Item_Outlet_Sales']

In [71]:
print(X)

      Item_Identifier  Item_Weight  Item_Fat_Content  Item_Visibility  \
0                 156        9.300                 0         0.016047   
1                   8        5.920                 1         0.019278   
2                 662       17.500                 0         0.016760   
3                1121       19.200                 1         0.000000   
4                1297        8.930                 0         0.000000   
...               ...          ...               ...              ...   
8518              370        6.865                 0         0.056783   
8519              897        8.380                 1         0.046982   
8520             1357       10.600                 0         0.035186   
8521              681        7.210                 1         0.145221   
8522               50       14.800                 0         0.044878   

      Item_Type  Item_MRP  Outlet_Identifier  Outlet_Establishment_Year  \
0             4  249.8092                  9    

In [72]:
print(Y)

0       3735.1380
1        443.4228
2       2097.2700
3        732.3800
4        994.7052
          ...    
8518    2778.3834
8519     549.2850
8520    1193.1136
8521    1845.5976
8522     765.6700
Name: Item_Outlet_Sales, Length: 8523, dtype: float64


Splitting Train and test data

In [73]:
X_train,X_test,Y_train,Y_test=train_test_split(X,Y,random_state=45,test_size=0.2)

XGboost regressor

In [74]:
regressor=XGBRegressor()
sns.set()

In [75]:
with mlflow.start_run() as run:



    # Distplot for Item_Weight
    plt.figure(figsize=(6,6))
    sns.distplot(big_mart_data["Item_Weight"])
    plt.title('Distribution of Item Weight')
    distplot_item_weight_path = "distplot_item_weight.png"
    plt.savefig(distplot_item_weight_path)
    mlflow.log_artifact(distplot_item_weight_path)
    plt.close()     
    
    # Distplot for Item_Visibility
    plt.figure(figsize=(6,6))
    sns.distplot(big_mart_data["Item_Visibility"])
    plt.title('Distribution of Item Visibility')
    distplot_item_visibility_path = "distplot_item_visibility.png"
    plt.savefig(distplot_item_visibility_path)
    mlflow.log_artifact(distplot_item_visibility_path)
    plt.close()

    # Distplot for Item_MRP
    plt.figure(figsize=(6,6))
    sns.distplot(big_mart_data["Item_MRP"])
    plt.title('Distribution of Item MRP')
    distplot_item_mrp_path = "distplot_item_mrp.png"
    plt.savefig(distplot_item_mrp_path)
    mlflow.log_artifact(distplot_item_mrp_path)
    plt.close()

    # Distplot for Item_Outlet_Sales
    plt.figure(figsize=(6,6))
    sns.distplot(big_mart_data["Item_Outlet_Sales"])
    plt.title('Distribution of Item Outlet Sales')
    distplot_item_outlet_sales_path = "distplot_item_outlet_sales.png"
    plt.savefig(distplot_item_outlet_sales_path)
    mlflow.log_artifact(distplot_item_outlet_sales_path)
    plt.close()

    # Countplot for Outlet_Establishment_Year
    plt.figure(figsize=(6,6))
    sns.countplot(x='Outlet_Establishment_Year', data=big_mart_data)
    plt.title('Countplot of Outlet Establishment Year')
    countplot_outlet_est_year_path = "countplot_outlet_est_year.png"
    plt.savefig(countplot_outlet_est_year_path)
    mlflow.log_artifact(countplot_outlet_est_year_path)
    plt.close()

    # Countplot for Item_Fat_Content
    plt.figure(figsize=(6,6))
    sns.countplot(x='Item_Fat_Content', data=big_mart_data)
    plt.title('Countplot of Item Fat Content')
    countplot_item_fat_content_path = "countplot_item_fat_content.png"
    plt.savefig(countplot_item_fat_content_path)
    mlflow.log_artifact(countplot_item_fat_content_path)
    plt.close()

    # Countplot for Item_Type
    plt.figure(figsize=(25,6))
    sns.countplot(x='Item_Type', data=big_mart_data)
    plt.title('Countplot of Item Type')
    countplot_item_type_path = "countplot_item_type.png"
    plt.savefig(countplot_item_type_path)
    mlflow.log_artifact(countplot_item_type_path)
    plt.close()

    # Countplot for Outlet_Size
    plt.figure(figsize=(6,6))
    sns.countplot(x='Outlet_Size', data=big_mart_data)
    plt.title('Countplot of Outlet Size')
    countplot_outlet_size_path = "countplot_outlet_size.png"
    plt.savefig(countplot_outlet_size_path)
    mlflow.log_artifact(countplot_outlet_size_path)
    plt.close()


    # Your model training code here
    regressor.fit(X, Y)
    
    # Evaluation code
    training_data_prediction = regressor.predict(X_train)
    r2_train = metrics.r2_score(Y_train, training_data_prediction)
    
    testing_data_prediction = regressor.predict(X_test)
    r2_test = metrics.r2_score(Y_test, testing_data_prediction)
    
    # The trained model, parameters, and metrics are logged automatically with autologging



`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  sns.distplot(big_mart_data["Item_Weight"])

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  sns.distplot(big_mart_data["Item_Visibility"])

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexi

Predictive System

In [76]:
pred_train=regressor.predict(X)

if len(pred_train) == len(big_mart_data):
    big_mart_data['predicted sales'] = pred_train

pred_test=regressor.predict(testing_big_mart_data)

if len(pred_test) == len(testing_big_mart_data):
    testing_big_mart_data['predicted sales'] = pred_test



In [77]:
big_mart_data.to_excel('train_updated.xlsx', index=False, engine='openpyxl')
testing_big_mart_data.to_excel('test_updated.xlsx', index=False, engine='openpyxl')