In [78]:
# Necessary Imports:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## modules for machine learning
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, accuracy_score


### Question 1: Read the CSV file in Pandas and create a DataFrame named Grc_df. What is the number of rows and columns in Grc_df? Print the first 10 and last 10 rows of Grc_df.


In [79]:
# Load the grocery dataset from the CSV file located next to this notebook.
Grc_df = pd.read_csv(filepath_or_buffer='./Grocery_dataset.csv')

# info() reports the frame's dimensions (5000 entries and 12 columns in the
# recorded run) along with each column's dtype and non-null count.
Grc_df.info()



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Item_Identifier            5000 non-null   object 
 1   Item_Weight                4182 non-null   float64
 2   Item_Fat_Content           5000 non-null   object 
 3   Item_Visibility            5000 non-null   float64
 4   Item_Type                  5000 non-null   object 
 5   Item_MRP                   5000 non-null   float64
 6   Outlet_Identifier          5000 non-null   object 
 7   Outlet_Establishment_Year  5000 non-null   int64  
 8   Outlet_Size                3561 non-null   object 
 9   Outlet_Location_Type       5000 non-null   object 
 10  Outlet_Type                5000 non-null   object 
 11  Item_Outlet_Sales          5000 non-null   float64
dtypes: float64(4), int64(1), object(7)
memory usage: 468.9+ KB


In [80]:
# Show the first 10 rows followed by the last 10 rows of Grc_df.
# The frame is wider than the text display, so the printed columns wrap onto
# several "pages"; a scrollable output view may be needed to see everything.
print(Grc_df.iloc[:10], Grc_df.iloc[-10:], sep='\n')

  Item_Identifier  Item_Weight Item_Fat_Content  Item_Visibility  \
0           FDA15        9.300          Low Fat         0.016047   
1           DRC01        5.920          Regular         0.019278   
2           FDN15       17.500          Low Fat         0.016760   
3           FDX07       19.200          Regular         0.000000   
4           NCD19        8.930          Low Fat         0.000000   
5           FDP36       10.395          Regular         0.000000   
6           FDO10       13.650          Regular         0.012741   
7           FDP10          NaN          Low Fat         0.127470   
8           FDH17       16.200          Regular         0.016687   
9           FDU28       19.200          Regular         0.094450   

               Item_Type  Item_MRP Outlet_Identifier  \
0                  Dairy  249.8092            OUT049   
1            Soft Drinks   48.2692            OUT018   
2                   Meat  141.6180            OUT049   
3  Fruits and Vegetables  1

## 2. Are there any null values in the Grc_df? If yes, then in which columns and how many? Finally handle these null values using any strategy shown during the labs.

In [81]:
# Count the missing values in every column (isnull() is the alias of isna()).
print(Grc_df.isnull().sum())

# Recorded result: Item_Weight has 818 nulls and Outlet_Size has 1439.

Item_Identifier                 0
Item_Weight                   818
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                  1439
Outlet_Location_Type            0
Outlet_Type                     0
Item_Outlet_Sales               0
dtype: int64


In [82]:
# Item_Weight (818 nulls) and Outlet_Size (1439 nulls) are missing in too many
# of the 5000 rows to drop them, so the gaps are imputed instead.
#
# The filled Series is assigned back to its column rather than calling
# fillna(..., inplace=True) on Grc_df[col]: that form is a chained in-place
# update of a temporary selection, which pandas warns about and which leaves
# Grc_df unchanged once Copy-on-Write is active.

# Item_Weight is numerical, so fill with the column mean.
Grc_df['Item_Weight'] = Grc_df['Item_Weight'].fillna(Grc_df['Item_Weight'].mean())

# Outlet_Size is categorical, so fill with the most frequent value (mode).
Grc_df['Outlet_Size'] = Grc_df['Outlet_Size'].fillna(Grc_df['Outlet_Size'].mode()[0])

# Confirm that no nulls remain in any column.
print(Grc_df.isna().sum())

Item_Identifier              0
Item_Weight                  0
Item_Fat_Content             0
Item_Visibility              0
Item_Type                    0
Item_MRP                     0
Outlet_Identifier            0
Outlet_Establishment_Year    0
Outlet_Size                  0
Outlet_Location_Type         0
Outlet_Type                  0
Item_Outlet_Sales            0
dtype: int64


In [83]:

# Outlet_Size summary: number of distinct values, frequency of each value, and
# the most / least frequent value. "Maximum" and "minimum" are interpreted in
# terms of frequency, which is an assumption about the assignment question.
outlet_size_counts = Grc_df['Outlet_Size'].value_counts()
unique_sizes = Grc_df['Outlet_Size'].nunique()
maximum_outlet, minimum_outlet = outlet_size_counts.idxmax(), outlet_size_counts.idxmin()

# Report the three results in order.
print(f"Number of Unique Sizes: {unique_sizes}")
print(f"\n Frequency of each outlet size value: \n {outlet_size_counts}")
print(f"\n Maximum = {maximum_outlet} and minimum = {minimum_outlet}")

Number of Unique Sizes: 3

 Frequency of each outlet size value: 
 Outlet_Size
Medium    3044
Small     1398
High       558
Name: count, dtype: int64

 Maximum = Medium and minimum = High


## 4. How many unique Item Fat Content types are in the Grc_df? List them. Do you see any issues with the Fat Content types? If yes, then handle this issue. 


In [84]:

# Tally the Item_Fat_Content labels first, then report the distinct-label
# count followed by the frequency of each label.
fat_counts = Grc_df['Item_Fat_Content'].value_counts()
unique_fat_sizes = Grc_df['Item_Fat_Content'].nunique()

print(f"Number of Unique Fat Contents: {unique_fat_sizes}")
print(f"\n Frequency of each fat content: \n {fat_counts}")


Number of Unique Fat Contents: 5

 Frequency of each fat content: 
 Item_Fat_Content
Low Fat    3008
Regular    1679
LF          183
reg          68
low fat      62
Name: count, dtype: int64


In [85]:

# Issue: Item_Fat_Content has five labels that describe only two categories:
#   'Low Fat', 'LF', 'low fat'  -> Low Fat
#   'Regular', 'reg'            -> Regular
# Step 1 folds the alias spellings into the two canonical labels.
# Step 2 encodes the two categories as 0 / 1 for potential machine learning use.
# Step 3 pins the dtype: replace() on an object column only produces an integer
# column through implicit downcasting, which pandas has deprecated, so the
# explicit astype('int64') keeps the column numeric regardless of that setting.
Grc_df['Item_Fat_Content'] = (
    Grc_df['Item_Fat_Content']
    .replace({'low fat': 'Low Fat', 'LF': 'Low Fat', 'reg': 'Regular'})
    .replace({'Low Fat': 0, 'Regular': 1})
    .astype('int64')
)

# Re-check the number of distinct values (expected: 2 after the clean-up).
unique_fat_sizes = Grc_df['Item_Fat_Content'].nunique()
print(f"Number of Unique Fat Contents: {unique_fat_sizes}")

# Re-check the frequency of each encoded category.
fat_counts = Grc_df['Item_Fat_Content'].value_counts()
print(f"\n Frequency of each fat content: \n {fat_counts}")


Number of Unique Fat Contents: 2

 Frequency of each fat content: 
 Item_Fat_Content
0    3253
1    1747
Name: count, dtype: int64


## 5. Drop the columns having index values of 0,6 and create a new Dataframe Grc_new_df.

In [86]:
# Drop the columns at positions 0 and 6 of Grc_df (Item_Identifier and
# Outlet_Identifier); the labels are looked up by position and passed to drop().
Grc_new_df = Grc_df.drop(columns=Grc_df.columns[[0, 6]])

# info() prints its report itself and returns None, so it is called directly;
# wrapping it in print() would add a stray "None" line after the report.
# The report should list 10 columns, without the two identifier columns.
Grc_new_df.info()



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 10 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Item_Weight                5000 non-null   float64
 1   Item_Fat_Content           5000 non-null   int64  
 2   Item_Visibility            5000 non-null   float64
 3   Item_Type                  5000 non-null   object 
 4   Item_MRP                   5000 non-null   float64
 5   Outlet_Establishment_Year  5000 non-null   int64  
 6   Outlet_Size                5000 non-null   object 
 7   Outlet_Location_Type       5000 non-null   object 
 8   Outlet_Type                5000 non-null   object 
 9   Item_Outlet_Sales          5000 non-null   float64
dtypes: float64(4), int64(2), object(4)
memory usage: 390.8+ KB
None


## 6. Using different supermarket type listed in the column Outlet_Type create two different Dataframes from Grc_new_df. Name these Dataframes as SupType_1 and SupType_2. 

In [89]:
# Build one DataFrame per supermarket type from Grc_new_df.
# .copy() makes each subset an independent frame, so later edits to it are not
# flagged by pandas as writes to a slice of Grc_new_df.
SupType_1 = Grc_new_df[Grc_new_df['Outlet_Type'] == 'Supermarket Type1'].copy()
SupType_2 = Grc_new_df[Grc_new_df['Outlet_Type'] == 'Supermarket Type2'].copy()

# info() prints its report itself and returns None. Embedding it in an f-string
# therefore prints the report before the label and then shows "None", so each
# label is printed first and info() is called on its own line.
print("SupType_1:")
SupType_1.info()
print("SupType_2:")
SupType_2.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3328 entries, 0 to 4998
Data columns (total 10 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Item_Weight                3328 non-null   float64
 1   Item_Fat_Content           3328 non-null   int64  
 2   Item_Visibility            3328 non-null   float64
 3   Item_Type                  3328 non-null   object 
 4   Item_MRP                   3328 non-null   float64
 5   Outlet_Establishment_Year  3328 non-null   int64  
 6   Outlet_Size                3328 non-null   object 
 7   Outlet_Location_Type       3328 non-null   object 
 8   Outlet_Type                3328 non-null   object 
 9   Item_Outlet_Sales          3328 non-null   float64
dtypes: float64(4), int64(2), object(4)
memory usage: 286.0+ KB
SupType_1: 
 None
<class 'pandas.core.frame.DataFrame'>
Index: 526 entries, 1 to 4999
Data columns (total 10 columns):
 #   Column                     Non-Null Co