# Food Sales Predictions Part 1 
- Oscar Ochoa
- 02/03/2022

In [82]:
# Mount Google Drive into the Colab runtime at /content/drive so that files
# stored in Drive can be opened through ordinary file paths.
from google.colab import drive
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [83]:
# importing pandas
import pandas as pd 

In [84]:
# Read the sales CSV from the mounted Drive folder into a DataFrame and
# preview its last five rows.
file_name = "/content/drive/MyDrive/Coding Dojo/Project1/sales_predictions (1).csv"
food_df = pd.read_csv(filepath_or_buffer=file_name)
food_df.tail(n=5)

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
8518,FDF22,6.865,Low Fat,0.056783,Snack Foods,214.5218,OUT013,1987,High,Tier 3,Supermarket Type1,2778.3834
8519,FDS36,8.38,Regular,0.046982,Baking Goods,108.157,OUT045,2002,,Tier 2,Supermarket Type1,549.285
8520,NCJ29,10.6,Low Fat,0.035186,Health and Hygiene,85.1224,OUT035,2004,Small,Tier 2,Supermarket Type1,1193.1136
8521,FDN46,7.21,Regular,0.145221,Snack Foods,103.1332,OUT018,2009,Medium,Tier 3,Supermarket Type2,1845.5976
8522,DRG01,14.8,Low Fat,0.044878,Soft Drinks,75.467,OUT046,1997,Small,Tier 1,Supermarket Type1,765.67


In [85]:
# Show the frame's dimensions as a (number of rows, number of columns) tuple.
food_df.shape

(8523, 12)

In [86]:
# Store the per-column dtype listing in `food_dtypes` and display it.
# Note: this is a snapshot of the dtypes at the time this cell runs.
food_dtypes = food_df.dtypes
food_dtypes

Item_Identifier               object
Item_Weight                  float64
Item_Fat_Content              object
Item_Visibility              float64
Item_Type                     object
Item_MRP                     float64
Outlet_Identifier             object
Outlet_Establishment_Year      int64
Outlet_Size                   object
Outlet_Location_Type          object
Outlet_Type                   object
Item_Outlet_Sales            float64
dtype: object

In [87]:
# Checking for duplicated rows: build a boolean mask of rows that repeat an
# earlier row and display the matching rows (an empty frame means no duplicates).
is_duplicate = food_df.duplicated()
food_df.loc[is_duplicate]

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales


In [88]:
# Count the missing (NaN) entries in every column to see where data is absent.
food_df.isna().sum()

Item_Identifier                 0
Item_Weight                  1463
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                  2410
Outlet_Location_Type            0
Outlet_Type                     0
Item_Outlet_Sales               0
dtype: int64

 - Ultimately decided to fill the missing `Item_Weight` values with the column's median, since 0s would make no sense for an item weight and dropping the rows with NaN values would have removed too many rows.

In [89]:
# Impute the missing Item_Weight entries with the column's median, then
# re-count the missing values per column to verify the fill.
weight_median = food_df["Item_Weight"].median()
food_df["Item_Weight"] = food_df["Item_Weight"].fillna(weight_median)
food_df.isna().sum()

Item_Identifier                 0
Item_Weight                     0
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                  2410
Outlet_Location_Type            0
Outlet_Type                     0
Item_Outlet_Sales               0
dtype: int64

- I decided to replace all NaN values in "Outlet_Size" with "Unknown" for the same reason: we don't want to lose a lot of our data because of the unknown outlet sizes.

In [90]:
# Label every outlet whose size is missing as "Unknown" so those rows are kept.
food_df["Outlet_Size"] = food_df["Outlet_Size"].fillna(value="Unknown")

In [91]:
# Re-count missing values per column; every count should now be zero.
food_df.isna().sum()

Item_Identifier              0
Item_Weight                  0
Item_Fat_Content             0
Item_Visibility              0
Item_Type                    0
Item_MRP                     0
Outlet_Identifier            0
Outlet_Establishment_Year    0
Outlet_Size                  0
Outlet_Location_Type         0
Outlet_Type                  0
Item_Outlet_Sales            0
dtype: int64

In [92]:
# Collect the names of the object-dtype (text) columns.
# The selection is made on the current frame with select_dtypes instead of the
# `food_dtypes` snapshot captured before the cleaning cells, so the list cannot
# go stale if a cell in between changes a column's dtype.
str_cols = food_df.select_dtypes(include="object").columns
str_cols

Index(['Item_Identifier', 'Item_Fat_Content', 'Item_Type', 'Outlet_Identifier',
       'Outlet_Size', 'Outlet_Location_Type', 'Outlet_Type'],
      dtype='object')

In [93]:
# Print how often each distinct value occurs in every text column so that
# inconsistent spellings of the same category stand out.
for col in str_cols:
  counts = food_df[col].value_counts()
  # One print call: header, counts, then the blank-line separator.
  print(f"Column: {col}", counts, "\n\n", sep="\n")

Column: Item_Identifier
FDW13    10
FDG33    10
NCY18     9
FDD38     9
DRE49     9
         ..
FDY43     1
FDQ60     1
FDO33     1
DRF48     1
FDC23     1
Name: Item_Identifier, Length: 1559, dtype: int64



Column: Item_Fat_Content
Low Fat    5089
Regular    2889
LF          316
reg         117
low fat     112
Name: Item_Fat_Content, dtype: int64



Column: Item_Type
Fruits and Vegetables    1232
Snack Foods              1200
Household                 910
Frozen Foods              856
Dairy                     682
Canned                    649
Baking Goods              648
Health and Hygiene        520
Soft Drinks               445
Meat                      425
Breads                    251
Hard Drinks               214
Others                    169
Starchy Foods             148
Breakfast                 110
Seafood                    64
Name: Item_Type, dtype: int64



Column: Outlet_Identifier
OUT027    935
OUT013    932
OUT049    930
OUT046    930
OUT035    930
OUT045    929
OUT01

In [98]:
# Collapse the alternate spellings of the fat-content labels into the two
# canonical categories, then list the distinct values that remain.
fat_count_map = {
    "LF": "Low Fat",
    "low fat": "Low Fat",
    "reg": "Regular",
}

food_df["Item_Fat_Content"] = food_df["Item_Fat_Content"].replace(to_replace=fat_count_map)
food_df["Item_Fat_Content"].unique()

array(['Low Fat', 'Regular'], dtype=object)

In [123]:
# Print the min, max and mean of every float64 column.
# The column list is taken from the current frame with select_dtypes rather
# than from the `food_dtypes` snapshot captured near the top of the notebook,
# so it reflects the dtypes the frame has when this cell actually runs.
float_cols = food_df.select_dtypes(include="float64").columns

for col in float_cols:
  print(f"Column: {col}:")
  print(f"Min:\t {food_df[col].min()}")
  print(f"Max:\t {food_df[col].max()}")
  print(f"Mean:\t {food_df[col].mean()}")
  print("\n")



Column: Item_Weight:
Min:	 4.555
Max:	 21.35
Mean:	 12.813419570574444


Column: Item_Visibility:
Min:	 0.0
Max:	 0.328390948
Mean:	 0.06613202877895127


Column: Item_MRP:
Min:	 31.29
Max:	 266.8884
Mean:	 140.9927819781768


Column: Item_Outlet_Sales:
Min:	 33.29
Max:	 13086.9648
Mean:	 2181.2889135750365


