# Food Sales Prediction
Zach Hanson

## Importing Libraries

In [30]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import (OneHotEncoder, StandardScaler, 
                                  OrdinalEncoder)
from sklearn.impute import SimpleImputer
from sklearn.compose import make_column_selector, make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn import set_config
set_config(display='diagram')

## Importing Data

In [31]:
#Reading the raw CSV once; this frame is kept untouched for reference
original_df = pd.read_csv(filepath_or_buffer='sales_predictions.csv')

#All cleaning below works on an independent copy of the raw frame
sales_df = original_df.copy(deep=True)

#Previewing the first five rows
sales_df.head(n=5)

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


In [32]:
#Summarizing column dtypes and non-null counts to spot columns with missing values
sales_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8523 entries, 0 to 8522
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Item_Identifier            8523 non-null   object 
 1   Item_Weight                7060 non-null   float64
 2   Item_Fat_Content           8523 non-null   object 
 3   Item_Visibility            8523 non-null   float64
 4   Item_Type                  8523 non-null   object 
 5   Item_MRP                   8523 non-null   float64
 6   Outlet_Identifier          8523 non-null   object 
 7   Outlet_Establishment_Year  8523 non-null   int64  
 8   Outlet_Size                6113 non-null   object 
 9   Outlet_Location_Type       8523 non-null   object 
 10  Outlet_Type                8523 non-null   object 
 11  Item_Outlet_Sales          8523 non-null   float64
dtypes: float64(4), int64(1), object(7)
memory usage: 799.2+ KB


## Simple Data Cleaning

### Duplicates

In [33]:
#Checking for duplicates: duplicated() marks rows that repeat an earlier row,
#so the sum is the number of duplicate rows
sales_df.duplicated().sum()

0

No duplicate rows were found, so nothing needs to be dropped.

### Inconsistencies

In [34]:
#Checking Fat Content: counting each distinct label to reveal spelling and
#capitalization variants of the same category
sales_df['Item_Fat_Content'].value_counts()

Low Fat    5089
Regular    2889
LF          316
reg         117
low fat     112
Name: Item_Fat_Content, dtype: int64

In [36]:
#Removing these inconsistencies
#The mapping is applied to the 'Item_Fat_Content' column only, so an identical
#string appearing in any other column is left untouched. Re-running this cell
#is harmless because the corrected labels are not keys of the mapping.
fat_content_fixes = {'LF': 'Low Fat', 'low fat': 'Low Fat', 'reg': 'Regular'}
sales_df = sales_df.assign(
    Item_Fat_Content=sales_df['Item_Fat_Content'].replace(fat_content_fixes))

#Checking again to make sure inconsistencies are removed
sales_df['Item_Fat_Content'].value_counts()

Low Fat    5517
Regular    3006
Name: Item_Fat_Content, dtype: int64

Only the two labels `Low Fat` and `Regular` remain, so the inconsistency has been removed.

In [37]:
#Rows with no value in the target column cannot be used, so they are removed
sales_df = sales_df.dropna(axis='index', how='any',
                           subset=['Item_Outlet_Sales'])

## Identifying and Splitting Data

### Identifying Data

In [38]:
#Target column is "Item_Outlet_Sales"
y = sales_df['Item_Outlet_Sales']

#Features are every remaining column: only the target is dropped here.
#NOTE(review): Item_Identifier and Outlet_Identifier therefore stay in X and,
#being object columns, will be picked up by the categorical selector --
#confirm that is intended.
X = sales_df.drop(columns='Item_Outlet_Sales')
X.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1


### Splitting Data


In [39]:
#Holding out a test set; the fixed random_state makes the split repeatable.
#No test_size is given, so scikit-learn's default proportion applies.
(X_train, X_test,
 y_train, y_test) = train_test_split(X, y, shuffle=True, random_state=42)

## Column Selectors

In [11]:
#Selector for the categorical (object dtype) columns
cat_selector = make_column_selector(
    dtype_include='object',
)

#Selector for the numerical columns
num_selector = make_column_selector(
    dtype_include='number',
)

## Imputers

In [12]:
#Nominal data: missing entries are filled with the literal label "Missing"
missing_imputer = SimpleImputer(
    strategy='constant',
    fill_value='Missing',
)

#Numerical data: missing entries are filled with the column mean
mean_imputer = SimpleImputer(
    strategy='mean',
)

## Scaler and OneHotEncoding

### Scaler

In [13]:
#Instantiate Scaler (unfitted here; it is fitted later as a step of the
#numerical pipeline)
scaler = StandardScaler()

### One Hot Encoder

In [14]:
#Instantiate One Hot Encoder
#The dense-output flag is spelled 'sparse_output' in newer scikit-learn
#releases and 'sparse' in older ones. An unrecognized keyword raises
#TypeError at construction, so try the newer spelling first and fall back to
#the older one; either way the encoder returns a dense array and ignores
#categories unseen during fit.
try:
    ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
except TypeError:
    ohe = OneHotEncoder(sparse=False, handle_unknown='ignore')

## Pipelines

### Numerical Pipeline

In [15]:
#Numerical pipeline: mean imputation first, then standard scaling
num_pipeline = make_pipeline(mean_imputer, scaler)
#Bare expression so the notebook displays the pipeline
num_pipeline

### Categorical Pipeline

In [16]:
#Nominal pipeline: fill missing values with "Missing", then one-hot encode
nom_pipeline = make_pipeline(missing_imputer, ohe)
#Bare expression so the notebook displays the pipeline
nom_pipeline

## Applying Transformers

### Creating Tuples

In [17]:
#Each tuple pairs a pipeline with the selector that picks its columns:
#numerical first, nominal second
num_tuple, nom_tuple = (
    (num_pipeline, num_selector),
    (nom_pipeline, cat_selector),
)

### Creating Column Transformer

In [18]:
#Combining both (pipeline, selector) pairs into one transformer; columns
#matched by neither selector are dropped
preprocessor = make_column_transformer(
    num_tuple,
    nom_tuple,
    remainder='drop',
)
preprocessor

### Fitting to Data

In [19]:
#Fitting on the training split only, so the test split does not influence
#the fitted preprocessing steps
preprocessor.fit(X_train)

## Transform Data

In [20]:
#Applying the already-fitted preprocessor to the train set and then the
#test set, in that order
X_train_processed, X_test_processed = (
    preprocessor.transform(split) for split in (X_train, X_test)
)