<a href="https://colab.research.google.com/github/amltago/sales_prediction/blob/main/Sales.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Project 1 - Part 2 (Core)

Submitted by: Aiza Marie Tago

## Loading Data

Mounting and loading the data from Google Drive

## Loading file from Google Drive and importing libraries

In [None]:
# Mounting drive
# Mounts Google Drive at /content/drive; the CSV path used when loading the
# dataframe lives under that mount point. Requires the google.colab package.

from google.colab import drive
drive.mount('/content/drive')

## Importing libraries and loading dataframe

In [None]:
# Importing libraries
# `np` must alias numpy; binding pandas to it would make any later np.* call
# resolve against the wrong library.

import numpy as np
import pandas as pd

In [None]:
# Loading dataframe
# Read the sales-predictions CSV from the mounted Google Drive folder.

filename = "/content/drive/MyDrive/Coding Dojo/Files/sales_predictions.csv"
df = pd.read_csv(filename)

## Checking dataframe

In [None]:
# Checking headers
# Shows the first rows of the frame (head() defaults to five) as a quick
# look at the column names and sample values.

df.head()

## Cleaning Data

In [None]:
# Identifying data types
# info() prints every column with its dtype and non-null count.

df.info()

In [None]:
# 1) How many rows and columns?

# df.shape is a (rows, columns) tuple; print it with a label

print('Rows and Columns', df.shape)

In [None]:
# 2) What are the datatypes of each variable?

# Checking for data types
# dtypes is a Series mapping each column name to its dtype.

df.dtypes

In [None]:
# Checking for duplicates (count only; nothing is dropped in this cell)

duplicate_count = df.duplicated().sum()
print("Duplicates", duplicate_count)

**There are no duplicates in the dataset**

In [None]:
# Checking null or nan
# Number of missing values in each column.

df.isna().sum()

In [None]:
# Checking Item_Weight column
# Build a boolean mask of rows with no recorded weight, then show those rows.

no_weight = df['Item_Weight'].isna()
df[no_weight]


In [None]:
# Dropping columns with NAN values as they are not necessary
# Both columns are removed in one call that returns a new frame instead of
# mutating in place. errors='ignore' keeps the cell safe to re-run: once the
# columns are gone, running it again is a no-op rather than a KeyError.

df = df.drop(columns=['Item_Weight', 'Outlet_Size'], errors='ignore')

**Both columns are dropped: although each has a large number of missing values, I believe they have minimal to no effect on sales prediction or on calculating income.**

## Dataframe after dropping columns with NAN/null values

In [None]:
# Checking dataframe after dropping columns
# Item_Weight and Outlet_Size should no longer appear among the columns.

df.head()

In [None]:
# Checking data information after dropping columns
# The non-null counts printed here show whether any missing values remain.

df.info()

## Checking for inconsistencies

In [None]:
# Checking for inconsistencies

# Every column label in the frame
column_names = df.columns

# Print each column's name followed by its distinct values
for col in column_names:
    print(col, df[col].unique())


In [None]:
# Checking for inconsistencies for categorical data

# Keep only the object-dtype columns and remember their names
df_cat = df.select_dtypes(include='object')
cat_names = df_cat.columns

# Print each categorical column's name followed by its distinct values
for col in cat_names:
    print(col, df[col].unique())

In [None]:
# Checking for inconsistencies for numerical data

# Keep only the int64/float64 columns and remember their names
numeric_dtypes = ['int64', 'float64']
df_num = df.select_dtypes(include=numeric_dtypes)
num_names = df_num.columns

# Print each numerical column's name followed by its distinct values
for col in num_names:
    print(col, df[col].unique())

## Validating content of categorical columns

In [None]:
# Checking for value counts
# Number of rows in which each Item_Identifier value appears.

ID = df['Item_Identifier']
ID.value_counts()

In [None]:
# Total number of Item_Identifier entries (ID is a column of df, so one per row)
len(ID)

In [None]:
# Checking integrity of entries
# True/False tally of rows whose identifier is exactly 'FDW13'

FDW = ID.eq('FDW13')
FDW.value_counts()

In [None]:
# Checking integrity of entries
# True/False tally of rows whose identifier is exactly 'DRC01'

DR = ID.eq('DRC01')
DR.value_counts()

**The entries shown as unique values are the identifiers of the items.**

In [None]:
# Filtering Item_Fat_Content column
# List the distinct labels to spot spelling or casing variants.

IFC = df['Item_Fat_Content']
IFC.unique()

In [None]:
# Standardising Item_Fat_Content labels:
#   'low fat' and 'LF' -> 'Low Fat'
#   'reg'              -> 'Regular'

fat_content_fixes = {'low fat': 'Low Fat', 'LF': 'Low Fat', 'reg': 'Regular'}

# Overwrite matching rows in place, one variant at a time
for old_label, new_label in fat_content_fixes.items():
    df.loc[df['Item_Fat_Content'] == old_label, 'Item_Fat_Content'] = new_label

In [None]:
# Checking dataframe after changing variables (refer to above cell)

# Re-print the distinct values of every categorical column
for col in cat_names:
    print(col, df[col].unique())

In [None]:
# Checking for Item_Fat_Content column for value counts
# Re-read the column from df instead of reusing a Series captured before the
# labels were cleaned: with pandas Copy-on-Write an earlier Series keeps the
# old labels, so the counts would not match the cleaned data.

IFC = df['Item_Fat_Content']
IFC.value_counts()

In [None]:
# Checking for Item_Type column for value counts
# Number of rows per item category.

I_Type= df['Item_Type']
I_Type.value_counts()

In [None]:
# Renaming Item Type
# Long category names are replaced by short codes; categories not listed
# here are left untouched.

item_type_codes = {
    'Fruits and Vegetables': 'F&V',
    'Snack Foods': 'SF',
    'Household': 'HH',
    'Frozen Foods': 'FF',
    'Baking Goods': 'BG',
    'Health and Hygiene': 'H&H',
    'Soft Drinks': 'SD',
    'Hard Drinks': 'HD',
    'Starchy Foods': 'SFoods',
    'Breakfast': 'Bfast',
    'Seafood': 'SFood',
}

# Overwrite matching rows in place, one category at a time
for full_name, short_code in item_type_codes.items():
    df.loc[df['Item_Type'] == full_name, 'Item_Type'] = short_code


In [None]:
# Checking for Outlet_Identifier column for value counts
# Number of rows recorded for each outlet.

O_ID = df['Outlet_Identifier']
O_ID.value_counts()

In [None]:
# Checking for Outlet_Type column for value counts
# Number of rows per outlet type.

OType= df['Outlet_Type']
OType.value_counts()

**The categorical columns contain identifiers for items, which is why they show so many unique entries. The validity of the data in each column has been checked. In Item_Fat_Content, the values `low fat` and `LF` were updated to `Low Fat`, and `reg` to `Regular`, for uniformity.**

## Validating content of numerical columns

In [None]:
#import statistics

import statistics

In [None]:
# Filtering Item_Visibility
# Mean (rounded to 2 decimals), minimum and maximum of the column.

item_vis = df['Item_Visibility']
ave_item_vis = statistics.mean(item_vis)

print(f"The visibility is {round(ave_item_vis, 2)}")
print("Min", item_vis.min())
print("Max", item_vis.max())

In [None]:
# Filtering Item_MRP
# Mean (rounded to 2 decimals), minimum and maximum of the column.

MRP = df['Item_MRP']

ave_MRP = statistics.mean(MRP)

print("The average MRP is " + str (round(ave_MRP, 2)))
print("Min",MRP.min())
# Label corrected from the garbled "Mticax" so it matches the other summaries
print("Max",MRP.max())

In [None]:
# Filtering and sorting Outlet_Establishment_Year
# Count rows per establishment year, then list the counts in ascending order.

established_year = df['Outlet_Establishment_Year']
sort_per_year = established_year.value_counts()
sort_per_year.sort_values()

In [None]:
# Filtering and checking min, max and mean for sales column
# Mean (rounded to 2 decimals), minimum and maximum of Item_Outlet_Sales.

sales = df['Item_Outlet_Sales']
average = statistics.mean(sales)

print(f"The average sales is ${round(average, 2)}")
print("Min", sales.min())
print("Max", sales.max())

**The min and max are shown for each column except Outlet_Establishment_Year; for that column the value counts were sorted instead.**

# Project 1 - Part 3 (Core)

## Histogram to view the distributions of various features in your dataset.


In [None]:
# Importing libraries

import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Creating histogram for Item Type

# matplotlib 3.6 renamed the bundled 'seaborn' style sheet to 'seaborn-v0_8'
# and later releases removed the old name, so use whichever one is installed.
seaborn_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(seaborn_style)

plt.figure(figsize=(8,5))
# Read Item_Type from df directly so the bars use the current category labels
# rather than a Series captured before the categories were renamed.
plt.hist(df['Item_Type'], bins=16, alpha=0.7, edgecolor='black', label='Item Type')
plt.title('Distribution of Item Type')
plt.xlabel('Item Type')
plt.ylabel('Count')

plt.xticks(rotation= 20)
plt.yticks(rotation= 20);

In [None]:
# Creating histogram for Outlet Type

# matplotlib 3.6 renamed the bundled 'seaborn' style sheet to 'seaborn-v0_8'
# and later releases removed the old name, so use whichever one is installed.
seaborn_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(seaborn_style)

plt.figure(figsize=(8,5))
plt.hist(OType, bins=4, alpha=0.7, edgecolor='black', label='Outlet Type')
plt.title('Distribution of Outlet Type')
plt.xlabel('Outlet Type')
plt.ylabel('Count');


In [None]:
# Creating histogram for Sales

# matplotlib 3.6 renamed the bundled 'seaborn' style sheet to 'seaborn-v0_8'
# and later releases removed the old name, so use whichever one is installed.
seaborn_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(seaborn_style)

plt.figure(figsize=(7,5))
plt.hist(sales, bins=8, alpha=0.7, edgecolor='black', label='Sales')
plt.title('Distribution of Item Outlet Sales')
plt.xlabel('Item Outlet Sales')
plt.ylabel('Count');

## Boxplot to view statistical summaries of various features in your dataset

In [None]:
# Creating boxplot for Item Outlet Sales and the Type of Outlet
# Horizontal boxes: sales on the x-axis, one box per outlet type on the y-axis.

sns.boxplot(
    data=df,
    x='Item_Outlet_Sales',
    y='Outlet_Type',
);

In [None]:
# Boxplot for Item Outlet Sales and the year the outlet was established
# Vertical boxes: one box per establishment year, sales on the y-axis.

sns.boxplot(
    data=df,
    x='Outlet_Establishment_Year',
    y='Item_Outlet_Sales',
);

## Heatmap of the correlation between features

In [None]:
# Creating correlation
# Restrict to numeric columns before correlating: the frame still holds text
# columns, and pandas 2.0+ raises on them because corr() no longer drops
# non-numeric columns silently.

corr = df.select_dtypes(include='number').corr()

sns.heatmap(corr, cmap = 'Oranges');

In [None]:
# Creating correlation annot= True
# Draws the previously computed `corr` matrix again, this time in blue and
# with each coefficient written inside its cell (annot=True).

sns.heatmap(corr, cmap = 'Blues', annot = True);