In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
import numpy as np
import statistics

In [2]:
# Load the fast-food nutrition table from its CSV file into a DataFrame.
data = pd.read_csv(filepath_or_buffer="Dataset/fastfood.csv")

**The goal of collecting this dataset**

Fast food is known for its convenience and affordability, but it is also infamous for its high-calorie, high-fat, 
and high-sugar content. This dataset aims to shed light on the nutritional value of these fast food products, helping consumers make more informed decisions about their food choices.

With information on calories, fat, carbohydrates, protein, and other key nutrients, this dataset provides a valuable resource for nutritionists, researchers, and health-conscious individuals. By analyzing this dataset, we can gain a better understanding of the nutritional impact of fast food consumption and work towards creating healthier food options in the fast food industry.

**Data**

The dataset we chose is Fast Food Nutrition, which provides a comprehensive breakdown of the nutritional content of various fast food products from popular fast food chains. We obtained the dataset from Kaggle; the URL of the source data is: https://www.kaggle.com/datasets/ulrikthygepedersen/fastfood-nutrition

In [4]:
# Number of observations (rows) in the dataset
data.shape[0]

515

Our dataset consists of 515 records.

In [5]:
# Number of variables (columns) in the dataset
len(data.columns)

17

Our dataset consists of 17 variables.

In [15]:
# Data type of each variable (column), as inferred by pandas when the CSV was read
data.dtypes

restaurant      object
item            object
calories         int64
cal_fat          int64
total_fat        int64
sat_fat        float64
trans_fat      float64
cholesterol      int64
sodium           int64
total_carb       int64
fiber          float64
sugar            int64
protein        float64
vit_a          float64
vit_c          float64
calcium        float64
salad           object
dtype: object

In [7]:
# Preview the first 10 records of the dataset
data.iloc[:10]

Unnamed: 0,restaurant,item,calories,cal_fat,total_fat,sat_fat,trans_fat,cholesterol,sodium,total_carb,fiber,sugar,protein,vit_a,vit_c,calcium,salad
0,Mcdonalds,Artisan Grilled Chicken Sandwich,380,60,7,2.0,0.0,95,1110,44,3.0,11,37.0,4.0,20.0,20.0,Other
1,Mcdonalds,Single Bacon Smokehouse Burger,840,410,45,17.0,1.5,130,1580,62,2.0,18,46.0,6.0,20.0,20.0,Other
2,Mcdonalds,Double Bacon Smokehouse Burger,1130,600,67,27.0,3.0,220,1920,63,3.0,18,70.0,10.0,20.0,50.0,Other
3,Mcdonalds,Grilled Bacon Smokehouse Chicken Sandwich,750,280,31,10.0,0.5,155,1940,62,2.0,18,55.0,6.0,25.0,20.0,Other
4,Mcdonalds,Crispy Bacon Smokehouse Chicken Sandwich,920,410,45,12.0,0.5,120,1980,81,4.0,18,46.0,6.0,20.0,20.0,Other
5,Mcdonalds,Big Mac,540,250,28,10.0,1.0,80,950,46,3.0,9,25.0,10.0,2.0,15.0,Other
6,Mcdonalds,Cheeseburger,300,100,12,5.0,0.5,40,680,33,2.0,7,15.0,10.0,2.0,10.0,Other
7,Mcdonalds,Classic Chicken Sandwich,510,210,24,4.0,0.0,65,1040,49,3.0,6,25.0,0.0,4.0,2.0,Other
8,Mcdonalds,Double Cheeseburger,430,190,21,11.0,1.0,85,1040,35,2.0,7,25.0,20.0,4.0,15.0,Other
9,Mcdonalds,Double Quarter Pounder® with Cheese,770,400,45,21.0,2.5,175,1290,42,3.0,10,51.0,20.0,6.0,20.0,Other


**Some information about the dataset:**

In [8]:
# Summary statistics of the numeric columns: count, mean, standard deviation,
# minimum, quartiles (25%, 50%, 75%) and maximum
data.describe()

Unnamed: 0,calories,cal_fat,total_fat,sat_fat,trans_fat,cholesterol,sodium,total_carb,fiber,sugar,protein,vit_a,vit_c,calcium
count,515.0,515.0,515.0,515.0,515.0,515.0,515.0,515.0,503.0,515.0,514.0,301.0,305.0,305.0
mean,530.912621,238.813592,26.590291,8.153398,0.465049,72.456311,1246.737864,45.664078,4.137177,7.262136,27.891051,18.857143,20.170492,24.852459
std,282.436147,166.40751,18.411876,6.418811,0.839644,63.160406,689.954278,24.883342,3.03746,6.761301,17.683921,31.38433,30.592243,25.522073
min,20.0,0.0,0.0,0.0,0.0,0.0,15.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
25%,330.0,120.0,14.0,4.0,0.0,35.0,800.0,28.5,2.0,3.0,16.0,4.0,4.0,8.0
50%,490.0,210.0,23.0,7.0,0.0,60.0,1110.0,44.0,3.0,6.0,24.5,10.0,10.0,20.0
75%,690.0,310.0,35.0,11.0,1.0,95.0,1550.0,57.0,5.0,9.0,36.0,20.0,30.0,30.0
max,2430.0,1270.0,141.0,47.0,8.0,805.0,6080.0,156.0,17.0,87.0,186.0,180.0,400.0,290.0


In [10]:
# Count the missing values in each column
data.isna().sum()

restaurant       0
item             0
calories         0
cal_fat          0
total_fat        0
sat_fat          0
trans_fat        0
cholesterol      0
sodium           0
total_carb       0
fiber           12
sugar            0
protein          1
vit_a          214
vit_c          210
calcium        210
salad            0
dtype: int64

In [16]:
# Variance of each numeric variable.
# `restaurant`, `item` and `salad` are text (object) columns, so the reduction
# is restricted to numeric columns explicitly: without numeric_only=True,
# pandas 2.x raises a TypeError on the text columns, and pandas 1.x only works
# by silently dropping them while emitting a FutureWarning.
data.var(numeric_only=True)

  data.var()


calories        79770.177175
cal_fat         27691.459348
total_fat         338.997182
sat_fat            41.201131
trans_fat           0.705002
cholesterol      3989.236901
sodium         476036.905859
total_carb        619.180711
fiber               9.226165
sugar              45.715198
protein           312.721050
vit_a             984.976190
vit_c             935.885311
calcium           651.376186
dtype: float64

In [None]:
# Distribution of every numeric nutrition variable: one histogram per column.
# A single loop replaces fourteen copies of the same cell, so the list of
# plotted columns lives in one place and is easy to extend or trim.
hist_columns = [
    'calories', 'cal_fat', 'total_fat', 'sat_fat', 'trans_fat',
    'cholesterol', 'sodium', 'total_carb', 'fiber', 'sugar',
    'protein', 'vit_a', 'vit_c', 'calcium',
]
for hist_column in hist_columns:
    # Each call draws its own figure for the given column.
    data.hist(column=hist_column)

In [None]:
# Joint plot over the whole frame; dropna=True asks seaborn to discard
# observations with missing values before plotting.
# NOTE(review): no x= / y= variables are passed, so which values end up on the
# axes depends on how the installed seaborn version interprets the wide-form
# frame — confirm this really shows the intended nutrition columns
# (sns.pairplot may be what is wanted here).
sns.jointplot(data= data, dropna =True)
plt.show()

In [None]:
# Visualize the missing values of every column as a bar chart (missingno)
msno.bar(data)

In [None]:
# Mean of every numeric nutrition column, computed column by column.
mean_columns = [
    'calories', 'cal_fat', 'total_fat', 'sat_fat', 'trans_fat',
    'cholesterol', 'sodium', 'total_carb', 'fiber', 'sugar',
    'protein', 'vit_a', 'vit_c', 'calcium',
]
column_means = {mean_column: data[mean_column].mean() for mean_column in mean_columns}

# Keep the individual variables so each mean stays available by name.
meancal = column_means['calories']
meancal_fat = column_means['cal_fat']
meantotal_fat = column_means['total_fat']
meansat_fat = column_means['sat_fat']
meantrans_fat = column_means['trans_fat']
meancholesterol = column_means['cholesterol']
meansodium = column_means['sodium']
meantotal_carb = column_means['total_carb']
meanfiber = column_means['fiber']
meansugar = column_means['sugar']
meanprotein = column_means['protein']
meanvit_a = column_means['vit_a']
meanvit_c = column_means['vit_c']
meancalcium = column_means['calcium']

# Report labels, listed in the same order as `mean_columns`.
mean_labels = [
    "Calories mean: ",
    "Cal_Fat mean: ",
    "Total Fat mean: ",
    "Sat_fat mean: ",
    "Trans_fat mean: ",
    "Cholesterol mean: ",
    "Sodium mean: ",
    "Total Carb mean: ",
    "Fiber mean: ",
    "Sugar mean: ",
    "Protein mean: ",
    "Vit A mean: ",
    "Vit C mean: ",
    "Calcium mean: ",
]
for mean_label, mean_column in zip(mean_labels, mean_columns):
    print(mean_label, column_means[mean_column])

# Data Preprocessing 
#  Drop unneeded columns
We decided to remove the `salad` column since most of the items are not salads, and the `vit_a`, `vit_c` and `calcium` columns since they contain a lot of missing values.
#  Convert some data types
We also decided to convert the types of some columns: those stored with the generic `object` type need to be converted to strings.
#  Drop rows with missing values
We also decided that we do not need any item that does not contain all of the nutrition information.
#  Normalization
The last step is to normalize some of the columns of the dataset.

In [None]:
# First, inspect the dataset structure (columns, non-null counts, dtypes) before preprocessing
data.info()

In [None]:
# Drop the columns that are not needed: `salad`, plus `vit_a`, `vit_c` and
# `calcium`, which are missing for more than 200 of the 515 items.
# One non-mutating call replaces four in-place drops; errors='ignore' keeps the
# cell re-runnable (the in-place version raised a KeyError on a second run
# because the columns were already gone).
unneeded_columns = ['salad', 'vit_a', 'vit_c', 'calcium']
data = data.drop(columns=unneeded_columns, errors='ignore')

In [None]:
# `restaurant` and `item` are loaded as generic `object` columns; store them
# with the dedicated pandas "string" dtype instead.
for text_column in ('restaurant', 'item'):
    data[text_column] = data[text_column].astype("string")

In [None]:
# Show the dataset info again after converting the variable types and removing the unneeded columns
data.info()

# Dropping rows with missing values
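We only need the rows that have all of the nutrition information, so we drop every row that still has a missing value (a few items lack `fiber` or `protein`).
As the checks below show, the dataset no longer contains any missing values after this step.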

In [None]:
# Keep only complete records: discard every row that has at least one missing value
data = data.dropna(axis=0, how='any')

In [None]:
# Missing values remaining in each column after dropping the incomplete rows
data.isna().sum()

In [None]:
# Number of observations left after dropping rows with missing values
data.shape[0]