In [1]:
# Data source: https://github.com/ArashVafa/DESC624/blob/master/nutrition_subset
# Download the file and save it as nutrition_subset.csv in the notebook's working directory.
import pandas as pd
import numpy as np
# load the nutrition data; the relative path is resolved against the working directory
DATA_FILE = 'nutrition_subset.csv'
df = pd.read_csv(DATA_FILE)

# Q.1

In [2]:
# add saturated_fat_per_gram: saturated_fat divided element-wise by weight_in_grams
df['saturated_fat_per_gram'] = df['saturated_fat'].div(df['weight_in_grams'])

In [3]:
# rank rows from highest to lowest saturated_fat_per_gram, then renumber the
# index from 0 so that the index label equals the rank position
df = (
    df.sort_values(by='saturated_fat_per_gram', ascending=False)
      .reset_index(drop=True)
)

In [4]:
df.head(5)  # the five food items highest in saturated fat per gram (df is sorted in descending order)

Unnamed: 0,food item,weight_in_grams,saturated_fat,cholesterol,saturated_fat_per_gram
0,BUTTER; SALTED 1 TBSP,14.0,7.1,31,0.507143
1,BUTTER; UNSALTED 1 TBSP,14.0,7.1,31,0.507143
2,BUTTER; UNSALTED 1/2 CUP,113.0,57.1,247,0.50531
3,BUTTER; SALTED 1/2 CUP,113.0,57.1,247,0.50531
4,BUTTER; UNSALTED 1 PAT,5.0,2.5,11,0.5


In [5]:
# which food has the most saturated fat per gram?
# df is sorted in descending order, so it is the first row's 'food item'
df['food item'].iloc[0]

'BUTTER; SALTED                1 TBSP  '

# Q.2

Reference: https://medium.datadriveninvestor.com/finding-outliers-in-dataset-using-python-efc3fce6ce32

In [6]:
# standardize the saturated_fat_per_gram column as z-scores, (x - mean) / std,
# following the article linked above; the column is overwritten with the result
# NOTE: Series.std() is the sample standard deviation (pandas default ddof=1)
fat_col = 'saturated_fat_per_gram'
mean = df[fat_col].mean()
standard_deviation = df[fat_col].std()
df[fat_col] = df[fat_col].sub(mean).div(standard_deviation)

In [7]:
df  # display the data frame; saturated_fat_per_gram now holds the z-scores

Unnamed: 0,food item,weight_in_grams,saturated_fat,cholesterol,saturated_fat_per_gram
0,BUTTER; SALTED 1 TBSP,14.0,7.1,31,7.106775
1,BUTTER; UNSALTED 1 TBSP,14.0,7.1,31,7.106775
2,BUTTER; UNSALTED 1/2 CUP,113.0,57.1,247,7.079055
3,BUTTER; SALTED 1/2 CUP,113.0,57.1,247,7.079055
4,BUTTER; UNSALTED 1 PAT,5.0,2.5,11,6.998763
...,...,...,...,...,...
956,TOMATOES; RAW 1 TOMATO,123.0,0.0,0,-0.562043
957,RASPBERRIES; RAW 1 CUP,123.0,0.0,0,-0.562043
958,TOMATO JUICE; CANNED W/O SALT 1 CUP,244.0,0.0,0,-0.562043
959,TOMATO JUICE; CANNED WITH SALT1 CUP,244.0,0.0,0,-0.562043


In [8]:
# first quartile (25%) and third quartile (75%) of the standardized column
qtr1, qtr3 = np.quantile(df['saturated_fat_per_gram'], [0.25, 0.75])

In [9]:
iqr = qtr3 - qtr1  # interquartile range (IQR): third quartile minus first quartile

In [10]:
iqr  # show the computed IQR

0.7159620336105645

In [11]:
# outlier bounds: 1.5 * IQR below the first quartile and above the third quartile
fence = 1.5 * iqr
lower_bound = qtr1 - fence
upper_bound = qtr3 + fence

In [12]:
lower_bound  # show the lower outlier bound

-1.6359856281662304

In [13]:
upper_bound  # show the upper outlier bound

1.2278625062760282

In [14]:
# all outliers: rows whose standardized value is below lower_bound or above upper_bound
fat_z = df['saturated_fat_per_gram']
df[fat_z.lt(lower_bound) | fat_z.gt(upper_bound)]

Unnamed: 0,food item,weight_in_grams,saturated_fat,cholesterol,saturated_fat_per_gram
0,BUTTER; SALTED 1 TBSP,14.0,7.1,31,7.106775
1,BUTTER; UNSALTED 1 TBSP,14.0,7.1,31,7.106775
2,BUTTER; UNSALTED 1/2 CUP,113.0,57.1,247,7.079055
3,BUTTER; SALTED 1/2 CUP,113.0,57.1,247,7.079055
4,BUTTER; UNSALTED 1 PAT,5.0,2.5,11,6.998763
...,...,...,...,...,...
80,BEEF ROAST; RIB; LEAN + FAT 3 OZ,85.0,10.8,72,1.359292
81,MAYONNAISE; REGULAR 1 TBSP,14.0,1.7,8,1.274153
82,LIGHT; COFFEE OR TABLE CREAM 1 TBSP,15.0,1.8,10,1.252551
83,LIGHT; COFFEE OR TABLE CREAM 1 CUP,240.0,28.8,159,1.252551


In [15]:
# outliers at the high end of the scale: rows above upper_bound
df.loc[df['saturated_fat_per_gram'].gt(upper_bound)]

Unnamed: 0,food item,weight_in_grams,saturated_fat,cholesterol,saturated_fat_per_gram
0,BUTTER; SALTED 1 TBSP,14.0,7.1,31,7.106775
1,BUTTER; UNSALTED 1 TBSP,14.0,7.1,31,7.106775
2,BUTTER; UNSALTED 1/2 CUP,113.0,57.1,247,7.079055
3,BUTTER; SALTED 1/2 CUP,113.0,57.1,247,7.079055
4,BUTTER; UNSALTED 1 PAT,5.0,2.5,11,6.998763
...,...,...,...,...,...
80,BEEF ROAST; RIB; LEAN + FAT 3 OZ,85.0,10.8,72,1.359292
81,MAYONNAISE; REGULAR 1 TBSP,14.0,1.7,8,1.274153
82,LIGHT; COFFEE OR TABLE CREAM 1 TBSP,15.0,1.8,10,1.252551
83,LIGHT; COFFEE OR TABLE CREAM 1 CUP,240.0,28.8,159,1.252551


In [16]:
# outliers at the low end of the scale: rows below lower_bound
# (the result is empty, so there are no low-end outliers)
df.loc[df['saturated_fat_per_gram'].lt(lower_bound)]

Unnamed: 0,food item,weight_in_grams,saturated_fat,cholesterol,saturated_fat_per_gram


In [17]:
# number of rows below lower_bound (count of True values in the boolean mask)
int(df['saturated_fat_per_gram'].lt(lower_bound).sum())

0