## Import libraries and load data

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path
from IPython.display import Markdown as md
pd.options.plotting.backend = "plotly"
%matplotlib inline

raw_df = pd.read_csv("data/en.openfoodfacts.org.products.tsv", sep="\t")

  interactivity=interactivity, compiler=compiler, result=result)


ModuleNotFoundError: No module named 'seaborn'

## Data Exploration

#### First let's take a peek at the data

In [2]:
raw_df.head()

Unnamed: 0,code,url,creator,created_t,created_datetime,last_modified_t,last_modified_datetime,product_name,generic_name,quantity,...,fruits-vegetables-nuts_100g,fruits-vegetables-nuts-estimate_100g,collagen-meat-protein-ratio_100g,cocoa_100g,chlorophyl_100g,carbon-footprint_100g,nutrition-score-fr_100g,nutrition-score-uk_100g,glycemic-index_100g,water-hardness_100g
0,3087,http://world-en.openfoodfacts.org/product/0000...,openfoodfacts-contributors,1474103866,2016-09-17T09:17:46Z,1474103893,2016-09-17T09:18:13Z,Farine de blé noir,,1kg,...,,,,,,,,,,
1,4530,http://world-en.openfoodfacts.org/product/0000...,usda-ndb-import,1489069957,2017-03-09T14:32:37Z,1489069957,2017-03-09T14:32:37Z,Banana Chips Sweetened (Whole),,,...,,,,,,,14.0,14.0,,
2,4559,http://world-en.openfoodfacts.org/product/0000...,usda-ndb-import,1489069957,2017-03-09T14:32:37Z,1489069957,2017-03-09T14:32:37Z,Peanuts,,,...,,,,,,,0.0,0.0,,
3,16087,http://world-en.openfoodfacts.org/product/0000...,usda-ndb-import,1489055731,2017-03-09T10:35:31Z,1489055731,2017-03-09T10:35:31Z,Organic Salted Nut Mix,,,...,,,,,,,12.0,12.0,,
4,16094,http://world-en.openfoodfacts.org/product/0000...,usda-ndb-import,1489055653,2017-03-09T10:34:13Z,1489055653,2017-03-09T10:34:13Z,Organic Polenta,,,...,,,,,,,,,,


In [3]:
raw_df.shape

(356027, 163)

#### Available columns

In [5]:
for i, x in enumerate(raw_df.columns):
    print("{}: {}".format(i, x))

0: code
1: url
2: creator
3: created_t
4: created_datetime
5: last_modified_t
6: last_modified_datetime
7: product_name
8: generic_name
9: quantity
10: packaging
11: packaging_tags
12: brands
13: brands_tags
14: categories
15: categories_tags
16: categories_en
17: origins
18: origins_tags
19: manufacturing_places
20: manufacturing_places_tags
21: labels
22: labels_tags
23: labels_en
24: emb_codes
25: emb_codes_tags
26: first_packaging_code_geo
27: cities
28: cities_tags
29: purchase_places
30: stores
31: countries
32: countries_tags
33: countries_en
34: ingredients_text
35: allergens
36: allergens_en
37: traces
38: traces_tags
39: traces_en
40: serving_size
41: no_nutriments
42: additives_n
43: additives
44: additives_tags
45: additives_en
46: ingredients_from_palm_oil_n
47: ingredients_from_palm_oil
48: ingredients_from_palm_oil_tags
49: ingredients_that_may_be_from_palm_oil_n
50: ingredients_that_may_be_from_palm_oil
51: ingredients_that_may_be_from_palm_oil_tags
52: nutrition_grade_

#### Now let's start cleaning the data

In [6]:
df = raw_df.copy()

In [7]:
# Remove columns with all NaN
df = df.dropna(how="all", axis=1)

#### Some rows indicate more than one country so we will separate them

In [8]:
[x for x in list(df["countries_en"].unique()) if "," in str(x)][:10]

['France,United States',
 'France,United Kingdom',
 'Belgium,France,Netherlands,United Kingdom',
 'United Kingdom,United States',
 'Canada,United States',
 'Australia,France',
 'Canada,France,Switzerland,United States',
 'France,Réunion',
 'Australia,Belgium,Switzerland',
 'France,Germany']

In [9]:
# Make each row a unique country
df["countries_en"] = df["countries_en"].str.split(",") 
df = df.explode('countries_en').reset_index(drop=True)
gained_rows = df.shape[0] - raw_df.shape[0] 
df.shape

(363872, 147)

In [10]:
md("**We have gained {} more rows**".format(gained_rows))

**We have gained 7845 more rows**

In [39]:
country_pct_count = (df.groupby("countries_en")["code"].count()/np.sum(df.groupby("countries_en")["code"].count()))
country_pct_count.rename("percentage").sort_values(ascending=False).head(10).to_frame()

Unnamed: 0_level_0,percentage
countries_en,Unnamed: 1_level_1
United States,0.477783
France,0.355694
Switzerland,0.04732
Germany,0.025863
Spain,0.016671
United Kingdom,0.016467
Belgium,0.011244
Australia,0.006378
Russia,0.004514
Italy,0.004489


In [12]:
md("**The United States and France make up {:.2f}% of the data**".format((country_pct_count["United States"] + country_pct_count["France"])*100))

**The United States and France make up 83.35% of the data**

In [13]:
us_df = df[df["countries_en"] == "United States"]

In [45]:
us_df["main_category_en"].value_counts().nlargest(10).plot(kind="bar").update_layout(showlegend=False)

In [52]:
us_df.corr()

Unnamed: 0,additives_n,ingredients_from_palm_oil_n,ingredients_that_may_be_from_palm_oil_n,energy_100g,energy-from-fat_100g,fat_100g,saturated-fat_100g,-caprylic-acid_100g,-capric-acid_100g,-lauric-acid_100g,...,caffeine_100g,taurine_100g,ph_100g,fruits-vegetables-nuts_100g,fruits-vegetables-nuts-estimate_100g,collagen-meat-protein-ratio_100g,cocoa_100g,carbon-footprint_100g,nutrition-score-fr_100g,nutrition-score-uk_100g
additives_n,1.000000,0.005347,0.242053,-0.014508,-0.101322,-0.149549,-0.061000,,,,...,0.295603,,,-0.305279,,,-0.590283,,0.158047,0.157326
ingredients_from_palm_oil_n,0.005347,1.000000,0.006887,0.003544,0.051917,0.002154,0.002241,,,,...,,,,,,,,,0.006445,0.006453
ingredients_that_may_be_from_palm_oil_n,0.242053,0.006887,1.000000,-0.033238,0.175392,-0.013706,0.014613,,,,...,,,,,,,,,0.030940,0.030163
energy_100g,-0.014508,0.003544,-0.033238,1.000000,0.772946,0.741531,0.531799,,,,...,-0.027583,,,-0.078907,,,0.792415,1.0,0.490296,0.491777
energy-from-fat_100g,-0.101322,0.051917,0.175392,0.772946,1.000000,0.992143,0.714867,,,,...,,,,,,,,,0.552104,0.580010
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
collagen-meat-protein-ratio_100g,,,,,,,,,,,...,,,,,,,,,,
cocoa_100g,-0.590283,,,0.792415,,0.814743,0.726411,,,,...,,,,,,,1.000000,,0.113677,0.113677
carbon-footprint_100g,,,,1.000000,,1.000000,1.000000,,,,...,,,,,,,,1.0,1.000000,1.000000
nutrition-score-fr_100g,0.158047,0.006445,0.030940,0.490296,0.552104,0.564262,0.641661,,,,...,0.352856,,,-0.724156,,,0.113677,1.0,1.000000,0.998978
