# Scraping A Large Dataset of Foods

## Getting the lists of foods for training
in order to get a list of foods for training, we will be scraping the website "Open Food Facts" that has thousands of foods, their nutrition facts, as well as their Nutri-score

## Step 1: Get the list of foods from the dataset
This takes about a while, so we have saved the data to a csv to load to save time

In [3]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

def get_food_links(url):
    '''Takes a url and returns a dataframe with the food names and links to the food pages
    
    Args:
        url (str): url to scrape

    Returns:
        foods (dataframe): dataframe with food names and links to the food pages
    '''

    # Get the data
    r = requests.get(url)
    # soup response
    soup = BeautifulSoup(r.text, "html.parser")
    # Create a dataframe
    foods = pd.DataFrame(columns=["name", "url"])
    foods = foods.set_index("name")

    # Get unordered list of foods
    ul = soup.find_all('ul')[10]
    # Get the links from the foods
    a = ul.find_all('a')
    for i in a:
        # extract and add their title and href to the dataframe
        foods.loc[i.get("title")] = "https://us.openfoodfacts.org"+i.get("href")
        
    return foods

In [4]:
df_food_links = pd.DataFrame(columns=["name", "url"])
df_food_links = df_food_links.set_index("name")

for i in range(1, 500):
    url = "https://us.openfoodfacts.org/" + str(i)
    df_food_links = pd.concat([df_food_links, get_food_links(url)])
    if (i % 5 == 0):
        df_food_links.to_csv("food_links.csv")

df_food_links.to_csv("food_links.csv")

### Load CSV

In [1]:
df_food_links = pd.read_csv("food_links.csv")

NameError: name 'pd' is not defined

## Step 2: Finding the nutritional data from each foods

### Finding the Nutri-score for each food

In [6]:
from bs4 import BeautifulSoup
import requests


def find_nutriscore(url):
    """Find the nutriscore of the product
    
    Args:
        url (str): url of the product
        
    Returns:
        nutriscore (int): nutriscore of the product"""
    response = requests.get(url)
    soup = BeautifulSoup(response.text)
    strongs = soup.find_all('strong')
    for strong in strongs:
        if strong is not None:
            if ('Score nutritionnel' in strong.text):
                return strong.text.split(": ")[1]

### Extracting the nutritional data for each food

In [7]:
''' 
So currently, this function isn't optimal
- Issues with the dataframe, where the column titles we want in the main dataframe are all stored in a column and
  values are stored in the column next to it
- I guess we could look at the 100g/100ml column, or the serving size column(just need to drop some columns on the end of scraped df)
- Need to figure out which nutrient rows to keep and which to get rid of, not all food items have the same things
  - Could just add a 0 or NULL value if it doesn't exist, but then normalizing would get weird because we have to drop NaN values
- I completely forgot to scrape the nutrition score letter
'''



def extract_nutrition(url):
    """ takes href input to pull out the nutrition information

    Args:
        url (str): link to pull nutrition information from

    Output:
        table (dataframe): something to add onto the row of the original dataframe
    """
    # get all tables
    list_of_tables = pd.read_html(url)
    
    # select table of nutrients
    table = list_of_tables[0]
    table["Nutrient"] = table["Nutrition facts"]

    # get only the column that is as sold for 100g or 100ml and the nutrients
    table = table[['As sold for 100 g / 100 ml', 'Nutrient']]

    table = table.set_index('Nutrient')
    table.loc['nutriscore'] = find_nutriscore(url)

    return table

In [8]:
example_food = extract_nutrition("https://us.openfoodfacts.org/product/3017620422003/nutella-ferrero")

In [9]:
example_food

Unnamed: 0_level_0,As sold for 100 g / 100 ml
Nutrient,Unnamed: 1_level_1
Energy,"2,252 kj (539 kcal)"
Fat,30.9 g
Saturated fat,10.6 g
Salt,0.107 g
Carbohydrates,57.5 g
Sugars,56.3 g
Proteins,6.3 g
Alcohol,0 % vol
Fruits‚ vegetables‚ nuts and rapeseed‚ walnut and olive oils (estimate from ingredients list analysis),0 %
nutriscore,26


In [10]:
def format_food(name, item_df):
    """ formats this item dataframe to be added to the main dataframe
    
    Args:
        name (str): name of the food item
        item_df (dataframe): dataframe of the food item to be formatted
        
    Returns:
        new_item_df (dataframe): formatted dataframe
    """

    new_item_df = item_df.swapaxes("index", "columns")
    new_item_df = new_item_df.reset_index(drop=True)
    new_item_df["item"] = name
    new_item_df = new_item_df.set_index("item")
    
    return new_item_df

### Putting new nutritional information into a new training pandas dataframe

In [12]:
from tqdm import tqdm 
import warnings

warnings.filterwarnings("ignore")

df_train = pd.DataFrame()

# for each row in the dataframe, get the nutrition information
for index, row in tqdm(df_food_links.iterrows(), "Extracting Nutritional Information"):
    try:
        df_to_add = format_food(row['name'], extract_nutrition(row['url']))
        df_train = pd.concat([df_train, df_to_add])
        # export to csv
        if (index % 10 == 0):
            df_train.to_csv("train_unprocessed.csv")
    except:
        continue

Extracting Nutritional Information: 48it [01:46,  2.16s/it]

Error. Skipping...


Extracting Nutritional Information: 153it [05:47,  1.67s/it]

Error. Skipping...


Extracting Nutritional Information: 175it [06:34,  1.91s/it]

Error. Skipping...


Extracting Nutritional Information: 188it [07:02,  1.80s/it]

Error. Skipping...


Extracting Nutritional Information: 249it [09:16,  1.90s/it]

Error. Skipping...


Extracting Nutritional Information: 286it [10:36,  1.94s/it]

Error. Skipping...


Extracting Nutritional Information: 323it [12:07,  1.82s/it]

Error. Skipping...


Extracting Nutritional Information: 345it [12:54,  1.78s/it]

Error. Skipping...


Extracting Nutritional Information: 399it [14:51,  2.02s/it]

Error. Skipping...


Extracting Nutritional Information: 402it [14:57,  1.99s/it]

Error. Skipping...


Extracting Nutritional Information: 415it [15:24,  1.85s/it]

Error. Skipping...


Extracting Nutritional Information: 463it [17:03,  1.83s/it]

Error. Skipping...


Extracting Nutritional Information: 577it [21:32,  1.94s/it]

Error. Skipping...


Extracting Nutritional Information: 601it [22:23,  1.88s/it]

Error. Skipping...


Extracting Nutritional Information: 652it [24:09,  1.85s/it]

Error. Skipping...


Extracting Nutritional Information: 812it [47:26,  1.77s/it] 

Error. Skipping...


Extracting Nutritional Information: 986it [53:24,  1.87s/it]

Error. Skipping...


Extracting Nutritional Information: 1050it [55:34,  1.68s/it]

Error. Skipping...


Extracting Nutritional Information: 1102it [58:07,  1.91s/it]

Error. Skipping...


Extracting Nutritional Information: 1367it [1:07:43,  1.86s/it]

Error. Skipping...


Extracting Nutritional Information: 1493it [1:12:54,  1.88s/it]

Error. Skipping...


Extracting Nutritional Information: 1652it [1:19:22,  2.26s/it]

Error. Skipping...


Extracting Nutritional Information: 1716it [1:21:57,  2.09s/it]

Error. Skipping...


Extracting Nutritional Information: 1838it [1:27:11,  1.84s/it]

Error. Skipping...


Extracting Nutritional Information: 2000it [1:34:31,  3.36s/it]

Error. Skipping...


Extracting Nutritional Information: 2090it [1:38:04,  1.79s/it]

Error. Skipping...


Extracting Nutritional Information: 2233it [1:43:16,  1.68s/it]

Error. Skipping...


Extracting Nutritional Information: 2583it [1:56:04,  1.92s/it]

Error. Skipping...


Extracting Nutritional Information: 2612it [1:57:23,  1.96s/it]

Error. Skipping...


Extracting Nutritional Information: 2624it [1:57:47,  2.02s/it]

Error. Skipping...


Extracting Nutritional Information: 2749it [2:02:15,  2.10s/it]

Error. Skipping...


Extracting Nutritional Information: 2909it [2:07:53,  1.93s/it]

Error. Skipping...


Extracting Nutritional Information: 3087it [2:14:13,  1.95s/it]

Error. Skipping...


Extracting Nutritional Information: 3333it [2:24:05,  2.08s/it]

Error. Skipping...


Extracting Nutritional Information: 3422it [2:28:12,  1.86s/it]

Error. Skipping...


Extracting Nutritional Information: 3456it [2:29:42,  1.92s/it]

Error. Skipping...


Extracting Nutritional Information: 3497it [2:31:06,  1.86s/it]

Error. Skipping...


Extracting Nutritional Information: 3771it [2:40:41,  1.85s/it]

Error. Skipping...


Extracting Nutritional Information: 4036it [2:50:34,  1.88s/it]

Error. Skipping...


Extracting Nutritional Information: 4201it [2:56:04,  1.71s/it]

Error. Skipping...


Extracting Nutritional Information: 4208it [2:56:15,  1.67s/it]

Error. Skipping...


Extracting Nutritional Information: 4220it [2:56:42,  1.91s/it]

Error. Skipping...


Extracting Nutritional Information: 4236it [2:57:15,  1.75s/it]

Error. Skipping...


Extracting Nutritional Information: 4239it [2:57:21,  2.00s/it]

Error. Skipping...


Extracting Nutritional Information: 4328it [3:00:16,  1.73s/it]

Error. Skipping...


Extracting Nutritional Information: 4380it [3:01:59,  1.80s/it]

Error. Skipping...


Extracting Nutritional Information: 4431it [3:03:39,  1.90s/it]

Error. Skipping...


Extracting Nutritional Information: 4530it [3:06:53,  1.85s/it]

Error. Skipping...


Extracting Nutritional Information: 4557it [3:07:45,  1.63s/it]

Error. Skipping...


Extracting Nutritional Information: 4721it [3:13:10,  1.79s/it]

Error. Skipping...


Extracting Nutritional Information: 4742it [3:13:55,  1.81s/it]

Error. Skipping...


Extracting Nutritional Information: 4749it [3:14:07,  1.63s/it]

Error. Skipping...


Extracting Nutritional Information: 4766it [3:14:41,  1.75s/it]

Error. Skipping...


Extracting Nutritional Information: 4992it [3:22:45,  2.11s/it]

Error. Skipping...


Extracting Nutritional Information: 5145it [3:28:38,  1.90s/it]

Error. Skipping...


Extracting Nutritional Information: 5248it [3:32:07,  1.73s/it]

Error. Skipping...


Extracting Nutritional Information: 5313it [3:34:07,  1.63s/it]

Error. Skipping...


Extracting Nutritional Information: 5327it [3:34:56,  3.97s/it]

Error. Skipping...


Extracting Nutritional Information: 5343it [3:35:25,  1.75s/it]

Error. Skipping...


Extracting Nutritional Information: 5351it [3:35:40,  1.67s/it]

Error. Skipping...


Extracting Nutritional Information: 5368it [3:36:10,  1.84s/it]

Error. Skipping...


Extracting Nutritional Information: 5602it [3:43:54,  1.74s/it]

Error. Skipping...


Extracting Nutritional Information: 5736it [3:47:54,  1.54s/it]

Error. Skipping...


Extracting Nutritional Information: 5844it [3:51:18,  1.95s/it]

Error. Skipping...


Extracting Nutritional Information: 5881it [3:52:23,  1.55s/it]

Error. Skipping...


Extracting Nutritional Information: 5919it [3:53:35,  1.77s/it]

Error. Skipping...


Extracting Nutritional Information: 5960it [3:54:39,  1.47s/it]

Error. Skipping...


Extracting Nutritional Information: 6254it [4:05:15,  2.50s/it]

Error. Skipping...


Extracting Nutritional Information: 6255it [4:05:17,  2.48s/it]

Error. Skipping...


Extracting Nutritional Information: 6641it [4:18:48,  2.08s/it]

Error. Skipping...


Extracting Nutritional Information: 6772it [4:23:54,  1.94s/it]

In [None]:
df = pd.read_csv("train_unprocessed.csv")
df

In [9]:
import pandas as pd

df_items = pd.read_csv("train_unprocessed.csv")
# only keep certain columns
df_items = df_items[['item', 'Energy', 'Fat', 'Saturated fat', 'Carbohydrates', 'Sugars', 'Proteins', 'Fruits‚ vegetables‚ nuts and rapeseed‚ walnut and olive oils (estimate from ingredients list analysis)', 'nutriscore']]
# add units to column names and remove from entries
df_items.columns = ['item', 'Energy (kcal)', 'Fat (g)', 'Saturated fat (g)', 'Carbohydrates (g)', 'Sugars (g)', 'Proteins (g)', 'Fruits (%)', 'nutriscore']
# replace energy with just the number of kcal
df_items['Energy (kcal)'] = df_items['Energy (kcal)'].str.replace(r'[^(]*\(|\)[^)]*', '')
df_items['Energy (kcal)'] = df_items['Energy (kcal)'].str.replace('kcal', '')
df_items['Fruits (%)'] = df_items['Fruits (%)'].str.replace('%', '')
# replace all g with just the number
to_change = ['Fat (g)', 'Saturated fat (g)', 'Carbohydrates (g)', 'Sugars (g)', 'Proteins (g)']
for i in to_change:
    df_items[i] = df_items[i].str.replace('g', '')

# for all columns, except for item and nutriscore drop non numberical entries
for i in df_items.columns:
    if i != 'item' and i != 'nutriscore':
        df_items[i] = pd.to_numeric(df_items[i], errors='coerce')

# drop rows with NaN values
df_items.dropna(inplace=True)

  df_items['Energy (kcal)'] = df_items['Energy (kcal)'].str.replace(r'[^(]*\(|\)[^)]*', '')


In [8]:
df_items.to_csv("train_final_2.csv")

In [4]:
import pandas as pd

df_items = pd.read_csv("train_final.csv")

to_change = ['Fat (g)', 'Saturated fat (g)', 'Carbohydrates (g)', 'Sugars (g)', 'Proteins (g)']
for i in to_change:
    df_items[i] = (df_items[i] * df_items[i].std() + df_items[i].mean()) 

df_items.to_csv("train_final.csv")

### We now have a CSV with the processed dataframe and it is ready to be used for training by our model