In [1]:
# Libraries to analyze web page source 
from bs4 import BeautifulSoup
import requests

import time

import pandas as pd

In [2]:
import numpy as np

def convert(value,initial_unit,final_unit):
    '''Return `value` re-expressed from `initial_unit` in `final_unit`.

        value - numerical value (e.g. 100) of an amount (e.g. '100 g')
        initial_unit - unit of the amount (e.g. 'g')
        final_unit - desired unit (e.g. 'mg')

    A unit is an optional metric prefix followed by one of the reference
    units 'A', 'K', 's', 'm', 'g' or 'cd'.

    Returns value * 10 ** (initial exponent - final exponent) when both
    units share the same reference unit, and np.nan when they do not
    (e.g. 'g' -> 's').

    Raises ValueError when a unit does not end in a known reference unit
    (e.g. 'kcal', 'IU') or carries an unknown prefix.
    '''
    # Metric prefix -> power of ten
    exponents = {
        'y': -24, 'z': -21, 'a': -18, 'f': -15, 'p': -12, 'n': -9,
        '\u00b5': -6,  # micro sign
        '\u03bc': -6,  # Greek small letter mu, visually identical to the micro sign
        'm': -3, 'c': -2, 'd': -1, '': 0, 'da': 1, 'h': 2, 'k': 3,
        'M': 6, 'G': 9, 'T': 12, 'P': 15, 'E': 18, 'Z': 21, 'Y': 24,
    }

    # Reference units; no two end in the same character, so at most one
    # of them can be the suffix of a given unit string.
    ref_units = ('A','K','s','m','g','cd')

    def split_unit(unit):
        # Split a unit into (prefix, reference unit) by its suffix, so that
        # units such as 'mm' (prefix 'm' + unit 'm') are handled correctly.
        for ru in ref_units:
            if unit.endswith(ru):
                return unit[:len(unit) - len(ru)], ru
        raise ValueError('unrecognized unit: {!r}'.format(unit))

    initial_unit_prefix, initial_ref_unit = split_unit(initial_unit)
    final_unit_prefix, final_ref_unit = split_unit(final_unit)

    # Amounts measured in different reference units cannot be converted
    if final_ref_unit != initial_ref_unit:
        return(np.nan)

    for prefix, unit in ((initial_unit_prefix, initial_unit), (final_unit_prefix, final_unit)):
        if prefix not in exponents:
            raise ValueError('unrecognized metric prefix {!r} in unit {!r}'.format(prefix, unit))

    # Return value corresponding to amount with desired unit
    return(value * 10 ** (exponents[initial_unit_prefix] - exponents[final_unit_prefix]))

In [9]:
# Key word for USDA search engine
keyword = 'raw'

# URL templates.
#   SEARCH_URL.format(keyword, offset)       -> one page of search results
#   FOOD_URL.format(NDBID, offset, keyword)  -> nutrient report for one food
SEARCH_URL = "https://ndb.nal.usda.gov/ndb/search/list?maxsteps=6&format=&count=&max=25&sort=default&fgcd=&manu=&lfacet=&qlookup={}&ds=&qt=&qp=&qa=&qn=&q=&ing=&offset={}&order=asc"
FOOD_URL = "https://ndb.nal.usda.gov/ndb/foods/show/{}?fgcd=&manu=&format=&count=&max=25&offset={}&sort=default&order=asc&qlookup={}&ds=&qt=&qp=&qa=&qn=&q=&ing="

# Results per search page (matches max=25 in SEARCH_URL)
PAGE_SIZE = 25
# Seconds to wait for a response before giving up instead of hanging forever
REQUEST_TIMEOUT = 30


def fetch_soup(url):
    '''Download `url` and return its content parsed with 'html.parser'.

    Raises requests.HTTPError on a 4xx/5xx response, so an error page is
    reported as such instead of being parsed as if it were a result page.
    '''
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    return BeautifulSoup(response.content, 'html.parser')


# The first result page provides the pagination info, the total number of
# entries and the first batch of rows, so it is downloaded only once.
first_page = fetch_soup(SEARCH_URL.format(keyword, 0))

# NOTE: the positional child indexes used below ([31], [33], [1], [3], ...)
# depend on the exact page layout; if the layout differs they raise
# IndexError.
max_offset = int(str(list(list(first_page.children)[31].children)[-3]).split('offset=')[1].split('&')[0])
num_of_entries = int(first_page.find('div',"alert alert-info result-message").text.split('f')[0].strip().replace(',',''))

# One row per food; nutrient columns are added as they are first encountered
df = pd.DataFrame(index = range(0,num_of_entries))
# Index to insert values into df
index = 0

# Empty lists to hold NDBIDs, food names, and food groups
NDBIDs = []
FoodNames = []
FoodGroups = []

for offset in range(0, max_offset + 1, PAGE_SIZE):

    if offset == 0:
        list_soup = first_page
    else:
        list_soup = fetch_soup(SEARCH_URL.format(keyword, offset))

    # List containing NDBIDs, food names, and food groups for one web page
    rows = list(list(list(list(list_soup.children)[33].children)[1].children)[3].children)

    # Only every second child (odd positions) is read as a result row
    for j in range(1, len(rows), 2):
        # String containing NDBID, food name, food group for a row
        row_text = rows[j].text.strip()
        # List containing NDBID, food name, food group for a row
        identifiers = row_text[row_text.find('\n'):].strip().split('\n')
        NDBID = identifiers[0].strip()
        NDBIDs.append(NDBID)
        FoodNames.append(identifiers[4].split('\t\t')[-1].strip())
        FoodGroups.append(identifiers[-1])

        # Kept in its own variable so the list page is not shadowed
        food_soup = fetch_soup(FOOD_URL.format(NDBID, offset, keyword))

        # Fourth header cell of the first table row; its third and fourth
        # space-separated tokens are read as the reference amount and unit
        column_name_elements = food_soup.find_all('tr')[0].find_all('th')[3].text.split(' ')
        denominator_value = float(column_name_elements[2])
        denominator_unit = column_name_elements[3]

        # Collected once per food instead of once per table-cell lookup
        cells = food_soup.find_all('td')

        # Cells are read in strides of six: name at i, unit at i+1, value at i+2
        for i in range(1, len(cells), 6):
            try:
                nutrient = cells[i].text.strip()
                val = float(cells[i+2].text.strip())
                unit = cells[i+1].text.strip()

                try:
                    d = convert(denominator_value, denominator_unit, unit)
                    ratio = val / d

                except Exception:
                    # Units convert() cannot handle: fall back to expressing
                    # the denominator in grams
                    d = convert(denominator_value, denominator_unit, 'g')
                    ratio = val / d

                # Insert info into df; a single .loc[row, column] assignment
                # writes into df itself, whereas df.loc[row][column] = ...
                # may write into a temporary copy
                if nutrient not in df.columns:
                    df[nutrient] = None
                df.loc[index, nutrient] = ratio
            except Exception:
                # Best effort: skip cells that cannot be parsed as a nutrient
                # row, while still letting KeyboardInterrupt stop the scrape
                continue

        index = index + 1

    # Delay 1 second between result pages
    time.sleep(1)

IndexError: list index out of range