In [1]:
from bs4 import BeautifulSoup
from html.parser import HTMLParser
from io import StringIO
from ipynb.fs.defs.Barcode import BAR
import re
import requests

In [2]:
#Function to remove HTML Tags from string
class MLStripper(HTMLParser):
    """HTML parser that drops all markup and keeps only the text content.

    Every chunk of text the parser reports through ``handle_data`` is appended
    to the in-memory buffer ``self.text``; ``get_data`` returns everything
    collected so far as a single string.
    """

    def __init__(self):
        super().__init__()
        self.reset()
        # Buffer that accumulates the text found between tags.
        self.text = StringIO()
        # Character references such as "&amp;" are decoded by the parser
        # before the text reaches handle_data.
        self.convert_charrefs = True
        # Not read anywhere in the visible code; retained unchanged.
        self.strict = False

    def handle_data(self, d):
        """Parser callback: append the text chunk ``d`` to the buffer."""
        self.text.write(d)

    def get_data(self):
        """Return all text collected so far."""
        collected = self.text.getvalue()
        return collected

def strip_tags(html):
    """Return ``html`` with every tag removed, keeping only the text content.

    Parameters
    ----------
    html : str
        Markup to strip.

    Returns
    -------
    str
        The text collected by ``MLStripper`` while parsing ``html``.
    """
    s = MLStripper()
    s.feed(html)
    # feed() holds back trailing text that ends in an unterminated "&"
    # (e.g. "AT&T") because more input might still complete a character
    # reference. close() tells the parser the input is finished so that
    # text is flushed instead of being silently dropped.
    s.close()
    return s.get_data()

In [3]:
#Function specific to our preprocessing needs
def Preprocessing(text):
    """Turn a fragment of table HTML into a clean, single-line text value.

    Steps, in order: ``<br/>`` becomes a space, the remaining tags are removed
    with ``strip_tags``, then newlines, commas and square brackets are deleted
    and every run of whitespace is collapsed to a single space.
    """
    plain = strip_tags(str(text.replace("<br/>", " ")))
    # Characters that are removed outright (not replaced by a space).
    for unwanted in ("\n", ",", "[", "]"):
        plain = plain.replace(unwanted, "")
    # split()/join trims both ends and collapses internal whitespace runs.
    return ' '.join(plain.split())

In [4]:
#Function to get the column headers of the nutrition table
#The column headers dictate what the numbers mean for each nutrient constituent
#[Example of a column: As sold for 100 g / 100 ml]
#[Example of a nutrient constituent: Carbohydrates]
def get_columns(text):
    """Extract the cleaned column headers from the header row's HTML.

    ``text`` is searched for ``<th scope="col" style="max-width:15rem">`` cells
    and for plain ``<th scope="col">`` cells. The result lists every match of
    the first kind, then every match of the second kind, each passed through
    ``Preprocessing``. The patterns do not span newlines, so ``text`` is
    expected on a single line.
    """
    header_openers = (
        r'th scope="col" style="max-width:15rem"',
        r'th scope="col"',
    )
    raw_cells = []
    for opener in header_openers:
        raw_cells += re.findall("<" + opener + ">(.*?)</th>", text)
    return [Preprocessing(cell) for cell in raw_cells]

In [5]:
#Function to get the nutrient constituents
#Here parameters are the nutrient constituents
#returns [Carbohydrates, Protein, Saturated Fat] if those three are the ones in the table. 
def get_parameters(text):
    """Extract the cleaned nutrient names from one table row's HTML.

    Names are read from ``<span style="padding-left: 0rem;">`` and
    ``<span style="padding-left: 1rem;">`` elements (presumably top-level and
    indented sub-entries -- confirm against the page markup). All 0rem matches
    come first in the result, followed by all 1rem matches, each cleaned with
    ``Preprocessing``.
    """
    span_openers = [
        r'span style="padding-left: 0rem;"',
        r'span style="padding-left: 1rem;"',
    ]
    labels = []
    for opener in span_openers:
        pattern = "<{}>(.*?)</span>".format(opener)
        labels.extend(map(Preprocessing, re.findall(pattern, text)))
    return labels

In [6]:
#Function to get the values of the table [the numeric value of each nutrient constituent]
#These are the actual values of each nutrient constituent for each column
def get_values(text):
    """Extract the cleaned cell values from one table row's HTML.

    Values are read from attribute-less ``<span>`` elements and from
    ``<span class="green">`` / ``<span class="red">`` elements. The result is
    grouped by span kind in that order (plain, green, red) rather than by
    position in the row; each value is cleaned with ``Preprocessing``.
    """
    span_openers = ('span', 'span class="green"', 'span class="red"')
    return [
        Preprocessing(inner)
        for opener in span_openers
        for inner in re.findall("<" + opener + ">(.*?)</span>", text)
    ]

In [1]:
#input -> location which is the location of the image
def Nutrition_web_scraper(location, timeout=10):
    """Scrape the Open Food Facts nutrition table for the product shown in an image.

    ``BAR(location)`` supplies the barcode, which is appended to the fixed
    Open Food Facts product URL. The product page is downloaded and the
    nutrition-facts table inside the ``health`` section is converted to a
    list of rows.

    Parameters
    ----------
    location
        Location of the barcode image; passed unchanged to ``BAR``.
    timeout
        Seconds to wait for the HTTP response before giving up (passed to
        ``requests.get``). Defaults to 10.

    Returns
    -------
    list of list of str
        The first entry holds the column headers (``get_columns``). Every
        following entry is one table row: the names from ``get_parameters``
        followed by the values from ``get_values``.

    Raises
    ------
    requests.RequestException
        If the request fails, times out, or returns an HTTP error status.
    ValueError
        If the page contains no nutrition-facts table rows.
    """
    # url is (fixed url) + (barcode) for each specific product
    url = "https://world.openfoodfacts.org/product/" + BAR(location)
    page = requests.get(url, timeout=timeout)
    # Fail here on 4xx/5xx instead of trying to parse an error page.
    page.raise_for_status()
    soup = BeautifulSoup(page.content, 'html.parser')

    nutrition_table_contents = None
    # If several sections match, the last one that contains a table wins.
    for section in soup.find_all('section', class_="row", id='health'):
        nutrition = section.find(
            'div',
            class_="content panel_content active expand-for-large",
            id="panel_nutrition_facts_table_content",
        )
        nutrition_table = nutrition.find('table') if nutrition is not None else None
        if nutrition_table is not None:
            nutrition_table_contents = nutrition_table.find_all('tr')

    if not nutrition_table_contents:
        raise ValueError("No nutrition facts table found at " + url)

    # Newlines are removed because the extraction regexes do not span lines.
    table = [get_columns(str(nutrition_table_contents[0]).replace("\n", ""))]
    for row in nutrition_table_contents[1:]:
        row_html = str(row).replace("\n", "")
        table.append(get_parameters(row_html) + get_values(row_html))
    return table

In [8]:
# Example run on a local barcode image.
# NOTE: this is an absolute, machine-specific Windows path; the cell only runs
# where that file exists.
# NOTE(review): the saved output below looks printed row by row, whereas the
# visible function returns a nested list -- it may come from an earlier
# version of the function; re-run on a fresh kernel to confirm.
Nutrition_web_scraper(r'C:\Users\arviv\Documents\Downloads\food-barcode-label-1000x1000.jpg')

['Nutrition facts', 'As sold for 100 g / 100 ml']
['Fat'] ['?']
['Saturated fat'] ['?']
['Carbohydrates'] ['?']
['Sugars'] ['?']
['Fiber'] ['?']
['Proteins'] ['?']
['Salt'] ['?']
