In [1]:
import pandas as pd
import numpy as np
import scipy as sp
#import folium
import re
%matplotlib inline

import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.preprocessing import OneHotEncoder
from pandas.plotting import scatter_matrix
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_val_score
import seaborn as sns
import time

#from google.cloud import translate
#import pycountry
#import emoji

#translate_client = translate.Client()

import sys # for printing process
import unidecode # for normalizing text
from pathlib import Path # check files


from py_translator import Translator
# Module-level translator instance built from py_translator's Translator.
# NOTE(review): not referenced in the cells shown here; presumably used by
# translation steps elsewhere in the notebook — TODO confirm.
translator = Translator()

# from googletrans import Translator

In [79]:
# Directory prefixes, relative to the notebook's working directory.
# data_folder is joined to file names by plain string concatenation,
# so its trailing slash is required.
data_folder = './data/'
# NOTE(review): maps_folder is not used in the cells shown here.
maps_folder = './maps/'

In [80]:
# CONSTANTS

# Placeholder values to use when a number / a string is unknown.
UNKNOWN_NR = '-1'
UNKNOWN_STR = 'unknown'

# Delay between translation requests.
TRANSLATION_DELAY = 0.3

# Global call counter that showProgress() increments on every call.
PROGRESS = 0


In [81]:
def normalizeString(string):
    """Reduce a raw text fragment to a lower-case, letters-and-spaces form.

    Characters are kept only if they are whitespace, or alphanumeric but not
    digits; everything else (punctuation, digits, symbols) is dropped. The
    result is lower-cased, stripped of leading/trailing whitespace and passed
    through ``unidecode.unidecode``.

    Parameters
    ----------
    string : str
        The text to normalise.

    Returns
    -------
    str
        The normalised text (may be empty).
    """
    # Logical operators instead of the bitwise `&`, `|`, `~` on bools:
    # `~bool` is deprecated since Python 3.12 and only worked here because
    # of integer arithmetic on True/False. The selection is unchanged.
    cleaned = ''.join(
        ch for ch in string
        if (ch.isalnum() and not ch.isdigit()) or ch.isspace()
    )
    return unidecode.unidecode(cleaned.lower().strip())
def showProgress(size):
    """Advance the global PROGRESS counter and print the percentage done.

    Each call counts as one processed item. The percentage is truncated to a
    whole number and written to stdout on a single, overwritten line.

    Parameters
    ----------
    size : int
        Total number of items expected. When it is zero or negative the
        counter is still advanced but nothing is printed.
    """
    global PROGRESS
    PROGRESS += 1
    # Nothing meaningful to report (and avoids a ZeroDivisionError).
    if size <= 0:
        return
    # Keep the percentage as an integer. The previous float round-trip
    # (int(...)/100 followed by *100 % 1 == 0) silently skipped values such
    # as 7% or 29%, because e.g. 0.07*100 == 7.000000000000001.
    percent = int(100 * PROGRESS / size)
    # float() preserves the original "Progress 7.0%" output format.
    sys.stdout.write('\r' + 'Progress {}%'.format(float(percent)))
    sys.stdout.flush()

In [3]:
# Names of the data files. `filename` is the product table that is loaded
# from data_folder with pd.read_csv (tab-separated).
# NOTE(review): the other three names are not used in the cells shown here;
# presumably they are read or written elsewhere — TODO confirm.
filename = 'en.openfoodfacts.org.products.csv'
countryfile = 'wikipedia-iso-country-codes.csv'
translationsfile = 'translations.csv'
foodfile = 'food.csv'

In [4]:
# Load the complete product table. The file is tab-separated despite its
# .csv extension; low_memory=False makes pandas read it in one pass.
food_df = pd.read_csv(
    data_folder + filename,
    sep='\t',
    header=0,
    quotechar='"',
    low_memory=False,
)

In [106]:
# Column(s) of interest, and the column that receives the cleaned result.
desired_columns = ['ingredients_text']
result_column = 'ingredients_text'

# Work on an independent copy of the first 100 rows only.
ingredients_df = food_df.head(100).copy()

In [121]:
# Show the ingredient text of every row that has one.
ingredients_df.loc[ingredients_df['ingredients_text'].notna(), 'ingredients_text']

10    antioxydant : érythorbate de sodium, colorant ...
15    Lait entier, sucre, amidon de maïs, cacao, Aga...
22    baguette Poite vin Pain baguette 50,6%: fqrine...
31    Paln suédois 42,6%: farine de BLÉ, eau, farine...
33    Taboulé 76,2%, légumes 12%, huile de colza, se...
34    Bananas, vegetable oil (coconut oil, corn oil ...
35    Peanuts, wheat flour, sugar, rice flour, tapio...
39    FROMAGE BLANC 7.896 MG 73,304, crème de marron...
44    pain aux 6 céréqles 61,7 % farine (farine de B...
46    Bâguette Bressan Pain baguette 46,2%,' fqrine ...
63    Organic hazelnuts, organic cashews, organic wa...
64                                      Organic polenta
65    Rolled oats, grape concentrate, expeller press...
66                        Organic long grain white rice
67    Org oats, org hemp granola (org oats, evaporat...
68    Organic chocolate liquor, organic raw cane sug...
69    Organic expeller pressed, refined high oleic s...
70                                 Organic adzuk

In [119]:
# Take the raw ingredient string at index label 10 and split it naively on
# commas to inspect the fragments the cleaner has to deal with.
has_ingredients = ingredients_df[result_column].notna()
test = ingredients_df.loc[has_ingredients, result_column].loc[10]
test_ = test.split(',')
test_

['antioxydant : érythorbate de sodium',
 ' colorant : caramel - origine UE)',
 ' tomate 33',
 '3%',
 ' MAYONNAISE 11',
 '1% (huile de colza 78',
 '9%',
 ' eau',
 " jaunes d'OEUF 6%",
 ' vinaigre',
 ' MOUTARDE [eau',
 ' graines de MOUTARDE',
 ' sel',
 ' vinaigre',
 ' curcuma]',
 ' sel',
 ' dextrose',
 ' stabilisateur : gomme de cellulose',
 ' conservateur : sorbate de potassium',
 ' colorant : ?-carotène',
 ' arôme)']

In [120]:
# NOTE: cleanIngredients is defined in a later cell of this notebook, so that
# cell has to be executed before this one.
# Reset the global counter first; otherwise the progress read-out continues
# from earlier calls and can exceed 100%.
PROGRESS = 0
cleanIngredients(test)

Progress 104.0%

['erythorbate de sodium',
 'caramel',
 'tomate',
 'mayonnaise',
 'eau',
 'jaunes doeuf',
 'vinaigre',
 'moutarde',
 'eau',
 'graines de moutarde',
 'sel',
 'vinaigre',
 'curcuma',
 'sel',
 'dextrose',
 'gomme de cellulose',
 'sorbate de potassium',
 'carotene',
 'arome']

In [116]:
def cleanIngredients(row):
    """Turn a raw, comma-separated ingredient string into a list of names.

    The string is split on commas and each fragment is reduced as follows,
    in this order:

    * ``key : value``          -> keep the value (only for exactly one colon)
    * ``value (info``          -> keep the text before the first ``(``
    * ``value [value``         -> keep both the text before ``[`` and after it
    * ``value - info``         -> keep the text before the first `` - ``

    Every kept piece goes through ``normalizeString``; pieces that end up
    empty are dropped. Each call also advances the progress read-out via
    ``showProgress`` using the row count of ``ingredients_df``.

    Parameters
    ----------
    row : str or any
        Raw ingredient text. Anything that is not a string (NaN, an already
        cleaned list, ...) is returned unchanged.

    Returns
    -------
    list of str, or the unchanged input when it is not a string.
    """
    showProgress(ingredients_df.shape[0])

    # Only raw text is parsed. NaN and lists pass through as before; other
    # non-string values no longer crash on .split().
    if not isinstance(row, str):
        return row

    ingredients = []
    for item in row.split(','):
        ingredient = item

        # format key:value
        # take only value
        if ':' in ingredient:
            info_ = ingredient.split(':')
            if len(info_) == 2:
                ingredient = info_[1]

        # format value (info)
        # take only values
        if '(' in ingredient:
            ingredient = ingredient.split('(')[0]

        # format value [value,...]
        # take all values
        if '[' in ingredient:
            info_ = ingredient.split('[')
            # The text before the bracket may be empty (e.g. "[eau"), so it
            # gets the same empty-string filter as every other piece.
            prefix = normalizeString(info_[0])
            if prefix:
                ingredients.append(prefix)
            ingredient = info_[1]

        # format value - info
        # take only values
        if ' - ' in ingredient:
            # Split on the spaced separator itself, not on every hyphen, so
            # hyphenated names ("beta-carotene - E160") are not truncated.
            ingredient = ingredient.split(' - ')[0]

        # avoid empty strings
        ingredient = normalizeString(ingredient)
        if ingredient:
            ingredients.append(ingredient)

    return ingredients

In [123]:
# Restart the progress counter, then replace every raw ingredient string in
# the result column with its cleaned list of ingredient names.
PROGRESS = 0
ingredients_df[result_column] = ingredients_df[result_column].apply(cleanIngredients)

Progress 100.0%

In [128]:
# Show the cleaned ingredient lists for the rows that have one.
ingredients_df.loc[ingredients_df[result_column].notna(), result_column]

10    [erythorbate de sodium, caramel, tomate, mayon...
15    [lait entier, sucre, amidon de mais, cacao, ag...
22    [baguette poite vin pain baguette, fqrine de b...
31    [paln suedois, farine de ble, eau, farine de s...
33    [taboule, legumes, huile de colza, sel, menthe...
34    [bananas, vegetable oil, corn oil andor palm o...
35    [peanuts, wheat flour, sugar, rice flour, tapi...
39    [fromage blanc  mg, creme de marron vanillee, ...
44    [pain aux  cereqles, farine, gluten de ble, eu...
46    [baguette bressan pain baguette, fqrine de ble...
63    [organic hazelnuts, organic cashews, organic w...
64                                    [organic polenta]
65    [rolled oats, grape concentrate, expeller pres...
66                      [organic long grain white rice]
67    [org oats, org hemp granola, evaporated cane j...
68    [organic chocolate liquor, organic raw cane su...
69    [organic expeller pressed, refined high oleic ...
70                               [organic adzuki