# Preparar entorno de trabajo

In [None]:
################################################################################
# Preparar entorno de trabajo
################################################################################
!pip install pyspark
!pip install scikit-learn
!pip install tensorflow
!pip install torch
!pip install mtranslate
!pip install findspark
!pip install unidecode
!pip install deep_translator
!pip install langdetect
!pip install pycountry
!pip install pycountry_convert

Collecting pyspark
  Downloading pyspark-3.4.1.tar.gz (310.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.8/310.8 MB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.4.1-py2.py3-none-any.whl size=311285388 sha256=aaed8f67839927ce160291cd37258c8410662072dd5dcf1981d05f1ea89feba4
  Stored in directory: /root/.cache/pip/wheels/0d/77/a3/ff2f74cc9ab41f8f594dabf0579c2a7c6de920d584206e0834
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.4.1
Collecting mtranslate
  Downloading mtranslate-1.8.tar.gz (2.4 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: mtranslate
  Building wheel for mtranslate (setup.py) ... [?25l[?25hdone
  Created wheel for mtranslate: filename=

## Librerías

In [None]:
################################################################################
# Preparar entorno de trabajo
################################################################################
# IGNORAR WARNINGS
import warnings
warnings.filterwarnings('ignore')

# Montar Google Drive para acceder a su contenido
from google.colab import drive
drive.mount('/content/drive')

# Datos
import pandas as pd
import numpy as np
import json
import requests
from datetime import datetime
from unidecode import unidecode
import time
from unidecode import unidecode

# Pyspark
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.window import Window
import pyspark.sql.functions as F
from pyspark.sql.functions import udf, col, lower, regexp_replace
from pyspark.sql.types import *
import findspark
findspark.init()

# sklearn Modelos
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor, plot_tree, export_graphviz
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn import svm

# TensorFlow
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.datasets import cifar10
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Flatten, Dense, Activation
from tensorflow.keras.optimizers import SGD

# Torch
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms

# Preparar datos
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import mean_squared_error, accuracy_score, roc_curve, confusion_matrix, r2_score
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler
from scipy.stats import reciprocal, uniform

# Gráficos
import matplotlib.pyplot as plt
%matplotlib inline
import graphviz
import seaborn as sns

# Traductor
from mtranslate import translate
from deep_translator import GoogleTranslator
from langdetect import detect

# Paises
import pycountry
import pycountry_convert as pc

Mounted at /content/drive


### Importar datos

In [None]:
################################################################################
# Create the Spark session
################################################################################

spark = SparkSession.builder.appName("TFM").getOrCreate()

################################################################################
# Open Food Facts data
# URL = https://static.openfoodfacts.org/data/en.openfoodfacts.org.products.csv
################################################################################

# Tab-separated export with a header row; Spark infers the column types.
# NOTE(review): with schema inference the barcode column `code` shows up as
# float64 in the pandas dtypes printed further down, so leading zeros are lost.
# Consider an explicit schema that reads it as a string.
df_datos = (
    spark.read
    .option("header", "True")
    .option("inferSchema", 'True')
    .option("delimiter", '\t')
    .csv("/content/drive/MyDrive/Datos TFM/en.openfoodfacts.org.products.csv")
)

### Funciones

In [None]:
################################################################################
# Función para realizar la traducción
################################################################################
def translate_text(text, target_language):
  """Translate ``text`` into ``target_language`` after a one-second pause.

  The pause spaces out consecutive requests made through ``translate``.

  Args:
    text: Text to translate.
    target_language: Target language, passed unchanged to ``translate``.

  Returns:
    Whatever ``translate`` returns for the given arguments.
  """
  time.sleep(1)  # throttle: wait 1 second before every request
  return translate(text, target_language)

################################################################################
# Funciones de limpieza de datos (DataFrame Pandas)
################################################################################
def datos_a_mayusculas(df):
  """Upper-case the text values of a pandas DataFrame.

  Only ``str`` values are changed. Non-string values stored in object columns
  (numbers, None, NaN, ...) are kept as they are instead of being turned into
  NaN, and object columns that contain no strings at all no longer raise.
  Columns with any other dtype are returned unchanged.

  Args:
    df: pandas DataFrame to process.

  Returns:
    A new DataFrame with the same columns and upper-cased text values.
  """
  def _mayusculas(columna):
    # Dedicated string dtype: the .str accessor is always safe here.
    if isinstance(columna.dtype, pd.StringDtype):
      return columna.str.upper()
    if columna.dtype != "object":
      return columna
    # Object columns may mix strings with other values; touch only the strings.
    return columna.map(lambda valor: valor.upper() if isinstance(valor, str) else valor)

  return df.apply(_mayusculas)
################################################################################
################################################################################
################################################################################
def eliminar_nulos(df):
  """Return ``df`` without the rows in which every value is missing.

  Rows that hold at least one non-null value are kept; the input is not
  modified.
  """
  sin_filas_vacias = df.dropna(how='all', axis=0)
  return sin_filas_vacias
################################################################################
################################################################################
################################################################################
def eliminar_duplicados(df):
  """Return ``df`` without fully duplicated rows.

  All columns are compared, the first occurrence of each row is kept and the
  original index labels are preserved. The input is not modified.
  """
  # The defaults of drop_duplicates already mean: all columns, keep the first
  # occurrence, return a new frame, keep the index.
  return df.drop_duplicates()
################################################################################
################################################################################
################################################################################
def reemplazar_caracteres(df):
  """Normalise the text of a string Series into upper-case identifiers.

  Despite the parameter name, the ``.str`` accessor is used directly on
  ``df``, so it is expected to be a pandas Series of strings.

  The text is upper-cased; '/', ',' and '-' become spaces; every space becomes
  '_'; and the accented capitals Á É Í Ó Ú Ü Ñ are replaced by their plain
  letters.

  Returns:
    The transformed Series.
  """
  # Order matters: separators are turned into spaces before spaces become '_'.
  sustituciones = (
      ('/', ' '),
      (',', ' '),
      ('-', ' '),
      (' ', '_'),
      ('Á', 'A'),
      ('É', 'E'),
      ('Í', 'I'),
      ('Ó', 'O'),
      ('Ú', 'U'),
      ('Ü', 'U'),
      ('Ñ', 'N'),
  )

  resultado = df.str.upper()
  for buscado, reemplazo in sustituciones:
    resultado = resultado.str.replace(buscado, reemplazo)

  return resultado
################################################################################
# Función detección idioma
################################################################################
def detect_language(s):
    """Detect the language of ``s`` using ``detect``.

    Detection is best-effort: any ordinary exception raised by ``detect``
    yields ``None``.

    Args:
        s: Text to analyse.

    Returns:
        The value returned by ``detect``, or ``None`` when detection fails.
    """
    try:
        return detect(s)
    except Exception:
        # Not a bare `except:`: KeyboardInterrupt and SystemExit must still
        # propagate so a long-running apply over many rows can be interrupted.
        return None
################################################################################
################################################################################
################################################################################
def get_iso3(country_name):
    """Return the ``alpha_3`` code of the best fuzzy match for ``country_name``.

    Args:
        country_name: Name passed to ``pycountry.countries.search_fuzzy``.

    Returns:
        The ``alpha_3`` attribute of the first match, or ``None`` when the
        lookup raises ``LookupError`` (this also covers an empty result list,
        since ``IndexError`` is a ``LookupError``).
    """
    try:
        mejor_coincidencia = pycountry.countries.search_fuzzy(country_name)[0]
    except LookupError:
        return None
    return mejor_coincidencia.alpha_3
################################################################################
################################################################################
################################################################################
def get_continent(country_name):
    """Return the continent name of the best fuzzy match for ``country_name``.

    pycountry country records carry no ``continent`` attribute, so the
    continent is resolved from the match's ``alpha_2`` code through
    ``pycountry_convert`` (imported as ``pc``).

    Args:
        country_name: Name passed to ``pycountry.countries.search_fuzzy``.

    Returns:
        The continent name (for example ``"Europe"``), or ``None`` when the
        country cannot be found or has no continent mapping.
    """
    try:
        country = pycountry.countries.search_fuzzy(country_name)[0]
        # An unmapped alpha-2 code raises KeyError, which is a LookupError and
        # is therefore handled by the same clause as a failed search.
        continent_code = pc.country_alpha2_to_continent_code(country.alpha_2)
        return pc.convert_continent_code_to_continent_name(continent_code)
    except LookupError:
        return None
################################################################################
################################################################################
################################################################################
def standardize_country_name(country_name):
    """Return the ``name`` of the best fuzzy match for ``country_name``.

    Args:
        country_name: Name passed to ``pycountry.countries.search_fuzzy``.

    Returns:
        The ``name`` attribute of the first match. When the lookup raises
        ``LookupError`` the input is returned unchanged.
    """
    try:
        mejor_coincidencia = pycountry.countries.search_fuzzy(country_name)[0]
    except LookupError:
        # No match: keep the caller's original spelling.
        return country_name
    return mejor_coincidencia.name

# Procesado de datos

In [None]:
################################################################################
# Normalise the product name
################################################################################

# Lower-case 'product_name' first, so the patterns below only need to list the
# lower-case accented letters.
df_datos = df_datos.withColumn("product_name", lower(col("product_name")))

# Strip diacritics: each character class is replaced by its plain letter, and
# the result is lower-cased again after every replacement.
for patron_acentos, letra_base in (
    ("[áäàâãå]", "a"),
    ("[éëèê]", "e"),
    ("[íïìî]", "i"),
    ("[óöòôõ]", "o"),
    ("[úüùû]", "u"),
    ("[ñ]", "n"),
):
    df_datos = df_datos.withColumn(
        "product_name",
        lower(regexp_replace(col("product_name"), patron_acentos, letra_base)),
    )

# Make sure 'product_name' ends up as a string column.
df_datos = df_datos.withColumn("product_name", col("product_name").cast("string"))

In [None]:
################################################################################
# Build the list of search terms in several languages:
# - Seitan
# - Soy
# - Tofu
################################################################################

lista_include = ['seitan', 'tofu', 'soja', 'carne vegetal']
idiomas = ['en', 'pt','de','ca','vi','tl','tr','id','so','sw','hr','hu','cy','no','fr','es','sv','pl','ro','lt','fa']

# Every Spanish term is kept as written and followed by its translation into
# each language code of `idiomas` (one GoogleTranslator request per pair).
full_include = []
for frase in lista_include:
  full_include.append(frase)
  full_include.extend(
      GoogleTranslator(source='spanish', target=str(idioma)).translate(str(frase))
      for idioma in idiomas
  )

# Lower-case every collected term
full_include = [elemento.lower() for elemento in full_include]

# Keep one copy of each distinct term
full_include_distinct = list(set(full_include))

print(full_include_distinct)

['tahu', 'tofu', 'soja', 'kedelai', 'carne vegetal', 'daržovių mėsa', 'sebze eti', 'carn vegetal', 'szeitán', 'sojos', 'viande végétale', 'thịt rau', 'karne ng gulay', 'cig llysiau', 'گوشت سبزیجات', 'soy', 'növényi hús', 'carne de legume', 'seiten', 'سویا', 'gemüsefleisch', 'seitanas', 'soya peyniri', 'hilibka khudradda', 'đậu hũ', 'سیتان', 'tokwa', 'toyo', 'vegetable meat', 'nyama ya mboga', 'vegetabiliskt kött', 'soya', 'thịt trắng', 'soi', 'biljno meso', 'توفو', 'soia', 'szója', 'daging sayur', 'mięso warzywne', 'seitan', 'đậu nành', 'vegetabilsk kjøtt']


In [None]:
################################################################################
# Select the products
################################################################################

# Keep the rows whose 'product_name' matches any of the search terms; the terms
# are joined with '|' into a single regular expression.
df_products_distinct = df_datos.where(
    col("product_name").rlike("|".join(full_include_distinct))
)

df_products_distinct.show(n=5, truncate=False)

+------------+---------------------------------------------------------------------------------------------------------+--------------------+----------+-------------------+---------------+----------------------+----------------+------------------------------------+------------------------+-------------------------------------+--------+--------------------+-----------------------+-----------------+--------------+--------------------+--------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------+-------+------------+----------+--------------------+-------------------------+--------

In [None]:
################################################################################
# Convert the PySpark DataFrame into a pandas DataFrame and let pandas infer
# better dtypes for the object columns
################################################################################
pandas_df = df_products_distinct.toPandas().infer_objects()

################################################################################
# Print the shape and the dtype of every column
################################################################################
print(pandas_df.shape)
print(pandas_df.dtypes)

(14747, 201)
code                         float64
url                           object
creator                       object
created_t                      int32
created_datetime      datetime64[ns]
                           ...      
choline_100g                 float64
phylloquinone_100g           float64
beta-glucan_100g             float64
inositol_100g                float64
carnitine_100g               float64
Length: 201, dtype: object


In [None]:
################################################################################
# Build the exclusion list
################################################################################
# The triple-quoted string below is disabled code: it is a bare string
# expression, so nothing inside it is executed. It holds the hand-written
# phrases of `lista_exclude` and the GoogleTranslator loop over `idiomas` that
# ends by printing `full_exclude_distinct`.
# NOTE(review): presumably this is how the hard-coded `full_exclude_distinct`
# list that follows was produced - confirm before regenerating it.
'''
lista_exclude = ["salsa de soja","sauce soja","sauce soja","bebida de soja","yaourt soja","yogurt de soja","galletas","chocolat","spaghetti","arroz y soja","leche","mousse","milk","aceite","dessert",
                 "pan soja","boisson soja","glace","sauce de soja","sauce","salsa","bibeda de soja","lait soja","vivesoy soja","lait de soja","yaourt","postre","muesli","yogur","bebida",
                 "margarina","vinaigrette", "salsa de soja","sauce soja", "sauce soja","bebida de soja", "yaourt soja","yogurt de soja","chocolat","arroz y soja","leche","mousse","milk",
                 "aceite","dessert","pan soja","boisson soja","glace","ml","sauce de soja","sauce","salsa","bibeda de soja","lait soja","vivesoy soja","lait de soja","yaourt","postre","muesli",
                 "yogur","bebida","margarina","vinaigrette","drink","sauce","batido","lait","mayonnaise","cafe","vanille","nata","yogurt","lactovisoy","alimento de soya","alimento liquido de soya",
                 "bevanda","sweet soy","vanille","vanilla","unsweetened","sweetened","strawberry","protein isolate","soya calcium","blueberry","beverage","barista","soya a tartiner","jugo","protein powder",
                 "protein isolate","cacao","calcium","calcio","nutri soja","soia bianco cremoso","lecitina","lecithine","latte","haricots","germe","gelato","gateau","fromage frais","infusion","soybean paste",
                 "mixbeans","miso soup","soybeans","pate de soja","plain soya","petit’soif","soybeans","framboise","mango","douceur","sojabohnenkeimlinge","keimlinge","mangue","peche","manzana","vichyssoise",
                 "chocoavena","avena"]
idiomas = ['en', 'pt','de','ca','vi','tl','tr','id','so','sw','hr','hu','cy','no','fr','es','sv','pl','ro','lt','fa']
full_exclude = []

for frase in lista_exclude:
  full_exclude.append(frase)
  for idioma in idiomas:
    translated = GoogleTranslator(source='spanish', target=str(idioma)).translate(str(frase))
    full_exclude.append(translated)


# Convertir todos los elementos de la lista a minúsculas
full_exclude = [elemento.lower() for elemento in full_exclude]

# Obtener los elementos distintos de la lista
full_exclude_distinct = list(set(full_exclude))

print(full_exclude_distinct)'''
full_exclude_distinct = ['cocoa', 'bakterie', 'margaryna', 'makanan kedelai cair', 'diod', 'feijão', 'vanília', 'comida líquida de soja', 'llus', 'lody', 'kalsiyum', 'şekerli', 'obuolys',
'pasta sojowa', 'sbageti', 'spagečiai', 'smoothie', 'salsa', 'aliment liquid de soja', 'pagkain ng toyo', 'pâte de soja', 'sojabönpasta', 'pasta de soia', 'amestec de fasole',
'çikolatalı yulaf ezmesi', 'mansanas', 'شیرین نشده', 'fűzfa', 'putėsiai', 'nutri soja', 'mjölk', 'kruh od soje', 'miso suppe', 'nutri soya', 'był kremowo biały', 'sojos maistas',
'iogurte', 'izole protein', 'protein pulver', 'sojayoghurt', 'cacao', 'tsokolate oatmeal', 'tauchen', 'soabohnenkeimlinge', 'đậu tương', 'sauce soja', 'soya fasulyesi',
 'calsiwm soi', 'kecap', 'soy food', 'soyabønner', 'fromage frais', 'waxay ahayd cadaan labeen leh', 'hidangan penutup', 'kopi', 'slatka sam', 'blåbär', 'çilek', 'kuman',
 'مخلوط دانه ها', 'isolate protein', 'zalążek', 'söğüt', 'soia vivesoy', 'yaourt soja', 'stroberi', 'blawd ceirch', 'kafeega', 'دسر', 'ماست سویا', 'leche', 'sữa chua', 'saule de soja',
 'myrtilles', 'framboise', 'gerti', 'mieszanka fasoli', 'boisson soja', 'portão', 'himbeere', 'walang tamis', 'yaban mersini', 'gėrimas', 'jogurtas', 'busa', 'jordgubbe',
 'batut', 'biji cokelat', 'riža i soja', 'nutri szója', 'soya yoğurdu', 'mango', 'soi i tartiner', 'sos sojowy', 'xocolata', 'pão de soja', 'niesłodzone', 'vanilj', 'infusão',
 'haricot', 'biscoitos', 'cô lập protein', 'maraq miso', 'ویشیسواز', 'سویا', 'bột đậu nành', 'adoçado', 'jalaato', 'شکلات', 'borovnice', 'món tráng miệng', 'frambuaz', 'roti kedelai',
 'cálcio', 'fframws', 'riz et soja', 'aveia', 'soia yaourt', 'krem', 'sojos pupelės', 'kava', 'lecitin', 'kalcium', 'vivesoy szója', 'người đánh cá', 'minyak', 'soy cad',
 'unsweetened', 'ovaz', "petit'soif", 'deser', 'ungesüßt', 'soya tupu', 'içmek', 'kedelai', 'kedelai bosson', 'jordbær', 'soja à tartiner', 'sudd', 'vanilija', 'lecithin',
 'blaubeeren', 'jis buvo kreminis baltas', 'cà phê', 'vinegretă', 'không đường', 'wyizolować białko', 'olej', 'margarină', 'csokoládé', 'jogurt sojowy', 'kalzium', 'minum',
 'folyékony szójaétel', 'calcio', 'espaguete', 'نوشیدنی', 'era branco cremoso', 'ciasto', 'ujj', 'diod soi', 'proteína isolada', 'لسیتین', 'doce eu sou', 'dharbaaxo', 'cawl miso',
 'frambois', 'sauce', 'llawes', 'chokoleti', 'ulje', 'szója kenyér', 'kalsiamu', 'öl', 'blueberries', 'آب میوه', 'soyabohnenkeimlinge', 'sojamat', 'vichysoise', 'vinaigretas',
 'infusjon', 'soi nutri', 'peći', 'olhar', 'civada', 'متصدی بار', 'arròs i soja', 'embe', 'taip', 'era alb crem', 'kaltsyum', 'yogur', 'desertas', 'izoliuoti baltymą', 'schokolade',
 'molho de soja', 'sữa đậu nành', 'suco', 'eiscreme', 'trinken', 'soi plaen', 'piće od soje', 'شیب', 'من شیرینم', 'melys ydw i', 'maionese', 'barmen', 'frischkäse', 'soja a tartiner',
 'szója ital', 'gröt', 'peçe', 'napiwek', 'neindulcit', 'tay áo', 'lecithine', 'küçük soif', 'دروازه', 'mầm', 'drink', 'sojasauce', 'lait de soia', 'llet', 'blandbønner',
 'soy yogurt', 'framug', 'buskudka', 'kremsi beyazdı', 'gesüßt', 'ang sweet ko', 'nesaldus', 'piće', 'napój', 'vichyssoise', 'barista', 'jiifay', 'flytende soyamat',
 'sojabohnenpaste', 'غذای سویا', 'cacau', 'şekersiz', 'سوپ میسو', 'یاورت', 'سس سویا', 'lango', 'روغن', 'soybean lait', 'vinägrett', 'rizs és szója', 'beras dan kedelai', 'susu', 'ärmel',
 'izolirati protein', 'sojagetränk', 'ryžiai ir soja', 'misusoppa', 'majonez', 'rukav', 'pwdin', 'kareem', 'spaghete', 'soya sosu', 'orez si soia', 'iogwrt', 'boisson-soja',
 'tambi', 'alimente lichide din soia', 'panghimagas', 'بید سویا', 'sötad', 'gạo và đậu nành', 'sojos jogurtas', 'afine', 'glass', 'soja yaourt', 'soya ya soya', 'infusion', 'chovya',
 'barmann', 'jeermiska', 'boorash', 'kem', 'sojina vrba', 'søtet', 'kalsium', 'łoj sojowy', 'alimento liquido de soya', 'pate de soja', 'موس', 'sos de soia',
 'toyo hanggang tartiner', 'hufen ia', 'sojaböna lait', 'soya a tartiner', 'óleo', 'laktovisoy', 'kunywa', 'توت فرنگی', 'پروتئین را جدا کنید', 'bột đạm', 'اسموتی', 'نوشیدنی سویا',
 'torte', 'bariis iyo soy', 'infüzyon', 'soya til tartiner', 'petit’soif', 'apfel', 'inuming toyo', 'słodki jestem', 'نان سویا', 'cuka', 'mâncare din soia',
 'soy calcium', 'mlijeko', 'sojų pupelių laik', 'soia la tartinere', 'napój sojowy', 'selje', 'mikrop', 'manga', 'leit soja', 'vivesoy sojos', 'băutură de soia',
 'šokolado', 'hab', 'karışık fasulye', 'majoneza', 'rice and soy', 'fraise', 'ca cao', 'uống', 'schokoladen-haferflocken', 'żywność sojowa', 'pâine de soia', 'dầu',
 'yogurt kedelai', 'soia simplă', 'yaourt sojaböna', 'nước ép', 'mayones', 'canxi', 'mergulhar', 'mangó', 'sweet sóc', 'soya', 'soja ao tartiner', 'calcium soy', 'dopp', 'băutură',
 'vanilla', 'liễu đậu nành', 'sausainiai', 'soyayoghurt', 'sjokolade', 'iogurte de soja', 'کرم رنگ', 'bila sukari', 'yaourt soya', 'choklad', 'کیملینگه', 'vartai', 'hufen', 'وینگرت',
 'informasjonskapsler', 'سویا به تارتینر', 'supa miso', 'mayonnaise', 'chocolat', 'havermut', 'soya söğüdü', 'apel', 'soy sauce', 'arroz e soja', 'yaourt soybean', 'pieno', 'müsli',
 'krema', 'proteinski prah', 'reis a soi', 'špageti', 'glasiert', 'zanurzać', 'vanilė', 'poda ya protini', 'sojų gluosniai', 'szójafűz', 'söt jag är', 'köpük', 'yulaf ezmesi',
 'sladoled', 'jutalom', 'izolați proteine', 'kapı', 'musli', 'bánh mì đậu nành', 'zabpehely', 'ledai', 'briwsion', 'pasta od soje', 'safsaf soy', 'calciu de soia', 'kedelai yaourt',
 'quả táo', 'galetes', 'leite de soja', 'gruau', 'rękaw', 'soja calcium', 'chleb sojowy', 'weka soya', 'maraqa soy', 'yatırmak', 'truskawka', 'nata', 'gelato', 'gateau', 'بلوبری', 'milk',
 'lapte', 'sinh tố', 'salze soja', 'desert', 'rooti soy ah', 'سویا vivesoy', 'szója kalcium', 'mì ống ý', 'yg tak diberi gula', 'roedd yn wyn hufennog', 'laktowizoj', 'ice cream', 'cudud',
 'lécithine', 'bartender', 'cây liễu', 'sojajoghurt', 'vivesoy soya', 'cabbid', 'sup miso', 'koktajl', 'glace', 'berbaring', 'faleebo', 'haferflocken', 'pot de soja', 'ihiwalay ang protina',
 'tekuća hrana od soje', 'kurabiye', 'سویا بویسون', 'chocolate', 'kalcio', 'pirinç ve soya', 'soya kalsiyum', 'getränk', 'soja till tartiner', 'jugo', 'gác cổng', 'zaslađen', 'mleko', 'juisi',
 'miso sopas', 'بلغور جو دوسر شکلاتی', 'saws soî', 'soya ekmeği', 'ffa cymysgedd', 'choklad havregryn', 'pinatamis', 'sojos gėrimas', 'olja', 'isawsaw', 'zwykła soja', 'cookies', 'mielas as',
 'arroz y soja', 'chocolate oatmeal', 'بلغور جو دوسر', 'quả dâu', 'dondurma', 'ml', 'gatas', 'wierzba', 'yaourt au soja', 'giống đậu tương', 'mayonesa', 'pa soja', 'manche', 'sauce de soja',
 'kolačići', 'deine sojabohne', 'dyppe', 'bohnen', 'barman', 'soyamat', 'موسلی', 'keimlinge', 'doux je suis', 'coffi', 'mélanger les haricots', 'margarin', 'umočiti', 'cremă', 'proteína em pó',
 'torta', 'soyapil', 'sojabönor', 'kedelai biasa', 'soy bread', 'coco', 'eple', 'zobena kaša', 'nutri soia', 'mefus', 'afal', 'quả xoài', 'beguda', 'katas', 'bánh xốp', 'calci', 'einfaches soja',
 'maioneză', 'oatmeal cokelat', 'sütiket', 'pan soja', 'margarine', 'flakonik na sole trzeźwiące', 'aveia com chocolate', 'latte', 'krémfehér volt', 'taze peynir', 'sojapil', 'canxi đậu nành',
 'pagbubuhos', 'cookie-uri', 'mencelupkan', 'sweet i am', 'vinaigrette', 'it was creamy white', 'zrna soje', 'mổ xẻ', 'soyadrikk', 'braškių', 'tufaha', 'kremas', 'doçura', 'huile', 'sucré',
 'kinywaji cha soya', 'tamu', 'suapan', 'maharagwe mchanganyiko', 'mgando', 'safsaf', 'osłodzony', 'peche', 'bột yến mạch sô cô la', 'برنج و سویا', 'søt jeg er', 'soya yemeği', 'boisson au soja',
 'fehérjét izolálni', 'پودر پروتئین', 'coffee', 'bluberi', 'infusi', 'bebida de soja', 'elbise kolu', 'yaourt szójabab', 'ffa soia', 'sojanahrung', 'lecitinas', 'iskrem', 'szója a tartinerhez',
 'soja vivesoy', 'pain de soja', 'šokoladiniai avižiniai dribsniai', 'nährstoff-soja', 'czekoladowe płatki owsiane', 'soya calcium', 'tartiner için soya', 'maziwa', 'sojabrot', 'sojino mlijeko',
 'kakaw', 'انبه', 'helyg soi', 'sojabohnenkeimlinge', 'lukrowany', 'bibeda de soja', 'beverage', 'lecytyna', 'sojin kalcij', 'salsa de soja', 'postre', 'plain soy', 'sô cô la', 'juice', 'blåbær',
 'willow ya soya', 'boisson sojos', 'cukrozatlan', 'daldırma', 'soja simples', 'kaffe', 'jūsų sojos pupelės', 'qasacadaysan', 'isoler la protéine', 'inumin', 'vrba', 'soybeans', 'sojabohnen',
 'apple', 'desszert', 'aan la macaanayn', 'melys', 'xì dầu', 'fehérje por', 'quusin', 'morango', 'sorvete', 'café com leite', 'iogurt de soja', 'ciasteczka', 'sem açúcar', 'tremper', 'oil',
 'boire', 'vanilie', 'cabitaan soy ah', 'lait soia', 'nhân viên pha chế', 'soya ya lishe', 'vanila', 'salcie de soia', 'soybean paste', 'glacé', 'thức ăn từ đậu nành', 'خمیر سویا', 'miso leves',
 'lengan baju', 'olew', 'sima szója', 'shukulaatada', 'umak od soje', 'boisson szója', 'iogurt', 'ärm', 'calcium', 'sobremesa', 'shukulaatada oatmeal', 'soia boisson', 'periuţă', 'căpșună',
 'spagetti', 'barafu', 'macmacaan', 'lait soja', 'kakaó', 'yourt', 'soya fasulyesi ezmesi', 'yfed', 'kakao', 'beguda de soja', 'soyakalsium', 'rankovė', 'avena', 'soya ya vivesoy', 'germen',
 'saliid', 'kekse', 'koollada soybeanka', 'majones', 'ماست', 'sulčių', 'es war cremeweiß', 'wanilia', 'mousse', 'helyg', 'calsiwm', 'سویای ساده', 'kahve', 'soybean pasta', 'toyo', 'kavu s mlijekom',
 'لونی ها', 'سس مایونز', 'supu ya miso', 'tatlı', 'jabuka', 'flytande sojamat', 'infuzija', 'đậu trộn', 'manggas', 'boisson đậu nành', 'kue', 'meyve suyu', 'manzana', 'germ', 'măr', 'olje',
 'yumurtlanmış soya', 'yoğurt', 'protein tozu', 'llefrith', 'mafuta', 'mėlynės', 'lait ffa soia', 'وانیل', 'baunilha', 'mixbeans', 'makanan kedelai', 'nourriture de soja', 'fanila',
 'mayonez', 'batido', 'smwddi', 'comida de soja', 'vanlig soja', 'bevanda', "flocons d'avoine au chocolat", 'salgueiro de soja', 'unga wa soya', 'ito ay creamy white',
 'yaourt soya fasulyesi', 'infuzie', 'lesitin', 'شیر', 'caano fadhi', 'eper', 'sok', 'ynysu protein', 'îndulcit', 'salgueiro', 'muesli', 'småkakor', 'protein isolieren',
 'لاته', 'soja boisson', 'chakula cha soya', 'oatmeal', 'zupa miso', 'sjokolade havregryn', 'minuman', 'mânecă', 'پچه', 'suc', 'đậu nành vivesoy', 'tufaax', 'bebida',
 'đậu nành nguyên chất', 'sade soya', 'giấm', 'es krim', 'amoras', 'pomme', 'sojakalcium', 'plain soya', 'szója szósz', 'havregryn', 'proteinpulver', 'macaantay',
 'yaourt soyabønner', 'đậu nành dinh dưỡng', 'cambe', 'kinywaji', 'soy la galiyay', 'میلی لیتر', 'milch', 'salze', 'lait', 'maharagwe ya soya', 'lleyg', 'digirta', 'lactofisoi',
 'làm ngọt', 'macaan waxaan ahay', 'fulgi de ovaz de ciocolata', 'kanin at toyo', 'mikrobyo', 'truyền dịch', 'mchuzi wa soya', "go'doomin borotiinka", 'saule', 'scufundare', 'mangga',
 'dip', 'tej', 'کلسیم سویا', 'soja zu tartiner', 'protein powder', 'kawa', 'kavos', 'lait de soja', 'đẻ', 'majarini', 'vidakuzi', 'infúzió', 'aceite', 'pasta de soja', 'poarta', 'kalsium kedelai',
 'pit', 'dulce sunt', 'سفید مایل به کرم بود', 'cabitaanka', 'rượu dâu cất', 'بستنی', 'lait szója', 'budada borotiinka', 'obična soja', 'blawd ceirch siocled', 'تزریق', 'tenga protini', 'miso juha', 'jus',
 'بید', 'csokis zabpehely', 'műzli', 'sojos kalcio', 'soja sås', 'sojabröd', 'laini', 'tuổi già', 'đậu nành để khai vị', 'osötad', 'kookaha', 'szójajoghurt', 'sữa', 'majonezo', 'itu putih krem',
 'oli', 'گلس', 'miso çorbası', 'muslis', 'lecithini', 'içecek', 'płynna żywność sojowa', 'csíra', 'kedelai vivesoy', 'بیسکویت ها', 'miso soup', 'boabe de soia', 'poma', 'tsokolate', 'sojų duona',
 'soybean laat', 'spaghetti', 'alyva', 'mkate wa soya', 'banilya', 'yogurt soy', 'isolera protein', 'ryż i soja', 'sojadryck', 'salze de soja', 'vide', 'vinagrete', 'nó có màu trắng kem',
 'yoourt zrna soje', 'creme', 'vanlig soya', 'strawberry', 'sojos iki tartinerio', 'غذای مایع سویا', 'soupe miso', 'süß, ich bin', 'białko w proszku', 'thực phẩm đậu nành lỏng', 'reis und soja',
 'płatki musli', 'gelat', 'tatlıyım ben', 'margarino', 'vivesoy soy', 'ris og soya', 'soy willow', 'nutri soy', 'panirimas', 'دوسر', 'kijidudu', 'minuman kedelai', 'دراز کشیدن', 'boisson soya', 'klica',
 'yağ', 'çikolata', 'ris och soja', 'csapos', 'paprastos sojos', 'đồ uống', 'kape', 'sıvı soya gıda', 'gluosnis', 'yoghurt', 'crème', 'quả việt quất', 'casiir', 'mysli', 'کلسیم', 'plain soja',
 'poudre de protéine', 'den var gräddvit', 'jégkrém', 'čokoladna zobena kaša', 'yogurt', 'sojamilch', 'mengisolasi protein', 'lait soya', 'jogurt', 'kacang kedelai', 'kedelai untuk tartiner',
 'flüssiges sojafutter', 'bemárt', 'wapń sojowy', 'ffa soia yaourt', 'kakavos', 'kalcij', 'manis aku', 'pecze', 'cream', 'سیب', 'kutoka frais', 'ilikuwa nyeupe krimu', 'bara soi', 'postres',
 'mtindi', 'buzlu', 'soyabønner lait', 'soi vivesoy', 'owsianka', 'sojų padažas', 'willow', 'cafe', 'soyabønnepasta', 'skystas sojos maistas', 'jabłko', 'sojin jogurt', 'süt', 'wapń', 'saft',
 'yogurt de soja', 'joghurt', 'alma', 'sojų pasta', 'سویا لایت', 'soia bianco cremoso', 'soya içeceği', 'mchele na soya', 'szójabab', 'calcium de soja', 'cháo bột yến mạch', 'keimling', 'pupelių mišinio',
 'avižiniai dribsniai', 'krim', 'szarkalábak', 'mtindi wa soya', 'wilow', 'miso-suppe', 'مارگارین', 'soyabrød', 'bánh quy', 'nhúng', 'baltymų milteliai', 'yaourt', 'spageti', 'galletas', 'erme', 'fasola',
 'tårta', 'cuntada soy', 'dessert', 'vanilya', 'szója étel', 'erdbeere', 'soja do tartinera', 'kuru fasulye', 'pudră de proteine', 'soy to tartiner', 'ریشه', 'cafea', 'mànec', 'dipermanis', 'tinapay na toyo',
 'leite', 'cunto soy dareere ah', 'pêche', 'elma', 'cálcio de soja', 'kalsiamu ya soya', 'majonéz', 'iaurt de soia', 'non sucré', 'vivesoy soja', 'calciu', 'margarina', 'soja nature', 'xocoavena',
 'kaffee', 'grädde', 'mimi ni mtamu', 'gemalas', 'miso sriuba', 'napar', 'bubuk protein', 'áfonya', 'aliment de soja', 'wierzba sojowa', 'protina pulbos', 'سویای نوتری', 'efterrätt', 'bilo je kremasto bijelo',
 'jagody', 'mixbohnen', 'güler yüzlü', 'vanilje', 'chakula cha soya kioevu', 'سویا یاورت', 'margarîn', 'dryck', 'nachtisch', 'édes vagyok', 'pasaldintas', 'soja-kalzium', 'barmenas', 'mangue', 'lecitina',
 'melk', 'iaurt', 'lecitină', 'isolere protein', 'biscuits', 'bơ thực vật', 'chocoavena', 'bwyd soi', 'lactovisoy', 'liquid soy food', 'soyasaus', 'soia bianco cremós', 'siocled', 'muraayad', 'cokelat',
 'lai rai', 'branza proaspata', 'nutri sojos', 'past ffa soia', 'soja', 'vaisių kokteilio', 'boisson soi', 'bwyd soi hylif', 'café', 'sorbetes', 'nezaslađen', 'soy drink', 'sữa chua đậu nành', 'kahawa',
 'powdr protein', 'alimento de soya', 'nourriture liquide à base de soja', 'olaj', 'majonnäs', 'weide', 'sojaweide', 'sopa de missô', 'mhudumu wa baa', 'hrana od soje', 'boisson', 'mixbean', 'heb ei felysu',
 'mga harikot', 'germe', 'sweet soy', 'vivesoi soja', 'haricots', 'jagoda', 'framboesa', 'buah mangga', 'شیرین شده', 'kim', 'iogwrt soi', 'ماکارونی', "c'était blanc crème", 'soja odżywcza', 'äpple',
 'ka soo jeeda frais', 'kávé', 'pasta kedelai', 'likidong pagkain ng toyo', 'langis', 'ulei', 'boisson soy', 'blanda bönor', 'usøtet', 'کاکائو', 'gâteau', 'پتی سوف', 'gyümölcslé', 'rhewlif',
 'den var kremhvit', 'salcie', 'keim', 'trwyth', 'قهوه', 'leżeć', 'băng dính', 'blueberry', 'vanille', 'látte', 'mus', 'douceur', 'oatmeal ya chokoleti', 'soya kwa tartiner', 'harcots', 'čokolada',
 'mayonaise', 'protein isolate', 'sweetened', 'người đi rừng', 'înghețată', 'nước đậu nành', 'ital', 'latté', 'soja za tartiner', 'czekolada', 'turmix', 'caano', 'súp miso', 'pohon willow', 'maçã',
 'sleeve', 'tôi ngọt ngào', 'soy ilaa tartiner', 'drikke', 'queijo fresco', 'تمشک', 'sirke', 'آستین', 'krém', 'édesítve', 'ciocolată', 'szójabab paszta']

In [None]:
# Exclusion terms that are themselves search terms (for example 'soja' or
# 'soya') would discard every product the include filter was meant to keep, so
# they are left out of the exclusion pattern.
terminos_busqueda = set(full_include_distinct)
terminos_exclusion = [
    termino for termino in full_exclude_distinct if termino not in terminos_busqueda
]

# Boolean mask of the rows whose product name contains an exclusion term.
# na=False counts a missing name as "no match", so the mask never holds NaN.
if terminos_exclusion:
    mask = pandas_df['product_name'].str.contains('|'.join(terminos_exclusion), na=False)
else:
    # An empty pattern would match every row; with no terms nothing is excluded.
    mask = pd.Series(False, index=pandas_df.index)

# Keep the rows that are not excluded and renumber the index.
pandas_df = pandas_df.loc[~mask].reset_index(drop=True)

# Show the resulting DataFrame
print(pandas_df.shape)
pandas_df.head()

(5170, 201)


Unnamed: 0,code,url,creator,created_t,created_datetime,last_modified_t,last_modified_datetime,last_modified_by,product_name,abbreviated_product_name,...,carbon-footprint-from-meat-or-fish_100g,nutrition-score-fr_100g,nutrition-score-uk_100g,glycemic-index_100g,water-hardness_100g,choline_100g,phylloquinone_100g,beta-glucan_100g,inositol_100g,carnitine_100g
0,5015.0,http://world-en.openfoodfacts.org/product/0000...,kiliweb,1597688122,2020-08-17 18:15:22,1597688123,2020-08-17 18:15:23,kiliweb,organic pesto with tofu,,...,,,,,,,,,,
1,1522.0,http://world-en.openfoodfacts.org/product/0000...,eduardo,1586365881,2020-04-08 17:11:21,1644514901,2022-02-10 17:41:41,packbot,seitan a la plancha,,...,,,,,,,,,,
2,503500000.0,http://world-en.openfoodfacts.org/product/0000...,foodvisor,1649165957,2022-04-05 13:39:17,1649165957,2022-04-05 13:39:17,foodvisor,tofu nature,,...,,,,,,,,,,
3,6539.0,http://world-en.openfoodfacts.org/product/0000...,foodvisor,1631471233,2021-09-12 18:27:13,1631471233,2021-09-12 18:27:13,foodvisor,tofu fume,,...,,,,,,,,,,
4,11110910000.0,http://world-en.openfoodfacts.org/product/0001...,usda-ndb-import,1489062658,2017-03-09 12:30:58,1545997356,2018-12-28 11:42:36,teolemon,tofu firm,,...,,,,,,,,,,


In [None]:
################################################################################
# Upper-case the text values of the DataFrame to standardise them, then drop
# the rows in which every value is null, since those rows add nothing to the
# analyses of the following stages.
################################################################################

pandas_df = eliminar_nulos(datos_a_mayusculas(pandas_df))

################################################################################
# Identify which columns hold strings and which hold numbers.
################################################################################

# String (object dtype) columns
str_cols = pandas_df.select_dtypes(include='object').columns

# Numeric columns (float64 / int64 only)
num_cols = pandas_df.select_dtypes(include=('float64', 'int64')).columns

################################################################################
# Replace the NaN values of each group of columns:
# - empty string for string columns.
# - 0 for numeric columns.
################################################################################

for columnas, relleno in ((str_cols, ''), (num_cols, 0)):
    pandas_df[columnas] = pandas_df[columnas].fillna(relleno)

################################################################################
# Drop duplicated rows, if there are any.
################################################################################

pandas_df = eliminar_duplicados(pandas_df)

################################################################################
# Con el fin de no tener problemas al momento de invocar las columnas,
# se estandarizan los nombres que actualmente existen en el dataframe.
################################################################################

pandas_df.columns = reemplazar_caracteres(pandas_df.columns)

In [None]:
################################################################################
# Column selection
################################################################################

# Columns whose name contains "100"
columnas_100G = [col for col in pandas_df.columns if '100' in col]

# Auxiliary frame restricted to those columns
df_aux100 = pandas_df[columnas_100G]

# Keep only the columns whose values add up to more than 0.
# NOTE(review): a column whose values sum to <= 0 (e.g. mostly negative scores)
# is discarded as well — confirm that this is intended.
columnas_suma_mayor_0 = df_aux100.columns[df_aux100.sum(axis=0) > 0]

# Additional columns that complement the analysis
columnas_base = ['PRODUCT_NAME', 'ECOSCORE_GRADE', 'ECOSCORE_SCORE', 'NUTRISCORE_GRADE', 'NUTRISCORE_SCORE', 'COUNTRIES_EN', 'NOVA_GROUP', "CHOLESTEROL_100G", "SUGARS_100G", "SALT_100G", "SODIUM_100G"]

# Final column list. Some base columns (the *_100G ones) can also appear in
# columnas_suma_mayor_0; dict.fromkeys removes those repeats while preserving
# order, so the resulting frame never carries duplicated column labels.
columnas_adicionales = list(dict.fromkeys(columnas_base + list(columnas_suma_mayor_0)))

# Final dataframe (explicit copy so later column assignments do not act on a
# slice of the previous frame)
pandas_df = pandas_df[columnas_adicionales].copy()

In [None]:
################################################################################
# ISO3 code of each product's country
################################################################################
# Take the text before the first comma of 'COUNTRIES_EN', i.e. the first
# country listed for the product.
primer_pais = pandas_df['COUNTRIES_EN'].str.split(',').str[0]

# Normalise the country name with the notebook's helper.
pais_normalizado = primer_pais.apply(standardize_country_name)

# Store the normalised name and derive the ISO3 code from it.
pandas_df['COUNTRIES_EN'] = pais_normalizado
pandas_df['ISO3'] = pais_normalizado.apply(get_iso3)

In [None]:
################################################################################
# Categorical variables to numeric codes
################################################################################

# One fitted encoder per column. Refitting a single shared encoder would discard
# the mapping learned for the first column, making its codes impossible to
# decode later with inverse_transform.
label_encoders = {}

for columna in ('ECOSCORE_GRADE', 'NUTRISCORE_GRADE'):
  # `label_encoder` keeps pointing at the most recently fitted encoder
  # (NUTRISCORE_GRADE after the loop), exactly as before this change.
  label_encoder = LabelEncoder()
  pandas_df[columna] = label_encoder.fit_transform(pandas_df[columna])
  label_encoders[columna] = label_encoder

In [None]:
################################################################################
# GDP (current US$)
################################################################################

# Single pipeline over the World Bank spreadsheet:
#   1. read the Excel download, skipping its first 3 rows,
#   2. keep only the rows that have a 'Country Name',
#   3. renumber the index from 0,
#   4. replace every NaN with 0,
#   5. let pandas infer better dtypes for object columns.
df_m_bank_pib = (
    pd.read_excel(
        "https://api.worldbank.org/v2/es/indicator/NY.GDP.MKTP.CD?downloadformat=excel",
        skiprows=3,
        decimal=',',
    )
    .loc[lambda hoja: hoja['Country Name'].notnull()]
    .reset_index(drop=True)
    .fillna(0)
    .infer_objects()
)

In [None]:
####################################################
# GDP data
####################################################
# Column of the World Bank table (a year) used as the GDP reference.
pib_year = '2020'

# Country name, country code and GDP of the chosen year, renamed to the
# notebook's conventions.
df_m_bank_pib_year = (
    df_m_bank_pib[['Country Name', 'Country Code', pib_year]]
    .reset_index(drop=True)
    .rename(columns={pib_year: "PIB", 'Country Name': 'COUNTRIES', 'Country Code': 'ISO'})
)
df_m_bank_pib_year['PIB'] = df_m_bank_pib_year['PIB'].astype('float64').round(0)

####################################################
# Merge of dataframes
####################################################
# Drop a 'PIB' column left over from a previous execution of this cell;
# otherwise the merge would produce PIB_x / PIB_y and the lookup below would fail.
pandas_df = pandas_df.drop(columns=['PIB'], errors='ignore')

# how='right' keeps every product row, including those without a GDP match.
pandas_df = pd.merge(left=df_m_bank_pib_year, right=pandas_df, how='right', left_on='ISO', right_on='ISO3')

# Remove the helper columns brought in by the World Bank table
pandas_df = pandas_df.drop(columns=['ISO', 'COUNTRIES'])

# Replace missing GDP with 0. Assign the result back instead of calling
# fillna(inplace=True) on the selected column: the chained in-place form is not
# guaranteed to update the dataframe (it does not under pandas Copy-on-Write).
pandas_df['PIB'] = pandas_df['PIB'].fillna(0)

pandas_df.head()

Unnamed: 0,PIB,PRODUCT_NAME,ECOSCORE_GRADE,ECOSCORE_SCORE,NUTRISCORE_GRADE,NUTRISCORE_SCORE,COUNTRIES_EN,NOVA_GROUP,CHOLESTEROL_100G,SUGARS_100G,...,FRUITS_VEGETABLES_NUTS_100G,FRUITS_VEGETABLES_NUTS_DRIED_100G,FRUITS_VEGETABLES_NUTS_ESTIMATE_100G,FRUITS_VEGETABLES_NUTS_ESTIMATE_FROM_INGREDIENTS_100G,COCOA_100G,CARBON_FOOTPRINT_100G,CARBON_FOOTPRINT_FROM_MEAT_OR_FISH_100G,NUTRITION_SCORE_FR_100G,PHYLLOQUINONE_100G,ISO3
0,2639009000000.0,ORGANIC PESTO WITH TOFU,7,0.0,0,0.0,France,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,FRA
1,1276963000000.0,SEITAN A LA PLANCHA,2,79.0,0,0.0,Spain,4.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,ESP
2,21060470000000.0,TOFU NATURE,7,0.0,0,0.0,United States,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,USA
3,21060470000000.0,TOFU FUME,7,0.0,0,0.0,United States,0.0,0.0,0.4,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,USA
4,21060470000000.0,TOFU FIRM,7,0.0,0,0.0,United States,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,USA


In [None]:
################################################################################
# Sort the column names
################################################################################

# Current column names as a plain list
columnas = pandas_df.columns.to_list()


def _clave_orden(nombre):
  """Sort key that places 'PRODUCT_NAME' first and the rest alphabetically."""
  return (nombre != 'PRODUCT_NAME', nombre)


# Column names in their final order
columnas_ordenadas = sorted(columnas, key=_clave_orden)

# Rebuild the dataframe with the columns in that order
pandas_df = pandas_df[columnas_ordenadas]

pandas_df.head(3)

Unnamed: 0,PRODUCT_NAME,ALCOHOL_100G,ALPHA_LINOLENIC_ACID_100G,BIOTIN_100G,CALCIUM_100G,CARBOHYDRATES_100G,CARBON_FOOTPRINT_100G,CARBON_FOOTPRINT_FROM_MEAT_OR_FISH_100G,CHLORIDE_100G,CHOLESTEROL_100G,...,VITAMIN_B1_100G,VITAMIN_B2_100G,VITAMIN_B6_100G,VITAMIN_B9_100G,VITAMIN_C_100G,VITAMIN_D_100G,VITAMIN_E_100G,VITAMIN_K_100G,VITAMIN_PP_100G,ZINC_100G
0,ORGANIC PESTO WITH TOFU,0.0,0.0,0.0,0.0,12.2,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,SEITAN A LA PLANCHA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,TOFU NATURE,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Upper-case the country names stored in COUNTRIES_EN.
pandas_df['COUNTRIES_EN'] = pandas_df['COUNTRIES_EN'].str.upper()

In [None]:
# Number of non-null values in each column.
pandas_df.count()

PRODUCT_NAME                 5170
ALCOHOL_100G                 5170
ALPHA_LINOLENIC_ACID_100G    5170
BIOTIN_100G                  5170
CALCIUM_100G                 5170
                             ... 
VITAMIN_D_100G               5170
VITAMIN_E_100G               5170
VITAMIN_K_100G               5170
VITAMIN_PP_100G              5170
ZINC_100G                    5170
Length: 86, dtype: int64

In [None]:
################################################################################
# Export the pandas DataFrame as a CSV file
################################################################################
# Destination on the mounted Drive; the index is not written to the file.
ruta_alimentos = "/content/drive/MyDrive/Datos TFM/alimentos.csv"
pandas_df.to_csv(ruta_alimentos, index=False)

# Separación de alimentos

In [None]:
# Target language codes used when translating the product keywords.
idiomas = [
    'en', 'pt', 'de', 'ca', 'vi', 'tl', 'tr',
    'id', 'so', 'sw', 'hr', 'hu', 'cy', 'no',
    'fr', 'es', 'sv', 'pl', 'ro', 'lt', 'fa',
]

In [None]:
################################################################################
# SEITAN - before
################################################################################
import re  # local import: only used to escape the keywords below

# Spanish seed phrases that identify seitan products.
lista_include_seitan = ['seitan', 'carne vegetal']
full_include_seitan = []

# Collect each phrase plus its translation into every language in `idiomas`.
for frase in lista_include_seitan:
  full_include_seitan.append(frase)
  for idioma in idiomas:
    translated_seitan = GoogleTranslator(source='spanish', target=str(idioma)).translate(str(frase))
    # Skip empty/None translations: they would break .upper() below or become
    # an empty regex alternative that matches every product name.
    if translated_seitan:
      full_include_seitan.append(translated_seitan)

# Convert every keyword to upper case
full_include_seitan = [elemento.upper() for elemento in full_include_seitan]

# Distinct keywords, sorted so the list is the same on every run
seitan_names = sorted(set(full_include_seitan))

# Filter pattern: each keyword is escaped so it is matched literally
filtro_seitan = '|'.join(re.escape(nombre) for nombre in seitan_names)

# Rows whose product name contains any keyword. na=False treats missing names
# as "no match"; .copy() gives an independent frame that later cells can modify
# without acting on a slice of pandas_df.
df_filtrado_seitan = pandas_df[pandas_df['PRODUCT_NAME'].str.contains(filtro_seitan, case=False, regex=True, na=False)].copy()
df_filtrado_seitan

Unnamed: 0,PRODUCT_NAME,ALCOHOL_100G,ALPHA_LINOLENIC_ACID_100G,BIOTIN_100G,CALCIUM_100G,CARBOHYDRATES_100G,CARBON_FOOTPRINT_100G,CARBON_FOOTPRINT_FROM_MEAT_OR_FISH_100G,CHLORIDE_100G,CHOLESTEROL_100G,...,VITAMIN_B1_100G,VITAMIN_B2_100G,VITAMIN_B6_100G,VITAMIN_B9_100G,VITAMIN_C_100G,VITAMIN_D_100G,VITAMIN_E_100G,VITAMIN_K_100G,VITAMIN_PP_100G,ZINC_100G
1,SEITAN A LA PLANCHA,0.0,0.0,0.0,0.0,0.0000,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.000000,0.0,0.0000,0.0,0.0,0.0,0.000000,0.00000
42,TRADITIONAL SEITAN,0.0,0.0,0.0,0.0,6.1404,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.000000,0.0,0.0000,0.0,0.0,0.0,0.000000,0.00000
45,TRADITIONAL SEITAN,0.0,0.0,0.0,0.0,4.4200,0.0,0.0,0.0,0.0,...,0.0,0.002106,0.000796,0.0,0.0000,0.0,0.0,0.0,0.040708,0.00133
46,TRADITIONAL SEITAN,0.0,0.0,0.0,0.0,7.0588,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.000000,0.0,0.0000,0.0,0.0,0.0,0.000000,0.00000
47,CHIPOTLE STYLE SEITAN,0.0,0.0,0.0,0.0,7.9600,0.0,0.0,0.0,0.0,...,0.0,0.002106,0.000796,0.0,0.0011,0.0,0.0,0.0,0.042478,0.00000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5153,"BIO BURGER VEGETAL SEITAN, ALGAS Y BERENJENAS",0.0,0.0,0.0,0.0,0.0000,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.000000,0.0,0.0000,0.0,0.0,0.0,0.000000,0.00000
5154,"BIO BURGER VEGETAL SEITAN, ALGAS Y BERENJENA",0.0,0.0,0.0,0.0,0.0000,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.000000,0.0,0.0000,0.0,0.0,0.0,0.000000,0.00000
5155,"BIO BURGER VEGETAL SEITAN, ALGAS Y BERENJENAS",0.0,0.0,0.0,0.0,0.0000,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.000000,0.0,0.0000,0.0,0.0,0.0,0.000000,0.00000
5156,"BIO BURGER VEGETAL CON SEITAN, ALGAS Y BERENJENAS",0.0,0.0,0.0,0.0,0.0000,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.000000,0.0,0.0000,0.0,0.0,0.0,0.000000,0.00000


In [None]:
################################################################################
# TOFU - before
################################################################################
import re  # local import: only used to escape the keywords below

# Spanish seed phrases that identify tofu products.
lista_include_tofu = ['tofu']
full_include_tofu = []

# Collect each phrase plus its translation into every language in `idiomas`.
for frase in lista_include_tofu:
  full_include_tofu.append(frase)
  for idioma in idiomas:
    translated_tofu = GoogleTranslator(source='spanish', target=str(idioma)).translate(str(frase))
    # Skip empty/None translations: they would break .upper() below or become
    # an empty regex alternative that matches every product name.
    if translated_tofu:
      full_include_tofu.append(translated_tofu)

# Convert every keyword to upper case
full_include_tofu = [elemento.upper() for elemento in full_include_tofu]

# Distinct keywords, sorted so the list is the same on every run
tofu_names = sorted(set(full_include_tofu))

# Filter pattern: each keyword is escaped so it is matched literally
filtro_tofu = '|'.join(re.escape(nombre) for nombre in tofu_names)

# Rows whose product name contains any keyword. na=False treats missing names
# as "no match"; .copy() gives an independent frame that later cells can modify
# without acting on a slice of pandas_df.
df_filtrado_tofu = pandas_df[pandas_df['PRODUCT_NAME'].str.contains(filtro_tofu, case=False, regex=True, na=False)].copy()
df_filtrado_tofu

Unnamed: 0,PRODUCT_NAME,ALCOHOL_100G,ALPHA_LINOLENIC_ACID_100G,BIOTIN_100G,CALCIUM_100G,CARBOHYDRATES_100G,CARBON_FOOTPRINT_100G,CARBON_FOOTPRINT_FROM_MEAT_OR_FISH_100G,CHLORIDE_100G,CHOLESTEROL_100G,...,VITAMIN_B1_100G,VITAMIN_B2_100G,VITAMIN_B6_100G,VITAMIN_B9_100G,VITAMIN_C_100G,VITAMIN_D_100G,VITAMIN_E_100G,VITAMIN_K_100G,VITAMIN_PP_100G,ZINC_100G
0,ORGANIC PESTO WITH TOFU,0.0,0.0,0.0,0.000,12.200000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,TOFU NATURE,0.0,0.0,0.0,0.000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,TOFU FUME,0.0,0.0,0.0,0.000,2.100000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,TOFU FIRM,0.0,0.0,0.0,0.127,2.530000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,TOFU DE GRAINES DE CITROUILLES ORIGINALE,0.0,0.0,0.0,0.000,6.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5159,TOFU NATURE BIO,0.0,0.0,0.0,0.000,1.200000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5160,TOFU FUME BIO,0.0,0.0,0.0,0.000,2.200000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5164,"ORGANIC TOFU, EXTRA FIRM",0.0,0.0,0.0,0.000,3.571429,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5165,TOFU,0.0,0.0,0.0,0.000,2.700000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
################################################################################
# SOJA - before
################################################################################
import re  # local import: only used to escape the keywords below

# Spanish seed phrases that identify soy products.
lista_include_soja = ['soja', 'texturizada']
full_include_soja = []

# Collect each phrase plus its translation into every language in `idiomas`.
for frase in lista_include_soja:
  full_include_soja.append(frase)
  for idioma in idiomas:
    translated_soja = GoogleTranslator(source='spanish', target=str(idioma)).translate(str(frase))
    # Skip empty/None translations: they would break .upper() below or become
    # an empty regex alternative that matches every product name.
    if translated_soja:
      full_include_soja.append(translated_soja)

# Convert every keyword to upper case
full_include_soja = [elemento.upper() for elemento in full_include_soja]

# Distinct keywords, sorted so the list is the same on every run
soja_names = sorted(set(full_include_soja))

# Filter pattern: each keyword is escaped so it is matched literally
filtro_soja = '|'.join(re.escape(nombre) for nombre in soja_names)

# Rows whose product name contains any keyword. na=False treats missing names
# as "no match"; .copy() gives an independent frame that later cells can modify
# without acting on a slice of pandas_df.
df_filtrado_soja = pandas_df[pandas_df['PRODUCT_NAME'].str.contains(filtro_soja, case=False, regex=True, na=False)].copy()
df_filtrado_soja

Unnamed: 0,PRODUCT_NAME,ALCOHOL_100G,ALPHA_LINOLENIC_ACID_100G,BIOTIN_100G,CALCIUM_100G,CARBOHYDRATES_100G,CARBON_FOOTPRINT_100G,CARBON_FOOTPRINT_FROM_MEAT_OR_FISH_100G,CHLORIDE_100G,CHOLESTEROL_100G,...,VITAMIN_B1_100G,VITAMIN_B2_100G,VITAMIN_B6_100G,VITAMIN_B9_100G,VITAMIN_C_100G,VITAMIN_D_100G,VITAMIN_E_100G,VITAMIN_K_100G,VITAMIN_PP_100G,ZINC_100G
7,SOY CHILE,0.0,0.0,0.0,0.000,62.500000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,SOY MISO GLAZE,0.0,0.0,0.0,0.000,17.647059,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13,"BARBECUE RUB, SIZZLING, SWEET AND SPICY SOY",0.0,0.0,0.0,0.000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16,GINGER SOY,0.0,0.0,0.0,0.000,16.428571,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
17,"SIMPLE TRUTH ORGANIC, ROASTED & SALTED SOYNUTS",0.0,0.0,0.0,0.267,30.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5162,SOYSUN ABR GOYAVE,0.0,0.0,0.0,0.000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5163,SOYTICS,0.0,0.0,0.0,0.000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5166,ПРОТЕИН OPTIMUM 100% SOY PROTEIN,0.0,0.0,0.0,0.000,6.500000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5167,ПРОТЕИН PUREPROTEIN SOY ISOLATE НАТУРАЛЬНЫЙ ВКУС,0.0,0.0,0.0,0.000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
################################################################################
# SOJA - after
################################################################################
import re  # local import: only used to escape the keywords below

# Literal (escaped) patterns for the keywords of the other two product groups.
patron_seitan = '|'.join(re.escape(nombre) for nombre in seitan_names)
patron_tofu = '|'.join(re.escape(nombre) for nombre in tofu_names)

# Boolean masks flagging soy rows whose name also mentions seitan or tofu.
mask_soja_seitan = df_filtrado_soja['PRODUCT_NAME'].str.contains(patron_seitan, na=False)
mask_soja_tofu = df_filtrado_soja['PRODUCT_NAME'].str.contains(patron_tofu, na=False)

# Remove the flagged rows and renumber the index. Both masks are combined and
# applied once to the frame they were computed on, so no mask is ever used
# against an already-shrunk frame.
df_filtrado_soja = df_filtrado_soja[~(mask_soja_seitan | mask_soja_tofu)].reset_index(drop=True)

# Export the pandas DataFrame as a CSV file
df_filtrado_soja.to_csv("/content/drive/MyDrive/Datos TFM/soja.csv", index=False)

df_filtrado_soja

Unnamed: 0,PRODUCT_NAME,ALCOHOL_100G,ALPHA_LINOLENIC_ACID_100G,BIOTIN_100G,CALCIUM_100G,CARBOHYDRATES_100G,CARBON_FOOTPRINT_100G,CARBON_FOOTPRINT_FROM_MEAT_OR_FISH_100G,CHLORIDE_100G,CHOLESTEROL_100G,...,VITAMIN_B1_100G,VITAMIN_B2_100G,VITAMIN_B6_100G,VITAMIN_B9_100G,VITAMIN_C_100G,VITAMIN_D_100G,VITAMIN_E_100G,VITAMIN_K_100G,VITAMIN_PP_100G,ZINC_100G
0,SOY CHILE,0.0,0.0,0.0,0.000,62.500000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,SOY MISO GLAZE,0.0,0.0,0.0,0.000,17.647059,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"BARBECUE RUB, SIZZLING, SWEET AND SPICY SOY",0.0,0.0,0.0,0.000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,GINGER SOY,0.0,0.0,0.0,0.000,16.428571,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,"SIMPLE TRUTH ORGANIC, ROASTED & SALTED SOYNUTS",0.0,0.0,0.0,0.267,30.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1792,SOYSUN ABR GOYAVE,0.0,0.0,0.0,0.000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1793,SOYTICS,0.0,0.0,0.0,0.000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1794,ПРОТЕИН OPTIMUM 100% SOY PROTEIN,0.0,0.0,0.0,0.000,6.500000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1795,ПРОТЕИН PUREPROTEIN SOY ISOLATE НАТУРАЛЬНЫЙ ВКУС,0.0,0.0,0.0,0.000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
################################################################################
# TOFU - after
################################################################################
import re  # local import: only used to escape the keywords below

# Literal (escaped) patterns for the keywords of the other two product groups.
patron_soja = '|'.join(re.escape(nombre) for nombre in soja_names)
patron_seitan = '|'.join(re.escape(nombre) for nombre in seitan_names)

# Boolean masks flagging tofu rows whose name also mentions soy or seitan.
mask_tofu_soja = df_filtrado_tofu['PRODUCT_NAME'].str.contains(patron_soja, na=False)
mask_tofu_seitan = df_filtrado_tofu['PRODUCT_NAME'].str.contains(patron_seitan, na=False)

# Remove the flagged rows and renumber the index. Both masks are combined and
# applied once to the frame they were computed on, so no mask is ever used
# against an already-shrunk frame.
df_filtrado_tofu = df_filtrado_tofu[~(mask_tofu_soja | mask_tofu_seitan)].reset_index(drop=True)

# Export the pandas DataFrame as a CSV file
df_filtrado_tofu.to_csv("/content/drive/MyDrive/Datos TFM/tofu.csv", index=False)
df_filtrado_tofu

Unnamed: 0,PRODUCT_NAME,ALCOHOL_100G,ALPHA_LINOLENIC_ACID_100G,BIOTIN_100G,CALCIUM_100G,CARBOHYDRATES_100G,CARBON_FOOTPRINT_100G,CARBON_FOOTPRINT_FROM_MEAT_OR_FISH_100G,CHLORIDE_100G,CHOLESTEROL_100G,...,VITAMIN_B1_100G,VITAMIN_B2_100G,VITAMIN_B6_100G,VITAMIN_B9_100G,VITAMIN_C_100G,VITAMIN_D_100G,VITAMIN_E_100G,VITAMIN_K_100G,VITAMIN_PP_100G,ZINC_100G
0,ORGANIC PESTO WITH TOFU,0.0,0.0,0.0,0.000,12.200000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,TOFU NATURE,0.0,0.0,0.0,0.000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,TOFU FUME,0.0,0.0,0.0,0.000,2.100000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,TOFU FIRM,0.0,0.0,0.0,0.127,2.530000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,TOFU DE GRAINES DE CITROUILLES ORIGINALE,0.0,0.0,0.0,0.000,6.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2564,TOFU NATURE BIO,0.0,0.0,0.0,0.000,1.200000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2565,TOFU FUME BIO,0.0,0.0,0.0,0.000,2.200000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2566,"ORGANIC TOFU, EXTRA FIRM",0.0,0.0,0.0,0.000,3.571429,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2567,TOFU,0.0,0.0,0.0,0.000,2.700000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
################################################################################
# SEITAN - after
################################################################################
import re  # local import: only used to escape the keywords below

# Literal (escaped) patterns for the keywords of the other two product groups.
patron_soja = '|'.join(re.escape(nombre) for nombre in soja_names)
patron_tofu = '|'.join(re.escape(nombre) for nombre in tofu_names)

# Boolean masks flagging seitan rows whose name also mentions soy or tofu.
mask_seitan_soja = df_filtrado_seitan['PRODUCT_NAME'].str.contains(patron_soja, na=False)
mask_seitan_tofu = df_filtrado_seitan['PRODUCT_NAME'].str.contains(patron_tofu, na=False)

# Remove the flagged rows and renumber the index. Both masks are combined and
# applied once to the frame they were computed on, so no mask is ever used
# against an already-shrunk frame.
df_filtrado_seitan = df_filtrado_seitan[~(mask_seitan_soja | mask_seitan_tofu)].reset_index(drop=True)

# Export the pandas DataFrame as a CSV file
df_filtrado_seitan.to_csv("/content/drive/MyDrive/Datos TFM/seitan.csv", index=False)
df_filtrado_seitan

Unnamed: 0,PRODUCT_NAME,ALCOHOL_100G,ALPHA_LINOLENIC_ACID_100G,BIOTIN_100G,CALCIUM_100G,CARBOHYDRATES_100G,CARBON_FOOTPRINT_100G,CARBON_FOOTPRINT_FROM_MEAT_OR_FISH_100G,CHLORIDE_100G,CHOLESTEROL_100G,...,VITAMIN_B1_100G,VITAMIN_B2_100G,VITAMIN_B6_100G,VITAMIN_B9_100G,VITAMIN_C_100G,VITAMIN_D_100G,VITAMIN_E_100G,VITAMIN_K_100G,VITAMIN_PP_100G,ZINC_100G
0,SEITAN A LA PLANCHA,0.0,0.0,0.0,0.0,0.0000,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.000000,0.0,0.0000,0.0,0.0,0.0,0.000000,0.00000
1,TRADITIONAL SEITAN,0.0,0.0,0.0,0.0,6.1404,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.000000,0.0,0.0000,0.0,0.0,0.0,0.000000,0.00000
2,TRADITIONAL SEITAN,0.0,0.0,0.0,0.0,4.4200,0.0,0.0,0.0,0.0,...,0.0,0.002106,0.000796,0.0,0.0000,0.0,0.0,0.0,0.040708,0.00133
3,TRADITIONAL SEITAN,0.0,0.0,0.0,0.0,7.0588,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.000000,0.0,0.0000,0.0,0.0,0.0,0.000000,0.00000
4,CHIPOTLE STYLE SEITAN,0.0,0.0,0.0,0.0,7.9600,0.0,0.0,0.0,0.0,...,0.0,0.002106,0.000796,0.0,0.0011,0.0,0.0,0.0,0.042478,0.00000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
726,"BIO BURGER VEGETAL SEITAN, ALGAS Y BERENJENAS",0.0,0.0,0.0,0.0,0.0000,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.000000,0.0,0.0000,0.0,0.0,0.0,0.000000,0.00000
727,"BIO BURGER VEGETAL SEITAN, ALGAS Y BERENJENA",0.0,0.0,0.0,0.0,0.0000,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.000000,0.0,0.0000,0.0,0.0,0.0,0.000000,0.00000
728,"BIO BURGER VEGETAL SEITAN, ALGAS Y BERENJENAS",0.0,0.0,0.0,0.0,0.0000,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.000000,0.0,0.0000,0.0,0.0,0.0,0.000000,0.00000
729,"BIO BURGER VEGETAL CON SEITAN, ALGAS Y BERENJENAS",0.0,0.0,0.0,0.0,0.0000,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.000000,0.0,0.0000,0.0,0.0,0.0,0.000000,0.00000
