# Preparar entorno de trabajo

In [1]:
################################################################################
# Preparar entorno de trabajo
################################################################################
!pip install pyspark
!pip install scikit-learn
!pip install tensorflow
!pip install torch
!pip install mtranslate
!pip install findspark
!pip install unidecode
!pip install deep_translator
!pip install langdetect
!pip install pycountry
!pip install pycountry_convert

Collecting pyspark
  Downloading pyspark-3.4.1.tar.gz (310.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.8/310.8 MB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.4.1-py2.py3-none-any.whl size=311285398 sha256=cb59116e02a507bbb1f4b82ed14c7e4e8ed6fea4ad3bff6837cc8a4196724f3a
  Stored in directory: /root/.cache/pip/wheels/0d/77/a3/ff2f74cc9ab41f8f594dabf0579c2a7c6de920d584206e0834
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.4.1
Collecting mtranslate
  Downloading mtranslate-1.8.tar.gz (2.4 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: mtranslate
  Building wheel for mtranslate (setup.py) ... [?25l[?25hdone
  Created wheel for mtranslate: filename=

## Librerías

In [2]:
################################################################################
# Preparar entorno de trabajo
################################################################################
# IGNORAR WARNINGS
import warnings
warnings.filterwarnings('ignore')

# Importación de el contenido en el DRIVE
from google.colab import drive
drive.mount('/content/drive')

# Datos
import pandas as pd
import numpy as np
import json
import requests
from datetime import datetime
from unidecode import unidecode
import time

# Pyspark
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.window import Window
import pyspark.sql.functions as F
from pyspark.sql.functions import udf, col, lower, regexp_replace
from pyspark.sql.types import *
import findspark
findspark.init()

# sklearn Modelos
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor, plot_tree, export_graphviz
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn import svm

# Tensoflow
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.datasets import cifar10
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Flatten, Dense, Activation
from tensorflow.keras.optimizers import SGD

# Torch
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms

# Preparar datos
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import mean_squared_error, accuracy_score, roc_curve, confusion_matrix, r2_score
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler
from scipy.stats import reciprocal, uniform

# Gráficos
import matplotlib.pyplot as plt
%matplotlib inline
import graphviz
import seaborn as sns

# Traductor
from mtranslate import translate
from deep_translator import GoogleTranslator
from langdetect import detect

# Paises
import pycountry
import pycountry_convert as pc

Mounted at /content/drive


### Importar datos

In [3]:
################################################################################
# CREAMOS EL SPARK SESSION
################################################################################

spark = SparkSession.builder.appName("TFM").getOrCreate()

################################################################################
# Datos Open Food Facts
# URL = https://static.openfoodfacts.org/data/en.openfoodfacts.org.products.csv
################################################################################

df_datos = spark.read.options(header="True", inferSchema='True', delimiter='\t').csv("/content/drive/MyDrive/Datos TFM/en.openfoodfacts.org.products.csv")

### Funciones

In [4]:
################################################################################
# Función para realizar la traducción
################################################################################
def translate_text(text, target_language):
  time.sleep(1)  # Pausa de 1 segundo entre solicitudes
  translation = translate(text, target_language)

  return translation

################################################################################
# Funciones de limpieza de datos (DataFrame Pandas)
################################################################################
def datos_a_mayusculas(df):

  df = df.apply(lambda x: x.str.upper() if x.dtype == "object" else x)
  return df
################################################################################
################################################################################
################################################################################
def eliminar_nulos(df):

  df = df.dropna(axis=0, how='all')

  return df
################################################################################
################################################################################
################################################################################
def eliminar_duplicados(df):

  df = df.drop_duplicates(subset=None, keep='first', inplace=False, ignore_index=False)

  return df
################################################################################
################################################################################
################################################################################
def reemplazar_caracteres(df):
  df = df.str.upper()
  df = df.str.replace('/',' ')
  df = df.str.replace(',',' ')
  df = df.str.replace('-',' ')
  df = df.str.replace(' ','_')
  df = df.str.replace('Á','A')
  df = df.str.replace('É','E')
  df = df.str.replace('Í','I')
  df = df.str.replace('Ó','O')
  df = df.str.replace('Ú','U')
  df = df.str.replace('Ü','U')
  df = df.str.replace('Ñ','N')

  return df
################################################################################
# Función detección idioma
################################################################################
def detect_language(s):
    try:
        return detect(s)
    except:
        return None
################################################################################
################################################################################
################################################################################
def get_iso3(country_name):
    try:
        country = pycountry.countries.search_fuzzy(country_name)[0]
        return country.alpha_3
    except LookupError:
        return None
################################################################################
################################################################################
################################################################################
def get_continent(country_name):
    try:
        country = pycountry.countries.search_fuzzy(country_name)[0]
        return country.continent.name
    except LookupError:
        return None
################################################################################
################################################################################
################################################################################
def standardize_country_name(country_name):
    try:
        country = pycountry.countries.search_fuzzy(country_name)[0]
        return country.name
    except LookupError:
        return country_name

# Procesado de datos

In [5]:
'''
################################################################################
# Traducir nombre de productos
################################################################################

# Definir la función UDF (User-Defined Function) para aplicar la traducción
translate_udf = udf(lambda text: translate_text(text, 'es'), StringType())

# Aplicar la traducción a la columna 'PRODUCT_NAME'
df_datos = df_datos.withColumn('product_name_es', translate_udf('product_name'))

# Transformar la columna 'columna1' al tipo StringType
df_datos = df_datos.withColumn("product_name_es", col("product_name_es").cast("string"))

df_datos.select('product_name_es').show(5, truncate=False)
'''

'\n################################################################################\n# Traducir nombre de productos\n################################################################################\n\n# Definir la función UDF (User-Defined Function) para aplicar la traducción\ntranslate_udf = udf(lambda text: translate_text(text, \'es\'), StringType())\n\n# Aplicar la traducción a la columna \'PRODUCT_NAME\'\ndf_datos = df_datos.withColumn(\'product_name_es\', translate_udf(\'product_name\'))\n\n# Transformar la columna \'columna1\' al tipo StringType\ndf_datos = df_datos.withColumn("product_name_es", col("product_name_es").cast("string"))\n\ndf_datos.select(\'product_name_es\').show(5, truncate=False)\n'

In [6]:
################################################################################
# Transformar nombre del producto
################################################################################
# Transformar los valores nulos en la columna 'product_name' a una cadena vacía
#df_datos = df_datos.withColumn("product_name", col("product_name").na.fill(''))
#df_datos.select('product_name').show(50, truncate=False)
################################################################################

# Convertir la columna 'product_name' a minúsculas
df_datos = df_datos.withColumn("product_name", lower(col("product_name")))

# Aplica las funciones para eliminar los tildes y convertir a minúsculas
df_datos = df_datos.withColumn("product_name", lower(regexp_replace(col("product_name"), "[áäàâãå]", "a")))
df_datos = df_datos.withColumn("product_name", lower(regexp_replace(col("product_name"), "[éëèê]", "e")))
df_datos = df_datos.withColumn("product_name", lower(regexp_replace(col("product_name"), "[íïìî]", "i")))
df_datos = df_datos.withColumn("product_name", lower(regexp_replace(col("product_name"), "[óöòôõ]", "o")))
df_datos = df_datos.withColumn("product_name", lower(regexp_replace(col("product_name"), "[úüùû]", "u")))
df_datos = df_datos.withColumn("product_name", lower(regexp_replace(col("product_name"), "[ñ]", "n")))

# Transformar la columna 'product_name' al tipo StringType
df_datos = df_datos.withColumn("product_name", col("product_name").cast("string"))

In [7]:
'''
MUY LEEEEEEEEENTO
################################################################################
# Crear columna 'language' desde el nombre del producto
################################################################################

# Crear un UDF a partir de detect_language
detect_language_udf = udf(detect_language, StringType())

# Aplica la función de detección de idioma a la columna 'product_name'
# Aplicar el UDF a una columna específica en el DataFrame
df_datos = df_datos.withColumn('language', detect_language_udf(df_datos['product_name']))

# Esperar 5 minutos
time.sleep(300)

################################################################################
# Listado de idiomas
################################################################################
# Obtener los valores distintos de la columna 'language'
distinct_languages = df_datos.select('language').distinct().rdd.flatMap(lambda x: x).collect()

# Convertir la lista de valores distintos a un array de NumPy
languages_array = np.array(distinct_languages)

# Imprimir el array de NumPy
languages_array
'''

"\nMUY LEEEEEEEEENTO\n################################################################################\n# Crear columna 'language' desde el nombre del producto\n################################################################################\n\n# Crear un UDF a partir de detect_language\ndetect_language_udf = udf(detect_language, StringType())\n\n# Aplica la función de detección de idioma a la columna 'product_name'\n# Aplicar el UDF a una columna específica en el DataFrame\ndf_datos = df_datos.withColumn('language', detect_language_udf(df_datos['product_name']))\n\n# Esperar 5 minutos\ntime.sleep(300)\n\n################################################################################\n# Listado de idiomas\n################################################################################\n# Obtener los valores distintos de la columna 'language'\ndistinct_languages = df_datos.select('language').distinct().rdd.flatMap(lambda x: x).collect()\n\n# Convertir la lista de valores distintos a u

In [8]:
################################################################################
# Generar lista de busquedas en distintos idiomas:
# - Seitan
# - Soja
# - Tofu
################################################################################

lista_include = ['seitan', 'tofu', 'soja', 'carne vegetal']
idiomas = ['en', 'pt','de','ca','vi','tl','tr','id','so','sw','hr','hu','cy','no','fr','es','sv','pl','ro','lt','fa']
full_include = []

for frase in lista_include:
  full_include.append(frase)
  for idioma in idiomas:
    translated = GoogleTranslator(source='spanish', target=str(idioma)).translate(str(frase))
    full_include.append(translated)

# Obtener los elementos distintos de la lista
full_include_distinct = list(set(full_include))

print(full_include_distinct)

['thịt trắng', 'daging sayur', 'mięso warzywne', 'soia', 'seiten', 'tokwa', 'karne ng gulay', 'vegetabilsk kjøtt', 'seitan', 'vegetable meat', 'سیتان', 'hilibka khudradda', 'vegetabiliskt kött', 'soja', 'kedelai', 'toyo', 'توفو', 'carne de legume', 'Tofu', 'carne vegetal', 'sebze eti', 'viande végétale', 'seitanas', 'Tahu', 'soi', 'đậu nành', 'Gemüsefleisch', 'cig llysiau', 'soy', 'soya peyniri', 'đậu hũ', 'Soja', 'carn vegetal', 'szeitán', 'thịt rau', 'növényi hús', 'daržovių mėsa', 'sojos', 'szója', 'سویا', 'tofu', 'soya', 'nyama ya mboga', 'biljno meso', 'گوشت سبزیجات', 'Seitan']


In [9]:
################################################################################
# Obtener los productos
################################################################################

# Filtrar los datos basados en los valores de la lista
df_products_distinct = df_datos.filter(col("product_name").rlike("|".join(full_include_distinct)))

df_products_distinct.show(5, truncate=False)

+------------+---------------------------------------------------------------------------------------------------------+--------------------+----------+-------------------+---------------+----------------------+----------------+------------------------------------+------------------------+-------------------------------------+--------+--------------------+-----------------------+-----------------+--------------+--------------------+--------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------+-------+------------+----------+--------------------+-------------------------+--------

In [10]:
'''
################################################################################
# Obtener los productos
################################################################################

# Aplica el filtro al DataFrame
df_products_distinct = df_datos.filter(
    (col("product_name").like("%seitan%")) |
    (col("product_name").like("%carne vegetal%")) |
    (col("product_name").like("%vegetable meat%")) |
    (col("product_name").like("%viande vegetale%")) |
    (col("product_name").like("%tofu%")) |
    (col("product_name").like("%soja%")) |
    (col("product_name").like("%soya%"))
)

# Aplica distinct al DataFrame filtrado
df_products_distinct = df_products_distinct.distinct()

df_products_distinct.show(5, truncate=False)
'''

'\n################################################################################\n# Obtener los productos\n################################################################################\n\n# Aplica el filtro al DataFrame\ndf_products_distinct = df_datos.filter(\n    (col("product_name").like("%seitan%")) |\n    (col("product_name").like("%carne vegetal%")) |\n    (col("product_name").like("%vegetable meat%")) |\n    (col("product_name").like("%viande vegetale%")) |\n    (col("product_name").like("%tofu%")) |\n    (col("product_name").like("%soja%")) |\n    (col("product_name").like("%soya%"))\n)\n\n# Aplica distinct al DataFrame filtrado\ndf_products_distinct = df_products_distinct.distinct()\n\ndf_products_distinct.show(5, truncate=False)\n'

In [11]:
################################################################################
# Número de registros
################################################################################

# Realizar el conteo de registros en el DataFrame
count = df_products_distinct.count()

# Imprimir el resultado
print("Número de registros: ", count)

Número de registros:  14745


In [12]:
################################################################################
# Convertir el DataFrame de PySpark a un DataFrame de pandas
################################################################################
pandas_df = df_products_distinct.toPandas()

################################################################################
# Inference los tipos de datos automáticamente
################################################################################
pandas_df = pandas_df.infer_objects()

################################################################################
# Imprimir los tipos de datos de cada columna
################################################################################
print(pandas_df.dtypes)

code                         float64
url                           object
creator                       object
created_t                      int32
created_datetime      datetime64[ns]
                           ...      
choline_100g                 float64
phylloquinone_100g           float64
beta-glucan_100g             float64
inositol_100g                float64
carnitine_100g               float64
Length: 201, dtype: object


In [13]:
################################################################################
# Generar lista de exclusiones
################################################################################

lista_exclude = ["salsa de soja","sauce soja","sauce soja","bebida de soja","yaourt soja","yogurt de soja","galletas","chocolat","spaghetti","arroz y soja","leche","mousse","milk","aceite","dessert","pan soja","boisson soja","glace","sauce de soja","sauce","salsa","bibeda de soja","lait soja","vivesoy soja","lait de soja","yaourt","postre","muesli","yogur","bebida","margarina","vinaigrette"]
idiomas = ['en', 'pt','de','ca','vi','tl','tr','id','so','sw','hr','hu','cy','no','fr','es','sv','pl','ro','lt','fa']
full_exclude = []

for frase in lista_exclude:
  full_exclude.append(frase)
  for idioma in idiomas:
    translated = GoogleTranslator(source='spanish', target=str(idioma)).translate(str(frase))
    full_exclude.append(translated)

# Obtener los elementos distintos de la lista
full_exclude_distinct = list(set(full_exclude))

print(full_exclude_distinct)

['uống', 'soia vivesoy', 'szójabab', 'sütiket', 'shukulaatada', 'piće', 'reis a soi', 'szója szósz', 'mì ống Ý', 'sojų duona', 'arroz y soja', 'bebida de soja', 'sauce de soja', 'گلس', 'margarine', 'Sirke', 'margarina', 'boisson au soja', 'desertas', 'minyak', 'bánh xốp', 'Lait Soja', 'postres', 'ماست سویا', 'giấm', 'lait de soia', 'sos de soia', 'vide', 'boisson soi', 'jogurtas', 'nước đậu nành', 'sjokolade', 'csokoládé', 'saliid', 'putėsiai', 'Milch', 'panirimas', 'kanin at toyo', 'soia boisson', 'muraayad', 'soya fasulyesi', 'شیب', 'سویا vivesoy', 'jūsų sojos pupelės', 'băng keo', 'موسلی', 'vivesoy soy', 'diod soi', 'fűzfa', 'pohon willow', 'maraqa soy', 'kinywaji cha soya', 'leit soja', 'سویا بویسون', 'milk', 'tej', 'yaourt soyabønner', 'desszert', 'soya yoğurdu', 'salcie de soia', 'sojapil', 'vinagrete', 'sữa chua', 'salsa de soja', 'mleko', 'سویا یاورت', 'Margarina', 'weka soya', 'caano', 'biscoitos', 'yourt', 'soia yaourt', 'arroz e soja', 'mjölk', 'Weide', 'susu', 'beguda', 'me

In [14]:
# Crear una máscara booleana para identificar las filas que deben eliminarse
mask = pandas_df['product_name'].str.contains('|'.join(full_exclude_distinct))

# Eliminar las filas utilizando la máscara
pandas_df.drop(pandas_df[mask].index, inplace=True)

# Reiniciar los índices del DataFrame resultante
pandas_df.reset_index(drop=True, inplace=True)

# Mostrar el DataFrame resultante
print(pandas_df.shape)
pandas_df.head()

(9605, 201)


Unnamed: 0,code,url,creator,created_t,created_datetime,last_modified_t,last_modified_datetime,last_modified_by,product_name,abbreviated_product_name,...,carbon-footprint-from-meat-or-fish_100g,nutrition-score-fr_100g,nutrition-score-uk_100g,glycemic-index_100g,water-hardness_100g,choline_100g,phylloquinone_100g,beta-glucan_100g,inositol_100g,carnitine_100g
0,5015.0,http://world-en.openfoodfacts.org/product/0000...,kiliweb,1597688122,2020-08-17 18:15:22,1597688123,2020-08-17 18:15:23,kiliweb,organic pesto with tofu,,...,,,,,,,,,,
1,1522.0,http://world-en.openfoodfacts.org/product/0000...,eduardo,1586365881,2020-04-08 17:11:21,1644514901,2022-02-10 17:41:41,packbot,seitan a la plancha,,...,,,,,,,,,,
2,460938714.0,http://world-en.openfoodfacts.org/product/0000...,allfitnessfactory-de,1483097990,2016-12-30 11:39:50,1644575592,2022-02-11 10:33:12,packbot,100% soja protein haselnuss,,...,,,,,,,,,,
3,503500014.0,http://world-en.openfoodfacts.org/product/0000...,foodvisor,1649165957,2022-04-05 13:39:17,1649165957,2022-04-05 13:39:17,foodvisor,tofu nature,,...,,,,,,,,,,
4,6539.0,http://world-en.openfoodfacts.org/product/0000...,foodvisor,1631471233,2021-09-12 18:27:13,1631471233,2021-09-12 18:27:13,foodvisor,tofu fume,,...,,,,,,,,,,


In [15]:
################################################################################
#En este paso se convienten los datos contenidos en los dataframe a mayúsculas,
# esto con el fin de estandarizarlos.
################################################################################

pandas_df = datos_a_mayusculas(pandas_df)

################################################################################
# Se eliminan las filas en las que todos los valores son nulos,
# ya que se trata de filas que no contribuyen en los análisis que se harán en las etapas siguientes.
################################################################################

pandas_df = eliminar_nulos(pandas_df)

################################################################################
# identificar las columnas que son de tipo string y las columnas que son numéricas.
################################################################################

# Identificar columnas que son de tipo string
str_cols = pandas_df.select_dtypes(include=['object']).columns

# Identificar columnas que son numéricas
num_cols = pandas_df.select_dtypes(include=['float64', 'int64']).columns

################################################################################
#Se reemplazan los casos en que existe un NaN en cada uno de los dataframe creados por un dato:
# - "vacío" cuando es String.
# - "0" cuando es Numérica.
################################################################################

# Reemplazar NaN en columnas de tipo string con ''
pandas_df[str_cols] = pandas_df[str_cols].fillna('')

# Reemplazar NaN en columnas numéricas con 0
pandas_df[num_cols] = pandas_df[num_cols].fillna(0)

################################################################################
#Se eliminan en este paso los datos duplicados, si los hubiera, en el dataframe.
################################################################################

pandas_df = eliminar_duplicados(pandas_df)

################################################################################
# Con el fin de no tener problemas al momento de invocar las columnas,
# se estandarizan los nombres que actualmente existen en el dataframe.
################################################################################

pandas_df.columns = reemplazar_caracteres(pandas_df.columns)

In [16]:
################################################################################
# Selección de columnas
################################################################################

# Seleccionamos las columnas que tengas 100 en su nombre
columnas_100G = [col for col in pandas_df.columns if '100' in col]

# Creamos un array axuliar para obtener las columnas que tengas 100 en su nombre
df_aux100 = pandas_df[columnas_100G]

# Seleccionar columnas cuya suma de filas es mayor a 0
columnas_suma_mayor_0 = df_aux100.columns[df_aux100.sum(axis=0) > 0]

# Columnas adicionales y complementarias al analisis
columnas_adicionales = ['PRODUCT_NAME', 'ECOSCORE_GRADE', 'ECOSCORE_SCORE', 'NUTRISCORE_GRADE', 'NUTRISCORE_SCORE', 'COUNTRIES_EN']
# Pais a ISO3-2
columnas_adicionales.extend(columnas_suma_mayor_0)
columnas_adicionales

# Dataframe final
pandas_df = pandas_df[columnas_adicionales]

In [17]:
################################################################################
# ISO3 y Continente de pais
################################################################################
# Quedarse solo con el primer país en la columna 'COUNTRIES_EN'
pandas_df['COUNTRIES_EN'] = pandas_df['COUNTRIES_EN'].str.split(',').str[0]
# Nomalización de nombres de paises
pandas_df['COUNTRIES_EN'] = pandas_df['COUNTRIES_EN'].apply(standardize_country_name)
# ISO3 Por pais
pandas_df['ISO3'] = pandas_df['COUNTRIES_EN'].apply(get_iso3)

In [18]:
################################################################################
# Variables categoricas a numericas
################################################################################

# Inicializar el codificador
label_encoder = LabelEncoder()

# Codificar las variables categóricas en el dataframe
pandas_df['ECOSCORE_GRADE'] = label_encoder.fit_transform(pandas_df['ECOSCORE_GRADE'])
pandas_df['NUTRISCORE_GRADE'] = label_encoder.fit_transform(pandas_df['NUTRISCORE_GRADE'])

In [19]:
'''
################################################################################
# Escalar datos
################################################################################

# Inicializar el escalador
scaler = MinMaxScaler()

# Identificar columnas que son numéricas
num_cols_df = pandas_df.select_dtypes(include=['float64', 'int64']).columns

# Escalar las columnas numéricas en el dataframe
pandas_df[num_cols_df] = scaler.fit_transform(pandas_df[num_cols_df])

# Verificar el resultado
pandas_df.head(3)
'''

"\n################################################################################\n# Escalar datos\n################################################################################\n\n# Inicializar el escalador\nscaler = MinMaxScaler()\n\n# Identificar columnas que son numéricas\nnum_cols_df = pandas_df.select_dtypes(include=['float64', 'int64']).columns\n\n# Escalar las columnas numéricas en el dataframe\npandas_df[num_cols_df] = scaler.fit_transform(pandas_df[num_cols_df])\n\n# Verificar el resultado\npandas_df.head(3)\n"

In [20]:
################################################################################
# PIB (US$ a precios actuales)
################################################################################

####################################################
# Lectura de datos desde Banco mundial
####################################################
df_m_bank_pib = pd.read_excel("https://api.worldbank.org/v2/es/indicator/NY.GDP.MKTP.CD?downloadformat=excel", skiprows=3, decimal=',')

####################################################
# Selecionar datos no vacios
####################################################
df_m_bank_pib = df_m_bank_pib[df_m_bank_pib['Country Name'].notnull()].reset_index(drop=True)

####################################################
# Reemplazamos NaN por ''
####################################################
df_m_bank_pib.fillna(0, inplace = True)

####################################################
# Inference los tipos de datos automáticamente
####################################################
df_m_bank_pib = df_m_bank_pib.infer_objects()

In [21]:
####################################################
# Datos PIB
####################################################
df_m_bank_pib_year = df_m_bank_pib[['Country Name', 'Country Code','2020']].reset_index(drop=True)
df_m_bank_pib_year.rename(columns={"2020": "PIB", 'Country Name' : 'COUNTRIES', 'Country Code': 'ISO'}, inplace=True)
df_m_bank_pib_year['PIB'] = df_m_bank_pib_year['PIB'].astype('float64')
df_m_bank_pib_year['PIB'] = round(df_m_bank_pib_year['PIB'], 0)

####################################################
# Merge de dataframes
####################################################
pandas_df = pd.merge(left=df_m_bank_pib_year, right=pandas_df, how='right', left_on='ISO', right_on='ISO3')

# Eliminar columna ISO
pandas_df = pandas_df.drop(columns=['ISO', 'COUNTRIES'])

# Convertir los nulos a 0
pandas_df['PIB'].fillna(0, inplace = True)

pandas_df.head()

Unnamed: 0,PIB,PRODUCT_NAME,ECOSCORE_GRADE,ECOSCORE_SCORE,NUTRISCORE_GRADE,NUTRISCORE_SCORE,COUNTRIES_EN,ENERGY_KJ_100G,ENERGY_KCAL_100G,ENERGY_100G,...,FRUITS_VEGETABLES_NUTS_100G,FRUITS_VEGETABLES_NUTS_DRIED_100G,FRUITS_VEGETABLES_NUTS_ESTIMATE_100G,FRUITS_VEGETABLES_NUTS_ESTIMATE_FROM_INGREDIENTS_100G,COCOA_100G,CARBON_FOOTPRINT_100G,CARBON_FOOTPRINT_FROM_MEAT_OR_FISH_100G,NUTRITION_SCORE_FR_100G,PHYLLOQUINONE_100G,ISO3
0,2639009000000.0,ORGANIC PESTO WITH TOFU,7,0.0,0,0.0,France,0.0,642.0,2686.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,FRA
1,1276963000000.0,SEITAN A LA PLANCHA,2,79.0,0,0.0,Spain,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,ESP
2,3889669000000.0,100% SOJA PROTEIN HASELNUSS,7,0.0,0,0.0,Germany,1590.0,0.0,1590.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,DEU
3,21060470000000.0,TOFU NATURE,7,0.0,0,0.0,United States,0.0,129.100006,540.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,USA
4,21060470000000.0,TOFU FUME,7,0.0,0,0.0,United States,0.0,154.0,644.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,USA


In [22]:
'''
# Eliminar las filas con valores igual a cero en la columna "Values"
pandas_df = pandas_df.loc[pandas_df['PIB'] != 0]

# Reiniciar los índices del DataFrame resultante
pandas_df.reset_index(drop=True, inplace=True)
pandas_df['PIB'].value_counts().sort_index()
'''

'\n# Eliminar las filas con valores igual a cero en la columna "Values"\npandas_df = pandas_df.loc[pandas_df[\'PIB\'] != 0]\n\n# Reiniciar los índices del DataFrame resultante\npandas_df.reset_index(drop=True, inplace=True)\npandas_df[\'PIB\'].value_counts().sort_index()\n'

In [23]:
################################################################################
# Ordenar nombres de columnas
################################################################################

# Obtener el nombre de las columnas
columnas = pandas_df.columns.tolist()

# Ordenar las columnas por nombre, manteniendo 'PRODUCT_NAME' como primera columna
columnas_ordenadas = sorted(columnas, key=lambda x: (x != 'PRODUCT_NAME', x))

# Crear un nuevo DataFrame con las columnas ordenadas
pandas_df = pandas_df[columnas_ordenadas]

pandas_df.head(3)

Unnamed: 0,PRODUCT_NAME,ADDED_SUGARS_100G,ALCOHOL_100G,ALPHA_LINOLENIC_ACID_100G,BIOTIN_100G,BUTYRIC_ACID_100G,CALCIUM_100G,CARBOHYDRATES_100G,CARBON_FOOTPRINT_100G,CARBON_FOOTPRINT_FROM_MEAT_OR_FISH_100G,...,VITAMIN_B1_100G,VITAMIN_B2_100G,VITAMIN_B6_100G,VITAMIN_B9_100G,VITAMIN_C_100G,VITAMIN_D_100G,VITAMIN_E_100G,VITAMIN_K_100G,VITAMIN_PP_100G,ZINC_100G
0,ORGANIC PESTO WITH TOFU,0.0,0.0,0.0,0.0,0.0,0.0,12.2,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,SEITAN A LA PLANCHA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,100% SOJA PROTEIN HASELNUSS,0.0,0.0,0.0,0.0,0.0,0.0,1.1,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [24]:
pandas_df['COUNTRIES_EN'] = pandas_df['COUNTRIES_EN'].str.upper()

In [25]:
pandas_df.count()

PRODUCT_NAME                 9605
ADDED_SUGARS_100G            9605
ALCOHOL_100G                 9605
ALPHA_LINOLENIC_ACID_100G    9605
BIOTIN_100G                  9605
                             ... 
VITAMIN_D_100G               9605
VITAMIN_E_100G               9605
VITAMIN_K_100G               9605
VITAMIN_PP_100G              9605
ZINC_100G                    9605
Length: 78, dtype: int64

In [26]:
################################################################################
# Exportar el DataFrame de pandas como un archivo CSV
################################################################################
pandas_df.to_csv("/content/drive/MyDrive/Datos TFM/alimentos.csv", index=False)

# Seperacion de alimentos

In [27]:
################################################################################
# SEITAN
################################################################################

lista_include = ['seitan', 'carne vegetal']
idiomas = ['en', 'pt','de','ca','vi','tl','tr','id','so','sw','hr','hu','cy','no','fr','es','sv','pl','ro','lt','fa']
full_include = []

for frase in lista_include:
  full_include.append(frase)
  for idioma in idiomas:
    translated = GoogleTranslator(source='spanish', target=str(idioma)).translate(str(frase))
    full_include.append(translated)

# Obtener los elementos distintos de la lista
seitan_names = list(set(full_include))

print(seitan_names)

['thịt trắng', 'daging sayur', 'mięso warzywne', 'seiten', 'karne ng gulay', 'vegetabilsk kjøtt', 'seitan', 'vegetable meat', 'سیتان', 'hilibka khudradda', 'vegetabiliskt kött', 'carne de legume', 'carne vegetal', 'sebze eti', 'viande végétale', 'seitanas', 'Gemüsefleisch', 'cig llysiau', 'carn vegetal', 'szeitán', 'thịt rau', 'növényi hús', 'daržovių mėsa', 'nyama ya mboga', 'biljno meso', 'گوشت سبزیجات', 'Seitan']


In [30]:
# Filtro
filtro_seitan = '|'.join(seitan_names)

# Filtrar el DataFrame utilizando la expresión regular
df_filtrado_seitan = pandas_df[pandas_df['PRODUCT_NAME'].str.contains(filtro_seitan, case=False, regex=True)]


df_filtrado_seitan = df_filtrado_seitan.reset_index(drop=True)
df_filtrado_seitan

Unnamed: 0,PRODUCT_NAME,ADDED_SUGARS_100G,ALCOHOL_100G,ALPHA_LINOLENIC_ACID_100G,BIOTIN_100G,BUTYRIC_ACID_100G,CALCIUM_100G,CARBOHYDRATES_100G,CARBON_FOOTPRINT_100G,CARBON_FOOTPRINT_FROM_MEAT_OR_FISH_100G,...,VITAMIN_B1_100G,VITAMIN_B2_100G,VITAMIN_B6_100G,VITAMIN_B9_100G,VITAMIN_C_100G,VITAMIN_D_100G,VITAMIN_E_100G,VITAMIN_K_100G,VITAMIN_PP_100G,ZINC_100G
0,SEITAN A LA PLANCHA,0.0,0.0,0.0,0.0,0.0,0.0,0.0000,0.0,0.0,...,0.0,0.000000,0.000000,0.0,0.0000,0.0,0.0,0.0,0.000000,0.00000
1,TRADITIONAL SEITAN,0.0,0.0,0.0,0.0,0.0,0.0,6.1404,0.0,0.0,...,0.0,0.000000,0.000000,0.0,0.0000,0.0,0.0,0.0,0.000000,0.00000
2,TRADITIONAL SEITAN,0.0,0.0,0.0,0.0,0.0,0.0,4.4200,0.0,0.0,...,0.0,0.002106,0.000796,0.0,0.0000,0.0,0.0,0.0,0.040708,0.00133
3,TRADITIONAL SEITAN,0.0,0.0,0.0,0.0,0.0,0.0,7.0588,0.0,0.0,...,0.0,0.000000,0.000000,0.0,0.0000,0.0,0.0,0.0,0.000000,0.00000
4,CHIPOTLE STYLE SEITAN,0.0,0.0,0.0,0.0,0.0,0.0,7.9600,0.0,0.0,...,0.0,0.002106,0.000796,0.0,0.0011,0.0,0.0,0.0,0.042478,0.00000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
760,"BIO BURGER VEGETAL SEITAN, ALGAS Y BERENJENAS",0.0,0.0,0.0,0.0,0.0,0.0,0.0000,0.0,0.0,...,0.0,0.000000,0.000000,0.0,0.0000,0.0,0.0,0.0,0.000000,0.00000
761,"BIO BURGER VEGETAL SEITAN, ALGAS Y BERENJENA",0.0,0.0,0.0,0.0,0.0,0.0,0.0000,0.0,0.0,...,0.0,0.000000,0.000000,0.0,0.0000,0.0,0.0,0.0,0.000000,0.00000
762,"BIO BURGER VEGETAL SEITAN, ALGAS Y BERENJENAS",0.0,0.0,0.0,0.0,0.0,0.0,0.0000,0.0,0.0,...,0.0,0.000000,0.000000,0.0,0.0000,0.0,0.0,0.0,0.000000,0.00000
763,"BIO BURGER VEGETAL CON SEITAN, ALGAS Y BERENJENAS",0.0,0.0,0.0,0.0,0.0,0.0,0.0000,0.0,0.0,...,0.0,0.000000,0.000000,0.0,0.0000,0.0,0.0,0.0,0.000000,0.00000


In [28]:
################################################################################
# TOFU
################################################################################

lista_include = ['tofu']
idiomas = ['en', 'pt','de','ca','vi','tl','tr','id','so','sw','hr','hu','cy','no','fr','es','sv','pl','ro','lt','fa']
full_include = []

for frase in lista_include:
  full_include.append(frase)
  for idioma in idiomas:
    translated = GoogleTranslator(source='spanish', target=str(idioma)).translate(str(frase))
    full_include.append(translated)

# Obtener los elementos distintos de la lista
tofu_names = list(set(full_include))

print(tofu_names)

['Tahu', 'توفو', 'tokwa', 'tofu', 'Tofu', 'soya peyniri', 'đậu hũ']


In [31]:
# Filtro
filtro_tofu = '|'.join(tofu_names)

# Filtrar el DataFrame utilizando la expresión regular
df_filtrado_tofu = pandas_df[pandas_df['PRODUCT_NAME'].str.contains(filtro_tofu, case=False, regex=True)]


df_filtrado_tofu = df_filtrado_tofu.reset_index(drop=True)
df_filtrado_tofu

Unnamed: 0,PRODUCT_NAME,ADDED_SUGARS_100G,ALCOHOL_100G,ALPHA_LINOLENIC_ACID_100G,BIOTIN_100G,BUTYRIC_ACID_100G,CALCIUM_100G,CARBOHYDRATES_100G,CARBON_FOOTPRINT_100G,CARBON_FOOTPRINT_FROM_MEAT_OR_FISH_100G,...,VITAMIN_B1_100G,VITAMIN_B2_100G,VITAMIN_B6_100G,VITAMIN_B9_100G,VITAMIN_C_100G,VITAMIN_D_100G,VITAMIN_E_100G,VITAMIN_K_100G,VITAMIN_PP_100G,ZINC_100G
0,ORGANIC PESTO WITH TOFU,0.0,0.0,0.0,0.0,0.0,0.000,12.200000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,TOFU NATURE,0.0,0.0,0.0,0.0,0.0,0.000,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,TOFU FUME,0.0,0.0,0.0,0.0,0.0,0.000,2.100000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,TOFU FIRM,0.0,0.0,0.0,0.0,0.0,0.127,2.530000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,TOFU DE GRAINES DE CITROUILLES ORIGINALE,0.0,0.0,0.0,0.0,0.0,0.000,6.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2832,TOFU NATURE BIO,0.0,0.0,0.0,0.0,0.0,0.000,1.200000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2833,TOFU FUME BIO,0.0,0.0,0.0,0.0,0.0,0.000,2.200000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2834,"ORGANIC TOFU, EXTRA FIRM",0.0,0.0,0.0,0.0,0.0,0.000,3.571429,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2835,TOFU,0.0,0.0,0.0,0.0,0.0,0.000,2.700000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [29]:
################################################################################
# SOJA
################################################################################

lista_include = ['soja']
idiomas = ['en', 'pt','de','ca','vi','tl','tr','id','so','sw','hr','hu','cy','no','fr','es','sv','pl','ro','lt','fa']
full_include = []

for frase in lista_include:
  full_include.append(frase)
  for idioma in idiomas:
    translated = GoogleTranslator(source='spanish', target=str(idioma)).translate(str(frase))
    full_include.append(translated)

# Obtener los elementos distintos de la lista
soja_names = list(set(full_include))

print(soja_names)

['kedelai', 'toyo', 'soi', 'soia', 'đậu nành', 'sojos', 'szója', 'سویا', 'soya', 'soy', 'soja', 'Soja']


In [32]:
# Filtro
filtro_soja = '|'.join(soja_names)

# Filtrar el DataFrame utilizando la expresión regular
df_filtrado_soja = pandas_df[pandas_df['PRODUCT_NAME'].str.contains(filtro_soja, case=False, regex=True)]


df_filtrado_soja = df_filtrado_soja.reset_index(drop=True)
df_filtrado_soja

Unnamed: 0,PRODUCT_NAME,ADDED_SUGARS_100G,ALCOHOL_100G,ALPHA_LINOLENIC_ACID_100G,BIOTIN_100G,BUTYRIC_ACID_100G,CALCIUM_100G,CARBOHYDRATES_100G,CARBON_FOOTPRINT_100G,CARBON_FOOTPRINT_FROM_MEAT_OR_FISH_100G,...,VITAMIN_B1_100G,VITAMIN_B2_100G,VITAMIN_B6_100G,VITAMIN_B9_100G,VITAMIN_C_100G,VITAMIN_D_100G,VITAMIN_E_100G,VITAMIN_K_100G,VITAMIN_PP_100G,ZINC_100G
0,100% SOJA PROTEIN HASELNUSS,0.0,0.0,0.0,0.0,0.0,0.0,1.100000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,PROTEINES DE SOJA ISOLAT BULK,0.0,0.0,0.0,0.0,0.0,0.0,6.100000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,SOYA ROTI SALE,0.0,0.0,0.0,0.0,0.0,0.0,28.571429,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,SOY CREAMER,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,SOYA PROTEIN ISOLATE,0.0,0.0,0.0,0.0,0.0,0.0,0.800000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6098,ДЕСЕРТ ALPRO SOYA ВАНИЛЬНЫЙ,0.0,0.0,0.0,0.0,0.0,0.0,12.700000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6099,ПРОТЕИН OPTIMUM 100% SOY PROTEIN,0.0,0.0,0.0,0.0,0.0,0.0,6.500000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6100,ПРОТЕИН PUREPROTEIN SOY ISOLATE НАТУРАЛЬНЫЙ ВКУС,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6101,ПРОТЕИН PUREPROTEIN SOY ISOLATE ШОКОЛАДНОЕ ПЕЧ...,0.0,0.0,0.0,0.0,0.0,0.0,15.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [33]:
################################################################################
# Exportar el DataFrame de pandas como un archivo CSV
################################################################################
df_filtrado_soja.to_csv("/content/drive/MyDrive/Datos TFM/soja.csv", index=False)
df_filtrado_tofu.to_csv("/content/drive/MyDrive/Datos TFM/tofu.csv", index=False)
df_filtrado_seitan.to_csv("/content/drive/MyDrive/Datos TFM/seitan.csv", index=False)