In [1]:
################################################################################
# Preparar entorno de trabajo
################################################################################
!pip install pyspark
!pip install scikit-learn

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark
  Downloading pyspark-3.4.0.tar.gz (310.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.8/310.8 MB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.4.0-py2.py3-none-any.whl size=311317130 sha256=6330b68381be65a0b22ce4df637d7b798430e72ae8f553eb15efe024631efdfa
  Stored in directory: /root/.cache/pip/wheels/7b/1b/4b/3363a1d04368e7ff0d408e57ff57966fcdf00583774e761327
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.4.0
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
################################################################################
# Preparar entorno de trabajo
################################################################################
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# IGNORAR WARNINGS
import warnings
warnings.filterwarnings('ignore')

# Importar datos desde Drive
from google.colab import drive
drive.mount('/content/drive')

# Pandas
import pandas as pd

# sklearn
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

Mounted at /content/drive


In [3]:
################################################################################
# CREAMOS EL SPARK SESSION
################################################################################
spark = SparkSession.builder.appName("Nutricion").getOrCreate()

In [4]:
################################################################################
# Datos Open Food Facts
# URL = https://static.openfoodfacts.org/data/en.openfoodfacts.org.products.csv
################################################################################
df_datos = spark.read.options(header="True", inferSchema='True', delimiter='\t').csv("/content/drive/MyDrive/Datos TFM/en.openfoodfacts.org.products.csv")

In [5]:
################################################################################
# DATAFRAME A SQL
################################################################################
df_datos.createOrReplaceTempView("openfoodfacts")

In [6]:
################################################################################
# Cantidad de productos
################################################################################
#df_datos.select(df_datos.product_name).count()

In [7]:
################################################################################
# Cantidad de productos distintos
################################################################################
#df_datos.select(df_datos.product_name).distinct().count()

In [8]:
################################################################################
# Schema de datos
################################################################################
#df_datos.printSchema()

In [9]:
################################################################################
# Describir cada una de las columnas y determinar si son cualitativas o cuantitativas
################################################################################

# Obtén el esquema del DataFrame
schema = df_datos.schema

# Crea una lista vacía para almacenar los nombres de las columnas
column_names = []

# Crea dos listas vacías para almacenar los tipos de datos y los tipos de variable de cada columna
data_types = []
variable_types = []

# Itera sobre cada columna del esquema
for field in schema.fields:
    # Obtén el nombre de la columna
    column_name = field.name
    # Agrega el nombre de la columna a la lista correspondiente
    column_names.append(column_name)

    # Obtén el tipo de datos de la columna
    data_type = str(field.dataType)

    # Agrega el tipo de datos a la lista correspondiente
    data_types.append(data_type)

    # Determina si la columna es cualitativa o cuantitativa
    if data_type.startswith("StringType"):
        variable_type = "Cualitativa"
    else:
        variable_type = "Cuantitativa"
    # Agrega el tipo de variable a la lista correspondiente
    variable_types.append(variable_type)

# Crea un DataFrame para mostrar los resultados
result_df = spark.createDataFrame(zip(column_names, data_types, variable_types), ["Columna", "Tipo de datos", "Tipo de variable"])
result_df.show(n=300, truncate=False, vertical=False)

+-----------------------------------------------------+---------------+----------------+
|Columna                                              |Tipo de datos  |Tipo de variable|
+-----------------------------------------------------+---------------+----------------+
|code                                                 |DoubleType()   |Cuantitativa    |
|url                                                  |StringType()   |Cualitativa     |
|creator                                              |StringType()   |Cualitativa     |
|created_t                                            |IntegerType()  |Cuantitativa    |
|created_datetime                                     |TimestampType()|Cuantitativa    |
|last_modified_t                                      |IntegerType()  |Cuantitativa    |
|last_modified_datetime                               |TimestampType()|Cuantitativa    |
|last_modified_by                                     |StringType()   |Cualitativa     |
|product_name        

In [10]:
################################################################################
# Obtener los productos de tipo:
# - Seitan
# - Soja texturizada
# - Tofu
################################################################################
df_products_distinct = spark.sql('''SELECT DISTINCT * FROM openfoodfacts
                                                    WHERE
                                                        product_name LIKE "%seitan%" OR product_name LIKE "%tofu%" OR
                                                        (product_name LIKE "%soja%" AND product_name LIKE "%tex%") ''')

df_products_distinct.show(truncate=False)

+---------------+-------------------------------------------------------------------------------------------------------------------------+-----------------+----------+-------------------+---------------+----------------------+-----------------+---------------------------------------------------------+------------------------+------------+--------+--------------------+-----------------------+--------------------+--------------+------------------------+----------------------+-----------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------+-------+------------+----------+--------------------+-------------------------+-----------------------------------------+--------------------------------

In [11]:
################################################################################
# Obtener los productos de tipo:
# - Veganos y Vegetarianos
################################################################################
df_products_vegan = spark.sql('''SELECT DISTINCT * FROM openfoodfacts
                                                    WHERE
                                                        categories LIKE '%vegan%' OR labels_en LIKE "%vegetarian%" OR labels_en LIKE "%vegan%" OR
                                                        main_category LIKE "%vegetarian%" OR main_category LIKE "%vegetarien%" OR main_category LIKE "%vegan%" ''')
df_products_vegan.show(truncate=False)

+---------------+-------------------------------------------------------------------------------------------------------------------------------+--------------------------+----------+-------------------+---------------+----------------------+-------------------------+--------------------------------------------------------+------------------------+--------------------------+------------------------------+---------------------------------------+--------------------------------------------+--------------------------------------+--------------+-------------------------------------------------+------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------

In [12]:
#df_products_vegan.select(df_products_vegan.product_name).distinct().count()

In [13]:
################################################################################
# Convertir el DataFrame de PySpark a un DataFrame de pandas
################################################################################
pandas_df = df_products_distinct.toPandas()

In [14]:
################################################################################
# Funciones de limpieza de datos (DataFrame Pandas)
################################################################################
def datos_a_mayusculas(df):

    df = df.apply(lambda x: x.str.upper() if x.dtype == "object" else x)

    return df
################################################################################
################################################################################
################################################################################
def eliminar_nulos(df):
    df = df.dropna(axis=0, how='all')

    return df
################################################################################
################################################################################
################################################################################
def eliminar_duplicados(df):
    df = df.drop_duplicates(subset=None, keep='first', inplace=False, ignore_index=False)

    return df
################################################################################
################################################################################
################################################################################
def reemplazar_caracteres(df):
    df = df.str.upper()
    df = df.str.replace('/',' ')
    df = df.str.replace(',',' ')
    df = df.str.replace('-',' ')
    df = df.str.replace(' ','_')
    df = df.str.replace('Á','A')
    df = df.str.replace('É','E')
    df = df.str.replace('Í','I')
    df = df.str.replace('Ó','O')
    df = df.str.replace('Ú','U')
    df = df.str.replace('Ü','U')
    df = df.str.replace('Ñ','N')

    return df

In [15]:
################################################################################
#En este paso se convienten los datos contenidos en los dataframe a mayúsculas,
# esto con el fin de estandarizarlos.
################################################################################

pandas_df = datos_a_mayusculas(pandas_df)

In [16]:
################################################################################
# Se eliminan las filas en las que todos los valores son nulos,
# ya que se trata de filas que no contribuyen en los análisis que se harán en las etapas siguientes.
################################################################################

pandas_df = eliminar_nulos(pandas_df)

In [17]:
################################################################################
# identificar las columnas que son de tipo string y las columnas que son numéricas.
################################################################################

# Identificar columnas que son de tipo string
str_cols = pandas_df.select_dtypes(include=['object']).columns

# Identificar columnas que son numéricas
num_cols = pandas_df.select_dtypes(include=['float64', 'int64']).columns

In [18]:
################################################################################
#Se reemplazan los casos en que existe un NaN en cada uno de los dataframe creados por un dato:
# - "vacío" cuando es String.
# - "0" cuando es Numérica.
################################################################################
# Reemplazar NaN en columnas de tipo string con ''
pandas_df[str_cols] = pandas_df[str_cols].fillna('')

# Reemplazar NaN en columnas numéricas con 0
pandas_df[num_cols] = pandas_df[num_cols].fillna(0)

In [19]:
################################################################################
#Se eliminan en este paso los datos duplicados, si los hubiera, en el dataframe.
################################################################################

pandas_df = eliminar_duplicados(pandas_df)

In [20]:
################################################################################
# Con el fin de no tener problemas al momento de invocar las columnas,
# se estandarizan los nombres que actualmente existen en el dataframe.
################################################################################

pandas_df.columns = reemplazar_caracteres(pandas_df.columns)

In [21]:
################################################################################
# DataFrame Pre-Procesado
################################################################################
#pandas_df

In [22]:
################################################################################
# Contador de datos por columna
################################################################################

#pandas_df_count = pandas_df.count()
#pandas_df_count

In [23]:
#pandas_df.drop('CODE', inplace=True, axis=1)
#pandas_df.drop('CREATOR', inplace=True, axis=1)
#pandas_df.drop('CREATED_T', inplace=True, axis=1)
#pandas_df.drop('CREATED_DATETIME', inplace=True, axis=1)
#pandas_df.drop('LAST_MODIFIED_T', inplace=True, axis=1)
#pandas_df.drop('LAST_MODIFIED_DATETIME', inplace=True, axis=1)
#pandas_df.drop('LAST_MODIFIED_BY', inplace=True, axis=1)
#pandas_df.drop('URL', inplace=True, axis=1)
#pandas_df.drop('ABBREVIATED_PRODUCT_NAME', inplace=True, axis=1)
#pandas_df.drop('GENERIC_NAME', inplace=True, axis=1)
#pandas_df['INGREDIENTS_TAGS'].unique()

In [24]:
################################################################################
# Selección de columnas
################################################################################

# Seleccionamos las columnas que tengas 100 en su nombre
columnas_100G = [col for col in pandas_df.columns if '100' in col]

# Creamos un array axuliar para obtener las columnas que tengas 100 en su nombre
df_aux100 = pandas_df[columnas_100G]

# Seleccionar columnas cuya suma de filas es mayor a 0
columnas_suma_mayor_0 = df_aux100.columns[df_aux100.sum(axis=0) > 0]

# Columnas adicionales y complementarias al analisis
columnas_adicionales = ['PRODUCT_NAME', 'CREATOR', 'INGREDIENTS_TEXT', 'INGREDIENTS_TAGS', 'ECOSCORE_GRADE', 'ECOSCORE_SCORE', 'NUTRISCORE_GRADE', 'NUTRISCORE_SCORE', 'FOOD_GROUPS_EN', 'ADDITIVES_EN', 'TRACES_EN', 'ALLERGENS']
columnas_adicionales.extend(columnas_suma_mayor_0)
columnas_adicionales

# Dataframe final
pandas_df = pandas_df[columnas_adicionales]
#pandas_df

In [30]:
################################################################################
# Variables categoricas a numericas
################################################################################

# Inicializar el codificador
label_encoder = LabelEncoder()

# Codificar las variables categóricas en el dataframe
pandas_df['INGREDIENTS_TEXT_encoded'] = label_encoder.fit_transform(pandas_df['INGREDIENTS_TEXT'])
pandas_df['INGREDIENTS_TAGS_encoded'] = label_encoder.fit_transform(pandas_df['INGREDIENTS_TAGS'])
pandas_df['ECOSCORE_GRADE_encoded'] = label_encoder.fit_transform(pandas_df['ECOSCORE_GRADE'])
pandas_df['NUTRISCORE_GRADE_encoded'] = label_encoder.fit_transform(pandas_df['NUTRISCORE_GRADE'])
pandas_df['FOOD_GROUPS_EN_encoded'] = label_encoder.fit_transform(pandas_df['FOOD_GROUPS_EN'])
pandas_df['ADDITIVES_EN_encoded'] = label_encoder.fit_transform(pandas_df['ADDITIVES_EN'])
pandas_df['TRACES_EN_encoded'] = label_encoder.fit_transform(pandas_df['TRACES_EN'])
pandas_df['ALLERGENS_encoded'] = label_encoder.fit_transform(pandas_df['ALLERGENS'])
pandas_df['CREATOR_encoded'] = label_encoder.fit_transform(pandas_df['CREATOR'])

# Verificar el nuevo dataframe con las variables codificadas
pandas_df.head()

Unnamed: 0,PRODUCT_NAME,ADDITIVES_EN,ADDITIVES_EN_encoded,ALLERGENS,ALLERGENS_encoded,CALCIUM_100G,CARBOHYDRATES_100G,CARBON_FOOTPRINT_100G,CHOLESTEROL_100G,CREATOR,...,TRACES_EN_encoded,TRANS_FAT_100G,VITAMIN_A_100G,VITAMIN_B12_100G,VITAMIN_B2_100G,VITAMIN_B6_100G,VITAMIN_C_100G,VITAMIN_D_100G,VITAMIN_K_100G,CREATOR_encoded
0,TRADITIONAL SEITAN,,0,,0,0.0,0.082978,0.0,0.0,KILIWEB,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,50
1,BACKED TOFU,,0,,0,0.0,0.1092,0.0,0.0,KILIWEB,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,50
2,TRADITIONAL SEITAN,,0,,0,0.0,0.095389,0.0,0.0,KILIWEB,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,50
3,FROZEN FISH TOFU,,0,,0,0.0,0.083784,0.0,0.0,KILIWEB,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,50
4,EDAMAME + TOFU PONZU BOWL WITH CABBAGE AND PIC...,"E129 - ALLURA RED,E202 - POTASSIUM SORBATE,E33...",1,"EN:GLUTEN,EN:SOYBEANS",30,0.000546,0.158514,0.0,0.0,ORG-DATABASE-USDA,...,0,0.0,0.013599,0.0,0.0,0.0,0.371324,0.0,0.0,73


In [31]:
################################################################################
# Escalar datos
################################################################################

# Inicializar el escalador
scaler = MinMaxScaler()

# Identificar columnas que son numéricas
num_cols_df = pandas_df.select_dtypes(include=['float64', 'int64']).columns

# Escalar las columnas numéricas en el dataframe
pandas_df[num_cols_df] = scaler.fit_transform(pandas_df[num_cols_df])

# Verificar el resultado
pandas_df.head()

Unnamed: 0,PRODUCT_NAME,ADDITIVES_EN,ADDITIVES_EN_encoded,ALLERGENS,ALLERGENS_encoded,CALCIUM_100G,CARBOHYDRATES_100G,CARBON_FOOTPRINT_100G,CHOLESTEROL_100G,CREATOR,...,TRACES_EN_encoded,TRANS_FAT_100G,VITAMIN_A_100G,VITAMIN_B12_100G,VITAMIN_B2_100G,VITAMIN_B6_100G,VITAMIN_C_100G,VITAMIN_D_100G,VITAMIN_K_100G,CREATOR_encoded
0,TRADITIONAL SEITAN,,0.0,,0.0,0.0,0.082978,0.0,0.0,KILIWEB,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.47619
1,BACKED TOFU,,0.0,,0.0,0.0,0.1092,0.0,0.0,KILIWEB,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.47619
2,TRADITIONAL SEITAN,,0.0,,0.0,0.0,0.095389,0.0,0.0,KILIWEB,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.47619
3,FROZEN FISH TOFU,,0.0,,0.0,0.0,0.083784,0.0,0.0,KILIWEB,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.47619
4,EDAMAME + TOFU PONZU BOWL WITH CABBAGE AND PIC...,"E129 - ALLURA RED,E202 - POTASSIUM SORBATE,E33...",0.015152,"EN:GLUTEN,EN:SOYBEANS",0.638298,0.000546,0.158514,0.0,0.0,ORG-DATABASE-USDA,...,0.0,0.0,0.013599,0.0,0.0,0.0,0.371324,0.0,0.0,0.695238


In [32]:
################################################################################
# Ordenar nombres de columnas
################################################################################

# Obtener el nombre de las columnas
columnas = pandas_df.columns.tolist()

# Ordenar las columnas por nombre, manteniendo 'PRODUCT_NAME' como primera columna
columnas_ordenadas = sorted(columnas, key=lambda x: (x != 'PRODUCT_NAME', x))

# Crear un nuevo DataFrame con las columnas ordenadas
pandas_df = pandas_df[columnas_ordenadas]

pandas_df

Unnamed: 0,PRODUCT_NAME,ADDITIVES_EN,ADDITIVES_EN_encoded,ALLERGENS,ALLERGENS_encoded,CALCIUM_100G,CARBOHYDRATES_100G,CARBON_FOOTPRINT_100G,CHOLESTEROL_100G,CREATOR,...,TRACES_EN,TRACES_EN_encoded,TRANS_FAT_100G,VITAMIN_A_100G,VITAMIN_B12_100G,VITAMIN_B2_100G,VITAMIN_B6_100G,VITAMIN_C_100G,VITAMIN_D_100G,VITAMIN_K_100G
0,TRADITIONAL SEITAN,,0.000000,,0.000000,0.000000,0.082978,0.0,0.0,KILIWEB,...,,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0
1,BACKED TOFU,,0.000000,,0.000000,0.000000,0.109200,0.0,0.0,KILIWEB,...,,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0
2,TRADITIONAL SEITAN,,0.000000,,0.000000,0.000000,0.095389,0.0,0.0,KILIWEB,...,,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0
3,FROZEN FISH TOFU,,0.000000,,0.000000,0.000000,0.083784,0.0,0.0,KILIWEB,...,,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0
4,EDAMAME + TOFU PONZU BOWL WITH CABBAGE AND PIC...,"E129 - ALLURA RED,E202 - POTASSIUM SORBATE,E33...",0.015152,"EN:GLUTEN,EN:SOYBEANS",0.638298,0.000546,0.158514,0.0,0.0,ORG-DATABASE-USDA,...,,0.0,0.0,0.013599,0.0,0.0,0.0,0.371324,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1201,"BIO BURGER VEGETAL SEITAN, ALGAS Y BERENJENAS",,0.000000,,0.000000,0.000000,0.000000,0.0,0.0,ELCOCO,...,,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0
1202,PEKING MARINATED TOFU,,0.000000,,0.000000,0.000000,0.110811,0.0,0.0,KILIWEB,...,,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0
1203,"BIO BURGER VEGETAL SEITAN, ALGAS Y BERENJENAS",,0.000000,,0.000000,0.000000,0.000000,0.0,0.0,ELCOCO,...,,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0
1204,ORGANIC TOFU,,0.000000,,0.000000,0.000000,0.045946,0.0,0.0,KILIWEB,...,,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0


In [33]:
################################################################################
# Descripción de datos por columna
################################################################################
pandas_df.describe(include='all')

Unnamed: 0,PRODUCT_NAME,ADDITIVES_EN,ADDITIVES_EN_encoded,ALLERGENS,ALLERGENS_encoded,CALCIUM_100G,CARBOHYDRATES_100G,CARBON_FOOTPRINT_100G,CHOLESTEROL_100G,CREATOR,...,TRACES_EN,TRACES_EN_encoded,TRANS_FAT_100G,VITAMIN_A_100G,VITAMIN_B12_100G,VITAMIN_B2_100G,VITAMIN_B6_100G,VITAMIN_C_100G,VITAMIN_D_100G,VITAMIN_K_100G
count,1206,1206.0,1206.0,1206.0,1206.0,1206.0,1206.0,1206.0,1206.0,1206,...,1206.0,1206.0,1206.0,1206.0,1206.0,1206.0,1206.0,1206.0,1206.0,1206.0
unique,1030,67.0,,48.0,,,,,,106,...,84.0,,,,,,,,,
top,FIRM TOFU,,,,,,,,,KILIWEB,...,,,,,,,,,,
freq,15,1059.0,,990.0,,,,,,749,...,1078.0,,,,,,,,,
mean,,,0.076109,,0.122155,0.000929,0.133106,0.000829,0.002409,,...,,0.058802,0.000913,0.007731,0.001381,0.000829,0.000829,0.006938,0.001935,0.000829
std,,,0.228648,,0.287448,0.028828,0.139251,0.028796,0.042998,,...,,0.195075,0.028937,0.070209,0.03457,0.028796,0.028796,0.059811,0.039545,0.028796
min,,,0.0,,0.0,0.0,0.0,0.0,0.0,,...,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,,,0.0,,0.0,0.0,0.031796,0.0,0.0,,...,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,,,0.0,,0.0,0.0,0.091892,0.0,0.0,,...,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,,,0.0,,0.0,0.0,0.189189,0.0,0.0,,...,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [34]:
################################################################################
# Exportar el DataFrame de pandas como un archivo CSV
################################################################################
pandas_df.to_csv("/content/drive/MyDrive/Datos TFM/alimentos.csv", index=False)