# Extract data to a Spark DataFrame

In [1]:
import os

from pyspark.sql import SparkSession

# Start (or reuse) the Spark session used throughout the notebook
spark = (
    SparkSession.builder
    .appName("example")
    .getOrCreate()
)

# Load the tab-separated product export; the first row holds the column names
# and column types are inferred from the data.
df = spark.read.csv(
    "data/en.openfoodfacts.org.products.csv.gz",
    sep="\t",
    header=True,
    inferSchema=True,
)
# df.show()

# Generate a table in a CSV with statistical descriptions of the DataFrame

In [2]:
# df.describe().toPandas().to_csv("describe_summary.csv", index=False)

# Take a sample of the DF

In [3]:
# FRACTION_SIZE = 0.00001

In [4]:
# columnsToSample = ['quantity', 'serving_size', 'serving_quantity', 'product_quantity']

In [5]:
# samples_not_clean_df = df.select(columnsToSample).dropna(how='all').cache()

In [6]:
# samples_df = samples_not_clean_df.sample(withReplacement=False, fraction=FRACTION_SIZE).cache()

In [7]:
# samples_df.count()

In [8]:
# samples_df.show()

# Extract unique strings in a column (ingredients analysis tags in this example)

In [9]:
# def flatten_list(li):
#     flat_list = []
#     for row in li:
#         flat_list += row
#     return flat_list

# def make_list_unique(li):
#     return list(dict.fromkeys(li))

# def split_string_list_elements(li, sep):
#     return [x.split(sep) for x in li]

# def column_to_list(col):
#     return col.rdd.flatMap(lambda x: x).collect()

In [10]:
# categories_list = make_list_unique(flatten_list(split_string_list_elements(column_to_list(df.select('ingredients_analysis_tags').dropna()), ",")))

In [11]:
# categories_list

# Drop unnecessary columns

In [11]:
# Columns retained from the raw export, listed in the order they are selected.
kept_columns = [
    # product identity and quantity
    "code",
    "product_quantity",
    # energy and fats
    "energy-kcal_100g",
    "fat_100g",
    "saturated-fat_100g",
    "monounsaturated-fat_100g",
    "polyunsaturated-fat_100g",
    "trans-fat_100g",
    # carbohydrates, fiber and proteins
    "carbohydrates_100g",
    "sugars_100g",
    "starch_100g",
    "fiber_100g",
    "proteins_100g",
    # allergen information
    "allergens",
    "traces",
    # vitamins
    "vitamin-a_100g",
    "vitamin-c_100g",
    "vitamin-d_100g",
    "vitamin-e_100g",
    "vitamin-k_100g",
    "vitamin-b1_100g",
    "vitamin-b2_100g",
    "vitamin-b6_100g",
    "vitamin-b9_100g",
    "vitamin-b12_100g",
    # minerals
    "calcium_100g",
    "iron_100g",
    "magnesium_100g",
    "potassium_100g",
    "zinc_100g",
    # grouping, serving and remaining nutrition fields
    "food_groups_tags",
    "serving_size",
    "serving_quantity",
    "cholesterol_100g",
    "salt_100g",
    "glycemic-index_100g",
]

In [12]:
# Project the raw DataFrame down to the retained columns only
df_kept_columns = df.select(*kept_columns)

# Data quality

## Drop null rows

In [13]:
# Columns inspected for missing values when filtering out sparse rows.
columns_to_check = [
    "code",
    "product_quantity",
    "energy-kcal_100g",
    "fat_100g",
    "saturated-fat_100g",
    "monounsaturated-fat_100g",
    "polyunsaturated-fat_100g",
    "trans-fat_100g",
    "carbohydrates_100g",
    "sugars_100g",
    "starch_100g",
    "fiber_100g",
    "proteins_100g",
    "allergens",
    "traces",
    "food_groups_tags",
    "serving_size",
    "serving_quantity",
]

In [14]:
# Keep a row only when at least 15 of the checked columns hold a non-null value
df_kept_columns = df_kept_columns.na.drop(thresh=15, subset=columns_to_check)

## Drop duplicates

In [15]:
# Remove rows that are identical across every column, then mark the result
# for caching so later actions can reuse it.
df_kept_columns = (
    df_kept_columns
    .dropDuplicates()
    .cache()
)

# Write DF to a database

## Get user's database credentials (not necessary)

In [7]:
# properties = {}
# url = {}
# for text in ["database URL", "table name"]:
#     url[text] = input("Enter " + text + ": ")


# for text in ["user", "password", "driver"]:
#     properties[text] = input("Enter " + text + ": ")

## Write to the database using JDBC Driver

## Unpersist cached data of base DF

In [8]:
# df.unpersist()

In [16]:
# Count the rows that survived cleaning; as an action, this also triggers
# computation of the DataFrame.
df_kept_columns.count()

15980

In [10]:
# Connection settings for the JDBC write. Each value can be overridden through
# an environment variable so real credentials never have to be committed with
# the notebook. The fallbacks are the values this cell previously hard-coded,
# so it behaves exactly as before when no variable is set.
jdbc_url = os.environ.get("MYSQL_JDBC_URL", "jdbc:mysql://mysql:3306/openfoodfact")
jdbc_table = os.environ.get("MYSQL_TABLE", "products")
properties = {
    "user": os.environ.get("MYSQL_USER", "user"),
    "password": os.environ.get("MYSQL_PASSWORD", "userpassword"),
    "driver": os.environ.get("MYSQL_JDBC_DRIVER", "com.mysql.cj.jdbc.Driver"),
}

# NOTE(review): mode="append" adds to the existing table contents, so running
# this cell again writes the same rows a second time unless the table rejects
# them — confirm this is intended.
df_kept_columns.write.jdbc(url=jdbc_url, table=jdbc_table, mode="append", properties=properties)