In [None]:
from pyspark.sql import SparkSession
import requests
import os
from pyspark.sql import functions as F

# Create (or reuse) the Spark session for this notebook.
# The driver is given 4 GB of memory via spark.driver.memory.
spark = (
    SparkSession.builder
    .appName("OpenFoodFacts")
    .config("spark.driver.memory", "4g")
    .getOrCreate()
)

# Define the URL and local path for the file
url = "https://huggingface.co/datasets/openfoodfacts/product-database/resolve/main/food.parquet"
local_file_path = "/home/jovyan/work/notebooks/openfoodfacts/food.parquet"

# Download the file if it doesn't exist locally
if not os.path.exists(local_file_path):
    print(f"Downloading file from {url}...")

    # open() does not create missing parent directories, so make sure the
    # target directory exists before writing.
    os.makedirs(os.path.dirname(local_file_path), exist_ok=True)

    # Stream into a temporary sibling file and move it into place only once
    # the download has finished. Otherwise an interrupted download would
    # leave a truncated file at local_file_path, and the os.path.exists()
    # check above would treat it as a complete download on the next run.
    partial_file_path = local_file_path + ".part"
    try:
        # timeout=(connect, read) in seconds: without it a stalled
        # connection would block this cell forever.
        with requests.get(url, stream=True, timeout=(10, 300)) as response:
            response.raise_for_status()  # Raise an exception for HTTP errors

            with open(partial_file_path, "wb") as file:
                # 1 MiB chunks keep the number of Python-level writes low
                # for a multi-gigabyte file.
                for chunk in response.iter_content(chunk_size=1024 * 1024):
                    file.write(chunk)

        # Same-directory rename: the final path only ever holds a full file.
        os.replace(partial_file_path, local_file_path)
    finally:
        # After a successful os.replace() the partial file is gone; this
        # only cleans up after a failed or interrupted download.
        if os.path.exists(partial_file_path):
            os.remove(partial_file_path)

    print(f"File downloaded successfully to {local_file_path}")
else:
    print(f"File already exists at {local_file_path}")

# Load the downloaded Parquet file as a Spark DataFrame
df = spark.read.format("parquet").load(local_file_path)

# Sanity check: preview the first five rows
df.show(n=5)

# Display the column names and types of the loaded data
df.printSchema()

# Filter products related to Morocco.
# A row is kept when any of the tag arrays below contains the given tag, or
# when the lower-cased manufacturing_places text contains "morocco".
morocco_tag_checks = [
    ("countries_tags", "en:morocco"),
    ("countries_tags", "fr:morocco"),
    ("countries_tags", "ar:morocco"),
    ("main_countries_tags", "en:morocco"),
    ("origins_tags", "en:morocco"),
]

morocco_predicates = [
    F.array_contains(F.col(column_name), tag)
    for column_name, tag in morocco_tag_checks
]
morocco_predicates.append(
    F.lower(F.col("manufacturing_places")).like("%morocco%")
)

# OR the individual predicates together, left to right.
morocco_condition = morocco_predicates[0]
for predicate in morocco_predicates[1:]:
    morocco_condition = morocco_condition | predicate

filtered_df = df.filter(morocco_condition)

# Show the count of filtered data
print(f"Filtered data count: {filtered_df.count()}")

# Define output path for CSV.
# Spark writes a directory at this path containing the part file(s), not a
# single flat file.
output_csv_path = "/home/jovyan/work/notebooks/openfoodfacts/openfoodfacts_morocco.csv"

# Spark's CSV data source cannot write array, struct or map columns and
# raises an AnalysisException when it meets one. The filter above already
# treats countries_tags as an array, so serialize every complex column to a
# JSON string before writing; scalar columns are passed through unchanged.
complex_type_prefixes = ("array", "struct", "map")
csv_columns = [
    F.to_json(F.col(f"`{column_name}`")).alias(column_name)
    if type_name.startswith(complex_type_prefixes)
    else F.col(f"`{column_name}`")
    for column_name, type_name in filtered_df.dtypes
]
csv_ready_df = filtered_df.select(*csv_columns)

# Save the filtered DataFrame as a CSV file.
# coalesce(1) collapses the output to a single part file.
csv_ready_df.coalesce(1).write.mode("overwrite").option("header", "true").csv(output_csv_path)
print(f"Filtered data saved to {output_csv_path}")

# The full Parquet dump is no longer needed once the CSV has been written;
# delete it to reclaim disk space.
os.unlink(local_file_path)
print(f"Deleted large Parquet file: {local_file_path}")


Downloading file from https://huggingface.co/datasets/openfoodfacts/product-database/resolve/main/food.parquet...
File downloaded successfully to /home/jovyan/work/notebooks/openfoodfacts/food.parquet
+-----------+--------------+--------------+-----------+-------+--------------------+--------------------+-------------+---------------------+-----------+-------------+--------------------+--------+------------+--------------------+------------------+----------+---------------+------------------------+----------------------+--------------------------+--------------------+--------------------+--------------+--------------+-------------+-------+--------------+---------+--------------------+--------------------+------------+--------------------+--------------------+-------------------------+---------------------------+-------------+-------------------------+----------------------------+--------------------+--------------------+------------------------------------+-----------------------------