The Open Food Facts product database (`food.parquet`) was downloaded locally from [Hugging Face](https://huggingface.co/datasets/openfoodfacts/product-database/blob/main/food.parquet), then uploaded to our public S3 bucket (`s3://aai-540-openfoodfacts/`).

In [None]:
# Install dependencies
!pip install --upgrade boto3 botocore awscli

In [12]:
# Import libraries
import boto3
import sagemaker
from pyathena import connect
import pandas as pd
import time

# Set Up Athena Database and Query Engine

In [2]:
# Set up the SageMaker session and look up the settings used throughout the
# notebook: default bucket, execution role, and region
sess = sagemaker.Session()
bucket = sess.default_bucket()
role = sagemaker.get_execution_role()
region = boto3.Session().region_name
print(
    "Boto and Sagemaker Sessions Initialized...\n"
    f"Bucket: {bucket}\n"
    f"Region: {region}"
)

Boto and Sagemaker Sessions Initialized...
Bucket: sagemaker-us-east-1-654654380268
Region: us-east-1


## Verify Access to Dataset from Public S3 Bucket

In [3]:
# Verify the public S3 bucket contents
# Lists the objects at the bucket root with the AWS CLI; the dataset file
# (food.parquet) should appear in the listing.
!aws s3 ls s3://aai-540-openfoodfacts/

2025-09-18 04:28:03 4253536290 food.parquet


In [36]:
# Set S3 source location (public S3 bucket)
s3_public_folder_path = "s3://aai-540-openfoodfacts"
s3_public_path_parquet = "s3://aai-540-openfoodfacts/food.parquet"
%store s3_public_folder_path
%store s3_public_path_parquet

Stored 's3_public_folder_path' (str)
Stored 's3_public_path_parquet' (str)


## Create Athena Database

In [5]:
# Name of the Athena database this notebook creates and queries
database_name = 'foodfacts'

# S3 staging directory handed to PyAthena, under the session's default bucket
s3_staging_dir = f"s3://{bucket}/athena/staging"

# Open the PyAthena connection used by the queries in this notebook
conn = connect(s3_staging_dir=s3_staging_dir, region_name=region)

In [23]:
# Create our database (a no-op if it already exists).
# This is a DDL statement that returns no rows, so it is executed on a plain
# cursor rather than through pd.read_sql, whose job is to load a result set
# into a DataFrame.
statement = "CREATE DATABASE IF NOT EXISTS {}".format(database_name)
print(statement)
with conn.cursor() as cursor:
    cursor.execute(statement)

CREATE DATABASE IF NOT EXISTS foodfacts


  pd.read_sql(statement, conn)


In [19]:
# Verify database was created
statement = "SHOW DATABASES"
df_show = pd.read_sql(statement, conn)

# Check every returned row, not just the displayed ones: head(5) alone would
# hide a missing database, or one that is listed after the fifth row.
listed_databases = set(df_show.iloc[:, 0].astype(str).str.lower())
assert database_name.lower() in listed_databases, (
    f"Database '{database_name}' was not found in Athena"
)
df_show.head(5)

  df_show = pd.read_sql(statement, conn)


Unnamed: 0,database_name
0,default
1,dsoaws
2,foodfacts


## Create and Run a Glue Crawler
We create an AWS Glue crawler that scans the dataset file in the public S3 bucket and automatically creates a table for it in the `foodfacts` database.

In [43]:
# Glue client and the name of the crawler that catalogs the public dataset
glue_client = boto3.client('glue', region_name=region)
crawler_name = 'food_products_crawler'

# Crawler definition: crawl the public S3 path and register what it finds
# in our database, running under the SageMaker execution role
crawler_settings = {
    'Name': crawler_name,
    'Role': role,
    'DatabaseName': database_name,
    'Targets': {'S3Targets': [{'Path': s3_public_folder_path}]},
}

# Create the crawler; an existing crawler with the same name is left as is
try:
    glue_client.create_crawler(**crawler_settings)
except glue_client.exceptions.AlreadyExistsException:
    print(f"Crawler '{crawler_name}' already exists.")
else:
    print(f"Crawler '{crawler_name}' created successfully.")

Crawler 'food_products_crawler' already exists.


In [44]:
# List the crawlers returned by Glue together with their current state
crawlers = glue_client.get_crawlers()
for crawler_entry in crawlers['Crawlers']:
    print("Crawler Name: {}, State: {}".format(crawler_entry['Name'], crawler_entry['State']))

Crawler Name: food_products_crawler, State: READY


In [39]:
# Start the Glue Crawler
print(f"Starting crawler '{crawler_name}'...")
try:
    glue_client.start_crawler(Name=crawler_name)
except glue_client.exceptions.CrawlerRunningException:
    # A run is already in progress; fall through and wait for it to finish.
    print(f"Crawler '{crawler_name}' is already running.")

# Wait for the crawler to finish.
# A crawler's 'State' is only ever READY, RUNNING or STOPPING -- it never
# becomes 'FAILED'. Whether the run actually succeeded is reported separately
# in LastCrawl['Status'], which is checked once the crawler is back to READY.
poll_interval_seconds = 15
while True:
    response = glue_client.get_crawler(Name=crawler_name)
    state = response['Crawler']['State']

    if state == 'READY':
        break
    elif state == 'RUNNING':
        print("Crawler is running...")
    elif state == 'STOPPING':
        print("Crawler is stopping...")
    else:
        print(f"Crawler is in state '{state}'...")

    time.sleep(poll_interval_seconds)  # Wait before checking the status again

# Report the outcome of the run and stop the notebook if it did not succeed,
# since the following cells depend on the table the crawler creates.
last_crawl = response['Crawler'].get('LastCrawl', {})
last_status = last_crawl.get('Status')
if last_status == 'SUCCEEDED':
    print("Crawler finished successfully.")
else:
    print("Crawler failed.")
    raise RuntimeError(
        f"Crawler '{crawler_name}' did not succeed "
        f"(last crawl status: {last_status}): "
        f"{last_crawl.get('ErrorMessage', 'no error message')}"
    )

Starting crawler 'food_products_crawler'...
Crawler is running...
Crawler is running...
Crawler is running...
Crawler is running...
Crawler is running...
Crawler is stopping...
Crawler is stopping...
Crawler is stopping...
Crawler is stopping...
Crawler is stopping...
Crawler is stopping...
Crawler finished successfully.


In [45]:
# Verify table was created from crawler
statement = "SHOW TABLES in {}".format(database_name)
df_show = pd.read_sql(statement, conn)

# Fail here, with a clear message, if the crawler registered no table at all;
# otherwise the problem would only surface later when the table is queried.
assert not df_show.empty, f"No tables found in database '{database_name}'"
df_show.head(5)

  df_show = pd.read_sql(statement, conn)


Unnamed: 0,tab_name
0,aai_540_openfoodfacts


In [41]:
# Save table name
# This is the table listed by SHOW TABLES above. It matches the bucket name with
# hyphens replaced by underscores -- presumably how the Glue crawler derives it;
# re-check the SHOW TABLES output if the bucket ever changes.
raw_table_name = "aai_540_openfoodfacts"
# Persist the name with %store so it can be restored later via `%store -r`
%store raw_table_name

Stored 'raw_table_name' (str)


## Verify Querying the Database

In [42]:
# Pull a handful of rows to confirm the crawled table can be queried via Athena
statement = f"SELECT * FROM {database_name}.{raw_table_name} LIMIT 5"
print(statement)
df = pd.read_sql(sql=statement, con=conn)
df.head(n=5)

SELECT * FROM foodfacts.aai_540_openfoodfacts LIMIT 5


  df = pd.read_sql(statement, conn)


Unnamed: 0,additives_n,additives_tags,allergens_tags,brands_tags,brands,categories,categories_tags,categories_properties,checkers_tags,ciqual_food_name_tags,...,states_tags,stores_tags,stores,traces_tags,unique_scans_n,unknown_ingredients_n,unknown_nutrients_tags,vitamins_tags,with_non_nutritive_sweeteners,with_sweeteners
0,,,[],[xx:piu-buono],Più Buono,,,"{'ciqual_food_code': None, 'agribalyse_food_co...",[],,...,"[en:to-be-completed, en:nutrition-facts-comple...",,,[],,,[],,,
1,,,[],[xx:aicha],Aicha,en:strained-tomatoes,"[en:plant-based-foods-and-beverages, en:plant-...","{'ciqual_food_code': 20260, 'agribalyse_food_c...",[],,...,"[en:to-be-completed, en:nutrition-facts-comple...",,,[],37.0,,[],,,
2,,[],[],[piraque],PIRAQUE,,[],"{'ciqual_food_code': None, 'agribalyse_food_co...",[],,...,"[en:to-be-completed, en:nutrition-facts-comple...",[],,[],,,[],[],,
3,,,[],[xx:terra-etica],Terra Etica,"Snacks, Snacks sucrés, Cacao et dérivés, Choco...","[en:snacks, en:sweet-snacks, en:cocoa-and-its-...","{'ciqual_food_code': None, 'agribalyse_food_co...",[],,...,"[en:to-be-completed, en:nutrition-facts-comple...",[],,[],,,[],[],,
4,,,[],,,,,"{'ciqual_food_code': None, 'agribalyse_food_co...",[],,...,"[en:to-be-completed, en:nutrition-facts-comple...",,,[],,,[],,,
