The dataset was downloaded locally from Hugging Face [here](https://huggingface.co/datasets/openfoodfacts/product-database/blob/main/food.parquet), then uploaded to our public S3 bucket.

In [1]:
# Install dependencies
!pip install --upgrade boto3 botocore awscli

Collecting boto3
  Downloading boto3-1.40.35-py3-none-any.whl.metadata (6.7 kB)
Collecting botocore
  Downloading botocore-1.40.35-py3-none-any.whl.metadata (5.7 kB)
Collecting awscli
  Downloading awscli-1.42.35-py3-none-any.whl.metadata (11 kB)
Collecting s3transfer<0.15.0,>=0.14.0 (from boto3)
  Using cached s3transfer-0.14.0-py3-none-any.whl.metadata (1.7 kB)
Collecting docutils<=0.19,>=0.18.1 (from awscli)
  Using cached docutils-0.19-py3-none-any.whl.metadata (2.7 kB)
Collecting rsa<4.8,>=3.1.2 (from awscli)
  Using cached rsa-4.7.2-py3-none-any.whl.metadata (3.6 kB)
Downloading boto3-1.40.35-py3-none-any.whl (139 kB)
Downloading botocore-1.40.35-py3-none-any.whl (14.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.0/14.0 MB[0m [31m260.9 MB/s[0m  [33m0:00:00[0m
[?25hUsing cached s3transfer-0.14.0-py3-none-any.whl (85 kB)
Downloading awscli-1.42.35-py3-none-any.whl (4.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.7/4.7 MB[0m [3

In [2]:
# Import libraries
import boto3
import sagemaker
from pyathena import connect
import pandas as pd
import time

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml


# Setup Athena Database and Query Engine

In [3]:
# Create the SageMaker session and collect the values later cells rely on:
# the session's default bucket, the execution role, and the boto3 region.
sess = sagemaker.Session()
bucket = sess.default_bucket()
role = sagemaker.get_execution_role()
region = boto3.Session().region_name

# Report what was resolved, one item per line.
print(
    'Boto and Sagemaker Sessions Initialized...',
    f'Bucket: {bucket}',
    f'Region: {region}',
    sep='\n',
)

Boto and Sagemaker Sessions Initialized...
Bucket: sagemaker-us-east-1-975049911265
Region: us-east-1


## Verify Access to Dataset from Public S3 Bucket

In [4]:
# Verify the public S3 bucket contents
# Lists the objects at the root of the public bucket with the AWS CLI; the
# listing is expected to include the food.parquet dataset file.
!aws s3 ls s3://aai-540-openfoodfacts/

2025-09-18 04:28:03 4253536290 food.parquet


In [5]:
# Set S3 source location (public S3 bucket)
s3_public_folder_path = "s3://aai-540-openfoodfacts"
s3_public_path_parquet = "s3://aai-540-openfoodfacts/food.parquet"
%store s3_public_folder_path
%store s3_public_path_parquet

Stored 's3_public_folder_path' (str)
Stored 's3_public_path_parquet' (str)


## Create Athena Database

In [6]:
# Name of the Athena database used throughout the notebook
database_name = 'foodfacts'

# Staging prefix for the Athena connection, located in the session's default bucket
s3_staging_dir = f"s3://{bucket}/athena/staging"

# Open the PyAthena connection that the query cells below share
conn = connect(s3_staging_dir=s3_staging_dir, region_name=region)

In [7]:
# Create our database
statement = "CREATE DATABASE IF NOT EXISTS {}".format(database_name)
print(statement)

# CREATE DATABASE is DDL: there are no rows worth loading into a DataFrame, so
# run it through a plain DB-API cursor instead of pd.read_sql. This also avoids
# the pandas warning that pd.read_sql appears to emit for this connection.
cursor = conn.cursor()
try:
    cursor.execute(statement)
finally:
    # Release the cursor even if the statement fails.
    cursor.close()

CREATE DATABASE IF NOT EXISTS foodfacts


  pd.read_sql(statement, conn)


In [8]:
# Confirm the database now appears in the list of Athena databases
statement = "SHOW DATABASES"
df_show = pd.read_sql(sql=statement, con=conn)

# head() defaults to the first five rows
df_show.head()

  df_show = pd.read_sql(statement, conn)


Unnamed: 0,database_name
0,aai540_assignment2_db
1,aai540_assignment2_db_csv
2,default
3,dsoaws
4,foodfacts


## Create and Run a Glue Crawler
We create an AWS Glue crawler that scans the public S3 folder and automatically creates a table for the dataset file in our Athena database.

In [9]:
# AWS Glue client, created in the region resolved earlier
glue_client = boto3.client('glue', region_name=region)

crawler_name = 'food_products_crawler'

# Crawler definition: crawl the public S3 folder using the notebook's execution
# role and register the resulting table(s) in our database.
crawler_config = {
    'Name': crawler_name,
    'Role': role,
    'DatabaseName': database_name,
    'Targets': {'S3Targets': [{'Path': s3_public_folder_path}]},
}

# Creating a crawler that already exists is reported rather than treated as an
# error, so the cell can be re-run.
try:
    glue_client.create_crawler(**crawler_config)
except glue_client.exceptions.AlreadyExistsException:
    print(f"Crawler '{crawler_name}' already exists.")
else:
    print(f"Crawler '{crawler_name}' created successfully.")

Crawler 'food_products_crawler' created successfully.


In [10]:
# Verify Creation of Glue Crawler
# get_crawlers returns results one page at a time, so a single call can miss
# crawlers in accounts that have many of them. Walk every page with the
# paginator and keep the combined result under the same name and shape
# ({'Crawlers': [...]}) as a single get_crawlers response.
crawler_pages = glue_client.get_paginator('get_crawlers').paginate()
crawlers = {
    'Crawlers': [entry for page in crawler_pages for entry in page['Crawlers']]
}

for crawler in crawlers['Crawlers']:
    print(f"Crawler Name: {crawler['Name']}, State: {crawler['State']}")

Crawler Name: aai540_homework2_crawler, State: READY
Crawler Name: food_products_crawler, State: READY


In [11]:
# Start the Glue Crawler - no need to re-run
print(f"Starting crawler '{crawler_name}'...")
try:
    glue_client.start_crawler(Name=crawler_name)
except glue_client.exceptions.CrawlerRunningException:
    # A run is already in progress (e.g. the cell was executed twice);
    # fall through and wait for that run instead of failing.
    print(f"Crawler '{crawler_name}' is already running.")

# Wait for the crawler to finish.
# The crawler's State only takes the values READY, RUNNING and STOPPING; it
# never reports a failure. Whether the run succeeded is recorded separately in
# LastCrawl.Status, so that is what is checked once the state is back to READY.
while True:
    response = glue_client.get_crawler(Name=crawler_name)
    crawler_info = response['Crawler']
    state = crawler_info['State']

    if state == 'READY':
        last_crawl = crawler_info.get('LastCrawl', {})
        last_status = last_crawl.get('Status')
        if last_status == 'SUCCEEDED':
            print("Crawler finished successfully.")
        else:
            print("Crawler failed.")
            # Surface whatever detail Glue recorded for the unsuccessful run.
            print(f"Last crawl status: {last_status}")
            if last_crawl.get('ErrorMessage'):
                print(f"Error message: {last_crawl['ErrorMessage']}")
        break
    elif state == 'STOPPING':
        print("Crawler is stopping...")
    elif state == 'RUNNING':
        print("Crawler is running...")

    time.sleep(15) # Wait 15 seconds before checking the status again

Starting crawler 'food_products_crawler'...
Crawler is running...
Crawler is running...
Crawler is running...
Crawler is running...
Crawler is running...
Crawler is stopping...
Crawler is stopping...
Crawler is stopping...
Crawler is stopping...
Crawler is stopping...
Crawler is stopping...
Crawler finished successfully.


In [12]:
# Check which tables the crawler registered in our database
statement = f"SHOW TABLES in {database_name}"
df_show = pd.read_sql(sql=statement, con=conn)

# head() defaults to the first five rows
df_show.head()

  df_show = pd.read_sql(statement, conn)


Unnamed: 0,tab_name
0,aai_540_openfoodfacts


In [13]:
# Save table name
# This is the table reported by the SHOW TABLES query after the crawler run.
# It is hard-coded here, so update it if the crawler produces a different name.
raw_table_name = "aai_540_openfoodfacts"
# Persist the name with IPython's %store so it can be restored later
# (presumably by follow-up notebooks via `%store -r` - confirm).
%store raw_table_name

Stored 'raw_table_name' (str)


## Verify Querying the Database

In [14]:
# Fetch a few rows to confirm the crawled table can be queried through Athena
statement = f"SELECT * FROM {database_name}.{raw_table_name} LIMIT 5"
print(statement)

df = pd.read_sql(sql=statement, con=conn)

# head() defaults to the first five rows
df.head()

SELECT * FROM foodfacts.aai_540_openfoodfacts LIMIT 5


  df = pd.read_sql(statement, conn)


Unnamed: 0,additives_n,additives_tags,allergens_tags,brands_tags,brands,categories,categories_tags,categories_properties,checkers_tags,ciqual_food_name_tags,...,states_tags,stores_tags,stores,traces_tags,unique_scans_n,unknown_ingredients_n,unknown_nutrients_tags,vitamins_tags,with_non_nutritive_sweeteners,with_sweeteners
0,,,[],[xx:boni],Boni,"Viandes et dérivés, Produits à tartiner, Viand...","[en:meats-and-their-products, en:spreads, en:m...","{'ciqual_food_code': 36016, 'agribalyse_food_c...",[],[unknown],...,"[en:to-be-completed, en:nutrition-facts-comple...",[colruyt],Colruyt,[],1.0,,[],[],,
1,0.0,[],[],[vegetalia],Vegetalia,"Plant-based foods and beverages, Plant-based f...","[en:plant-based-foods-and-beverages, en:plant-...","{'ciqual_food_code': None, 'agribalyse_food_co...",[],[unknown],...,"[en:to-be-completed, en:nutrition-facts-comple...",,,[],1.0,1.0,[],[],,
2,,,[],[xx:colfiorito],Colfiorito,"Cibi e bevande a base vegetale, Cibi a base ve...","[en:plant-based-foods-and-beverages, en:plant-...","{'ciqual_food_code': None, 'agribalyse_food_co...",[],,...,"[en:to-be-completed, en:nutrition-facts-comple...",,,[],,,[],,,
3,2.0,"[en:e407, en:e410]",[en:milk],[umpqua-dairy],Umpqua Dairy,"Dairies, Milks, Semi-skimmed milks","[en:dairies, en:milks, en:semi-skimmed-milks]","{'ciqual_food_code': None, 'agribalyse_food_co...",[],[unknown],...,"[en:to-be-completed, en:nutrition-facts-comple...",,,[],,1.0,[],[en:retinyl-palmitate],,
4,,,[],,,,,"{'ciqual_food_code': None, 'agribalyse_food_co...",[],[unknown],...,"[en:to-be-completed, en:nutrition-facts-comple...",,,[],,,[],[],,
