The dataset was downloaded locally from Hugging Face [here](https://huggingface.co/datasets/openfoodfacts/product-database/blob/main/food.parquet), then uploaded to our public S3 bucket.

In [None]:
# Install dependencies
!pip install --upgrade boto3 botocore awscli

In [37]:
# Import libraries
import boto3
import sagemaker
from pyathena import connect
import pandas as pd


# Setup Athena Database and Query Engine

In [18]:
# Setup boto and sagemaker session
sess = sagemaker.Session()
bucket = sess.default_bucket()
role = sagemaker.get_execution_role()
region = boto3.Session().region_name
print('Boto and Sagemaker Sessions Initialized...\nBucket: {0}\nRegion: {1}'.format(bucket, region))

Boto and Sagemaker Sessions Initialized...
Bucket: sagemaker-us-east-1-654654380268
Region: us-east-1


## Verify Access to Dataset from Public S3 Bucket

In [42]:
# Verify the public S3 bucket contents
!aws s3 ls s3://aai-540-openfoodfacts/

2025-09-18 04:28:03 4253536290 food.parquet


In [43]:
# Set S3 source location (public S3 bucket)
s3_public_path_parquet = "s3://aai-540-openfoodfacts/food.parquet"
%store s3_public_path_parquet

Stored 's3_public_path_parquet' (str)


## Create Athena Database

In [22]:
# Initialize our database name
database_name = 'foodfacts'

# Set S3 staging directory
s3_staging_dir = "s3://{0}/athena/staging".format(bucket)

# Create connection to Athena database
conn = connect(region_name=region, s3_staging_dir=s3_staging_dir)

In [23]:
# Create our database
statement = "CREATE DATABASE IF NOT EXISTS {}".format(database_name)
print(statement)
pd.read_sql(statement, conn)

CREATE DATABASE IF NOT EXISTS foodfacts


  pd.read_sql(statement, conn)


In [44]:
# Verify database was created
statement = "SHOW DATABASES"
df_show = pd.read_sql(statement, conn)
df_show.head(5)

  df_show = pd.read_sql(statement, conn)


Unnamed: 0,database_name
0,default
1,dsoaws
2,foodfacts


We created a glue crawler to automatically create a table of the dataset file.

In [49]:
# Verify table was created from crawler
statement = "SHOW TABLES in {}".format(database_name)
df_show = pd.read_sql(statement, conn)
df_show.head(5)

  df_show = pd.read_sql(statement, conn)


Unnamed: 0,tab_name
0,aai_540_openfoodfacts


In [50]:
# Save table name
raw_table_name = "aai_540_openfoodfacts"
%store raw_table_name

Stored 'raw_table_name' (str)


In [51]:
# Run sample query
statement = """SELECT * FROM {}.{} LIMIT 5""".format(
    database_name, raw_table_name
)
print(statement)
df = pd.read_sql(statement, conn)
df.head(5)

SELECT * FROM foodfacts.aai_540_openfoodfacts LIMIT 5


  df = pd.read_sql(statement, conn)


Unnamed: 0,additives_n,additives_tags,allergens_tags,brands_tags,brands,categories,categories_tags,categories_properties,checkers_tags,ciqual_food_name_tags,...,states_tags,stores_tags,stores,traces_tags,unique_scans_n,unknown_ingredients_n,unknown_nutrients_tags,vitamins_tags,with_non_nutritive_sweeteners,with_sweeteners
0,,,[],,,,,"{'ciqual_food_code': None, 'agribalyse_food_co...",[],[unknown],...,"[en:to-be-completed, en:nutrition-facts-comple...",,,[],2.0,,[],[],,
1,,[],[],[la-costena],La Costena,,,"{'ciqual_food_code': None, 'agribalyse_food_co...",[],[unknown],...,"[en:to-be-completed, en:nutrition-facts-comple...",,,[],1.0,,[],[],,
2,,,[],[xx:pr-ou],pr-ou,,[],"{'ciqual_food_code': None, 'agribalyse_food_co...",[],[unknown],...,"[en:to-be-completed, en:nutrition-facts-comple...",[],,[],,,[],[],,
3,,,[],[xx:dia],dia,,[],"{'ciqual_food_code': None, 'agribalyse_food_co...",[],[unknown],...,"[en:to-be-completed, en:nutrition-facts-comple...",[],,[],,,[],[],,
4,,,[],[xx:hiruak],Hiruak,"Viandes et dérivés, Plats préparés, Viandes, C...","[en:meats-and-their-products, en:meals, en:mea...","{'ciqual_food_code': 30131, 'agribalyse_food_c...",[],[poultry-sausage],...,"[en:to-be-completed, en:nutrition-facts-comple...",,,[],,,[],[],,
