The dataset was downloaded locally from Hugging Face [here](https://huggingface.co/datasets/openfoodfacts/product-database/blob/main/food.parquet), then uploaded to our public S3 bucket.

In [None]:
# Install dependencies
!pip install --upgrade boto3 botocore awscli

In [1]:
# Import libraries
import boto3
import sagemaker
from pyathena import connect
import pandas as pd
import time

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml


# Setup Athena Database and Query Engine

In [2]:
# Setup boto and sagemaker session
sess = sagemaker.Session()
bucket = sess.default_bucket()
role = sagemaker.get_execution_role()
region = boto3.Session().region_name
print('Boto and Sagemaker Sessions Initialized...\nBucket: {0}\nRegion: {1}'.format(bucket, region))

Boto and Sagemaker Sessions Initialized...
Bucket: sagemaker-us-east-1-975049911265
Region: us-east-1


## Verify Access to Dataset from Public S3 Bucket

In [3]:
# Verify the public S3 bucket contents
!aws s3 ls s3://aai-540-openfoodfacts/

2025-09-18 04:28:03 4253536290 food.parquet


In [4]:
# Set S3 source location (public S3 bucket)
s3_public_folder_path = "s3://aai-540-openfoodfacts"
s3_public_path_parquet = "s3://aai-540-openfoodfacts/food.parquet"
%store s3_public_folder_path
%store s3_public_path_parquet

Stored 's3_public_folder_path' (str)
Stored 's3_public_path_parquet' (str)


## Create Athena Database

In [5]:
# Initialize our database name
database_name = 'foodfacts'

# Set S3 staging directory
s3_staging_dir = "s3://{0}/athena/staging".format(bucket)

# Create connection to Athena database
conn = connect(region_name=region, s3_staging_dir=s3_staging_dir)

In [6]:
# Create our database
statement = "CREATE DATABASE IF NOT EXISTS {}".format(database_name)
print(statement)
pd.read_sql(statement, conn)

CREATE DATABASE IF NOT EXISTS foodfacts


  pd.read_sql(statement, conn)


In [7]:
# Verify database was created
statement = "SHOW DATABASES"
df_show = pd.read_sql(statement, conn)
df_show.head(5)

  df_show = pd.read_sql(statement, conn)


Unnamed: 0,database_name
0,aai540_assignment2_db
1,aai540_assignment2_db_csv
2,default
3,dsoaws
4,foodfacts


## Create and Run a Glue Crawler
We created a glue crawler to automatically create a table of the dataset file.

In [8]:
# Initialize the AWS Glue client
glue_client = boto3.client('glue', region_name=region)

crawler_name = 'food_products_crawler'

# Create the Glue Crawler
try:
    glue_client.create_crawler(
        Name=crawler_name,
        Role=role,
        DatabaseName=database_name,
        Targets={'S3Targets': [{'Path': s3_public_folder_path}]},
    )
    print(f"Crawler '{crawler_name}' created successfully.")
except glue_client.exceptions.AlreadyExistsException:
    print(f"Crawler '{crawler_name}' already exists.")

Crawler 'food_products_crawler' already exists.


In [9]:
# Verify Creation of Glue Crawler
crawlers = glue_client.get_crawlers()
for crawler in crawlers['Crawlers']:
    print(f"Crawler Name: {crawler['Name']}, State: {crawler['State']}")

Crawler Name: aai540_homework2_crawler, State: READY
Crawler Name: food_products_crawler, State: READY


In [10]:
# Start the Glue Crawler - no need to re-run 
print(f"Starting crawler '{crawler_name}'...")
glue_client.start_crawler(Name=crawler_name)

# Wait for the crawler to finish
while True:
    response = glue_client.get_crawler(Name=crawler_name)
    state = response['Crawler']['State']

    if state == 'READY':
        print("Crawler finished successfully.")
        break
    elif state == 'STOPPING':
        print("Crawler is stopping...")
    elif state == 'RUNNING':
        print("Crawler is running...")
    elif state == 'FAILED':
        print("Crawler failed.")
        break
    
    time.sleep(15) # Wait 15 seconds before checking the status again

Starting crawler 'food_products_crawler'...
Crawler is running...
Crawler is running...
Crawler is running...
Crawler is running...
Crawler is stopping...
Crawler is stopping...
Crawler is stopping...
Crawler is stopping...
Crawler is stopping...
Crawler is stopping...
Crawler finished successfully.


In [11]:
# Verify table was created from crawler
statement = "SHOW TABLES in {}".format(database_name)
df_show = pd.read_sql(statement, conn)
df_show.head(5)

  df_show = pd.read_sql(statement, conn)


Unnamed: 0,tab_name
0,aai_540_openfoodfacts


In [12]:
# Save table name
raw_table_name = "aai_540_openfoodfacts"
%store raw_table_name

Stored 'raw_table_name' (str)


## Verify Querying the Database

In [13]:
# Run sample query
statement = """SELECT * FROM {}.{} LIMIT 5""".format(
    database_name, raw_table_name
)
print(statement)
df = pd.read_sql(statement, conn)
df.head(5)

SELECT * FROM foodfacts.aai_540_openfoodfacts LIMIT 5


  df = pd.read_sql(statement, conn)


Unnamed: 0,additives_n,additives_tags,allergens_tags,brands_tags,brands,categories,categories_tags,categories_properties,checkers_tags,ciqual_food_name_tags,...,states_tags,stores_tags,stores,traces_tags,unique_scans_n,unknown_ingredients_n,unknown_nutrients_tags,vitamins_tags,with_non_nutritive_sweeteners,with_sweeteners
0,0.0,[],[en:milk],[xx:molkerei-huttenthal],Molkerei Hüttenthal,"Getränke, Milchprodukte, Fermentierte Lebensmi...","[en:beverages-and-beverages-preparations, en:b...","{'ciqual_food_code': None, 'agribalyse_food_co...",[],,...,"[en:to-be-completed, en:nutrition-facts-comple...",[l-abbate-kasefabrik],L'Abbate Käsefabrik,[],1.0,0.0,[],[],,
1,,,[],,,"Plant-based foods and beverages, Plant-based f...","[en:plant-based-foods-and-beverages, en:plant-...","{'ciqual_food_code': None, 'agribalyse_food_co...",[],,...,"[en:to-be-completed, en:nutrition-facts-to-be-...",,,[],,,[],,,
2,,,[],[coles],Coles,,,"{'ciqual_food_code': None, 'agribalyse_food_co...",[],,...,"[en:to-be-completed, en:nutrition-facts-comple...",[coles],Coles,[],,,[],,,
3,,,[],[xx:conad],Conad,"Cibi e bevande a base vegetale, Cibi a base ve...","[en:plant-based-foods-and-beverages, en:plant-...","{'ciqual_food_code': None, 'agribalyse_food_co...",[],,...,"[en:to-be-completed, en:nutrition-facts-to-be-...",,,[],,,[],,,
4,0.0,[],"[en:eggs, en:gluten, en:milk, en:nuts, en:soyb...",[xx:farmer-s-market],Farmer's Market,"Snacks, Sweet snacks, Biscuits and cakes, Past...","[en:snacks, en:sweet-snacks, en:biscuits-and-c...","{'ciqual_food_code': 23900, 'agribalyse_food_c...",[],,...,"[en:to-be-completed, en:nutrition-facts-comple...",[real-canadian-superstore],Real Canadian Superstore,"[en:nuts, en:peanuts, en:sesame-seeds]",,4.0,[],[],,


## Perform Feature Engineering

In [15]:
# Subset of columns from the raw table
statement = f"""
SELECT 
    code,
    product_name,
    brands,
    categories,
    ecoscore_grade,
    ecoscore_score,
    nutriscore_grade,
    nutriscore_score,
    nova_group,
    ingredients_n,
    known_ingredients_n,
    unknown_ingredients_n,
    with_sweeteners,
    with_non_nutritive_sweeteners,
    unique_scans_n
FROM {database_name}.{raw_table_name}
LIMIT 20
"""

df_features = pd.read_sql(statement, conn)
df_features.head(10)


  df_features = pd.read_sql(statement, conn)


Unnamed: 0,code,product_name,brands,categories,ecoscore_grade,ecoscore_score,nutriscore_grade,nutriscore_score,nova_group,ingredients_n,known_ingredients_n,unknown_ingredients_n,with_sweeteners,with_non_nutritive_sweeteners,unique_scans_n
0,3256221412017,"[{'lang': 'main', 'text': 'Moutarde forte de D...",U,"Condiments, Sauces, Moutardes, Moutardes de Di...",,,e,21.0,3.0,9.0,8.0,1.0,,,17.0
1,3256221412055,"[{'lang': 'main', 'text': 'Moutarde forte de D...",U,"Condiments, Sauces, Moutardes, Moutardes de Di...",,,e,21.0,3.0,9.0,8.0,1.0,,,9.0
2,3256221412185,"[{'lang': 'main', 'text': 'Jus de citron jaune...",U,"Aliments et boissons à base de végétaux, Boiss...",,,b,-3.0,3.0,4.0,4.0,0.0,,,2.0
3,3256221415766,"[{'lang': 'main', 'text': 'Sandwich polaire ga...",U,"Sandwichs, Sandwichs à la volaille, Sandwichs ...",,,c,6.0,4.0,47.0,44.0,3.0,,,2.0
4,3256221415827,"[{'lang': 'main', 'text': 'Crevettes sauce Tha...",U,"Plats préparés, Frais, Plats à base de riz, Pl...",,,c,4.0,4.0,35.0,30.0,5.0,,,
5,3256221416114,"[{'lang': 'main', 'text': 'Spécialité fermenté...",U,"Plant-based foods and beverages,Fermented food...",,,a,-1.0,4.0,7.0,7.0,0.0,,,13.0
6,3256221416602,"[{'lang': 'main', 'text': 'Purée de pomme sans...","U, U Bio","Aliments et boissons à base de végétaux, Alime...",,,a,-2.0,1.0,3.0,3.0,0.0,,,4.0
7,3256221419061,"[{'lang': 'main', 'text': 'Tablette de chocola...",U,"Snacks, Snacks sucrés, Cacao et dérivés, Choco...",,,e,24.0,4.0,14.0,14.0,0.0,,,1.0
8,3256221420111,"[{'lang': 'main', 'text': 'Court bouillon'}, {...",U,"Produits déshydratés, Produits lyophilisés à r...",,,unknown,,4.0,27.0,27.0,0.0,,,1.0
9,3256221421996,"[{'lang': 'main', 'text': 'Haché de thon à la ...",U,"Surgelés, Plats préparés, Frais, Plats préparé...",,,a,-2.0,4.0,30.0,26.0,4.0,,,6.0


In [16]:
# Clean and normalize feature columns

# Extract 'text' value from product_name field if it's a list of dicts.
def extract_product_name(name_field):
    if isinstance(name_field, list) and len(name_field) > 0:
        return name_field[0].get("text", None)
    return name_field

# Apply cleaning
df_features["product_name"] = df_features["product_name"].apply(extract_product_name)

# Handle missing numeric fields by filling with 0
numeric_cols = [
    "ecoscore_score", "nutriscore_score", "ingredients_n", 
    "known_ingredients_n", "unknown_ingredients_n", 
    "with_sweeteners", "with_non_nutritive_sweeteners", "unique_scans_n"
]
df_features[numeric_cols] = df_features[numeric_cols].fillna(0)

# Convert grades to uppercase for consistency
df_features["ecoscore_grade"] = df_features["ecoscore_grade"].str.upper()
df_features["nutriscore_grade"] = df_features["nutriscore_grade"].str.upper()

df_features.head(10)


  df_features[numeric_cols] = df_features[numeric_cols].fillna(0)


Unnamed: 0,code,product_name,brands,categories,ecoscore_grade,ecoscore_score,nutriscore_grade,nutriscore_score,nova_group,ingredients_n,known_ingredients_n,unknown_ingredients_n,with_sweeteners,with_non_nutritive_sweeteners,unique_scans_n
0,3256221412017,Moutarde forte de Dijon verre décoré de 195g,U,"Condiments, Sauces, Moutardes, Moutardes de Di...",,0,E,21.0,3.0,9.0,8.0,1.0,0,0,17.0
1,3256221412055,Moutarde forte de Dijon pot standard de 72 cl,U,"Condiments, Sauces, Moutardes, Moutardes de Di...",,0,E,21.0,3.0,9.0,8.0,1.0,0,0,9.0
2,3256221412185,Jus de citron jaune 2x12,U,"Aliments et boissons à base de végétaux, Boiss...",,0,B,-3.0,3.0,4.0,4.0,0.0,0,0,2.0
3,3256221415766,Sandwich polaire garni de poulet roti traité e...,U,"Sandwichs, Sandwichs à la volaille, Sandwichs ...",,0,C,6.0,4.0,47.0,44.0,3.0,0,0,2.0
4,3256221415827,Crevettes sauce Thaï et riz basmati,U,"Plats préparés, Frais, Plats à base de riz, Pl...",,0,C,4.0,4.0,35.0,30.0,5.0,0,0,0.0
5,3256221416114,Spécialité fermentée au soja nature 6x100g,U,"Plant-based foods and beverages,Fermented food...",,0,A,-1.0,4.0,7.0,7.0,0.0,0,0,13.0
6,3256221416602,Purée de pomme sans sucres ajoutés 4X100g,"U, U Bio","Aliments et boissons à base de végétaux, Alime...",,0,A,-2.0,1.0,3.0,3.0,0.0,0,0,4.0
7,3256221419061,Tablette de chocolat noir fourrage à la mousse...,U,"Snacks, Snacks sucrés, Cacao et dérivés, Choco...",,0,E,24.0,4.0,14.0,14.0,0.0,0,0,1.0
8,3256221420111,Court bouillon,U,"Produits déshydratés, Produits lyophilisés à r...",,0,UNKNOWN,0.0,4.0,27.0,27.0,0.0,0,0,1.0
9,3256221421996,Haché de thon à la Provençale,U,"Surgelés, Plats préparés, Frais, Plats préparé...",,0,A,-2.0,4.0,30.0,26.0,4.0,0,0,6.0


## Initialize the Feature Store Session

In [17]:
# Initialize SageMaker Feature Store

from sagemaker.feature_store.feature_group import FeatureGroup
from sagemaker.feature_store.feature_definition import FeatureDefinition, FeatureTypeEnum
from sagemaker.feature_store.inputs import OfflineStoreConfig, OnlineStoreConfig, DataCatalogConfig, S3StorageConfig

# Name for the feature group
feature_group_name = "Foodlense-products-feature-group"

# Create the FeatureGroup object
food_feature_group = FeatureGroup(
    name=feature_group_name,
    sagemaker_session=sess
)

print(f"Feature Group object initialized: {feature_group_name}")


Feature Group object initialized: Foodlense-products-feature-group


In [18]:
# Define the schema for the Foodlense Feature Group

import pandas as pd

# Add ingestion timestamp
df_features["event_time"] = pd.Timestamp.now().strftime("%Y-%m-%dT%H:%M:%SZ")

# Define feature defs from the dataframe
feature_definitions = [
    FeatureDefinition(feature_name="code", feature_type=FeatureTypeEnum.STRING),
    FeatureDefinition(feature_name="product_name", feature_type=FeatureTypeEnum.STRING),
    FeatureDefinition(feature_name="brands", feature_type=FeatureTypeEnum.STRING),
    FeatureDefinition(feature_name="categories", feature_type=FeatureTypeEnum.STRING),
    FeatureDefinition(feature_name="ecoscore_grade", feature_type=FeatureTypeEnum.STRING),
    FeatureDefinition(feature_name="ecoscore_score", feature_type=FeatureTypeEnum.INTEGRAL),
    FeatureDefinition(feature_name="nutriscore_grade", feature_type=FeatureTypeEnum.STRING),
    FeatureDefinition(feature_name="nutriscore_score", feature_type=FeatureTypeEnum.INTEGRAL),
    FeatureDefinition(feature_name="nova_group", feature_type=FeatureTypeEnum.INTEGRAL),
    FeatureDefinition(feature_name="ingredients_n", feature_type=FeatureTypeEnum.INTEGRAL),
    FeatureDefinition(feature_name="known_ingredients_n", feature_type=FeatureTypeEnum.INTEGRAL),
    FeatureDefinition(feature_name="unknown_ingredients_n", feature_type=FeatureTypeEnum.INTEGRAL),
    FeatureDefinition(feature_name="with_sweeteners", feature_type=FeatureTypeEnum.INTEGRAL),
    FeatureDefinition(feature_name="with_non_nutritive_sweeteners", feature_type=FeatureTypeEnum.INTEGRAL),
    FeatureDefinition(feature_name="unique_scans_n", feature_type=FeatureTypeEnum.INTEGRAL),
    FeatureDefinition(feature_name="event_time", feature_type=FeatureTypeEnum.STRING),
]

food_feature_group.feature_definitions = feature_definitions

print("Feature definitions assigned to Foodlense Feature Group.")


Feature definitions assigned to Foodlense Feature Group.


In [24]:
# Create the feature Group in SageMaker
response = food_feature_group.create(
    s3_uri=f"s3://{bucket}/feature-store/Foodlense/",
    record_identifier_name="code",
    event_time_feature_name="event_time",
    role_arn=role,
    enable_online_store=True,
    disable_glue_table_creation=False,  # keep Glue auto-create
    description="Foodlense Feature Group for OpenFoodFacts products"
)

print("Creating Foodlense Feature Group... this can take a few minutes.")


Creating Foodlense Feature Group... this can take a few minutes.


In [25]:
# Check SageMaker until the Foodlense Feature Group finishes creation.
import time

status = food_feature_group.describe().get("FeatureGroupStatus")
print("Current status:", status)

while status in ["Creating", "CreatingFailed"]:
    print("Waiting for feature group to be created...")
    time.sleep(30)  # wait 30s between checks
    status = food_feature_group.describe().get("FeatureGroupStatus")
    print("Current status:", status)

print("Final status:", status)


Current status: Created
Final status: Created


In [27]:
# Ingest Data into the Foodlense Feature Group --- TODO - amilad/Olga 