# Feature Engineering

In [1]:
# Import libraries
import boto3
import sagemaker
import pyathena
from pyathena import connect
import pandas as pd
import time
import numpy as numpy
import json
import os
from utils import feature_processing
from time import gmtime, strftime, sleep
from sagemaker.session import Session
from sagemaker.feature_store.feature_group import FeatureGroup
from sagemaker.feature_store.feature_store import FeatureStore
from sagemaker.feature_store.feature_definition import FeatureDefinition, FeatureTypeEnum
from sagemaker.feature_store.inputs import OfflineStoreConfig, OnlineStoreConfig, DataCatalogConfig, S3StorageConfig
from datetime import datetime
from sagemaker.processing import ScriptProcessor, ProcessingInput, ProcessingOutput

import warnings
# Suppress every UserWarning for the rest of the notebook.
# NOTE(review): presumably aimed at the warning pandas raises when read_sql is
# given a non-SQLAlchemy connection — confirm, since this also hides any other
# UserWarning.
warnings.filterwarnings('ignore', category=UserWarning)

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml


In [2]:
# Open the SageMaker session and gather the bucket, role and region used by later cells
sess = sagemaker.Session()
bucket = sess.default_bucket()
role = sagemaker.get_execution_role()
region = boto3.Session().region_name
print(f'Boto and Sagemaker Sessions Initialized...\nBucket: {bucket}\nRegion: {region}')

Boto and Sagemaker Sessions Initialized...
Bucket: sagemaker-us-east-1-243285667099
Region: us-east-1


In [3]:
# Athena identifiers used by the queries in this notebook.
# (The original cell assigned `database_name` twice with the same value; it is
# now defined once.)
database_name = 'foodfacts'
raw_table_name = 'aai_540_openfoodfacts'  # not referenced by the visible cells; kept for other users of the name
us_table_name = 'food_us_100k'

# S3 location passed to Athena as its staging directory
s3_staging_dir = f"s3://{bucket}/athena/staging"

# Create connection to Athena database
conn = connect(region_name=region, s3_staging_dir=s3_staging_dir)

In [4]:
# Preview a handful of rows from the US subset table
statement = f"SELECT * FROM {database_name}.{us_table_name} LIMIT 5;"
print(statement)
df_us = pd.read_sql(statement, conn)
df_us

SELECT * FROM foodfacts.food_us_100k LIMIT 5;


Unnamed: 0,code,product_name,nutriments,nova_group,additives_n,ingredients_n,nutriscore_score
0,38000524042,"[{'lang': 'main', 'text': 'Kellogg'S Treats Sq...","[{'name': 'sodium', 'value': 0.46666667, '100g...",4,4,26,22
1,876063007832,"[{'lang': 'main', 'text': 'Plant-Based Protein...","[{'name': 'fiber', 'value': 10.0, '100g': 3.03...",4,2,13,1
2,788434105187,"[{'lang': 'main', 'text': 'Bar Protéinée'}, {'...","[{'name': 'potassium', 'value': 323.0, '100g':...",4,5,21,13
3,70200551121,"[{'lang': 'main', 'text': 'Supreme Caesar'}, {...","[{'name': 'sodium', 'value': 0.26, '100g': 0.8...",4,1,27,23
4,850000223332,"[{'lang': 'main', 'text': 'Pumpkin Pie'}, {'la...","[{'name': 'fiber', 'value': 6.0, '100g': 9.09,...",4,0,13,8


In [5]:
# Print the first row's raw product name, then its raw nutrient info
for sample_col in ("product_name", "nutriments"):
    print(df_us[sample_col].iloc[0])

[{'lang': 'main', 'text': "Kellogg'S Treats Squares Original 2.13Oz"}, {'lang': 'en', 'text': "Kellogg'S Treats Squares Original 2.13Oz"}]
[{'name': 'sodium', 'value': 0.46666667, '100g': 0.46666667, 'serving': 0.28, 'unit': 'g', 'prepared_value': None, 'prepared_100g': None, 'prepared_serving': None, 'prepared_unit': None}, {'name': 'fat', 'value': 10.0, '100g': 10.0, 'serving': 6.0, 'unit': 'g', 'prepared_value': None, 'prepared_100g': None, 'prepared_serving': None, 'prepared_unit': None}, {'name': 'vitamin-b6', 'value': 0.05, '100g': '5.0E-5', 'serving': '3.0E-5', 'unit': 'mg', 'prepared_value': None, 'prepared_100g': None, 'prepared_serving': None, 'prepared_unit': None}, {'name': 'potassium', 'value': 32.0, '100g': 0.032, 'serving': 0.0192, 'unit': 'mg', 'prepared_value': None, 'prepared_100g': None, 'prepared_serving': None, 'prepared_unit': None}, {'name': 'fiber', 'value': 0.0, '100g': 0.0, 'serving': 0.0, 'unit': 'g', 'prepared_value': None, 'prepared_100g': None, 'prepared_s

In [6]:
# Process the US subset in chunks using keyset pagination on `code`: every query
# asks for the next `chunk_size` rows whose code sorts after the last one seen.
chunk_size = 10000
last_processed_code = None
all_processed_chunks = []
rows_processed = 0  # actual number of source rows read so far

print('Starting processing by chunks...')
while True:
    # Build the WHERE clause dynamically
    where_clause = ""
    if last_processed_code is not None:
        # Double any single quote so the value cannot terminate the SQL literal.
        escaped_code = str(last_processed_code).replace("'", "''")
        where_clause = f"WHERE code > '{escaped_code}'"

    # SQL query to get a chunk of data with LIMIT
    statement = f"""
    SELECT * FROM {database_name}.{us_table_name}
    {where_clause}
    ORDER BY code
    LIMIT {chunk_size};
    """

    try:
        chunk_df = pd.read_sql(statement, conn)
    except Exception as e:
        # Re-raise after logging: breaking out here would leave a partial
        # dataset that the following cells would treat as complete.
        print(f"Error reading data: {e}")
        raise

    if chunk_df.empty:
        # Stop the loop when there are no more rows to process
        break

    # Apply feature engineering function to the current chunk
    processed_chunk = feature_processing.process_data_chunk(chunk_df)

    # Add the processed chunk to our list
    all_processed_chunks.append(processed_chunk)

    # Update the tracking variable for the next query
    last_processed_code = chunk_df['code'].iloc[-1]

    # Report the real row count; multiplying chunks by chunk_size over-counts
    # whenever the final chunk is shorter than chunk_size.
    rows_processed += len(chunk_df)
    print(f"Processed {rows_processed} rows...")

    # A short chunk means the table is exhausted, so skip the extra empty query.
    if len(chunk_df) < chunk_size:
        break

print('Finished processing all chunks.')

Starting processing by chunks...
Processed 10000 rows...
Processed 20000 rows...
Processed 30000 rows...
Processed 40000 rows...
Processed 50000 rows...
Processed 60000 rows...
Processed 70000 rows...
Processed 80000 rows...
Processed 90000 rows...
Processed 100000 rows...
Finished processing all chunks.


In [7]:
# Stack the per-chunk results into a single frame with a fresh 0..n-1 index
features_df = pd.concat(all_processed_chunks, ignore_index=True)
print(f"Feature columns: {features_df.columns.tolist()}")
print(f"Shape: {features_df.shape}")
features_df.head(3)

Feature columns: ['code', 'product_name', 'nova_group', 'additives_n', 'ingredients_n', 'nutriscore_score', 'energy_100g', 'salt_100g', 'carbohydrates_100g', 'cholesterol_100g', 'sodium_100g', 'fiber_100g', 'fruits_vegetables_legumes_estimate_from_ingredients_100g', 'sugars_100g', 'saturated_fat_100g', 'trans_fat_100g', 'fat_100g', 'proteins_100g', 'fruits_vegetables_nuts_estimate_from_ingredients_100g', 'energy_kcal_100g', 'nova_group_100g', 'nutrition_score_fr_100g', 'energy_kj_100g', 'potassium_100g', 'polyunsaturated_fat_100g', 'monounsaturated_fat_100g', 'vitamin_a_100g', 'vitamin_d_100g', 'magnesium_100g', 'vitamin_e_100g', 'caffeine_100g', 'selenium_100g', 'vitamin_b12_100g', 'iron_100g', 'zinc_100g', 'phosphorus_100g', 'choline_100g', 'vitamin_b6_100g', 'vitamin_k_100g', 'starch_100g', 'vitamin_b2_100g', 'vitamin_b1_100g', 'vitamin_b9_100g', 'copper_100g', 'calcium_100g', 'vitamin_c_100g', 'manganese_100g', 'added_sugars_100g', 'vitamin_pp_100g', 'polyols_100g', 'folates_100g',

Unnamed: 0,code,product_name,nova_group,additives_n,ingredients_n,nutriscore_score,energy_100g,salt_100g,carbohydrates_100g,cholesterol_100g,...,energy_g_100g,iro_100g,salt_equivalent_100g,vitamina_d3_100g,fr_matiere_minerales_100g,es_inulina_100g,es_isomaltitol_100g,es_maltitol_100g,beta_carotene_100g,inositol_100g
0,417,Owmy,4,2,11,0,15.0,0.006927,0.083126,0.002078,...,,,,,,,,,,
1,749,Cream soup air corners,3,0,6,23,1980.0,775.0,66.0,,...,,,,,,,,,,
2,105000417,Lagg's,1,0,6,0,0.0,0.0,0.0,,...,,,,,,,,,,


In [8]:
#  Quick data quality view
# Columns whose share of NaN values exceeds this threshold are dropped.
MAX_NULL_RATE = 0.50

null_rates = features_df.isna().mean().sort_values(ascending=False)
print("Null rate (top 20):")
print(null_rates.head(20))

# Drop columns that are mostly missing (> MAX_NULL_RATE NaN)
mostly_missing = null_rates[null_rates > MAX_NULL_RATE].index.tolist()

if mostly_missing:
    print("Dropping:", mostly_missing)
    print(f"Dropping a total of {len(mostly_missing)} columns")
    # Rebind rather than mutate with inplace=True: same resulting frame, but
    # the previously displayed object is not modified behind the reader's back.
    features_df = features_df.drop(columns=mostly_missing)

# Verify the changes
print("Number of columns after dropping:", len(features_df.columns))
features_df.head(3)

Null rate (top 20):
inositol_100g                              0.99999
lauric_acid_100g                           0.99999
palmitic_acid_100g                         0.99999
en_pottasium_100g                          0.99999
en_kalzium_100g                            0.99999
en_eisen_100g                              0.99999
en_vitamin_d_d3_cholecalciferol_100g       0.99999
es_acide_alpha_linolenique_omega_3_100g    0.99999
silica_100g                                0.99999
fr_citicoline_100g                         0.99999
fr_malic_acid_100g                         0.99999
fr_n_acetyl_l_tyrosine_100g                0.99999
fr_l_phenylalanine_100g                    0.99999
fr_glucuronic_acid_100g                    0.99999
protein_100g                               0.99999
omega_9_fat_100g                           0.99999
fr_0_100g                                  0.99999
es_maltitol_100g                           0.99999
es_isomaltitol_100g                        0.99999
gamma_linol

Unnamed: 0,code,product_name,nova_group,additives_n,ingredients_n,nutriscore_score,energy_100g,salt_100g,carbohydrates_100g,cholesterol_100g,...,fat_100g,proteins_100g,fruits_vegetables_nuts_estimate_from_ingredients_100g,energy_kcal_100g,nova_group_100g,nutrition_score_fr_100g,vitamin_a_100g,iron_100g,calcium_100g,vitamin_c_100g
0,417,Owmy,4,2,11,0,15.0,0.006927,0.083126,0.002078,...,0.083126,0.665004,0.0,3.602106,4.0,0.0,,,,
1,749,Cream soup air corners,3,0,6,23,1980.0,775.0,66.0,,...,19.8,4.6,0.0,474.0,3.0,23.0,,,,
2,105000417,Lagg's,1,0,6,0,0.0,0.0,0.0,,...,0.0,0.0,0.0,0.0,1.0,0.0,,,,


In [9]:
# Show every column that survived the null-rate filter
print(f"Feature columns: {features_df.columns.tolist()}")

Feature columns: ['code', 'product_name', 'nova_group', 'additives_n', 'ingredients_n', 'nutriscore_score', 'energy_100g', 'salt_100g', 'carbohydrates_100g', 'cholesterol_100g', 'sodium_100g', 'fiber_100g', 'fruits_vegetables_legumes_estimate_from_ingredients_100g', 'sugars_100g', 'saturated_fat_100g', 'trans_fat_100g', 'fat_100g', 'proteins_100g', 'fruits_vegetables_nuts_estimate_from_ingredients_100g', 'energy_kcal_100g', 'nova_group_100g', 'nutrition_score_fr_100g', 'vitamin_a_100g', 'iron_100g', 'calcium_100g', 'vitamin_c_100g']


In [10]:
# Flag each column that contains at least one NaN, then keep the flagged names
columns_with_nan_series = features_df.isnull().any()
columns_with_nan = [
    col_name for col_name, has_nan in columns_with_nan_series.items() if has_nan
]
print(f"Columns with NaN values: {columns_with_nan}")

Columns with NaN values: ['product_name', 'energy_100g', 'salt_100g', 'carbohydrates_100g', 'cholesterol_100g', 'sodium_100g', 'fiber_100g', 'fruits_vegetables_legumes_estimate_from_ingredients_100g', 'sugars_100g', 'saturated_fat_100g', 'trans_fat_100g', 'fat_100g', 'proteins_100g', 'fruits_vegetables_nuts_estimate_from_ingredients_100g', 'energy_kcal_100g', 'vitamin_a_100g', 'iron_100g', 'calcium_100g', 'vitamin_c_100g']


In [11]:
# Missing product names get a fixed placeholder string
features_df['product_name'] = features_df['product_name'].fillna('nan_product_name')

# Core nutrient columns: fill gaps with each column's own median
median_impute_cols = [
    'energy_100g', 'sodium_100g', 'proteins_100g',
    'salt_100g', 'carbohydrates_100g', 'energy_kcal_100g',
    'sugars_100g', 'fat_100g', 'saturated_fat_100g',
    'fiber_100g', 'cholesterol_100g', 'calcium_100g',
    'iron_100g', 'vitamin_c_100g', 'vitamin_a_100g'
]
# One median per column; fillna matches the Series labels to column names.
column_medians = features_df[median_impute_cols].median()
features_df[median_impute_cols] = features_df[median_impute_cols].fillna(column_medians)

# Remaining columns: a missing value is treated as "not present", i.e. 0
zero_impute_cols = [
    'trans_fat_100g',
    'fruits_vegetables_legumes_estimate_from_ingredients_100g',
    'fruits_vegetables_nuts_estimate_from_ingredients_100g'
]
features_df[zero_impute_cols] = features_df[zero_impute_cols].fillna(0)

# Confirm nothing is left unfilled
columns_with_nan_series = features_df.isnull().any()
columns_with_nan = [
    col_name for col_name, has_nan in columns_with_nan_series.items() if has_nan
]
print(f"Columns with NaN values: {columns_with_nan}")

Columns with NaN values: []


In [12]:
# Lift the column display limit so the preview shows every feature
pd.options.display.max_columns = None
features_df.head()

Unnamed: 0,code,product_name,nova_group,additives_n,ingredients_n,nutriscore_score,energy_100g,salt_100g,carbohydrates_100g,cholesterol_100g,sodium_100g,fiber_100g,fruits_vegetables_legumes_estimate_from_ingredients_100g,sugars_100g,saturated_fat_100g,trans_fat_100g,fat_100g,proteins_100g,fruits_vegetables_nuts_estimate_from_ingredients_100g,energy_kcal_100g,nova_group_100g,nutrition_score_fr_100g,vitamin_a_100g,iron_100g,calcium_100g,vitamin_c_100g
0,417,Owmy,4,2,11,0,15.0,0.006927,0.083126,0.002078,0.002771,0.027709,0.0,0.055417,0.027709,0.083126,0.083126,0.665004,0.0,3.602106,4.0,0.0,0.0,0.00071,0.0254,0.0
1,749,Cream soup air corners,3,0,6,23,1980.0,775.0,66.0,0.0,310.0,6.0,0.0,5.0,0.0,0.0,19.8,4.6,0.0,474.0,3.0,23.0,0.0,0.00071,0.0254,0.0
2,105000417,Lagg's,1,0,6,0,0.0,0.0,0.0,0.0,0.0,1.4,0.0,6.67,1.79,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.00071,0.0254,0.0
3,111048403,100% Pure Canola Oil,2,0,2,2,3586.0,0.0,0.0,0.0,0.0,1.4,0.0,6.67,7.14,0.0,100.0,0.0,0.0,857.0,2.0,2.0,0.0,0.00071,0.0254,0.0
4,111301201,Canola Harvest® Original Vegetable Oil Spread Tub,4,4,17,40,19200.0,12.7,0.0,0.0,5.1,0.0,0.0,0.0,76.4,0.0,510.0,0.0,53.333332,4590.0,4.0,40.0,7.65e-09,0.00071,0.0254,0.0


## Create Feature Store and Feature Group

In [13]:
# Build a SageMaker Session wired to explicit SageMaker and Feature Store runtime clients
region = boto3.Session().region_name
boto_session = boto3.Session(region_name=region)

sagemaker_client = boto_session.client("sagemaker", region_name=region)
featurestore_runtime = boto_session.client(
    "sagemaker-featurestore-runtime", region_name=region
)

feature_store_session = Session(
    sagemaker_featurestore_runtime_client=featurestore_runtime,
    sagemaker_client=sagemaker_client,
    boto_session=boto_session,
)

In [14]:
# Name the feature group with a UTC day-hour-minute-second suffix
creation_stamp = strftime("%d-%H-%M-%S", gmtime())
feature_group_name = f"foodlens-products-feature-group-{creation_stamp}"

# Local FeatureGroup handle bound to the Feature Store session
food_feature_group = FeatureGroup(
    sagemaker_session=feature_store_session,
    name=feature_group_name,
)

print(f"Feature Group object initialized: {feature_group_name}")

Feature Group object initialized: foodlens-products-feature-group-11-21-18-59


In [15]:
# View feature data types: the dtype pandas currently holds for each column of
# features_df (the cell output lists one dtype per column).
features_df.dtypes

code                                                         object
product_name                                                 object
nova_group                                                    int64
additives_n                                                   int64
ingredients_n                                                 int64
nutriscore_score                                              int64
energy_100g                                                 float64
salt_100g                                                   float64
carbohydrates_100g                                          float64
cholesterol_100g                                            float64
sodium_100g                                                 float64
fiber_100g                                                  float64
fruits_vegetables_legumes_estimate_from_ingredients_100g    float64
sugars_100g                                                 float64
saturated_fat_100g                              

In [16]:
# Define feature group schema on a copy so features_df itself is left untouched
food_features_df = features_df.copy()

# Cast 'object' columns to pandas' string dtype.
# NOTE(review): presumably required so load_feature_definitions maps them to
# the String feature type — confirm against the SDK docs.
for text_col in ('code', 'product_name'):
    food_features_df[text_col] = food_features_df[text_col].astype('str').astype('string')

# Add ingestion timestamp (whole seconds since the epoch, stored as float64).
# A scalar is broadcast to every row whatever the frame's index is; building a
# default-indexed Series would align on labels and leave NaN for any row whose
# index label is not in 0..n-1.
current_time_sec = int(round(time.time()))
food_features_df['EventTime'] = float(current_time_sec)

# Load feature definitions to the feature group
food_feature_group.load_feature_definitions(data_frame=food_features_df)

print("Feature definitions assigned to Foodlens Feature Group.")

Feature definitions assigned to Foodlens Feature Group.


In [17]:
# View feature data types of the frame used for the schema: this is the copy on
# which `code` and `product_name` were cast to the pandas string dtype and the
# EventTime column was added.
food_features_df.dtypes

code                                                        string[python]
product_name                                                string[python]
nova_group                                                           int64
additives_n                                                          int64
ingredients_n                                                        int64
nutriscore_score                                                     int64
energy_100g                                                        float64
salt_100g                                                          float64
carbohydrates_100g                                                 float64
cholesterol_100g                                                   float64
sodium_100g                                                        float64
fiber_100g                                                         float64
fruits_vegetables_legumes_estimate_from_ingredients_100g           float64
sugars_100g              

In [18]:
# Create the feature group in sagemaker featurestore
def wait_for_feature_group_creation_complete(feature_group, poll_interval_sec=5):
    """Block until the feature group is no longer in the ``Creating`` status.

    Args:
        feature_group: Object exposing ``describe()`` (returning a mapping with
            a ``FeatureGroupStatus`` key) and a ``name`` attribute.
        poll_interval_sec: Seconds to sleep between ``describe()`` calls.
            Defaults to 5, the interval this function always used.

    Raises:
        RuntimeError: If the final status is anything other than ``Created``.
            The message includes that status and the ``FailureReason`` from the
            last description when one is present.
    """
    description = feature_group.describe()
    status = description.get("FeatureGroupStatus")
    while status == "Creating":
        print("Waiting for Feature Group Creation")
        time.sleep(poll_interval_sec)
        description = feature_group.describe()
        status = description.get("FeatureGroupStatus")
    if status != "Created":
        # Surface why creation failed instead of only saying that it did.
        reason = description.get("FailureReason", "no failure reason reported")
        raise RuntimeError(
            f"Failed to create feature group {feature_group.name} "
            f"(status: {status}; reason: {reason})"
        )
    print(f"FeatureGroup {feature_group.name} successfully created.")

# Settings for the create request, gathered in one place before the call
create_kwargs = {
    "s3_uri": f"s3://{bucket}/feature-store/Foodlens/",
    "record_identifier_name": "code",
    "event_time_feature_name": "EventTime",
    "role_arn": role,
    "enable_online_store": True,
    "disable_glue_table_creation": False,  # keep Glue auto-create
    "description": "Foodlens Feature Group for OpenFoodFacts Products",
}
food_feature_group.create(**create_kwargs)

# Creation is asynchronous, so poll until it has finished
print("Creating Foodlens Feature Group... this can take a minute.")
wait_for_feature_group_creation_complete(feature_group=food_feature_group)

Creating Foodlens Feature Group... this can take a minute.
Waiting for Feature Group Creation
Waiting for Feature Group Creation
Waiting for Feature Group Creation
Waiting for Feature Group Creation
Waiting for Feature Group Creation
Waiting for Feature Group Creation
Waiting for Feature Group Creation
Waiting for Feature Group Creation
FeatureGroup foodlens-products-feature-group-11-21-18-59 successfully created.


In [19]:
# View feature group details.
# The description is kept in `food_feature_group_desc` because a later cell
# reads the offline-store table name from it; ending the cell with the bare
# name displays the whole response.
food_feature_group_desc = food_feature_group.describe()
food_feature_group_desc

{'FeatureGroupArn': 'arn:aws:sagemaker:us-east-1:243285667099:feature-group/foodlens-products-feature-group-11-21-18-59',
 'FeatureGroupName': 'foodlens-products-feature-group-11-21-18-59',
 'RecordIdentifierFeatureName': 'code',
 'EventTimeFeatureName': 'EventTime',
 'FeatureDefinitions': [{'FeatureName': 'code', 'FeatureType': 'String'},
  {'FeatureName': 'product_name', 'FeatureType': 'String'},
  {'FeatureName': 'nova_group', 'FeatureType': 'Integral'},
  {'FeatureName': 'additives_n', 'FeatureType': 'Integral'},
  {'FeatureName': 'ingredients_n', 'FeatureType': 'Integral'},
  {'FeatureName': 'nutriscore_score', 'FeatureType': 'Integral'},
  {'FeatureName': 'energy_100g', 'FeatureType': 'Fractional'},
  {'FeatureName': 'salt_100g', 'FeatureType': 'Fractional'},
  {'FeatureName': 'carbohydrates_100g', 'FeatureType': 'Fractional'},
  {'FeatureName': 'cholesterol_100g', 'FeatureType': 'Fractional'},
  {'FeatureName': 'sodium_100g', 'FeatureType': 'Fractional'},
  {'FeatureName': 'fibe

## Ingest Data into the Feature Group

In [20]:
# S3 prefixes and a timestamped Glue job name for this ingestion run
prefix_processed_data_s3 = 'feature-store-project/processed-data'
prefix_glue_script_s3 = 'feature-store-project/glue-scripts'
glue_job_name = datetime.now().strftime("FeatureStore-Ingestion-Job-%Y-%m-%d-%H-%M")

# Work out where the data and the Glue script will live
print("Saving processed DataFrame to S3...")
current_time_str = datetime.now().strftime("%Y%m%d%H%M%S")
data_s3_uri = f's3://{bucket}/{prefix_processed_data_s3}/{current_time_str}/'
glue_script_file_name = 'ingest_script.py'
script_s3_key = f'{prefix_glue_script_s3}/{glue_script_file_name}'
script_s3_uri = f's3://{bucket}/{script_s3_key}'
local_ingest_script_path = 'utils/ingest_script.py'

# Write the feature frame to S3 as Parquet
food_features_df.to_parquet(data_s3_uri)
print(f"Data saved to: {data_s3_uri}")

# Upload the local Glue script under the key computed above
s3_bucket = boto3.Session().resource('s3').Bucket(bucket)
s3_bucket.Object(script_s3_key).upload_file(local_ingest_script_path)
print(f"Using Glue Script from: {script_s3_uri}")

Saving processed DataFrame to S3...
Data saved to: s3://sagemaker-us-east-1-243285667099/feature-store-project/processed-data/20251011211941/
Using Glue Script from: s3://sagemaker-us-east-1-243285667099/feature-store-project/glue-scripts/ingest_script.py


In [27]:
# Submit the AWS Glue Job, then poll it until it reaches a terminal state.
# Any failure (API error or unsuccessful job run) is reported and then raised,
# so the confirmation cells below never run against a failed ingestion.

# Job run states that mean "not finished yet". STOPPING is included because it
# is a transition towards STOPPED, not a final state.
GLUE_ACTIVE_STATES = {'STARTING', 'RUNNING', 'STOPPING', 'WAITING'}
GLUE_POLL_SECONDS = 30

print("Submitting ingestion job to AWS Glue...")
glue_client = boto3.client('glue', region_name=region)

try:
    # First, ensure the Glue Job definition exists (run this once)
    try:
        glue_client.get_job(JobName=glue_job_name)
    except glue_client.exceptions.EntityNotFoundException:
        print(f"    Creating new Glue Job definition: {glue_job_name}")
        glue_client.create_job(
            Name=glue_job_name,
            Role=role,
            Command={
                'Name': 'glueetl',
                'ScriptLocation': script_s3_uri,
                'PythonVersion': '3'
            },
            WorkerType='G.1X',
            NumberOfWorkers=5,
            GlueVersion='4.0',
            MaxRetries=0
        )

    # Start the job run with parameters
    response = glue_client.start_job_run(
        JobName=glue_job_name,
        Arguments={
            '--s3_data_path': data_s3_uri,
            '--feature_group_name': feature_group_name,
            '--sagemaker_role_arn': role,
            '--region': region,
            '--additional-python-modules': 'numpy==1.26.4,sagemaker==2.214.0'
        }
    )
except Exception as e:
    # This handler covers submission only, so the message is accurate.
    print(f"\nERROR submitting Glue job: {e}")
    raise

job_run_id = response['JobRunId']
print(f"SUCCESS: Glue job started! Run ID: {job_run_id}")

# Monitoring Loop
print(f"Monitoring job progress (Checking status every {GLUE_POLL_SECONDS} seconds)...")

while True:
    status_response = glue_client.get_job_run(JobName=glue_job_name, RunId=job_run_id)
    job_status = status_response['JobRun']['JobRunState']

    print(f"    Current Status: {job_status}")

    # Any state outside the active set is terminal (SUCCEEDED, FAILED, ...)
    if job_status not in GLUE_ACTIVE_STATES:
        break

    print(f"    Waiting {GLUE_POLL_SECONDS} seconds...")
    time.sleep(GLUE_POLL_SECONDS)
# End of Monitoring Loop

if job_status == 'SUCCEEDED':
    print(f"\nJob SUCCEEDED! Data successfully ingested into {feature_group_name}.")
else:
    # Every other terminal state is a failure, including ERROR and EXPIRED,
    # which previously ended the cell without any message.
    error_message = status_response['JobRun'].get('ErrorMessage', 'No specific error message.')
    print(f"\nJob FAILED/STOPPED with status: {job_status}. Error: {error_message}")
    raise RuntimeError(
        f"Glue job run {job_run_id} ended with status {job_status}: {error_message}"
    )

Submitting ingestion job to AWS Glue...
SUCCESS: Glue job started! Run ID: jr_b70066cb9c2005055755a1b61f7e17a052b968b2d681899f77edecbb24ce9cfe
Monitoring job progress (Checking status every 30 seconds)...
    Current Status: RUNNING
    Waiting 30 seconds...
    Current Status: RUNNING
    Waiting 30 seconds...
    Current Status: RUNNING
    Waiting 30 seconds...
    Current Status: RUNNING
    Waiting 30 seconds...
    Current Status: RUNNING
    Waiting 30 seconds...
    Current Status: RUNNING
    Waiting 30 seconds...
    Current Status: RUNNING
    Waiting 30 seconds...
    Current Status: RUNNING
    Waiting 30 seconds...
    Current Status: RUNNING
    Waiting 30 seconds...
    Current Status: SUCCEEDED

Job SUCCEEDED! Data successfully ingested into foodlens-products-feature-group-11-21-18-59.


## Confirm Data Ingestion

In [28]:
# Look up one known product in the online store to confirm ingestion
record_identifier_value = '00001252' # Cottage Cheese

# Call the get_record API and display the full response
sample_record = featurestore_runtime.get_record(
    RecordIdentifierValueAsString=record_identifier_value,
    FeatureGroupName=feature_group_name,
)
sample_record

{'ResponseMetadata': {'RequestId': 'c0712faf-35ed-4715-b1ab-83d8c431d5c7',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'c0712faf-35ed-4715-b1ab-83d8c431d5c7',
   'content-type': 'application/json',
   'content-length': '2317',
   'date': 'Sat, 11 Oct 2025 21:32:21 GMT'},
  'RetryAttempts': 0},
 'Record': [{'FeatureName': 'code', 'ValueAsString': '00001252'},
  {'FeatureName': 'product_name', 'ValueAsString': 'Cottage cheese'},
  {'FeatureName': 'nova_group', 'ValueAsString': '3'},
  {'FeatureName': 'additives_n', 'ValueAsString': '0'},
  {'FeatureName': 'ingredients_n', 'ValueAsString': '3'},
  {'FeatureName': 'nutriscore_score', 'ValueAsString': '4'},
  {'FeatureName': 'energy_100g', 'ValueAsString': '407.0'},
  {'FeatureName': 'salt_100g', 'ValueAsString': '0.86283183'},
  {'FeatureName': 'carbohydrates_100g', 'ValueAsString': '4.424779'},
  {'FeatureName': 'cholesterol_100g', 'ValueAsString': '0.017699115'},
  {'FeatureName': 'sodium_100g', 'ValueAsString': '0.345

In [31]:
# Count the rows that reached the feature group's offline store table.
# The offline store can lag by a few minutes, so the count may be low at first.
feature_database_name = 'sagemaker_featurestore'
offline_catalog = food_feature_group_desc['OfflineStoreConfig']['DataCatalogConfig']
feature_group_table_name = offline_catalog['TableName']

statement = (
    'SELECT COUNT(*)\n'
    f'FROM "{feature_database_name}"."{feature_group_table_name}" '
)

print(statement)
total_record_count = pd.read_sql(statement, conn)
total_record_count

SELECT COUNT(*)
FROM "sagemaker_featurestore"."foodlens_products_feature_group_11_21_18_59_1760217539" 


Unnamed: 0,_col0
0,100000
