# Feature Engineering

In [1]:
# Import libraries
import boto3
import sagemaker
from pyathena import connect
import pandas as pd
import time

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml


In [2]:
# Setup boto and sagemaker session
sess = sagemaker.Session()
bucket = sess.default_bucket()
role = sagemaker.get_execution_role()
region = boto3.Session().region_name
print('Boto and Sagemaker Sessions Initialized...\nBucket: {0}\nRegion: {1}'.format(bucket, region))

Boto and Sagemaker Sessions Initialized...
Bucket: sagemaker-us-east-1-654654351234
Region: us-east-1


In [3]:
database_name = 'foodfacts'
raw_table_name = "aai_540_openfoodfacts"

# Initialize our database name
database_name = 'foodfacts'

# Set S3 staging directory
s3_staging_dir = "s3://{0}/athena/staging".format(bucket)

# Create connection to Athena database
conn = connect(region_name=region, s3_staging_dir=s3_staging_dir)

In [6]:
# Explore count of food products sold in United States
statement = """SELECT count(*) FROM {}.{} WHERE contains(countries_tags, 'en:united-states');""".format(
    database_name, raw_table_name
)
print(statement)
df_us = pd.read_sql(statement, conn)
df_us

SELECT count(*) FROM foodfacts.aai_540_openfoodfacts WHERE contains(countries_tags, 'en:united-states');


  df_us = pd.read_sql(statement, conn)


Unnamed: 0,_col0
0,804173


In [7]:
print(df_us.columns.tolist())

['_col0']


In [8]:
# Select columns

target_col = 'nutriscore_score'

select_cols = [
    'product_name',
    'nutriments',
    'nova_group',
    'additives_n',
    'ingredients_n',
    target_col]



In [9]:
# Filter down dataset to food products in United States with select raw columns
statement = """SELECT 
    product_name,
    nutriments,
    nova_group,
    additives_n,
    ingredients_n,
    nutriscore_score FROM {}.{} WHERE contains(countries_tags, 'en:united-states') LIMIT 5""".format(
    database_name, raw_table_name
)
print(statement)
df_us = pd.read_sql(statement, conn)
df_us

SELECT 
    product_name,
    nutriments,
    nova_group,
    additives_n,
    ingredients_n,
    nutriscore_score FROM foodfacts.aai_540_openfoodfacts WHERE contains(countries_tags, 'en:united-states') LIMIT 5


  df_us = pd.read_sql(statement, conn)


Unnamed: 0,product_name,nutriments,nova_group,additives_n,ingredients_n,nutriscore_score
0,"[{'lang': 'main', 'text': 'Sopressata'}, {'lan...","[{'name': 'carbohydrates', 'value': 3.57, '100...",,0,7,
1,"[{'lang': 'main', 'text': 'Pork rinds chicharr...","[{'name': 'salt', 'value': 6250.0, '100g': 6.2...",4.0,8,17,37.0
2,"[{'lang': 'main', 'text': 'Lime-ade juice'}, {...","[{'name': 'salt', 'value': 0.0, '100g': 0.0, '...",3.0,1,7,0.0
3,"[{'lang': 'main', 'text': 'Penne whole wheat m...","[{'name': 'proteins', 'value': 12.5, '100g': 1...",1.0,0,1,-5.0
4,"[{'lang': 'main', 'text': 'Dulce de leche iced...","[{'name': 'energy-kcal', 'value': 455.0, '100g...",4.0,12,32,30.0


In [10]:
df_us.loc[2]['product_name']

[{'lang': 'main', 'text': 'Lime-ade juice'},
 {'lang': 'en', 'text': 'Lime-ade juice'}]

In [11]:
df_us.loc[2]['nutriments']

[{'name': 'salt',
  'value': 0.0,
  '100g': 0.0,
  'serving': 0.0,
  'unit': 'mg',
  'prepared_value': None,
  'prepared_100g': None,
  'prepared_serving': None,
  'prepared_unit': None},
 {'name': 'vitamin-c',
  'value': 29.0,
  '100g': 0.029,
  'serving': 0.06,
  'unit': 'mg',
  'prepared_value': None,
  'prepared_100g': None,
  'prepared_serving': None,
  'prepared_unit': None},
 {'name': 'fat',
  'value': 0.0,
  '100g': 0.0,
  'serving': 0.0,
  'unit': 'g',
  'prepared_value': None,
  'prepared_100g': None,
  'prepared_serving': None,
  'prepared_unit': None},
 {'name': 'proteins',
  'value': 0.0,
  '100g': 0.0,
  'serving': 0.0,
  'unit': 'g',
  'prepared_value': None,
  'prepared_100g': None,
  'prepared_serving': None,
  'prepared_unit': None},
 {'name': 'fruits-vegetables-legumes-estimate-from-ingredients',
  'value': None,
  '100g': 24.107143,
  'serving': 24.107143,
  'unit': None,
  'prepared_value': None,
  'prepared_100g': None,
  'prepared_serving': None,
  'prepared_unit

In [None]:
# Parse nutriments column to extract features per 100g

# For example: carbohydrates_100g
# "Negative" Components: energy, saturated fats, sugars, and salt/sodium.
# "Positive" Components: proteins, fiber, and the percentage of fruits, vegetables, and nuts. ...

In [None]:
# Continue with feature store and feature groups ...

In [14]:

#Pull a bigger slice (adjust LIMIT as needed)
statement = f"""
SELECT 
    product_name,
    nutriments,
    nova_group,
    additives_n,
    ingredients_n,
    nutriscore_score
FROM {database_name}.{raw_table_name}
WHERE contains(countries_tags, 'en:united-states')
  AND nutriscore_score IS NOT NULL
LIMIT 1000 
"""
print(statement)
df = pd.read_sql(statement, conn)
print(f"Fetched {len(df):,} rows")


SELECT 
    product_name,
    nutriments,
    nova_group,
    additives_n,
    ingredients_n,
    nutriscore_score
FROM foodfacts.aai_540_openfoodfacts
WHERE contains(countries_tags, 'en:united-states')
  AND nutriscore_score IS NOT NULL
LIMIT 1000 



  df = pd.read_sql(statement, conn)


Fetched 1,000 rows


In [19]:
print(df["nutriments"].iloc[0])
print(df["nutriments"].iloc[1])
print(df["nutriments"].iloc[2])

[{'name': 'salt', 'value': 6250.0, '100g': 6.25, 'serving': 0.875, 'unit': 'mg', 'prepared_value': None, 'prepared_100g': None, 'prepared_serving': None, 'prepared_unit': None}, {'name': 'energy-kcal', 'value': 571.0, '100g': 571.0, 'serving': 79.9, 'unit': 'kcal', 'prepared_value': None, 'prepared_100g': None, 'prepared_serving': None, 'prepared_unit': None}, {'name': 'trans-fat', 'value': 0.0, '100g': 0.0, 'serving': 0.0, 'unit': 'g', 'prepared_value': None, 'prepared_100g': None, 'prepared_serving': None, 'prepared_unit': None}, {'name': 'saturated-fat', 'value': 14.29, '100g': 14.29, 'serving': 2.0, 'unit': 'g', 'prepared_value': None, 'prepared_100g': None, 'prepared_serving': None, 'prepared_unit': None}, {'name': 'carbohydrates', 'value': 0.0, '100g': 0.0, 'serving': 0.0, 'unit': 'g', 'prepared_value': None, 'prepared_100g': None, 'prepared_serving': None, 'prepared_unit': None}, {'name': 'fat', 'value': 42.86, '100g': 42.86, 'serving': 6.0, 'unit': 'g', 'prepared_value': None, 

In [30]:
statement = f"""
SELECT 
    product_name,
    nova_group,
    nutriscore_score,
    nutriscore_grade,
    ecoscore_score,
    ecoscore_grade,
    categories,
    categories_tags,
    labels,
    additives_tags
FROM {database_name}.{raw_table_name}
WHERE contains(countries_tags, 'en:united-states')
LIMIT 1000
"""
print(statement)

df_groups = pd.read_sql(statement, conn)
print(f"Fetched {len(df_groups):,} rows")
df_groups.head()


SELECT 
    product_name,
    nova_group,
    nutriscore_score,
    nutriscore_grade,
    ecoscore_score,
    ecoscore_grade,
    categories,
    categories_tags,
    labels,
    additives_tags
FROM foodfacts.aai_540_openfoodfacts
WHERE contains(countries_tags, 'en:united-states')
LIMIT 1000



  df_groups = pd.read_sql(statement, conn)


Fetched 1,000 rows


Unnamed: 0,product_name,nova_group,nutriscore_score,nutriscore_grade,ecoscore_score,ecoscore_grade,categories,categories_tags,labels,additives_tags
0,"[{'lang': 'main', 'text': 'Sixlets'}, {'lang':...",4.0,,unknown,,unknown,,,,"[en:e129, en:e133, en:e1400, en:e171, en:e322,..."
1,"[{'lang': 'main', 'text': 'Sixlets'}, {'lang':...",4.0,32.0,e,,unknown,"Snacks, Sweet snacks, Cocoa and its products, ...","[en:snacks, en:sweet-snacks, en:cocoa-and-its-...",,"[en:e129, en:e133, en:e1400, en:e171, en:e322,..."
2,"[{'lang': 'main', 'text': 'Candy Coated Chocol...",4.0,31.0,e,,unknown,"Snacks, Sweet snacks, Cocoa and its products, ...","[en:snacks, en:sweet-snacks, en:cocoa-and-its-...",,"[en:e129, en:e133, en:e1400, en:e171, en:e322,..."
3,"[{'lang': 'main', 'text': 'Chocolatey candies'...",4.0,32.0,e,,unknown,Baking decorations,[en:baking-decorations],,"[en:e102, en:e129, en:e1400, en:e171, en:e322,..."
4,"[{'lang': 'main', 'text': 'Shimmer Turquoise G...",4.0,18.0,d,,unknown,"Snacks, Sweet snacks, Confectioneries","[en:snacks, en:sweet-snacks, en:confectioneries]",,"[en:e102, en:e133, en:e1400, en:e171, en:e321,..."


In [20]:
def parse_nutriments(entry):
    """
    Convert a list of nutrient dicts into a flat dict {nutrient_100g: value}.
    """
    if entry is None or (isinstance(entry, float) and np.isnan(entry)):
        return {}
    if isinstance(entry, str):
        try:
            entry = json.loads(entry)
        except Exception:
            return {}

    out = {}
    if isinstance(entry, list):  # case you have
        for d in entry:
            name = d.get("name")
            val = d.get("100g")
            if name and val is not None:
                # normalize name → column-friendly format
                col = name.replace("-", "_") + "_100g"
                out[col] = val
    elif isinstance(entry, dict):  # fallback for flat dicts
        for k, v in entry.items():
            out[k] = v
    return out

# Apply parsing
nutri_dicts = df["nutriments"].apply(parse_nutriments)
nutri_df = pd.DataFrame(nutri_dicts.tolist())

# Convert to numeric
for c in nutri_df.columns:
    nutri_df[c] = pd.to_numeric(nutri_df[c], errors="coerce")

nutri_df.head(3)



Unnamed: 0,salt_100g,energy_kcal_100g,trans_fat_100g,saturated_fat_100g,carbohydrates_100g,fat_100g,sodium_100g,fruits_vegetables_nuts_estimate_from_ingredients_100g,cholesterol_100g,energy_100g,...,pantothenic_acid_100g,phylloquinone_100g,caffeine_100g,vitamin_e_100g,starch_100g,selenium_100g,choline_100g,vitamin_k_100g,iodine_100g,energy_kj_100g
0,6.25,571.0,0.0,14.29,0.0,42.86,2.5,0.0,0.107,2389.0,...,,,,,,,,,,
1,0.0,4.0,,,0.97,0.0,0.0,24.107143,,17.0,...,,,,,,,,,,
2,0.0,357.0,0.0,0.0,73.21,2.68,0.0,0.0,0.0,1494.0,...,,,,,,,,,,


In [35]:

# Convert grabbed data to numeric 
for c in nutri_df.columns:
    nutri_df[c] = pd.to_numeric(nutri_df[c], errors="coerce")

# If sodium is present but salt missing, estimate salt (g) = sodium (g) * 2.5
if "sodium_100g" in nutri_df and "salt_100g" in nutri_df:
    need_salt = nutri_df["salt_100g"].isna() & nutri_df["sodium_100g"].notna()
    nutri_df.loc[need_salt, "salt_100g"] = nutri_df.loc[need_salt, "sodium_100g"] * 2.5

# If both energy_kcal and energy_kJ exist, prefer kcal; if only kJ is present:
# kcal = kJ / 4.184 
nutri_df.rename(columns={"energy-kcal_100g": "energy_kcal_100g"}, inplace=True)

In [36]:
# Assemble the final feature table 
id_cols = ["product_name"]
raw_meta_cols = ["nova_group", "additives_n", "ingredients_n"]
target_col = "nutriscore_score"  # <-- your target (do NOT include nutriscore_grade to avoid leakage)

features_df = pd.concat(
    [
        df[id_cols + raw_meta_cols + [target_col]].reset_index(drop=True),
        nutri_df.reset_index(drop=True),
    ],
    axis=1
)

# Enforce numeric types on meta columns that should be numeric
for c in ["nova_group", "additives_n", "ingredients_n", target_col]:
    features_df[c] = pd.to_numeric(features_df[c], errors="coerce")

print("Feature columns:", features_df.columns.tolist())
print("Shape:", features_df.shape)
features_df.head(3)

Feature columns: ['product_name', 'nova_group', 'additives_n', 'ingredients_n', 'nutriscore_score', 'salt_100g', 'energy_kcal_100g', 'trans_fat_100g', 'saturated_fat_100g', 'carbohydrates_100g', 'fat_100g', 'sodium_100g', 'fruits_vegetables_nuts_estimate_from_ingredients_100g', 'cholesterol_100g', 'energy_100g', 'nova_group_100g', 'fruits_vegetables_legumes_estimate_from_ingredients_100g', 'proteins_100g', 'nutrition_score_fr_100g', 'vitamin_c_100g', 'sugars_100g', 'fiber_100g', 'calcium_100g', 'iron_100g', 'potassium_100g', 'vitamin_a_100g', 'manganese_100g', 'phosphorus_100g', 'molybdenum_100g', 'copper_100g', 'polyunsaturated_fat_100g', 'monounsaturated_fat_100g', 'added_sugars_100g', 'vitamin_d_100g', 'vitamin_b2_100g', 'magnesium_100g', 'vitamin_b1_100g', 'vitamin_pp_100g', 'folates_100g', 'vitamin_b6_100g', 'vitamin_b9_100g', 'polyols_100g', 'insoluble_fiber_100g', 'vitamin_b12_100g', 'zinc_100g', 'soluble_fiber_100g', 'pantothenic_acid_100g', 'phylloquinone_100g', 'caffeine_100g

Unnamed: 0,product_name,nova_group,additives_n,ingredients_n,nutriscore_score,salt_100g,energy_kcal_100g,trans_fat_100g,saturated_fat_100g,carbohydrates_100g,...,pantothenic_acid_100g,phylloquinone_100g,caffeine_100g,vitamin_e_100g,starch_100g,selenium_100g,choline_100g,vitamin_k_100g,iodine_100g,energy_kj_100g
0,"[{'lang': 'main', 'text': 'Pork rinds chicharr...",4.0,8,17,37,6.25,571.0,0.0,14.29,0.0,...,,,,,,,,,,
1,"[{'lang': 'main', 'text': 'Lime-ade juice'}, {...",3.0,1,7,0,0.0,4.0,,,0.97,...,,,,,,,,,,
2,"[{'lang': 'main', 'text': 'Penne whole wheat m...",1.0,0,1,-5,0.0,357.0,0.0,0.0,73.21,...,,,,,,,,,,


In [31]:
#  Quick data quality view 
null_rates = features_df.isna().mean().sort_values(ascending=False)
print("Null rate (top 20):")
print(null_rates.head(20))

# May be drop columns that are mostly missing (e.g., >80% NaN) ??? 
mostly_missing = null_rates[null_rates > 0.80].index.tolist()
if mostly_missing:
    print("Dropping:", mostly_missing)
    features_df = features_df.drop(columns=mostly_missing)

Null rate (top 20):
vitamin_a_100g        0.699
vitamin_c_100g        0.660
potassium_100g        0.520
trans_fat_100g        0.143
calcium_100g          0.139
iron_100g             0.136
fiber_100g            0.129
cholesterol_100g      0.124
saturated_fat_100g    0.084
sugars_100g           0.030
nova_group_100g       0.026
nova_group            0.026
fat_100g              0.001
sodium_100g           0.001
energy_kcal_100g      0.001
salt_100g             0.001
carbohydrates_100g    0.001
proteins_100g         0.001
energy_100g           0.001
additives_n           0.000
dtype: float64


In [27]:
#  EDA examples 
# Top categories by nova_group (categorical)
#FYI - if you want to read about NOVA classigfication https://en.wikipedia.org/wiki/Nova_classification
if "nova_group" in features_df:
    print("\nValue counts: nova_group")
    print(features_df["nova_group"].value_counts(dropna=False).head(10))


Value counts: nova_group
nova_group
4.0    661
3.0    168
1.0    124
NaN     26
2.0     21
Name: count, dtype: int64
