In [None]:
import os
import pandas as pd
from pathlib import Path
import json

# Download the FAOSTAT "all data" bulk archive manually from:
# https://bulks-faostat.fao.org/production/SUA_Crops_Livestock_E_All_Data.zip
# Then place SUA_Crops_Livestock_E_All_Data_NOFLAG.csv (the file read below)
# in the ../data/ directory relative to this notebook.

# Read the FAOSTAT extract from ../data/
csv_path = Path('..', 'data', 'SUA_Crops_Livestock_E_All_Data_NOFLAG.csv')
df = pd.read_csv(csv_path)

# Keep only rows whose Area Code is below 5000 (the country-level rows)
is_country_row = df['Area Code'] < 5000
df = df.loc[is_country_row]

# Of the columns whose name starts with 'Y' (the year columns), keep Y2023 alone
year_cols = [col for col in df.columns if col.startswith('Y') and col != 'Y2023']
df = df.drop(columns=year_cols)

# Keep only the per-capita food supply element
is_food_supply = df['Element'] == 'Food supply (kcal/capita/day)'
df = df.loc[is_food_supply]

# Missing values become 0 so they do not disturb the sums below
df = df.fillna(0)

# Items whose Y2023 values, summed over all remaining areas, stay below 30 are
# excluded from df_filtered; df itself keeps every item.
# NOTE(review): df_filtered is only printed here; the cells visible further down
# keep working from df — confirm which frame is meant to be used.
item_totals = df.groupby("Item")["Y2023"].sum()
items_to_drop = item_totals[item_totals < 30].index
df_filtered = df.loc[~df["Item"].isin(items_to_drop)]

print('Initial Rows:', len(df), ', Filtered Rows:', len(df_filtered))
print("Initial Unique Item Count:", df["Item"].nunique(), ", Filtered Unique Item Count:", df_filtered["Item"].nunique())

In [None]:
# Read the category -> sub-items mapping used for item aggregation
category_map_text = Path("category_map.json").read_text(encoding='utf-8')
category_map = json.loads(category_map_text)

# Spot-check one category to confirm the mapping loaded as expected
eggs_subcategories = category_map.get("Eggs", [])
print("Eggs Subcategories:", eggs_subcategories)

In [None]:
# Aggregate the FAOSTAT items into the categories of category_map, per country (Area).
# NOTE(review): this reads `df`, not the `df_filtered` frame built when the data was
# loaded, so the low-total item filter is not applied here — confirm that is intended.
new_rows = []

for area, group in df.groupby("Area"):
    for category, subitems in category_map.items():
        # Sum Y2023 over this country's rows whose Item belongs to the category
        in_category = group["Item"].isin(subitems)
        total_y2023 = group.loc[in_category, "Y2023"].sum()

        # Categories with a zero total for this country are left out entirely
        if total_y2023 != 0:
            new_rows.append({
                "Area": area,
                "Item": category,
                "Y2023": total_y2023
            })

# Naming the columns explicitly keeps the frame usable even if no row was produced
df_categories = pd.DataFrame(new_rows, columns=["Area", "Item", "Y2023"])

# Round to 2 decimal places *before* exporting, so the CSV holds exactly the
# values the later cells work with (float sums otherwise leave noise digits).
df_categories["Y2023"] = df_categories["Y2023"].astype(float).round(2)
df_categories.to_csv(Path('..') / 'data' / 'SUA_Crops_Livestock_E_All_Data_Categorized.csv', index=False)

# Combine all food categories per country (sum Y2023 for each Area)
country_totals = df_categories.groupby("Area", as_index=False)["Y2023"].sum()

# Sort by total food supply, largest first
country_totals_sorted = country_totals.sort_values("Y2023", ascending=False)

# Show top 10 countries
print("Top 10 countries by food supply (kcal/capita/day):")
print(country_totals_sorted.head(10))

# Show top 10 items for United States by kcal/capita/day
us_items = df_categories[df_categories["Area"] == "United States of America"]
top10_us_items = us_items.sort_values("Y2023", ascending=False).head(10)
print("Top 10 items for United States by kcal/capita/day:")
print(top10_us_items)

In [None]:

# Load the nutrition data and combine with df_categories
nutrition_data_dir = Path('..') / 'food-nutrition-data'

# Define the nutrients we want to extract
macronutrients = ['Calories', 'Protein', 'Fat', 'Saturated Fat', 'Carbohydrates', 'Sugar', 'Fiber']
micronutrients = ['Sodium', 'Calcium', 'Iron', 'Vitamin A', 'Vitamin C', 'Vitamin D']
all_nutrients = macronutrients + micronutrients

# Work on a copy so df_categories itself is left untouched
df_nutrition = df_categories.copy()

# Every nutrient column starts at 0.0; rows whose category cannot be resolved
# (missing or malformed JSON file) keep that value.
for nutrient in all_nutrients:
    df_nutrition[nutrient] = 0.0

# Each category's JSON file is read once and then applied to all of that
# category's rows at the same time (one row per country).
for item_name in df_nutrition['Item'].unique():
    # Convert item name to filename format (replace spaces with underscores)
    json_filename = item_name.replace(' ', '_') + '.json'
    json_path = nutrition_data_dir / json_filename

    try:
        with open(json_path, 'r', encoding='utf-8') as f:
            nutrition_info = json.load(f)

        nutrients_json = nutrition_info.get('nutrients', {})

        # 'Calories' is mandatory: it is the reference every other nutrient is
        # scaled by. There is no default — a missing entry is reported below.
        base_calories = float(nutrients_json['Calories']['value'])
        if not base_calories > 0:
            # Dividing by zero (or a negative/NaN base) would silently produce
            # inf/NaN nutrient values, so reject it explicitly.
            raise ValueError(f"base Calories must be positive, got {base_calories}")

        # Read every value before writing anything, so a malformed entry cannot
        # leave the category's rows half-filled.
        reference_values = {
            nutrient: float(nutrients_json[nutrient]['value'])
            for nutrient in all_nutrients
            if nutrient != 'Calories' and nutrient in nutrients_json
        }
    except FileNotFoundError:
        print(f"Warning: Nutrition file not found for {item_name} ({json_filename})")
        continue
    except Exception as e:
        print(f"Error processing {item_name}: {str(e)}")
        continue

    item_rows = df_nutrition['Item'] == item_name
    kcal_values = df_nutrition.loc[item_rows, 'Y2023']

    # How many multiples of the JSON's reference amount each row's kcal represents
    scaling_factor = kcal_values / base_calories

    # Calories are taken directly from the supply figure; the rest are scaled
    df_nutrition.loc[item_rows, 'Calories'] = kcal_values
    for nutrient, reference_value in reference_values.items():
        df_nutrition.loc[item_rows, nutrient] = reference_value * scaling_factor

# Round all nutrient columns to 2 decimal places
for nutrient in all_nutrients:
    df_nutrition[nutrient] = df_nutrition[nutrient].round(2)

In [None]:
# Waste percentage per region, with the areas assigned to that region.
# Each entry is (region label, percentage, area names); the flat
# area -> percentage lookup is derived from it right below.
_waste_pct_by_region = [
    ("North America and Oceania", 42, [
        "Australia", "Canada", "Fiji", "French Polynesia", "Kiribati",
        "Marshall Islands", "Micronesia (Federated States of)", "Nauru",
        "New Caledonia", "New Zealand", "Papua New Guinea", "Samoa",
        "Solomon Islands", "Tonga", "Tuvalu", "United States of America",
        "Vanuatu",
    ]),
    ("Industrialized Asia", 25, [
        "China", "China, Hong Kong SAR", "China, Macao SAR",
        "China, Taiwan Province of", "China, mainland", "Mongolia",
        "Republic of Korea",
    ]),
    ("Europe", 22, [
        "Albania", "Austria", "Belarus", "Belgium", "Bosnia and Herzegovina",
        "Bulgaria", "Croatia", "Cyprus", "Czechia", "Denmark", "Estonia",
        "Finland", "France", "Germany", "Greece", "Hungary", "Iceland",
        "Ireland", "Israel", "Italy", "Latvia", "Lithuania", "Luxembourg",
        "Malta", "Montenegro", "Netherlands (Kingdom of the)",
        "North Macedonia", "Norway", "Poland", "Portugal",
        "Republic of Moldova", "Romania", "Russian Federation", "Serbia",
        "Slovakia", "Slovenia", "Spain", "Sweden", "Switzerland", "Türkiye",
        "Ukraine", "United Kingdom of Great Britain and Northern Ireland",
    ]),
    ("North, West, Central Africa & West/Central Asia", 19, [
        "Algeria", "Armenia", "Azerbaijan", "Bahrain", "Burkina Faso",
        "Cabo Verde", "Cameroon", "Congo", "Côte d'Ivoire",
        "Democratic Republic of the Congo", "Egypt", "Gabon", "Gambia",
        "Georgia", "Ghana", "Guinea", "Guinea-Bissau",
        "Iran (Islamic Republic of)", "Iraq", "Jordan", "Kazakhstan",
        "Kuwait", "Kyrgyzstan", "Lebanon", "Liberia", "Libya", "Mauritania",
        "Morocco", "Niger", "Nigeria", "Oman", "Qatar",
        "Sao Tome and Principe", "Saudi Arabia", "Senegal", "Sierra Leone",
        "Syrian Arab Republic", "Tajikistan", "Tunisia", "Turkmenistan",
        "United Arab Emirates", "Uzbekistan", "Yemen",
    ]),
    ("South and Southeast Asia", 17, [
        "Afghanistan", "Bangladesh", "Bhutan", "Cambodia", "India",
        "Indonesia", "Lao People's Democratic Republic", "Malaysia",
        "Maldives", "Myanmar", "Nepal", "Pakistan", "Philippines",
        "Sri Lanka", "Thailand", "Timor-Leste", "Viet Nam",
    ]),
    ("Latin America", 15, [
        "Antigua and Barbuda", "Argentina", "Bahamas", "Barbados", "Belize",
        "Bolivia (Plurinational State of)", "Brazil", "Chile", "Colombia",
        "Costa Rica", "Dominican Republic", "Ecuador", "El Salvador",
        "Grenada", "Guatemala", "Guyana", "Haiti", "Honduras", "Jamaica",
        "Mexico", "Nicaragua", "Panama", "Paraguay", "Peru",
        "Saint Kitts and Nevis", "Saint Lucia",
        "Saint Vincent and the Grenadines", "Suriname",
        "Trinidad and Tobago", "Uruguay",
        "Venezuela (Bolivarian Republic of)",
    ]),
    ("Sub-Saharan Africa (East & Southern)", 23, [
        "Angola", "Botswana", "Comoros", "Djibouti", "Eswatini", "Ethiopia",
        "Kenya", "Lesotho", "Madagascar", "Malawi", "Mauritius",
        "Mozambique", "Namibia", "Rwanda", "Seychelles", "South Africa",
        "Uganda", "United Republic of Tanzania", "Zambia", "Zimbabwe",
    ]),
]

# Flat lookup: area name -> waste percentage (same entries, same order as listed above)
country_waste_map = {
    area_name: waste_pct
    for _region_label, waste_pct, area_names in _waste_pct_by_region
    for area_name in area_names
}

# We want to account for consumer waste by scaling down nutrient values
# based on the country waste percentages in country_waste_map.
# Additionally, we apply specific adjustments for certain nutrients based on data limitations.

# Extra multiplier per nutrient, applied on top of the waste reduction.
# Nutrients not listed here are only reduced by the waste factor.
nutrient_adjustments = {
    "Sodium": 3.5,
    "Saturated Fat": 0.9,
    "Fiber": 0.40,
    "Sugar": 0.65,
}

df_nutrition_scaled = df_nutrition.copy()

# Look up each row's waste percentage; areas absent from the map come back as NaN
waste_pct = df_nutrition_scaled["Area"].map(country_waste_map)

# Areas without an entry are treated as 0% waste — say so instead of doing it silently
unmapped_areas = sorted(df_nutrition_scaled.loc[waste_pct.isna(), "Area"].unique())
if unmapped_areas:
    print(
        f"Warning: no waste percentage for {len(unmapped_areas)} area(s); "
        f"assuming 0% waste: {unmapped_areas}"
    )

waste_factor = 1 - (waste_pct.fillna(0) / 100)

for nutrient in all_nutrients:
    df_nutrition_scaled[nutrient] = (
        df_nutrition_scaled[nutrient] * waste_factor * nutrient_adjustments.get(nutrient, 1.0)
    )


# Sum up macro/micro nutrient totals for each country
country_nutrient_totals = (
    df_nutrition_scaled.groupby("Area", as_index=False)[all_nutrients].sum()
)
country_nutrient_totals.to_csv(Path('..') / 'data' / 'country_nutrient_totals.csv', index=False)

# Print top 10 countries by total Fat (g/capita/day).
# The sort key must be the "Fat" column to match the variable name and the heading.
country_nutrient_totals_sorted_fat = country_nutrient_totals.sort_values("Fat", ascending=False)
print("\nTop 10 countries by total Fat (g/capita/day):")
print(country_nutrient_totals_sorted_fat.head(10))

average_nutrients = country_nutrient_totals[all_nutrients].mean()
print("\nAverage nutrient totals across all countries (per capita per day):")
print(average_nutrients.round(2))

# Print median, range, and quartiles for all nutrients in country_nutrient_totals
print("\nMedian, range, and quartiles for each nutrient (per capita per day):")
for nutrient in all_nutrients:
    nutrient_values = country_nutrient_totals[nutrient]
    median = nutrient_values.median()
    min_val = nutrient_values.min()
    max_val = nutrient_values.max()
    q1 = nutrient_values.quantile(0.25)
    q3 = nutrient_values.quantile(0.75)
    print(f"{nutrient}: median={median:.2f}, range=({min_val:.2f}, {max_val:.2f}), Q1={q1:.2f}, Q3={q3:.2f}")






In [None]:
import numpy as np

# Algorithm to calculate nutrition scores per country

# --- Configuration ---

# Daily per-capita target for every scored nutrient
# (presumably in the same units as the nutrient totals — TODO confirm).
recommended = {
    "Calories": 2300,
    "Protein": 60,
    "Fat": 80,
    "Saturated Fat": 25,
    "Carbohydrates": 300,
    "Sugar": 50,
    "Fiber": 28,
    "Sodium": 2300,
    "Calcium": 1000,
    "Iron": 18,
    "Vitamin A": 900,
    "Vitamin C": 90,
    "Vitamin D": 20,
}

# Relative importance of each nutrient in the weighted average.
# The key order is the order in which the per-nutrient scores are accumulated.
weights = dict([
    ("Calories", 2.0),
    ("Protein", 2.0),
    ("Fat", 2.0),
    ("Saturated Fat", 2.0),
    ("Carbohydrates", 2.0),
    ("Sugar", 1.5),  # Kept low to favor Western diets
    ("Fiber", 0.8),
    ("Sodium", 1.0),
    ("Calcium", 0.5),
    ("Iron", 0.5),
    ("Vitamin A", 0.5),
    ("Vitamin C", 0.5),
    ("Vitamin D", 0.2),
])

# Nutrients scored with the asymmetric (two-sided) curve; every other
# nutrient uses the saturation curve, where exceeding the target is not penalised.
sensitive_targets = [
    "Calories",
    "Fat",
    "Saturated Fat",
    "Sugar",
    "Sodium",
    "Carbohydrates",
]

# --- Scoring Functions ---

def calculate_asymmetric_score(actual, target, tolerance_under=0.3, tolerance_over=2.0):
    """Score `actual` against `target` on a 0-100 bell curve with two different widths.

    The curve peaks at 100 when `actual` equals `target`. Its width (sigma) is
    `target * tolerance_under` when `actual` is below the target and
    `target * tolerance_over` when it is above, so a smaller tolerance on one
    side means a steeper penalty on that side.

    Args:
        actual: observed value (scalar).
        target: recommended value (scalar).
        tolerance_under: sigma as a fraction of `target` for shortfalls.
        tolerance_over: sigma as a fraction of `target` for excesses.

    Returns:
        100.0 on an exact match, otherwise 100 * exp(-deviation^2 / (2 * sigma^2)).
    """
    # Exact hit: answer directly, which also avoids evaluating the curve at all
    if actual == target:
        return 100.0

    deviation = actual - target
    tolerance = tolerance_under if deviation < 0 else tolerance_over
    sigma = target * tolerance

    exponent = - (deviation ** 2) / (2 * (sigma ** 2))
    return 100 * np.exp(exponent)

def calculate_saturation_score(actual, target):
    """Score `actual` against `target` on a 0-100 scale that saturates at the target.

    Reaching or exceeding `target` gives 100.0; below it the score is
    100 times the squared ratio `actual / target`.
    """
    met_target = actual >= target
    if not met_target:
        fraction_of_target = actual / target
        return 100 * (fraction_of_target ** 2.0)
    return 100.0

# --- Execution ---
# The weight total is the same for every country, so compute it once
total_weight = sum(weights.values())

country_nutrition_scores = []

for idx, row in country_nutrient_totals.iterrows():
    total_weighted_score = 0.0

    for nutrient, weight in weights.items():
        actual = row[nutrient]
        target = recommended[nutrient]

        # Two-sided curve for nutrients where both too little and too much count
        # against the score; saturating curve for the rest.
        if nutrient in sensitive_targets:
            score = calculate_asymmetric_score(actual, target, tolerance_under=0.25, tolerance_over=2.0)
        else:
            score = calculate_saturation_score(actual, target)

        total_weighted_score += score * weight

    country_nutrition_scores.append({
        "Area": row["Area"],
        "Raw Score": total_weighted_score / total_weight
    })

# Explicit columns keep the frame well-formed even when there are no countries
scores_df = pd.DataFrame(country_nutrition_scores, columns=["Area", "Raw Score"])

# --- Scaling (Non-Linear to lower Median) ---
min_raw = scores_df["Raw Score"].min()
max_raw = scores_df["Raw Score"].max()
raw_range = max_raw - min_raw

# Min-max scale the raw scores to [0, 1], square them, then map onto [20, 100].
# Squaring "curves" the grades: the lowest raw score lands on 20 and the highest
# on 100, while everything in between is pushed down towards the low end.
if raw_range > 0:
    relative_position = (scores_df["Raw Score"] - min_raw) / raw_range
else:
    # A single country, or all countries tied: there is no spread to scale by,
    # and dividing by it would fill the column with NaN. Treat all as tied at the top.
    print("Warning: raw scores have no spread; assigning the top score to every country.")
    relative_position = pd.Series(1.0, index=scores_df.index)

scores_df["Nutrition Score"] = (20 + 80 * (relative_position ** 2.0)).round(2)

# --- Output ---
print("Top 20 Nutrition Scores:")
print(scores_df.sort_values("Nutrition Score", ascending=False).head(20))

print("\nBottom 20 Nutrition Scores:")
print(scores_df.sort_values("Nutrition Score", ascending=True).head(20))

# Statistics
print("\nNutrition Score Statistics:")
print(f"Median: {scores_df['Nutrition Score'].median():.2f}")
print(f"Max: {scores_df['Nutrition Score'].max():.2f}")
print(f"Min: {scores_df['Nutrition Score'].min():.2f}")
print(f"Q1: {scores_df['Nutrition Score'].quantile(0.25):.2f}")
print(f"Q3: {scores_df['Nutrition Score'].quantile(0.75):.2f}")

In [None]:
import pycountry

def get_iso3(name):
    """Return the ISO 3166-1 alpha-3 code for a country name, or None if unknown.

    Args:
        name: country name to resolve via `pycountry.countries.lookup`.

    Returns:
        The three-letter code as a string, or None when `name` is not a string
        or pycountry does not recognise it.
    """
    # Missing values (None, NaN) and other non-strings cannot be looked up
    if not isinstance(name, str):
        return None
    try:
        return pycountry.countries.lookup(name).alpha_3
    except LookupError:
        # Only "not found" means "no code"; any other failure (including
        # KeyboardInterrupt) is allowed to surface instead of being swallowed.
        return None

In [None]:
# Add ISO3 codes to align with world_countries.json
scores_df["ISO3"] = scores_df["Area"].apply(get_iso3)

# Reorder the columns; .copy() makes this an independent frame so the column
# assignment below does not write into a slice of the previous one.
scores_df = scores_df[["ISO3", "Area", "Raw Score", "Nutrition Score"]].copy()

# Manual patch for area names the automatic lookup does not match perfectly.
# A patched name always wins over whatever the lookup returned.
iso3_patch = {
    "Bolivia (Plurinational State of)": "BOL",
    "China, Hong Kong SAR": "HKG",
    "China, Macao SAR": "MAC",
    "China, Taiwan Province of": "TWN",
    "China, mainland": "CHN",
    "Democratic Republic of the Congo": "COD",
    "Iran (Islamic Republic of)": "IRN",
    "Micronesia (Federated States of)": "FSM",
    "Netherlands (Kingdom of the)": "NLD",
    "Republic of Korea": "KOR",
    "Venezuela (Bolivarian Republic of)": "VEN"
}
scores_df["ISO3"] = [
    iso3_patch.get(area, iso3)
    for area, iso3 in zip(scores_df["Area"], scores_df["ISO3"])
]

# Report problems that would otherwise only show up later when joining on ISO3:
# areas with no code at all, and codes shared by more than one area.
unresolved_areas = scores_df.loc[scores_df["ISO3"].isna(), "Area"].tolist()
if unresolved_areas:
    print(f"Warning: no ISO3 code for {len(unresolved_areas)} area(s): {unresolved_areas}")

has_code = scores_df["ISO3"].notna()
shared_codes = scores_df.loc[
    has_code & scores_df["ISO3"].duplicated(keep=False), ["ISO3", "Area"]
]
if not shared_codes.empty:
    print("Warning: ISO3 codes assigned to more than one area:")
    print(shared_codes.sort_values("ISO3").to_string(index=False))


scores_df.to_csv(Path('..') / 'data' / 'country_nutrition_scores.csv', index=False)