In [None]:
import pandas as pd
import numpy as np

pd.set_option("display.max_columns", None)

## RECIPES EDA

In [None]:
df=pd.read_csv('../data/recipes.csv', nrows=20000)


In [None]:
df.shape

In [None]:
df.info()

In [None]:
df.head(5)

In [None]:
df['RecipeInstructions'].iloc[42]

In [None]:
df.isnull().sum()

### Visualization starts here

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

In [None]:
# Set display options
pd.set_option('display.max_columns', None)
# The bare 'seaborn' style name was deprecated in matplotlib 3.6 and later removed;
# the same sheet is now called 'seaborn-v0_8'. Fall back to the old name so the
# cell still runs on older matplotlib installs.
_seaborn_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(_seaborn_style)

In [None]:
# Load the data
df = pd.read_csv('../data/recipes.csv')

In [None]:
df.shape

In [None]:
df.columns

In [None]:
df.isnull().sum()

In [None]:
df.dtypes

In [None]:
nutritional_cols = ['Calories', 'FatContent', 'SaturatedFatContent', 
                   'CholesterolContent', 'SodiumContent', 'CarbohydrateContent',
                   'FiberContent', 'SugarContent', 'ProteinContent']



In [None]:
# Statistical summary
print("\nNutritional Content Summary:")
print(df[nutritional_cols].describe())


In [None]:
# Pairwise correlation between the nutrition columns, drawn as an annotated heatmap
nutrient_corr = df[nutritional_cols].corr()
corr_fig, corr_ax = plt.subplots(figsize=(12, 8))
sns.heatmap(nutrient_corr, annot=True, cmap='coolwarm', center=0, ax=corr_ax)
corr_ax.set_title('Correlation between Nutritional Values')
corr_fig.tight_layout()
plt.show()

In [None]:
# display unique values of RecipeCategory top 20 based on count
df['RecipeCategory'].value_counts().head(20)

In [None]:
# Recipe Categories Analysis
plt.figure(figsize=(15, 6))
df['RecipeCategory'].value_counts().head(20).plot(kind='bar')
plt.title('Top 20 Recipe Categories')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

In [None]:
# The twenty most frequent recipe categories, most common first
top_categories = df['RecipeCategory'].value_counts().head(20)

# One shade of blue per bar
colors = sns.color_palette("Blues", n_colors=len(top_categories))

# Horizontal bars drawn through the explicit figure/axes interface
fig, ax = plt.subplots(figsize=(12, 8))
bars = ax.barh(top_categories.index, top_categories.values, color=colors)

# Write each bar's count slightly to the right of the bar end
for bar in bars:
    bar_length = bar.get_width()
    label_y = bar.get_y() + bar.get_height()/2
    ax.text(bar_length + 1000, label_y, f'{int(bar_length):,}', va='center', fontsize=10)

# Titles and axis labels
ax.set_title('Top 20 Recipe Categories', fontsize=14, pad=15)
ax.set_xlabel('Count', fontsize=12)
ax.set_ylabel('Recipe Category', fontsize=12)

# Dashed vertical guide lines; hide the top/right frame lines
ax.grid(axis='x', linestyle='--', alpha=0.7)
for side in ('top', 'right'):
    ax.spines[side].set_visible(False)

fig.tight_layout()
plt.show()


In [None]:
# Get the 10 most frequent categories (head(10)) and keep only the recipes that belong to them
top_categories = df['RecipeCategory'].value_counts().head(10).index
df_top = df[df['RecipeCategory'].isin(top_categories)]

# Min-max scale each selected nutrient column of df_top: (x - min) / (max - min)
# NOTE(review): df_normalized is not referenced by any later cell visible here — confirm it is still needed
nutrients = ['Calories', 'FatContent', 'ProteinContent', 'CarbohydrateContent', 'FiberContent', 'SugarContent']
df_normalized = df_top[nutrients].apply(lambda x: (x - x.min()) / (x.max() - x.min()))

In [None]:
# Nutritional Distribution by Category
# Stacked horizontal bar chart: one bar per category, one segment per nutrient

# Mean of every nutrient within each selected category
nutrient_means = df_top.groupby('RecipeCategory')[nutrients].mean()
# Rescale each row so a category's nutrient means sum to 1
nutrient_means_normalized = nutrient_means.div(nutrient_means.sum(axis=1), axis=0)

# DataFrame.plot opens a brand-new figure unless it is handed an axes, so a bare
# plt.figure(figsize=...) before it only produces an empty figure and the size is
# ignored. Create the axes explicitly and pass it in.
fig, ax = plt.subplots(figsize=(12, 6))
nutrient_means_normalized.plot(kind='barh', stacked=True, ax=ax)
ax.set_title('Proportional Nutrient Distribution by Category')
ax.set_ylabel('Recipe Category')
ax.set_xlabel('Proportion of Nutrients')
# Legend placed outside the plotting area so it does not cover the bars
ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
fig.tight_layout()
plt.show()


In [None]:
# Ingredients Analysis
# Count ingredients per recipe as the number of comma-separated pieces in the raw string
# NOTE(review): str(x) turns a missing value into the text 'nan', so a recipe without
# ingredient data is counted as having 1 ingredient — confirm that is intended
df['IngredientCount'] = df['RecipeIngredientParts'].apply(lambda x: len(str(x).split(',')))

In [None]:
# Collect a few headline numbers for the recipes table and print them
insights = dict(
    total_recipes=len(df),
    avg_rating=df['AggregatedRating'].mean(),
    avg_calories=df['Calories'].mean(),
    avg_ingredients=df['IngredientCount'].mean(),
)

print("\nKey Insights:")
for label, figure in insights.items():
    # Every value is shown with two decimals, including the row count
    print(f"{label}: {figure:.2f}")

In [None]:
# Take the first 25 rows (head(25)) restricted to: Name, Keywords, RecipeCategory,
# RecipeIngredientParts, RecipeInstructions, RecipeYield, PrepTime, CookTime, TotalTime
df_temp = df[['Name', 'Keywords', 'RecipeCategory', 'RecipeIngredientParts', 'RecipeInstructions', 'RecipeYield', 'PrepTime', 'CookTime', 'TotalTime']].head(25)
# Write that small sample to 'recipes.csv' in the current working directory (no index column)
df_temp.to_csv('recipes.csv', index=False)


In [None]:
# Function to process ingredient
def extract_ingredients(ingredient):
    """Split one raw ingredient string into a list of lowercase ingredient names.

    The raw value is expected to look like ``c("flour", "sugar")``; a single
    value without the ``c(...)`` wrapper is accepted too.

    Args:
        ingredient: Raw string, or a missing value.

    Returns:
        list[str]: Lowercased, whitespace-trimmed names with empty entries
        removed. Missing input and ``character(0)`` give an empty list.
    """
    if pd.isna(ingredient):
        return []

    text = ingredient.strip()
    # R writes an empty character vector as character(0); treat it as "no ingredients"
    if text == 'character(0)':
        return []
    # Remove only the outer c( ... ) wrapper. Blanket-replacing 'c(' and ')' would
    # also damage names that contain parentheses, e.g. "cheese (grated)".
    if text.startswith('c(') and text.endswith(')'):
        text = text[2:-1]

    names = (piece.strip().lower() for piece in text.replace('"', '').split(','))
    # Skip empty tokens so they are not counted as an ingredient
    return [name for name in names if name]

# Extract every ingredient occurrence across all recipes into one flat list
all_ingredients = []
for raw_parts in df['RecipeIngredientParts'].dropna():
    all_ingredients.extend(extract_ingredients(raw_parts))

# Occurrence count per ingredient, keeping the 20 most frequent
ingredient_counts = pd.Series(all_ingredients).value_counts().head(20)

# Horizontal bar chart with one colour per bar
fig, ax = plt.subplots(figsize=(12, 8))
colors = sns.color_palette("RdYlBu_r", n_colors=len(ingredient_counts))
positions = range(len(ingredient_counts))
bars = ax.barh(positions, ingredient_counts.values, color=colors)

# Titles and axis labels
ax.set_title('Top 20 Recipe ingredients', fontsize=14, pad=20)
ax.set_xlabel('Count', fontsize=12)
ax.set_ylabel('Ingredients', fontsize=12)

# Write each bar's count just past the end of the bar
for bar in bars:
    width = bar.get_width()
    ax.text(width + 100,
            bar.get_y() + bar.get_height()/2,
            f'{int(width):,}',
            va='center',
            fontsize=10)

# Bars are placed at integer positions; label them with the ingredient names
ax.set_yticks(positions)
ax.set_yticklabels(ingredient_counts.index)

# Dashed vertical guide lines; hide the top/right frame lines
ax.grid(axis='x', linestyle='--', alpha=0.7)
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)

fig.tight_layout()
plt.show()

# Summary statistics
print("\nIngredient Statistics:")
print(f"Total Unique ingredient: {len(set(all_ingredients)):,}")
print(f"Total ingredient Occurrences: {len(all_ingredients):,}")

# The heading promises five entries, so list five (previously all 20 were printed)
print("\n🔍 Top 5 Most Common Ingredients:")
for idx, (ingredient, count) in enumerate(ingredient_counts.head(5).items(), 1):
    print(f"{idx}. {ingredient.title()}: {count:,} recipes")

In [None]:
# Variant name -> canonical ingredient name. Built once at definition time rather
# than on every call, because normalize_ingredient runs once per ingredient occurrence.
_INGREDIENT_MAPPINGS = {
    # Eggs
    'eggs': 'egg',
    'egg whites': 'egg',
    'egg white': 'egg',
    'egg yolks': 'egg',
    'egg yolk': 'egg',
    'raw egg': 'egg',
    'egg substitute': 'egg',
    'egg beaters': 'egg',

    # Sugar
    'brown sugar': 'sugar',
    'white sugar': 'sugar',
    'granulated sugar': 'sugar',
    'powdered sugar': 'sugar',
    'caster sugar': 'sugar',
    'confectioners sugar': 'sugar',
    'demerara sugar': 'sugar',
    'light demerara sugar': 'sugar',

    # Salt
    'sea salt': 'salt',
    'kosher salt': 'salt',

    # Flour
    'all-purpose flour': 'flour',
    'all purpose flour': 'flour',
    'unbleached all-purpose flour': 'flour',
    'unbleached all purpose flour': 'flour',
    'self-rising flour': 'flour',
    'plain flour': 'flour',
    'wheat flour': 'flour',

    # Oil
    'vegetable oil': 'oil',
    'olive oil': 'oil',
    'spanish olive oil': 'oil',
    'walnut oil': 'oil',
    'canola oil': 'oil',
    'extra virgin olive oil': 'oil',
    'coconut oil': 'oil',

    # Butter
    'unsalted butter': 'butter',
    'salted butter': 'butter',
    'melted butter': 'butter',
    'sweet butter': 'butter',
}


def normalize_ingredient(ingredient):
    """Map an ingredient name to its canonical form.

    The name is lowercased and stripped of surrounding whitespace, then looked
    up in the variant table.

    Args:
        ingredient (str): Ingredient name in any casing.

    Returns:
        str: The canonical name when the cleaned name is a known variant,
        otherwise the cleaned name itself.
    """
    cleaned = ingredient.lower().strip()
    return _INGREDIENT_MAPPINGS.get(cleaned, cleaned)

def extract_ingredients(ingredient_str):
    """Split one raw ingredient string into normalized ingredient names.

    The raw value is expected to look like ``c("flour", "sugar")``; a single
    value without the ``c(...)`` wrapper is accepted too. Each name is
    lowercased, trimmed and passed through ``normalize_ingredient``.

    Args:
        ingredient_str: Raw string, or a missing value.

    Returns:
        list[str]: Normalized names with empty entries removed. Missing input
        and ``character(0)`` give an empty list.
    """
    if pd.isna(ingredient_str):
        return []

    text = ingredient_str.strip()
    # R writes an empty character vector as character(0); treat it as "no ingredients"
    if text == 'character(0)':
        return []
    # Remove only the outer c( ... ) wrapper. Blanket-replacing 'c(' and ')' would
    # also damage names that contain parentheses, e.g. "cheese (grated)".
    if text.startswith('c(') and text.endswith(')'):
        text = text[2:-1]

    cleaned = (piece.strip().lower() for piece in text.replace('"', '').split(','))
    # Skip empty tokens so they are not counted as an ingredient
    return [normalize_ingredient(name) for name in cleaned if name]

# Parse every non-missing recipe and concatenate the per-recipe lists into one flat list
all_ingredients = []
for parsed_names in df['RecipeIngredientParts'].dropna().map(extract_ingredients):
    all_ingredients += parsed_names

In [None]:
# One row per ingredient occurrence, written to ingredients.csv without the index
df_ingredients = pd.DataFrame({'ingredients': all_ingredients}, columns=['ingredients'])
df_ingredients.to_csv('ingredients.csv', index=False)

In [None]:
# Occurrence count per normalized ingredient, keeping the 20 most frequent
ingredient_counts = pd.Series(all_ingredients).value_counts().head(20)
bar_positions = range(len(ingredient_counts))

# Horizontal bar chart, one palette colour per bar
fig, ax = plt.subplots(figsize=(12, 8))
colors = sns.color_palette("RdYlBu_r", n_colors=len(ingredient_counts))
bars = ax.barh(bar_positions, ingredient_counts.values, color=colors)

# Titles and axis labels
ax.set_title('Top 20 Recipe Ingredients (Normalized)', fontsize=14, pad=20)
ax.set_xlabel('Count', fontsize=12)
ax.set_ylabel('Ingredients', fontsize=12)

# Print each count just past the end of its bar
for bar in bars:
    bar_length = bar.get_width()
    label_y = bar.get_y() + bar.get_height()/2
    ax.text(bar_length + 100, label_y, f'{int(bar_length):,}', va='center', fontsize=10)

# Bars sit at integer positions; label them with the ingredient names
ax.set_yticks(bar_positions)
ax.set_yticklabels(ingredient_counts.index)

# Dashed vertical guide lines; hide the top/right frame lines
ax.grid(axis='x', linestyle='--', alpha=0.7)
for side in ('top', 'right'):
    ax.spines[side].set_visible(False)

fig.tight_layout()
plt.show()

# Summary statistics
unique_total = len(set(all_ingredients))
print("\n📊 Ingredient Statistics:")
print(f"Total Unique Ingredients: {unique_total:,}")
print(f"Total Ingredient Occurrences: {len(all_ingredients):,}")
print(f"Most Common Ingredient: {ingredient_counts.index[0]} ({ingredient_counts.values[0]:,} occurrences)")

# The five most frequent ingredients, numbered from 1
print("\n🔍 Top 5 Most Common Ingredients:")
rank = 0
for ingredient, count in ingredient_counts.head().items():
    rank += 1
    print(f"{rank}. {ingredient.title()}: {count:,} recipes")

### Data cleaning starts here

In [None]:
import pandas as pd
import numpy as np
df=pd.read_csv('../data/recipes.csv', nrows=20000)

In [None]:
df.duplicated().any()

In [None]:
# Nutrition columns inspected for placeholder zeros
numeric_columns = [
    'Calories',
    'FatContent',
    'SaturatedFatContent',
    'CholesterolContent',
    'SodiumContent',
    'CarbohydrateContent',
    'FiberContent',
    'SugarContent',
    'ProteinContent',
]

# Boolean mask: True for recipes whose nutrition values are all exactly zero
all_zeros = df[numeric_columns].eq(0).all(axis=1)
zero_nutrition_rows = df.loc[all_zeros]


In [None]:
# How many recipes carry no nutrition information at all, and what share that is
zero_row_count = len(zero_nutrition_rows)
zero_row_share = zero_row_count / len(df) * 100
print("\nAnalysis of Zero Nutritional Values:")
print(f"Total rows with all nutritional values = 0: {zero_row_count} ({zero_row_share:.2f}%)")

# A few example rows
preview_columns = ['Name', 'RecipeCategory', *numeric_columns]
print("\nSample of recipes with zero nutritional values:")
print(zero_nutrition_rows[preview_columns].head())

# Save these recipes to investigate
# zero_nutrition_rows.to_csv('zero_nutrition_recipes.csv', index=False)

# Which categories these rows fall into most often
zero_category_counts = zero_nutrition_rows['RecipeCategory'].value_counts()
print("\nCategories with zero nutritional values:")
print(zero_category_counts.head())

In [None]:
# Remove rows with all zeros and create new dataframe
df_clean = df[~all_zeros].copy()

In [None]:
 # Create a copy to avoid modifying original data
# df_clean = df.copy()

In [None]:
import re

# Time-related columns

def convert_duration(duration_str):
    """Convert an ISO 8601 style duration such as ``PT1H30M`` to whole minutes.

    Day (``D``), hour (``H``) and minute (``M``) components are read; each is
    optional. The day component was previously ignored, so ``P1DT2H`` came
    out as 120 minutes instead of 1560. Seconds are not counted. Designators
    are located by simple pattern search, so a month component (``P1M``) is
    not supported and would be read as minutes.

    Args:
        duration_str: Duration text, or a missing value.

    Returns:
        int | float: Total minutes as an int, or ``np.nan`` when the input is
        missing or the total is zero.
    """
    if pd.isna(duration_str):
        return np.nan

    text = str(duration_str)

    def _component(designator):
        # First "<digits><designator>" group in the text; absent components count as 0
        found = re.search(rf'(\d+){designator}', text)
        return int(found.group(1)) if found else 0

    total_minutes = _component('D') * 24 * 60 + _component('H') * 60 + _component('M')
    # A zero total is reported as missing, as before
    return total_minutes if total_minutes > 0 else np.nan


In [None]:
# For each duration column add a New_<column>_Minutes column holding the value in minutes
time_columns = ['CookTime', 'PrepTime', 'TotalTime']
minutes_by_target = {
    f'New_{source}_Minutes': df_clean[source].apply(convert_duration)
    for source in time_columns
}
for target, minutes in minutes_by_target.items():
    df_clean[target] = minutes

In [None]:
df_clean[['CookTime', 'PrepTime', 'TotalTime', 'New_CookTime_Minutes', 'New_PrepTime_Minutes', 'New_TotalTime_Minutes']].head() 

In [None]:
# Parse the publication date once, then derive the year and month columns from it
published = pd.to_datetime(df_clean['DatePublished'])
df_clean['DatePublished'] = published
df_clean['PublishYear'] = published.dt.year
df_clean['PublishMonth'] = published.dt.month

In [None]:
# Trim leading/trailing whitespace from the free-text columns
text_columns = ['Name', 'Description', 'RecipeCategory', 'AuthorName']
stripped_text = {name: df_clean[name].str.strip() for name in text_columns}
for name, values in stripped_text.items():
    df_clean[name] = values

In [None]:
# Process Keywords
def clean_keywords(keywords_str):
    """Split one raw keywords string into a list of keywords.

    The raw value is expected to look like ``c("Dessert", "Easy")``; a single
    value without the ``c(...)`` wrapper is accepted too. Case is preserved.

    Args:
        keywords_str: Raw string, or a missing value.

    Returns:
        list[str]: Whitespace-trimmed keywords with empty entries removed.
        Missing input and ``character(0)`` give an empty list.
    """
    if pd.isna(keywords_str):
        return []

    text = keywords_str.strip()
    # R writes an empty character vector as character(0); treat it as "no keywords"
    if text == 'character(0)':
        return []
    # Remove only the outer c( ... ) wrapper. Blanket-replacing 'c(' and ')' would
    # also damage keywords that contain parentheses.
    if text.startswith('c(') and text.endswith(')'):
        text = text[2:-1]

    keywords = (piece.strip() for piece in text.replace('"', '').split(','))
    return [keyword for keyword in keywords if keyword]

df_clean['New_Keywords_List'] = df_clean['Keywords'].apply(clean_keywords)
    


In [None]:
df_clean.head(10)

In [None]:
# Clean Ingredients
def clean_ingredients(ingredients_str):
    """Split one raw ``c("a", "b")`` style string into a list of entries.

    Used for both ingredient names and ingredient quantities. Unquoted
    tokens (for example ``NA``) are kept as-is so positions stay aligned
    between the two lists. Case is preserved.

    Args:
        ingredients_str: Raw string, or a missing value.

    Returns:
        list[str]: Whitespace-trimmed entries with empty ones removed.
        Missing input and ``character(0)`` give an empty list.
    """
    if pd.isna(ingredients_str):
        return []

    text = ingredients_str.strip()
    # R writes an empty character vector as character(0); treat it as "no entries"
    if text == 'character(0)':
        return []
    # Remove only the outer c( ... ) wrapper. Blanket-replacing 'c(' and ')' would
    # also damage entries that contain parentheses, e.g. "chicken (boneless)".
    if text.startswith('c(') and text.endswith(')'):
        text = text[2:-1]

    entries = (piece.strip() for piece in text.replace('"', '').split(','))
    return [entry for entry in entries if entry]

df_clean['New_Ingredients_List'] = df_clean['RecipeIngredientParts'].apply(clean_ingredients)
df_clean['New_Quantities_List'] = df_clean['RecipeIngredientQuantities'].apply(clean_ingredients)

In [None]:
df_clean.head(5)

In [None]:
# Clean Instructions
def clean_instructions(instructions_str):
    """Split one raw instructions string into a list of steps.

    The raw value is expected to look like ``c("Step one.", "Step two.")``.
    Each double-quoted string becomes one step, so commas and parentheses
    inside a step are kept intact. Input without any quoted string falls back
    to the older behaviour of splitting on ``'.,'``.

    Every returned step ends with sentence punctuation: a ``.`` is appended
    only when the step does not already end in ``.``, ``!`` or ``?``
    (previously the last step always received a second period).

    Args:
        instructions_str: Raw string, or a missing value.

    Returns:
        list[str]: Whitespace-trimmed steps with empty ones removed. Missing
        input gives an empty list.
    """
    import re  # local import: this cell must not depend on another cell having imported re

    if pd.isna(instructions_str):
        return []

    # Preferred path: pull out each "..." string; the pattern tolerates
    # backslash-escaped quotes inside a step.
    pieces = re.findall(r'"((?:[^"\\]|\\.)*)"', instructions_str)
    if pieces:
        pieces = [piece.replace('\\"', '"') for piece in pieces]
    else:
        # Fallback for unquoted input: drop the c( ) wrapper characters and split on '.,'
        pieces = instructions_str.replace('c(', '').replace(')', '').split('.,')

    steps = []
    for piece in pieces:
        step = piece.strip()
        if not step:
            continue
        if not step.endswith(('.', '!', '?')):
            step += '.'
        steps.append(step)
    return steps

df_clean['New_Instructions_List'] = df_clean['RecipeInstructions'].apply(clean_instructions)
    


In [None]:
df_clean.head(5)

In [None]:
pd.options.display.max_rows = None
pd.options.display.max_columns = None

In [None]:
# find df_clean where recipe id = 39
df_clean[df_clean['RecipeId'] == 39]



In [None]:
# Coerce the nutrition columns to numbers (unparseable values become NaN) and round to 1 decimal
numeric_columns = [
    'Calories',
    'FatContent',
    'SaturatedFatContent',
    'CholesterolContent',
    'SodiumContent',
    'CarbohydrateContent',
    'FiberContent',
    'SugarContent',
    'ProteinContent',
]

rounded_values = {
    name: pd.to_numeric(df_clean[name], errors='coerce').round(1)
    for name in numeric_columns
}
for name, values in rounded_values.items():
    df_clean[name] = values


In [None]:
df[numeric_columns].head(10)

In [None]:
df_clean[numeric_columns].head(10)

In [None]:
# df.info()

In [None]:
# df_clean.info()

In [None]:
# Randomly sample up to 5,000 records from df_clean (fixed seed for reproducibility).
# DataFrame.sample raises when n exceeds the number of rows, so cap n at the frame length.
sample_size = min(5000, len(df_clean))
df_5k = df_clean.sample(n=sample_size, random_state=42)

# Save the sample to a CSV file in the current working directory
output_path = 'recipes_5k.csv'
df_5k.to_csv(output_path, index=False)

In [None]:
# Check for partially missing nutritional information
def analyze_zero_nutrients(df_clean, columns=None):
    """Report zero values per nutrient and return rows that are only partly zero.

    Prints, for every inspected column, how many values equal 0 and what share
    of the rows that is, followed by the number of rows that have at least one
    zero and at least one non-zero value.

    Args:
        df_clean: DataFrame containing the nutrient columns.
        columns: Column names to inspect. Defaults to the notebook-level
            ``numeric_columns`` list.

    Returns:
        DataFrame: The rows of ``df_clean`` with some, but not all, inspected
        values equal to zero.
    """
    if columns is None:
        columns = numeric_columns
    columns = list(columns)

    # One boolean frame reused for the per-column counts and the row mask
    is_zero = df_clean[columns] == 0
    total_rows = len(df_clean)

    print("\n Count of zero values for each nutrient:")
    for col, count in is_zero.sum().items():
        percentage = (count / total_rows * 100) if total_rows else 0.0
        print(f"{col}: {count} zeros ({percentage:.2f}%)")

    # "Some but not all zero" computed column-wise instead of a per-row lambda;
    # a missing value is not equal to 0, so it counts as non-zero as before.
    partial_mask = is_zero.any(axis=1) & ~is_zero.all(axis=1)
    partial_zeros = df_clean[partial_mask]
    print(f"\nRows with some (but not all) zero values: {len(partial_zeros)}")

    return partial_zeros

# Run the zero-value report on the cleaned recipes
partial_zeros = analyze_zero_nutrients(df_clean)

# Show the first few partly-zero recipes with their nutrition values
partial_zero_preview = partial_zeros[['Name', 'RecipeCategory', *numeric_columns]].head()
print("Sample of recipes with partial zero nutritional values:")
print(partial_zero_preview)

In [None]:
df_clean['Name'].duplicated().sum()

In [None]:
# Recipes in df_clean whose exact Name appears more than once
duplicate_name_mask = df_clean['Name'].duplicated(keep=False)

# First 10 of them, ordered by name so identical names sit next to each other
duplicated_recipes = df_clean[duplicate_name_mask].sort_values(by='Name').head(10)

print("Sample of Duplicated Recipe Names:")

# numeric_columns already contains 'Calories'; listing it again here would
# print the column twice.
display_columns = ['Name', 'AuthorName', 'RecipeCategory', 'AggregatedRating', 'ReviewCount'] + numeric_columns
print(duplicated_recipes[display_columns].to_string())

# Count on df_clean, the same frame the sample above comes from (this used df before)
print(f"\nTotal number of recipes with duplicate names: {int(duplicate_name_mask.sum())}")

In [None]:
# Near duplicates: compare names after lowercasing and trimming whitespace
df_clean['clean_name'] = df_clean['Name'].str.lower().str.strip()

# Every row whose cleaned name occurs more than once, ordered by that name
repeated_clean_name = df_clean['clean_name'].duplicated(keep=False)
df_duplicates = df_clean.loc[repeated_clean_name].sort_values(by='clean_name')
# Preview the first ten
df_duplicates.head(10)


In [None]:
# show df_duplicates with all columns
df_duplicates[df_clean.columns].head(6)

In [None]:
# Lowercased, trimmed name used as the comparison key
df_clean['clean_name'] = df_clean['Name'].str.lower().str.strip()

# A record counts as a duplicate when the name, every nutrition value and the category all match
columns_to_check = [
    'clean_name',
    'Calories', 'FatContent', 'SaturatedFatContent',
    'CholesterolContent', 'SodiumContent', 'CarbohydrateContent',
    'FiberContent', 'SugarContent', 'ProteinContent',
    'RecipeCategory',
]

# All rows that share those values with at least one other row, ordered by name
is_duplicate = df_clean.duplicated(subset=columns_to_check, keep=False)
duplicates = df_clean.loc[is_duplicate].sort_values(by='clean_name')

# Summary counts
duplicate_total = len(duplicates)
distinct_names = duplicates['clean_name'].nunique(dropna=False)
print("Duplicate Analysis:")
print(f"Total number of duplicate records: {duplicate_total}")
print(f"Number of unique recipes that have duplicates: {distinct_names}")

# Example rows: same content under different RecipeIds
sample_cols = ['RecipeId', 'Name', 'AuthorName', 'RecipeCategory', 'Calories', 'ReviewCount']
print("\nSample of Duplicate Records (showing different RecipeIds but same content):")
print("=" * 100)
print(duplicates[sample_cols].head(10).to_string())

# Number of duplicate rows per cleaned name, largest first
duplicate_counts = duplicates.groupby('clean_name').size().sort_values(ascending=False)
print("\nTop 10 Most Duplicated Recipes:")
print(duplicate_counts.head(10))

## FOOD EDA

In [None]:
# read food_data.xlsx
food_data_df = pd.read_excel('../data/food_data.xlsx')
food_data_df.head()

In [None]:
# read food.csv
preprocessed_food_df = pd.read_csv('../data/preprocessed/food.csv')
preprocessed_food_df.head()

In [None]:
preprocessed_food_df.columns

In [None]:
preprocessed_food_df.head(7)

'vitamin_D', 'calcium',  'vitamin_C', 'iron', 'potassium', 
        'vitamin_B_6', 'vitamin_B_12', 'vitamin_A', 'riboflavin', 'vitamin_E', 'folate_total',
        'vitamin_K', 'zinc', 'magnesium','sodium',  'thiamin', 'Niacin',  'selenium'

In [None]:
# Descriptive columns kept from the preprocessed food table
food_info_cols = ['description', 'category', 'main_category','sub_category']
# Nutrient columns kept from the preprocessed food table (includes 'water')
nutrient_cols = ['vitamin_D', 'calcium',  'vitamin_C', 'iron', 'potassium', 
        'vitamin_B_6', 'vitamin_B_12', 'vitamin_A', 'riboflavin', 'vitamin_E', 'folate_total',
        'vitamin_K', 'zinc', 'magnesium','sodium',  'thiamin', 'Niacin',  'selenium', 'water']
# Build food_df: the 4 descriptive columns first, followed by the nutrient columns
food_df = preprocessed_food_df[food_info_cols + nutrient_cols]


In [None]:
food_df.shape

In [None]:
food_df.isnull().sum()

In [None]:
food_df.duplicated().sum()

### visualization

In [None]:
# Plot the 10 most frequent main categories; title and axis labels let the figure stand alone
ax = food_df['main_category'].value_counts().head(10).plot(kind='bar')
ax.set_title('Top 10 Main Categories')
ax.set_xlabel('Main category')
ax.set_ylabel('Count')
plt.show()

In [None]:
# Plot the 10 most frequent sub categories; title and axis labels let the figure stand alone
ax = food_df['sub_category'].value_counts().head(10).plot(kind='bar')
ax.set_title('Top 10 Sub Categories')
ax.set_xlabel('Sub category')
ax.set_ylabel('Count')
plt.show()


In [None]:

import pandas as pd
import matplotlib.pyplot as plt
from collections import defaultdict

# Nutrient columns: everything after the 4 descriptive columns of food_df
nutrient_cols = food_df.columns[4:]

# Running total of each nutrient's amount per sub-category,
# keyed as "<nutrient> - <sub_category>"
bigram_counts = defaultdict(float)

for _, row in food_df.iterrows():
    subcat = row['sub_category']
    for nutrient in nutrient_cols:
        if row[nutrient] > 0:  # Only add amounts for nutrients that are present
            bigram_counts[f"{nutrient} - {subcat}"] += row[nutrient]

# Report how many pairs were found instead of dumping the whole dictionary
print(f"{len(bigram_counts):,} nutrient-subcategory pairs")

# One row per pair, largest total first
bigram_df = pd.DataFrame.from_dict(bigram_counts, orient='index', columns=['count'])
bigram_df = bigram_df.sort_values('count', ascending=False)

# Plot the top 10 pairs. DataFrame.plot opens its own figure unless it is given
# an axes, so create the axes explicitly; a bare plt.figure(...) before it only
# leaves an empty figure behind and the requested size is ignored.
top_bigrams = bigram_df.head(10)
fig, ax = plt.subplots(figsize=(12, 8))
top_bigrams.plot(kind='barh', color='skyblue', ax=ax)
ax.set_title('Top 10 Nutrient-Subcategory Associations')
ax.set_xlabel('Total Nutrient Amount (Normalized)')
ax.set_ylabel('Nutrient - Subcategory Pair')
ax.invert_yaxis()  # Show highest at top
ax.set_facecolor('white')
fig.tight_layout()
plt.show()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Common style settings for both plots
plt.style.use('ggplot')  # This will make styles more consistent
plt.rcParams['figure.facecolor'] = 'white'  # Set background color

def plot_top_bigrams(bigram_counts, title, color, top_n=10):
    """Draw a horizontal bar chart of the largest entries in ``bigram_counts``.

    Args:
        bigram_counts (dict): Maps a "<nutrient> - <sub_category>" label to its
            summed amount.
        title (str): Figure title.
        color: Bar colour, in any form matplotlib accepts.
        top_n (int): Number of largest entries to draw. Defaults to 10.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    # One row per pair, largest total first
    bigram_df = pd.DataFrame(list(bigram_counts.items()), columns=['bigram', 'count'])
    bigram_df = bigram_df.sort_values('count', ascending=False)

    # Plot the top_n pairs
    fig, ax = plt.subplots(figsize=(12, 10))
    top_bigrams = bigram_df.head(top_n)
    ax.barh(top_bigrams['bigram'], top_bigrams['count'], color=color)
    ax.set_title(title, fontsize=22, fontweight='bold')
    ax.set_xlabel('Total Nutrient Amount (Normalized)', fontsize=14)
    ax.set_ylabel('Nutrient - Subcategory Pair', fontsize=14)
    ax.invert_yaxis()  # Show highest at top
    ax.set_facecolor('#fafafa')
    fig.tight_layout()
    plt.show()

def _sum_positive_nutrients(frame, nutrient_names):
    """Sum each nutrient's positive amounts per sub-category.

    Returns a dict keyed "<nutrient> - <sub_category>". Pairs with no positive
    amount are left out.
    """
    totals = {}
    for _, row in frame.iterrows():
        subcat = row['sub_category']
        for nutrient in nutrient_names:
            if row[nutrient] > 0:
                bigram = f"{nutrient} - {subcat}"
                totals[bigram] = totals.get(bigram, 0) + row[nutrient]
    return totals


# First plot: every nutrient column (everything after the 4 descriptive columns)
nutrient_cols = food_df.columns[4:]
bigram_counts = _sum_positive_nutrients(food_df, nutrient_cols)

plot_top_bigrams(bigram_counts, 'Top 10 Nutrient-Subcategory Associations', '#8786f3')

# Second plot: same totals with 'water' left out. Both plots now read from food_df;
# the second one used to iterate preprocessed_food_df, the frame food_df is selected from.
nutrient_cols = [col for col in food_df.columns[4:] if col != 'water']
bigram_counts = _sum_positive_nutrients(food_df, nutrient_cols)

plot_top_bigrams(bigram_counts, 'Top 10 Nutrient-Subcategory Associations (excluding water)', '#ffa083')

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Nutrient columns as a plain list: everything after the 4 descriptive columns (water included)
nutrient_cols = list(food_df.columns[4:])

# Summed amount of each nutrient per sub-category, keyed "<nutrient> - <sub_category>"
bigram_counts = {}
for _, row in food_df.iterrows():
    subcat = row['sub_category']
    for nutrient in nutrient_cols:
        amount = row[nutrient]
        if not amount > 0:  # skip nutrients that are not present
            continue
        bigram = f"{nutrient} - {subcat}"
        bigram_counts[bigram] = bigram_counts[bigram] + amount if bigram in bigram_counts else amount

# One row per pair, largest total first
bigram_df = (
    pd.DataFrame(list(bigram_counts.items()), columns=['bigram', 'count'])
    .sort_values('count', ascending=False)
)

# Horizontal bar chart of the 5 largest pairs
top_bigrams = bigram_df.head(5)
fig, ax = plt.subplots(figsize=(12, 8))
ax.barh(top_bigrams['bigram'], top_bigrams['count'], color='#8786f3')
ax.set_title('Top 5 Nutrient-Subcategory Associations', fontsize=16)
ax.set_xlabel('Total Nutrient Amount (Normalized)', fontsize=14)
ax.set_ylabel('Nutrient - Subcategory Pair', fontsize=14)
ax.invert_yaxis()  # largest bar on top
ax.set_facecolor('white')
fig.tight_layout()
plt.show()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Nutrient columns: everything after the 4 descriptive columns of food_df, minus 'water'
nutrient_cols = [col for col in food_df.columns[4:] if col != 'water']

# Summed amount of each nutrient per sub-category, keyed "<nutrient> - <sub_category>".
# Rows are read from food_df, the same frame the column list comes from.
bigram_counts = {}
for _, row in food_df.iterrows():
    subcat = row['sub_category']
    for nutrient in nutrient_cols:
        if row[nutrient] > 0:  # Only add amounts for nutrients that are present
            bigram = f"{nutrient} - {subcat}"
            bigram_counts[bigram] = bigram_counts.get(bigram, 0) + row[nutrient]

# Report how many pairs were found instead of dumping the whole dictionary
print(f"{len(bigram_counts):,} nutrient-subcategory pairs")

# One row per pair, largest total first
bigram_df = pd.DataFrame.from_dict(bigram_counts, orient='index', columns=['count'])
bigram_df = bigram_df.sort_values('count', ascending=False)

# Plot the top 10 pairs. DataFrame.plot opens its own figure unless it is given
# an axes, so create the axes explicitly; a bare plt.figure(...) before it only
# leaves an empty figure behind and the requested size is ignored.
top_bigrams = bigram_df.head(10)
fig, ax = plt.subplots(figsize=(12, 8))
top_bigrams.plot(kind='barh', color='#ffa083', ax=ax)
# ax.set_title('Top 10 Nutrient-Subcategory Associations', fontsize=18, fontweight='bold')
ax.set_xlabel('Total Nutrient Amount (Normalized)', fontsize=14, fontweight='bold')
ax.set_ylabel('Nutrient - Subcategory Pair', fontsize=14, fontweight='bold')
ax.invert_yaxis()  # Show highest at top
ax.set_facecolor('#fafafa')
fig.tight_layout()
plt.show()


In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from ast import literal_eval
from umap import UMAP
import os

# Locate the project root without changing the working directory. The previous
# os.chdir() pointed at one user's home directory, and changing the working
# directory also breaks the '../data/...' paths used by the earlier cells on re-run.
# NOTE(review): '..' assumes the notebook sits one level below the project root,
# as the other '../data/...' paths suggest — set CULINARY_COMPASS_ROOT otherwise.
project_root = os.environ.get('CULINARY_COMPASS_ROOT', '..')
embeddings_path = os.path.join(project_root, 'data', 'embeddings', 'recipes.csv')

# Load the data
data = pd.read_csv(embeddings_path)

# The embedding column is stored as text; turn each value back into a Python list
data['IngredientEmbedding'] = data['IngredientEmbedding'].apply(literal_eval)

# The 15 most frequent categories
top_15_categories = data['RecipeCategory'].value_counts().nlargest(15).index

# Keep only recipes in those categories. Copy so the x/y columns added below are
# written to an independent frame rather than to a slice of `data`.
data_filtered = data[data['RecipeCategory'].isin(top_15_categories)].copy()

# Stack the per-recipe embedding lists into one 2-D array
embeddings = np.array(data_filtered['IngredientEmbedding'].tolist())

# Reduce to 2 dimensions with UMAP (fixed seed)
reducer = UMAP(n_components=2, random_state=42)
embeddings_2d = reducer.fit_transform(embeddings)

# Attach the 2-D coordinates to the filtered recipes
data_filtered['x'] = embeddings_2d[:, 0]
data_filtered['y'] = embeddings_2d[:, 1]

# Scatter plot coloured by category
fig, ax = plt.subplots(figsize=(15, 10))
sns.scatterplot(
    x='x', y='y',
    hue='RecipeCategory',
    palette='tab20',
    data=data_filtered,
    s=100,
    alpha=0.8,
    ax=ax
)

ax.set_title('Recipe Embeddings Visualization (UMAP) - Top 15 Categories')
ax.set_xlabel('UMAP Dimension 1')
ax.set_ylabel('UMAP Dimension 2')
# Legend placed outside the plotting area so it does not cover the points
ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
fig.tight_layout()
plt.show()

# Print the top 15 categories and their counts
print("\nTop 15 Recipe Categories:")
print(data_filtered['RecipeCategory'].value_counts())