In [None]:
import pandas as pd
import numpy as np

pd.set_option("display.max_columns", None)

In [None]:
df=pd.read_csv('../data/recipes.csv', nrows=30000)


In [None]:
df.shape

In [None]:
df.info()

In [None]:
df.head(5)

In [None]:
df['RecipeInstructions'].iloc[42]

In [None]:
df.isnull().sum()

### Visualization starts here

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

In [None]:
# Set display options
pd.set_option('display.max_columns', None)
# The bare 'seaborn' style sheet was deprecated in matplotlib 3.6 and later removed.
# Prefer its renamed successor and fall back to the legacy name on older versions.
_seaborn_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(_seaborn_style)

In [None]:
# Load the data
df = pd.read_csv('../data/recipes.csv')

In [None]:
df.shape

In [None]:
df.columns

In [None]:
df.isnull().sum()

In [None]:
df.dtypes

In [None]:
nutritional_cols = ['Calories', 'FatContent', 'SaturatedFatContent', 
                   'CholesterolContent', 'SodiumContent', 'CarbohydrateContent',
                   'FiberContent', 'SugarContent', 'ProteinContent']



In [None]:
# Statistical summary
print("\nNutritional Content Summary:")
print(df[nutritional_cols].describe())


In [None]:
# Annotated heatmap of the pairwise correlations between the nutritional columns
nutrient_corr = df[nutritional_cols].corr()
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(nutrient_corr, annot=True, cmap='coolwarm', center=0, ax=ax)
ax.set_title('Correlation between Nutritional Values')
fig.tight_layout()
plt.show()

In [None]:
# display unique values of RecipeCategory top 20 based on count
df['RecipeCategory'].value_counts().head(20)

In [None]:
# Recipe Categories Analysis: vertical bar chart of the 20 most frequent categories
category_counts = df['RecipeCategory'].value_counts().head(20)
fig, ax = plt.subplots(figsize=(15, 6))
category_counts.plot(kind='bar', ax=ax)
ax.set_title('Top 20 Recipe Categories')
plt.xticks(rotation=45, ha='right')
fig.tight_layout()
plt.show()

In [None]:
# Horizontal bar chart of the 20 most frequent recipe categories, labelled with counts
top_categories = df['RecipeCategory'].value_counts().head(20)

# One shade of blue per bar
colors = sns.color_palette("Blues", n_colors=len(top_categories))

fig, ax = plt.subplots(figsize=(12, 8))
bars = ax.barh(top_categories.index, top_categories.values, color=colors)

# Write each count just past the end of its bar (fixed offset of 1000 data units)
for bar in bars:
    bar_length = bar.get_width()
    ax.text(
        bar_length + 1000,
        bar.get_y() + bar.get_height() / 2,
        f'{int(bar_length):,}',
        va='center',
        fontsize=10,
    )

ax.set_title('Top 20 Recipe Categories', fontsize=14, pad=15)
ax.set_xlabel('Count', fontsize=12)
ax.set_ylabel('Recipe Category', fontsize=12)

# Dashed vertical grid lines; hide the top and right frame edges
ax.grid(axis='x', linestyle='--', alpha=0.7)
for side in ('top', 'right'):
    ax.spines[side].set_visible(False)

fig.tight_layout()
plt.show()


In [None]:
# Get the 10 most frequent categories and keep only the recipes that belong to them
top_categories = df['RecipeCategory'].value_counts().head(10).index
df_top = df[df['RecipeCategory'].isin(top_categories)]

# Min-max scale each nutrient column of the subset for better visualization
# NOTE(review): df_normalized is not referenced by any later cell visible here - confirm it is still needed
nutrients = ['FatContent', 'ProteinContent', 'CarbohydrateContent', 'FiberContent', 'SugarContent']
df_normalized = df_top[nutrients].apply(lambda x: (x - x.min()) / (x.max() - x.min()))

## Remove One Dish Meal, Lunch/Snacks, and Breakfast; add color codes

In [None]:
# Nutritional Distribution by Category
# Stacked Bar Chart

# Mean of each nutrient per category, rescaled so every category row sums to 1
nutrient_means = df_top.groupby('RecipeCategory')[nutrients].mean()
nutrient_means_normalized = nutrient_means.apply(lambda x: x/x.sum(), axis=1)

# DataFrame.plot opens its own figure unless it is handed an Axes. Create the
# 12x6 figure explicitly and pass its Axes in; a bare plt.figure(...) call would
# be left behind as an empty figure and its size would not apply to the chart.
fig, ax = plt.subplots(figsize=(12, 6))
nutrient_means_normalized.plot(kind='barh', stacked=True, ax=ax)
ax.set_title('Proportional Nutrient Distribution by Category')
ax.set_ylabel('Recipe Category')
ax.set_xlabel('Proportion of Nutrients')
ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
fig.tight_layout()
plt.show()


In [None]:
# Nutritional Distribution by Category
# Stacked Bar Chart

# Define categories to exclude
categories_to_exclude = ['One Dish Meal', 'Lunch/Snacks', 'Breakfast', 'Sauces', 'Chicken Breasts', 'Chicken Breast']

# Filter out the excluded categories, then keep the 8 most frequent remaining ones
df_filtered = df[~df['RecipeCategory'].isin(categories_to_exclude)]
top_categories = df_filtered['RecipeCategory'].value_counts().head(8).index
df_top = df_filtered[df_filtered['RecipeCategory'].isin(top_categories)]

# Define color scheme (one color per stacked nutrient segment)
colors = ['#FD9F6E', '#CBCE54', '#FDD526', '#A4C1F3', '#B0927A', '#C0A6CA']

# Mean of each nutrient per category, rescaled so every category row sums to 1
nutrient_means = df_top.groupby('RecipeCategory')[nutrients].mean()
nutrient_means_normalized = nutrient_means.apply(lambda x: x/x.sum(), axis=1)

# DataFrame.plot opens its own figure unless it is handed an Axes. Create the
# 12x6 figure explicitly and pass its Axes in; a bare plt.figure(...) call would
# be left behind as an empty figure and its size would not apply to the chart.
fig, ax = plt.subplots(figsize=(12, 6))
ax = nutrient_means_normalized.plot(kind='barh', stacked=True, color=colors, ax=ax)
# ax.set_title('Proportional Nutrient Distribution by Category', fontsize=18, fontweight='bold')
ax.set_ylabel('Recipe Category')
ax.set_xlabel('Proportion of Nutrients')
# remove legend
ax.legend_.remove()
# ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
ax.set_facecolor('#fafafa')
fig.tight_layout()
plt.show()

In [None]:
# Ingredients Analysis
# Count number of ingredients per recipe (comma-separated pieces of the raw string).
# Missing values count as 0: str(NaN) is 'nan', which would otherwise be
# counted as one ingredient and inflate the average.
df['IngredientCount'] = df['RecipeIngredientParts'].apply(
    lambda x: 0 if pd.isna(x) else len(str(x).split(','))
)

In [None]:
# Collect headline figures for the dataset, then print each to two decimals
insights = dict(
    total_recipes=len(df),
    avg_rating=df['AggregatedRating'].mean(),
    avg_calories=df['Calories'].mean(),
    avg_ingredients=df['IngredientCount'].mean(),
)

print("\nKey Insights:")
for label in insights:
    print(f"{label}: {insights[label]:.2f}")

In [None]:
# Take the first 25 rows of the descriptive columns (Name, Keywords, RecipeCategory,
# RecipeIngredientParts, RecipeInstructions, RecipeYield, PrepTime, CookTime, TotalTime)
df_temp = df[['Name', 'Keywords', 'RecipeCategory', 'RecipeIngredientParts', 'RecipeInstructions', 'RecipeYield', 'PrepTime', 'CookTime', 'TotalTime']].head(25)
# Write that sample to 'recipes.csv' in the current working directory
# (a different path from the '../data/recipes.csv' file the notebook reads)
df_temp.to_csv('recipes.csv', index=False)


In [None]:
# Parse one raw ingredient string into a list of names
def extract_ingredients(ingredient):
    """Split a raw ingredient string into lower-cased ingredient names.

    Every 'c(', ')' and '"' is removed from the text, the remainder is split
    on commas, and each piece is trimmed and lower-cased. Missing values
    yield an empty list.
    """
    if pd.isna(ingredient):
        return []
    stripped = ingredient
    for token in ('c(', ')', '"'):
        stripped = stripped.replace(token, '')
    return [piece.strip().lower() for piece in stripped.split(',')]

# Flatten the parsed ingredients of every recipe into one list
all_ingredients = [
    name
    for raw_parts in df['RecipeIngredientParts'].dropna()
    for name in extract_ingredients(raw_parts)
]

# Keep the 20 most frequent ingredients
ingredient_counts = pd.Series(all_ingredients).value_counts().head(20)
bar_positions = range(len(ingredient_counts))

# Horizontal bar chart with one palette color per bar
fig, ax = plt.subplots(figsize=(12, 8))
colors = sns.color_palette("RdYlBu_r", n_colors=len(ingredient_counts))
bars = ax.barh(bar_positions, ingredient_counts.values, color=colors)

ax.set_title('Top 20 Recipe ingredients', fontsize=14, pad=20)
ax.set_xlabel('Count', fontsize=12)
ax.set_ylabel('Ingredients', fontsize=12)

# Write each count just past the end of its bar (fixed offset of 100 data units)
for bar in bars:
    bar_length = bar.get_width()
    ax.text(
        bar_length + 100,
        bar.get_y() + bar.get_height() / 2,
        f'{int(bar_length):,}',
        va='center',
        fontsize=10,
    )

# Label each bar position with its ingredient name
ax.set_yticks(bar_positions)
ax.set_yticklabels(ingredient_counts.index)

# Dashed vertical grid lines; hide the top and right frame edges
ax.grid(axis='x', linestyle='--', alpha=0.7)
for side in ('top', 'right'):
    ax.spines[side].set_visible(False)

fig.tight_layout()
plt.show()

# Print summary statistics
print("\nIngredient Statistics:")
print(f"Total Unique ingredient: {len(set(all_ingredients)):,}")
print(f"Total ingredient Occurrences: {len(all_ingredients):,}")

# Print the five most common ingredients. The heading announces a top 5, so
# limit the loop to 5 rows instead of walking all 20 entries of ingredient_counts.
print("\n🔍 Top 5 Most Common Ingredients:")
for idx, (ingredient, count) in enumerate(ingredient_counts.head(5).items(), 1):
    print(f"{idx}. {ingredient.title()}: {count:,} recipes")

In [None]:
# Lookup table that collapses ingredient variants onto one canonical name.
# Built once here instead of on every normalize_ingredient() call, which runs
# once per ingredient occurrence in the dataset.
_INGREDIENT_MAPPINGS = {
    # Eggs
    'eggs': 'egg',
    'egg whites': 'egg',
    'egg white': 'egg',
    'egg yolks': 'egg',
    'egg yolk': 'egg',
    'raw egg': 'egg',
    'egg substitute': 'egg',
    'egg beaters': 'egg',

    # Sugar
    'brown sugar': 'sugar',
    'white sugar': 'sugar',
    'granulated sugar': 'sugar',
    'powdered sugar': 'sugar',
    'caster sugar': 'sugar',
    'confectioners sugar': 'sugar',
    'demerara sugar': 'sugar',
    'light demerara sugar': 'sugar',

    # Salt
    'sea salt': 'salt',
    'kosher salt': 'salt',

    # Flour
    'all-purpose flour': 'flour',
    'all purpose flour': 'flour',
    'unbleached all-purpose flour': 'flour',
    'unbleached all purpose flour': 'flour',
    'self-rising flour': 'flour',
    'plain flour': 'flour',
    'wheat flour': 'flour',

    # Oil
    'vegetable oil': 'oil',
    'olive oil': 'oil',
    'spanish olive oil': 'oil',
    'walnut oil': 'oil',
    'canola oil': 'oil',
    'extra virgin olive oil': 'oil',
    'coconut oil': 'oil',

    # Butter
    'unsalted butter': 'butter',
    'salted butter': 'butter',
    'melted butter': 'butter',
    'sweet butter': 'butter',
}


def normalize_ingredient(ingredient):
    """Normalize ingredient names by combining similar ingredients.

    Args:
        ingredient: Ingredient name as a string.

    Returns:
        The canonical name when the lower-cased, trimmed input is a known
        variant (e.g. 'egg yolks' -> 'egg'); otherwise the lower-cased,
        trimmed input itself.
    """
    ingredient = ingredient.lower().strip()
    return _INGREDIENT_MAPPINGS.get(ingredient, ingredient)

def extract_ingredients(ingredient_str):
    """Parse a raw ingredient string into a list of normalized ingredient names.

    Every 'c(', ')' and '"' is removed, the remainder is split on commas, and
    each trimmed, lower-cased piece is passed through normalize_ingredient().
    Missing values yield an empty list.
    """
    if pd.isna(ingredient_str):
        return []
    cleaned = ingredient_str
    for token in ('c(', ')', '"'):
        cleaned = cleaned.replace(token, '')
    normalized = []
    for part in cleaned.split(','):
        normalized.append(normalize_ingredient(part.strip().lower()))
    return normalized

# Flatten the normalized ingredients of every recipe into one list
all_ingredients = [
    name
    for raw_parts in df['RecipeIngredientParts'].dropna()
    for name in extract_ingredients(raw_parts)
]

In [None]:
# save all ingredients to csv
df_ingredients = pd.DataFrame(all_ingredients, columns=['ingredients'])
df_ingredients.to_csv('ingredients.csv', index=False)

In [None]:
# Keep the 20 most frequent ingredients after normalization
ingredient_counts = pd.Series(all_ingredients).value_counts().head(20)
bar_positions = range(len(ingredient_counts))

# Horizontal bar chart with one palette color per bar
fig, ax = plt.subplots(figsize=(12, 8))
colors = sns.color_palette("RdYlBu_r", n_colors=len(ingredient_counts))
bars = ax.barh(bar_positions, ingredient_counts.values, color=colors)

ax.set_title('Top 20 Recipe Ingredients (Normalized)', fontsize=14, pad=20)
ax.set_xlabel('Count', fontsize=12)
ax.set_ylabel('Ingredients', fontsize=12)

# Write each count just past the end of its bar (fixed offset of 100 data units)
for bar in bars:
    bar_length = bar.get_width()
    ax.text(
        bar_length + 100,
        bar.get_y() + bar.get_height() / 2,
        f'{int(bar_length):,}',
        va='center',
        fontsize=10,
    )

# Label each bar position with its ingredient name
ax.set_yticks(bar_positions)
ax.set_yticklabels(ingredient_counts.index)

# Dashed vertical grid lines; hide the top and right frame edges
ax.grid(axis='x', linestyle='--', alpha=0.7)
for side in ('top', 'right'):
    ax.spines[side].set_visible(False)

fig.tight_layout()
plt.show()

# Summary figures for the normalized ingredient list
unique_total = len(pd.Series(all_ingredients).unique())
occurrence_total = len(all_ingredients)
most_common_name = ingredient_counts.index[0]
most_common_count = ingredient_counts.values[0]

print("\n📊 Ingredient Statistics:")
print(f"Total Unique Ingredients: {unique_total:,}")
print(f"Total Ingredient Occurrences: {occurrence_total:,}")
print(f"Most Common Ingredient: {most_common_name} ({most_common_count:,} occurrences)")

# The five most frequent ingredients (Series.head() defaults to 5 rows)
print("\n🔍 Top 5 Most Common Ingredients:")
for idx, pair in enumerate(ingredient_counts.head().items(), start=1):
    ingredient, count = pair
    print(f"{idx}. {ingredient.title()}: {count:,} recipes")

### Data cleaning starts here

In [None]:
import pandas as pd
import numpy as np
df=pd.read_csv('../data/recipes.csv')

In [None]:
df.duplicated().any()

In [None]:
# Nutritional columns inspected for all-zero rows
numeric_columns = [
    'Calories',
    'FatContent',
    'SaturatedFatContent',
    'CholesterolContent',
    'SodiumContent',
    'CarbohydrateContent',
    'FiberContent',
    'SugarContent',
    'ProteinContent',
]

# A row is flagged when every nutritional column equals 0
all_zeros = df[numeric_columns].eq(0).all(axis=1)
zero_nutrition_rows = df.loc[all_zeros]


In [None]:
# Report how many recipes have no nutritional information at all
zero_count = len(zero_nutrition_rows)
zero_share = zero_count / len(df) * 100
print("\nAnalysis of Zero Nutritional Values:")
print(f"Total rows with all nutritional values = 0: {zero_count} ({zero_share:.2f}%)")

# First few affected recipes with their nutritional columns
report_columns = ['Name', 'RecipeCategory'] + numeric_columns
print("\nSample of recipes with zero nutritional values:")
print(zero_nutrition_rows[report_columns].head())

# Save these recipes to investigate
# zero_nutrition_rows.to_csv('zero_nutrition_recipes.csv', index=False)

# Categories that contribute the most all-zero rows
zero_by_category = zero_nutrition_rows['RecipeCategory'].value_counts()
print("\nCategories with zero nutritional values:")
print(zero_by_category.head())

In [None]:
# Remove rows with all zeros and create new dataframe
df_clean = df[~all_zeros].copy()

In [None]:
 # Create a copy to avoid modifying original data
# df_clean = df.copy()

In [None]:
import re

# Time-related columns

def convert_duration(duration_str):
    """Convert an ISO 8601 duration (e.g. 'PT1H30M', 'P1DT2H') to whole minutes.

    Days, hours and minutes are summed; seconds are ignored. Days are read
    only from the part before the 'T' designator and hours/minutes only from
    the part after it, so a month 'M' in the date part is not mistaken for
    minutes. Strings without a 'T' are scanned as a whole for hours and
    minutes, which keeps inputs such as '1H30M' working.

    Args:
        duration_str: Duration string, or a missing value (None / NaN).

    Returns:
        Total minutes as an int, or np.nan when the value is missing or the
        parsed total is zero.
    """
    if pd.isna(duration_str):
        return np.nan

    text = str(duration_str)
    date_part, time_designator, time_part = text.partition('T')
    if not time_designator:
        # No 'T' in the string: fall back to scanning all of it for H and M.
        time_part = text

    def _component(pattern, source):
        """Return the integer captured by pattern in source, or 0 if absent."""
        found = re.search(pattern, source)
        return int(found.group(1)) if found else 0

    days = _component(r'(\d+)D', date_part)
    hours = _component(r'(\d+)H', time_part)
    minutes = _component(r'(\d+)M', time_part)

    total_minutes = days * 24 * 60 + hours * 60 + minutes
    # A zero total is treated as "no usable duration"
    return total_minutes if total_minutes > 0 else np.nan


In [None]:
# Add a New_<column>_Minutes column for each ISO 8601 duration column
time_columns = ['CookTime', 'PrepTime', 'TotalTime']
for col in time_columns:
    minutes_column = f'New_{col}_Minutes'
    converted = df_clean[col].apply(convert_duration)
    df_clean[minutes_column] = converted

In [None]:
df_clean[['CookTime', 'PrepTime', 'TotalTime', 'New_CookTime_Minutes', 'New_PrepTime_Minutes', 'New_TotalTime_Minutes']].head() 

In [None]:
# Date formatting: parse the publication date once, then derive year and month from it

published = pd.to_datetime(df_clean['DatePublished'])
df_clean['DatePublished'] = published
df_clean['PublishYear'] = published.dt.year
df_clean['PublishMonth'] = published.dt.month

In [None]:
# Trim leading/trailing whitespace from the free-text columns
text_columns = ['Name', 'Description', 'RecipeCategory', 'AuthorName']
stripped_text = {col: df_clean[col].str.strip() for col in text_columns}
for col, values in stripped_text.items():
    df_clean[col] = values

In [None]:
# Parse the raw Keywords string into a list
def clean_keywords(keywords_str):
    """Split a raw keywords string into a list of trimmed, non-empty keywords.

    Every 'c(', ')' and '"' is removed before splitting on commas, so a
    keyword that itself contains a comma is split into separate entries.
    Missing values yield an empty list.
    """
    if pd.isna(keywords_str):
        return []
    text = keywords_str
    for token in ('c(', ')', '"'):
        text = text.replace(token, '')
    trimmed = (piece.strip() for piece in text.split(','))
    return [piece for piece in trimmed if piece]

df_clean['New_Keywords_List'] = df_clean['Keywords'].apply(clean_keywords)
    


In [None]:
df_clean.head(10)

In [None]:
# Parse a raw ingredient (or quantity) string into a list
def clean_ingredients(ingredients_str):
    """Split a raw ingredient or quantity string into trimmed, non-empty items.

    Every 'c(', ')' and '"' is removed before splitting on commas; unquoted
    tokens such as NA are kept as plain strings. Missing values yield an
    empty list.
    """
    if pd.isna(ingredients_str):
        return []
    text = ingredients_str
    for token in ('c(', ')', '"'):
        text = text.replace(token, '')
    items = []
    for piece in text.split(','):
        piece = piece.strip()
        if piece:
            items.append(piece)
    return items

df_clean['New_Ingredients_List'] = df_clean['RecipeIngredientParts'].apply(clean_ingredients)
df_clean['New_Quantities_List'] = df_clean['RecipeIngredientQuantities'].apply(clean_ingredients)

In [None]:
df_clean.head(5)

In [None]:
# Clean Instructions
def clean_instructions(instructions_str):
    """Split a raw ``c("step one.", "step two.")`` string into a list of steps.

    Quoted items are extracted as whole steps, so commas, parentheses and
    periods inside a step are preserved. Input without any quoted item falls
    back to removing 'c(' and ')' and splitting on '.,'. Each returned step is
    trimmed and ends with sentence punctuation: a '.' is appended only when
    the step does not already end with '.', '!' or '?', so the last step no
    longer comes back with a doubled period.

    Args:
        instructions_str: Raw instructions string, or a missing value.

    Returns:
        List of non-empty step strings; an empty list for missing values.
    """
    if pd.isna(instructions_str):
        return []

    # Double-quoted items, allowing backslash-escaped characters inside them
    steps = re.findall(r'"((?:[^"\\]|\\.)*)"', instructions_str)
    if steps:
        steps = [step.replace('\\"', '"') for step in steps]
    else:
        # Unquoted input: legacy clean-up and split on the '.,' step boundary
        steps = instructions_str.replace('c(', '').replace(')', '').split('.,')

    cleaned_steps = []
    for step in steps:
        step = step.strip()
        if not step:
            continue
        if not step.endswith(('.', '!', '?')):
            step += '.'
        cleaned_steps.append(step)
    return cleaned_steps

df_clean['New_Instructions_List'] = df_clean['RecipeInstructions'].apply(clean_instructions)
    


In [None]:
df_clean.head(5)

In [None]:
# pd.options.display.max_rows = None
# pd.options.display.max_columns = None

In [None]:
# find df_clean where recipe id = 39
# df_clean[df_clean['RecipeId'] == 39]



In [None]:
# Coerce the nutritional columns to numbers (unparsable values become NaN) and round to 1 decimal
numeric_columns = [
    'Calories',
    'FatContent',
    'SaturatedFatContent',
    'CholesterolContent',
    'SodiumContent',
    'CarbohydrateContent',
    'FiberContent',
    'SugarContent',
    'ProteinContent',
]

for col in numeric_columns:
    coerced = pd.to_numeric(df_clean[col], errors='coerce')
    df_clean[col] = coerced.round(1)


In [None]:
df[numeric_columns].head(10)

In [None]:
df_clean[numeric_columns].head(10)

In [None]:
# df.info()

In [None]:
# df_clean.info()

In [None]:
# Sample 50k records randomly from df_cleaned
# df_5k = df_clean.sample(n=5000, random_state=42)  # random_state for reproducibility

# # Save to CSV file
# output_path = 'recipes_5k.csv'
# df_5k.to_csv(output_path, index=False)

In [None]:
# Check for partially missing nutritional information
# def analyze_zero_nutrients(df_clean):
#     zero_counts = {}
#     for col in numeric_columns:
#         zero_counts[col] = (df_clean[col] == 0).sum()
    
#     print("\n Count of zero values for each nutrient:")
#     for col, count in zero_counts.items():
#         percentage = (count/len(df_clean)*100)
#         print(f"{col}: {count} zeros ({percentage:.2f}%)")
    
#     # Check for suspicious patterns
#     partial_zeros = df_clean[df_clean[numeric_columns].apply(lambda x: (x == 0).any() & (x != 0).any(), axis=1)]
#     print(f"\nRows with some (but not all) zero values: {len(partial_zeros)}")
    
#     return partial_zeros

# partial_zeros = analyze_zero_nutrients(df_clean)

# # Display sample of partial zero rows
# print("Sample of recipes with partial zero nutritional values:")
# print(partial_zeros[['Name', 'RecipeCategory'] + numeric_columns].head())

In [None]:
# df_clean['Name'].duplicated().sum()

In [None]:
# show all duplicated names
# df[df['Name'].duplicated(keep=False)].sort_values(by='Name')


# Flag every cleaned recipe whose name occurs more than once, and show the first 10
name_is_duplicated = df_clean['Name'].duplicated(keep=False)
duplicated_recipes = df_clean[name_is_duplicated].sort_values(by='Name').head(10)

# Display the results in a more readable format
print("Sample of Duplicated Recipe Names:")

# 'Calories' is already part of numeric_columns, so it is not listed separately
# (listing it twice produced a duplicated column in the printout).
display_columns = ['Name', 'AuthorName', 'RecipeCategory', 'AggregatedRating', 'ReviewCount'] + numeric_columns
print(duplicated_recipes[display_columns].to_string())

# Count on df_clean, the same frame the sample was drawn from, rather than the raw df
print(f"\nTotal number of recipes with duplicate names: {int(name_is_duplicated.sum())}")

In [None]:
# Near duplicates, convert all names to lowercase and strip whitespace:
df_clean['clean_name'] = df_clean['Name'].str.lower().str.strip()

df_duplicates = df_clean[df_clean['clean_name'].duplicated(keep=False)].sort_values(by='clean_name')
# show df_duplicates
df_duplicates.head(10)


In [None]:
# show df_duplicates with all columns
df_duplicates[df_clean.columns].head(6)

In [None]:
# Lower-cased, trimmed recipe name used as the matching key
df_clean['clean_name'] = df_clean['Name'].str.lower().str.strip()

# Records count as duplicates when the name key, every nutritional value
# and the category all match
columns_to_check = [
    'clean_name',
    'Calories', 'FatContent', 'SaturatedFatContent',
    'CholesterolContent', 'SodiumContent', 'CarbohydrateContent',
    'FiberContent', 'SugarContent', 'ProteinContent',
    'RecipeCategory',
]

# keep=False flags every member of a duplicate group, not only the repeats
is_duplicate = df_clean.duplicated(subset=columns_to_check, keep=False)
duplicates = df_clean.loc[is_duplicate].sort_values(by='clean_name')

# Summary counts
duplicated_name_total = len(duplicates['clean_name'].unique())
print("Duplicate Analysis:")
print(f"Total number of duplicate records: {len(duplicates)}")
print(f"Number of unique recipes that have duplicates: {duplicated_name_total}")

# First 10 duplicate records with the identifying columns
sample_cols = ['RecipeId', 'Name', 'AuthorName', 'RecipeCategory', 'Calories', 'ReviewCount']
print("\nSample of Duplicate Records (showing different RecipeIds but same content):")
print("=" * 100)
print(duplicates[sample_cols].head(10).to_string())

# Size of each duplicate group, largest first
group_sizes = duplicates.groupby('clean_name').size()
duplicate_counts = group_sizes.sort_values(ascending=False)
print("\nTop 10 Most Duplicated Recipes:")
print(duplicate_counts.head(10))

In [None]:
# Columns that have a processed replacement or are no longer needed
columns_to_drop = [
    'clean_name',  # processed version of Name
    'IngredientEmbedding',  # if this was processed into vectors
    'Images',  # if you've already extracted main image
    'Keywords',  # if you've processed this into a list
    'RecipeIngredientQuantities',  # if you've already processed ingredients
    'RecipeIngredientParts',  # if you've already processed ingredients
    'RecipeInstructions',  # if you've already processed instructions
    'CookTime',  # if you've converted to minutes
    'PrepTime',  # if you've converted to minutes
    'TotalTime',  # if you've converted to minutes
]

# Only columns that actually exist in df_clean can be dropped
dropped_cols = [col for col in columns_to_drop if col in df_clean.columns]

# Report the starting column set
print("Current columns in df_clean:")
print(df_clean.columns.tolist())
print(f"\nTotal columns: {len(df_clean.columns)}")

# Build the reduced frame
df_clean_reduced = df_clean.drop(columns=dropped_cols)

# Report what is left
print("\nRemaining columns after dropping processed ones:")
print(df_clean_reduced.columns.tolist())
print(f"\nRemaining columns: {len(df_clean_reduced.columns)}")

# Report what was removed
print("\nColumns that were dropped:")
print(dropped_cols)

# Continue with the reduced frame under the df_clean name
df_clean = df_clean_reduced.copy()
print("\n✅ Dataframe updated with reduced columns")


### Use Clean CSV for plots

In [None]:
# Save to CSV
df_clean.to_csv('df_clean_reduced.csv', index=False)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import ast  # standard library; parses the list literals stored as text in the CSV

# Load the cleaned data written by the previous cell
df_clean_reduced = pd.read_csv('df_clean_reduced.csv', nrows=20000)  # Reduced df_clean
# df_original = pd.read_csv('data/df_clean.csv')  # Original df_clean with ingredient information


def _parse_ingredient_list(cell):
    """Return the lower-cased, de-duplicated ingredient names stored in one cell."""
    if isinstance(cell, str):
        try:
            cell = ast.literal_eval(cell)
        except (ValueError, SyntaxError):
            return []
    if not isinstance(cell, (list, tuple)):
        return []
    names = {str(item).strip().lower() for item in cell}
    names.discard('')
    return sorted(names)


# Count ingredients from the same rows that form the percentage denominator.
# The earlier cells parse the raw df['RecipeIngredientParts'] as "c(...)" text, so
# stripping '[]' from it leaves 'c(' and ')' attached to the first and last
# ingredient, and raw df has a different row count than the frame loaded above.
# Each ingredient is counted once per recipe, so "Count" is a number of recipes.
ingredients_per_recipe = df_clean_reduced['New_Ingredients_List'].apply(_parse_ingredient_list)
all_ingredients = ingredients_per_recipe.explode().dropna()

# Count the frequency of each ingredient
ingredient_counts = all_ingredients.value_counts()

# Get top 20 ingredients
top_20_ingredients = ingredient_counts.head(20)

# Create a more readable display with enhanced formatting
print("Top 20 Most Common Ingredients in Recipes")
print("=" * 60)
print(f"{'Rank':<6}{'Ingredient':<35}{'Count':>10}{'Percentage':>12}")
print("-" * 60)

total_recipes = len(df_clean_reduced)
for idx, (ingredient, count) in enumerate(top_20_ingredients.items(), 1):
    percentage = (count / total_recipes) * 100
    print(f"{idx:<6}{ingredient:<35}{count:>10,}{percentage:>11.1f}%")

# Create an enhanced visualization
plt.figure(figsize=(12, 8))
# One color per plotted bar (there may be fewer than 20 distinct ingredients)
colors = sns.color_palette("husl", n_colors=len(top_20_ingredients))
bars = sns.barplot(x=top_20_ingredients.values,
                  y=top_20_ingredients.index,
                  palette=colors)
plt.title('Top 20 Most Common Ingredients in Recipes', pad=20, fontsize=14)
plt.xlabel('Number of Recipes', fontsize=12)
plt.ylabel('Ingredients', fontsize=12)
# Write each count at the end of its bar
for i, v in enumerate(top_20_ingredients.values):
    bars.text(v, i, f' {v:,}', va='center', fontsize=10)

# Adjust layout and display
plt.tight_layout()
plt.show()

In [None]:
df_clean.columns

In [None]:
def plot_top_ingredient_bigrams_no_nltk(df, column_name='New_Ingredients_List', top_n=20):
    """
    Plot the top N ingredient bigrams without using NLTK.

    A bigram is a pair of ingredients that appear next to each other, in that
    order, inside the same recipe's ingredient list. Pairs are never formed
    across two different recipes.

    Args:
        df: DataFrame with one list of ingredient strings per row in ``column_name``.
            Rows whose value is not a list are skipped.
        column_name: Name of the column holding the ingredient lists.
        top_n: Number of most frequent bigrams to plot.

    Returns:
        None. The chart is shown with ``plt.show()``.
    """
    # Custom stop words
    stop_words = {'and', 'the', 'of', 'with', 'for', 'to', 'in', 'a', 'an', 'or',
                 'as', 'at', 'by', 'from', 'into', 'on', 'that', 'this'}

    bigram_counts = {}
    for ingredient_list in df[column_name]:
        if not isinstance(ingredient_list, list):
            continue

        # Clean this recipe's ingredients: lower-case, strip punctuation,
        # drop stop words and words of 2 characters or fewer
        recipe_ingredients = []
        for ing in ingredient_list:
            words = [word.lower().strip(".,!?()") for word in ing.split()]
            clean_ing = ' '.join([word for word in words
                                 if word not in stop_words and len(word) > 2])
            if clean_ing:
                recipe_ingredients.append(clean_ing)

        # Pair neighbours within this recipe only; pairing over one flattened
        # list would join the last ingredient of a recipe to the first of the next.
        for bigram in zip(recipe_ingredients, recipe_ingredients[1:]):
            bigram_counts[bigram] = bigram_counts.get(bigram, 0) + 1

    # Get top bigrams
    top_bigrams = sorted(bigram_counts.items(), key=lambda x: x[1], reverse=True)[:top_n]

    # Plot, most frequent bigram at the top
    plt.figure(figsize=(12, 8))
    plt.barh([' '.join(bigram) for bigram, count in top_bigrams],
             [count for bigram, count in top_bigrams],
             color='lightgreen')
    plt.xlabel('Frequency')
    plt.ylabel('Ingredient Bigrams')
    plt.title(f'Top {top_n} Most Common Ingredient Bigrams')
    plt.gca().invert_yaxis()
    plt.tight_layout()
    plt.show()

In [None]:
plot_top_ingredient_bigrams_no_nltk(df_clean, column_name='New_Ingredients_List', top_n=20)

In [None]:
# !pip install networkx

In [None]:
def plot_ingredient_network(df, column_name='New_Ingredients_List', top_n=15):
    """
    Plot ingredient relationships as a clean network graph with standardized ingredient names.

    Nodes are drawn from the ``top_n`` most frequent cleaned-and-mapped
    ingredients. An edge joins two of them when they appear next to each other
    inside the same recipe's ingredient list more than twice. Node size scales
    linearly with ingredient frequency and edge width with pair frequency.

    Args:
        df: DataFrame with one ingredient collection per row in ``column_name``.
        column_name: Column holding either a list of ingredient strings or the
            text form of such a list (e.g. after a CSV round trip). Cells that
            are not iterable (such as NaN) are skipped.
        top_n: Number of most frequent ingredients eligible to become nodes.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    import ast
    import networkx as nx
    import matplotlib.pyplot as plt
    from matplotlib.colors import LinearSegmentedColormap
    from collections import Counter

    # Dictionary of ingredient mappings (provided)
    ingredient_mappings = {
        # Eggs
        'eggs': 'egg',
        'egg whites': 'egg',
        'egg white': 'egg',
        'egg yolks': 'egg',
        'egg yolk': 'egg',
        'raw egg': 'egg',
        'egg substitute': 'egg',
        'egg beaters': 'egg',

        # Sugar
        'brown sugar': 'sugar',
        'white sugar': 'sugar',
        'granulated sugar': 'sugar',
        'powdered sugar': 'sugar',
        'caster sugar': 'sugar',
        'confectioners sugar': 'sugar',
        'demerara sugar': 'sugar',
        'light demerara sugar': 'sugar',

        # Salt
        'sea salt': 'salt',
        'kosher salt': 'salt',

        # Flour
        'all-purpose flour': 'flour',
        'all purpose flour': 'flour',
        'unbleached all-purpose flour': 'flour',
        'unbleached all purpose flour': 'flour',
        'self-rising flour': 'flour',
        'plain flour': 'flour',
        'wheat flour': 'flour',

        # Oil
        'vegetable oil': 'oil',
        'olive oil': 'oil',
        'spanish olive oil': 'oil',
        'walnut oil': 'oil',
        'canola oil': 'oil',
        'extra virgin olive oil': 'oil',
        'coconut oil': 'oil',

        # Butter
        'unsalted butter': 'butter',
        'salted butter': 'butter',
        'melted butter': 'butter',
        'sweet butter': 'butter',
    }

    # Custom stop words and cleaning
    stop_words = {'and', 'the', 'of', 'with', 'for', 'to', 'in', 'a', 'an', 'or',
                 'as', 'at', 'by', 'from', 'into', 'on', 'that', 'this', 'fresh', 'dried'}

    def clean_and_map_ingredient(ing):
        """Clean ingredient and apply mapping to standardized names"""
        if not isinstance(ing, str):
            return None

        # Basic cleaning
        words = [word.lower().strip(".,!?()%") for word in ing.split()]
        clean_ing = ' '.join([word for word in words
                            if word not in stop_words and len(word) > 2])

        # Apply mapping if exists, otherwise return cleaned version
        return ingredient_mappings.get(clean_ing, clean_ing)

    def parse_ingredient_cell(cell):
        """Return the raw ingredient entries stored in one cell as a list."""
        if isinstance(cell, str):
            # literal_eval only accepts Python literals, so text coming from the
            # data file is parsed without being executed (unlike eval).
            try:
                parsed = ast.literal_eval(cell)
            except (ValueError, SyntaxError):
                parsed = None
            if isinstance(parsed, (list, tuple, set)):
                return list(parsed)
            # Not a list literal: treat the text as comma-separated names
            return cell.split(',')
        try:
            return list(cell)
        except TypeError:
            # Non-iterable cell such as NaN or None
            return []

    # Count ingredient frequencies and adjacent pairs, one recipe at a time
    ingredient_counts = Counter()
    bigram_counts = Counter()
    for ingredient_list in df[column_name]:
        recipe_ingredients = []
        for ing in parse_ingredient_cell(ingredient_list):
            mapped_ing = clean_and_map_ingredient(ing)
            if mapped_ing:
                recipe_ingredients.append(mapped_ing)

        ingredient_counts.update(recipe_ingredients)

        # Pair neighbours within this recipe only; pairing over one flattened
        # list would link the last ingredient of a recipe to the first of the next.
        for ing1, ing2 in zip(recipe_ingredients, recipe_ingredients[1:]):
            if ing1 != ing2:  # Skip self-pairs
                bigram_counts[tuple(sorted((ing1, ing2)))] += 1

    # Create network graph
    G = nx.Graph()

    # Get top ingredients by frequency (set for fast membership tests)
    top_ingredients = {ing for ing, count in ingredient_counts.most_common(top_n)}

    # Add nodes and edges (only between top ingredients seen together more than twice)
    for (ing1, ing2), weight in bigram_counts.items():
        if ing1 in top_ingredients and ing2 in top_ingredients and weight > 2:
            G.add_edge(ing1, ing2, weight=weight)

    # Only keep nodes that have edges
    G.remove_nodes_from(list(nx.isolates(G)))

    # Set up the plot with professional styling
    plt.figure(figsize=(18, 12), facecolor='white')

    # Calculate layout with better spacing (fixed seed for a reproducible layout)
    pos = nx.spring_layout(G, k=0.5, iterations=100, seed=42)

    # Both color stops are identical, so every node is drawn in the same color
    cmap = LinearSegmentedColormap.from_list('ingredient_cmap', ['#d6eb6e', '#d6eb6e'])

    # Node sizes scale linearly with frequency relative to the most common ingredient.
    # default=1 keeps this from raising when no ingredient was found at all.
    max_count = max(ingredient_counts.values(), default=1)
    node_sizes = [300 + 3000 * (ingredient_counts[node]/max_count) for node in G.nodes()]

    # Node colors looked up from degree centrality
    centrality = nx.degree_centrality(G)
    node_colors = [cmap(centrality[node]) for node in G.nodes()]

    # Edge widths based on co-occurrence frequency
    if bigram_counts:
        max_weight = max(bigram_counts.values())
        edge_weights = [2 + 3 * G[u][v]['weight']/max_weight for u, v in G.edges()]
    else:
        edge_weights = [2] * len(G.edges())

    # Draw the network with improved styling
    nx.draw_networkx_nodes(
        G, pos,
        node_size=node_sizes,
        node_color=node_colors,
        alpha=0.9,
        edgecolors='white',
        linewidths=1.5
    )

    nx.draw_networkx_edges(
        G, pos,
        width=edge_weights,
        alpha=0.3,
        edge_color='black'
    )

    # Label only the most important nodes to reduce clutter
    important_nodes = [node for node in G.nodes() if centrality[node] > 0.1]
    labels = {node: node.replace(' ', '\n') for node in important_nodes}

    nx.draw_networkx_labels(
        G, pos,
        labels=labels,
        font_size=14,
        font_family='sans-serif',
        font_weight='bold',
        alpha=1,
    )

    # Add title and adjust layout
    # plt.title('Ingredient Co-occurrence Network (Standardized Names)', fontsize=22, fontweight='bold', pad=20)
    plt.axis('off')
    plt.gca().set_facecolor('#fafafa')
    # Add legend for node size/color
    plt.text(0.95, 0.05,
            #  "Node size ≈ Ingredient frequency\nNode color ≈ Network importance",
            "Node size ≈ Ingredient frequency",
             transform=plt.gca().transAxes,
             ha='right', va='bottom',
             bbox=dict(facecolor='white', alpha=0.7),
             fontsize=16
             )

    plt.tight_layout()
    plt.show()

In [None]:
plot_ingredient_network(df_clean, top_n=15)

In [None]:
    # !pip install holoviews hvplot bokeh


In [None]:
import pandas as pd
import itertools
import numpy as np
import holoviews as hv
from holoviews import opts
import panel as pn
pn.extension()

hv.extension('bokeh')  # Enable Holoviews with Bokeh backend

from collections import Counter

# Every unordered pair of distinct ingredients within each recipe
# (set() removes repeats inside a recipe; sorted() fixes the order within a pair)
ingredient_pairs = [
    pair
    for ingredients in df_clean["New_Ingredients_List"]
    for pair in itertools.combinations(sorted(set(ingredients)), 2)
]

# Count occurrences of each ingredient pair and keep the 50 most frequent
pair_counts = Counter(ingredient_pairs)
top_pairs = pair_counts.most_common(50)

# Edge table (Source, Target, Weight) for the Holoviews chord diagram
data = [(source, target, weight) for (source, target), weight in top_pairs]
df_pairs = pd.DataFrame(data, columns=["Source", "Target", "Weight"])

# HTML heading pane for the diagram
title_html = pn.pane.HTML(
    "<h3 style='color:#FF6347; font-size:16px; text-align:center;'>Ingredient Co-occurrence Chord Diagram</h3>",
    width=600
)

# Styling: nodes colored by index, edges colored by pair weight
chord_options = opts.Chord(
    cmap='Category20',
    edge_cmap='viridis',
    edge_color='Weight',
    labels='index',
    node_color='index',
    height=600, width=600,
)
chord = hv.Chord(df_pairs).opts(chord_options)

# save the chord diagram
hv.save(chord, 'chord_diagram.html')

# Display
chord
