In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Load the merged input CSV into the notebook-wide DataFrame `df`
file_path = 'merged_input.csv'
df = pd.read_csv(file_path)

# Sanity check: dimensions and column names, then a preview of the first rows
print(df.shape, df.columns, sep='\n')
df.head()

### Most and Least ordered recipes

In [None]:
# Count how many rows carted each recipe. value_counts() sorts descending,
# so head() is the most ordered end and tail() the least ordered end.
recipe_order_counts = df.loc[df['AddToCart'] == True, 'RecipeID'].value_counts()

# Top 10 most ordered recipes
most_ordered = recipe_order_counts.head(10)

# Bottom 10 least ordered recipes (among those ordered at least once)
least_ordered = recipe_order_counts.tail(10)

print("Top 10 Most Ordered Recipes:")
print(most_ordered)

print("\nBottom 10 Least Ordered Recipes (ordered at least once):")
print(least_ordered)


def _plot_recipe_order_counts(counts, title, color):
    """Draw one bar chart of per-recipe order counts.

    counts: Series of order counts indexed by RecipeID.
    title:  chart title.
    color:  bar colour.
    Nothing is drawn when `counts` is empty (no recipe was ever carted).
    """
    if counts.empty:
        print(f"{title}: nothing to plot")
        return
    plt.figure(figsize=(10,5))
    counts.plot(kind='bar', color=color)
    plt.title(title)
    plt.xlabel('Recipe ID')
    plt.ylabel('Order Count')
    plt.xticks(rotation=45, ha='right')
    plt.grid(axis='y', linestyle='--', alpha=0.7)
    plt.tight_layout()
    plt.show()


# Optional: visualize both ends of the ranking
_plot_recipe_order_counts(most_ordered, 'Top 10 Most Ordered Recipes', 'seagreen')
_plot_recipe_order_counts(least_ordered, 'Bottom 10 Least Ordered Recipes', 'firebrick')

# Optional extra: list recipes that were *never* ordered.
# dropna() matters: value_counts() ignores missing RecipeIDs, so without it a
# NaN id would always be reported as a "never ordered" recipe.
never_ordered = set(df['RecipeID'].dropna().unique()) - set(recipe_order_counts.index)
print(f"\nRecipes never ordered: {len(never_ordered)}")
if len(never_ordered) > 0:
    # Sort by string form so the preview is the same on every run
    print("Example never-ordered recipes:", sorted(never_ordered, key=str)[:5])


### Activity By Day

In [None]:
# Translation table: DayOfWeek code -> full weekday name
days = {
    'M': 'Monday',
    'T': 'Tuesday',
    'W': 'Wednesday',
    'Th': 'Thursday',
    'F': 'Friday',
    'Sa': 'Saturday',
    'S': 'Sunday',
}

# Tally rows per weekday; value_counts() orders the result from most to least frequent
day_names = df['DayOfWeek'].map(days)
day_counts = day_names.value_counts()

# Weekday with the highest count
most_active_day = day_counts.idxmax()
print("Most active day of the week:", most_active_day)

# Bar chart: every day in grey, the busiest one highlighted
fig, ax = plt.subplots(figsize=(8,5))
bars = ax.bar(day_counts.index, day_counts.values, color='lightgray')
bars[day_counts.index.get_loc(most_active_day)].set_color('tomato')

ax.set(title='Sessions by Day of the Week', xlabel='Day', ylabel='Number of Sessions')
plt.xticks(rotation=45)
ax.grid(axis='y', linestyle='--', alpha=0.7)
fig.tight_layout()
plt.show()


### Usage by time of day/meal (DayZone)

In [None]:
# Readable names for the DayZone codes
zone_labels = dict(B='Breakfast', L='Lunch', D='Dinner', AD='After Dinner')

# Translate the codes on the fly (df itself is left untouched) and tally rows per zone
dayzone_names = df['DayZone'].map(zone_labels)
dayzone_counts = dayzone_names.value_counts()

# Bar chart of the tallies
fig, ax = plt.subplots(figsize=(7,5))
dayzone_counts.plot(kind='bar', color='mediumslateblue', ax=ax)
ax.set(title='Usage by Time of Day / Meal', xlabel='Meal / Time of Day', ylabel='Number of Sessions')
plt.xticks(rotation=0)
ax.grid(axis='y', linestyle='--', alpha=0.7)
fig.tight_layout()
plt.show()


### Users by region

In [None]:
# Distinct UserIDs per Location, smallest first so the largest bar ends up on top of the barh chart
users_by_region = (
    df.groupby('Location')['UserID']
    .nunique()
    .sort_values()
)

# Horizontal bar chart
fig, ax = plt.subplots(figsize=(10,6))
users_by_region.plot(kind='barh', color='mediumseagreen', ax=ax)
ax.set(title='Number of Unique Users by Region', xlabel='Number of Users', ylabel='Region')
ax.grid(axis='x', linestyle='--', alpha=0.7)
fig.tight_layout()
plt.show()


### Usage on Weekday vs Weekend (By Day Type)

In [None]:
# Rows per DayType value
daytype_counts = df['DayType'].value_counts()

# Bar chart, one colour per bar
fig, ax = plt.subplots(figsize=(6,4))
daytype_counts.plot(kind='bar', color=['steelblue', 'salmon'], ax=ax)
ax.set(title='Session Count: Weekday vs Weekend', xlabel='Day Type', ylabel='Number of Sessions')
plt.xticks(rotation=0)
ax.grid(axis='y', linestyle='--', alpha=0.7)
fig.tight_layout()
plt.show()


### Number of visits per domain

In [None]:
# Rows per Domain value, most frequent first
domain_counts = df['Domain'].value_counts()

# Bar chart of every domain
fig, ax = plt.subplots(figsize=(10,5))
domain_counts.plot(kind='bar', color='darkcyan', ax=ax)
ax.set(title='Most Visited Domains', xlabel='Domain', ylabel='Number of Visits')
plt.xticks(rotation=45, ha='right')
ax.grid(axis='y', linestyle='--', alpha=0.7)
fig.tight_layout()
plt.show()


### Browser Usage

In [None]:
# Function to extract browser name from UserAgent
def extract_browser(ua):
    """Classify a raw User-Agent value into a coarse browser family.

    Parameters
    ----------
    ua : str or missing value
        The User-Agent string; None/NaN is accepted.

    Returns
    -------
    str
        'Unknown' for a missing value, otherwise one of 'Edge', 'Opera',
        'Chrome', 'Firefox', 'Safari' or 'Other'.
    """
    if pd.isna(ua):
        return 'Unknown'
    ua = str(ua).lower()
    # Order matters. Chromium-based Edge ("Edg/") and Opera ("OPR/") also
    # carry the "Chrome" and "Safari" tokens, and Chrome carries "Safari",
    # so the most specific tokens have to be tested first.
    if 'edg' in ua:  # matches legacy "Edge/" as well as "Edg/", "EdgA/", "EdgiOS/"
        return 'Edge'
    if 'opr' in ua or 'opera' in ua:
        return 'Opera'
    if 'chrome' in ua:
        return 'Chrome'
    if 'firefox' in ua:
        return 'Firefox'
    if 'safari' in ua:
        return 'Safari'
    return 'Other'

# Classify every row's UserAgent, then tally rows per browser family
browser_series = df['UserAgent'].map(extract_browser)
browser_counts = browser_series.value_counts()

# Bar chart of the tallies
fig, ax = plt.subplots(figsize=(8,5))
browser_counts.plot(kind='bar', color='royalblue', ax=ax)
ax.set(title='Browser Usage Distribution', xlabel='Browser', ylabel='Number of Sessions')
plt.xticks(rotation=0)
ax.grid(axis='y', linestyle='--', alpha=0.7)
fig.tight_layout()
plt.show()

# Same numbers as text
print("Browser counts:")
print(browser_counts)


### User Visit Frequency

In [None]:
# Rows per UserID
user_counts = df['UserID'].value_counts()

# A user is "repeating" when they appear on more than one row
repeating_users = user_counts.gt(1).sum()
total_users = len(user_counts)

print(f"Total users: {total_users}")
print(f"Repeating users: {repeating_users}")
print(f"Percentage of repeat users: {repeating_users / total_users * 100:.2f}%")

# How many users have 1 row, 2 rows, ... (x-axis sorted by row count)
users_per_visit_count = user_counts.value_counts().sort_index()
fig, ax = plt.subplots(figsize=(8,5))
users_per_visit_count.plot(kind='bar', ax=ax)
ax.set(
    title='Distribution of User Visit Frequency',
    xlabel='Number of Sessions per User',
    ylabel='Number of Users',
)
ax.grid(axis='y', linestyle='--', alpha=0.7)
fig.tight_layout()
plt.show()


### Features Usage

In [None]:
# Columns that already hold one flag per row
feature_cols = ['ChatInteraction', 'RecipeSearch', 'AddToCart', 'RecipeSaved', 'TabChange']

# Derived flags are added to a copy so the shared df is not modified
temp = df.copy()

# "Viewed recipes" is true when either of the two view columns is populated
viewed_any = temp['ViewedRecipes'].notna() | temp['ViewedRecipesBeforeWidget'].notna()
temp['ViewedRecipesAction'] = viewed_any

# The widget counts as used when its duration is positive
temp['WidgetUsed'] = temp['WidgetDuration'].gt(0)

# Sum every flag column and rank the features, largest total first
usage_cols = feature_cols + ['ViewedRecipesAction', 'WidgetUsed']
feature_usage = temp[usage_cols].sum().sort_values(ascending=False)

# Bar chart of the ranking
fig, ax = plt.subplots(figsize=(8,5))
feature_usage.plot(kind='bar', color='seagreen', ax=ax)
ax.set(title='Most Used Features', xlabel='Feature', ylabel='Usage Count')
plt.xticks(rotation=30, ha='right')
ax.grid(axis='y', linestyle='--', alpha=0.7)
fig.tight_layout()
plt.show()


### Most Saved Recipes

In [None]:
# Rows on which the recipe was saved
saved_recipes = df.loc[df['RecipeSaved'].eq(True)]

# Saves per RecipeID, keeping the ten largest counts
save_counts = saved_recipes['RecipeID'].value_counts()
most_saved_recipes = save_counts.head(10)

# Bar chart of the top ten
fig, ax = plt.subplots(figsize=(10,5))
most_saved_recipes.plot(kind='bar', color='tomato', ax=ax)
ax.set(title='Top 10 Most Saved Recipes', xlabel='Recipe ID', ylabel='Save Count')
plt.xticks(rotation=45, ha='right')
ax.grid(axis='y', linestyle='--', alpha=0.7)
fig.tight_layout()
plt.show()

# Same ranking as text
print("Most saved recipes:")
print(most_saved_recipes)


### Common Allergies

In [None]:
# Drop NaN, split the semicolon-separated allergy codes into one row per code
allergy_series = (
    df['Allergy']
    .dropna()
    .str.split(';')
    .explode()
    .str.strip()
)

# A trailing or doubled ';' leaves an empty token behind; drop those so ''
# is not counted as an allergy code (parse_list / allergy_count filter them too).
allergy_series = allergy_series[allergy_series.ne('')]

# Count frequency of each allergy code
allergy_counts = allergy_series.value_counts().head(15)  # top 15 for readability

# Plot
plt.figure(figsize=(10,5))
allergy_counts.plot(kind='bar', color='indianred')
plt.title('Most Common Allergy Codes')
plt.xlabel('Allergy Code')
plt.ylabel('Count')
plt.xticks(rotation=45, ha='right')
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()

print("Top allergy codes:")
print(allergy_counts)


### Daily Active Users

In [None]:
# Distinct UserIDs for each Date value (groupby sorts the keys)
# NOTE(review): Date is grouped exactly as loaded; if it is a string column the
# x-axis order is lexical rather than chronological — confirm the format.
daily_active_users = df.groupby('Date')['UserID'].nunique()

# Line chart with one marker per date
fig, ax = plt.subplots(figsize=(10,5))
daily_active_users.plot(kind='line', marker='o', ax=ax)
ax.set(title='Daily Active Users (DAU)', xlabel='Date', ylabel='Number of Unique Users')
plt.xticks(rotation=45, ha='right')
ax.grid(True, linestyle='--', alpha=0.6)
fig.tight_layout()
plt.show()

# Optional: print the 5 busiest days
busiest_days = daily_active_users.sort_values(ascending=False)
print("Top 5 days by activity:")
print(busiest_days.head())


### Tab change Percentage

In [None]:
# Distinct users overall
total_users = df['UserID'].nunique()

# Distinct users with at least one row where TabChange is True
tab_change_rows = df[df['TabChange'].eq(True)]
users_tab_changed = tab_change_rows['UserID'].nunique()

# Share of users who changed tabs, in percent
tab_change_pct = (users_tab_changed / total_users) * 100

print(f"Total users: {total_users}")
print(f"Users who changed tabs: {users_tab_changed}")
print(f"Percentage of users who changed tabs: {tab_change_pct:.2f}%")

# Pie chart: changed vs. did not change
users_no_tab_change = total_users - users_tab_changed
fig, ax = plt.subplots(figsize=(4,4))
ax.pie(
    [users_tab_changed, users_no_tab_change],
    labels=['Changed Tab', 'Did Not Change Tab'],
    autopct='%1.1f%%',
    colors=['darkorange', 'lightgray']
)
ax.set_title('Percentage of Users Who Changed Tabs')
fig.tight_layout()
plt.show()


### Null rate by Region

In [None]:
# Drop rows without a Location
df_valid = df.dropna(subset=['Location']).copy()

# Flag rows with a follow-up: something was viewed or added to cart
df_valid['FollowUpAction'] = (
    df_valid['ViewedRecipes'].notna() | (df_valid['AddToCart'] == True)
)

# For each Location: % of search rows with no follow-up.
# The mean of the negated boolean flag is the null share; grouping the flag
# directly avoids groupby.apply over whole sub-frames (deprecated in pandas
# 2.2+ when it touches the grouping column, and much slower).
search_sessions = df_valid[df_valid['RecipeSearch'] == True]
search_null_rate = (
    (~search_sessions['FollowUpAction'])
    .groupby(search_sessions['Location'])
    .mean()
    .mul(100)
    .sort_values(ascending=False)
    .rename('SearchNullRate')
)

# Show the 10 Locations with the highest null rate
search_null_rate_top10 = search_null_rate.head(10)

# Plot
plt.figure(figsize=(10,6))
search_null_rate_top10.plot(kind='barh', color='firebrick')
plt.title('Search Null Rate by Country (Search Without View/Add-to-Cart)')
plt.xlabel('Null Rate (%)')
plt.ylabel('Country')
plt.gca().invert_yaxis()  # highest rate at the top
plt.grid(axis='x', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()

print("Search Null Rate by Country (%):")
print(search_null_rate.round(2))


### Usage by Month period

In [None]:
# Readable names for the MonthType codes
month_labels = dict(B='Beginning', M='Middle', E='End')

# Tally rows per period and force the Beginning -> Middle -> End order
period_order = ['Beginning', 'Middle', 'End']
usage_by_month_type = (
    df['MonthType']
    .map(month_labels)
    .value_counts()
    .reindex(period_order)
)

# Bar chart, one shade per period
fig, ax = plt.subplots(figsize=(7,5))
usage_by_month_type.plot(kind='bar', color=['skyblue', 'steelblue', 'navy'], ax=ax)
ax.set(title='Usage Throughout the Month', xlabel='Month Period', ylabel='Number of Sessions')
plt.xticks(rotation=0)
ax.grid(axis='y', linestyle='--', alpha=0.7)
fig.tight_layout()
plt.show()

# Same numbers as text
print("Usage count by month period:")
print(usage_by_month_type)


### Top Saved Ingredients and Ingredient Pair Synergy

In [None]:
from itertools import combinations
from collections import Counter

# Rank recipes by number of saves and keep the ten most saved
saved_recipes = df[df['RecipeSaved'] == True]
top_saved = saved_recipes['RecipeID'].value_counts().head(10)

# One row per top-saved recipe. Without the de-duplication every row of a
# recipe would contribute its ingredient list again, so the statistics below
# would be weighted by traffic instead of describing the ten recipes.
# NOTE(review): assumes Ingredients is identical on every row of a RecipeID — confirm.
top_recipes_df = (
    df[df['RecipeID'].isin(top_saved.index)]
    .dropna(subset=['Ingredients'])
    .drop_duplicates(subset='RecipeID')
    .copy()
)

# Parse ingredients (semicolon-separated); blanks and repeats within one
# recipe are dropped, first-seen order is kept
top_recipes_df['IngredientList'] = top_recipes_df['Ingredients'].apply(
    lambda x: list(dict.fromkeys(i.strip() for i in str(x).split(';') if i.strip()))
)

# --- Average number of ingredients in top saved recipes ---
avg_ingredients = top_recipes_df['IngredientList'].apply(len).mean()
print(f"Average number of ingredients in top saved recipes: {avg_ingredients:.2f}")

# --- Most common ingredients among top recipes (number of recipes using each) ---
all_ingredients = [i for sublist in top_recipes_df['IngredientList'] for i in sublist]
common_ingredients = pd.Series(all_ingredients).value_counts().head(10)

plt.figure(figsize=(10,5))
common_ingredients.plot(kind='bar', color='seagreen')
plt.title('Most Common Ingredients in Top Saved Recipes')
plt.xlabel('Ingredient ID')
plt.ylabel('Frequency')
plt.xticks(rotation=45, ha='right')
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()

# --- Common ingredient pairs (number of recipes containing both) ---
# Sorting each recipe's ingredients makes (a, b) and (b, a) the same key.
ingredient_pairs = Counter()
for ingredients in top_recipes_df['IngredientList']:
    ingredient_pairs.update(combinations(sorted(set(ingredients)), 2))

common_pairs = pd.Series(ingredient_pairs, dtype='int64').sort_values(ascending=False)

# The full pair table grows quadratically with recipe size; show the top of it
print("\nMost common ingredient pairs in top saved recipes:")
print(common_pairs.head(10))


### Ready recipes
How many users viewed/saved recipes whose ingredients were already in their pantry, i.e. recipes that are ready for them to cook

In [None]:
# Keep only rows with a non-missing Pantry value; work on a copy so the
# columns added later in this cell do not touch df
df_ready = df.dropna(subset=['Pantry']).copy()

# Turn a semicolon-separated cell into a clean list of tokens
def parse_list(x):
    """Split a ';'-separated value into stripped, non-empty string tokens.

    A missing value (None/NaN) yields an empty list; any other value is
    converted with str() before splitting.
    """
    if pd.isna(x):
        return []
    stripped = (piece.strip() for piece in str(x).split(';'))
    return [token for token in stripped if token]

# Parse both semicolon-separated columns into list columns
for source_col, list_col in (('Pantry', 'PantryList'), ('Ingredients', 'IngredientsList')):
    df_ready[list_col] = df_ready[source_col].apply(parse_list)

# A row qualifies when something was viewed or the recipe was saved
was_viewed = df_ready['ViewedRecipes'].notna()
was_saved = df_ready['RecipeSaved'] == True
df_ready['ViewedOrSaved'] = was_viewed | was_saved

# Define readiness with a 50% ingredient match threshold (by default)
def is_ready(pantry, recipe, threshold=0.5):
    """Return True when enough of a recipe's ingredients are in the pantry.

    Parameters
    ----------
    pantry : iterable
        Ingredients the user already has.
    recipe : iterable
        Ingredients the recipe needs; repeated entries count once.
    threshold : float, default 0.5
        Minimum share of distinct recipe ingredients that must be in the pantry.

    Returns
    -------
    bool
        False for a recipe with no ingredients; otherwise whether the matched
        share is at least `threshold`.
    """
    needed = set(recipe)
    if not needed:
        return False
    # Both sides of the ratio use distinct ingredients, so a recipe that lists
    # an ingredient twice is not penalised in the denominator.
    matched = needed & set(pantry)
    return len(matched) / len(needed) >= threshold

# A row is "ready" only if it was viewed/saved AND the pantry covers enough of the recipe
df_ready['ReadyRecipe'] = [
    bool(viewed_or_saved) and is_ready(pantry, recipe)
    for viewed_or_saved, pantry, recipe in zip(
        df_ready['ViewedOrSaved'], df_ready['PantryList'], df_ready['IngredientsList']
    )
]

# Distinct users with at least one ready row, out of all users with pantry data
ready_rows = df_ready[df_ready['ReadyRecipe']]
ready_users = ready_rows['UserID'].nunique()
total_users = df_ready['UserID'].nunique()

ready_percentage = (ready_users / total_users) * 100

print(f"Total users with pantry info: {total_users}")
print(f"Users with ≥50% ingredients ready: {ready_users}")
print(f"Percentage of users with ready recipes: {ready_percentage:.2f}%")

# Optional pie visualization
ready_counts = [ready_users, total_users - ready_users]
fig, ax = plt.subplots(figsize=(4,4))
ax.pie(
    ready_counts,
    labels=['50%+ Ready', 'Not Ready'],
    autopct='%1.1f%%',
    colors=['mediumseagreen', 'lightgray']
)
ax.set_title('Users with 50%+ Ready Recipes')
fig.tight_layout()
plt.show()


### Time to First Interaction
For each session, compute the time delta between SessionStart and the earliest of WidgetStart or ChatStart to estimate latency to first UI interaction; ignore sessions with both timestamps missing

In [None]:
# Parse the three clock-time columns into local Series (df is not modified);
# values that do not match HH:MM:SS become NaT
clock_format = '%H:%M:%S'
session_start = pd.to_datetime(df['SessionStart'], format=clock_format, errors='coerce')
widget_start = pd.to_datetime(df['WidgetStart'], format=clock_format, errors='coerce')
chat_start = pd.to_datetime(df['ChatStart'], format=clock_format, errors='coerce')

# Row-wise earliest of the two interaction times
interaction_times = pd.concat([widget_start, chat_start], axis=1)
first_interaction = interaction_times.min(axis=1)

# Keep rows that have a session start and at least one interaction time
usable = first_interaction.notna() & session_start.notna()
valid_sessions = df[usable].copy()

# Delay from session start to first interaction, in minutes
delay = first_interaction.loc[valid_sessions.index] - session_start.loc[valid_sessions.index]
ttfi = delay.dt.total_seconds() / 60

# A negative delay means the clock passed midnight: add one day
ttfi = ttfi.where(ttfi >= 0, ttfi + 24*60)

# Basic stats
avg_ttf = ttfi.mean()
median_ttf = ttfi.median()

print(f"Average Time-to-First-Interaction: {avg_ttf:.2f} minutes")
print(f"Median Time-to-First-Interaction: {median_ttf:.2f} minutes")

# Optional: histogram of the delays
fig, ax = plt.subplots(figsize=(8,5))
ttfi.dropna().plot(kind='hist', bins=20, color='slateblue', edgecolor='black', ax=ax)
ax.set(
    title='Distribution of Time-to-First-Interaction',
    xlabel='TTFI (minutes)',
    ylabel='Number of Sessions',
)
ax.grid(axis='y', linestyle='--', alpha=0.7)
fig.tight_layout()
plt.show()

### Saved-to-Cart Migration Rate	
Among user–recipe pairs with any session where RecipeSaved = True, calculate the proportion that later exhibit AddToCart = True for the same RecipeID in a subsequent session, measuring commitment progression beyond save intent

In [None]:
# Rows on which the recipe was saved
saved_rows = df[df['RecipeSaved'] == True]
pair_cols = ['UserID', 'RecipeID']

# Distinct (user, recipe) pairs with a save
total_saved_pairs = len(saved_rows[pair_cols].drop_duplicates())

# Pairs where a saved row ALSO has AddToCart == True, i.e. save and cart on the same row.
# NOTE(review): the section heading defines the metric as a cart in a
# *subsequent* session; this code only detects same-row save+cart — confirm
# which definition is wanted.
saved_and_cart = saved_rows[saved_rows['AddToCart'] == True]
migrated_pairs = len(saved_and_cart[pair_cols].drop_duplicates())

# Share of saved pairs that were also carted, in percent (0 when nothing was saved)
if total_saved_pairs > 0:
    migration_rate = (migrated_pairs / total_saved_pairs) * 100
else:
    migration_rate = 0

print(f"Total unique saved user–recipe pairs: {total_saved_pairs}")
print(f"Saved pairs also added to cart: {migrated_pairs}")
print(f'Saved-to-Cart Migration Rate: {migration_rate:.2f}%')

# Optional: pie visualization
fig, ax = plt.subplots(figsize=(4,4))
ax.pie(
    [migrated_pairs, total_saved_pairs - migrated_pairs],
    labels=['Saved + Cart', 'Saved Only'],
    autopct='%1.1f%%',
    colors=['mediumseagreen', 'lightgray']
)
ax.set_title('Saved-to-Cart Migration Rate')
fig.tight_layout()
plt.show()


### Widget Reopen Rate	
Percentage of sessions where ActivityLog contains more than one “Widget opened” event, indicating that the widget was closed and reopened within the same session; parse ActivityLog tokens and count reopen occurrences per session, then divide by total sessions

In [None]:
# Count "Widget opened" occurrences per row; a missing log counts as zero opens
widget_open_counts = df['ActivityLog'].fillna('').apply(lambda x: str(x).count('Widget opened'))

# Rows where the widget was reopened (more than one open event)
reopened_sessions = (widget_open_counts > 1).sum()

# Denominator: every row of df
total_sessions = df.shape[0]

# Reopen rate in percent (0 for an empty frame instead of a division error)
reopen_rate = (reopened_sessions / total_sessions) * 100 if total_sessions > 0 else 0

print(f"Total sessions: {total_sessions}")
print(f"Sessions with widget reopened: {reopened_sessions}")
print(f"Widget Reopen Rate: {reopen_rate:.2f}%")

# Optional: plot reopen vs non-reopen sessions.
# The second slice is everything that was not reopened — it includes rows
# with zero opens, so it is labelled 'Not Reopened' rather than 'Single Open'.
plt.figure(figsize=(4,4))
plt.pie(
    [reopened_sessions, total_sessions - reopened_sessions],
    labels=['Reopened', 'Not Reopened'],
    autopct='%1.1f%%',
    colors=['mediumseagreen', 'lightgray']
)
plt.title('Widget Reopen Rate')
plt.tight_layout()
plt.show()


### Multi-Session Recipe Commitment	
Count recipes that are viewed in one session and saved/carted in a subsequent session by the same user, indicating deliberate consideration behavior.

In [None]:
key_cols = ['UserID', 'RecipeID']

# Rows with a recorded view: (user, recipe, parsed view date)
views = (
    df.loc[df['ViewedRecipes'].notna(), key_cols + ['Date']]
    .rename(columns={'Date': 'ViewDate'})
    .assign(ViewDate=lambda frame: pd.to_datetime(frame['ViewDate'], errors='coerce'))
)

# Rows with a save or a cart action: (user, recipe, parsed action date)
is_commit_row = (df['RecipeSaved'] == True) | (df['AddToCart'] == True)
saves_carts = (
    df.loc[is_commit_row, key_cols + ['Date']]
    .rename(columns={'Date': 'ActionDate'})
    .assign(ActionDate=lambda frame: pd.to_datetime(frame['ActionDate'], errors='coerce'))
)

# Earliest view date per user–recipe pair
first_view = views.groupby(key_cols, as_index=False).agg(FirstViewDate=('ViewDate', 'min'))

# Pair every first view with all save/cart rows of the same user and recipe
merged = first_view.merge(saves_carts, on=key_cols, how='inner')

# A pair is committed when some action date is strictly later than its first view date
acted_later = merged['ActionDate'] > merged['FirstViewDate']
committed_pairs = merged.loc[acted_later, key_cols].drop_duplicates()

# Totals
total_viewed_pairs = len(first_view)
committed_count = len(committed_pairs)
if total_viewed_pairs > 0:
    commitment_rate = (committed_count / total_viewed_pairs) * 100
else:
    commitment_rate = 0

print(f"Total viewed user–recipe pairs: {total_viewed_pairs}")
print(f"Pairs with later save/cart action: {committed_count}")
print(f"Multi-Session Recipe Commitment Rate: {commitment_rate:.2f}%")

# Optional visualization
fig, ax = plt.subplots(figsize=(4,4))
ax.pie(
    [committed_count, total_viewed_pairs - committed_count],
    labels=['Committed (View→Later Save/Cart)', 'No Commitment'],
    autopct='%1.1f%%',
    colors=['teal', 'lightgray']
)
ax.set_title('Multi-Session Recipe Commitment Rate')
fig.tight_layout()
plt.show()


### Exploratory Index	
For each session, calculate: (Number of unique recipes viewed + Number of unique domains viewed) / Session duration in minutes.

In [None]:
def _unique_token_count(cell):
    """Number of distinct non-empty tokens in a ';'-separated value."""
    return len({token.strip() for token in str(cell).split(';') if token.strip()})

# Distinct recipes / domains per row (missing values count as zero)
viewed_recipes = df['ViewedRecipes'].fillna('').apply(_unique_token_count)
viewed_domains = df['Domain'].fillna('').apply(_unique_token_count)

# Parse start/end clock times into local Series (df is not modified)
clock_format = '%H:%M:%S'
session_start = pd.to_datetime(df['SessionStart'], format=clock_format, errors='coerce')
session_end = pd.to_datetime(df['SessionEnd'], format=clock_format, errors='coerce')

# Duration in minutes; a negative value means the session crossed midnight, so add a day
session_duration = (session_end - session_start).dt.total_seconds() / 60
session_duration = session_duration.where(session_duration >= 0, session_duration + 24*60)

# Only rows with a strictly positive duration are kept (also removes missing durations)
valid = session_duration > 0

# Exploratory Index = unique items seen per minute
exploratory_index = ((viewed_recipes + viewed_domains) / session_duration)[valid]

# Summary stats
print(f"Average Exploratory Index: {exploratory_index.mean():.3f}")
print(f"Median Exploratory Index: {exploratory_index.median():.3f}")

# Optional: histogram
fig, ax = plt.subplots(figsize=(8,5))
exploratory_index.plot(kind='hist', bins=20, color='darkcyan', edgecolor='black', ax=ax)
ax.set(
    title='Distribution of Exploratory Index',
    xlabel='Exploratory Index (Unique Views per Minute)',
    ylabel='Number of Sessions',
)
ax.grid(axis='y', linestyle='--', alpha=0.7)
fig.tight_layout()
plt.show()


### Conversion Rate for Users with Allergies
The ratio of the conversion rate (AddToCart or RecipeSaved) for users with >=3 allergies compared to the platform average rate.

In [None]:
# Count the allergy codes in one cell, tolerating missing values
def allergy_count(x):
    """Return how many non-empty ';'-separated tokens `x` contains.

    A missing value (None/NaN) counts as 0; any other value is converted
    with str() before splitting.
    """
    if pd.isna(x):
        return 0
    return sum(1 for token in str(x).split(';') if token.strip())

# Allergy count per user, taken from each user's first non-missing Allergy value
first_allergy_per_user = df.groupby('UserID')['Allergy'].first()
user_allergy_counts = first_allergy_per_user.map(allergy_count)

# Constrained users: three or more allergies
has_many_allergies = user_allergy_counts >= 3
constrained_users = set(user_allergy_counts.index[has_many_allergies])

# A row converts when the recipe was carted or saved
conversion = (df['AddToCart'] == True) | (df['RecipeSaved'] == True)

# Conversion rate over all rows, in percent
platform_rate = conversion.mean() * 100

# Conversion rate over the rows of constrained users, in percent
from_constrained_user = df['UserID'].isin(constrained_users)
constrained_rate = conversion[from_constrained_user].mean() * 100

# Ratio of the constrained rate to the overall rate (0 when nothing converts)
if platform_rate > 0:
    success_ratio = constrained_rate / platform_rate
else:
    success_ratio = 0

print(f"Platform conversion rate: {platform_rate:.2f}%")
print(f"Constrained user conversion rate (≥3 allergies): {constrained_rate:.2f}%")
print(f"Constrained User Success Rate (ratio): {success_ratio:.3f}")

# Optional: side-by-side bar comparison
fig, ax = plt.subplots(figsize=(6,4))
ax.bar(['All Users', '≥3 Allergies'], [platform_rate, constrained_rate], color=['steelblue', 'salmon'])
ax.set(title='Constrained User Success Rate', ylabel='Conversion Rate (%)')
ax.grid(axis='y', linestyle='--', alpha=0.7)
fig.tight_layout()
plt.show()


### Browsing & Added to Cart	
Number of sessions with view/search recipe action without carting actions /  Total Number of sessions with view/search recipe actions

In [None]:
# Per-row flags
view_or_search = df['ViewedRecipes'].notna() | (df['RecipeSearch'] == True)
cart_action = df['AddToCart'] == True

# Denominator: every row with a view or a search
total_browsing_sessions = view_or_search

# Numerator: those rows that did not end in a cart action
browse_only_sessions = total_browsing_sessions & ~cart_action

# Browse-only share in percent (0 when there are no browsing rows)
browsing_total = total_browsing_sessions.sum()
browse_only_total = browse_only_sessions.sum()
if browsing_total > 0:
    browse_rate = (browse_only_total / browsing_total) * 100
else:
    browse_rate = 0

print(f"Total sessions with view/search: {browsing_total}")
print(f"Sessions with view/search but no cart: {browse_only_total}")
print(f'Session-Level Browsing Behavior Rate: {browse_rate:.2f}%')

# Optional visualization
fig, ax = plt.subplots(figsize=(4,4))
ax.pie(
    [browse_only_total, browsing_total - browse_only_total],
    labels=['Browse Only', 'Browse + Cart'],
    autopct='%1.1f%%',
    colors=['goldenrod', 'lightgray']
)
ax.set_title('Session-Level Browsing Behavior')
fig.tight_layout()
plt.show()


### user drop off %	
% of users that leave without doing any actual tasks

In [None]:
# Define engagement (any real task) per row
engagement = (
    df['ViewedRecipes'].notna() |
    (df['RecipeSearch'] == True) |
    (df['RecipeSaved'] == True) |
    (df['AddToCart'] == True) |
    (df['WidgetDuration'] > 0) |
    (df['ChatInteraction'] == True)
)

# Row-level drop-off = NOT engaged
dropoff = ~engagement

# User-level roll-up: a user is engaged if ANY of their rows is engaged.
# Counting the users of non-engaged rows instead would label a user as a
# drop-off even when they engaged on another visit, so the 'Drop-Off' and
# 'Engaged' slices below would not be a true partition of the users.
user_engaged = engagement.groupby(df['UserID']).any()

# Count unique users
total_users = df['UserID'].nunique()
dropoff_users = int((~user_engaged).sum())

dropoff_rate = (dropoff_users / total_users * 100) if total_users > 0 else 0

print(f"Total users: {total_users}")
print(f"Users who dropped off (no meaningful action): {dropoff_users}")
print(f"User Drop-Off Rate: {dropoff_rate:.2f}%")

# Optional pie chart
plt.figure(figsize=(4,4))
plt.pie(
    [dropoff_users, total_users - dropoff_users],
    labels=['Drop-Off Users', 'Engaged Users'],
    autopct='%1.1f%%',
    colors=['firebrick', 'lightgray']
)
plt.title('User Drop-Off Rate')
plt.tight_layout()
plt.show()


### user drop off location	
Which page users most often drop off from (based on ActivityLog)

In [None]:
# A row is engaged when any of these signals is present
engagement_signals = pd.concat(
    [
        df['ViewedRecipes'].notna(),
        df['RecipeSearch'] == True,
        df['RecipeSaved'] == True,
        df['AddToCart'] == True,
        df['WidgetDuration'] > 0,
        df['ChatInteraction'] == True,
    ],
    axis=1,
)
engagement = engagement_signals.any(axis=1)

# Drop-off rows are the ones with no signal at all (copied so a column can be added later)
dropoff_sessions = df.loc[~engagement].copy()

# Pick the last non-empty entry of a semicolon-separated ActivityLog value
def get_last_event(log):
    """Return the final non-blank ';'-separated token of `log`.

    A missing value, or a value with no non-blank token, yields
    "No Activity Recorded".
    """
    fallback = "No Activity Recorded"
    if pd.isna(log):
        return fallback
    # Walk backwards so the first non-blank token found is the last one logged
    for piece in reversed(str(log).split(';')):
        piece = piece.strip()
        if piece:
            return piece
    return fallback

# Last logged event of every drop-off row
dropoff_sessions['LastEvent'] = dropoff_sessions['ActivityLog'].map(get_last_event)

# How often each event is the last one logged, most frequent first
dropoff_points = dropoff_sessions['LastEvent'].value_counts()
top_dropoff_points = dropoff_points.head(10)

print("Most common drop-off pages/events:")
print(top_dropoff_points)

# Optional: bar chart of the ten most frequent drop-off points
fig, ax = plt.subplots(figsize=(10,5))
top_dropoff_points.plot(kind='bar', color='firebrick', ax=ax)
ax.set(
    title='Most Common Drop-Off Locations (Last Logged Event)',
    xlabel='Event / Page',
    ylabel='Drop-Off Count',
)
plt.xticks(rotation=45, ha='right')
ax.grid(axis='y', linestyle='--', alpha=0.7)
fig.tight_layout()
plt.show()


### Domain Service Classification	
Classify partners (domains) by the service they provide best, e.g. recipe browsing, ingredient buying, etc.

In [None]:
# Define meaningful actions (one boolean per row)
view = df['ViewedRecipes'].notna()
save = df['RecipeSaved'] == True
cart = df['AddToCart'] == True
widget = df['WidgetDuration'] > 0

# Engagement = any of the above; drop-off = none of them
engagement = view | save | cart | widget
dropoff = ~engagement

# Aggregate at domain level.
# The mean of a boolean flag within a group is the share of rows where it is
# True, so one vectorised groupby yields every rate at once. This replaces a
# per-group apply() that built a Series by hand (slow, and deprecated in
# pandas 2.2+ when the lambda receives the grouping column).
session_flags = pd.DataFrame({
    'view_rate': view,
    'save_rate': save,
    'cart_rate': cart,
    'widget_rate': widget,
    'dropoff_rate': dropoff,
})
flags_by_domain = session_flags.groupby(df['Domain'])
domain_stats = flags_by_domain.mean() * 100
# Row count per domain, kept as the first column
domain_stats.insert(0, 'sessions', flags_by_domain.size())

# Classification rule: label a domain by its largest rate
def classify(row):
    """Return the behaviour label whose rate is highest in `row`.

    `row` must support lookup by the keys 'view_rate', 'save_rate',
    'cart_rate', 'widget_rate' and 'dropoff_rate'. On a tie the label
    listed first wins.
    """
    labelled_rates = [
        ('browsing', row['view_rate']),
        ('intent/saving', row['save_rate']),
        ('commerce/cart', row['cart_rate']),
        ('widget assist', row['widget_rate']),
        ('drop-off', row['dropoff_rate']),
    ]
    best_label, _best_rate = max(labelled_rates, key=lambda pair: pair[1])
    return best_label

# Label every domain with its dominant behaviour
domain_labels = domain_stats.apply(classify, axis=1)
domain_stats['classification'] = domain_labels

# Domains with the most rows first; show the first ten
domain_stats_sorted = domain_stats.sort_values('sessions', ascending=False)
top_domains = domain_stats_sorted.head(10)

print(top_domains)


### Widget Duration vs Engagement	
Relate widget duration to the number of recipes viewed or saved

In [None]:
# --- Widget duration vs recipe engagement ---

# 1. Per-row metrics, built as local Series (df is not modified)
widget_duration = df['WidgetDuration'].fillna(0)

# Non-empty ';'-separated entries in ViewedRecipes (missing -> 0)
view_count = df['ViewedRecipes'].fillna('').apply(
    lambda x: sum(1 for entry in str(x).split(';') if entry.strip())
)

# 1 when the recipe was saved, otherwise 0
save_count = df['RecipeSaved'].eq(True).astype(int)

# Views plus save
recipe_engagement = view_count + save_count

# 2. Restrict to rows with a positive widget duration
mask = widget_duration > 0
wd, eng = widget_duration[mask], recipe_engagement[mask]

# 3. Correlation between the two
correlation = wd.corr(eng)
print(f"Correlation between widget duration and recipe engagement: {correlation:.3f}")

# 4. Scatter plot of the relationship
fig, ax = plt.subplots(figsize=(7,5))
ax.scatter(wd, eng, alpha=0.6)
ax.set(
    title='Widget Duration vs Recipe Engagement',
    xlabel='Widget Duration',
    ylabel='Recipes Viewed/Saved',
)
ax.grid(alpha=0.3)
fig.tight_layout()
plt.show()


### Chat Duration vs Engagement
Relate chat duration to the number of viewed/saved recipes

In [None]:
# --- Chat duration vs recipe engagement ---

# 1. Chat duration in minutes from the ChatStart/ChatEnd clock times;
#    values that do not match HH:MM:SS become NaT
clock_format = '%H:%M:%S'
chat_start = pd.to_datetime(df['ChatStart'], format=clock_format, errors='coerce')
chat_end = pd.to_datetime(df['ChatEnd'], format=clock_format, errors='coerce')

chat_duration = (chat_end - chat_start).dt.total_seconds() / 60
# A negative duration means the chat crossed midnight: add one day
chat_duration = chat_duration.where(chat_duration >= 0, chat_duration + 24*60)
# Rows without a usable start/end pair count as zero chat time
chat_duration = chat_duration.fillna(0)

# 2. Recipe engagement per row: non-empty ViewedRecipes entries plus 1 if saved
view_count = df['ViewedRecipes'].fillna('').apply(
    lambda x: sum(1 for entry in str(x).split(';') if entry.strip())
)
save_count = df['RecipeSaved'].eq(True).astype(int)
recipe_engagement = view_count + save_count

# 3. Restrict to rows with a positive chat duration
mask = chat_duration > 0
cd, eng = chat_duration[mask], recipe_engagement[mask]

# 4. Correlation between chat duration and engagement
correlation = cd.corr(eng)
print(f"Correlation between chat duration and recipe engagement: {correlation:.3f}")

# 5. Scatter plot
fig, ax = plt.subplots(figsize=(7,5))
ax.scatter(cd, eng, alpha=0.6)
ax.set(
    title='Chat Duration vs Recipe Engagement',
    xlabel='Chat Duration (minutes)',
    ylabel='Recipes Viewed/Saved',
)
ax.grid(alpha=0.3)
fig.tight_layout()
plt.show()
