## Top Funnel Analytics
#### Focused on views, likes, comments
#### Need to get video category (topic) AND Guest(s) extracted first

In [1]:
import pandas as pd
from tqdm.notebook import tqdm


df = pd.read_csv('/Users/riadanas/Desktop/steven_bartlett_project/data/raw/DIARY_all_pod.csv')

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [2]:
#### CREATE "date_since_published" COLUMN ####
from datetime import datetime, timezone

# Ensure the column is parsed as datetime
df["video_published_at"] = pd.to_datetime(df["video_published_at"], utc=True, errors="coerce")

# Get today's date in UTC
today = datetime.now(timezone.utc)

# Compute the number of days since publication
df["days_since_published"] = (today - df["video_published_at"]).dt.days

# Optional sanity check
print(df[["video_id", "video_published_at", "days_since_published"]].head())


      video_id        video_published_at  days_since_published
0  B7tnfSPySb0 2025-01-16 08:00:16+00:00                   286
1  B7tnfSPySb0 2025-01-16 08:00:16+00:00                   286
2  B7tnfSPySb0 2025-01-16 08:00:16+00:00                   286
3  B7tnfSPySb0 2025-01-16 08:00:16+00:00                   286
4  B7tnfSPySb0 2025-01-16 08:00:16+00:00                   286


In [3]:
print(df.shape)
df.head(2)

(402054, 19)


Unnamed: 0,channel_name,channel_id,video_id,video_title,video_description,video_published_at,view_count,video_like_count,comment_count,comment_id,comment_text,author,author_id,comment_like_count,comment_published_at,is_pinned,is_reply,parent_comment_id,days_since_published
0,The Diary Of A CEO,UCGq-a57w-aPwyi3pW7XLiHw,B7tnfSPySb0,The Keto Psychiatrist: What Keto Is Really Doi...,Dr Georgia Ede is a Harvard trained psychiatri...,2025-01-16 08:00:16+00:00,2038491,61491,6274,Ugy5hSCgpLgiCntpLsh4AaABAg,Thanks! I I completely changed my diet six mon...,@ethann.12,UCvKPBMJVZ0HkF6QfOuqYmMA,4377,2025-01-21T23:36:59Z,False,False,,286
1,The Diary Of A CEO,UCGq-a57w-aPwyi3pW7XLiHw,B7tnfSPySb0,The Keto Psychiatrist: What Keto Is Really Doi...,Dr Georgia Ede is a Harvard trained psychiatri...,2025-01-16 08:00:16+00:00,2038491,61491,6274,UgwMJZJs2UnjR6cFBi14AaABAg,I have Bipolar and have recently decided to st...,@TheBipolarBoss,UCCY-1n70e24Sq2YaNPyP24A,3933,2025-01-16T08:20:19Z,False,False,,286


### Guest Name Processing - GPT-4

In [4]:
import os
import json
import re
import pandas as pd
from openai import OpenAI
from dotenv import load_dotenv

load_dotenv()

client = OpenAI(
    base_url="https://openrouter.ai/api/v1",
    api_key=os.getenv("OPENROUTER_API_KEY"),
)

def get_guest_names_openrouter(description: str) -> list:
    """
    Extract true podcast guest names from a YouTube description using OpenRouter (Claude 3.5 / GPT-4-mini).
    Ignores names used as references or examples.
    """
    if not isinstance(description, str) or not description.strip():
        return []

    prompt = f"""
    You are a podcast metadata assistant.

    Task:
    - Read the YouTube video description carefully.
    - Identify ONLY the actual guest(s) who appear in the episode or are directly interviewed.
    - If a guest's name was misspelled, correct it based on context.
    - Make sure to not miss guests that go by nicknames (e.g., "The Rock" or "MrBeast").
    - Ignore people mentioned just as examples, comparisons, or references (e.g., Warren Buffett, Elon Musk) unless they are clearly stated as guests.
    - If multiple guests appear, include all of them.
    - Preserve professional titles (e.g., "Dr", "Prof", "Sir") if present.
    - Return a clean JSON list of guest names, for example:
      ["Morgan Housel"]
      or ["Dr Andrew Huberman", "Lex Fridman"]
    - If no guest is clearly identified, return an empty list [].

    Description:
    \"\"\"{description}\"\"\"
    """

    try:
        completion = client.chat.completions.create(
            model="openai/gpt-4o-mini",  # you can change to "openai/gpt-4o-mini"
            messages=[{"role": "user", "content": prompt}],
            temperature=0.1,
            max_tokens=200,
        )

        content = completion.choices[0].message.content.strip()

        # Try parsing JSON
        try:
            result = json.loads(content)
        except json.JSONDecodeError:
            match = re.search(r'\[(.*?)\]', content)
            if match:
                inner = match.group(1)
                result = [n.strip().strip('"').strip() for n in inner.split(",") if n.strip()]
            else:
                result = re.findall(r"(?:Dr\.?|Prof\.?|Mr\.?|Ms\.?)?\s?[A-Z][a-z]+(?:\s[A-Z][a-z]+)+", content)

        if isinstance(result, str):
            result = [result]
        result = [r.strip() for r in result if r.strip()]
        result = list(set(result))

    except Exception as e:
        print(f"Error processing description: {e}")
        result = []

    return result


# ------------------------------------------------------
# 🔁 Apply once per unique video_id
# ------------------------------------------------------

def assign_guest_names(df: pd.DataFrame) -> pd.DataFrame:
    """
    Apply guest extraction once per unique video_id.
    Adds a 'guest_list' column to the DataFrame.
    """
    # Create mapping: video_id → guest list
    mapping = {}
    unique_videos = df.drop_duplicates(subset="video_id")[["video_id", "video_description"]]

    for _, row in unique_videos.iterrows():
        vid = row["video_id"]
        desc = row["video_description"]
        guests = get_guest_names_openrouter(desc)
        mapping[vid] = guests

    # Map results back to main DataFrame
    df["guest_list"] = df["video_id"].map(mapping)
    return df

In [5]:
df['video_description'].values[0]

"Dr Georgia Ede is a Harvard trained psychiatrist specialising in nutritional and metabolic psychiatry. She is the author of the book, ‘Change Your Diet, Change Your Mind: A powerful plan to improve mood, overcome anxiety and protect memory for a lifetime of optimal mental health’.   00:00 Intro 02:02 What Do You Do? 02:21 Is Metabolic Psychiatry a New Term? 03:38 Why Is the Ketogenic Diet at the Heart of Your Work? 04:40 What's Your Academic Experience? 05:38 What Does Practicing Psychiatry Invo"

In [6]:
df = assign_guest_names(df)

In [7]:
df.head(1)

Unnamed: 0,channel_name,channel_id,video_id,video_title,video_description,video_published_at,view_count,video_like_count,comment_count,comment_id,comment_text,author,author_id,comment_like_count,comment_published_at,is_pinned,is_reply,parent_comment_id,days_since_published,guest_list
0,The Diary Of A CEO,UCGq-a57w-aPwyi3pW7XLiHw,B7tnfSPySb0,The Keto Psychiatrist: What Keto Is Really Doi...,Dr Georgia Ede is a Harvard trained psychiatri...,2025-01-16 08:00:16+00:00,2038491,61491,6274,Ugy5hSCgpLgiCntpLsh4AaABAg,Thanks! I I completely changed my diet six mon...,@ethann.12,UCvKPBMJVZ0HkF6QfOuqYmMA,4377,2025-01-21T23:36:59Z,False,False,,286,[Dr Georgia Ede]


In [8]:
df['guest_list'].value_counts()

guest_list
[Daniel Priestley, Gary Stevenson]                      51707
[Geoffrey Hinton]                                       28392
[Logan Ury, Scott Galloway]                             25038
[Louise Perry, Deborah Frances-White, Erica Komisar]    24551
[Konstantin Kisin, Daniel Priestly, Scott Galloway]     22318
[Cathie Wood]                                           20718
[Bret Weinstein, Daniel Priestley, Amjad Masad]         16431
[Dr Anna Lembke]                                        13362
[Erica Komisar]                                         12778
[Dr Benjamin Bikman]                                    10798
[]                                                       9263
[Dr. William Li]                                         9262
[MrBeast]                                                9100
[Jefferson Fisher]                                       8022
[Dr Nathan Bryan]                                        6946
[Morgan Housel]                                          68

### Topic Category

In [9]:
import ollama
import json

def get_topic_category(title: str) -> str:
    """
    Use Ollama to classify the video title into a topic category.
    Example categories: health, mental health, productivity, finance, relationships, entrepreneurship, other.
    """
    prompt = f"""
    You are a helpful assistant. Categorize the following YouTube video title into ONE broad category:
    - health
    - mental health / psychology
    - productivity / personal development
    - finance
    - relationships
    - entrepreneurship / business
    - Religion / Spirituality
    - Technology
    - Education
    - Lifestyle
    - Entertainment
    - other

    Title: "{title}"

    Return only the category name, nothing else.
    """

    response = ollama.chat(
        model="llama3.2:3b",  # you can swap to another local model
        messages=[{"role": "user", "content": prompt}]
    )
    return response["message"]["content"].strip().lower()


In [10]:
# Enable tqdm for pandas
tqdm.pandas()

# Extract unique video_id/title pairs
video_meta = df[["video_id", "video_title"]].drop_duplicates()

# Apply Ollama category classification
video_meta["Topic_Category"] = video_meta["video_title"].apply(get_topic_category)

# Merge back into main dataframe
df = df.merge(video_meta[["video_id", "Topic_Category"]], on="video_id", how="left")

In [11]:
df['Topic_Category'].value_counts()

Topic_Category
mental health / psychology             90933
finance                                58540
health                                 52438
entertainment                          42088
relationships                          37695
entrepreneurship / business            33384
religion / spirituality                28392
productivity / personal development    23359
religion/spirituality                  18794
technology                             16431
Name: count, dtype: int64

## Top Funnel KPIs

In [12]:
print(df.shape)
df.head(2)

(402054, 21)


Unnamed: 0,channel_name,channel_id,video_id,video_title,video_description,video_published_at,view_count,video_like_count,comment_count,comment_id,comment_text,author,author_id,comment_like_count,comment_published_at,is_pinned,is_reply,parent_comment_id,days_since_published,guest_list,Topic_Category
0,The Diary Of A CEO,UCGq-a57w-aPwyi3pW7XLiHw,B7tnfSPySb0,The Keto Psychiatrist: What Keto Is Really Doi...,Dr Georgia Ede is a Harvard trained psychiatri...,2025-01-16 08:00:16+00:00,2038491,61491,6274,Ugy5hSCgpLgiCntpLsh4AaABAg,Thanks! I I completely changed my diet six mon...,@ethann.12,UCvKPBMJVZ0HkF6QfOuqYmMA,4377,2025-01-21T23:36:59Z,False,False,,286,[Dr Georgia Ede],mental health / psychology
1,The Diary Of A CEO,UCGq-a57w-aPwyi3pW7XLiHw,B7tnfSPySb0,The Keto Psychiatrist: What Keto Is Really Doi...,Dr Georgia Ede is a Harvard trained psychiatri...,2025-01-16 08:00:16+00:00,2038491,61491,6274,UgwMJZJs2UnjR6cFBi14AaABAg,I have Bipolar and have recently decided to st...,@TheBipolarBoss,UCCY-1n70e24Sq2YaNPyP24A,3933,2025-01-16T08:20:19Z,False,False,,286,[Dr Georgia Ede],mental health / psychology


In [13]:
df.columns

Index(['channel_name', 'channel_id', 'video_id', 'video_title',
       'video_description', 'video_published_at', 'view_count',
       'video_like_count', 'comment_count', 'comment_id', 'comment_text',
       'author', 'author_id', 'comment_like_count', 'comment_published_at',
       'is_pinned', 'is_reply', 'parent_comment_id', 'days_since_published',
       'guest_list', 'Topic_Category'],
      dtype='object')

In [14]:
print(df['guest_list'].value_counts())
print("\n")
print(df['Topic_Category'].value_counts())

guest_list
[Daniel Priestley, Gary Stevenson]                      51707
[Geoffrey Hinton]                                       28392
[Logan Ury, Scott Galloway]                             25038
[Louise Perry, Deborah Frances-White, Erica Komisar]    24551
[Konstantin Kisin, Daniel Priestly, Scott Galloway]     22318
[Cathie Wood]                                           20718
[Bret Weinstein, Daniel Priestley, Amjad Masad]         16431
[Dr Anna Lembke]                                        13362
[Erica Komisar]                                         12778
[Dr Benjamin Bikman]                                    10798
[]                                                       9263
[Dr. William Li]                                         9262
[MrBeast]                                                9100
[Jefferson Fisher]                                       8022
[Dr Nathan Bryan]                                        6946
[Morgan Housel]                                          68

In [15]:
unique_guests = sorted({g for lst in df["guest_list"] for g in lst})
print(unique_guests)

['Alex Hormozi', 'Amjad Masad', 'Bret Weinstein', 'Cathie Wood', 'Charlie Houpert', 'Chris Eubank Jr.', 'Craig Robinson', 'Daniel Priestley', 'Daniel Priestly', 'Deborah Frances-White', 'Dr Anna Lembke', 'Dr Benjamin Bikman', 'Dr Bill von Hippel', 'Dr Daniel Amen', 'Dr Georgia Ede', 'Dr Joe Dispenza', 'Dr Jordan Peterson', 'Dr Mohit Khera', 'Dr Nathan Bryan', 'Dr Peter Attia', 'Dr Sarah Berry', 'Dr Stacy Sims', 'Dr Vonda Wright', 'Dr. Courtney Conley', 'Dr. Lisa Feldman Barrett', 'Dr. Sara Szal', 'Dr. Sarah Wakeman', 'Dr. William Li', 'Emma Grede', 'Erica Komisar', 'Esther Perel', 'Evan Spiegel', 'Evy Poumpouras', 'Gary Stevenson', 'Geoffrey Hinton', 'Jeff Cavaliere', 'Jefferson Fisher', 'Jimmy Fallon', 'Joe Navarro', 'Konstantin Kisin', 'Logan Ury', 'Louise Perry', 'Master Shi Heng Yi', 'Michelle Obama', 'Mike Dowd', 'Morgan Housel', 'MrBeast', 'Paul Brunson', 'Robert Greene', 'Scott Galloway', 'Simon Sinek', 'Steven', 'Vanessa Van Edwards', 'Vinh Giang']


In [16]:
##### #### Quick Name Clean up to clear misspellings #####
##### Standardize guest names using fuzzy matching #####

from rapidfuzz import process, fuzz

def standardize_guest_names(guest_lists, threshold=90):
    """
    Standardize guest names using fuzzy matching.
    - Works across all names in guest_lists.
    - Groups near-identical names under the same canonical version.
    """
    # Flatten and collect unique names
    all_names = sorted({g for lst in guest_lists for g in lst if isinstance(g, str)})
    name_map = {}

    for name in all_names:
        # If we already have entries, try to match this name to an existing canonical one
        if name_map:
            result = process.extractOne(
                name,
                name_map.keys(),
                scorer=fuzz.token_sort_ratio
            )
            if result:
                match, score, _ = result
                if score >= threshold:
                    name_map[name] = match
                    continue  # found a close match, move on
        # Otherwise, or if no good match found → new canonical entry
        name_map[name] = name

    return name_map

In [17]:
mapping = standardize_guest_names(df["guest_list"])
df["guest_list"] = df["guest_list"].apply(lambda lst: [mapping[g] for g in lst])

In [18]:
print(df['guest_list'].value_counts())

guest_list
[Daniel Priestley, Gary Stevenson]                      51707
[Geoffrey Hinton]                                       28392
[Logan Ury, Scott Galloway]                             25038
[Louise Perry, Deborah Frances-White, Erica Komisar]    24551
[Konstantin Kisin, Daniel Priestley, Scott Galloway]    22318
[Cathie Wood]                                           20718
[Bret Weinstein, Daniel Priestley, Amjad Masad]         16431
[Dr Anna Lembke]                                        13362
[Erica Komisar]                                         12778
[Dr Benjamin Bikman]                                    10798
[]                                                       9263
[Dr. William Li]                                         9262
[MrBeast]                                                9100
[Jefferson Fisher]                                       8022
[Dr Nathan Bryan]                                        6946
[Morgan Housel]                                          68

In [19]:
video_df = (
    df.drop_duplicates(subset="video_id")
      .loc[:, ["days_since_published","video_id", "video_title", "guest_list", "Topic_Category",
               "view_count", "video_like_count", "comment_count"]]
)

In [20]:
video_df["n_guests"] = video_df["guest_list"].apply(lambda x: len(x) if isinstance(x, list) and len(x) > 0 else 1)
video_df["views_per_guest"] = (video_df["view_count"] / video_df["n_guests"]).round(2)
video_df["likes_per_guest"] = (video_df["video_like_count"] / video_df["n_guests"]).round(2)
video_df["comments_per_guest"] = (video_df["comment_count"] / video_df["n_guests"]).round(2)

# Now safely explode
guest_df = video_df.explode("guest_list")

# Each guest gets fair share
guest_df = guest_df.rename(columns={"guest_list": "guest"})

In [21]:
pd.set_option("display.max_colwidth", None)

print(guest_df.shape)
guest_df

(59, 12)


Unnamed: 0,days_since_published,video_id,video_title,guest,Topic_Category,view_count,video_like_count,comment_count,n_guests,views_per_guest,likes_per_guest,comments_per_guest
0,286,B7tnfSPySb0,The Keto Psychiatrist: What Keto Is Really Doing To Your Body! Can It Cure 43% Of Mental Illness?,Dr Georgia Ede,mental health / psychology,2038491,61491,6274,1,2038491.0,61491.0,6274.0
6409,132,ZHuZ_8VYCWA,DEBATE: Feminist Women Vs Non-Feminist Women,Louise Perry,relationships,1968583,46664,24218,3,656194.33,15554.67,8072.67
6409,132,ZHuZ_8VYCWA,DEBATE: Feminist Women Vs Non-Feminist Women,Deborah Frances-White,relationships,1968583,46664,24218,3,656194.33,15554.67,8072.67
6409,132,ZHuZ_8VYCWA,DEBATE: Feminist Women Vs Non-Feminist Women,Erica Komisar,relationships,1968583,46664,24218,3,656194.33,15554.67,8072.67
30960,247,7l_0ZcSuRwg,"Secret Agent: If You’re Easily Offended, You’re Easily Manipulated! This 1 Trick Catches A Lie In 2s",Steven,productivity / personal development,3806975,112769,5411,2,1903487.5,56384.5,2705.5
30960,247,7l_0ZcSuRwg,"Secret Agent: If You’re Easily Offended, You’re Easily Manipulated! This 1 Trick Catches A Lie In 2s",Evy Poumpouras,productivity / personal development,3806975,112769,5411,2,1903487.5,56384.5,2705.5
36537,184,uxu37dqVR90,The Savings Expert: The Truth About America Collapsing! The Cost Of Living Is About To Skyrocket!,Morgan Housel,finance,2820273,57474,6755,1,2820273.0,57474.0,6755.0
43370,254,9gk3mNJs2FY,The Sex Psychologist: We're Not Having Enough Sex! Fat Makes You Attractive! Dr Bill Von Hippel,Dr Bill von Hippel,mental health / psychology,1095572,21074,3206,1,1095572.0,21074.0,3206.0
46629,205,ZxXruY7llcc,Peter Attia: Anti-aging Cure No One Talks About! 50% Chance You’ll Die In A Year If This Happens!,Dr Peter Attia,health,1450645,31924,2108,1,1450645.0,31924.0,2108.0
48796,216,0GQozcTPyO0,"Hormone Expert: Control Your Hormones Control Your Belly Fat! Cortisol, oestrogen, testosterone.",Dr. Sara Szal,health,932985,28358,2168,1,932985.0,28358.0,2168.0


#### Topic Score

In [22]:
#### TOPIC SCORE
## Drop duplicate video ids to not get the same podcast counted multiple times

topic_df = (
    df.drop_duplicates(subset="video_id")
      .loc[:, ["video_id", "Topic_Category", "view_count", "video_like_count", "comment_count", "video_title"]]
)

In [23]:
print(topic_df.shape)
topic_df

(48, 6)


Unnamed: 0,video_id,Topic_Category,view_count,video_like_count,comment_count,video_title
0,B7tnfSPySb0,mental health / psychology,2038491,61491,6274,The Keto Psychiatrist: What Keto Is Really Doing To Your Body! Can It Cure 43% Of Mental Illness?
6409,ZHuZ_8VYCWA,relationships,1968583,46664,24218,DEBATE: Feminist Women Vs Non-Feminist Women
30960,7l_0ZcSuRwg,productivity / personal development,3806975,112769,5411,"Secret Agent: If You’re Easily Offended, You’re Easily Manipulated! This 1 Trick Catches A Lie In 2s"
36537,uxu37dqVR90,finance,2820273,57474,6755,The Savings Expert: The Truth About America Collapsing! The Cost Of Living Is About To Skyrocket!
43370,9gk3mNJs2FY,mental health / psychology,1095572,21074,3206,The Sex Psychologist: We're Not Having Enough Sex! Fat Makes You Attractive! Dr Bill Von Hippel
46629,ZxXruY7llcc,health,1450645,31924,2108,Peter Attia: Anti-aging Cure No One Talks About! 50% Chance You’ll Die In A Year If This Happens!
48796,0GQozcTPyO0,health,932985,28358,2168,"Hormone Expert: Control Your Hormones Control Your Belly Fat! Cortisol, oestrogen, testosterone."
51027,EdlXcVu1CTs,productivity / personal development,854152,14482,1738,The Business Expert: How To Build A Brand In 2025! They're Lying To You About Work-Life Balance!
52785,It5_C6AF1pk,health,1624462,48760,3137,Exercise & Nutrition Scientist: The Truth About Exercise On Your Period! Take These 4 Supplements!
55933,Hik6OY-nk4c,religion/spirituality,2046763,54184,6392,Jordan B Peterson: You Need To Listen To Your Wife! We've Built A Lonely & Sexless Society!


In [24]:
#### AVERAGE by Topic
## Average here works better than sum so topic categories that appear more don't bias the results!

topic_stats = (
    topic_df.groupby("Topic_Category", as_index=False)
             .agg({
                 "view_count": "mean",
                 "video_like_count": "mean",
                 "comment_count": "mean",
                 "video_id": "count"
             })
             .rename(columns={"video_id": "n_videos"})
)


##Topics: you used means per topic, to remove bias from topics that simply appear more often → ✅

In [25]:
#### Normalize Metrics
## Views, likes, and comments are in different scales — we want to make them comparable
## 👉 This rescales all values of that column between 0 and 1, using:

from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
for col in ["view_count", "video_like_count", "comment_count"]:
    topic_stats[f"{col}_norm"] = scaler.fit_transform(topic_stats[[col]])

In [26]:
#### apply weights

topic_stats["weighted_score"] = (
    0.5 * topic_stats["comment_count_norm"] +
    0.3 * topic_stats["video_like_count_norm"] +
    0.2 * topic_stats["view_count_norm"]
).round(2)

In [27]:
topic_stats["rank"] = topic_stats["weighted_score"].rank(ascending=False)
topic_stats['rank'] = topic_stats['rank'].astype(int)

topic_stats.sort_values("weighted_score", ascending=False)

Unnamed: 0,Topic_Category,view_count,video_like_count,comment_count,n_videos,view_count_norm,video_like_count_norm,comment_count_norm,weighted_score,rank
7,religion / spirituality,10774070.0,240870.0,27956.0,1,1.0,1.0,0.961516,0.98,1
2,finance,3835924.0,78074.5,28923.5,2,0.256391,0.222213,1.0,0.62,2
9,technology,3163223.0,63647.0,16094.0,1,0.184293,0.153283,0.489685,0.33,3
0,entertainment,2493375.0,63345.4,8267.0,5,0.112501,0.151842,0.178352,0.16,4
4,mental health / psychology,2593571.0,66718.916667,7429.416667,12,0.123239,0.167959,0.145036,0.15,5
8,religion/spirituality,2382793.0,65220.666667,7163.666667,3,0.100649,0.160801,0.134465,0.14,6
1,entrepreneurship / business,3041194.0,71531.0,5644.5,4,0.171214,0.19095,0.074038,0.13,7
3,health,2440389.0,76397.1,5125.3,10,0.106822,0.214199,0.053386,0.11,8
6,relationships,1443701.0,31564.0,9293.0,4,0.0,0.0,0.219163,0.11,8
5,productivity / personal development,2616397.0,73190.833333,3783.166667,6,0.125686,0.19888,0.0,0.08,10


- 1.0 = best performing topic in that metric
- 0.0 = weakest performing topic
- Values between 0–1 = proportionally scaled in between.

#### Guest Score

##### KPI values are divided equally among guests to avoid inflating multi-guest episodes.
##### This assumes each guest contributes roughly equally to the episode’s performance.

In [28]:
pd.set_option("display.max_colwidth", None)

print(guest_df.shape)
guest_df

(59, 12)


Unnamed: 0,days_since_published,video_id,video_title,guest,Topic_Category,view_count,video_like_count,comment_count,n_guests,views_per_guest,likes_per_guest,comments_per_guest
0,286,B7tnfSPySb0,The Keto Psychiatrist: What Keto Is Really Doing To Your Body! Can It Cure 43% Of Mental Illness?,Dr Georgia Ede,mental health / psychology,2038491,61491,6274,1,2038491.0,61491.0,6274.0
6409,132,ZHuZ_8VYCWA,DEBATE: Feminist Women Vs Non-Feminist Women,Louise Perry,relationships,1968583,46664,24218,3,656194.33,15554.67,8072.67
6409,132,ZHuZ_8VYCWA,DEBATE: Feminist Women Vs Non-Feminist Women,Deborah Frances-White,relationships,1968583,46664,24218,3,656194.33,15554.67,8072.67
6409,132,ZHuZ_8VYCWA,DEBATE: Feminist Women Vs Non-Feminist Women,Erica Komisar,relationships,1968583,46664,24218,3,656194.33,15554.67,8072.67
30960,247,7l_0ZcSuRwg,"Secret Agent: If You’re Easily Offended, You’re Easily Manipulated! This 1 Trick Catches A Lie In 2s",Steven,productivity / personal development,3806975,112769,5411,2,1903487.5,56384.5,2705.5
30960,247,7l_0ZcSuRwg,"Secret Agent: If You’re Easily Offended, You’re Easily Manipulated! This 1 Trick Catches A Lie In 2s",Evy Poumpouras,productivity / personal development,3806975,112769,5411,2,1903487.5,56384.5,2705.5
36537,184,uxu37dqVR90,The Savings Expert: The Truth About America Collapsing! The Cost Of Living Is About To Skyrocket!,Morgan Housel,finance,2820273,57474,6755,1,2820273.0,57474.0,6755.0
43370,254,9gk3mNJs2FY,The Sex Psychologist: We're Not Having Enough Sex! Fat Makes You Attractive! Dr Bill Von Hippel,Dr Bill von Hippel,mental health / psychology,1095572,21074,3206,1,1095572.0,21074.0,3206.0
46629,205,ZxXruY7llcc,Peter Attia: Anti-aging Cure No One Talks About! 50% Chance You’ll Die In A Year If This Happens!,Dr Peter Attia,health,1450645,31924,2108,1,1450645.0,31924.0,2108.0
48796,216,0GQozcTPyO0,"Hormone Expert: Control Your Hormones Control Your Belly Fat! Cortisol, oestrogen, testosterone.",Dr. Sara Szal,health,932985,28358,2168,1,932985.0,28358.0,2168.0


In [29]:
#### Aggregate metrics by guest
#### Use mean as your main ranking metric (fair and consistent). #####

guest_stats = (
    guest_df.groupby("guest", as_index=False)
             .agg({
                 "views_per_guest": "mean",
                 "likes_per_guest": "mean",
                 "comments_per_guest": "mean",
                 "video_id": "count"
             })
             .rename(columns={"video_id": "appearances"})
)


In [30]:
guest_stats

Unnamed: 0,guest,views_per_guest,likes_per_guest,comments_per_guest,appearances
0,Alex Hormozi,2577451.0,55216.0,2487.0,1
1,Amjad Masad,1054408.0,21215.67,5364.67,1
2,Bret Weinstein,1054408.0,21215.67,5364.67,1
3,Cathie Wood,5291988.0,131318.0,10125.0,1
4,Charlie Houpert,2589022.0,56655.0,3410.0,1
5,Chris Eubank Jr.,1382321.0,29589.0,4129.0,1
6,Craig Robinson,954040.0,34444.0,4375.5,1
7,Daniel Priestley,1552020.0,34768.613333,12745.0,3
8,Deborah Frances-White,656194.3,15554.67,8072.67,1
9,Dr Anna Lembke,5043118.0,163175.0,13061.0,1


In [31]:
#### Normalize each metric

from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
for col in ["views_per_guest", "likes_per_guest", "comments_per_guest"]:
    guest_stats[f"{col}_norm"] = scaler.fit_transform(guest_stats[[col]])

In [32]:
#### Apply weights

guest_stats["weighted_score"] = (
    0.5 * guest_stats["comments_per_guest_norm"] +
    0.3 * guest_stats["likes_per_guest_norm"] +
    0.2 * guest_stats["views_per_guest_norm"]
).round(2)


In [33]:
guest_stats["rank"] = guest_stats["weighted_score"].rank(ascending=False)
guest_stats['rank'] = guest_stats['rank'].astype(int)

guest_stats.sort_values("weighted_score", ascending=False).head(10)

Unnamed: 0,guest,views_per_guest,likes_per_guest,comments_per_guest,appearances,views_per_guest_norm,likes_per_guest_norm,comments_per_guest_norm,weighted_score,rank
33,Geoffrey Hinton,10774070.0,240870.0,27956.0,1,1.0,1.0,1.0,1.0,1
32,Gary Stevenson,2425788.0,49337.5,25546.0,1,0.174898,0.160457,0.911168,0.54,2
9,Dr Anna Lembke,5043118.0,163175.0,13061.0,1,0.433581,0.65944,0.450977,0.51,3
10,Dr Benjamin Bikman,5387358.0,162020.0,10597.0,1,0.467604,0.654377,0.360155,0.47,4
3,Cathie Wood,5291988.0,131318.0,10125.0,1,0.458178,0.519802,0.342757,0.42,5
35,Jefferson Fisher,4766274.0,149002.0,7744.0,1,0.406219,0.597316,0.254994,0.39,6
26,Dr. William Li,3658744.0,136560.0,8991.0,1,0.296757,0.542779,0.300958,0.37,7
45,MrBeast,3760693.0,101905.0,8938.0,1,0.306833,0.390876,0.299005,0.33,8
7,Daniel Priestley,1552020.0,34768.613333,12745.0,3,0.088539,0.096597,0.439329,0.27,9
17,Dr Nathan Bryan,2440853.0,104461.0,6803.0,1,0.176387,0.402079,0.22031,0.27,9


## Plot Results For Guests

In [34]:
# “Top Guests by Engagement Score”

import plotly.express as px

fig = px.bar(
    guest_stats.sort_values("weighted_score", ascending=True),
    x="weighted_score",
    y="guest",
    orientation="h",
    title="Top Guests by Weighted Engagement Score",
    text="weighted_score",
    color="weighted_score",
    color_continuous_scale="Blues"
)
fig.update_traces(texttemplate="%{text:.2f}", textposition="outside")
fig.update_layout(xaxis_title="Weighted Score", yaxis_title="Guest", height=600)
fig.show()


In [35]:
#### Scatter (Bubble) Chart — “Engagement Landscape”

fig = px.scatter(
    guest_stats,
    x="views_per_guest",
    y="comments_per_guest",
    size="likes_per_guest",
    color="weighted_score",
    hover_name="guest",
    title="Guest Engagement Landscape: Views vs Comments (Bubble = Likes)",
    color_continuous_scale="Viridis",
    size_max=40
)
fig.update_layout(xaxis_title="Views per Guest", yaxis_title="Comments per Guest", height=600)
fig.show()


In [36]:
import plotly.express as px

# Sort guests by weighted_score descending
guest_stats = guest_stats.sort_values("weighted_score", ascending=False)

# Melt for multi-metric comparison
melted = guest_stats.melt(
    id_vars=["guest"],
    value_vars=["views_per_guest", "likes_per_guest", "comments_per_guest"],
    var_name="Metric",
    value_name="Score"
)

fig = px.bar(
    melted,
    x="guest",
    y="Score",
    color="Metric",
    barmode="group",
    title="Guest KPI Comparison (Log Scale, Sorted by Weighted Engagement)",
    color_discrete_sequence=px.colors.sequential.YlGnBu[::-1]  # richer colors (reversed palette)
)

# Layout improvements
fig.update_layout(
    xaxis_title="Guest",
    yaxis_title="KPI (Log Scale)",
    yaxis_type="log",
    bargap=0.25,
    bargroupgap=0.05,
    height=600,
    legend_title_text="Metric",
    plot_bgcolor="white",
    paper_bgcolor="white",
    font=dict(color="black", size=12),
)

# Rotate names and tidy up bars
fig.update_xaxes(tickangle=45)
fig.update_traces(
    marker_line_color="black",
    marker_line_width=0.5,
    opacity=0.9,
    texttemplate="%{y:.0f}",
    textposition="outside"
)

fig.show()


In [37]:
import plotly.figure_factory as ff

# Sort the DataFrame by comments (desc), then likes (desc)
guest_sorted = guest_stats.sort_values(
    by=["comments_per_guest_norm", "likes_per_guest_norm"],
    ascending=[True, True]
)

# Extract data for the heatmap
z = guest_sorted[["views_per_guest_norm", "likes_per_guest_norm", "comments_per_guest_norm"]].values
x = ["Views", "Likes", "Comments"]
y = guest_sorted["guest"].tolist()

# Create annotated heatmap
fig = ff.create_annotated_heatmap(
    z=z,
    x=x,
    y=y,
    colorscale="YlGnBu",
    showscale=True
)

fig.update_layout(
    title="Engagement Intensity by Guest and Metric (Sorted by Comments → Likes)",
    height=600
)

fig.show()


## Plot Results Per Topic Category

In [38]:
import plotly.express as px

fig = px.bar(
    topic_stats.sort_values("weighted_score", ascending=True),
    x="weighted_score",
    y="Topic_Category",
    orientation="h",
    title="Top Topics by Weighted Engagement Score",
    text="weighted_score",
    color="weighted_score",
    color_continuous_scale="Blues"
)

fig.update_traces(
    texttemplate="%{text:.2f}",
    textposition="outside",
    marker_line_color="black",
    marker_line_width=0.5,
    opacity=0.9
)

fig.update_layout(
    xaxis_title="Weighted Score",
    yaxis_title="Topic Category",
    height=600,
    plot_bgcolor="white",
    paper_bgcolor="white",
    font=dict(color="black", size=12),
    coloraxis_showscale=False,
    margin=dict(l=120, r=40, t=60, b=40)
)

fig.show()


In [39]:
import plotly.express as px

fig = px.scatter(
    topic_stats,
    x="view_count",
    y="comment_count",
    size="video_like_count",
    color="weighted_score",
    hover_name="Topic_Category",
    title="Topic Engagement Landscape: Views vs Comments (Bubble = Likes)",
    color_continuous_scale="YlGnBu",
    size_max=50,
)

fig.update_layout(
    xaxis_title="Views per Topic",
    yaxis_title="Comments per Topic",
    height=600,
    plot_bgcolor="white",
    paper_bgcolor="white",
    font=dict(color="black", size=12),
)

# Improve bubble visibility and clarity
fig.update_traces(
    marker=dict(
        line=dict(width=1, color="black"),
        opacity=0.85
    )
)

fig.show()


In [40]:
import plotly.express as px

# Sort topics by weighted_score descending
topic_sorted = topic_stats.sort_values("weighted_score", ascending=False)

# Melt for multi-metric comparison
melted = topic_sorted.melt(
    id_vars=["Topic_Category", "weighted_score"],
    value_vars=["view_count", "video_like_count", "comment_count"],
    var_name="Metric",
    value_name="Score"
)

# Create grouped bar chart (log scale)
fig = px.bar(
    melted,
    x="Topic_Category",
    y="Score",
    color="Metric",
    barmode="group",
    title="Topic KPI Comparison (Log Scale, Sorted by Engagement)",
    color_discrete_sequence=px.colors.sequential.YlGnBu[::-1]
)

# Layout and design improvements
fig.update_layout(
    xaxis_title="Topic Category (Sorted by Weighted Score)",
    yaxis_title="KPI (Log Scale)",
    yaxis_type="log",
    bargap=0.25,
    bargroupgap=0.05,
    height=600,
    legend_title_text="Metric",
    plot_bgcolor="white",
    paper_bgcolor="white",
    font=dict(color="black", size=12),
    margin=dict(l=80, r=40, t=60, b=80)
)

# Rotate names and tidy up bars
fig.update_xaxes(tickangle=45)
fig.update_traces(
    marker_line_color="black",
    marker_line_width=0.5,
    opacity=0.9,
    texttemplate="%{y:.0f}",
    textposition="outside"
)

fig.show()

In [41]:
import plotly.figure_factory as ff

# Sort topics by normalized comment and like counts (descending)
topic_sorted = topic_stats.sort_values(
    by=["comment_count_norm", "video_like_count_norm"],
    ascending=[True, True]
)

# Extract normalized KPI data for the heatmap
z = topic_sorted[["view_count_norm", "video_like_count_norm", "comment_count_norm"]].values
x = ["Views", "Likes", "Comments"]
y = topic_sorted["Topic_Category"].tolist()

# Create annotated heatmap
fig = ff.create_annotated_heatmap(
    z=z,
    x=x,
    y=y,
    colorscale="YlGnBu",
    showscale=True,
    annotation_text=[[f"{val:.2f}" for val in row] for row in z],  # rounded annotations
    hoverinfo="z"
)

# Layout adjustments for better readability
fig.update_layout(
    title="Engagement Intensity by Topic and Metric (Sorted by Comments → Likes)",
    height=650,
    xaxis_title="Metric",
    yaxis_title="Topic Category",
    plot_bgcolor="white",
    paper_bgcolor="white",
    font=dict(color="black", size=12),
    margin=dict(l=150, r=50, t=70, b=40)
)

fig.show()
