In [7]:
import pandas as pd
from tqdm.notebook import tqdm


df = pd.read_csv('/Users/riadanas/Desktop/MLE Diary of a CEO/data/raw/_test_3.csv')

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [8]:
df = df.head(100)

In [9]:
print(df.shape)
df.head(2)

(100, 18)


Unnamed: 0,channel_name,channel_id,video_id,video_title,video_description,video_published_at,view_count,video_like_count,comment_count,comment_id,comment_text,author,author_id,comment_like_count,comment_published_at,is_pinned,is_reply,parent_comment_id
0,The Diary Of A CEO,UCGq-a57w-aPwyi3pW7XLiHw,jFlnRBO8mcg,The Savings Expert: Passive Income Is A Scam! ...,"Morgan Housel, global expert on personal finan...",2025-10-06T07:00:44Z,1023328,23390,2241,UgxZ7109QlZor9DvxaN4AaABAg,The idea that our culture prioritizes freedom ...,@michaelcupper,UCYu6S_dTgdQCop5mVZenYvw,2508,2025-10-08T09:51:51Z,False,False,
1,The Diary Of A CEO,UCGq-a57w-aPwyi3pW7XLiHw,jFlnRBO8mcg,The Savings Expert: Passive Income Is A Scam! ...,"Morgan Housel, global expert on personal finan...",2025-10-06T07:00:44Z,1023328,23390,2241,Ugz0jfygK2CpiaGjZB54AaABAg,I’ve read almost every book on making money bu...,@rodey-f7z3w,UCQIKp48s3SOe7dYMBuCsBgg,1191,2025-10-08T03:09:24Z,False,False,


## Guest Name Processing - GPT-4

In [10]:
import os
import json
import re
import pandas as pd
from openai import OpenAI
from dotenv import load_dotenv

load_dotenv()

client = OpenAI(
    base_url="https://openrouter.ai/api/v1",
    api_key=os.getenv("OPENROUTER_API_KEY"),
)

def get_guest_names_openrouter(description: str) -> list:
    """
    Extract true podcast guest names from a YouTube description using OpenRouter (Claude 3.5 / GPT-4-mini).
    Ignores names used as references or examples.
    """
    if not isinstance(description, str) or not description.strip():
        return []

    prompt = f"""
    You are a podcast metadata assistant.

    Task:
    - Read the YouTube video description carefully.
    - Identify ONLY the actual guest(s) who appear in the episode or are directly interviewed.
    - Ignore people mentioned just as examples, comparisons, or references (e.g., Warren Buffett, Elon Musk).
    - If multiple guests appear, include all of them.
    - Preserve professional titles (e.g., "Dr", "Prof", "Sir") if present.
    - Return a clean JSON list of guest names, for example:
      ["Morgan Housel"]
      or ["Dr Andrew Huberman", "Lex Fridman"]
    - If no guest is clearly identified, return an empty list [].

    Description:
    \"\"\"{description}\"\"\"
    """

    try:
        completion = client.chat.completions.create(
            model="anthropic/claude-3.5-sonnet",  # you can change to "openai/gpt-4o-mini"
            messages=[{"role": "user", "content": prompt}],
            temperature=0.1,
            max_tokens=200,
        )

        content = completion.choices[0].message.content.strip()

        # Try parsing JSON
        try:
            result = json.loads(content)
        except json.JSONDecodeError:
            match = re.search(r'\[(.*?)\]', content)
            if match:
                inner = match.group(1)
                result = [n.strip().strip('"').strip() for n in inner.split(",") if n.strip()]
            else:
                result = re.findall(r"(?:Dr\.?|Prof\.?|Mr\.?|Ms\.?)?\s?[A-Z][a-z]+(?:\s[A-Z][a-z]+)+", content)

        if isinstance(result, str):
            result = [result]
        result = [r.strip() for r in result if r.strip()]
        result = list(set(result))

    except Exception as e:
        print(f"Error processing description: {e}")
        result = []

    return result


# ------------------------------------------------------
# 🔁 Apply once per unique video_id
# ------------------------------------------------------

def assign_guest_names(df: pd.DataFrame) -> pd.DataFrame:
    """
    Apply guest extraction once per unique video_id.
    Adds a 'guest_list' column to the DataFrame.
    """
    # Create mapping: video_id → guest list
    mapping = {}
    unique_videos = df.drop_duplicates(subset="video_id")[["video_id", "video_description"]]

    for _, row in unique_videos.iterrows():
        vid = row["video_id"]
        desc = row["video_description"]
        guests = get_guest_names_openrouter(desc)
        mapping[vid] = guests

    # Map results back to main DataFrame
    df["guest_list"] = df["video_id"].map(mapping)
    return df

In [11]:
df = assign_guest_names(df)

In [12]:
df['guest_list'].value_counts()

guest_list
[Morgan Housel]    100
Name: count, dtype: int64

In [13]:
df.head(2)

Unnamed: 0,channel_name,channel_id,video_id,video_title,video_description,video_published_at,view_count,video_like_count,comment_count,comment_id,comment_text,author,author_id,comment_like_count,comment_published_at,is_pinned,is_reply,parent_comment_id,guest_list
0,The Diary Of A CEO,UCGq-a57w-aPwyi3pW7XLiHw,jFlnRBO8mcg,The Savings Expert: Passive Income Is A Scam! ...,"Morgan Housel, global expert on personal finan...",2025-10-06T07:00:44Z,1023328,23390,2241,UgxZ7109QlZor9DvxaN4AaABAg,The idea that our culture prioritizes freedom ...,@michaelcupper,UCYu6S_dTgdQCop5mVZenYvw,2508,2025-10-08T09:51:51Z,False,False,,[Morgan Housel]
1,The Diary Of A CEO,UCGq-a57w-aPwyi3pW7XLiHw,jFlnRBO8mcg,The Savings Expert: Passive Income Is A Scam! ...,"Morgan Housel, global expert on personal finan...",2025-10-06T07:00:44Z,1023328,23390,2241,Ugz0jfygK2CpiaGjZB54AaABAg,I’ve read almost every book on making money bu...,@rodey-f7z3w,UCQIKp48s3SOe7dYMBuCsBgg,1191,2025-10-08T03:09:24Z,False,False,,[Morgan Housel]


In [14]:
df['video_description'].values[0]

'Morgan Housel, global expert on personal finance, shares powerful lessons on Warren Buffett’s hidden struggles, Elon Musk’s sacrifices, money trauma and financial habits, how to invest wisely, and the psychology behind saving, spending, and success.   Morgan Housel is a partner at Collaborative Fund, former columnist for The Wall Street Journal, and a speaker on investing, saving, spending, and financial independence. He is also the bestselling author of books, such as: ‘The Psychology of Money’'

In [15]:
# Drop the heavy text column to speed up further LLM steps
# df = df.drop(columns=["video_description"])

## Text cleaning

In [16]:
import re
import spacy
import pandas as pd
from transformers import pipeline
from sentence_transformers import SentenceTransformer
import numpy as np
from sklearn.cluster import KMeans

# ------------------------------------------------------
# 1. Text Cleaning (light only)
# ------------------------------------------------------
def clean_comment(text: str) -> str:
    """
    Light cleaning for comments:
    - Remove @mentions
    - Remove URLs
    - Remove emojis / non-ascii
    - Lowercase
    - Strip whitespace
    - Keep context words (no lemmatization, no stopword removal yet)
    """
    if not isinstance(text, str):
        return ""

    # remove mentions
    text = re.sub(r"@\w+", "", text)

    # remove URLs
    text = re.sub(r"http\S+|www\S+", "", text)

    # remove emojis/non-ascii
    text = text.encode("ascii", "ignore").decode()

    # lowercase + strip
    text = text.lower().strip()

    return text


In [17]:
# Apply light cleaning to the comment_text column
df["cleaned_text"] = df["comment_text"].apply(clean_comment)

# Preview the results
df[["comment_text", "cleaned_text"]].head(10)

Unnamed: 0,comment_text,cleaned_text
0,The idea that our culture prioritizes freedom ...,the idea that our culture prioritizes freedom ...
1,I’ve read almost every book on making money bu...,ive read almost every book on making money but...
2,He who knows that enough is enough will always...,he who knows that enough is enough will always...
3,I’m a doctor and ‘Secrets To Perfect Health’ b...,im a doctor and secrets to perfect health by n...
4,The world is a scam 😂,the world is a scam
5,What nobody tells students is that making mone...,what nobody tells students is that making mone...
6,Most men think success is all about grinding h...,most men think success is all about grinding h...
7,People once used to compare themselves to peop...,people once used to compare themselves to peop...
8,I am retiring in 2 years at the age of 62 I ha...,i am retiring in 2 years at the age of 62 i ha...
9,"To me, I DO NOT need to go AWAY to enjoy mysel...","to me, i do not need to go away to enjoy mysel..."


## Topic category (llama) -> Once per video

In [18]:
import ollama
import json

def get_topic_category(title: str) -> str:
    """
    Use Ollama to classify the video title into a topic category.
    Example categories: health, mental health, productivity, finance, relationships, entrepreneurship, other.
    """
    prompt = f"""
    You are a helpful assistant. Categorize the following YouTube video title into ONE broad category:
    - health
    - mental health
    - productivity
    - finance
    - relationships
    - entrepreneurship
    - Religion / Spirituality
    - Technology
    - Education
    - Lifestyle
    - Entertainment
    - other

    Title: "{title}"

    Return only the category name, nothing else.
    """

    response = ollama.chat(
        model="llama3.2:3b",  # you can swap to another local model
        messages=[{"role": "user", "content": prompt}]
    )
    return response["message"]["content"].strip().lower()


In [19]:
# Enable tqdm for pandas
tqdm.pandas()

# Extract unique video_id/title pairs
video_meta = df[["video_id", "video_title"]].drop_duplicates()

# Apply Ollama category classification
video_meta["Topic_Category"] = video_meta["video_title"].apply(get_topic_category)

# Merge back into main dataframe
df = df.merge(video_meta[["video_id", "Topic_Category"]], on="video_id", how="left")

In [20]:
df['Topic_Category'].value_counts()

Topic_Category
finance    100
Name: count, dtype: int64

## Sentiment analysis (Gemma2)

In [21]:
import re
import json
import numpy as np
import pandas as pd
import ollama
from tqdm import tqdm
import spacy
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import torch.nn.functional as F

tqdm.pandas()

# -----------------------------
# 0) Helpers & resources
# -----------------------------
NEG_PHRASES = [
    r"\bnot good\b", r"\bnot great\b", r"\bnot helpful\b", r"\bdon't like\b", r"\bdont like\b",
    r"\bnot worth\b", r"\bwaste of time\b", r"\btoo long\b", r"\btoo slow\b", r"\btoo loud\b",
    r"\bmisleading\b", r"\bclickbait\b", r"\bbiased\b", r"\bconfusing\b", r"\bannoying\b",
    r"\bcringe\b", r"\bstupid\b", r"\bdumb\b", r"\bbad\b", r"\bawful\b", r"\bterrible\b",
    r"\buseless\b", r"\bpointless\b", r"\bwrong\b", r"\bpoor (audio|sound|quality)\b",
    r"\birrelevant\b", r"\boff\-topic\b", r"\bproblem\b", r"\bissue\b", r"\bdisappoint(ing|ed)\b",
    r"\brude\b", r"\boffensive\b", r"\bunfunny\b", r"\bboring\b", r"\blazy\b", r"\btoxic\b",
    r"\bhate\b", r"\bgarbage\b", r"\bignorant\b", r"\bweird\b", r"\bnegative\b", r"\bbroken\b",
    r"\bdownvote\b", r"\bterribly\b", r"\bdislike\b", r"\bpathetic\b", r"\bworse\b"
]
NEG_RE = re.compile("|".join(NEG_PHRASES))

SHORT_PRAISE_RE = re.compile(r"^(nice|cool|great|good|amazing|awesome|wow|love|thanks|perfect)[.!]?$", re.I)

# Load spaCy (light model for entity recognition)
nlp = spacy.load("en_core_web_sm", disable=["parser", "tagger", "lemmatizer"])

def clean_comment(text: str) -> str:
    if not isinstance(text, str):
        return ""
    text = re.sub(r"@\w+", "", text)
    text = re.sub(r"http\S+|www\S+", "", text)
    text = text.encode("ascii", "ignore").decode()
    return text.strip()


def bucket_from_p(p: float) -> str:
    if p < 0.35:
        return "Negative"
    if p > 0.65:
        return "Positive"
    return "Neutral"

# -----------------------------
# 1) HF Sentiment (RoBERTa)
# -----------------------------
_HF_MODEL = "cardiffnlp/twitter-roberta-base-sentiment-latest"
_tok = AutoTokenizer.from_pretrained(_HF_MODEL)
_model = AutoModelForSequenceClassification.from_pretrained(_HF_MODEL)
_model.eval()

def roberta_probs(text: str) -> dict:
    if not text:
        return {"neg": 0.0, "neu": 1.0, "pos": 0.0}
    with torch.no_grad():
        inputs = _tok(text[:512], return_tensors="pt")
        logits = _model(**inputs).logits
        probs = F.softmax(logits, dim=1).cpu().numpy()[0]
    return {"neg": float(probs[0]), "neu": float(probs[1]), "pos": float(probs[2])}


# -----------------------------
# 2) LLM Sentiment (Gemma 2B)
# -----------------------------
def gemma_score_01(text: str) -> float:
    if not text:
        return 0.5
    prompt = f"""
    You are a neutral linguistic expert analyzing sentiment.
    Evaluate only the tone of the YouTube comment.
    Consider sarcasm and negation carefully.
    Return JSON only: {{"score": <float between -1.0 and 1.0>}}
    Comment: "{text}"
    """
    try:
        resp = ollama.chat(model="gemma:2b", messages=[{"role":"user","content":prompt}])
        j = json.loads(resp["message"]["content"])
        s = float(j.get("score", 0.0))
    except Exception:
        s = 0.0

    p = (s + 1.0) / 2.0
    if NEG_RE.search(text) and p > 0.5:
        p -= 0.25
    if SHORT_PRAISE_RE.match(text) and p > 0.7:
        p = 0.6
    return float(np.clip(p, 0.0, 1.0))


# -----------------------------
# 3) Ensemble Sentiment
# -----------------------------
def ensemble_sentiment(text: str) -> dict:
    text_clean = clean_comment(text.lower())
    if not text_clean:
        return {"p_pos": 0.5, "bucket": "Neutral", "p_pos_llm": 0.5, "p_pos_hf": 0.33, "p_neg_hf": 0.33}

    p_pos_llm = gemma_score_01(text_clean)
    hf = roberta_probs(text_clean)
    p_pos_hf, p_neg_hf = hf["pos"], hf["neg"]

    w_llm, w_hf = 0.4, 0.6
    p_pos = w_llm * p_pos_llm + w_hf * p_pos_hf

    if p_neg_hf - p_pos_hf > 0.20 and p_pos > 0.3:
        p_pos -= 0.20

    p_pos = float(np.clip(p_pos, 0.0, 1.0))
    bucket = bucket_from_p(p_pos)
    return {
        "p_pos": p_pos,
        "bucket": bucket,
        "p_pos_llm": p_pos_llm,
        "p_pos_hf": p_pos_hf,
        "p_neg_hf": p_neg_hf,
    }


# -----------------------------
# 4) Likes Weighting
# -----------------------------
def like_weight(likes: float) -> float:
    if likes is None or likes <= 0:
        return 1.0
    if likes < 10:
        return 1.2
    if likes < 100:
        return 2.0
    if likes < 500:
        return 3.0
    return 4.0


# -----------------------------
# 5) Improved Guest Extraction
# -----------------------------

BANNED_GUEST_WORDS = {
    "jesus", "praise jesus", "ohhhh", "video", "motivation", "johari", "topic", "content"
}

GUEST_HINTS = re.compile(r"(with|feat\.|featuring|guest|bring back|have .* on|invite|episode with)", re.I)
TITLE_CUE_RE = re.compile(r"feat\.|featuring|with|guest|w/", re.I)

def extract_guests(text: str, video_title: str = "") -> list:
    """
    Extract guest names from comment or title with 3-layer logic:
    1. spaCy PERSON entities (fast)
    2. Gemma (if hints present)
    3. Fallback to video title if contains 'feat.' etc.
    """
    text = clean_comment(text)
    if not text:
        return []

    doc = nlp(text)
    names = [ent.text.strip() for ent in doc.ents if ent.label_ == "PERSON"]

    # If found via spaCy, trust them (light clean)
    if names:
        names = [n for n in names if len(n) > 2 and n.lower() not in BANNED_GUEST_WORDS]
        return list(set(names))

    # Only call LLM if hints present
    if not GUEST_HINTS.search(text):
        return []

    prompt = f"""
    Extract only the names of people explicitly mentioned or suggested as podcast guests.
    Ignore general words, religious figures, or vague text.
    Return only JSON array of proper names (no duplicates, no empty values).
    Example:
    ["Jordan Peterson", "Dr K", "Alex O'Connor"]
    Comment: "{text}"
    """

    try:
        resp = ollama.chat(model="gemma:2b", messages=[{"role": "user", "content": prompt}])
        names = json.loads(resp["message"]["content"])
    except Exception:
        names = []

    # Post-filter
    if isinstance(names, list):
        names = [
            n.strip() for n in names
            if n and len(n) > 2 and n[0].isupper() and n.lower() not in BANNED_GUEST_WORDS
        ]
    else:
        names = []

    # Fallback from video title
    if not names and TITLE_CUE_RE.search(video_title):
        doc_t = nlp(video_title)
        title_names = [ent.text.strip() for ent in doc_t.ents if ent.label_ == "PERSON"]
        names.extend(title_names)

    return list(set(names))



# -----------------------------
# 6) Topic Request Extraction
# -----------------------------
TOPIC_HINTS = re.compile(r"(talk about|episode on|discuss|cover|would love|should do|next guest|topic|content about)", re.I)

def extract_topics(text: str) -> list:
    text = clean_comment(text)
    if not text or not TOPIC_HINTS.search(text):
        return []

    prompt = f"""
    Extract specific topic requests or subjects mentioned in this YouTube comment.
    Return only JSON list of short topics.
    Example: ["AI", "mental health", "fitness"]
    Comment: "{text}"
    """
    try:
        resp = ollama.chat(model="gemma:2b", messages=[{"role":"user","content":prompt}])
        topics = json.loads(resp["message"]["content"])
        if isinstance(topics, list):
            return topics
    except Exception:
        pass
    return []


# -----------------------------
# 7) Apply All to DataFrame
# -----------------------------
def apply_sentiment(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    df["cleaned_text"] = df["cleaned_text"].fillna("").astype(str)

    # Sentiment
    res = df["cleaned_text"].progress_apply(ensemble_sentiment)
    df["sentiment_p_pos"] = res.apply(lambda r: r["p_pos"])
    df["sentiment_bucket"] = res.apply(lambda r: r["bucket"])
    df["p_pos_llm"] = res.apply(lambda r: r["p_pos_llm"])
    df["p_pos_hf"] = res.apply(lambda r: r["p_pos_hf"])
    df["p_neg_hf"] = res.apply(lambda r: r["p_neg_hf"])

    # Weights
    df["comment_like_count"] = df["comment_like_count"].fillna(0).astype(int)
    df["comment_weight"] = df["comment_like_count"].apply(like_weight)
    df["impact_weighted_sentiment"] = df["sentiment_p_pos"] * df["comment_weight"]

    # New columns — guests & topics
    df["guest_mentions"] = df.progress_apply(
        lambda r: extract_guests(r["cleaned_text"], r.get("video_title", "")), axis=1
    )

    df["topic_requests"] = df["cleaned_text"].progress_apply(extract_topics)

    return df


Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [22]:
# -----------------------------
# 6) Run
# -----------------------------
df_sent = apply_sentiment(df)

100%|██████████| 100/100 [00:51<00:00,  1.95it/s]
100%|██████████| 100/100 [00:22<00:00,  4.48it/s]
100%|██████████| 100/100 [00:03<00:00, 32.94it/s]


In [23]:
df_sent[[
    "comment_text",
    "cleaned_text",
    "sentiment_p_pos",
    "sentiment_bucket",
    "p_pos_llm",
    "p_pos_hf",
    "p_neg_hf",
    "comment_like_count",
    "comment_weight",
    "impact_weighted_sentiment",
    "guest_list",
    "topic_requests"
]].head(5)

Unnamed: 0,comment_text,cleaned_text,sentiment_p_pos,sentiment_bucket,p_pos_llm,p_pos_hf,p_neg_hf,comment_like_count,comment_weight,impact_weighted_sentiment,guest_list,topic_requests
0,The idea that our culture prioritizes freedom ...,the idea that our culture prioritizes freedom ...,0.152635,Negative,0.835,0.031058,0.625315,2508,4.0,0.610539,[Morgan Housel],[]
1,I’ve read almost every book on making money bu...,ive read almost every book on making money but...,0.808495,Positive,0.935,0.724158,0.047799,1191,4.0,3.233979,[Morgan Housel],[]
2,He who knows that enough is enough will always...,he who knows that enough is enough will always...,0.630817,Neutral,0.8125,0.509695,0.02462,1008,4.0,2.523268,[Morgan Housel],[]
3,I’m a doctor and ‘Secrets To Perfect Health’ b...,im a doctor and secrets to perfect health by n...,0.902741,Positive,0.84,0.944569,0.008702,801,4.0,3.610966,[Morgan Housel],[]
4,The world is a scam 😂,the world is a scam,0.208264,Negative,0.5,0.013774,0.877446,603,4.0,0.833058,[Morgan Housel],[]


In [24]:
df_sent['sentiment_bucket'].value_counts()

sentiment_bucket
Negative    42
Positive    37
Neutral     21
Name: count, dtype: int64

In [25]:
df_sent['guest_list'].value_counts()

guest_list
[Morgan Housel]    100
Name: count, dtype: int64

In [26]:
df_sent['topic_requests'].value_counts()

topic_requests
[]    100
Name: count, dtype: int64

In [27]:
# df_sent.to_csv('/Users/riadanas/Desktop/MLE Diary of a CEO/data/processed/processed_snapshot4.csv', index=False)