In [1]:
import pandas as pd
from tqdm.notebook import tqdm


df = pd.read_csv('/Users/riadanas/Desktop/steven_bartlett_project/data/raw/DIARY_all_pod.csv')

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [2]:
print(df.shape)

# Keep a small sample for testing
#df = df.head(100)

(269430, 18)


In [3]:
import random

video_ids = df['video_id'].unique()[5:10]
df = df[df['video_id'].isin(video_ids)].sample(n=100)

In [4]:
print(df.shape)
print(df['video_id'].value_counts())
print(df['video_title'].value_counts())
df.head(2)

(100, 18)
video_id
Hik6OY-nk4c    28
ldizQkuWpDE    27
atejm2w2jWY    26
It5_C6AF1pk    14
0GQozcTPyO0     5
Name: count, dtype: int64
video_title
Jordan B Peterson: You Need To Listen To Your Wife! We've Built A Lonely & Sexless Society!           28
Body Language Expert Explains Why People Dislike You                                                  27
Shaolin Warrior Master: Hidden Epidemic Nobody Talks About! This Modern Habit Is Killing Millions!    26
Exercise & Nutrition Scientist: The Truth About Exercise On Your Period! Take These 4 Supplements!    14
Hormone Expert: Control Your Hormones Control Your Belly Fat! Cortisol, oestrogen, testosterone.       5
Name: count, dtype: int64


Unnamed: 0,channel_name,channel_id,video_id,video_title,video_description,video_published_at,view_count,video_like_count,comment_count,comment_id,comment_text,author,author_id,comment_like_count,comment_published_at,is_pinned,is_reply,parent_comment_id
33591,The Diary Of A CEO,UCGq-a57w-aPwyi3pW7XLiHw,Hik6OY-nk4c,Jordan B Peterson: You Need To Listen To Your ...,Dr Jordan Peterson is a world-renowned former ...,2025-01-13T08:00:19Z,2046763,54184,6392,UgzCgbdzte3zTTQex6Z4AaABAg,This was very eye opening. I was sort of like ...,@JarreVonDuck,UCQMQlxaRQf04JUkceJ6obpw,0,2025-01-23T21:52:03Z,False,False,
25333,The Diary Of A CEO,UCGq-a57w-aPwyi3pW7XLiHw,0GQozcTPyO0,Hormone Expert: Control Your Hormones Control ...,"Is your belly fat, stress, or burnout actually...",2025-03-27T06:00:00Z,932985,28358,2168,Ugzn7DFdwNefG9A-_bh4AaABAg.AGAV74jOz9WAGAdQyQHulW,You can’t even spell hormone. Shush.,@Beauwagner,UCwppeGE5ffvJR3LWs2X1efQ,0,2025-03-27T13:55:06Z,False,True,Ugzn7DFdwNefG9A-_bh4AaABAg


## Guest Name Processing - GPT-4

In [5]:
import os
import json
import re
import pandas as pd
from openai import OpenAI
from dotenv import load_dotenv

load_dotenv()

client = OpenAI(
    base_url="https://openrouter.ai/api/v1",
    api_key=os.getenv("OPENROUTER_API_KEY"),
)

def get_guest_names_openrouter(description: str) -> list:
    """
    Extract true podcast guest names from a YouTube description using OpenRouter (Claude 3.5 / GPT-4-mini).
    Ignores names used as references or examples.
    """
    if not isinstance(description, str) or not description.strip():
        return []

    prompt = f"""
    You are a podcast metadata assistant.

    Task:
    - Read the YouTube video description carefully.
    - Identify ONLY the actual guest(s) who appear in the episode or are directly interviewed.
    - If a guest's name was misspelled, correct it based on context.
    - Make sure to not miss guests that go by nicknames (e.g., "The Rock" or "MrBeast").
    - Ignore people mentioned just as examples, comparisons, or references (e.g., Warren Buffett, Elon Musk) unless they are clearly stated as guests.
    - If multiple guests appear, include all of them.
    - Preserve professional titles (e.g., "Dr", "Prof", "Sir") if present.
    - Return a clean JSON list of guest names, for example:
      ["Morgan Housel"]
      or ["Dr Andrew Huberman", "Lex Fridman"]
    - If no guest is clearly identified, return an empty list [].

    Description:
    \"\"\"{description}\"\"\"
    """

    try:
        completion = client.chat.completions.create(
            model="openai/gpt-4o-mini",  # you can change to "openai/gpt-4o-mini"
            messages=[{"role": "user", "content": prompt}],
            temperature=0.1,
            max_tokens=200,
        )

        content = completion.choices[0].message.content.strip()

        # Try parsing JSON
        try:
            result = json.loads(content)
        except json.JSONDecodeError:
            match = re.search(r'\[(.*?)\]', content)
            if match:
                inner = match.group(1)
                result = [n.strip().strip('"').strip() for n in inner.split(",") if n.strip()]
            else:
                result = re.findall(r"(?:Dr\.?|Prof\.?|Mr\.?|Ms\.?)?\s?[A-Z][a-z]+(?:\s[A-Z][a-z]+)+", content)

        if isinstance(result, str):
            result = [result]
        result = [r.strip() for r in result if r.strip()]
        result = list(set(result))

    except Exception as e:
        print(f"Error processing description: {e}")
        result = []

    return result


# ------------------------------------------------------
# 🔁 Apply once per unique video_id
# ------------------------------------------------------

def assign_guest_names(df: pd.DataFrame) -> pd.DataFrame:
    """
    Apply guest extraction once per unique video_id.
    Adds a 'guest_list' column to the DataFrame.
    """
    # Create mapping: video_id → guest list
    mapping = {}
    unique_videos = df.drop_duplicates(subset="video_id")[["video_id", "video_description"]]

    for _, row in unique_videos.iterrows():
        vid = row["video_id"]
        desc = row["video_description"]
        guests = get_guest_names_openrouter(desc)
        mapping[vid] = guests

    # Map results back to main DataFrame
    df["guest_list"] = df["video_id"].map(mapping)
    return df

In [6]:
df = assign_guest_names(df)

In [7]:
df['guest_list'].value_counts()

guest_list
[Dr Jordan Peterson]             28
[Vanessa Van Edwards, Steven]    27
[Master Shi Heng Yi]             26
[Dr Stacy Sims]                  14
[Dr. Sara Szal]                   5
Name: count, dtype: int64

In [8]:
df.head(2)

Unnamed: 0,channel_name,channel_id,video_id,video_title,video_description,video_published_at,view_count,video_like_count,comment_count,comment_id,comment_text,author,author_id,comment_like_count,comment_published_at,is_pinned,is_reply,parent_comment_id,guest_list
33591,The Diary Of A CEO,UCGq-a57w-aPwyi3pW7XLiHw,Hik6OY-nk4c,Jordan B Peterson: You Need To Listen To Your ...,Dr Jordan Peterson is a world-renowned former ...,2025-01-13T08:00:19Z,2046763,54184,6392,UgzCgbdzte3zTTQex6Z4AaABAg,This was very eye opening. I was sort of like ...,@JarreVonDuck,UCQMQlxaRQf04JUkceJ6obpw,0,2025-01-23T21:52:03Z,False,False,,[Dr Jordan Peterson]
25333,The Diary Of A CEO,UCGq-a57w-aPwyi3pW7XLiHw,0GQozcTPyO0,Hormone Expert: Control Your Hormones Control ...,"Is your belly fat, stress, or burnout actually...",2025-03-27T06:00:00Z,932985,28358,2168,Ugzn7DFdwNefG9A-_bh4AaABAg.AGAV74jOz9WAGAdQyQHulW,You can’t even spell hormone. Shush.,@Beauwagner,UCwppeGE5ffvJR3LWs2X1efQ,0,2025-03-27T13:55:06Z,False,True,Ugzn7DFdwNefG9A-_bh4AaABAg,[Dr. Sara Szal]


In [9]:
df['video_description'].values[0]

'Dr Jordan Peterson is a world-renowned former Professor of Psychology at the University of Toronto, and co-founder of the online education platform Peterson Academy. He is the author of bestselling books such as, ‘12 Rules for Life: An Antidote to Chaos’ and ‘We Who Wrestle With God: Perceptions of the Divine’.   00:00 Intro 02:30 The World Has Become Fractionated 05:23 Where Do We Find Ourselves Without Community? 08:41 How Do We Address Individualism in a Self-Centered Society? 15:21 Do Many P'

In [10]:
# Drop the heavy text column to speed up further LLM steps
# df = df.drop(columns=["video_description"])

## Text cleaning

In [11]:
import re
import spacy
import pandas as pd
from transformers import pipeline
from sentence_transformers import SentenceTransformer
import numpy as np
from sklearn.cluster import KMeans

# ------------------------------------------------------
# 1. Text Cleaning (light only)
# ------------------------------------------------------
def clean_comment(text: str) -> str:
    """
    Light cleaning for comments:
    - Remove @mentions
    - Remove URLs
    - Remove emojis / non-ascii
    - Lowercase
    - Strip whitespace
    - Keep context words (no lemmatization, no stopword removal yet)
    """
    if not isinstance(text, str):
        return ""

    # remove mentions
    text = re.sub(r"@\w+", "", text)

    # remove URLs
    text = re.sub(r"http\S+|www\S+", "", text)

    # remove emojis/non-ascii
    text = text.encode("ascii", "ignore").decode()

    # lowercase + strip
    text = text.lower().strip()

    return text


In [12]:
# Apply light cleaning to the comment_text column
df["cleaned_text"] = df["comment_text"].apply(clean_comment)

# Preview the results
pd.set_option('display.max_colwidth', 100)
df[["comment_text", "cleaned_text"]].head(10)

Unnamed: 0,comment_text,cleaned_text
33591,This was very eye opening. I was sort of like this idiot that Jordan was trying to help.,this was very eye opening. i was sort of like this idiot that jordan was trying to help.
25333,You can’t even spell hormone. Shush.,you cant even spell hormone. shush.
26967,I’m 40 and I agree with this list it is working for meeeee I feel great!,im 40 and i agree with this list it is working for meeeee i feel great!
41914,Pure gold! Seeing a lot of confusion in the comments. You have to realise that all the advice is...,pure gold! seeing a lot of confusion in the comments. you have to realise that all the advice is...
29542,i tried intermittent fasting and my period stopped coming.. i’m still trying to fix it… nobody w...,i tried intermittent fasting and my period stopped coming.. im still trying to fix it nobody war...
29694,​@@SevillaILoveit's not about disagreement though. Its literally saying or doing nothing when yo...,@'s not about disagreement though. its literally saying or doing nothing when you see others do ...
40205,Side note: What is it with spiritual people and tapping on tables? Is there an effect created by...,side note: what is it with spiritual people and tapping on tables? is there an effect created by...
39119,"Steven, your questions are amazingly on point. Wise, smart, thoughtful, well timed. Congratulat...","steven, your questions are amazingly on point. wise, smart, thoughtful, well timed. congratulat..."
46201,I prefer to be wary of first impressions and downplay their importance. How realistic are they? ...,i prefer to be wary of first impressions and downplay their importance. how realistic are they? ...
43841,Jokes on you when I check my phone I raise it in front of my face and don't scrunch up in defeat...,jokes on you when i check my phone i raise it in front of my face and don't scrunch up in defeat...


## Topic category (llama) -> Once per video

In [13]:
import ollama
import json

def get_topic_category(title: str) -> str:
    """
    Use Ollama to classify the video title into a topic category.
    Example categories: health, mental health, productivity, finance, relationships, entrepreneurship, other.
    """
    prompt = f"""
    You are a helpful assistant. Categorize the following YouTube video title into ONE broad category:
    - health
    - mental health / psychology
    - productivity / personal development
    - finance
    - relationships
    - entrepreneurship / business
    - Religion / Spirituality
    - Technology
    - Education
    - Lifestyle
    - Entertainment
    - other

    Title: "{title}"

    Return only the category name, nothing else.
    """

    response = ollama.chat(
        model="llama3.2:3b",  # you can swap to another local model
        messages=[{"role": "user", "content": prompt}]
    )
    return response["message"]["content"].strip().lower()


In [14]:
# Enable tqdm for pandas
tqdm.pandas()

# Extract unique video_id/title pairs
video_meta = df[["video_id", "video_title"]].drop_duplicates()

# Apply Ollama category classification
video_meta["Topic_Category"] = video_meta["video_title"].apply(get_topic_category)

# Merge back into main dataframe
df = df.merge(video_meta[["video_id", "Topic_Category"]], on="video_id", how="left")

In [15]:
df['Topic_Category'].value_counts()

Topic_Category
health                        45
relationship                  28
mental health / psychology    27
Name: count, dtype: int64

## Sentiment analysis (Gemma2)

In [16]:
### This pipeline takes YouTube comments → cleans them → detects sentiment, impact, guest mentions, and topic requests 
### using a hybrid NLP + LLM approach, balancing accuracy, cost, and scale.

import re
import json
import numpy as np
import pandas as pd
import ollama
from tqdm import tqdm
import spacy
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import torch.nn.functional as F

tqdm.pandas()

# -----------------------------
# 0) Helpers & resources
# -----------------------------
NEG_PHRASES = [
    r"\bnot good\b", r"\bnot great\b", r"\bnot helpful\b", r"\bdon't like\b", r"\bdont like\b",
    r"\bnot worth\b", r"\bwaste of time\b", r"\btoo long\b", r"\btoo slow\b", r"\btoo loud\b",
    r"\bmisleading\b", r"\bclickbait\b", r"\bbiased\b", r"\bconfusing\b", r"\bannoying\b",
    r"\bcringe\b", r"\bstupid\b", r"\bdumb\b", r"\bbad\b", r"\bawful\b", r"\bterrible\b",
    r"\buseless\b", r"\bpointless\b", r"\bwrong\b", r"\bpoor (audio|sound|quality)\b",
    r"\birrelevant\b", r"\boff\-topic\b", r"\bproblem\b", r"\bissue\b", r"\bdisappoint(ing|ed)\b",
    r"\brude\b", r"\boffensive\b", r"\bunfunny\b", r"\bboring\b", r"\blazy\b", r"\btoxic\b",
    r"\bhate\b", r"\bgarbage\b", r"\bignorant\b", r"\bweird\b", r"\bnegative\b", r"\bbroken\b",
    r"\bdownvote\b", r"\bterribly\b", r"\bdislike\b", r"\bpathetic\b", r"\bworse\b"
]
NEG_RE = re.compile("|".join(NEG_PHRASES))

SHORT_PRAISE_RE = re.compile(r"^(nice|cool|great|good|amazing|awesome|wow|love|thanks|perfect)[.!]?$", re.I)

# Load spaCy (light model for entity recognition)
nlp = spacy.load("en_core_web_sm", disable=["parser", "tagger", "lemmatizer"])

def clean_comment(text: str) -> str:
    if not isinstance(text, str):
        return ""
    text = re.sub(r"@\w+", "", text)
    text = re.sub(r"http\S+|www\S+", "", text)
    text = text.encode("ascii", "ignore").decode()
    return text.strip()


def bucket_from_p(p: float) -> str:
    if p < 0.35:
        return "Negative"
    if p > 0.65:
        return "Positive"
    return "Neutral"

# -----------------------------
# 1) HF Sentiment (RoBERTa)
# -----------------------------
_HF_MODEL = "cardiffnlp/twitter-roberta-base-sentiment-latest"
_tok = AutoTokenizer.from_pretrained(_HF_MODEL)
_model = AutoModelForSequenceClassification.from_pretrained(_HF_MODEL)
_model.eval()

def roberta_probs(text: str) -> dict:
    if not text:
        return {"neg": 0.0, "neu": 1.0, "pos": 0.0}
    with torch.no_grad():
        inputs = _tok(text[:512], return_tensors="pt")
        logits = _model(**inputs).logits
        probs = F.softmax(logits, dim=1).cpu().numpy()[0]
    return {"neg": float(probs[0]), "neu": float(probs[1]), "pos": float(probs[2])}


# -----------------------------
# 2) LLM Sentiment (Gemma 2B)
# -----------------------------
def gemma_score_01(text: str) -> float:
    if not text:
        return 0.5
    prompt = f"""
    You are a neutral linguistic expert analyzing sentiment.
    Evaluate only the tone of the YouTube comment.
    Consider sarcasm and negation carefully.
    Return JSON only: {{"score": <float between -1.0 and 1.0>}}
    Comment: "{text}"
    """
    try:
        resp = ollama.chat(model="gemma:2b", messages=[{"role":"user","content":prompt}])
        j = json.loads(resp["message"]["content"])
        s = float(j.get("score", 0.0))
    except Exception:
        s = 0.0

    p = (s + 1.0) / 2.0
    if NEG_RE.search(text) and p > 0.5:
        p -= 0.25
    if SHORT_PRAISE_RE.match(text) and p > 0.7:
        p = 0.6
    return float(np.clip(p, 0.0, 1.0))


# -----------------------------
# 3) Ensemble Sentiment
# -----------------------------
def ensemble_sentiment(text: str) -> dict:
    text_clean = clean_comment(text.lower())
    if not text_clean:
        return {"p_pos": 0.5, "bucket": "Neutral", "p_pos_llm": 0.5, "p_pos_hf": 0.33, "p_neg_hf": 0.33}

    p_pos_llm = gemma_score_01(text_clean)
    hf = roberta_probs(text_clean)
    p_pos_hf, p_neg_hf = hf["pos"], hf["neg"]

    w_llm, w_hf = 0.4, 0.6
    p_pos = w_llm * p_pos_llm + w_hf * p_pos_hf

    if p_neg_hf - p_pos_hf > 0.20 and p_pos > 0.3:
        p_pos -= 0.20

    p_pos = float(np.clip(p_pos, 0.0, 1.0))
    bucket = bucket_from_p(p_pos)
    return {
        "p_pos": p_pos,
        "bucket": bucket,
        "p_pos_llm": p_pos_llm,
        "p_pos_hf": p_pos_hf,
        "p_neg_hf": p_neg_hf,
    }


# -----------------------------
# 4) Likes Weighting
# -----------------------------
def like_weight(likes: float) -> float:
    if likes is None or likes <= 0:
        return 1.0
    if likes < 10:
        return 1.2
    if likes < 100:
        return 2.0
    if likes < 500:
        return 3.0
    return 4.0


# -----------------------------
# 5) Improved Guest Extraction
# -----------------------------

BANNED_GUEST_WORDS = {
    "jesus", "praise jesus", "ohhhh", "video", "motivation", "johari", "topic", "content"
}

GUEST_HINTS = re.compile(r"(with|feat\.|featuring|guest|bring back|have .* on|invite|episode with)", re.I)
TITLE_CUE_RE = re.compile(r"feat\.|featuring|with|guest|w/", re.I)

def extract_guests(text: str, video_title: str = "") -> list:
    """
    Extract guest names from comment or title with 3-layer logic:
    1. spaCy PERSON entities (fast)
    2. Gemma (if hints present)
    3. Fallback to video title if contains 'feat.' etc.
    """
    text = clean_comment(text)
    if not text:
        return []

    doc = nlp(text)
    names = [ent.text.strip() for ent in doc.ents if ent.label_ == "PERSON"]

    # If found via spaCy, trust them (light clean)
    if names:
        names = [n for n in names if len(n) > 2 and n.lower() not in BANNED_GUEST_WORDS]
        return list(set(names))

    # Only call LLM if hints present
    if not GUEST_HINTS.search(text):
        return []

    prompt = f"""
    You are analyzing a YouTube comment on a podcast episode.

    Your goal is to extract only the names of people the viewer *explicitly or implicitly wants to appear as guests* on future episodes.

    Focus on:
    - Mentions framed as requests or desires (e.g. “bring X on”, “invite Y”, “you should talk to Z”, “have A back”).
    - Include both full names and recognizable single names or titles (e.g. “Dr. Huberman”, “Peterson”, “Elon Musk”).
    - Ignore names mentioned for other reasons (e.g. as examples, comparisons, or in stories).
    - Exclude religious figures, fictional characters, vague references, or non-human entities.

    Return strictly a **JSON array of names** (no text explanation, no duplicates, no empty entries).

    Example:
    ["Jordan Peterson", "Dr. Rhonda Patrick", "Elon Musk"]

    Comment: "{text}"
    """

    try:
        resp = ollama.chat(model="gemma:2b", messages=[{"role": "user", "content": prompt}])
        names = json.loads(resp["message"]["content"])
    except Exception:
        names = []

    # Post-filter
    if isinstance(names, list):
        names = [
            n.strip() for n in names
            if n and len(n) > 2 and n[0].isupper() and n.lower() not in BANNED_GUEST_WORDS
        ]
    else:
        names = []

    # Fallback from video title
    if not names and TITLE_CUE_RE.search(video_title):
        doc_t = nlp(video_title)
        title_names = [ent.text.strip() for ent in doc_t.ents if ent.label_ == "PERSON"]
        names.extend(title_names)

    return list(set(names))



# -----------------------------
# 6) Topic Request Extraction
# -----------------------------
TOPIC_HINTS = re.compile(r"(talk about|episode on|discuss|cover|would love|should do|next guest|topic|content about)", re.I)

def extract_topics(text: str) -> list:
    text = clean_comment(text)
    if not text or not TOPIC_HINTS.search(text):
        return []

    prompt = f"""
    You are analyzing a YouTube comment on a podcast episode.

    Your goal is to extract *specific topics or subjects the viewer explicitly wants discussed in future episodes.*

    Focus on:
    - Requests or suggestions like “talk about X”, “do an episode on Y”, “discuss more about Z”, “I’d love a conversation on…” etc.
    - Extract concise, meaningful topics (1–4 words) representing what the viewer wants to learn or hear more about.
    - Ignore vague mentions, compliments, or generic categories (like “science”, “health”, “AI”) unless clearly framed as requests.
    - Exclude random nouns or things already being discussed.

    Return strictly a **JSON list of short topic phrases** (no explanation, no duplicates).

    Example:
    ["mental health in men", "AI and creativity", "nutrition and longevity"]

    Comment: "{text}"
    """

    try:
        resp = ollama.chat(model="gemma:2b", messages=[{"role":"user","content":prompt}])
        topics = json.loads(resp["message"]["content"])
        if isinstance(topics, list):
            return topics
    except Exception:
        pass
    return []


# -----------------------------
# 7) Apply All to DataFrame
# -----------------------------
def apply_sentiment(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    df["cleaned_text"] = df["cleaned_text"].fillna("").astype(str)

    # Sentiment
    res = df["cleaned_text"].progress_apply(ensemble_sentiment)
    df["sentiment_p_pos"] = res.apply(lambda r: r["p_pos"])
    df["sentiment_bucket"] = res.apply(lambda r: r["bucket"])
    df["p_pos_llm"] = res.apply(lambda r: r["p_pos_llm"])
    df["p_pos_hf"] = res.apply(lambda r: r["p_pos_hf"])
    df["p_neg_hf"] = res.apply(lambda r: r["p_neg_hf"])

    # Weights
    df["comment_like_count"] = df["comment_like_count"].fillna(0).astype(int)
    df["comment_weight"] = df["comment_like_count"].apply(like_weight)
    df["impact_weighted_sentiment"] = df["sentiment_p_pos"] * df["comment_weight"]

    # New columns — guests & topics
    df["guest_mentions"] = df.progress_apply(
        lambda r: extract_guests(r["cleaned_text"], r.get("video_title", "")), axis=1
    )

    df["topic_requests"] = df["cleaned_text"].progress_apply(extract_topics)

    return df


Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [17]:
# -----------------------------
# 6) Run
# -----------------------------
df_sent = apply_sentiment(df)

100%|██████████| 100/100 [00:44<00:00,  2.24it/s]
100%|██████████| 100/100 [00:14<00:00,  6.69it/s]
100%|██████████| 100/100 [00:00<00:00, 190.25it/s]


In [18]:
df_sent[[
    "comment_text",
    "cleaned_text",
    "sentiment_p_pos",
    "sentiment_bucket",
    "p_pos_llm",
    "p_pos_hf",
    "p_neg_hf",
    "comment_like_count",
    "comment_weight",
    "impact_weighted_sentiment",
    "guest_list",
    "topic_requests"
]].head(5)

Unnamed: 0,comment_text,cleaned_text,sentiment_p_pos,sentiment_bucket,p_pos_llm,p_pos_hf,p_neg_hf,comment_like_count,comment_weight,impact_weighted_sentiment,guest_list,topic_requests
0,This was very eye opening. I was sort of like this idiot that Jordan was trying to help.,this was very eye opening. i was sort of like this idiot that jordan was trying to help.,0.527825,Neutral,0.82,0.333042,0.268005,0,1.0,0.527825,[Dr Jordan Peterson],[]
1,You can’t even spell hormone. Shush.,you cant even spell hormone. shush.,0.206989,Negative,0.5,0.011648,0.878349,0,1.0,0.206989,[Dr. Sara Szal],[]
2,I’m 40 and I agree with this list it is working for meeeee I feel great!,im 40 and i agree with this list it is working for meeeee i feel great!,0.881234,Positive,0.725,0.985389,0.003019,2,1.2,1.05748,[Dr Stacy Sims],[]
3,Pure gold! Seeing a lot of confusion in the comments. You have to realise that all the advice is...,pure gold! seeing a lot of confusion in the comments. you have to realise that all the advice is...,0.287034,Negative,0.525,0.12839,0.426005,3,1.2,0.344441,"[Vanessa Van Edwards, Steven]",[]
4,i tried intermittent fasting and my period stopped coming.. i’m still trying to fix it… nobody w...,i tried intermittent fasting and my period stopped coming.. im still trying to fix it nobody war...,0.253586,Negative,0.625,0.005976,0.897784,0,1.0,0.253586,[Dr Stacy Sims],[]


In [19]:
df_sent['sentiment_bucket'].value_counts()

sentiment_bucket
Negative    46
Positive    33
Neutral     21
Name: count, dtype: int64

In [20]:
df_sent['guest_mentions'].value_counts()

guest_mentions
[]                                                  80
[Elon Musk, Jordan Peterson, Dr. Rhonda Patrick]     5
[Elon Musk, Dr. Rhonda Patrick]                      5
[jordan]                                             4
[jordan peterson]                                    2
[shi heng]                                           1
[babysitter]                                         1
[ken robinson]                                       1
[steve, shi]                                         1
Name: count, dtype: int64

In [21]:
df_sent['topic_requests'].value_counts()

topic_requests
[]                                99
[relationship crisis in china]     1
Name: count, dtype: int64

In [22]:
# df_sent.to_csv('/Users/riadanas/Desktop/MLE Diary of a CEO/data/processed/processed_snapshot4.csv', index=False)

## Sentiment Analysis (Boosted logic)

In [23]:
# ===========================
# Fast, Scalable Comment NLP (with improved, intent-driven guest/topic extraction)
# ===========================

import os
import re
import json
import sqlite3
import hashlib
from datetime import datetime
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import List, Dict, Tuple, Optional

import numpy as np
import pandas as pd
from tqdm import tqdm

import spacy
import ollama

import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModelForSequenceClassification


# -----------------------------
# 0) GLOBALS & RESOURCES
# -----------------------------
tqdm.pandas()

NEG_PHRASES = [
    r"\bnot good\b", r"\bnot great\b", r"\bnot helpful\b", r"\bdon't like\b", r"\bdont like\b",
    r"\bnot worth\b", r"\bwaste of time\b", r"\btoo long\b", r"\btoo slow\b", r"\btoo loud\b",
    r"\bmisleading\b", r"\bclickbait\b", r"\bbiased\b", r"\bconfusing\b", r"\bannoying\b",
    r"\bcringe\b", r"\bstupid\b", r"\bdumb\b", r"\bbad\b", r"\bawful\b", r"\bterrible\b",
    r"\buseless\b", r"\bpointless\b", r"\bwrong\b", r"\bpoor (audio|sound|quality)\b",
    r"\birrelevant\b", r"\boff\-topic\b", r"\bproblem\b", r"\bissue\b", r"\bdisappoint(ing|ed)\b",
    r"\brude\b", r"\boffensive\b", r"\bunfunny\b", r"\bboring\b", r"\blazy\b", r"\btoxic\b",
    r"\bhate\b", r"\bgarbage\b", r"\bignorant\b", r"\bweird\b", r"\bnegative\b", r"\bbroken\b",
    r"\bdownvote\b", r"\bterribly\b", r"\bdislike\b", r"\bpathetic\b", r"\bworse\b"
]
NEG_RE = re.compile("|".join(NEG_PHRASES), re.I)
SHORT_PRAISE_RE = re.compile(r"^(nice|cool|great|good|amazing|awesome|wow|love|thanks|perfect)[.!]?$", re.I)

# Sentiment LLM pre-filter (nudges LLM use for nuanced tone)
LLM_HINTS = re.compile(r"(sarcasm|/s|lol|lmao|haha|not\s+\w+|but|however|although|though|guess|apparently)", re.I)

# ---------- INTENT TEMPLATES (NEW) ----------
# Guest intent phrases (bring/invite/have on/back/etc.)
GUEST_INTENT = re.compile(
    r"(bring\s+(?:him|her|them|[A-Z][\w'.\-]+)\s+(?:on|back)|"
    r"invite\s+(?:him|her|them|[A-Z][\w'.\-]+)|"
    r"have\s+(?:him|her|them|[A-Z][\w'.\-]+)\s+on|"
    r"get\s+(?:him|her|them|[A-Z][\w'.\-]+)\s+on|"
    r"episode\s+with\s+[A-Z][\w'.\-]+|"
    r"talk\s+to\s+[A-Z][\w'.\-]+|"
    r"next\s+guest|feature\s+[A-Z][\w'.\-]+)",
    re.I
)

# Negative/anti-intent for guest requests (e.g., "don't bring X")
NEG_INTENT = re.compile(r"(don'?t|do not|please\s+don'?t|no\s+more|stop)\s+(?:bring|get|have|invite|feature|interview)", re.I)

# Topic intent phrases (talk/discuss/more on/do one about/cover/etc.)
TOPIC_INTENT = re.compile(
    r"(talk\s+about|discuss|more\s+on|more\s+of|do\s+one\s+about|episode\s+on|video\s+on|"
    r"cover|speak\s+about|conversation\s+on|should\s+talk\s+about|"
    r"(?:i(?:'d)?\s+)?(?:love|want)\s+(?:to\s+hear|an?\s+episode)\s+on|"
    r"deep\s+dive\s+on|more\s+content\s+on)",
    re.I
)

# Capture after trigger up to punctuation/EOL
TOPIC_CAPTURE = re.compile(
    r"(?:talk\s+about|discuss|more\s+on|more\s+of|do\s+one\s+about|episode\s+on|video\s+on|"
    r"cover|speak\s+about|conversation\s+on|should\s+talk\s+about|"
    r"(?:i(?:'d)?\s+)?(?:love|want)\s+(?:to\s+hear|an?\s+episode)\s+on|"
    r"deep\s+dive\s+on|more\s+content\s+on)\s+([^\.!\?\n]+)",
    re.I
)

# spaCy NER (parser enabled because we use noun_chunks)
nlp = spacy.load("en_core_web_sm", disable=["tagger", "lemmatizer"])

# HF model
_HF_MODEL = "cardiffnlp/twitter-roberta-base-sentiment-latest"
_tok = AutoTokenizer.from_pretrained(_HF_MODEL)
_model = AutoModelForSequenceClassification.from_pretrained(_HF_MODEL)
_model.eval()

if torch.cuda.is_available():
    DEVICE = "cuda"
elif torch.backends.mps.is_available():
    DEVICE = "mps"
else:
    DEVICE = "cpu"
_model.to(DEVICE)

CACHE_PATH = os.path.abspath("./sentiment_cache.sqlite")


# -----------------------------
# 1) UTILS
# -----------------------------
def clean_comment(text: str) -> str:
    if not isinstance(text, str):
        return ""
    text = re.sub(r"@\w+", "", text)
    text = re.sub(r"http\S+|www\S+", "", text)
    text = text.encode("ascii", "ignore").decode()
    return text.strip()

def bucket_from_p(p: float) -> str:
    if p < 0.35: return "Negative"
    if p > 0.65: return "Positive"
    return "Neutral"

def sha1(s: str) -> str:
    return hashlib.sha1(s.encode("utf-8")).hexdigest()

def like_weight(likes: float) -> float:
    if likes is None or likes <= 0: return 1.0
    if likes < 10: return 1.2
    if likes < 100: return 2.0
    if likes < 500: return 3.0
    return 4.0


# -----------------------------
# 2) CACHE (sentiment + guests/topics)
# -----------------------------
def _ensure_cache():
    con = sqlite3.connect(CACHE_PATH)
    cur = con.cursor()
    cur.execute("""
      CREATE TABLE IF NOT EXISTS sentiments (
        comment_id TEXT PRIMARY KEY,
        text_hash  TEXT NOT NULL,
        p_pos      REAL NOT NULL,
        bucket     TEXT NOT NULL,
        p_pos_llm  REAL NOT NULL,
        p_pos_hf   REAL NOT NULL,
        p_neg_hf   REAL NOT NULL,
        created_at TEXT NOT NULL
      );
    """)
    cur.execute("""
      CREATE TABLE IF NOT EXISTS meta_extractions (
        comment_id  TEXT PRIMARY KEY,
        text_hash   TEXT NOT NULL,
        guest_json  TEXT NOT NULL,
        topic_json  TEXT NOT NULL,
        created_at  TEXT NOT NULL
      );
    """)
    con.commit(); con.close()

def cache_get_sentiments(comment_ids, text_hashes):
    if not comment_ids: return {}
    con = sqlite3.connect(CACHE_PATH); cur = con.cursor()
    qmarks = ",".join(["?"] * len(comment_ids))
    rows = cur.execute(f"SELECT * FROM sentiments WHERE comment_id IN ({qmarks})", comment_ids).fetchall()
    con.close()
    out = {}
    for (cid, th, p_pos, bucket, p_pos_llm, p_pos_hf, p_neg_hf, created_at) in rows:
        if text_hashes.get(cid) == th:
            out[cid] = dict(p_pos=p_pos, bucket=bucket, p_pos_llm=p_pos_llm, p_pos_hf=p_pos_hf, p_neg_hf=p_neg_hf)
    return out

def cache_put_sentiments(rows):
    if not rows: return
    con = sqlite3.connect(CACHE_PATH); cur = con.cursor()
    cur.executemany("""
      INSERT INTO sentiments (comment_id, text_hash, p_pos, bucket, p_pos_llm, p_pos_hf, p_neg_hf, created_at)
      VALUES (?, ?, ?, ?, ?, ?, ?, ?)
      ON CONFLICT(comment_id) DO UPDATE SET
        text_hash=excluded.text_hash,
        p_pos=excluded.p_pos,
        bucket=excluded.bucket,
        p_pos_llm=excluded.p_pos_llm,
        p_pos_hf=excluded.p_pos_hf,
        p_neg_hf=excluded.p_neg_hf,
        created_at=excluded.created_at;
    """, [(cid, th, ppos, buck, pll, phf, nhf, datetime.utcnow().isoformat()) for (cid, th, ppos, buck, pll, phf, nhf) in rows])
    con.commit(); con.close()

def cache_get_meta(comment_ids, text_hashes):
    if not comment_ids: return {}
    con = sqlite3.connect(CACHE_PATH); cur = con.cursor()
    qmarks = ",".join(["?"] * len(comment_ids))
    rows = cur.execute(f"SELECT * FROM meta_extractions WHERE comment_id IN ({qmarks})", comment_ids).fetchall()
    con.close()
    out = {}
    for (cid, th, gj, tj, created_at) in rows:
        if text_hashes.get(cid) == th:
            out[cid] = {"guest_mentions": json.loads(gj), "topic_requests": json.loads(tj)}
    return out

def cache_put_meta(rows):
    if not rows: return
    con = sqlite3.connect(CACHE_PATH); cur = con.cursor()
    cur.executemany("""
      INSERT INTO meta_extractions (comment_id, text_hash, guest_json, topic_json, created_at)
      VALUES (?, ?, ?, ?, ?)
      ON CONFLICT(comment_id) DO UPDATE SET
        text_hash=excluded.text_hash,
        guest_json=excluded.guest_json,
        topic_json=excluded.topic_json,
        created_at=excluded.created_at;
    """, [(cid, th, json.dumps(guests), json.dumps(topics), datetime.utcnow().isoformat()) for (cid, th, guests, topics) in rows])
    con.commit(); con.close()


# -----------------------------
# 3) HF SENTIMENT (BATCHED)
# -----------------------------
@torch.no_grad()
def roberta_probs_batch(texts: List[str], batch_size: int = 512, max_len: int = 128) -> np.ndarray:
    probs_all = []
    for i in range(0, len(texts), batch_size):
        batch = texts[i:i + batch_size]
        enc = _tok(batch, return_tensors="pt", padding=True, truncation=True, max_length=max_len)
        enc = {k: v.to(DEVICE) for k, v in enc.items()}
        logits = _model(**enc).logits
        probs = F.softmax(logits, dim=1).cpu().numpy()
        probs_all.append(probs)
    return np.vstack(probs_all) if probs_all else np.zeros((0, 3), dtype=float)


# -----------------------------
# 4) LLM SENTIMENT (PARALLEL)
# -----------------------------
def gemma_score_01(text: str, retries: int = 2, model_name: str = "gemma:2b") -> float:
    if not text: return 0.5
    prompt = f"""
    You are a neutral linguistic expert analyzing sentiment only.
    Consider sarcasm and negation carefully.
    Return JSON only: {{"score": <float between -1.0 and 1.0>}}
    Comment: "{text}"
    """.strip()
    for _ in range(max(1, retries)):
        try:
            resp = ollama.chat(model=model_name, messages=[{"role":"user","content":prompt}])
            j = json.loads(resp["message"]["content"])
            s = float(j.get("score", 0.0))
            p = (s + 1.0) / 2.0
            if NEG_RE.search(text) and p > 0.5: p -= 0.25
            if SHORT_PRAISE_RE.match(text) and p > 0.7: p = 0.6
            return float(np.clip(p, 0.0, 1.0))
        except Exception:
            pass
    return 0.5

def parallel_llm_scores(texts: List[str], max_workers: int = 8, model_name: str = "gemma:2b") -> List[float]:
    scores = [0.5] * len(texts)
    with ThreadPoolExecutor(max_workers=max_workers) as ex:
        futs = {ex.submit(gemma_score_01, t, 2, model_name): i for i, t in enumerate(texts)}
        for fut in tqdm(as_completed(futs), total=len(futs), desc="LLM (parallel)"):
            i = futs[fut]
            try: scores[i] = fut.result()
            except Exception: scores[i] = 0.5
    return scores


# -----------------------------
# 5) ENSEMBLE SENTIMENT
# -----------------------------
def ensemble_scores_fast(texts: List[str],
                         call_llm_if_uncertain: bool = True,
                         llm_max_workers: int = 8,
                         llm_model: str = "gemma:2b") -> Dict[str, np.ndarray]:
    cleaned = [clean_comment(t.lower()) for t in texts]
    hf_probs = roberta_probs_batch(cleaned, batch_size=512, max_len=128)
    if hf_probs.shape[0] == 0:
        n = len(texts)
        return dict(p_pos=np.full(n,0.5), p_pos_hf=np.full(n,1/3), p_neg_hf=np.full(n,1/3), p_pos_llm=np.full(n,0.5))

    p_neg_hf, p_neu_hf, p_pos_hf = hf_probs[:,0], hf_probs[:,1], hf_probs[:,2]

    if call_llm_if_uncertain:
        uncertain = np.abs(p_pos_hf - p_neg_hf) < 0.20
        hints = np.array([bool(LLM_HINTS.search(t)) for t in cleaned])
        run_llm_mask = np.logical_or(uncertain, hints)
    else:
        run_llm_mask = np.zeros(len(texts), dtype=bool)

    p_pos_llm = p_pos_hf.copy()
    if run_llm_mask.any():
        idxs = np.where(run_llm_mask)[0].tolist()
        subset = [cleaned[i] for i in idxs]
        llm_scores = parallel_llm_scores(subset, max_workers=llm_max_workers, model_name=llm_model)
        for loc, val in zip(idxs, llm_scores):
            p_pos_llm[loc] = val

    w_llm, w_hf = 0.4, 0.6
    p_pos = w_llm * p_pos_llm + w_hf * p_pos_hf
    adjust_mask = (p_neg_hf - p_pos_hf) > 0.20
    p_pos[adjust_mask] = np.maximum(0.0, p_pos[adjust_mask] - 0.20)
    p_pos = np.clip(p_pos, 0.0, 1.0)

    return dict(p_pos=p_pos, p_pos_hf=p_pos_hf, p_neg_hf=p_neg_hf, p_pos_llm=p_pos_llm)


# =====================================================================
# 6) >>> INTENT-DRIVEN GUEST EXTRACTION (precision-focused, batched) <<<
# =====================================================================

# Junk terms that often slip in
BAD_PERSON_LIKE = {
    "community note","intro","idk","drs","omad","keto","dang","app","stfu",
    "nicotine","propylene glycol","schizophrenia","hunter gatherers","good luck","pure gold",
    "someone","somebody","everyone","anyone","anybody","people","guest","guests","host",
    "video","content","topic","episode","pls","please","thanks","thank you"
}
TITLE_CUE_RE = re.compile(r"feat\.|featuring|with|guest|w/", re.I)

# Optional known roster to whitelist single surnames (fill from your dataset if you want)
KNOWN_GUESTS = set()  # e.g., {"Jordan Peterson","Ben Bikman","Georgia Ede", ...}
KNOWN_SURNAMES = {n.split()[-1] for n in KNOWN_GUESTS}

# Optional alias map for common misspellings / short forms
NAME_ALIASES = {
    "daniel priestly": "Daniel Priestley",
    "ben bikman": "Benjamin Bikman",
    "dr bikman": "Benjamin Bikman",
    "dr. bikman": "Benjamin Bikman",
    "jp": "Jordan Peterson",
    # Enable only if OK to default lone surname to JP for your dataset:
    # "peterson": "Jordan Peterson",
    "dr k": "Dr K",
}

def _norm_name(s: str) -> Optional[str]:
    """Normalize a candidate name; map aliases; keep title-cased."""
    if not s: return None
    s2 = re.sub(r"[^A-Za-z'\-\.\s]", " ", s).strip()
    s2 = re.sub(r"\s+", " ", s2)
    if not s2: return None
    low = s2.lower()
    if low in BAD_PERSON_LIKE: return None
    if low in NAME_ALIASES:
        return NAME_ALIASES[low]
    toks = s2.split()
    def fix(tok):
        return tok if tok.lower().startswith(("dr", "prof", "sir", "mr", "ms", "mrs")) else tok.title()
    s3 = " ".join(fix(t) for t in toks)
    if len(toks) == 1:
        if toks[0] in KNOWN_SURNAMES:
            return toks[0].title()
        return None
    return s3

def _dedupe_names(names: List[str]) -> List[str]:
    """Prefer longer forms (e.g., 'Jordan Peterson' over 'Dr Peterson')."""
    if not names: return []
    names_sorted = sorted(set(names), key=lambda n: (-len(n.split()), n))
    result, seen_keys = [], set()
    for n in names_sorted:
        key = n.lower().replace("dr ", "").replace("prof ", "")
        surname = n.split()[-1].lower()
        if (key in seen_keys) or any(surname == r.split()[-1].lower() for r in result):
            continue
        result.append(n)
        seen_keys.add(key)
    return result

def _intent_spans(text: str) -> List[Tuple[int,int]]:
    return [m.span() for m in GUEST_INTENT.finditer(text or "")]

def _persons_near_spans(doc, spans: List[Tuple[int,int]], window: int = 50) -> List[str]:
    out = []
    for (s, e) in spans:
        left, right = max(0, s - window), min(len(doc.text), e + window)
        for ent in doc.ents:
            if ent.label_ == "PERSON" and ent.start_char >= left and ent.end_char <= right:
                n = _norm_name(ent.text)
                if n: out.append(n)
    return out

def extract_guests_batch(texts: List[str], titles: Optional[List[str]] = None,
                         max_workers: int = 8, llm_model: str = "gemma:2b") -> List[List[str]]:
    """
    Faster & stricter:
      • Must have guest intent.
      • Use spaCy PERSON near the intent phrase (proximity filter).
      • Skip negative-intent (e.g. "don't bring X").
      • LLM fallback only if spaCy near-intent returns nothing.
      • Optional: title fallback when 'feat./with' in title.
    """
    titles = titles or [""] * len(texts)
    cleaned = [clean_comment(t) for t in texts]
    results = [[] for _ in cleaned]

    # 0) quickly mark rows with intent and non-negative intent
    intent_mask = [bool(GUEST_INTENT.search(t or "")) for t in cleaned]
    negative_mask = [bool(NEG_INTENT.search(t or "")) for t in cleaned]

    # 1) spaCy in batch only for intent rows (speeds things up)
    to_proc_idx = [i for i, ok in enumerate(intent_mask) if ok and not negative_mask[i]]
    to_proc_texts = [cleaned[i] for i in to_proc_idx]
    spans_list = [_intent_spans(cleaned[i]) for i in to_proc_idx]

    if to_proc_idx:
        for doc, i, spans in zip(nlp.pipe(to_proc_texts, batch_size=256, n_process=1), to_proc_idx, spans_list):
            near = _persons_near_spans(doc, spans, window=50)
            if near:
                results[i] = _dedupe_names(near)

    # 2) LLM fallback for remaining intent rows
    llm_idxs = [i for i in to_proc_idx if not results[i]]

    def _llm_extract_names_from_intent(text: str, model_name: str = "gemma:2b") -> List[str]:
        prompt = f"""
        A YouTube comment suggests future PODCAST GUESTS (e.g., "bring X on", "invite Y", "have Z back").
        Extract ONLY the people the viewer wants on the show (ignore examples, comparisons, or negatives like "don't invite").
        Return JSON array ONLY, e.g. ["Jordan Peterson","Dr Eric Berg"].
        Comment: "{text}"
        """.strip()
        try:
            resp = ollama.chat(model=model_name, messages=[{"role":"user","content":prompt}])
            lst = json.loads(resp["message"]["content"])
            if not isinstance(lst, list): return []
            cleaned = [_norm_name(str(x)) for x in lst]
            cleaned = [x for x in cleaned if x]
            return _dedupe_names(cleaned)
        except Exception:
            return []

    if llm_idxs:
        with ThreadPoolExecutor(max_workers=max_workers) as ex:
            futs = {ex.submit(_llm_extract_names_from_intent, cleaned[i], llm_model): i for i in llm_idxs}
            for fut in tqdm(as_completed(futs), total=len(futs), desc="Guests LLM"):
                i = futs[fut]
                try: results[i] = fut.result()
                except Exception: results[i] = []

    # 3) Title fallback (only when intent present and still empty)
    for i in to_proc_idx:
        if not results[i]:
            t = titles[i] if i < len(titles) else ""
            if t and TITLE_CUE_RE.search(t):
                doc_t = nlp(t)
                cand = [_norm_name(ent.text) for ent in doc_t.ents if ent.label_ == "PERSON"]
                cand = [c for c in cand if c]
                results[i] = _dedupe_names(cand)

    # Non-intent rows (or negative intent) stay empty
    return results


# =====================================================================
# 7) >>> INTENT-DRIVEN TOPIC REQUESTS (span mining + light filters) <<<
# =====================================================================

BAD_TOPIC_TOKENS = {"please","thanks","thank you","now","again","next","today","tomorrow","idk","intro","something","someone"}
CHEMICALS = {"nicotine","propylene glycol"}
MAX_TOPIC_WORDS = 4

# Words that make a topic too generic unless explicitly allowed (keep if in allowlist)
GENERIC_TOPICS = {"science","health","ai","tech","technology","business","politics","finance","economics"}
TOPIC_ALLOWLIST = {"mental health","nutrition","longevity","ai safety","ai at work","relationships"}

def _clean_topic_fragment(s: str) -> List[str]:
    s = s.strip(" .,!?:;|/\\[]()\"'")
    if not s: return []
    parts = re.split(r"\s+(?:and|or)\s+|,|/|;", s)
    out = []
    for p in parts:
        p2 = re.sub(r"@\w+|http\S+", "", p)
        p2 = re.sub(r"[^A-Za-z0-9'\-\s]", " ", p2)
        p2 = re.sub(r"\s+", " ", p2).strip().lower()
        if not p2: continue
        if p2 in CHEMICALS or p2 in BAD_TOPIC_TOKENS: continue
        if p2 in GENERIC_TOPICS and p2 not in TOPIC_ALLOWLIST: continue
        if 1 <= len(p2.split()) <= MAX_TOPIC_WORDS:
            out.append(p2)
    # dedupe
    uniq, seen = [], set()
    for x in out:
        if x not in seen:
            uniq.append(x); seen.add(x)
    return uniq

def _noun_phrases_within(span_text: str) -> List[str]:
    doc = nlp(span_text)
    chunks = []
    for ch in doc.noun_chunks:
        t = ch.text.strip()
        t = re.sub(r"[^A-Za-z0-9'\-\s]", " ", t)
        t = re.sub(r"\s+", " ", t).strip().lower()
        if not t: continue
        if t in CHEMICALS or t in BAD_TOPIC_TOKENS: continue
        if t in GENERIC_TOPICS and t not in TOPIC_ALLOWLIST: continue
        if 1 <= len(t.split()) <= MAX_TOPIC_WORDS:
            chunks.append(t)
    chunks.sort(key=lambda s: len(s.split()), reverse=True)
    uniq, seen = [], set()
    for x in chunks:
        if x not in seen:
            uniq.append(x); seen.add(x)
    return uniq[:5]

def _llm_normalize_topics_from_intent(text: str, model_name: str = "gemma:2b") -> List[str]:
    prompt = f"""
    A YouTube comment requests PODCAST TOPICS (e.g., "talk about X", "do an episode on Y").
    Extract short, specific topic phrases (1–4 words, lower-case) the viewer wants in future episodes.
    Ignore vague praise or general chatter.
    Return JSON array only. Example: ["mental health in men","seed oils","ai at work"]
    Comment: "{text}"
    """.strip()
    try:
        resp = ollama.chat(model=model_name, messages=[{"role":"user","content":prompt}])
        lst = json.loads(resp["message"]["content"])
        if not isinstance(lst, list): return []
        out, seen = [], set()
        for x in lst:
            s = re.sub(r"[^A-Za-z0-9'\-\s]", " ", str(x)).strip().lower()
            s = re.sub(r"\s+", " ", s)
            if not s: continue
            if s in CHEMICALS or s in BAD_TOPIC_TOKENS: continue
            if s in GENERIC_TOPICS and s not in TOPIC_ALLOWLIST: continue
            if 1 <= len(s.split()) <= MAX_TOPIC_WORDS and s not in seen:
                out.append(s); seen.add(s)
        return out[:5]
    except Exception:
        return []

def extract_topics_batch(texts: List[str], max_workers: int = 8, llm_model: str = "gemma:2b") -> List[List[str]]:
    """
    Intent-first flow (compute-light):
      • Must match topic intent.
      • Mine the span after the trigger; noun-chunk in that span; clean & dedupe.
      • LLM only if we still got nothing.
    """
    cleaned = [clean_comment(t) for t in texts]
    results = [[] for _ in cleaned]

    # Which rows have topic-request intent?
    intent_mask = [bool(TOPIC_INTENT.search(t or "")) for t in cleaned]

    # 1) span capture + noun phrase mining for intent rows (batch pipe for speed in long runs)
    spans_idx = [i for i, ok in enumerate(intent_mask) if ok]
    spans_texts = []
    spans_per_idx = []

    for i in spans_idx:
        spans = [m.group(1) for m in TOPIC_CAPTURE.finditer(cleaned[i])]
        if spans:
            spans_texts.extend(spans)
            spans_per_idx.append((i, len(spans)))
        else:
            spans_per_idx.append((i, 0))

    if spans_texts:
        mined = []
        for doc in nlp.pipe(spans_texts, batch_size=256, n_process=1):
            mined.append(_noun_phrases_within(doc.text))
        cursor = 0
        for i, nspans in spans_per_idx:
            if nspans == 0:
                continue
            collected = []
            for _ in range(nspans):
                collected.extend(mined[cursor])
                cursor += 1
            # plus rule-clean fragments
            for s in [m.group(1) for m in TOPIC_CAPTURE.finditer(cleaned[i])]:
                collected.extend(_clean_topic_fragment(s))
            uniq, seen = [], set()
            for tp in collected:
                if tp not in seen:
                    uniq.append(tp); seen.add(tp)
            results[i] = uniq[:5]

    # 2) LLM normalization when still empty but intent exists
    llm_idxs = [i for i, ok in enumerate(intent_mask) if ok and not results[i]]
    if llm_idxs:
        with ThreadPoolExecutor(max_workers=max_workers) as ex:
            futs = {ex.submit(_llm_normalize_topics_from_intent, cleaned[i], llm_model): i for i in llm_idxs}
            for fut in tqdm(as_completed(futs), total=len(futs), desc="Topics LLM"):
                i = futs[fut]
                try: results[i] = fut.result()
                except Exception: results[i] = []

    # Final dedupe per row
    final = []
    for lst in results:
        uniq, seen = [], set()
        for t in lst:
            if t not in seen:
                uniq.append(t); seen.add(t)
        final.append(uniq)
    return final


# -----------------------------
# 8) ENSEMBLE DRIVER
# -----------------------------
def ensemble_with_llm_gate(texts: List[str],
                           llm_workers: int = 8,
                           llm_model: str = "gemma:2b") -> Dict[str, np.ndarray]:
    return ensemble_scores_fast(texts, call_llm_if_uncertain=True,
                                llm_max_workers=llm_workers, llm_model=llm_model)


# -----------------------------
# 9) APPLY TO DATAFRAME (CACHED)
# -----------------------------
def apply_sentiment_fast(df: pd.DataFrame,
                         text_col: str = "comment_text",
                         id_col: str = "comment_id",
                         like_col: str = "comment_like_count",
                         title_col: Optional[str] = "video_title",
                         use_cache: bool = True,
                         llm_workers: int = 8,
                         llm_model: str = "gemma:2b",
                         run_guest_topic: bool = True) -> pd.DataFrame:

    assert text_col in df.columns, f"'{text_col}' not in df"
    assert id_col in df.columns, f"'{id_col}' not in df"

    _ensure_cache()
    out = df.copy()
    out[text_col] = out[text_col].fillna("").astype(str)
    out["cleaned_text"] = out[text_col].map(clean_comment)

    # sentiment cache prep
    out["__text_hash"] = out["cleaned_text"].map(sha1)
    ids = out[id_col].astype(str).tolist()
    id_to_hash = dict(zip(ids, out["__text_hash"].tolist()))
    cached_sent = cache_get_sentiments(ids, id_to_hash) if use_cache else {}
    cached_mask_sent = out[id_col].astype(str).isin(cached_sent.keys())

    for col in ["sentiment_p_pos","sentiment_bucket","p_pos_llm","p_pos_hf","p_neg_hf"]:
        out[col] = np.nan if col!="sentiment_bucket" else None

    if cached_sent:
        idx = out.index[cached_mask_sent]
        for i in idx:
            cid = str(out.at[i, id_col]); row = cached_sent[cid]
            out.at[i,"sentiment_p_pos"] = row["p_pos"]
            out.at[i,"sentiment_bucket"] = row["bucket"]
            out.at[i,"p_pos_llm"] = row["p_pos_llm"]
            out.at[i,"p_pos_hf"] = row["p_pos_hf"]
            out.at[i,"p_neg_hf"] = row["p_neg_hf"]

    need = out.index[~cached_mask_sent].tolist()
    if need:
        texts = out.loc[need, "cleaned_text"].tolist()
        scores = ensemble_with_llm_gate(texts, llm_workers, llm_model)
        out.loc[need, "p_pos_hf"] = scores["p_pos_hf"]
        out.loc[need, "p_neg_hf"] = scores["p_neg_hf"]
        out.loc[need, "p_pos_llm"] = scores["p_pos_llm"]
        out.loc[need, "sentiment_p_pos"] = scores["p_pos"]
        out.loc[need, "sentiment_bucket"] = [bucket_from_p(v) for v in scores["p_pos"]]

        if use_cache:
            to_cache = []
            for i in need:
                to_cache.append((
                    str(out.at[i, id_col]),
                    out.at[i, "__text_hash"],
                    float(out.at[i, "sentiment_p_pos"]),
                    str(out.at[i, "sentiment_bucket"]),
                    float(out.at[i, "p_pos_llm"]),
                    float(out.at[i, "p_pos_hf"]),
                    float(out.at[i, "p_neg_hf"]),
                ))
            cache_put_sentiments(to_cache)

    # impact weighting
    if like_col in out.columns:
        out[like_col] = out[like_col].fillna(0).astype(int)
        out["comment_weight"] = out[like_col].map(like_weight)
    else:
        out["comment_weight"] = 1.0
    out["impact_weighted_sentiment"] = out["sentiment_p_pos"].astype(float) * out["comment_weight"].astype(float)

    # guests/topics (intent-driven) + caching
    if run_guest_topic:
        cached_meta = cache_get_meta(ids, id_to_hash) if use_cache else {}
        cached_mask_meta = out[id_col].astype(str).isin(cached_meta.keys())
        out["guest_mentions"]  = [[] for _ in range(len(out))]
        out["topic_requests"]  = [[] for _ in range(len(out))]

        if cached_meta:
            idx = out.index[cached_mask_meta]
            for i in idx:
                cid = str(out.at[i, id_col]); row = cached_meta[cid]
                out.at[i, "guest_mentions"] = row["guest_mentions"]
                out.at[i, "topic_requests"] = row["topic_requests"]

        need_meta = out.index[~cached_mask_meta].tolist()
        if need_meta:
            texts = out.loc[need_meta, "cleaned_text"].tolist()
            titles = out[title_col].tolist() if title_col and title_col in out.columns else [""]*len(out)

            guests_lists = extract_guests_batch(texts, [titles[i] for i in need_meta],
                                                max_workers=llm_workers, llm_model=llm_model)
            topics_lists = extract_topics_batch(texts, max_workers=llm_workers, llm_model=llm_model)

            for i, glst in zip(need_meta, guests_lists):
                out.at[i, "guest_mentions"] = glst
            for i, tlst in zip(need_meta, topics_lists):
                out.at[i, "topic_requests"] = tlst

            if use_cache:
                to_meta = []
                for i in need_meta:
                    to_meta.append((
                        str(out.at[i, id_col]),
                        out.at[i, "__text_hash"],
                        out.at[i, "guest_mentions"],
                        out.at[i, "topic_requests"],
                    ))
                cache_put_meta(to_meta)

    out.drop(columns=["__text_hash"], inplace=True, errors="ignore")
    return out


# ===========================
# Example
# ===========================
# out = apply_sentiment_fast(
#     df,
#     text_col="comment_text",
#     id_col="comment_id",
#     like_col="comment_like_count",
#     title_col="video_title",
#     use_cache=True,
#     llm_workers=8,
#     llm_model="gemma:2b",
#     run_guest_topic=True
# )


Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [24]:
out = apply_sentiment_fast(df,
     text_col="comment_text",
     id_col="comment_id",
     like_col="comment_like_count",
     title_col="video_title",
     use_cache=True,
     llm_workers=8,
     llm_model="gemma:2b",
     run_guest_topic=True)

LLM (parallel): 100%|██████████| 46/46 [00:35<00:00,  1.31it/s]
Guests LLM: 100%|██████████| 1/1 [00:00<00:00,  1.69it/s]


In [25]:
out.head(1)

Unnamed: 0,channel_name,channel_id,video_id,video_title,video_description,video_published_at,view_count,video_like_count,comment_count,comment_id,comment_text,author,author_id,comment_like_count,comment_published_at,is_pinned,is_reply,parent_comment_id,guest_list,cleaned_text,Topic_Category,sentiment_p_pos,sentiment_bucket,p_pos_llm,p_pos_hf,p_neg_hf,comment_weight,impact_weighted_sentiment,guest_mentions,topic_requests
0,The Diary Of A CEO,UCGq-a57w-aPwyi3pW7XLiHw,Hik6OY-nk4c,Jordan B Peterson: You Need To Listen To Your Wife! We've Built A Lonely & Sexless Society!,Dr Jordan Peterson is a world-renowned former Professor of Psychology at the University of Toron...,2025-01-13T08:00:19Z,2046763,54184,6392,UgzCgbdzte3zTTQex6Z4AaABAg,This was very eye opening. I was sort of like this idiot that Jordan was trying to help.,@JarreVonDuck,UCQMQlxaRQf04JUkceJ6obpw,0,2025-01-23T21:52:03Z,False,False,,[Dr Jordan Peterson],This was very eye opening. I was sort of like this idiot that Jordan was trying to help.,relationship,0.399823,Neutral,0.5,0.333039,0.268008,1.0,0.399823,[],[]


In [26]:
out.columns

Index(['channel_name', 'channel_id', 'video_id', 'video_title',
       'video_description', 'video_published_at', 'view_count',
       'video_like_count', 'comment_count', 'comment_id', 'comment_text',
       'author', 'author_id', 'comment_like_count', 'comment_published_at',
       'is_pinned', 'is_reply', 'parent_comment_id', 'guest_list',
       'cleaned_text', 'Topic_Category', 'sentiment_p_pos', 'sentiment_bucket',
       'p_pos_llm', 'p_pos_hf', 'p_neg_hf', 'comment_weight',
       'impact_weighted_sentiment', 'guest_mentions', 'topic_requests'],
      dtype='object')

In [27]:
out[['video_title', 'guest_list','Topic_Category','cleaned_text', 'sentiment_p_pos', 'sentiment_bucket', 'p_pos_llm', 'p_pos_hf', 'p_neg_hf', 'comment_like_count', 'comment_weight', 'impact_weighted_sentiment']].head(10)

Unnamed: 0,video_title,guest_list,Topic_Category,cleaned_text,sentiment_p_pos,sentiment_bucket,p_pos_llm,p_pos_hf,p_neg_hf,comment_like_count,comment_weight,impact_weighted_sentiment
0,Jordan B Peterson: You Need To Listen To Your Wife! We've Built A Lonely & Sexless Society!,[Dr Jordan Peterson],relationship,This was very eye opening. I was sort of like this idiot that Jordan was trying to help.,0.399823,Neutral,0.5,0.333039,0.268008,0,1.0,0.399823
1,"Hormone Expert: Control Your Hormones Control Your Belly Fat! Cortisol, oestrogen, testosterone.",[Dr. Sara Szal],health,You cant even spell hormone. Shush.,0.0,Negative,0.011648,0.011648,0.878349,0,1.0,0.0
2,Exercise & Nutrition Scientist: The Truth About Exercise On Your Period! Take These 4 Supplements!,[Dr Stacy Sims],health,Im 40 and I agree with this list it is working for meeeee I feel great!,0.985389,Positive,0.985389,0.985389,0.003019,2,1.2,1.182467
3,Body Language Expert Explains Why People Dislike You,"[Vanessa Van Edwards, Steven]",mental health / psychology,Pure gold! Seeing a lot of confusion in the comments. You have to realise that all the advice is...,0.127034,Negative,0.625,0.12839,0.426005,3,1.2,0.152441
4,Exercise & Nutrition Scientist: The Truth About Exercise On Your Period! Take These 4 Supplements!,[Dr Stacy Sims],health,i tried intermittent fasting and my period stopped coming.. im still trying to fix it nobody war...,0.0,Negative,0.005976,0.005976,0.897784,0,1.0,0.0
5,Jordan B Peterson: You Need To Listen To Your Wife! We've Built A Lonely & Sexless Society!,[Dr Jordan Peterson],relationship,@'s not about disagreement though. Its literally saying or doing nothing when you see others do ...,0.066119,Negative,0.65,0.010198,0.741934,54,2.0,0.132237
6,Shaolin Warrior Master: Hidden Epidemic Nobody Talks About! This Modern Habit Is Killing Millions!,[Master Shi Heng Yi],health,Side note: What is it with spiritual people and tapping on tables? Is there an effect created by...,0.216586,Negative,0.5,0.027644,0.137783,0,1.0,0.216586
7,Shaolin Warrior Master: Hidden Epidemic Nobody Talks About! This Modern Habit Is Killing Millions!,[Master Shi Heng Yi],health,"Steven, your questions are amazingly on point. Wise, smart, thoughtful, well timed. Congratulat...",0.951639,Positive,0.9,0.986065,0.003411,0,1.0,0.951639
8,Body Language Expert Explains Why People Dislike You,"[Vanessa Van Edwards, Steven]",mental health / psychology,I prefer to be wary of first impressions and downplay their importance. How realistic are they? ...,0.0,Negative,0.058139,0.058139,0.445123,0,1.0,0.0
9,Body Language Expert Explains Why People Dislike You,"[Vanessa Van Edwards, Steven]",mental health / psychology,Jokes on you when I check my phone I raise it in front of my face and don't scrunch up in defeat...,0.329028,Negative,0.329028,0.329028,0.079858,0,1.0,0.329028


In [28]:
print(out['sentiment_bucket'].value_counts())
print("\n")
print(out['comment_weight'].value_counts())
print("\n")
print(out['guest_mentions'].value_counts())
print("\n")
print(out['topic_requests'].value_counts())

sentiment_bucket
Negative    53
Positive    32
Neutral     15
Name: count, dtype: int64


comment_weight
1.0    70
1.2    26
2.0     3
4.0     1
Name: count, dtype: int64


guest_mentions
[]    100
Name: count, dtype: int64


topic_requests
[]    100
Name: count, dtype: int64


- Fix logic for guest_mentions and topic request
- Guest mention: prompt the llm or NLP script to look for "bring xx on" or "invite xx" or "bring back" --> Variants like this
- Topic request: "more on xx", "discuss xx", "more of xx" -> variants like this