In [1]:
import pandas as pd
from tqdm.notebook import tqdm


df = pd.read_csv('/Users/riadanas/Desktop/MLE Diary of a CEO/data/raw/Esu8BXLBmZ4_comments.csv')

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [2]:
# df = df.head(100)

In [3]:
print(df.shape)
df.head(2)

(1296, 16)


Unnamed: 0,channel_name,channel_id,video_id,video_title,video_published_at,view_count,video_like_count,comment_id,comment_text,author,author_id,comment_like_count,comment_published_at,is_pinned,is_reply,parent_comment_id
0,The Diary Of A CEO,UCGq-a57w-aPwyi3pW7XLiHw,Esu8BXLBmZ4,Atheist vs Christian vs Spiritual Thinker: The...,2025-09-29T07:00:15Z,58227,3338,UgwK5fuPVdGdnrRuUbB4AaABAg.ANdq3qa6jmIANdqhoWY4_e,I like this format much better than the other ...,@JoelJose12345,UClXaG5i-lnqXbUMb6ayjTRQ,6,2025-09-29T07:08:06Z,False,True,UgwK5fuPVdGdnrRuUbB4AaABAg
1,The Diary Of A CEO,UCGq-a57w-aPwyi3pW7XLiHw,Esu8BXLBmZ4,Atheist vs Christian vs Spiritual Thinker: The...,2025-09-29T07:00:15Z,58227,3338,UgwK5fuPVdGdnrRuUbB4AaABAg.ANdq3qa6jmIANds81ZbBzD,"Yeah, this format is great. Except that one ti...",@Little_Shadow_,UCVLRgNZlbI04EE-ssMhgaiA,4,2025-09-29T07:20:34Z,False,True,UgwK5fuPVdGdnrRuUbB4AaABAg


## Text cleaning

In [4]:
import re
import spacy
import pandas as pd
from transformers import pipeline
from sentence_transformers import SentenceTransformer
import numpy as np
from sklearn.cluster import KMeans

# ------------------------------------------------------
# 1. Text Cleaning (light only)
# ------------------------------------------------------
def clean_comment(text: str) -> str:
    """
    Light cleaning for comments:
    - Remove @mentions
    - Remove URLs
    - Remove emojis / non-ascii
    - Lowercase
    - Strip whitespace
    - Keep context words (no lemmatization, no stopword removal yet)
    """
    if not isinstance(text, str):
        return ""

    # remove mentions
    text = re.sub(r"@\w+", "", text)

    # remove URLs
    text = re.sub(r"http\S+|www\S+", "", text)

    # remove emojis/non-ascii
    text = text.encode("ascii", "ignore").decode()

    # lowercase + strip
    text = text.lower().strip()

    return text


In [5]:
# Apply light cleaning to the comment_text column
df["cleaned_text"] = df["comment_text"].apply(clean_comment)

# Preview the results
df[["comment_text", "cleaned_text"]].head(10)

Unnamed: 0,comment_text,cleaned_text
0,I like this format much better than the other ...,i like this format much better than the other ...
1,"Yeah, this format is great. Except that one ti...","yeah, this format is great. except that one ti..."
2,Yes Praise Jesus Christ!,yes praise jesus christ!
3,@@Little_Shadow_ Which episode was that one pl...,@ which episode was that one please?!?
4,Bro you should have invited a Muslim like Musl...,bro you should have invited a muslim like musl...
5,💛💛👊,
6,I like the format but man… while I usually dig...,i like the format but man while i usually dig ...
7,I really want a conversation with Esther Perel...,i really want a conversation with esther perel...
8,You need to have Britt Hartley on,you need to have britt hartley on
9,"Invite Omar Suleiman from Yaqeen Institute, he...","invite omar suleiman from yaqeen institute, he..."


## Topic category (llama) -> Once per video

In [6]:
import ollama
import json

def get_topic_category(title: str) -> str:
    """
    Use Ollama to classify the video title into a topic category.
    Example categories: health, mental health, productivity, finance, relationships, entrepreneurship, other.
    """
    prompt = f"""
    You are a helpful assistant. Categorize the following YouTube video title into ONE broad category:
    - health
    - mental health
    - productivity
    - finance
    - relationships
    - entrepreneurship
    - Religion / Spirituality
    - Technology
    - Education
    - Lifestyle
    - Entertainment
    - other

    Title: "{title}"

    Return only the category name, nothing else.
    """

    response = ollama.chat(
        model="llama3.2:3b",  # you can swap to another local model
        messages=[{"role": "user", "content": prompt}]
    )
    return response["message"]["content"].strip().lower()


In [7]:
# Enable tqdm for pandas
tqdm.pandas()

# Extract unique video_id/title pairs
video_meta = df[["video_id", "video_title"]].drop_duplicates()

# Apply Ollama category classification
video_meta["Topic_Category"] = video_meta["video_title"].apply(get_topic_category)

# Merge back into main dataframe
df = df.merge(video_meta[["video_id", "Topic_Category"]], on="video_id", how="left")

In [8]:
df['Topic_Category'].value_counts()

Topic_Category
religion / spirituality    1296
Name: count, dtype: int64

## Sentiment analysis (Gemma2)

In [9]:
import re
import json
import numpy as np
import pandas as pd
import ollama
from tqdm import tqdm
import spacy
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import torch.nn.functional as F

tqdm.pandas()

# -----------------------------
# 0) Helpers & resources
# -----------------------------
NEG_PHRASES = [
    r"\bnot good\b", r"\bnot great\b", r"\bnot helpful\b", r"\bdon't like\b", r"\bdont like\b",
    r"\bnot worth\b", r"\bwaste of time\b", r"\btoo long\b", r"\btoo slow\b", r"\btoo loud\b",
    r"\bmisleading\b", r"\bclickbait\b", r"\bbiased\b", r"\bconfusing\b", r"\bannoying\b",
    r"\bcringe\b", r"\bstupid\b", r"\bdumb\b", r"\bbad\b", r"\bawful\b", r"\bterrible\b",
    r"\buseless\b", r"\bpointless\b", r"\bwrong\b", r"\bpoor (audio|sound|quality)\b",
    r"\birrelevant\b", r"\boff\-topic\b", r"\bproblem\b", r"\bissue\b", r"\bdisappoint(ing|ed)\b",
    r"\brude\b", r"\boffensive\b", r"\bunfunny\b", r"\bboring\b", r"\blazy\b", r"\btoxic\b",
    r"\bhate\b", r"\bgarbage\b", r"\bignorant\b", r"\bweird\b", r"\bnegative\b", r"\bbroken\b",
    r"\bdownvote\b", r"\bterribly\b", r"\bdislike\b", r"\bpathetic\b", r"\bworse\b"
]
NEG_RE = re.compile("|".join(NEG_PHRASES))

SHORT_PRAISE_RE = re.compile(r"^(nice|cool|great|good|amazing|awesome|wow|love|thanks|perfect)[.!]?$", re.I)

# Load spaCy (light model for entity recognition)
nlp = spacy.load("en_core_web_sm", disable=["parser", "tagger", "lemmatizer"])

def clean_comment(text: str) -> str:
    if not isinstance(text, str):
        return ""
    text = re.sub(r"@\w+", "", text)
    text = re.sub(r"http\S+|www\S+", "", text)
    text = text.encode("ascii", "ignore").decode()
    return text.strip()


def bucket_from_p(p: float) -> str:
    if p < 0.35:
        return "Negative"
    if p > 0.65:
        return "Positive"
    return "Neutral"

# -----------------------------
# 1) HF Sentiment (RoBERTa)
# -----------------------------
_HF_MODEL = "cardiffnlp/twitter-roberta-base-sentiment-latest"
_tok = AutoTokenizer.from_pretrained(_HF_MODEL)
_model = AutoModelForSequenceClassification.from_pretrained(_HF_MODEL)
_model.eval()

def roberta_probs(text: str) -> dict:
    if not text:
        return {"neg": 0.0, "neu": 1.0, "pos": 0.0}
    with torch.no_grad():
        inputs = _tok(text[:512], return_tensors="pt")
        logits = _model(**inputs).logits
        probs = F.softmax(logits, dim=1).cpu().numpy()[0]
    return {"neg": float(probs[0]), "neu": float(probs[1]), "pos": float(probs[2])}


# -----------------------------
# 2) LLM Sentiment (Gemma 2B)
# -----------------------------
def gemma_score_01(text: str) -> float:
    if not text:
        return 0.5
    prompt = f"""
    You are a neutral linguistic expert analyzing sentiment.
    Evaluate only the tone of the YouTube comment.
    Consider sarcasm and negation carefully.
    Return JSON only: {{"score": <float between -1.0 and 1.0>}}
    Comment: "{text}"
    """
    try:
        resp = ollama.chat(model="gemma:2b", messages=[{"role":"user","content":prompt}])
        j = json.loads(resp["message"]["content"])
        s = float(j.get("score", 0.0))
    except Exception:
        s = 0.0

    p = (s + 1.0) / 2.0
    if NEG_RE.search(text) and p > 0.5:
        p -= 0.25
    if SHORT_PRAISE_RE.match(text) and p > 0.7:
        p = 0.6
    return float(np.clip(p, 0.0, 1.0))


# -----------------------------
# 3) Ensemble Sentiment
# -----------------------------
def ensemble_sentiment(text: str) -> dict:
    text_clean = clean_comment(text.lower())
    if not text_clean:
        return {"p_pos": 0.5, "bucket": "Neutral", "p_pos_llm": 0.5, "p_pos_hf": 0.33, "p_neg_hf": 0.33}

    p_pos_llm = gemma_score_01(text_clean)
    hf = roberta_probs(text_clean)
    p_pos_hf, p_neg_hf = hf["pos"], hf["neg"]

    w_llm, w_hf = 0.4, 0.6
    p_pos = w_llm * p_pos_llm + w_hf * p_pos_hf

    if p_neg_hf - p_pos_hf > 0.20 and p_pos > 0.3:
        p_pos -= 0.20

    p_pos = float(np.clip(p_pos, 0.0, 1.0))
    bucket = bucket_from_p(p_pos)
    return {
        "p_pos": p_pos,
        "bucket": bucket,
        "p_pos_llm": p_pos_llm,
        "p_pos_hf": p_pos_hf,
        "p_neg_hf": p_neg_hf,
    }


# -----------------------------
# 4) Likes Weighting
# -----------------------------
def like_weight(likes: float) -> float:
    if likes is None or likes <= 0:
        return 1.0
    if likes < 10:
        return 1.2
    if likes < 100:
        return 2.0
    if likes < 500:
        return 3.0
    return 4.0


# -----------------------------
# 5) Improved Guest Extraction
# -----------------------------

BANNED_GUEST_WORDS = {
    "jesus", "praise jesus", "ohhhh", "video", "motivation", "johari", "topic", "content"
}

GUEST_HINTS = re.compile(r"(with|feat\.|featuring|guest|bring back|have .* on|invite|episode with)", re.I)
TITLE_CUE_RE = re.compile(r"feat\.|featuring|with|guest|w/", re.I)

def extract_guests(text: str, video_title: str = "") -> list:
    """
    Extract guest names from comment or title with 3-layer logic:
    1. spaCy PERSON entities (fast)
    2. Gemma (if hints present)
    3. Fallback to video title if contains 'feat.' etc.
    """
    text = clean_comment(text)
    if not text:
        return []

    doc = nlp(text)
    names = [ent.text.strip() for ent in doc.ents if ent.label_ == "PERSON"]

    # If found via spaCy, trust them (light clean)
    if names:
        names = [n for n in names if len(n) > 2 and n.lower() not in BANNED_GUEST_WORDS]
        return list(set(names))

    # Only call LLM if hints present
    if not GUEST_HINTS.search(text):
        return []

    prompt = f"""
    Extract only the names of people explicitly mentioned or suggested as podcast guests.
    Ignore general words, religious figures, or vague text.
    Return only JSON array of proper names (no duplicates, no empty values).
    Example:
    ["Jordan Peterson", "Dr K", "Alex O'Connor"]
    Comment: "{text}"
    """

    try:
        resp = ollama.chat(model="gemma:2b", messages=[{"role": "user", "content": prompt}])
        names = json.loads(resp["message"]["content"])
    except Exception:
        names = []

    # Post-filter
    if isinstance(names, list):
        names = [
            n.strip() for n in names
            if n and len(n) > 2 and n[0].isupper() and n.lower() not in BANNED_GUEST_WORDS
        ]
    else:
        names = []

    # Fallback from video title
    if not names and TITLE_CUE_RE.search(video_title):
        doc_t = nlp(video_title)
        title_names = [ent.text.strip() for ent in doc_t.ents if ent.label_ == "PERSON"]
        names.extend(title_names)

    return list(set(names))



# -----------------------------
# 6) Topic Request Extraction
# -----------------------------
TOPIC_HINTS = re.compile(r"(talk about|episode on|discuss|cover|would love|should do|next guest|topic|content about)", re.I)

def extract_topics(text: str) -> list:
    text = clean_comment(text)
    if not text or not TOPIC_HINTS.search(text):
        return []

    prompt = f"""
    Extract specific topic requests or subjects mentioned in this YouTube comment.
    Return only JSON list of short topics.
    Example: ["AI", "mental health", "fitness"]
    Comment: "{text}"
    """
    try:
        resp = ollama.chat(model="gemma:2b", messages=[{"role":"user","content":prompt}])
        topics = json.loads(resp["message"]["content"])
        if isinstance(topics, list):
            return topics
    except Exception:
        pass
    return []


# -----------------------------
# 7) Apply All to DataFrame
# -----------------------------
def apply_sentiment(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    df["cleaned_text"] = df["cleaned_text"].fillna("").astype(str)

    # Sentiment
    res = df["cleaned_text"].progress_apply(ensemble_sentiment)
    df["sentiment_p_pos"] = res.apply(lambda r: r["p_pos"])
    df["sentiment_bucket"] = res.apply(lambda r: r["bucket"])
    df["p_pos_llm"] = res.apply(lambda r: r["p_pos_llm"])
    df["p_pos_hf"] = res.apply(lambda r: r["p_pos_hf"])
    df["p_neg_hf"] = res.apply(lambda r: r["p_neg_hf"])

    # Weights
    df["comment_like_count"] = df["comment_like_count"].fillna(0).astype(int)
    df["comment_weight"] = df["comment_like_count"].apply(like_weight)
    df["impact_weighted_sentiment"] = df["sentiment_p_pos"] * df["comment_weight"]

    # New columns — guests & topics
    df["guest_mentions"] = df.progress_apply(
        lambda r: extract_guests(r["cleaned_text"], r.get("video_title", "")), axis=1
    )

    df["topic_requests"] = df["cleaned_text"].progress_apply(extract_topics)

    return df


Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [10]:
# -----------------------------
# 6) Run
# -----------------------------
df_sent = apply_sentiment(df)

100%|██████████| 1296/1296 [08:52<00:00,  2.43it/s]
100%|██████████| 1296/1296 [02:07<00:00, 10.17it/s]
100%|██████████| 1296/1296 [00:44<00:00, 28.94it/s] 


In [11]:
df_sent[[
    "comment_text",
    "cleaned_text",
    "sentiment_p_pos",
    "sentiment_bucket",
    "p_pos_llm",
    "p_pos_hf",
    "p_neg_hf",
    "comment_like_count",
    "comment_weight",
    "impact_weighted_sentiment",
    "guest_mentions",
    "topic_requests"
]].head(35)

Unnamed: 0,comment_text,cleaned_text,sentiment_p_pos,sentiment_bucket,p_pos_llm,p_pos_hf,p_neg_hf,comment_like_count,comment_weight,impact_weighted_sentiment,guest_mentions,topic_requests
0,I like this format much better than the other ...,i like this format much better than the other ...,0.848971,Positive,0.77,0.901619,0.010586,6,1.2,1.018766,[],[]
1,"Yeah, this format is great. Except that one ti...","yeah, this format is great. except that one ti...",0.174155,Negative,0.75,0.123592,0.595837,4,1.2,0.208986,[],[]
2,Yes Praise Jesus Christ!,yes praise jesus christ!,0.892407,Positive,0.775,0.970679,0.003459,2,1.2,1.070889,[],[]
3,@@Little_Shadow_ Which episode was that one pl...,@ which episode was that one please?!?,0.41135,Neutral,0.87,0.105584,0.015177,0,1.0,0.41135,[],[]
4,Bro you should have invited a Muslim like Musl...,bro you should have invited a muslim like musl...,0.178369,Negative,0.89,0.037282,0.241366,2,1.2,0.214043,[muhammad ali],[]
5,💛💛👊,,0.5,Neutral,0.5,0.33,0.33,0,1.0,0.5,[],[]
6,I like the format but man… while I usually dig...,i like the format but man while i usually dig ...,0.280766,Negative,0.675,0.017943,0.775083,1,1.2,0.336919,[dr k],[]
7,I really want a conversation with Esther Perel...,i really want a conversation with esther perel...,0.486593,Neutral,0.725,0.327655,0.056819,1,1.2,0.583912,[],[]
8,You need to have Britt Hartley on,you need to have britt hartley on,0.423873,Neutral,0.91,0.099788,0.073581,2,1.2,0.508647,[],[]
9,"Invite Omar Suleiman from Yaqeen Institute, he...","invite omar suleiman from yaqeen institute, he...",0.881981,Positive,0.81,0.929968,0.007452,0,1.0,0.881981,[omar suleiman],[]


In [12]:
df_sent['sentiment_bucket'].value_counts()

sentiment_bucket
Negative    728
Positive    319
Neutral     249
Name: count, dtype: int64

In [13]:
df_sent['guest_mentions'].value_counts()

guest_mentions
[]                                                                             1001
[alex]                                                                           55
[dr k]                                                                           32
[god]                                                                            26
[dr k, alex]                                                                     12
[jesus christ]                                                                    9
[alex o'connor]                                                                   7
[dr.]                                                                             6
[sandra ann shaw]                                                                 5
[jordan peterson]                                                                 5
[Jordan Peterson, Dr K, Alex O'Connor]                                            3
[muhammad ali]                                               

In [14]:
df_sent['topic_requests'].value_counts()

topic_requests
[]    1296
Name: count, dtype: int64

In [15]:
# df_sent.to_csv('/Users/riadanas/Desktop/MLE Diary of a CEO/data/processed/processed_snapshot4.csv', index=False)