In [1]:
import pandas as pd
from tqdm.notebook import tqdm


df = pd.read_csv('/Users/riadanas/Desktop/MLE Diary of a CEO/data/raw/Esu8BXLBmZ4_comments.csv')

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [2]:
print(df.shape)
df.head(2)

(1296, 16)


Unnamed: 0,channel_name,channel_id,video_id,video_title,video_published_at,view_count,video_like_count,comment_id,comment_text,author,author_id,comment_like_count,comment_published_at,is_pinned,is_reply,parent_comment_id
0,The Diary Of A CEO,UCGq-a57w-aPwyi3pW7XLiHw,Esu8BXLBmZ4,Atheist vs Christian vs Spiritual Thinker: The...,2025-09-29T07:00:15Z,58227,3338,UgwK5fuPVdGdnrRuUbB4AaABAg.ANdq3qa6jmIANdqhoWY4_e,I like this format much better than the other ...,@JoelJose12345,UClXaG5i-lnqXbUMb6ayjTRQ,6,2025-09-29T07:08:06Z,False,True,UgwK5fuPVdGdnrRuUbB4AaABAg
1,The Diary Of A CEO,UCGq-a57w-aPwyi3pW7XLiHw,Esu8BXLBmZ4,Atheist vs Christian vs Spiritual Thinker: The...,2025-09-29T07:00:15Z,58227,3338,UgwK5fuPVdGdnrRuUbB4AaABAg.ANdq3qa6jmIANds81ZbBzD,"Yeah, this format is great. Except that one ti...",@Little_Shadow_,UCVLRgNZlbI04EE-ssMhgaiA,4,2025-09-29T07:20:34Z,False,True,UgwK5fuPVdGdnrRuUbB4AaABAg


In [3]:
df = df.head(100)
print(df.shape)

(100, 16)


## Text Cleaning (light only)

In [4]:
import re
import spacy
import pandas as pd
from transformers import pipeline
from sentence_transformers import SentenceTransformer
import numpy as np
from sklearn.cluster import KMeans

# ------------------------------------------------------
# 1. Text Cleaning (light only)
# ------------------------------------------------------
def clean_comment(text: str) -> str:
    """
    Light cleaning for comments:
    - Remove @mentions
    - Remove URLs
    - Remove emojis / non-ascii
    - Lowercase
    - Strip whitespace
    - Keep context words (no lemmatization, no stopword removal yet)
    """
    if not isinstance(text, str):
        return ""

    # remove mentions
    text = re.sub(r"@\w+", "", text)

    # remove URLs
    text = re.sub(r"http\S+|www\S+", "", text)

    # remove emojis/non-ascii
    text = text.encode("ascii", "ignore").decode()

    # lowercase + strip
    text = text.lower().strip()

    return text


In [5]:
# Apply light cleaning to the comment_text column
df["cleaned_text"] = df["comment_text"].apply(clean_comment)

# Preview the results
df[["comment_text", "cleaned_text"]].head(10)

Unnamed: 0,comment_text,cleaned_text
0,I like this format much better than the other ...,i like this format much better than the other ...
1,"Yeah, this format is great. Except that one ti...","yeah, this format is great. except that one ti..."
2,Yes Praise Jesus Christ!,yes praise jesus christ!
3,@@Little_Shadow_ Which episode was that one pl...,@ which episode was that one please?!?
4,Bro you should have invited a Muslim like Musl...,bro you should have invited a muslim like musl...
5,💛💛👊,
6,I like the format but man… while I usually dig...,i like the format but man while i usually dig ...
7,I really want a conversation with Esther Perel...,i really want a conversation with esther perel...
8,You need to have Britt Hartley on,you need to have britt hartley on
9,"Invite Omar Suleiman from Yaqeen Institute, he...","invite omar suleiman from yaqeen institute, he..."


In [6]:
df.head(1)

Unnamed: 0,channel_name,channel_id,video_id,video_title,video_published_at,view_count,video_like_count,comment_id,comment_text,author,author_id,comment_like_count,comment_published_at,is_pinned,is_reply,parent_comment_id,cleaned_text
0,The Diary Of A CEO,UCGq-a57w-aPwyi3pW7XLiHw,Esu8BXLBmZ4,Atheist vs Christian vs Spiritual Thinker: The...,2025-09-29T07:00:15Z,58227,3338,UgwK5fuPVdGdnrRuUbB4AaABAg.ANdq3qa6jmIANdqhoWY4_e,I like this format much better than the other ...,@JoelJose12345,UClXaG5i-lnqXbUMb6ayjTRQ,6,2025-09-29T07:08:06Z,False,True,UgwK5fuPVdGdnrRuUbB4AaABAg,i like this format much better than the other ...


## Ollama call + Prompt

In [7]:
import ollama
import json

def get_topic_category(title: str) -> str:
    """
    Use Ollama to classify the video title into a topic category.
    Example categories: health, mental health, productivity, finance, relationships, entrepreneurship, other.
    """
    prompt = f"""
    You are a helpful assistant. Categorize the following YouTube video title into ONE broad category:
    - health
    - mental health
    - productivity
    - finance
    - relationships
    - entrepreneurship
    - Religion / Spirituality
    - Technology
    - Education
    - Lifestyle
    - Entertainment
    - other

    Title: "{title}"

    Return only the category name, nothing else.
    """

    response = ollama.chat(
        model="llama3.2:3b",  # you can swap to another local model
        messages=[{"role": "user", "content": prompt}]
    )
    return response["message"]["content"].strip().lower()


## Topic Category (once per video)

In [8]:
# Enable tqdm for pandas
tqdm.pandas()

# Extract unique video_id/title pairs
video_meta = df[["video_id", "video_title"]].drop_duplicates()

# Apply Ollama category classification
video_meta["Topic_Category"] = video_meta["video_title"].apply(get_topic_category)

# Merge back into main dataframe
df = df.merge(video_meta[["video_id", "Topic_Category"]], on="video_id", how="left")


In [9]:
video_meta

Unnamed: 0,video_id,video_title,Topic_Category
0,Esu8BXLBmZ4,Atheist vs Christian vs Spiritual Thinker: The...,religion / spirituality


In [10]:
df['Topic_Category'].value_counts()

Topic_Category
religion / spirituality    100
Name: count, dtype: int64

## Sentiment analysis with Ollama

In [11]:
import ollama
import json

def get_sentiment_ollama(comment: str) -> dict:
    """
    Use Ollama to get sentiment score and category:
    - Score from -1.0 (super negative) to 1.0 (super positive).
    - Categories:
        * [-1.0, -0.75): Super Negative
        * [-0.75, -0.25): Negative
        * [-0.25, 0.25]: Neutral
        * (0.25, 0.75]: Positive
        * (0.75, 1.0]: Super Positive
    """
    if not comment.strip():
        return {"score": 0.0, "category": "Neutral"}  # empty comment safeguard

    prompt = f"""
    Analyze the sentiment of this YouTube comment. 
    Respond ONLY in JSON with two fields: "score" and "category".
    - Score must be between -1.0 (very negative) and 1.0 (very positive).
    - Consider negations ("don't like", "not good"), sarcasm, and context.
    - If the comment is off-topic, empty, or meaningless, assign Neutral (0.0).
    
    Comment: "{comment}"
    """

    response = ollama.chat(
        model="llama3.2:3b",
        messages=[{"role": "user", "content": prompt}],
    )

    try:
        result = json.loads(response["message"]["content"])
    except Exception:
        # fallback: force neutral if parsing fails
        result = {"score": 0.0, "category": "Neutral"}

    return result


In [12]:
# Ensure all comments are strings (replace NaN or numbers with empty string)
df["comment_text"] = df["comment_text"].fillna("").astype(str)

In [13]:
# Enable tqdm for pandas
tqdm.pandas()

# Apply with progress bar
df["sentiment_result"] = df["comment_text"].progress_apply(get_sentiment_ollama)
df["sentiment_score"] = df["sentiment_result"].apply(lambda x: x["score"])
df["sentiment_category"] = df["sentiment_result"].apply(lambda x: x["category"])

  0%|          | 0/100 [00:00<?, ?it/s]

In [14]:
pd.set_option("display.max_colwidth", None)

df[["cleaned_text", "sentiment_score", "sentiment_category"]].head(20)

Unnamed: 0,cleaned_text,sentiment_score,sentiment_category
0,"i like this format much better than the other . because when it's 2 people, most of the time it's both people having the same opinion. here it's like we take an idea and we get multiple points of view.",0.8,Positive
1,"yeah, this format is great. except that one time when you decided to turn on a feminist. how uncomfortable you felt when she was talking. it's time to see someone about that childhood wound you carry. \nit's showing",-1.0,Sarcastic Negative
2,yes praise jesus christ!,1.0,Extremely Positive
3,@ which episode was that one please?!?,0.67,Positive
4,bro you should have invited a muslim like muslim lantern ( muhammad ali).,1.0,hate speech
5,,1.0,Positive
6,"i like the format but man while i usually dig dr k, i feel like he hijacked the first 35 mins and you could have maybe been more proactive in moving the discussion along - i think he derailed aspects and needed some prompts",-0.2,Negative
7,i really want a conversation with esther perel and doctor k,0.5,Positive
8,you need to have britt hartley on,1.0,Positive
9,"invite omar suleiman from yaqeen institute, he's got the most gentle, respectful manner and he's highly intelligent with an inclusive mindset",0.9,Positive


In [16]:
df['sentiment_category'].value_counts()

sentiment_category
Positive                    58
Negative                    14
Neutral                      5
Very Positive                4
positive                     3
Negative with Sarcasm        1
Positive Sentiment           1
sarcastic negativity         1
Somewhat Positive            1
Critical                     1
Sarcasm                      1
Sarcastic/Intriguing         1
Philosophical/Deep           1
Supportive                   1
Sarcastic Negative           1
Sarcastic/Anti-religious     1
Neutral/Negative             1
Very Negative                1
hate speech                  1
Extremely Positive           1
very negative                1
Name: count, dtype: int64

In [17]:
df.drop(columns=["sentiment_result"], inplace=True)

## Test with Phi3:mini

In [18]:
import ollama
import json

def get_sentiment_phi(comment: str) -> dict:
    """
    Use Ollama to get sentiment score and category:
    - Score from -1.0 (super negative) to 1.0 (super positive).
    - Categories:
        * [-1.0, -0.75): Super Negative
        * [-0.75, -0.25): Negative
        * [-0.25, 0.25]: Neutral
        * (0.25, 0.75]: Positive
        * (0.75, 1.0]: Super Positive
    """
    if not comment.strip():
        return {"score": 0.0, "category": "Neutral"}  # empty comment safeguard

    prompt = f"""
    Analyze the sentiment of this YouTube comment. 
    Respond ONLY in JSON with two fields: "score" and "category".
    - Score must be between -1.0 (very negative) and 1.0 (very positive).
    - Consider negations ("don't like", "not good"), sarcasm, and context.
    - If the comment is off-topic, empty, or meaningless, assign Neutral (0.0).
    
    Comment: "{comment}"
    """

    response = ollama.chat(
        model="gemma:2b",
        messages=[{"role": "user", "content": prompt}],
    )

    try:
        result = json.loads(response["message"]["content"])
    except Exception:
        # fallback: force neutral if parsing fails
        result = {"score": 0.0, "category": "Neutral"}

    return result


In [19]:
# Clean and prepare the text
df["cleaned_comment"] = df["comment_text"].fillna("").astype(str).apply(clean_comment)

In [20]:
# Enable tqdm for pandas
tqdm.pandas()

# Apply with progress bar
df["sentiment_result"] = df["cleaned_comment"].progress_apply(get_sentiment_phi)
df["sentiment_score"] = df["sentiment_result"].apply(lambda x: x["score"])
df["sentiment_category"] = df["sentiment_result"].apply(lambda x: x["category"])

  0%|          | 0/100 [00:00<?, ?it/s]

In [21]:
pd.set_option("display.max_colwidth", None)

df[["cleaned_text", "sentiment_score", "sentiment_category"]].head(20)

Unnamed: 0,cleaned_text,sentiment_score,sentiment_category
0,"i like this format much better than the other . because when it's 2 people, most of the time it's both people having the same opinion. here it's like we take an idea and we get multiple points of view.",0.7,Positive
1,"yeah, this format is great. except that one time when you decided to turn on a feminist. how uncomfortable you felt when she was talking. it's time to see someone about that childhood wound you carry. \nit's showing",0.5,Opinion
2,yes praise jesus christ!,0.5,Positive sentiment
3,@ which episode was that one please?!?,0.6,question
4,bro you should have invited a muslim like muslim lantern ( muhammad ali).,0.2,Sarcasm
5,,0.0,Neutral
6,"i like the format but man while i usually dig dr k, i feel like he hijacked the first 35 mins and you could have maybe been more proactive in moving the discussion along - i think he derailed aspects and needed some prompts",0.2,Comment about video quality
7,i really want a conversation with esther perel and doctor k,0.8,Topic Recommendation
8,you need to have britt hartley on,0.8,Positive
9,"invite omar suleiman from yaqeen institute, he's got the most gentle, respectful manner and he's highly intelligent with an inclusive mindset",0.8,Positive
