<a href="https://colab.research.google.com/github/asadovkamran/advancedai_mock_exam/blob/main/advancedai_mock_exam_v2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

SETUP

In [1]:
!pip -q install transformers sentence-transformers emoji google-api-python-client

import re, numpy as np, pandas as pd
np.random.seed(42)

CONF_THRESHOLDS = [0.6, 0.8]
MIN_WORD_FILTERS = [3, 5, 7]

TEXT_COL = "comment_text"
EMOJI_COL = "emojis"


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/608.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m604.2/608.4 kB[0m [31m21.3 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m608.4/608.4 kB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0m
[?25h

DATA COLLECTION AND PREPROCESSING

In [3]:
from googleapiclient.discovery import build
import emoji

API_KEY = "YOUR_API_KEY"
youtube = build("youtube", "v3", developerKey=API_KEY)

def extract_emojis(text: str) -> str:
    if not isinstance(text, str):
        return ""
    return "".join(ch for ch in text if ch in emoji.EMOJI_DATA)

def fetch_comments(video_id: str, max_comments=1000):
    rows = []
    next_page = None

    while len(rows) < max_comments:
        req = youtube.commentThreads().list(
            part="snippet",
            videoId=video_id,
            maxResults=100,
            order="time",
            pageToken=next_page,
            textFormat="plainText"
        )
        resp = req.execute()

        for item in resp.get("items", []):
            s = item["snippet"]["topLevelComment"]["snippet"]
            text = s.get("textDisplay", "")
            rows.append({
                "comment_id": item["snippet"]["topLevelComment"]["id"],
                "comment_text": text,
                "like_count": int(s.get("likeCount", 0)),
                "reply_count": int(item["snippet"].get("totalReplyCount", 0)),
                "emojis": extract_emojis(text),
            })
            if len(rows) >= max_comments:
                break

        next_page = resp.get("nextPageToken")
        if not next_page:
            break

    return pd.DataFrame(rows)


In [4]:
 df_raw = fetch_comments(video_id="q0TVOcgqt8o", max_comments=1000)
 df_raw.to_csv("youtube_comments_raw.csv", index=False)

LOAD DATASET

In [5]:
df = pd.read_csv("youtube_comments_raw.csv")
print(df.shape)
df.head()


(1000, 5)


Unnamed: 0,comment_id,comment_text,like_count,reply_count,emojis
0,Ugz1uaHhxbmIdaC-Zgx4AaABAg,Çox gözəl mahnıdı başa düşənə 😢,1,0,😢
1,UgwgKtmD10aPPGsoeEF4AaABAg,"Həyatda heç vax, məyus olmayın, itirdim fikirl...",1,0,
2,UgzfhLKF0fM3OGBFXS94AaABAg,Yalvarıraaam gəlmə sabah\nKreditin son günü ka...,2,0,
3,UgxB5gMHyqrex8EFuhx4AaABAg,Bu mahnı o qeder derın menası var cox sevdıyım...,1,0,
4,Ugz4OA0qEOTl60O1Xf94AaABAg,Elza xanım çoxx gözəl ifaları var❤️,0,0,❤


REMOVE URLS, NORMALIZE SPACING, REDUCE NOISE

In [6]:
url_re = re.compile(r"https?://\S+|www\.\S+", re.IGNORECASE)

def normalize_text_az(text: str) -> str:
    if not isinstance(text, str):
        return ""
    text = url_re.sub("", text)
    text = text.replace("\n", " ").replace("\t", " ")
    text = re.sub(r"\s+", " ", text).strip()
    return text

df["text_no_url"] = df["comment_text"].astype(str).map(normalize_text_az)


REMOVE EMOJIS FROM TEXT TO CREATE CLEAN TEXT CHANNEL

In [7]:
import emoji
def remove_emojis(text: str) -> str:
    return emoji.replace_emoji(text, replace="") if isinstance(text, str) else ""

df["text_no_emoji"] = df["text_no_url"].map(remove_emojis).str.strip()


PARAMETER BASED EXPERIMENTS. MODEL LOCALDOC, 3-CLASS (https://huggingface.co/LocalDoc/sentiment_analysis_azerbaijani)


SENTIMENT MODEL

In [8]:
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModelForSequenceClassification

LD_MODEL = "LocalDoc/sentiment_analysis_azerbaijani"
tok_ld = AutoTokenizer.from_pretrained(LD_MODEL)
mdl_ld = AutoModelForSequenceClassification.from_pretrained(LD_MODEL)
mdl_ld.eval()

LABELS = np.array(["negative","neutral","positive"])

def run_localdoc(texts: pd.Series, batch_size=16, max_length=128) -> np.ndarray:
    texts = texts.fillna("").astype(str)
    out = []
    for i in range(0, len(texts), batch_size):
        batch = texts.iloc[i:i+batch_size].tolist()
        enc = tok_ld(batch, padding=True, truncation=True, max_length=max_length, return_tensors="pt")
        with torch.no_grad():
            probs = F.softmax(mdl_ld(**enc).logits, dim=-1)
        out.append(probs.cpu().numpy())
    return np.vstack(out)

probs = run_localdoc(df["text_no_emoji"])
df["ld_neg"], df["ld_neu"], df["ld_pos"] = probs[:,0], probs[:,1], probs[:,2]

def apply_threshold_3class(df, neg, neu, pos, T):
    P = df[[neg, neu, pos]].values
    idx = P.argmax(axis=1)
    conf = P.max(axis=1)
    pred = LABELS[idx].astype(object)
    pred[conf < T] = "neutral"
    return pred, conf

df["ld_label_06"], df["ld_conf"] = apply_threshold_3class(df,"ld_neg","ld_neu","ld_pos",0.6)
df["ld_label_08"], _            = apply_threshold_3class(df,"ld_neg","ld_neu","ld_pos",0.8)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/924 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

DISTRIBUTIONS REPORT FOR THRESHOLDS 0.6 AND 0.8

In [9]:
def dist(col):
    return df[col].value_counts().reindex(["positive","negative","neutral"], fill_value=0)

summary_thresh = pd.DataFrame({
    "LocalDoc_0.6": dist("ld_label_06"),
    "LocalDoc_0.8": dist("ld_label_08"),
})
summary_thresh["delta_(0.8-0.6)"] = summary_thresh["LocalDoc_0.8"] - summary_thresh["LocalDoc_0.6"]
summary_thresh


Unnamed: 0,LocalDoc_0.6,LocalDoc_0.8,delta_(0.8-0.6)
positive,437,423,-14
negative,219,215,-4
neutral,344,362,18


COMMENT LENGTH FILTERING (>= 3, >= 5, >= 7 WORDS)

In [24]:
df["word_count"] = df["text_no_emoji"].fillna("").str.split().str.len()

rows = []

for w in [3, 5, 7]:
    sub = df[df["word_count"] >= w]

    for T, label_col in [(0.6, "ld_label_06"), (0.8, "ld_label_08")]:
        rows.append({
            "min_words": w,
            "threshold": T,
            "num_comments": len(sub),
            "positive": int(sub[label_col].eq("positive").sum()),
            "negative": int(sub[label_col].eq("negative").sum()),
            "neutral":  int(sub[label_col].eq("neutral").sum()),
        })

length_threshold_table = pd.DataFrame(rows)
length_threshold_table



Unnamed: 0,min_words,threshold,num_comments,positive,negative,neutral
0,3,0.6,790,382,212,196
1,3,0.8,790,368,209,213
2,5,0.6,635,331,171,133
3,5,0.8,635,319,170,146
4,7,0.6,493,264,138,91
5,7,0.8,493,256,137,100


In [25]:
pivot = length_threshold_table.pivot_table(
    index=["min_words", "threshold"],
    values=["positive", "negative", "neutral"],
    aggfunc="sum"
)
pivot


Unnamed: 0_level_0,Unnamed: 1_level_0,negative,neutral,positive
min_words,threshold,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
3,0.6,212,196,382
3,0.8,209,213,368
5,0.6,171,133,331
5,0.8,170,146,319
7,0.6,138,91,264
7,0.8,137,100,256


ANALYTICAL TASKS

EMOJI-TEXT INCONSISTENCY

EMOJI-ONLY SENTIMENT LABELING (3-CLASS)

In [13]:
EMOJI_SENT_MODEL = "cardiffnlp/twitter-xlm-roberta-base-sentiment"
tok_em = AutoTokenizer.from_pretrained(EMOJI_SENT_MODEL)
mdl_em = AutoModelForSequenceClassification.from_pretrained(EMOJI_SENT_MODEL)
mdl_em.eval()

def emoji_to_words(s: str) -> str:
    # convert emojis like ❤️😭 -> "red_heart crying_face"
    if not isinstance(s, str): return ""
    return " ".join(emoji.demojize(ch).strip(":").replace("-", "_") for ch in s if ch in emoji.EMOJI_DATA)

df["emoji_text"] = df["emojis"].astype(str).map(emoji_to_words).str.replace(r"\s+", " ", regex=True).str.strip()

def run_emoji_sent(texts: pd.Series, batch_size=32, max_length=64) -> np.ndarray:
    texts = texts.fillna("").astype(str)
    out=[]
    for i in range(0,len(texts),batch_size):
        batch=texts.iloc[i:i+batch_size].tolist()
        enc=tok_em(batch, padding=True, truncation=True, max_length=max_length, return_tensors="pt")
        with torch.no_grad():
            probs=F.softmax(mdl_em(**enc).logits, dim=-1)
        out.append(probs.cpu().numpy())
    return np.vstack(out)

emoji_rows = df[df["emoji_text"].str.strip().ne("")].copy()
ep = run_emoji_sent(emoji_rows["emoji_text"])
emoji_rows["emoji_neg"], emoji_rows["emoji_neu"], emoji_rows["emoji_pos"] = ep[:,0], ep[:,1], ep[:,2]
emoji_rows["emoji_label_06"], emoji_rows["emoji_conf"] = apply_threshold_3class(emoji_rows,"emoji_neg","emoji_neu","emoji_pos",0.6)

df = df.merge(
    emoji_rows[["comment_id","emoji_neg","emoji_neu","emoji_pos","emoji_label_06","emoji_conf"]],
    on="comment_id",
    how="left"
)


config.json:   0%|          | 0.00/841 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

COMPUTE INCONSISTENCIES

In [14]:
df_both = df[df["emoji_label_06"].notna() & df["ld_label_06"].notna()].copy()
inconsistent = df_both[df_both["emoji_label_06"] != df_both["ld_label_06"]]

len(inconsistent), len(df_both)


(214, 356)

In [15]:
inconsistent.sample(10, random_state=42)[["text_no_emoji","emoji_text","ld_label_06","emoji_label_06"]]


Unnamed: 0,text_no_emoji,emoji_text,ld_label_06,emoji_label_06
37,Həqiqətənde çox gözəl mahnıdı,red_heart,positive,neutral
914,Ürreymden. Keçen bütün hislerim belkede yaşaya...,pensive_face,positive,neutral
317,2025 di ama səs məni 1998 1999 cu illərə babam...,crying_face crying_face,neutral,negative
872,"Xanımın səsidə,musiqiləri də gözəldir.Amma şei...",rose,negative,neutral
565,,crying_face crying_face crying_face crying_fac...,neutral,negative
550,2000ci illərdə aztvdə hansısa serialda ilk dəf...,crying_face,positive,negative
62,Elza xanımın bu əsəri bəsdir ki musiqi tarixin...,thumbs_up pensive_face,positive,negative
442,Sağ ol Elza,palms_up_together handshake bouquet,positive,neutral
364,Bu olanlardan sonra bura gelenleee salam,crying_face,neutral,negative
661,Canım ay mamam.Xəstəxanaya getdiyin günə lənət...,crying_face crying_face crying_face crying_fac...,positive,negative


SEMANTICALLY SIMILAR COMMENTS

In [16]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

embedder = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")
emb = embedder.encode(df["text_no_emoji"].fillna("").tolist(), batch_size=32, show_progress_bar=True)

sim = cosine_similarity(emb)
np.fill_diagonal(sim, 0)

SIM_THRESHOLD = 0.90
pairs = np.argwhere(sim > SIM_THRESHOLD)
pairs = [(i,j,sim[i,j]) for i,j in pairs if i < j]

len(pairs)


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/645 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/471M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

2439

In [17]:
for i,j,s in pairs[:8]:
    print(f"sim={s:.3f}")
    print("A:", df.iloc[i]["text_no_emoji"])
    print("B:", df.iloc[j]["text_no_emoji"])
    print("-"*60)


sim=0.923
A: Çox gözəl mahnıdı başa düşənə
B: Həqiqətənde çox gözəl mahnıdı
------------------------------------------------------------
sim=0.923
A: Çox gözəl mahnıdı başa düşənə
B: O qədər gözəldirkii mahnı
------------------------------------------------------------
sim=0.905
A: Çox gözəl mahnıdı başa düşənə
B: Gözəl səs gözəl sima gözəl xasiyət möhtəşəm mahnı mohtəsəm sənətkara yaraşır
------------------------------------------------------------
sim=0.976
A: Həyatda heç vax, məyus olmayın, itirdim fikirləşəndə, bəlkədə xilas olduqlarınızdı.
B: Həyatda hec vaxt, məyus olmayın, itirdim fikirləşənde, bəlkədə xilas olduqlarınızdı.
------------------------------------------------------------
sim=0.906
A: Bu mahnı o qeder derın menası var cox sevdıyım ınsandan vaz keçmək xəyallarımızın yarım qalması siz siz olun sevdiyinizdən keçməyin zaman keçdikcə yoxluğu sizidə yox edəcək amma əlinizdən heçnə gəlməyəcək..
B: Əziz,Elza xanım Seyidcahan sizə nə yazsam belə kifayət etməz,ancaq, çox istəy

SEMANTIC OUTLIERS

In [18]:
avg_sim = sim.mean(axis=1)
df["avg_semantic_similarity"] = avg_sim

thr = np.percentile(avg_sim, 5)   # bottom 5%
outliers = df[df["avg_semantic_similarity"] < thr]
len(outliers)


50

In [19]:
outliers.sample(8, random_state=42)[["text_no_emoji","avg_semantic_similarity"]]


Unnamed: 0,text_no_emoji,avg_semantic_similarity
269,2025,0.205681
836,17 yaşında uşaqlığıma son vida.,0.17748
483,Ay blyaa,0.175638
927,2020 de dinyelenler?,0.108827
332,2025 də qulaq asan var ?,0.184373
981,İl 2020 dinləyən var?,0.109896
464,Rippin Manors,0.161192
451,2026 hələ dinləyən var?,0.202264


POPULAR COMMENT ANALYSIS

In [20]:
like_thr  = df["like_count"].quantile(0.9)
reply_thr = df["reply_count"].quantile(0.9)

popular = df[(df["like_count"] >= like_thr) | (df["reply_count"] >= reply_thr)].copy()
len(popular)


1000

In [21]:
from collections import Counter
from nltk import ngrams

def tok_words(t):
    return [w for w in str(t).lower().split() if len(w) > 2]

wc = Counter()
bg = Counter()

for t in popular["text_no_emoji"].fillna(""):
    w = tok_words(t)
    wc.update(w)
    bg.update(ngrams(w, 2))

wc.most_common(20), bg.most_common(10)


([('son', 247),
  ('elza', 178),
  ('bir', 131),
  ('çox', 124),
  ('amma', 57),
  ('gözəl', 51),
  ('görüş', 49),
  ('vida,', 49),
  ('var', 48),
  ('qulaq', 48),
  ('belə', 45),
  ('bele', 45),
  ('kaş', 44),
  ('mahnı', 43),
  ('cox', 43),
  ('xanım', 40),
  ('hər', 39),
  ('səni', 39),
  ('kimi', 35),
  ('her', 32)],
 [(('son', 'vida,'), 49),
  (('vida,', 'son'), 49),
  (('elza', 'xanım'), 34),
  (('son', 'görüş'), 25),
  (('son', 'vida'), 23),
  (('elza', 'xanim'), 23),
  (('son', 'görüş.'), 20),
  (('elza', 'xanımın'), 18),
  (('kaş', 'görüş'), 17),
  (('çox', 'gözəl'), 16)])

SEMANTIC CATEGORY-BASED SENTIMENT ANALYSIS (5 CATEGORIES)

ZERO-SHOT SEMANTIC CATEGORY CLASSIFIER

In [22]:
from transformers import pipeline

zsc = pipeline("zero-shot-classification", model="joeddav/xlm-roberta-large-xnli")

CATS = [
    "Rational positive feedback on content",
    "Emotional reaction",
    "Criticism and dissatisfaction",
    "Direct address to the author",
    "Troll / non-constructive comment"
]

def classify_category(text):
    text = str(text).strip()
    if not text:
        return "Troll / non-constructive comment"
    out = zsc(text, CATS, multi_label=False)
    return out["labels"][0]

df["semantic_category"] = df["text_no_emoji"].apply(classify_category)
df["semantic_category"].value_counts()


config.json:   0%|          | 0.00/734 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

Some weights of the model checkpoint at joeddav/xlm-roberta-large-xnli were not used when initializing XLMRobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

Device set to use cpu


Unnamed: 0_level_0,count
semantic_category,Unnamed: 1_level_1
Rational positive feedback on content,326
Emotional reaction,269
Troll / non-constructive comment,263
Criticism and dissatisfaction,111
Direct address to the author,31


SENTIMENTS COUNT PER CATEGORY

In [23]:
table = pd.crosstab(df["semantic_category"], df["ld_label_06"])
table = table.reindex(CATS).fillna(0).astype(int)
table


ld_label_06,negative,neutral,positive
semantic_category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Rational positive feedback on content,39,88,199
Emotional reaction,65,54,150
Criticism and dissatisfaction,49,37,25
Direct address to the author,7,14,10
Troll / non-constructive comment,59,151,53
