In [1]:
!pip install emoji
!pip install fasttext
!pip uninstall numpy
!pip install "numpy<2.0"

Collecting emoji
  Downloading emoji-2.15.0-py3-none-any.whl.metadata (5.7 kB)
Downloading emoji-2.15.0-py3-none-any.whl (608 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/608.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m604.2/608.4 kB[0m [31m20.5 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m608.4/608.4 kB[0m [31m15.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: emoji
Successfully installed emoji-2.15.0
Collecting fasttext
  Downloading fasttext-0.9.3.tar.gz (73 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m73.4/73.4 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting pybind11>=2.2 (from fasttext)
  Using cached pybind11-3

In [1]:
import pandas as pd
import numpy as np
from transformers import pipeline
import emoji
import re
import regex
import fasttext

In [2]:
df = pd.read_csv("/content/youtube.csv")

In [3]:
df.sample()

Unnamed: 0,comment_id,video_id,video_url,video_category,comment_text,sentiment_label,sentiment_score,like_count,reply_count,published_at
3167,UgxEEFvtUJRpN44iJwx4AaABAg.ARpcrwu9FZjARx9F_vN8n3,eYLWMs9X0sQ,https://www.youtube.com/watch?v=eYLWMs9X0sQ,Gaming,Supper bro,neutral,0.0,1,,3 weeks ago


In [4]:
def classify_pure_junk(text):
  if text is None:
    return True
  text = text.strip()
  if(len(text) < 3):
    return True
  if re.fullmatch(r"[^\w\s]+", text):
    return True
  if re.fullmatch(r"(http\S+|www\S+)", text):
    return True
  return False

In [5]:
df["is_pure_junk"] = df["comment_text"].apply(classify_pure_junk)

In [6]:
df["is_pure_junk"].value_counts()

Unnamed: 0_level_0,count
is_pure_junk,Unnamed: 1_level_1
False,17160
True,1911


In [7]:
df =  df[df["is_pure_junk"] == False]

In [8]:
def clean_text(text):
  text = text.lower()
  text = emoji.demojize(text)
  text = re.sub(r"(.)\1{2,}", r"\1\1", text)
  text = re.sub(r"#\w+", "", text)
  text = re.sub(r"@\w+", "", text)
  text = re.sub(r"https?://[^\s]+", "", text)
  text = re.sub(r"\s+", " ", text).strip()
  return text

In [9]:
df["cleaned_text"] =  df["comment_text"].apply(clean_text)

In [10]:
hinglish_vocab = {
    # Pronouns / people
    "bhai","bhaiya","bhaii","bhaai","yaar","dost","log","public",
    "hum","tum","tu","aap","main","mein","mera","meri","tera","teri",
    "uska","uski","unka","unki","apna","apni","apne",

    # Negation / affirmation
    "haan","han","ha","nahi","nai","na","bilkul","haanji",

    # Common verbs (very important)
    "hai","hain","tha","thi","the","ho","hota","hoti","hote",
    "hua","hui","gaya","gayi","gaye","kar","karo","kiya","kiye",
    "karte","karna","jaa","ja","aaya","aayi","aaye","de","do",
    "diya","liye","lena","le","bol","bolo","dekho","dekh",

    # Question words
    "kya","kyu","kyun","kaise","kab","kahan","kaun",

    # Connectors / particles
    "toh","tho","hi","bhi","bas","sirf","aur","par","lekin",
    "matlab","waise","shayad","ab","phir","tab",

    # Quantity / degree
    "bohot","bahut","bhut","zyada","zyaada","kam","thoda","thodi",

    # Sentiment / opinion words
    "acha","accha","mast","badiya","badhiya","sahi","galat",
    "bakwas","bekar","ganda","solid","op","best","nice",

    # YouTube / social context
    "video","content","channel","creator","subscribe","subscribed",
    "like","share","comment","support","views",

    # Misc common
    "scene","wala","wali","wale","basically","actually","literally"
}

In [11]:
def contains_hindi(text):
  return bool(regex.search(r"\p{Devanagari}", text))

In [12]:
def count_hinglish_words(text):
    words = re.findall(r"\b[a-zA-Z]+\b", text.lower())
    count = 0
    for word in words:
        if word in hinglish_vocab:
            count += 1
    return count

In [13]:
lid_model = fasttext.load_model("/content/lid.176.bin")

In [14]:
def get_lid_lang(text):
  pred = lid_model.predict(text)
  lang = pred[0][0].replace("__label__","")
  confidence = pred[1][0]
  return lang, confidence

In [15]:
def get_lang(text):
    text = text.strip()
    if not text:
        return "other"

    model_lang, model_conf = get_lid_lang(text)
    has_hindi = contains_hindi(text)
    hinglish_cnt = count_hinglish_words(text)

    if has_hindi and model_lang == "hi":
        return "hi"

    if has_hindi and model_lang == "en":
        return "mixed"

    if model_lang == "en" and hinglish_cnt >= 1:
        return "hinglish"

    if model_conf < 0.6 and hinglish_cnt >= 1:
        return "hinglish"

    if model_lang == "en":
        return "english"

    return "other"

In [16]:
df["language"] = df['cleaned_text'].apply(get_lang)

In [17]:
df["lang_confidence"] = df["cleaned_text"].apply(
    lambda text : lid_model.predict(text)[1][0]
)

In [18]:
df["language"].value_counts()

Unnamed: 0_level_0,count
language,Unnamed: 1_level_1
hinglish,7613
english,6867
other,2179
hi,478
mixed,23


In [19]:
sentiment_pipe = pipeline(
    "sentiment-analysis",
    model="cardiffnlp/twitter-xlm-roberta-base-sentiment",
    # truncation=True
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/841 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

Loading weights:   0%|          | 0/201 [00:00<?, ?it/s]

XLMRobertaForSequenceClassification LOAD REPORT from: cardiffnlp/twitter-xlm-roberta-base-sentiment
Key                             | Status     |  | 
--------------------------------+------------+--+-
roberta.embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]



special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

In [None]:
sentiment_pipe.predict("Thanks for such an amazing video on this platform i am blessed")

In [20]:
def weak_label(text):
  if not text:
    return None, None
  pred = sentiment_pipe.predict(text[:552])[0]
  return pred["label"], pred["score"]

In [21]:
df[["label", "label_confidence"]] =  df["cleaned_text"].apply(
    lambda text : pd.Series(weak_label(text))
)

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


In [22]:
df["label"].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
neutral,7101
positive,5220
negative,4826


In [23]:
df["sentiment_label"].value_counts()

Unnamed: 0_level_0,count
sentiment_label,Unnamed: 1_level_1
neutral,9621
positive,5917
negative,1622


In [24]:
df.to_csv("processed-yt.csv")