# Import The Required Libraries

In [13]:
import numpy as np
import pandas as pd
import re
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
from shekar import Normalizer, WordTokenizer
from shekar.preprocessing import StopWordRemover, PunctuationRemover

### Load Datasest

In [14]:
df = pd.read_csv("cleaned.csv")

### Prepare & Perform Pre-Processing

In [15]:
normalizer = Normalizer()
pipeline = StopWordRemover() | PunctuationRemover()
tokenizer = WordTokenizer()

In [16]:
def preprocess_comment(text):
    text = "" if pd.isna(text) else str(text)
    text = normalizer(text)
    text = pipeline(text)
    tokens = list(tokenizer(text))
    return " ".join(tokens)

### Skip Some Unnecessary Cases

In [17]:
p1 = 'چیز اضافه'
p2 = 'چیزی اضافه'
pattern = f'({p1}|{p2})'

mask = ((df['comment_text'].str.contains(pattern, na=False) &(df['comment_text'].str.len() < 20))|
    (df['comment_text'].str.len() < 5))

df['comment_clean'] = np.where(mask ,df['comment_text'],df['comment_text'].astype(str).apply(preprocess_comment))

  mask = ((df['comment_text'].str.contains(pattern, na=False) &(df['comment_text'].str.len() < 20))|


### Load Model

In [18]:
MODEL = "HooshvareLab/bert-fa-base-uncased-sentiment-deepsentipers-multi"

tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

use_cuda = torch.cuda.is_available()
device = 0 if use_cuda else -1
if use_cuda:
    model = model.to("cuda")

sentiment_pipe = pipeline(
    "sentiment-analysis",
    model=model,
    tokenizer=tokenizer,
    device=device,
    truncation=True
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/747 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/62.0 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/651M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/201 [00:00<?, ?it/s]

BertForSequenceClassification LOAD REPORT from: HooshvareLab/bert-fa-base-uncased-sentiment-deepsentipers-multi
Key                          | Status     |  | 
-----------------------------+------------+--+-
bert.embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


### Execute The Model

In [46]:
texts = df.loc[~mask, 'comment_clean'].fillna("").astype(str).tolist()

batch_size = 32
results = sentiment_pipe(texts, batch_size=batch_size)

### Collect Results In A Proper Way

In [48]:
raw_labels = [r['label'] for r in results]
scores = [r.get('score', None) for r in results]

In [49]:
def map_label_to_num(raw_label):
    L = raw_label
    if "happy" in L or 'delighted' in L:
        return 1
    if 'furious' in L or 'angry' in L:
        return -1
    return 0

In [55]:
df['sentiment'] = 0
df['sentiment_score'] = 1

mapped = [map_label_to_num(l) for l in raw_labels]

df.loc[~mask, 'sentiment'] = mapped
df['sentiment'] = df['sentiment'].astype(int)

df.loc[~mask, 'sentiment_score'] = scores
df['sentiment_score'] = df['sentiment_score'].round(3)


  df.loc[~mask, 'sentiment_score'] = scores


### Show Some Samples

In [57]:
df[['comment_clean', 'sentiment', 'sentiment_score']].head(10)

Unnamed: 0,comment_clean,sentiment,sentiment_score
0,چیزی اضافه ایی نیست,0,1.0
1,درس نمی‌ده اصلا نمره هارو نمی‌ده اصلا ترم اخر ...,-1,0.469
2,دنبال استاد ادب دانشجو میخواید گزینه خوبیه,1,0.912
3,سخت‌گیر پربازده,1,0.939
4,چیزی اضافه ایی نیست,0,1.0
5,دقت کنید جلسه مطالب بخونید ارشد شب امتحانی‌نیس...,1,0.528
6,چیزی اضافه ایی نیست,0,1.0
7,اعصابی‌براتون نمی‌مونه نمره بگیرید بد ترم ۴۰ ک...,1,0.886
8,استاد هستند امتحان‌های سختی می‌گیرن اون خونده ...,1,0.988
9,سعی کنید سر کلاس گوش بدید مطالب,0,0.996


### Save Locally for Better and Faster Results

In [58]:
df.to_csv("final.csv",index=False, encoding="utf-8-sig")