In [97]:
import warnings
warnings.simplefilter('ignore')
import logging
logging.disable(logging.WARNING)

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, precision_recall_fscore_support
from sklearn.metrics import classification_report

import torch
from transformers import TrainingArguments, Trainer
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from datasets import load_metric

In [70]:
filename = "/kaggle/input/uzbek-news-preprocessed-datasets/combined_uznews_30k_preprocessed_ready_for_modeling.csv"
df = pd.read_csv(filename)
df['sentiment'] = df['sentiment'].replace({'positive': 1, 'negative': 0})
df.head()

Unnamed: 0,text,sentiment
0,milliy elektron ommaviy axborot vositalari ass...,1
1,wizzair abu dabi arzon aviakompaniyasi abu dab...,1
2,yildan beri yuvinmagan dunyoning eng kir odami...,0
3,o‘zbekiston birinchi kuzgi sovuqni kutmoqda du...,1
4,iyundan tibbiyot xodimlarining oylik maoshi os...,1


# Splitting the dataset into training and test sets for model evaluation

In [71]:
seed = 42
df, df_test = train_test_split(df, test_size=.1, random_state=seed)

In [None]:
df['sentiment'].value_counts()

## Oversampling the training data to balance classes

In [74]:
ros = RandomOverSampler()
train_x, train_y = ros.fit_resample(np.array(df['text']).reshape(-1, 1), np.array(df['sentiment']).reshape(-1, 1));
train_os = pd.DataFrame(list(zip([x[0] for x in train_x], train_y)), columns = ['text', 'sentiment']);

In [75]:
train_os['sentiment'].value_counts()

1    21427
0    21427
Name: sentiment, dtype: int64

## Creating Train, Validation, and Test Sets

In [76]:
X = train_os['text'].values
y = train_os['sentiment'].values

A validation set will be extracted from the training set to monitor the validation accuracy, and so prevent overfitting.

In [77]:
X_train, X_valid, y_train, y_valid= train_test_split(X, y, test_size=.1, stratify=train_labels, random_state=seed)

In [78]:
X_test = df_test['text'].values
y_test = df_test['sentiment'].values

In [79]:
print(f"TRAINING DATA: {X_train.shape[0]}\nVALIDATION DATA: {X_valid.shape[0]}\nTESTING DATA: {X_test.shape[0]}" )

TRAINING DATA: 38568
VALIDATION DATA: 4286
TESTING DATA: 3073


## Tokenization and Encoding

In [80]:
tokenizer = AutoTokenizer.from_pretrained('rifkat/uztext-3Gb-BPE-Roberta')

First, we check the length of the longest tokenized sentence by roberta tokenizer:

In [81]:
token_lens = []

for txt in X_train:
    tokens = tokenizer.encode(txt, max_length=512, truncation=True)
    token_lens.append(len(tokens))
max_length=np.max(token_lens)
max_length

126

In [82]:
MAX_LEN=128

In [83]:
train_encodings = tokenizer.__call__(X_train.tolist(), truncation=True, max_length=MAX_LEN, padding='max_length')
val_encodings = tokenizer.__call__(X_valid.tolist(), truncation=True, max_length=MAX_LEN, padding='max_length')
test_encodings = tokenizer.__call__(X_test.tolist(), truncation=True, max_length=MAX_LEN, padding='max_length')

## Creating PyTorch Dataset

In [84]:
class UznewsDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = UznewsDataset(train_encodings, train_labels)
val_dataset = UznewsDataset(val_encodings, val_labels)
test_dataset = UznewsDataset(test_encodings, test_labels)

# Training the model

In [85]:
model = AutoModelForSequenceClassification.from_pretrained('rifkat/uztext-3Gb-BPE-Roberta', num_labels=2)
model = model.to('cuda')
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [86]:
def compute_metrics(p):
    print(type(p))
    pred, labels = p
    pred = np.argmax(pred, axis=1)

    accuracy = accuracy_score(y_true=labels, y_pred=pred)
    recall = recall_score(y_true=labels, y_pred=pred)
    precision = precision_score(y_true=labels, y_pred=pred)
    f1 = f1_score(y_true=labels, y_pred=pred)

    return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1}

In [87]:
tokenizer = AutoTokenizer.from_pretrained('rifkat/uztext-3Gb-BPE-Roberta')

In [88]:
training_args = TrainingArguments(
    output_dir='./results',
    learning_rate=2e-5,
    num_train_epochs=3, 
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    warmup_steps=500,
    weight_decay=0.01,
    fp16=True,
    evaluation_strategy="epoch",
    save_strategy='no',
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [89]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.429,0.343378,0.85581,0.855147,0.856743,0.855944
2,0.2201,0.277223,0.908306,0.943712,0.868409,0.904496
3,0.1089,0.369431,0.919505,0.956345,0.879141,0.91612


<class 'transformers.trainer_utils.EvalPrediction'>
<class 'transformers.trainer_utils.EvalPrediction'>
<class 'transformers.trainer_utils.EvalPrediction'>


TrainOutput(global_step=7233, training_loss=0.2740413415783028, metrics={'train_runtime': 907.1983, 'train_samples_per_second': 127.54, 'train_steps_per_second': 7.973, 'total_flos': 3831751973523456.0, 'train_loss': 0.2740413415783028, 'epoch': 3.0})

# Results on Test Set

In [90]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall,
    }

preds = trainer.predict(test_dataset)
report = classification_report(y_true=preds.label_ids, y_pred=preds.predictions.argmax(-1), target_names=['Negative', 'Positive'])
metrics = compute_metrics(preds)

print('Classification Report:\n', report)
print('Metrics:\n', metrics);

<class 'transformers.trainer_utils.EvalPrediction'>
Classification Report:
               precision    recall  f1-score   support

    Negative       0.59      0.62      0.60       651
    Positive       0.90      0.89      0.89      2422

    accuracy                           0.83      3073
   macro avg       0.74      0.75      0.75      3073
weighted avg       0.83      0.83      0.83      3073

Metrics:
 {'accuracy': 0.8291571753986332, 'f1': 0.8304137994295616, 'precision': 0.8318203879991514, 'recall': 0.8291571753986332}


## Predictions on user input

In [93]:
# while True:
#     text = input("Enter a sentence to classify (or 'exit' to quit): ")
#     if text.lower() == 'exit':
#         break

# #     text = clean_text(text)
#     inputs = tokenizer(text, padding=True, truncation=True, return_tensors='pt').to('cuda')
#     outputs = model(**inputs)
#     predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)

#     label_map = {0: "negative", 1: "positive"}
#     predicted_label = label_map[predictions.argmax().item()]
#     predicted_prob = predictions[0][predictions.argmax().item()].item()

#     print(f"Predicted label: {predicted_label}")
#     print(f"Predicted probability: {predicted_prob:.4f}")

Enter a sentence to classify (or 'exit' to quit):  So‘nggi paytlarda ijtimoiy tarmoqlarda Toshkentda o‘limga olib keluvchi ruh sotuvchi ayrim ayollar haqida mish-mishlar tarqala boshladi. Foydalanuvchilarning so‘zlariga ko‘ra, bu atirlarni yutgan odamlar hushidan ketishgan va hatto o‘lib ketishgan, ayollar esa kvartiralarni talon-taroj qilishgan. Politsiya bo'limi bu xulosani rad etdi


Predicted label: negative
Predicted probability: 0.9996


Enter a sentence to classify (or 'exit' to quit):  Ota-onalar Toshkentdagi bolalar bog‘chasi mudirining o‘g‘lini kichik qizlarga nisbatan nojo‘ya xatti-harakatlarda aybladi. Mazkur holat yuzasidan jinoyat ishi qo‘zg‘atilib, tergov harakatlari olib borilmoqda


Predicted label: negative
Predicted probability: 0.9862


Enter a sentence to classify (or 'exit' to quit):  Buxoro viloyatida yana bir qizning linchilanishi sahnalashtirildi. Sochlarini oldirib, kaltaklagan.


Predicted label: negative
Predicted probability: 0.7660


Enter a sentence to classify (or 'exit' to quit):  Surxondaryo viloyatida erkak o‘z xotinini xiyonat qilgani uchun o‘z farzandlarining ko‘z o‘ngida bo‘g‘ib o‘ldirdi.


Predicted label: negative
Predicted probability: 0.9996


Enter a sentence to classify (or 'exit' to quit):  Ijtimoiy tarmoqlarda mahalliy aholidan biri Toshkent viloyati hokimi Zoir Mirzayev bilan uchrashuvda vazir bo‘lishga tayyorligini ma’lum qilgan kulgili video tarqaldi. Shu bois viloyat rahbaridan O‘zbekiston Prezidentligiga o‘z nomzodini taklif qilishni so‘raydi.


Predicted label: positive
Predicted probability: 0.9999


Enter a sentence to classify (or 'exit' to quit):  Ijtimoiy tarmoqlarda o‘smirning shafqatsizlarcha kaltaklangani aks etgan video tarqaldi. Politsiya voqeaga aloqador shaxslarni aniqlashga harakat qilmoqda.


Predicted label: negative
Predicted probability: 0.9990


Enter a sentence to classify (or 'exit' to quit):  17 yoshli bir guruh yigitlar masjid yaqinidagi ehson qutisini o‘g‘irlab ketishdi. Ular pulni xarid qilish va o'yin-kulgiga sarflashdi.


Predicted label: negative
Predicted probability: 0.9693


Enter a sentence to classify (or 'exit' to quit):  Toshkentda ayol o‘ziga benzin sepib, o‘zini yoqib yubordi. Bunga ko'cha savdosi uchun joy bo'yicha kelishmovchilik sabab bo'lgan


Predicted label: negative
Predicted probability: 0.9930


Enter a sentence to classify (or 'exit' to quit):  Rossiyaning kamazi O‘zbekistondagi kollej va texnikumlarda o‘z o‘quv dasturlarini yo‘lga qo‘yadi. Ularning bitiruvchilari Rossiya Federatsiyasiga ishlash uchun borishlari mumkin


Predicted label: positive
Predicted probability: 0.9999


Enter a sentence to classify (or 'exit' to quit):  Farg‘ona viloyatida yana bir dahshatli YTH sodir bo‘ldi — Nexia’dagi o‘smir katta tezlikda yo‘ldan chiqib ketib, ikki yosh bolani urib yubordi.


Predicted label: negative
Predicted probability: 0.9996


KeyboardInterrupt: Interrupted by user

## save the model

In [94]:
# save_path = 'latest_uzroberta_news_sentiment_fine_tuned.bin'
# torch.save(model.state_dict(), save_path)