In [1]:
import pandas as pd

In [2]:
cleaned_kaspi_reviews = pd.read_csv('cleaned_kaspi_reviews.csv')

In [3]:
cleaned_kaspi_reviews.shape

(119048, 6)

In [4]:
kazakh_reviews = cleaned_kaspi_reviews[cleaned_kaspi_reviews['language'] == 'kazakh']

In [5]:
kazakh_reviews.value_counts('rating')

rating
5.0    5796
4.0     542
3.0     175
1.0     171
2.0      72
Name: count, dtype: int64

In [6]:
kazakh_reviews = kazakh_reviews.drop(columns=['Unnamed: 0.1', 'Unnamed: 0', 'category', 'language'])

In [7]:
kazakh_reviews['target'] = kazakh_reviews['rating'].apply(lambda x: 0 if x>=4 else 1)

In [8]:
kazakh_reviews

Unnamed: 0,rating,combined_text,target
3,5.0,иісі қатты ұнады . \n,0
24,5.0,keremet ! 48 sagatka dein ustaidy \n,0
43,5.0,"иісі өте керемет , маған унада \n",0
56,5.0,"салқын , әдемі , жағымды жұпар . бірақ иісі ...",0
92,5.0,маған ұнады ! \n,0
...,...,...,...
118821,5.0,керемет . алуға кеңес брать . микрофон жоқ . \n,0
118898,5.0,"артықшылығы : дыбыс , естілуі , формалық тү...",0
118920,5.0,бәрі жақсы . екі телефонға қосылады . қуаты те...,0
118921,4.0,алғаныма 15 күн болда . әзірге жақсы істеп тұр...,0


In [9]:
kazakh_reviews.value_counts('target')

target
0    6338
1     418
Name: count, dtype: int64

In [10]:
kazakh_reviews.iloc[0].combined_text

'иісі қатты ұнады . \n'

In [11]:
kazakh_reviews.head(1)
kazakh_reviews = kazakh_reviews.drop(columns=['rating'])
kazakh_reviews.head(1)

Unnamed: 0,combined_text,target
3,иісі қатты ұнады . \n,0


In [0]:
from sklearn.model_selection import train_test_split


train_data, test_data = train_test_split(
    kazakh_reviews, test_size=0.2, random_state=42, stratify=kazakh_reviews.target
)

In [17]:
train_data.to_csv('train_data.csv')
test_data.to_csv('test.csv')

In [14]:


train_data.reset_index(drop=True, inplace=True)
test_data.reset_index(drop=True, inplace=True)
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'
from transformers import AutoTokenizer
from transformers import RobertaModel
from torch.utils.data import Dataset, DataLoader
import torch


MAX_LEN = 256
TRAIN_BATCH_SIZE = 8
VALID_BATCH_SIZE = 4
EPOCHS = 3
LEARNING_RATE = 1e-05

tokenizer = AutoTokenizer.from_pretrained("kz-transformers/kaz-roberta-conversational")

class ReviewsData(Dataset):
    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe.combined_text
        self.targets = self.data.target
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        text = str(self.text[index])
        text = " ".join(text.split())

        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=False,
            max_length=self.max_len,
            padding=False,
            truncation=True,
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]


        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets[index], dtype=torch.long)
        }
from torch.nn.utils.rnn import pad_sequence

def collator(items):
    return {
        "ids": torch.permute(pad_sequence([i["ids"] for i in items]), (1, 0)),
        "mask": torch.permute(pad_sequence([i["mask"] for i in items]), (1, 0)),
        "token_type_ids": torch.permute(
            pad_sequence([i["token_type_ids"] for i in items]), (1, 0)
        ),
        "targets": torch.permute(pad_sequence([i["targets"].unsqueeze(0) for i in items]), (1, 0)),
    }
print("FULL Dataset: {}".format(kazakh_reviews.shape))
print("TRAIN Dataset: {}".format(train_data.shape))
print("TEST Dataset: {}".format(test_data.shape))

training_set = ReviewsData(train_data, tokenizer, MAX_LEN)
testing_set = ReviewsData(test_data, tokenizer, MAX_LEN)
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0,
                'collate_fn': collator
                }

test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': False,
                'num_workers': 0,
                'collate_fn': collator
                }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)
class RobertaClass(torch.nn.Module):
    def __init__(self):
        super(RobertaClass, self).__init__()
        self.l1 = RobertaModel.from_pretrained("kz-transformers/kaz-roberta-conversational")
        self.pre_classifier = torch.nn.Linear(768, 768)
        self.dropout = torch.nn.Dropout(0.3)
        self.classifier = torch.nn.Linear(768, 1)

    def forward(self, input_ids, attention_mask, token_type_ids):
        output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        hidden_state = output_1[0]
        pooler = hidden_state[:, 0]
        pooler = self.pre_classifier(pooler)
        pooler = torch.nn.ReLU()(pooler)
        pooler = self.dropout(pooler)
        output = self.classifier(pooler)
        return output
model = RobertaClass()
model.to(device)
# Creating the loss function and optimizer
loss_function = torch.nn.BCEWithLogitsLoss()

# for param in model.parameters():
#     param.requires_grad = False
#
# for param in model.pre_classifier.parameters():
#     param.requires_grad = True
#
# for param in model.classifier.parameters():
#     param.requires_grad = True
#
# for param in model.l1.pooler.parameters():
#     param.requires_grad = True
#
#
# optimizer = torch.optim.Adam(
#     params=list(model.pre_classifier.parameters()) + list(model.classifier.parameters()) + list(model.l1.pooler.parameters()),
#     lr=LEARNING_RATE
# )
optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

import torch
import numpy as np
from sklearn.metrics import (
    roc_auc_score,
    average_precision_score,
    f1_score,
)

def loss_fn(outputs, targets):
    return torch.nn.BCEWithLogitsLoss()(outputs, targets.float())

def train_one_epoch(model, training_loader, optimizer):
    model.train()
    epoch_loss = 0
    for n, data in enumerate(training_loader, 0):
        optimizer.zero_grad()

        ids = data['ids'].to(device, dtype=torch.long)
        mask = data['mask'].to(device, dtype=torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
        targets = data['targets'].to(device, dtype=torch.long)

        outputs = model(ids, mask, token_type_ids)

        loss = loss_fn(outputs, targets)

        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()

    return epoch_loss / (n + 1)

def validation(model, testing_loader):
    model.eval()
    epoch_loss = 0
    val_targets = []
    val_preds = []
    with torch.no_grad():
        for _, data in enumerate(testing_loader, 0):
            ids = data["ids"]
            mask = data["mask"]
            token_type_ids = data["token_type_ids"]
            targets = data["targets"]
            outputs = model(ids, mask, token_type_ids)

            loss = loss_fn(outputs, targets)

            val_targets.extend(targets.cpu().detach().numpy().tolist())
            val_preds.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())

            epoch_loss += loss.item()


    val_targets = np.concatenate(val_targets)
    val_preds = np.concatenate(val_preds)
    auc = roc_auc_score(val_targets, val_preds)
    ap = average_precision_score(val_targets, val_preds)
    f1 = f1_score(val_targets, val_preds > 0.5)


    return (
        epoch_loss / len(testing_loader),
        val_targets,
        val_preds,
        auc,
        ap,
        f1,
    )

def train_loop(model, train_loader, val_loader, optimizer, proj_config=None):
    best_valid_loss = np.inf
    max_epochs_without_improvement = 2
    epochs_without_improvement = 0

    for epoch in range(EPOCHS):

        train_loss = train_one_epoch(model, train_loader, optimizer)
        valid_loss, _, _, auc, ap, f1 = validation(model, val_loader)

        # if valid_loss < best_valid_loss:
        #     best_valid_loss = valid_loss
        #     epochs_without_improvement = 0
        #     torch.save(model.state_dict(), f"{proj_config.export_dir}_old_new/{proj_config.tag}_{proj_config.batch_size}_{proj_config.lr}")
        # else:
        #     epochs_without_improvement += 1

        if epochs_without_improvement >= max_epochs_without_improvement:
            print(f"No improvement in {max_epochs_without_improvement} epochs. Early stopping...")
            break


        print({
            "training_ep_loss": train_loss,
            "valid_ep_loss": valid_loss,
            "ROC_AUC": auc,
            "AP": ap,
            "f1": f1,
        })


    return None


FULL Dataset: (6756, 2)
TRAIN Dataset: (5404, 2)
TEST Dataset: (1352, 2)


Some weights of RobertaModel were not initialized from the model checkpoint at kz-transformers/kaz-roberta-conversational and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
train_loop(model, training_loader, testing_loader, optimizer)

{'training_ep_loss': 0.1666705386841273, 'valid_ep_loss': 0.1259079661176226, 'ROC_AUC': 0.944151832657353, 'AP': 0.6376831919386384, 'f1': 0.45161290322580644}
{'training_ep_loss': 0.11355628754933965, 'valid_ep_loss': 0.12211342206839251, 'ROC_AUC': 0.9560659831756047, 'AP': 0.6843043058896173, 'f1': 0.504201680672269}
{'training_ep_loss': 0.0742247105412841, 'valid_ep_loss': 0.14325666777960427, 'ROC_AUC': 0.9476350082619798, 'AP': 0.6237013729258315, 'f1': 0.5696202531645569}


In [24]:
model.eval()
epoch_loss = 0
val_targets = []
val_preds = []
with torch.no_grad():
    for _, data in enumerate(testing_loader, 0):
        ids = data["ids"]
        mask = data["mask"]
        token_type_ids = data["token_type_ids"]
        targets = data["targets"]
        outputs = model(ids, mask, token_type_ids)

        loss = loss_fn(outputs, targets)

        val_targets.extend(targets.cpu().detach().numpy().tolist())
        val_preds.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())

        epoch_loss += loss.item()

targets = np.array(val_targets)
outputs = np.array(val_preds)

val_targets = np.concatenate(val_targets)
val_preds = np.concatenate(val_preds)
auc = roc_auc_score(val_targets, val_preds)
ap = average_precision_score(val_targets, val_preds)
f1 = f1_score(val_targets, val_preds > 0.5)

In [25]:
f1

0.06818181818181818

In [None]:
f1_score(val_targets, val_preds > 0.5)

In [None]:
tokenizer.decode(testing_set[155]['ids'])

In [None]:
val_targets

In [None]:
val_preds = pd.DataFrame((val_preds>0.5), columns=['pred'])

In [None]:
val_preds

In [None]:
idxs = val_preds[val_preds['pred']==True].reset_index()['index'].to_list()

In [None]:
idxs

In [None]:
for i in idxs:
    print(tokenizer.decode(testing_set[i]['ids']))
    print(val_targets[i])
    print()

In [None]:
val_targets = pd.DataFrame((val_targets), columns=['label'])

In [None]:
idxs = val_targets[val_targets['label']==1].reset_index()['index'].to_list()

In [None]:
for i in idxs:
    print(tokenizer.decode(testing_set[i]['ids']))
    print(val_preds.iloc[i])
    print()