In [26]:
import pandas as pd
import numpy as np
from pandarallel import pandarallel
import pickle
import os

In [27]:
pandarallel.initialize(progress_bar=False, nb_workers=8)

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


## Data preparation

In [3]:
list_converters = {'sentences': pd.eval, "special_pattern": pd.eval, "anti_hs": pd.eval, "hs_specific_verb": pd.eval, "adj_bef_keyword": pd.eval, "adj_after_keyword": pd.eval}
data = pd.read_csv("../../data/data_cleaned_sentences_phases_rules.csv", sep='|', converters=list_converters)

In [4]:
data["title"] = data["title"].parallel_apply(lambda title: title if isinstance(title, str) else "") 
data["text"] = data.parallel_apply(lambda row: " ".join([sent for sent in [row["title"]] + row["sentences"]]), axis=1)

In [5]:
row = data.loc[19]
print(row.title)
print(row.sentences)
print("--------------------")
print(row.text)
print(row.Label)

haber
['filistinde yahudi yerleşimcilerin polis korumasında mescidi aksanın avlusuna girmesi gerginliğe neden oldu', 'yahudi yerleşimcilerden oluşan kişilik grup israil polisinin koruması altında mescidi aksanın avlusuna girdi', 'bunun üzerine birgrup filistinli gönüllü kadın yahudi yerleşimcilere tepki göstererek oturma eylemi düzenledi israilli kolluk güçleri olayı protesto eden filistinlilere gerçek mermilerle saldırdı', 'filistinlilerden gerçek ise plastik mermilerle yaralanırken onlanca kişi ise atılan göz yaşartıcı gazdan etkilendi', 'israil polisinin dün sabah saatlerinden itibaren aksanın kapılarında güvenlik önlemlerini arttırdığı ifade edildi', 'öte yandan dün israil gazzede hamasm silahlı kanadı izzeddin elkassam tugayiarının eğitim alanına hava saldırısı düzenledi', 'saldırıda ölen ya da yaralanan olmadı', 'imi lifi mm buj']
--------------------
haber filistinde yahudi yerleşimcilerin polis korumasında mescidi aksanın avlusuna girmesi gerginliğe neden oldu yahudi yerleşimci

## Huggingface custom dataset

In [6]:
data["all_rules"] = data.apply(lambda row: np.array(row["special_pattern"] + [row["general_rule"]] + row["anti_hs"] + row["hs_specific_verb"] + row["adj_bef_keyword"] + row["adj_after_keyword"]).astype(np.float32), axis=1)

In [7]:
data.loc[0, "all_rules"].shape

(26,)

In [8]:
import torch
from collections import Counter
from sklearn.model_selection import train_test_split
from transformers import AutoModel, AutoTokenizer, AutoModelForSequenceClassification
from datasets import load_metric

  from .autonotebook import tqdm as notebook_tqdm


In [9]:
class HDVDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels, idxs, all_rules):
        self.label_encodings = {"not_hate": 0, "hate": 1}
        self.rev_label_encodings = {0: "not_hate", 1: "hate"}
        self.encodings = encodings
        self.labels = [self.label_encodings[label] for label in labels]
        self.idxs = idxs
        self.rules = all_rules

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        item['rules'] = torch.tensor(self.rules[idx])
        return item

    def __len__(self):
        return len(self.labels)

In [10]:
tokenizer = AutoTokenizer.from_pretrained("dbmdz/bert-base-turkish-128k-uncased")
dataset_dict = {}
for phase in ["train", "val", "test"]:
    idxs = list(data.loc[data["phase"] == phase, "id"].values)
    texts = list(data.loc[data["phase"] == phase, "text"].values)
    labels = list(data.loc[data["phase"] == phase, "Label"].values)
    all_rules = list(data.loc[data["phase"] == phase, "all_rules"].values)
    
    encodings = tokenizer(texts, truncation=True, padding=True)
    dataset = HDVDataset(encodings, labels, idxs, all_rules)
    dataset_dict[phase] = dataset

In [11]:
print(f"Training dist: {Counter(dataset_dict['train'].labels)}")
print(f"Validation dist: {Counter(dataset_dict['val'].labels)}")
print(f"Test dist: {Counter(dataset_dict['test'].labels)}")

Training dist: Counter({1: 10074, 0: 9944})
Validation dist: Counter({1: 1257, 0: 1243})
Test dist: Counter({1: 1255, 0: 1242})


## Evaluation metrics

In [28]:
prec = load_metric("precision")
rec = load_metric("recall")
acc = load_metric("accuracy")
f1 = load_metric("f1")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = predictions.argmax(axis=-1)
    result = {}
    for mtrc in [prec, rec, acc, f1]:
        mtrc_result = mtrc.compute(predictions=predictions, references=labels)
        result.update(mtrc_result)
    return result

## Huggingface models

In [29]:
from torch import nn
from tqdm.auto import tqdm
from transformers.modeling_outputs import SequenceClassifierOutput
from torch.utils.data import DataLoader
from transformers import AutoConfig, AutoModel
from torch.optim import AdamW
from torch.optim.lr_scheduler import ReduceLROnPlateau

In [14]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
num_epochs = 2
batch_size = 4
checkpoint = "dbmdz/bert-base-turkish-128k-uncased"

In [34]:
results_path = "../experiments_berturk_weighted_drop015/results/"

In [31]:
class HSRuleModel(nn.Module):
    def __init__(self, checkpoint, num_labels, rule_dimension=None): 
        super(HSRuleModel, self).__init__() 
        self.num_labels = num_labels
        self.rule_dimension = rule_dimension
        
        self.relu = nn.ReLU()
        #Load Model with given checkpoint and extract its body
        self.model = AutoModel.from_pretrained(checkpoint, config=AutoConfig.from_pretrained(checkpoint, output_attentions=True, output_hidden_states=True))
        self.dropout1 = nn.Dropout(0.2, inplace=False) 
        self.classifier1 = nn.Linear(768, 128) # load and initialize weights
        self.dropout2 = nn.Dropout(0.2, inplace=False) 
        self.classifier2 = nn.Linear(128, 2) # load and initialize weights
        
        #self.weighter1 = nn.Linear(768 * 8, 768 * 2)
        #self.w_dropout1 = nn.Dropout(0.2, inplace=False) 
        self.weighter2 = nn.Linear(768 * 2, 2)
        self.w_dropout2 = nn.Dropout(0.2, inplace=False) 
        self.softmax = nn.Softmax(dim=1)
        
    def forward(self, input_ids=None, token_type_ids=None, attention_mask=None, labels=None, rules=None):
        #Extract outputs from the body
        outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)

        sequence_output = self.dropout1(outputs[0]) #outputs[0]=last hidden state
        #sequence_output = sequence_output[:, 0, :].view(-1, 768)
        sequence_output = sequence_output[:, :2, :]
        weights = self.weighter2(sequence_output.contiguous().view(sequence_output.shape[0], -1))
        weights = self.w_dropout2(self.relu(weights))
        #weights = self.weighter2(weights)
        #weights = self.w_dropout2(self.relu(weights))
        weights = torch.unsqueeze(self.softmax(weights), dim=2)
        
        sequence_output = torch.mean(sequence_output * weights, dim=1)
        sequence_output = torch.squeeze(sequence_output, 1)
        
        #output_with_rules = torch.cat((sequence_output, rules), dim=1)
        output = self.relu(self.classifier1(sequence_output)) # calculate losses
        output = self.dropout2(output)
        #output = torch.cat((output, rules), dim=1)
        logits = self.classifier2(output)
        
        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        return SequenceClassifierOutput(loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions)

In [17]:
train_loader = DataLoader(dataset_dict["train"], batch_size=batch_size, shuffle=True)
val_loader = DataLoader(dataset_dict["val"], batch_size=batch_size, shuffle=True)

In [18]:
model = HSRuleModel(checkpoint, num_labels=2, rule_dimension=26).to(device)
optimizer = AdamW(model.parameters(), lr=1e-05)
lr_scheduler = ReduceLROnPlateau(optimizer, 'max', patience=2, min_lr=1e-09, factor=0.5, verbose=True)

Some weights of the model checkpoint at dbmdz/bert-base-turkish-128k-uncased were not used when initializing BertModel: ['cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [19]:
metric_df = pd.DataFrame(columns=["epoch", "step", "F1", "Accuracy", "Precision", "Recall"])

best_val_f1 = 0.
for epoch in range(num_epochs):
    model.train()
    for i, batch in enumerate(train_loader):
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        optimizer.zero_grad()

        if i % 500 == 0 and i != 0:
            os.makedirs(results_path, exist_ok=True)
            #torch.save({
            #        'epoch': epoch,
            #        'steps': i,
            #        'model_state_dict': model.state_dict(),
            #        'optimizer_state_dict': optimizer.state_dict()
            #    }, 
            #    os.path.join(results_path, f"steps_{i}.pth")
            #)

            model.eval()
            for batch in val_loader:
                batch = {k: v.to(device) for k, v in batch.items()}
                with torch.no_grad():
                    outputs = model(**batch)

                logits = outputs.logits
                predictions = torch.argmax(logits, dim=-1)
                f1.add_batch(predictions=predictions, references=batch["labels"])
                acc.add_batch(predictions=predictions, references=batch["labels"])
                rec.add_batch(predictions=predictions, references=batch["labels"])
                prec.add_batch(predictions=predictions, references=batch["labels"])
                
            model.train()
            
            step_f1_score = f1.compute()['f1']
            metric_df = metric_df.append({
                    "epoch": epoch,
                    "step": i,
                    "Precision": prec.compute()['precision'],
                    "Recall": rec.compute()['recall'],
                    "Accuracy": acc.compute()['accuracy'],
                    "F1": step_f1_score
                },
                ignore_index = True
            )
            display(metric_df)
            if step_f1_score > best_val_f1:
                best_val_f1 = step_f1_score
                
                torch.save({
                        'epoch': epoch,
                        'steps': i,
                        'model_state_dict': model.state_dict(),
                        'optimizer_state_dict': optimizer.state_dict()
                    }, 
                    os.path.join(results_path, f"best_model.pth")
                )
            lr_scheduler.step(step_f1_score)

  metric_df = metric_df.append({


Unnamed: 0,epoch,step,F1,Accuracy,Precision,Recall
0,0.0,500.0,0.872878,0.8652,0.829986,0.920446


  metric_df = metric_df.append({


Unnamed: 0,epoch,step,F1,Accuracy,Precision,Recall
0,0.0,500.0,0.872878,0.8652,0.829986,0.920446
1,0.0,1000.0,0.884226,0.8764,0.835694,0.938743


  metric_df = metric_df.append({


Unnamed: 0,epoch,step,F1,Accuracy,Precision,Recall
0,0.0,500.0,0.872878,0.8652,0.829986,0.920446
1,0.0,1000.0,0.884226,0.8764,0.835694,0.938743
2,0.0,1500.0,0.899297,0.8968,0.882759,0.916468


  metric_df = metric_df.append({


Unnamed: 0,epoch,step,F1,Accuracy,Precision,Recall
0,0.0,500.0,0.872878,0.8652,0.829986,0.920446
1,0.0,1000.0,0.884226,0.8764,0.835694,0.938743
2,0.0,1500.0,0.899297,0.8968,0.882759,0.916468
3,0.0,2000.0,0.901333,0.8964,0.864766,0.94113


  metric_df = metric_df.append({


Unnamed: 0,epoch,step,F1,Accuracy,Precision,Recall
0,0.0,500.0,0.872878,0.8652,0.829986,0.920446
1,0.0,1000.0,0.884226,0.8764,0.835694,0.938743
2,0.0,1500.0,0.899297,0.8968,0.882759,0.916468
3,0.0,2000.0,0.901333,0.8964,0.864766,0.94113
4,0.0,2500.0,0.903715,0.8984,0.863143,0.94829


  metric_df = metric_df.append({


Unnamed: 0,epoch,step,F1,Accuracy,Precision,Recall
0,0.0,500.0,0.872878,0.8652,0.829986,0.920446
1,0.0,1000.0,0.884226,0.8764,0.835694,0.938743
2,0.0,1500.0,0.899297,0.8968,0.882759,0.916468
3,0.0,2000.0,0.901333,0.8964,0.864766,0.94113
4,0.0,2500.0,0.903715,0.8984,0.863143,0.94829
5,0.0,3000.0,0.905997,0.9072,0.923204,0.889419


  metric_df = metric_df.append({


Unnamed: 0,epoch,step,F1,Accuracy,Precision,Recall
0,0.0,500.0,0.872878,0.8652,0.829986,0.920446
1,0.0,1000.0,0.884226,0.8764,0.835694,0.938743
2,0.0,1500.0,0.899297,0.8968,0.882759,0.916468
3,0.0,2000.0,0.901333,0.8964,0.864766,0.94113
4,0.0,2500.0,0.903715,0.8984,0.863143,0.94829
5,0.0,3000.0,0.905997,0.9072,0.923204,0.889419
6,0.0,3500.0,0.887819,0.8788,0.830332,0.953858


  metric_df = metric_df.append({


Unnamed: 0,epoch,step,F1,Accuracy,Precision,Recall
0,0.0,500.0,0.872878,0.8652,0.829986,0.920446
1,0.0,1000.0,0.884226,0.8764,0.835694,0.938743
2,0.0,1500.0,0.899297,0.8968,0.882759,0.916468
3,0.0,2000.0,0.901333,0.8964,0.864766,0.94113
4,0.0,2500.0,0.903715,0.8984,0.863143,0.94829
5,0.0,3000.0,0.905997,0.9072,0.923204,0.889419
6,0.0,3500.0,0.887819,0.8788,0.830332,0.953858
7,0.0,4000.0,0.913249,0.912,0.905395,0.921241


  metric_df = metric_df.append({


Unnamed: 0,epoch,step,F1,Accuracy,Precision,Recall
0,0.0,500.0,0.872878,0.8652,0.829986,0.920446
1,0.0,1000.0,0.884226,0.8764,0.835694,0.938743
2,0.0,1500.0,0.899297,0.8968,0.882759,0.916468
3,0.0,2000.0,0.901333,0.8964,0.864766,0.94113
4,0.0,2500.0,0.903715,0.8984,0.863143,0.94829
5,0.0,3000.0,0.905997,0.9072,0.923204,0.889419
6,0.0,3500.0,0.887819,0.8788,0.830332,0.953858
7,0.0,4000.0,0.913249,0.912,0.905395,0.921241
8,0.0,4500.0,0.907995,0.9084,0.917208,0.898966


  metric_df = metric_df.append({


Unnamed: 0,epoch,step,F1,Accuracy,Precision,Recall
0,0.0,500.0,0.872878,0.8652,0.829986,0.920446
1,0.0,1000.0,0.884226,0.8764,0.835694,0.938743
2,0.0,1500.0,0.899297,0.8968,0.882759,0.916468
3,0.0,2000.0,0.901333,0.8964,0.864766,0.94113
4,0.0,2500.0,0.903715,0.8984,0.863143,0.94829
5,0.0,3000.0,0.905997,0.9072,0.923204,0.889419
6,0.0,3500.0,0.887819,0.8788,0.830332,0.953858
7,0.0,4000.0,0.913249,0.912,0.905395,0.921241
8,0.0,4500.0,0.907995,0.9084,0.917208,0.898966
9,0.0,5000.0,0.915493,0.9136,0.900693,0.930788


  metric_df = metric_df.append({


Unnamed: 0,epoch,step,F1,Accuracy,Precision,Recall
0,0.0,500.0,0.872878,0.8652,0.829986,0.920446
1,0.0,1000.0,0.884226,0.8764,0.835694,0.938743
2,0.0,1500.0,0.899297,0.8968,0.882759,0.916468
3,0.0,2000.0,0.901333,0.8964,0.864766,0.94113
4,0.0,2500.0,0.903715,0.8984,0.863143,0.94829
5,0.0,3000.0,0.905997,0.9072,0.923204,0.889419
6,0.0,3500.0,0.887819,0.8788,0.830332,0.953858
7,0.0,4000.0,0.913249,0.912,0.905395,0.921241
8,0.0,4500.0,0.907995,0.9084,0.917208,0.898966
9,0.0,5000.0,0.915493,0.9136,0.900693,0.930788


  metric_df = metric_df.append({


Unnamed: 0,epoch,step,F1,Accuracy,Precision,Recall
0,0.0,500.0,0.872878,0.8652,0.829986,0.920446
1,0.0,1000.0,0.884226,0.8764,0.835694,0.938743
2,0.0,1500.0,0.899297,0.8968,0.882759,0.916468
3,0.0,2000.0,0.901333,0.8964,0.864766,0.94113
4,0.0,2500.0,0.903715,0.8984,0.863143,0.94829
5,0.0,3000.0,0.905997,0.9072,0.923204,0.889419
6,0.0,3500.0,0.887819,0.8788,0.830332,0.953858
7,0.0,4000.0,0.913249,0.912,0.905395,0.921241
8,0.0,4500.0,0.907995,0.9084,0.917208,0.898966
9,0.0,5000.0,0.915493,0.9136,0.900693,0.930788


  metric_df = metric_df.append({


Unnamed: 0,epoch,step,F1,Accuracy,Precision,Recall
0,0.0,500.0,0.872878,0.8652,0.829986,0.920446
1,0.0,1000.0,0.884226,0.8764,0.835694,0.938743
2,0.0,1500.0,0.899297,0.8968,0.882759,0.916468
3,0.0,2000.0,0.901333,0.8964,0.864766,0.94113
4,0.0,2500.0,0.903715,0.8984,0.863143,0.94829
5,0.0,3000.0,0.905997,0.9072,0.923204,0.889419
6,0.0,3500.0,0.887819,0.8788,0.830332,0.953858
7,0.0,4000.0,0.913249,0.912,0.905395,0.921241
8,0.0,4500.0,0.907995,0.9084,0.917208,0.898966
9,0.0,5000.0,0.915493,0.9136,0.900693,0.930788


  metric_df = metric_df.append({


Unnamed: 0,epoch,step,F1,Accuracy,Precision,Recall
0,0.0,500.0,0.872878,0.8652,0.829986,0.920446
1,0.0,1000.0,0.884226,0.8764,0.835694,0.938743
2,0.0,1500.0,0.899297,0.8968,0.882759,0.916468
3,0.0,2000.0,0.901333,0.8964,0.864766,0.94113
4,0.0,2500.0,0.903715,0.8984,0.863143,0.94829
5,0.0,3000.0,0.905997,0.9072,0.923204,0.889419
6,0.0,3500.0,0.887819,0.8788,0.830332,0.953858
7,0.0,4000.0,0.913249,0.912,0.905395,0.921241
8,0.0,4500.0,0.907995,0.9084,0.917208,0.898966
9,0.0,5000.0,0.915493,0.9136,0.900693,0.930788


  metric_df = metric_df.append({


Unnamed: 0,epoch,step,F1,Accuracy,Precision,Recall
0,0.0,500.0,0.872878,0.8652,0.829986,0.920446
1,0.0,1000.0,0.884226,0.8764,0.835694,0.938743
2,0.0,1500.0,0.899297,0.8968,0.882759,0.916468
3,0.0,2000.0,0.901333,0.8964,0.864766,0.94113
4,0.0,2500.0,0.903715,0.8984,0.863143,0.94829
5,0.0,3000.0,0.905997,0.9072,0.923204,0.889419
6,0.0,3500.0,0.887819,0.8788,0.830332,0.953858
7,0.0,4000.0,0.913249,0.912,0.905395,0.921241
8,0.0,4500.0,0.907995,0.9084,0.917208,0.898966
9,0.0,5000.0,0.915493,0.9136,0.900693,0.930788


  metric_df = metric_df.append({


Unnamed: 0,epoch,step,F1,Accuracy,Precision,Recall
0,0.0,500.0,0.872878,0.8652,0.829986,0.920446
1,0.0,1000.0,0.884226,0.8764,0.835694,0.938743
2,0.0,1500.0,0.899297,0.8968,0.882759,0.916468
3,0.0,2000.0,0.901333,0.8964,0.864766,0.94113
4,0.0,2500.0,0.903715,0.8984,0.863143,0.94829
5,0.0,3000.0,0.905997,0.9072,0.923204,0.889419
6,0.0,3500.0,0.887819,0.8788,0.830332,0.953858
7,0.0,4000.0,0.913249,0.912,0.905395,0.921241
8,0.0,4500.0,0.907995,0.9084,0.917208,0.898966
9,0.0,5000.0,0.915493,0.9136,0.900693,0.930788


Epoch 00016: reducing learning rate of group 0 to 5.0000e-06.


  metric_df = metric_df.append({


Unnamed: 0,epoch,step,F1,Accuracy,Precision,Recall
0,0.0,500.0,0.872878,0.8652,0.829986,0.920446
1,0.0,1000.0,0.884226,0.8764,0.835694,0.938743
2,0.0,1500.0,0.899297,0.8968,0.882759,0.916468
3,0.0,2000.0,0.901333,0.8964,0.864766,0.94113
4,0.0,2500.0,0.903715,0.8984,0.863143,0.94829
5,0.0,3000.0,0.905997,0.9072,0.923204,0.889419
6,0.0,3500.0,0.887819,0.8788,0.830332,0.953858
7,0.0,4000.0,0.913249,0.912,0.905395,0.921241
8,0.0,4500.0,0.907995,0.9084,0.917208,0.898966
9,0.0,5000.0,0.915493,0.9136,0.900693,0.930788


  metric_df = metric_df.append({


Unnamed: 0,epoch,step,F1,Accuracy,Precision,Recall
0,0.0,500.0,0.872878,0.8652,0.829986,0.920446
1,0.0,1000.0,0.884226,0.8764,0.835694,0.938743
2,0.0,1500.0,0.899297,0.8968,0.882759,0.916468
3,0.0,2000.0,0.901333,0.8964,0.864766,0.94113
4,0.0,2500.0,0.903715,0.8984,0.863143,0.94829
5,0.0,3000.0,0.905997,0.9072,0.923204,0.889419
6,0.0,3500.0,0.887819,0.8788,0.830332,0.953858
7,0.0,4000.0,0.913249,0.912,0.905395,0.921241
8,0.0,4500.0,0.907995,0.9084,0.917208,0.898966
9,0.0,5000.0,0.915493,0.9136,0.900693,0.930788


  metric_df = metric_df.append({


Unnamed: 0,epoch,step,F1,Accuracy,Precision,Recall
0,0.0,500.0,0.872878,0.8652,0.829986,0.920446
1,0.0,1000.0,0.884226,0.8764,0.835694,0.938743
2,0.0,1500.0,0.899297,0.8968,0.882759,0.916468
3,0.0,2000.0,0.901333,0.8964,0.864766,0.94113
4,0.0,2500.0,0.903715,0.8984,0.863143,0.94829
5,0.0,3000.0,0.905997,0.9072,0.923204,0.889419
6,0.0,3500.0,0.887819,0.8788,0.830332,0.953858
7,0.0,4000.0,0.913249,0.912,0.905395,0.921241
8,0.0,4500.0,0.907995,0.9084,0.917208,0.898966
9,0.0,5000.0,0.915493,0.9136,0.900693,0.930788


  metric_df = metric_df.append({


Unnamed: 0,epoch,step,F1,Accuracy,Precision,Recall
0,0.0,500.0,0.872878,0.8652,0.829986,0.920446
1,0.0,1000.0,0.884226,0.8764,0.835694,0.938743
2,0.0,1500.0,0.899297,0.8968,0.882759,0.916468
3,0.0,2000.0,0.901333,0.8964,0.864766,0.94113
4,0.0,2500.0,0.903715,0.8984,0.863143,0.94829
5,0.0,3000.0,0.905997,0.9072,0.923204,0.889419
6,0.0,3500.0,0.887819,0.8788,0.830332,0.953858
7,0.0,4000.0,0.913249,0.912,0.905395,0.921241
8,0.0,4500.0,0.907995,0.9084,0.917208,0.898966
9,0.0,5000.0,0.915493,0.9136,0.900693,0.930788


Epoch 00020: reducing learning rate of group 0 to 2.5000e-06.


In [28]:
metric_df # drop02

Unnamed: 0,epoch,step,F1,Accuracy,Precision,Recall
0,0.0,500.0,0.858177,0.8432,0.786994,0.943516
1,0.0,1000.0,0.868186,0.8704,0.888426,0.848846
2,0.0,1500.0,0.868164,0.8756,0.92922,0.814638
3,0.0,2000.0,0.899765,0.8976,0.885891,0.914081
4,0.0,2500.0,0.901464,0.9004,0.89685,0.906126
5,0.0,3000.0,0.910161,0.9084,0.897833,0.922832
6,0.0,3500.0,0.907989,0.9028,0.866329,0.953858
7,0.0,4000.0,0.906851,0.9032,0.878449,0.937152
8,0.0,4500.0,0.903202,0.8972,0.857654,0.953858
9,0.0,5000.0,0.905493,0.9064,0.919606,0.891806


In [28]:
metric_df #.to_csv(os.path.join(results_path, "try_2_metrics.csv"), index=False) # try2

Unnamed: 0,step,F1,Accuracy,Precision,Recall
0,500.0,0.844279,0.8492,0.878007,0.813047
1,1000.0,0.880531,0.8704,0.820619,0.949881
2,1500.0,0.901244,0.8984,0.881369,0.922037
3,2000.0,0.901408,0.8964,0.864234,0.941925
4,2500.0,0.89644,0.8976,0.911934,0.881464
5,3000.0,0.908307,0.9064,0.894981,0.922037
6,3500.0,0.907225,0.906,0.90047,0.914081
7,4000.0,0.903487,0.9048,0.921423,0.886237
8,4500.0,0.89695,0.9,0.93071,0.865553
9,5000.0,0.9128,0.9128,0.91794,0.907717


In [26]:
metric_df # try1

Unnamed: 0,step,F1,Accuracy,Precision,Recall
0,500.0,0.648876,0.7,0.788396,0.551313
1,1000.0,0.788177,0.7764,0.752533,0.827367
2,1500.0,0.832146,0.8304,0.828211,0.836118
3,2000.0,0.861423,0.852,0.813871,0.914877
4,2500.0,0.845734,0.8532,0.896613,0.800318
5,3000.0,0.874606,0.8728,0.867084,0.882259
6,3500.0,0.881712,0.8784,0.862909,0.901352
7,4000.0,0.873874,0.8768,0.900422,0.848846
8,4500.0,0.875187,0.8664,0.825229,0.931583
9,5000.0,0.882604,0.8788,0.860272,0.906126


## Prediction results with rules

In [24]:
checkpoint_best = torch.load(os.path.join(results_path, "best_model.pth"))
model_best = HSRuleModel(checkpoint, num_labels=2, rule_dimension=26).to(device)
model_best.load_state_dict(checkpoint_best['model_state_dict'])
model.train()

test_dataloader = DataLoader(dataset_dict["test"], batch_size=batch_size, shuffle=False)
all_predictions = []
for batch in tqdm(test_dataloader):
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    f1.add_batch(predictions=predictions, references=batch["labels"])
    acc.add_batch(predictions=predictions, references=batch["labels"])
    rec.add_batch(predictions=predictions, references=batch["labels"])
    prec.add_batch(predictions=predictions, references=batch["labels"])
    all_predictions.extend(predictions.cpu().tolist())
test_set_dict = {
    "Precision": prec.compute()['precision'],
    "Recall": rec.compute()['recall'],
    "Accuracy": acc.compute()['accuracy'],
    "F1": f1.compute()['f1']
}

Some weights of the model checkpoint at dbmdz/bert-base-turkish-128k-uncased were not used when initializing BertModel: ['cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|██████████| 625/625 [00:19<00:00, 31.61it/s]


In [25]:
test_set_dict # with weighting - middle step

{'Precision': 0.8953040800615858,
 'Recall': 0.9266932270916335,
 'Accuracy': 0.9086904285142171,
 'F1': 0.9107282693813624}

In [23]:
test_set_dict

{'Precision': 0.8771535580524344,
 'Recall': 0.9330677290836653,
 'Accuracy': 0.9006808169803765,
 'F1': 0.9042471042471042}

In [27]:
test_set_dict # big model double rule weighting drop 0.2 pth

{'Precision': 0.8925556408288565,
 'Recall': 0.9266932270916335,
 'Accuracy': 0.907088506207449,
 'F1': 0.9093041438623924}

In [22]:
test_set_dict # big model double rule weighting pth

{'Precision': 0.9059011164274322,
 'Recall': 0.9051792828685259,
 'Accuracy': 0.9050861033239888,
 'F1': 0.9055400557991231}

In [18]:
test_set_dict # big model double rule best model pth

{'Precision': 0.906885758998435,
 'Recall': 0.9235059760956176,
 'Accuracy': 0.9138966760112135,
 'F1': 0.9151204105803395}

In [21]:
test_set_dict # big model with rule

{'Precision': 0.906396255850234,
 'Recall': 0.9258964143426295,
 'Accuracy': 0.9146976371645975,
 'F1': 0.916042569964525}

In [21]:
test_set_dict 

{'Precision': 0.9091633466135458,
 'Recall': 0.9091633466135458,
 'Accuracy': 0.9086904285142171,
 'F1': 0.9091633466135458}

In [41]:
test_set_dict

{'Precision': 0.88558352402746,
 'Recall': 0.9250996015936255,
 'Accuracy': 0.9022827392871445,
 'F1': 0.9049103663289166}

In [19]:
df_pred_rule = pd.DataFrame(data={"id": dataset_dict["test"].idxs, "Label": dataset_dict["test"].labels, "pred": all_predictions})
df_pred_rule["Label"] = df_pred_rule["Label"].map(dataset_dict["test"].rev_label_encodings)
df_pred_rule["pred"] = df_pred_rule["pred"].map(dataset_dict["test"].rev_label_encodings)
sum(df_pred_rule["Label"] == df_pred_rule["pred"]) / df_pred_rule.shape[0]

0.9138966760112135

In [21]:
sum(df_pred_rule["Label"] == df_pred_rule["pred"]) - sum(df_pred_baseline["Label"] == df_pred_baseline["pred"]) 

29

In [20]:
df_pred_baseline = pd.read_csv("../../data/berturk_baseline_preds_test_set.csv", sep="|")

In [22]:
df_all_preds = pd.merge(df_pred_rule, df_pred_baseline.drop("Label", axis=1), on="id", how="inner", suffixes=["_rule", "_baseline"])

In [23]:
df_all_preds[df_all_preds["pred_rule"] != df_all_preds["pred_baseline"]]

Unnamed: 0,id,Label,pred_rule,pred_baseline,text
17,134,hate,hate,not_hate,suriyeli qassapa yıl türk sanıklara beraat ıı ...
60,442,hate,hate,not_hate,ben yahudiyim dedi konu mankeni obama ben yahu...
78,625,hate,not_hate,hate,kj ta dr canay umunç yarim bırakılan hikayeler...
118,1108,hate,hate,not_hate,vatanın mı var senin mehmet akarca hürriyet ah...
141,1336,hate,not_hate,hate,agah oktay güner one minute gerçeği yeni bir o...
...,...,...,...,...,...
2463,24719,not_hate,hate,not_hate,hakkımızı almak istiyoruz hakkımızı almak isti...
2480,24909,hate,hate,not_hate,bilinçli siyasetle terör biter mustafa miyasog...
2482,24945,hate,hate,not_hate,transseksüel eşini travestiyle aldattı transse...
2483,24947,hate,hate,not_hate,pkklıya geberdi diyelim pkkhya geberdi diyelim...


In [24]:
df_all_preds.to_csv("../../data/berturk_baseline_and_rules_preds_test_set_big_rule_model_91389.csv", sep="|", index=False)

In [32]:
list_converters = {'sentences': pd.eval, "special_pattern": pd.eval, "anti_hs": pd.eval, "hs_specific_verb": pd.eval, "adj_bef_keyword": pd.eval, "adj_after_keyword": pd.eval}
ukrayna_data = pd.read_csv("../../data/data_cleaned_sentences_rules_ukrayna.csv", sep='|', converters=list_converters)
ukrayna_data["title"] = ukrayna_data["title"].parallel_apply(lambda title: title if isinstance(title, str) else "") 
ukrayna_data["text"] = ukrayna_data.parallel_apply(lambda row: " ".join([sent for sent in [row["title"]] + row["sentences"]]), axis=1)
ukrayna_data["all_rules"] = ukrayna_data.apply(lambda row: np.array(row["special_pattern"] + [row["general_rule"]] + row["anti_hs"] + row["hs_specific_verb"] + row["adj_bef_keyword"] + row["adj_after_keyword"]).astype(np.float32), axis=1)

phase = "test"
ukrayna_test_idxs = list(ukrayna_data.loc[ukrayna_data["phase"] == phase, "id"].values)
ukrayna_test_texts = list(ukrayna_data.loc[ukrayna_data["phase"] == phase, "text"].values)
ukrayna_test_labels = list(ukrayna_data.loc[ukrayna_data["phase"] == phase, "Label"].values)
ukrayna_test_all_rules = list(ukrayna_data.loc[ukrayna_data["phase"] == phase, "all_rules"].values)

tokenizer = AutoTokenizer.from_pretrained("dbmdz/bert-base-turkish-128k-uncased")
ukrayna_test_encodings = tokenizer(ukrayna_test_texts, truncation=True, padding=True)
ukrayna_test_dataset = HDVDataset(ukrayna_test_encodings, ukrayna_test_labels, ukrayna_test_idxs, ukrayna_test_all_rules)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [35]:
checkpoint_best = torch.load(os.path.join(results_path, "best_model.pth"))
model_best = HSRuleModel(checkpoint, num_labels=2, rule_dimension=26).to(device)
model_best.load_state_dict(checkpoint_best['model_state_dict'])
model_best.eval()

ukrayna_test_dataloader = DataLoader(ukrayna_test_dataset, batch_size=batch_size, shuffle=False)
all_predictions = []
for batch in tqdm(ukrayna_test_dataloader):
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model_best(**batch)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    f1.add_batch(predictions=predictions, references=batch["labels"])
    acc.add_batch(predictions=predictions, references=batch["labels"])
    rec.add_batch(predictions=predictions, references=batch["labels"])
    prec.add_batch(predictions=predictions, references=batch["labels"])
    all_predictions.extend(predictions.cpu().tolist())
ukrayna_test_set_dict = {
    "Precision": prec.compute()['precision'],
    "Recall": rec.compute()['recall'],
    "Accuracy": acc.compute()['accuracy'],
    "F1": f1.compute()['f1']
}

Some weights of the model checkpoint at dbmdz/bert-base-turkish-128k-uncased were not used when initializing BertModel: ['cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|██████████| 8/8 [00:00<00:00, 35.31it/s]


In [36]:
ukrayna_test_set_dict

{'Precision': 1.0,
 'Recall': 0.8666666666666667,
 'Accuracy': 0.9333333333333333,
 'F1': 0.9285714285714286}

In [17]:
ukrayna_test_set_dict

{'Precision': 1.0,
 'Recall': 0.9333333333333333,
 'Accuracy': 0.9666666666666667,
 'F1': 0.9655172413793104}

In [18]:
df_pred_ukrayna_rule = pd.DataFrame(data={"id": ukrayna_test_dataset.idxs, "Label": ukrayna_test_dataset.labels, "pred": all_predictions})
df_pred_ukrayna_rule["Label"] = df_pred_ukrayna_rule["Label"].map(ukrayna_test_dataset.rev_label_encodings)
df_pred_ukrayna_rule["pred"] = df_pred_ukrayna_rule["pred"].map(ukrayna_test_dataset.rev_label_encodings)
print(sum(df_pred_ukrayna_rule["Label"] == df_pred_ukrayna_rule["pred"]) / df_pred_ukrayna_rule.shape[0])

df_pred_ukrayna_baseline = pd.read_csv("../../data/berturk_baseline_preds_ukrayna_test_set.csv", sep="|")
df_all_ukrayna_preds = pd.merge(df_pred_ukrayna_rule, df_pred_ukrayna_baseline.drop("Label", axis=1), on="id", how="inner", suffixes=["_rule", "_baseline"])

0.9666666666666667


In [19]:
df_all_ukrayna_preds

Unnamed: 0,id,Label,pred_rule,pred_baseline,text
0,0,hate,hate,hate,rus işgali mutlaka durdurulmalı rus işgali mut...
1,1,hate,hate,hate,ruslar barbarca saldırıyor uydu görüntüleri yü...
2,2,hate,hate,hate,rus yayılmacılığı putinle hortladı rus yayılm ...
3,3,hate,hate,hate,harkivde rusların ilerlemesi sürüyor harkivde ...
4,4,hate,hate,hate,ruslar kievde ingiliz gazetecilere ateş açtı r...
5,5,hate,hate,not_hate,ruslar ilerliyor ukrayna direniyor ruslar iler...
6,6,hate,hate,not_hate,ruslar starokostiantyniv üssünü vurdu ruslar s...
7,7,hate,hate,hate,direnme teslim ol ukrayna murat özer murat dir...
8,8,hate,hate,hate,ruslar çocukları hastanede esir aldı ruslar co...
9,9,hate,hate,hate,ruslar abdli gazeteciyi öldürdü ruslar abdli g...


In [20]:
df_all_ukrayna_preds.to_csv("../../data/berturk_baseline_and_rules_preds_ukrayna_test_set.csv", sep="|", index=False)

In [55]:
model

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(128000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elemen

In [20]:
training_args = TrainingArguments(
    output_dir=os.path.join(results_path, "berturk_128K_baseline"),               # output directory
    num_train_epochs=2,                                                  # total number of training epochs
    per_device_train_batch_size=4,                                       # batch size per device during training
    per_device_eval_batch_size=4,                                        # batch size for evaluation
    warmup_steps=500,                                                    # number of warmup steps for learning rate scheduler
    weight_decay=0.01,                                                   # strength of weight decay
    logging_dir=os.path.join(results_path, "berturk_128K_baseline"),              # directory for storing logs
    logging_steps=20,
    overwrite_output_dir=True,
    do_train=True,
    do_eval=True,
    save_steps=1000,
    learning_rate=1e-05,
    run_name="berturk_baseline_128K_uncased_lre-5",
    evaluation_strategy="steps",
    eval_steps=500,
    load_best_model_at_end=True
)

trainer = Trainer(
    model=model,                                                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                                                  # training arguments, defined above
    train_dataset=train_dataset,                                         # training dataset
    eval_dataset=val_dataset,                                            # evaluation dataset
    compute_metrics=compute_metrics
)

In [21]:
trainer.train()

***** Running training *****
  Num examples = 20018
  Num Epochs = 2
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 10010
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
[34m[1mwandb[0m: Currently logged in as: [33matifemre[0m ([33mnlpboun[0m). Use [1m`wandb login --relogin`[0m to force relogin


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Step,Training Loss,Validation Loss,Precision,Recall,Accuracy,F1
500,0.6075,0.470783,0.764359,0.825776,0.7844,0.793881
1000,0.3553,0.389471,0.867442,0.890215,0.8764,0.878681
1500,0.2569,0.47633,0.87027,0.896579,0.8808,0.883229
2000,0.3364,0.452168,0.831586,0.946698,0.8768,0.885417
2500,0.2499,0.445796,0.858083,0.933174,0.8888,0.894055
3000,0.407,0.363237,0.893397,0.893397,0.8928,0.893397
3500,0.3747,0.380949,0.86099,0.94113,0.894,0.899278
4000,0.3078,0.378997,0.915226,0.884646,0.9008,0.899676
4500,0.3537,0.377014,0.870944,0.939539,0.8996,0.903942
5000,0.4162,0.342529,0.918219,0.902148,0.9104,0.910112


***** Running Evaluation *****
  Num examples = 2500
  Batch size = 4
***** Running Evaluation *****
  Num examples = 2500
  Batch size = 4
Saving model checkpoint to ../experiments_berturk_baseline/results/berturk_128K_baseline/checkpoint-1000
Configuration saved in ../experiments_berturk_baseline/results/berturk_128K_baseline/checkpoint-1000/config.json
Model weights saved in ../experiments_berturk_baseline/results/berturk_128K_baseline/checkpoint-1000/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 2500
  Batch size = 4
***** Running Evaluation *****
  Num examples = 2500
  Batch size = 4
Saving model checkpoint to ../experiments_berturk_baseline/results/berturk_128K_baseline/checkpoint-2000
Configuration saved in ../experiments_berturk_baseline/results/berturk_128K_baseline/checkpoint-2000/config.json
Model weights saved in ../experiments_berturk_baseline/results/berturk_128K_baseline/checkpoint-2000/pytorch_model.bin
***** Running Evaluation *****
  Num examples 

TrainOutput(global_step=10010, training_loss=0.38251381393198247, metrics={'train_runtime': 1725.6669, 'train_samples_per_second': 23.2, 'train_steps_per_second': 5.801, 'total_flos': 1.053391421239296e+16, 'train_loss': 0.38251381393198247, 'epoch': 2.0})

## Evaluation (validation)

In [22]:
eval_results = trainer.evaluate(val_dataset)
eval_results

***** Running Evaluation *****
  Num examples = 2500
  Batch size = 4


{'eval_loss': 0.34252944588661194,
 'eval_precision': 0.9182186234817814,
 'eval_recall': 0.9021479713603818,
 'eval_accuracy': 0.9104,
 'eval_f1': 0.9101123595505618,
 'eval_runtime': 21.9222,
 'eval_samples_per_second': 114.04,
 'eval_steps_per_second': 28.51,
 'epoch': 2.0}

## Testing

In [23]:
preds_dict = trainer.predict(test_dataset)
predictions = preds_dict.predictions
predictions = np.argmax(predictions, axis=1)
print(f"Preds: {predictions}\n GT's: {preds_dict.label_ids}")
print(preds_dict.metrics)

***** Running Prediction *****
  Num examples = 2497
  Batch size = 4


Preds: [0 1 1 ... 1 1 1]
 GT's: [1 1 1 ... 1 1 1]
{'test_loss': 0.39748504757881165, 'test_precision': 0.9060240963855422, 'test_recall': 0.8988047808764941, 'test_accuracy': 0.9022827392871445, 'test_f1': 0.9024, 'test_runtime': 22.2395, 'test_samples_per_second': 112.278, 'test_steps_per_second': 28.103}


In [27]:
data

Unnamed: 0,id,date,pub_name,type,title,content,Label,sentences,text,phase
0,0,05 Ekim 2015 Pazartesi,akşam,ulusal,sahte polislerin kuryesi yakalandı,sahte polislerin kuryesi yakalandı antalya'da ...,hate,[sahte polislerin kuryesi yakalandı antalyada ...,sahte polislerin kuryesi yakalandı sahte polis...,train
1,1,11 Eylül 2015 Cuma,akşam,ulusal,kürt üz ama hain değiliz,kürt'üz ama hain değiliz suriye sınırında devr...,hate,[kürtüz ama hain değiliz suriye sınırında devr...,kürt üz ama hain değiliz kürtüz ama hain değil...,test
2,2,25 Eylül 2015 Cuma,akşam,ulusal,suriyeli gelinden altın vurgunu,kuyumcuda altın alırken fotoğraf çektirdi. sur...,hate,"[kuyumcuda altın alırken fotoğraf çektirdi, su...",suriyeli gelinden altın vurgunu kuyumcuda altı...,test
3,3,07 Eylül 2015 Pazartesi,anayurt,ulusal,mustafa nevruz sınacı,"mustafa nevruz sınacı lgercek. abd'li yahudi, ...",hate,"[mustafa nevruz sınacı lgercek, abdli yahudi b...",mustafa nevruz sınacı mustafa nevruz sınacı lg...,train
4,4,21 Eylül 2015 Pazartesi,anayurt,ulusal,mustafa nevruz sınacı,"mustafa nevruz sınacı yazıyor, gercek. abd'li ...",hate,"[mustafa nevruz sınacı yazıyor gercek, abdli y...",mustafa nevruz sınacı mustafa nevruz sınacı ya...,train
...,...,...,...,...,...,...,...,...,...,...
25010,25061,02 Mayıs 2014 Cuma,yeni asya,hepsi,amnesty ınternational ve global ahlaksızlık,doğu veya batı s. bulut@saidnursi. de amnesty ...,hate,"[doğu veya batı, de amnesty ınternational ve g...",amnesty ınternational ve global ahlaksızlık do...,train
25011,25062,26 Mart 2014 Çarşamba,yeni konya,hepsi,çanakkale asla unutulmamalı llnutturulmamalı,"çanakkale, asla unutulmamalı, llnutturulmamalı...",hate,[çanakkale asla unutulmamalı llnutturulmamalı ...,çanakkale asla unutulmamalı llnutturulmamalı ç...,val
25012,25063,04 Nisan 2014 Cuma,yeni mesaj,hepsi,sömürü projesi olarak bop,btp genel başkanı prof. dr. haydar bas ın kale...,hate,"[btp genel başkanı prof, haydar bas ın kalemin...",sömürü projesi olarak bop btp genel başkanı pr...,train
25013,25064,24 Şubat 2014 Pazartesi,yeni mesaj,hepsi,doğruluş zeminimiz helali bir millet istiklali...,prof. dr. nurullah çetin doğruluş zeminimiz: '...,hate,[nurullah çetin doğruluş zeminimiz helali bir ...,doğruluş zeminimiz helali bir millet istiklali...,train


In [28]:
df_pred = pd.DataFrame(data={"id": test_dataset.idxs, "Label": test_dataset.labels, "pred": predictions})
df_pred = pd.merge(df_pred, data[["id", "text"]], how="left", on="id")
df_pred

Unnamed: 0,id,Label,pred,text
0,1,1,0,kürt üz ama hain değiliz kürtüz ama hain değil...
1,2,1,1,suriyeli gelinden altın vurgunu kuyumcuda altı...
2,11,1,1,sabahattin önkibar daha politika günlüğü sabah...
3,15,1,1,kafkasya nın bitmeyen sorunu yukarı karabağ ce...
4,26,1,1,sözde kürt özde kripto ermeni sözde kürt özde ...
...,...,...,...,...
2492,25018,1,1,elin gavuru kadar olamayan islamcılar elin gav...
2493,25035,1,1,müslümanları katleden haçlı zihniyetinin noel ...
2494,25044,1,1,sen sağ ben selamet sen yoluna ben yoluma it s...
2495,25049,1,1,yahudinin bir ayağı türkiyede yahudinin bir ay...


In [30]:
df_pred.to_csv("../../data/berturk_baseline_preds_test_set.csv", sep="|", index=False)

In [31]:
ukrayna_data = pd.read_csv("../../data/data_cleaned_sentences_ukrayna.csv", sep='|', converters={'sentences': pd.eval})
ukrayna_data["title"] = ukrayna_data["title"].parallel_apply(lambda title: title if isinstance(title, str) else "") 
ukrayna_data["text"] = ukrayna_data.parallel_apply(lambda row: " ".join([sent for sent in [row["title"]] + row["sentences"]]), axis=1)

ukrayna_test_idxs, ukrayna_test_texts, ukrayna_test_labels = list(ukrayna_data.loc[ukrayna_data["phase"] == "test", "id"].values), list(ukrayna_data.loc[ukrayna_data["phase"] == "test", "text"].values), list(ukrayna_data.loc[ukrayna_data["phase"] == "test", "Label"].values)
ukrayna_test_encodings = tokenizer(ukrayna_test_texts, truncation=True, padding=True)
ukrayna_test_dataset = HDVDataset(ukrayna_test_encodings, ukrayna_test_labels, ukrayna_test_idxs)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [33]:
ukrayna_preds_dict = trainer.predict(ukrayna_test_dataset)
ukrayna_predictions = ukrayna_preds_dict.predictions
ukrayna_predictions = np.argmax(ukrayna_predictions, axis=1)
print(f"Preds: {ukrayna_predictions}\n GT's: {ukrayna_preds_dict.label_ids}")
print(ukrayna_preds_dict.metrics)

***** Running Prediction *****
  Num examples = 30
  Batch size = 4


Preds: [1 1 1 1 1 0 0 1 1 1 1 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 GT's: [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
{'test_loss': 0.37824195623397827, 'test_precision': 1.0, 'test_recall': 0.8, 'test_accuracy': 0.9, 'test_f1': 0.888888888888889, 'test_runtime': 0.2473, 'test_samples_per_second': 121.297, 'test_steps_per_second': 32.346}


In [34]:
df_pred_ukrayna = pd.DataFrame(data={"id": ukrayna_test_dataset.idxs, "Label": ukrayna_test_dataset.labels, "pred": ukrayna_predictions})
df_pred_ukrayna = pd.merge(df_pred_ukrayna, ukrayna_data[["id", "text"]], how="left", on="id")
df_pred_ukrayna

Unnamed: 0,id,Label,pred,text
0,0,1,1,rus işgali mutlaka durdurulmalı rus işgali mut...
1,1,1,1,ruslar barbarca saldırıyor uydu görüntüleri yü...
2,2,1,1,rus yayılmacılığı putinle hortladı rus yayılm ...
3,3,1,1,harkivde rusların ilerlemesi sürüyor harkivde ...
4,4,1,1,ruslar kievde ingiliz gazetecilere ateş açtı r...
5,5,1,0,ruslar ilerliyor ukrayna direniyor ruslar iler...
6,6,1,0,ruslar starokostiantyniv üssünü vurdu ruslar s...
7,7,1,1,direnme teslim ol ukrayna murat özer murat dir...
8,8,1,1,ruslar çocukları hastanede esir aldı ruslar co...
9,9,1,1,ruslar abdli gazeteciyi öldürdü ruslar abdli g...


In [35]:
df_pred_ukrayna.to_csv("../../data/berturk_baseline_preds_ukrayna_test_set.csv", sep="|", index=False)

In [50]:
class HDVDatasetTest(torch.utils.data.Dataset):
    def __init__(self, texts_idxs, labels, tokenizer):
        self.label_encodings = {"not_hate": 0, "hate": 1}
        self.rev_label_encodings = {0: "not_hate", 1: "hate"}
        
        self.texts, self.idxs = list(np.array(texts_idxs)[:, 0]), list(np.array(texts_idxs)[:, 1])
        self.encodings = tokenizer(self.texts, truncation=True, padding=True)
        self.labels = [self.label_encodings[label] for label in labels]
        self.preds = []

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)
    
    def _get_preds_with_idx(self):
        df_preds = pd.DataFrame(data={"idx": self.idxs, "prediction": self.preds})
        df_preds["prediction"] = df_preds["prediction"].map(self.rev_label_encodings)
        return df_preds

In [50]:
texts_idxs, labels = list(data[["text", "id"]].values), list(data["Label"].values)
train_texts_idxs_2, val_texts_idxs_2, train_labels_2, val_labels_2 = train_test_split(texts_idxs, labels, stratify=labels, test_size=.2, shuffle=True, random_state=17)
val_texts_idxs_2, test_texts_idxs_2, val_labels_2, test_labels_2 = train_test_split(val_texts_idxs_2, val_labels_2, stratify=val_labels_2, test_size=.5, shuffle=True, random_state=17)

In [51]:
test_dataset_idx = HDVDatasetTest(test_texts_idxs_2, test_labels_2, tokenizer)

In [27]:
preds_dict = trainer.predict(test_dataset)
predictions = preds_dict.predictions
predictions = np.argmax(predictions, axis=1)
print(f"Preds: {predictions}\n GT's: {preds_dict.label_ids}")
print(preds_dict.metrics)

***** Running Prediction *****
  Num examples = 2507
  Batch size = 4


Preds: [1 0 1 ... 1 0 1]
 GT's: [1 0 1 ... 1 0 1]
{'test_loss': 0.42105528712272644, 'test_precision': 0.8891419893697798, 'test_recall': 0.9271575613618369, 'test_accuracy': 0.9050658157159952, 'test_f1': 0.9077519379844963, 'test_runtime': 23.0777, 'test_samples_per_second': 108.633, 'test_steps_per_second': 27.169}


In [28]:
predictions

array([1, 0, 1, ..., 1, 0, 1])

In [52]:
preds_dict_2 = trainer.predict(test_dataset_idx)
predictions_2 = preds_dict_2.predictions
predictions_2 = np.argmax(predictions_2, axis=1)
print(f"Preds: {predictions_2}\n GT's: {preds_dict_2.label_ids}")
print(preds_dict_2.metrics)

***** Running Prediction *****
  Num examples = 2507
  Batch size = 4


Preds: [1 0 1 ... 1 0 1]
 GT's: [1 0 1 ... 1 0 1]
{'test_loss': 0.42105528712272644, 'test_precision': 0.8891419893697798, 'test_recall': 0.9271575613618369, 'test_accuracy': 0.9050658157159952, 'test_f1': 0.9077519379844963, 'test_runtime': 23.2055, 'test_samples_per_second': 108.035, 'test_steps_per_second': 27.019}


In [53]:
test_dataset_idx.preds = predictions_2
df_preds = test_dataset_idx._get_preds_with_idx()

In [54]:
df_preds

Unnamed: 0,idx,prediction
0,3973,hate
1,16849,not_hate
2,9186,hate
3,3072,hate
4,10346,hate
...,...,...
2502,3672,hate
2503,18446,not_hate
2504,3657,hate
2505,19535,not_hate


## Report to csv

In [58]:
df_label_preds = pd.merge(data, df_preds, left_on="id", right_on="idx", how="right").drop("idx", axis=1)
df_label_preds.to_excel("../outputs/labels_preds_berturk_2022-04-14.xlsx", index=False)

In [59]:
df_label_preds

Unnamed: 0,id,text,Label,prediction
0,3973,haber seyfullah koyuncu freddy mercurynin aske...,hate,hate
1,16849,kilisede hz fatıma nın doğumu kutlandı kilised...,not_hate,not_hate
2,9186,yunanlıların verdiği zararları anlatan resmi d...,hate,hate
3,3072,itı hr it serdar çalışkan yaptığı açıklamada ş...,hate,hate
4,10346,mültecileri dövüp geri gönderdiler mültecileri...,hate,hate
...,...,...,...,...
2502,3672,saitiyor su rl yıu ur bpfjmkmmiami rj fiil il ...,hate,hate
2503,18446,bu işbirllfii türkiye ye örnek oucak bu işbirl...,not_hate,not_hate
2504,3657,mersinde işlenen cinayetle ilgili suriyeli tut...,hate,hate
2505,19535,siparişle kurulan proje örgütlerdir siparişle ...,not_hate,not_hate


In [60]:
data_raw = pd.read_csv("../data/data_cleaned_sentences_2020-04-10.csv", sep='|', converters={'sentences': pd.eval})

In [63]:
data_raw = data_raw.reset_index().rename(columns={"index": "id"})

In [65]:
df_data_and_preds = pd.merge(data_raw, df_preds, left_on="id", right_on="idx", how="right").drop("idx", axis=1)

In [68]:
df_data_and_preds.to_excel("../outputs/data_and_preds_2022-04-14.xlsx", index=False)

In [67]:
sum(df_data_and_preds["Label"] == df_data_and_preds["prediction"]) / df_data_and_preds.shape[0]

0.9050658157159952

In [70]:
Counter(val_labels)

Counter({'hate': 1264, 'not_hate': 1243})

In [63]:
df = pd.read_excel("../outputs/data_and_preds_2022-04-14.xlsx")

In [64]:
(sum(df["Label"] == df["prediction"]) - len(remove_id_from_test)) / (df.shape[0] - len(remove_id_from_test))

0.9046856227472968

## Duplicate rows

In [10]:
id_excel = pd.read_excel("../outputs/data_and_preds_2022-04-14.xlsx")["id"].values.tolist()

In [11]:
list(np.array(test_texts_idxs_2)[:, 1]) == id_excel

True

In [12]:
set.intersection(set(np.array(test_texts_idxs_2)[:, 1]), set(np.array(train_texts_idxs_2)[:, 1]))

set()

In [48]:
first_pair = data[data[["text"]].duplicated(keep="first")].sort_values("text").id.values.tolist()
last_pair = data[data[["text"]].duplicated(keep="last")].sort_values("text").id.values.tolist()

In [49]:
pairs = [[first_pair[i], last_pair[i]] for i in range(len(first_pair))]
pairs

[]

In [40]:
train_idxs, val_idxs, test_idxs = list(np.array(train_texts_idxs_2)[:, 1]), list(np.array(val_texts_idxs_2)[:, 1]), list(np.array(test_texts_idxs_2)[:, 1])

In [50]:
def detect_duplicate_rows(train_idxs, test_idxs, pairs):
    remove_id_from_test = []
    for pair in pairs:
        if pair[0] in train_idxs and pair[1] in train_idxs:
            print(pair, " is all in train set!")
        elif pair[0] in train_idxs and pair[1] in test_idxs:
            print(f"{pair[0]} in train, {pair[1]} in test, please delete from test!!!")
            remove_id_from_test.append(pair[1])
        elif pair[1] in train_idxs and pair[0] in test_idxs:
            print(f"{pair[1]} in train, {pair[0]} in test, please delete from test!!!")
            remove_id_from_test.append(pair[0])
        elif pair[0] in test_idxs and pair[1] in test_idxs:
            print(f"{pair[0]} in test, {pair[1]} in test, please delete one of them from test!!!")
            remove_id_from_test.append(pair[1])
    remove_test_indices = []
    for remove_id in remove_id_from_test:
        remove_test_indices.append(test_idxs.index(remove_id))
    remove_test_indices
    return remove_id_from_test, remove_test_indices

In [51]:
remove_id_from_test = detect_duplicate_rows(train_idxs, test_idxs, pairs)
remove_id_from_test

([], [])

In [52]:
remove_id_from_val = detect_duplicate_rows(train_idxs, val_idxs, pairs)
remove_id_from_val

([], [])

## Inference

In [66]:
model = AutoModelForSequenceClassification.from_pretrained("../experiments/results/berturk_128K/checkpoint-10000")

In [67]:
class HDVDatasetTest(torch.utils.data.Dataset):
    def __init__(self, texts_idxs, labels, tokenizer, remove_idxs):
        self.label_encodings = {"not_hate": 0, "hate": 1}
        self.rev_label_encodings = {0: "not_hate", 1: "hate"}
        
        self.texts, self.idxs = list(np.array(texts_idxs)[:, 0]), list(np.array(texts_idxs)[:, 1])
        print(len(self.idxs))
        self.texts = [text for i, text in enumerate(self.texts) if i not in remove_idxs]
        self.idxs = [idx for i, idx in enumerate(self.idxs) if i not in remove_idxs]
        self.encodings = tokenizer(self.texts, truncation=True, padding=True)
        self.labels = [self.label_encodings[label] for i, label in enumerate(labels) if i not in remove_idxs]
        self.preds = []
        print(len(self.idxs))
        
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)
    
    def _get_preds_with_idx(self):
        df_preds = pd.DataFrame(data={"idx": self.idxs, "prediction": self.preds})
        df_preds["prediction"] = df_preds["prediction"].map(self.rev_label_encodings)
        return df_preds

In [69]:
test_dataset_idx_cleaned = HDVDatasetTest(test_texts_idxs_2, test_labels_2, tokenizer, remove_test_indices)

2507
2497


In [75]:
trainer = Trainer(
    model=model,                                                         # the instantiated 🤗 Transformers model to be trained
    compute_metrics=compute_metrics
)

In [77]:
preds_dict_3 = trainer.predict(test_dataset_idx_cleaned)
predictions_3 = preds_dict_3.predictions
predictions_3 = np.argmax(predictions_3, axis=1)
print(f"Preds: {predictions_3}\n GT's: {preds_dict_3.label_ids}")
print(preds_dict_3.metrics)

***** Running Prediction *****
  Num examples = 2497
  Batch size = 8


Preds: [1 0 1 ... 1 0 1]
 GT's: [1 0 1 ... 1 0 1]
{'test_loss': 0.4224632978439331, 'test_precision': 0.8884644766997708, 'test_recall': 0.9266932270916335, 'test_accuracy': 0.9046856227472968, 'test_f1': 0.9071762870514819, 'test_runtime': 17.3161, 'test_samples_per_second': 144.201, 'test_steps_per_second': 18.076}
