In [1]:
import pandas as pd
import numpy as np
from pandarallel import pandarallel
import pickle
import os

In [None]:
pandarallel.initialize(progress_bar=False, nb_workers=8)

## Data preparation

In [None]:
# Load the cleaned articles ('|'-separated). The `sentences` column is passed through
# `pd.eval`, i.e. every cell's text is evaluated as an expression.
# NOTE(security): only read this CSV from a trusted source.
data = pd.read_csv("../data/data_cleaned_sentences_2020-04-10.csv", sep='|', converters={'sentences': pd.eval})

In [None]:
data["title"] = data["title"].parallel_apply(lambda title: title if isinstance(title, str) else "") 
data["text"] = data.parallel_apply(lambda row: " ".join([sent for sent in [row["title"]] + row["sentences"]]), axis=1)

In [None]:
data = data.reset_index().rename(columns={"index": "id"})
data = data[["id", "text", "Label"]]

In [None]:
# Spot-check a single article. `title` and `sentences` are dropped once the frame is
# reduced to id/text/Label, so print them only while they are still present instead
# of raising AttributeError on a top-to-bottom run.
row = data.loc[19]
for col in ("title", "sentences"):
    if col in row.index:
        print(row[col])
print("--------------------")
print(row.text)
print(row.Label)

In [None]:
# Persist the reduced frame; the context manager closes the file handle even if
# pickling fails part-way.
with open("../data/data_id-text-label_2022-10-14.pkl", "wb") as pkl_file:
    pickle.dump(data, pkl_file)

## Huggingface custom dataset

In [2]:
# Reload the frame written by the data-preparation section.
# NOTE(security): unpickling can execute arbitrary code; only load pickles you created.
with open("../data/data_id-text-label_2022-10-14.pkl", "rb") as pkl_file:
    data = pickle.load(pkl_file)
data

Unnamed: 0,id,text,Label
0,0,sahte polislerin kuryesi yakalandı sahte polis...,hate
1,1,kürt üz ama hain değiliz kürtüz ama hain değil...,hate
2,2,suriyeli gelinden altın vurgunu kuyumcuda altı...,hate
3,3,mustafa nevruz sınacı mustafa nevruz sınacı lg...,hate
4,4,mustafa nevruz sınacı mustafa nevruz sınacı ya...,hate
...,...,...,...
25061,25061,amnesty ınternational ve global ahlaksızlık do...,hate
25062,25062,çanakkale asla unutulmamalı llnutturulmamalı ç...,hate
25063,25063,sömürü projesi olarak bop btp genel başkanı pr...,hate
25064,25064,doğruluş zeminimiz helali bir millet istiklali...,hate


In [53]:
import torch
from collections import Counter
from sklearn.model_selection import train_test_split
from transformers import AutoModel, AutoTokenizer, AutoModelForSequenceClassification
from datasets import load_metric

In [4]:
class HDVDataset(torch.utils.data.Dataset):
    """Torch dataset pairing tokenizer encodings with binary hate-speech labels.

    Args:
        encodings: mapping (anything exposing ``.items()``) from feature name to a
            per-example sequence; each value is indexed by example position.
        labels: iterable of label strings, each either ``"not_hate"`` or ``"hate"``.
        idxs: optional per-example ids stored as-is on the instance. Defaults to
            ``None`` so the dataset can be built from encodings and labels only.

    Raises:
        KeyError: if a label is not a key of ``label_encodings``.
    """

    def __init__(self, encodings, labels, idxs=None):
        self.label_encodings = {"not_hate": 0, "hate": 1}
        self.encodings = encodings
        self.labels = [self.label_encodings[label] for label in labels]
        # Store the argument itself (the previous self-assignment raised AttributeError).
        self.idxs = idxs

    def __getitem__(self, idx):
        """Return one example as a dict of tensors, with the class id under ``'labels'``."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Number of examples, i.e. the number of labels."""
        return len(self.labels)

In [23]:
idxs, texts, labels = list(data["id"].values), list(data["text"].values), list(data["Label"].values)

In [6]:
train_texts, val_texts, train_labels, val_labels = train_test_split(texts, labels, stratify=labels, test_size=.2, shuffle=True, random_state=17)
val_texts, test_texts, val_labels, test_labels = train_test_split(val_texts, val_labels, stratify=val_labels, test_size=.5, shuffle=True, random_state=17)

In [7]:
print(f"Training dist: {Counter(train_labels)}")
print(f"Validation dist: {Counter(val_labels)}")
print(f"Test dist: {Counter(test_labels)}")

Training dist: Counter({'hate': 10107, 'not_hate': 9945})
Validation dist: Counter({'hate': 1264, 'not_hate': 1243})
Test dist: Counter({'hate': 1263, 'not_hate': 1244})


In [54]:
tokenizer = AutoTokenizer.from_pretrained("dbmdz/bert-base-turkish-128k-uncased")

In [9]:
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
val_encodings = tokenizer(val_texts, truncation=True, padding=True)
test_encodings = tokenizer(test_texts, truncation=True, padding=True)

In [10]:
train_dataset = HDVDataset(train_encodings, train_labels)
val_dataset = HDVDataset(val_encodings, val_labels)
test_dataset = HDVDataset(test_encodings, test_labels)

## Evaluation metrics

In [72]:
prec = load_metric("precision")
rec = load_metric("recall")
acc = load_metric("accuracy")
f1 = load_metric("f1")

def compute_metrics(eval_pred):
    """Score evaluation outputs with the precision, recall, accuracy and F1 metrics.

    ``eval_pred`` unpacks into (model outputs, reference labels). The predicted
    class is the arg-max over the last axis of the outputs. Returns a single dict
    that merges whatever each metric's ``compute`` call returns.
    """
    logits, references = eval_pred
    predicted_classes = logits.argmax(axis=-1)
    scores = {}
    for metric in (prec, rec, acc, f1):
        scores.update(metric.compute(predictions=predicted_classes, references=references))
    return scores

## Huggingface models

In [12]:
%env WANDB_PROJECT=hdv_hate_speech
%env WANDB_LOG_MODEL=true
%env WANDB_WATCH=all
%env WANDB_NOTEBOOK_NAME=berturk

env: WANDB_PROJECT=hdv_hate_speech
env: WANDB_LOG_MODEL=true
env: WANDB_WATCH=all
env: WANDB_NOTEBOOK_NAME=berturk


In [74]:
import wandb
from transformers import Trainer, TrainingArguments

In [14]:
wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33mnlpboun[0m (use `wandb login --relogin` to force relogin)


True

In [15]:
results_path = "../experiments/results/"
logs_path = "../experiments/logs/"

In [16]:
model = AutoModelForSequenceClassification.from_pretrained("dbmdz/bert-base-turkish-128k-uncased", num_labels=2)

Some weights of the model checkpoint at dbmdz/bert-base-turkish-128k-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassificatio

In [17]:
# Fine-tuning configuration: evaluate every 500 steps, checkpoint every 1000 steps,
# report to Weights & Biases and reload the best checkpoint when training ends.
training_args = TrainingArguments(
    output_dir=os.path.join(results_path, "berturk_128K"),               # output directory
    num_train_epochs=2,                                                  # total number of training epochs
    per_device_train_batch_size=4,                                       # batch size per device during training
    per_device_eval_batch_size=4,                                        # batch size for evaluation
    warmup_steps=500,                                                    # number of warmup steps for learning rate scheduler
    weight_decay=0.01,                                                   # strength of weight decay
    logging_dir=os.path.join(logs_path, "berturk_128K"),                 # directory for storing logs (was pointing at results_path, leaving logs_path unused)
    logging_steps=20,
    overwrite_output_dir=True,
    do_train=True,
    do_eval=True,
    save_steps=1000,                                                     # a multiple of eval_steps
    learning_rate=1e-05,
    report_to='wandb',
    run_name="berturk_128K_uncased_lre-5",
    evaluation_strategy="steps",
    eval_steps=500,
    load_best_model_at_end=True
)

trainer = Trainer(
    model=model,                                                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                                                  # training arguments, defined above
    train_dataset=train_dataset,                                         # training dataset
    eval_dataset=val_dataset,                                            # evaluation dataset
    compute_metrics=compute_metrics                                      # precision / recall / accuracy / F1 at every evaluation
)

In [18]:
trainer.train()

***** Running training *****
  Num examples = 20052
  Num Epochs = 2
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 10026
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


Step,Training Loss,Validation Loss,Precision,Recall,Accuracy,F1
500,0.5074,0.470382,0.742204,0.84731,0.774631,0.791282
1000,0.4121,0.487355,0.869938,0.883703,0.874751,0.876766
1500,0.3315,0.4535,0.88062,0.898734,0.887515,0.889585
2000,0.2632,0.537385,0.93053,0.80538,0.87156,0.863444
2500,0.6747,0.489855,0.929856,0.818038,0.877144,0.87037
3000,0.593,0.378214,0.902478,0.893196,0.897487,0.897813
3500,0.4252,0.367395,0.888298,0.924842,0.90347,0.906202
4000,0.3721,0.365355,0.894453,0.918513,0.904268,0.906323
4500,0.5525,0.460086,0.850035,0.959652,0.894296,0.901524
5000,0.2249,0.378452,0.916599,0.89557,0.906262,0.905962


***** Running Evaluation *****
  Num examples = 2507
  Batch size = 4
***** Running Evaluation *****
  Num examples = 2507
  Batch size = 4
Saving model checkpoint to ../experiments/results/berturk_128K/checkpoint-1000
Configuration saved in ../experiments/results/berturk_128K/checkpoint-1000/config.json
Model weights saved in ../experiments/results/berturk_128K/checkpoint-1000/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 2507
  Batch size = 4
***** Running Evaluation *****
  Num examples = 2507
  Batch size = 4
Saving model checkpoint to ../experiments/results/berturk_128K/checkpoint-2000
Configuration saved in ../experiments/results/berturk_128K/checkpoint-2000/config.json
Model weights saved in ../experiments/results/berturk_128K/checkpoint-2000/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 2507
  Batch size = 4
***** Running Evaluation *****
  Num examples = 2507
  Batch size = 4
Saving model checkpoint to ../experiments/results/berturk_128K

TrainOutput(global_step=10026, training_loss=0.38614428408388934, metrics={'train_runtime': 1770.0672, 'train_samples_per_second': 22.657, 'train_steps_per_second': 5.664, 'total_flos': 1.055180576415744e+16, 'train_loss': 0.38614428408388934, 'epoch': 2.0})

## Evaluation (validation)

In [19]:
eval_results = trainer.evaluate(val_dataset)
eval_results

***** Running Evaluation *****
  Num examples = 2507
  Batch size = 4


{'eval_loss': 0.3649916350841522,
 'eval_precision': 0.9016018306636155,
 'eval_recall': 0.935126582278481,
 'eval_accuracy': 0.9158356601515756,
 'eval_f1': 0.9180582524271845,
 'eval_runtime': 22.3554,
 'eval_samples_per_second': 112.143,
 'eval_steps_per_second': 28.047,
 'epoch': 2.0}

## Testing

In [50]:
class HDVDatasetTest(torch.utils.data.Dataset):
    """Test-time dataset that keeps each example's id so predictions can be joined back.

    Args:
        texts_idxs: sequence of two-element ``(text, id)`` rows.
        labels: iterable of label strings, each either ``"not_hate"`` or ``"hate"``.
        tokenizer: callable invoked as ``tokenizer(texts, truncation=True, padding=True)``
            whose result exposes ``.items()`` with per-example sequences.

    Raises:
        KeyError: if a label is not a key of ``label_encodings``.
    """

    def __init__(self, texts_idxs, labels, tokenizer):
        self.label_encodings = {"not_hate": 0, "hate": 1}
        self.rev_label_encodings = {0: "not_hate", 1: "hate"}

        # Split the rows in plain Python. Going through np.array() coerces mixed
        # (str, int) rows given as tuples/lists to a string dtype, which silently
        # turns the ids into strings and breaks a later join on the id column.
        self.texts = [pair[0] for pair in texts_idxs]
        self.idxs = [pair[1] for pair in texts_idxs]
        self.encodings = tokenizer(self.texts, truncation=True, padding=True)
        self.labels = [self.label_encodings[label] for label in labels]
        # Filled in by the caller after prediction, one class id per example.
        self.preds = []

    def __getitem__(self, idx):
        """Return one example as a dict of tensors, with the class id under ``'labels'``."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Number of examples, i.e. the number of labels."""
        return len(self.labels)

    def _get_preds_with_idx(self):
        """Return a DataFrame with columns ``idx`` and ``prediction`` (label strings).

        ``self.preds`` must hold one class id per stored id; class ids are mapped
        back to label names through ``rev_label_encodings``.
        """
        df_preds = pd.DataFrame(data={"idx": self.idxs, "prediction": self.preds})
        df_preds["prediction"] = df_preds["prediction"].map(self.rev_label_encodings)
        return df_preds

In [50]:
texts_idxs, labels = list(data[["text", "id"]].values), list(data["Label"].values)
train_texts_idxs_2, val_texts_idxs_2, train_labels_2, val_labels_2 = train_test_split(texts_idxs, labels, stratify=labels, test_size=.2, shuffle=True, random_state=17)
val_texts_idxs_2, test_texts_idxs_2, val_labels_2, test_labels_2 = train_test_split(val_texts_idxs_2, val_labels_2, stratify=val_labels_2, test_size=.5, shuffle=True, random_state=17)

In [51]:
test_dataset_idx = HDVDatasetTest(test_texts_idxs_2, test_labels_2, tokenizer)

In [27]:
preds_dict = trainer.predict(test_dataset)
predictions = preds_dict.predictions
predictions = np.argmax(predictions, axis=1)
print(f"Preds: {predictions}\n GT's: {preds_dict.label_ids}")
print(preds_dict.metrics)

***** Running Prediction *****
  Num examples = 2507
  Batch size = 4


Preds: [1 0 1 ... 1 0 1]
 GT's: [1 0 1 ... 1 0 1]
{'test_loss': 0.42105528712272644, 'test_precision': 0.8891419893697798, 'test_recall': 0.9271575613618369, 'test_accuracy': 0.9050658157159952, 'test_f1': 0.9077519379844963, 'test_runtime': 23.0777, 'test_samples_per_second': 108.633, 'test_steps_per_second': 27.169}


In [28]:
predictions

array([1, 0, 1, ..., 1, 0, 1])

In [52]:
preds_dict_2 = trainer.predict(test_dataset_idx)
predictions_2 = preds_dict_2.predictions
predictions_2 = np.argmax(predictions_2, axis=1)
print(f"Preds: {predictions_2}\n GT's: {preds_dict_2.label_ids}")
print(preds_dict_2.metrics)

***** Running Prediction *****
  Num examples = 2507
  Batch size = 4


Preds: [1 0 1 ... 1 0 1]
 GT's: [1 0 1 ... 1 0 1]
{'test_loss': 0.42105528712272644, 'test_precision': 0.8891419893697798, 'test_recall': 0.9271575613618369, 'test_accuracy': 0.9050658157159952, 'test_f1': 0.9077519379844963, 'test_runtime': 23.2055, 'test_samples_per_second': 108.035, 'test_steps_per_second': 27.019}


In [53]:
test_dataset_idx.preds = predictions_2
df_preds = test_dataset_idx._get_preds_with_idx()

In [54]:
df_preds

Unnamed: 0,idx,prediction
0,3973,hate
1,16849,not_hate
2,9186,hate
3,3072,hate
4,10346,hate
...,...,...
2502,3672,hate
2503,18446,not_hate
2504,3657,hate
2505,19535,not_hate


## Report to Excel

In [58]:
df_label_preds = pd.merge(data, df_preds, left_on="id", right_on="idx", how="right").drop("idx", axis=1)
df_label_preds.to_excel("../outputs/labels_preds_berturk_2022-04-14.xlsx", index=False)

In [59]:
df_label_preds

Unnamed: 0,id,text,Label,prediction
0,3973,haber seyfullah koyuncu freddy mercurynin aske...,hate,hate
1,16849,kilisede hz fatıma nın doğumu kutlandı kilised...,not_hate,not_hate
2,9186,yunanlıların verdiği zararları anlatan resmi d...,hate,hate
3,3072,itı hr it serdar çalışkan yaptığı açıklamada ş...,hate,hate
4,10346,mültecileri dövüp geri gönderdiler mültecileri...,hate,hate
...,...,...,...,...
2502,3672,saitiyor su rl yıu ur bpfjmkmmiami rj fiil il ...,hate,hate
2503,18446,bu işbirllfii türkiye ye örnek oucak bu işbirl...,not_hate,not_hate
2504,3657,mersinde işlenen cinayetle ilgili suriyeli tut...,hate,hate
2505,19535,siparişle kurulan proje örgütlerdir siparişle ...,not_hate,not_hate


In [60]:
# Re-read the full cleaned CSV (all original columns) so predictions can be joined to it.
# NOTE(security): `pd.eval` evaluates the text of every `sentences` cell as an
# expression; only read this CSV from a trusted source.
data_raw = pd.read_csv("../data/data_cleaned_sentences_2020-04-10.csv", sep='|', converters={'sentences': pd.eval})

In [63]:
data_raw = data_raw.reset_index().rename(columns={"index": "id"})

In [65]:
df_data_and_preds = pd.merge(data_raw, df_preds, left_on="id", right_on="idx", how="right").drop("idx", axis=1)

In [68]:
df_data_and_preds.to_excel("../outputs/data_and_preds_2022-04-14.xlsx", index=False)

In [67]:
sum(df_data_and_preds["Label"] == df_data_and_preds["prediction"]) / df_data_and_preds.shape[0]

0.9050658157159952

In [70]:
Counter(val_labels)

Counter({'hate': 1264, 'not_hate': 1243})

In [63]:
df = pd.read_excel("../outputs/data_and_preds_2022-04-14.xlsx")

In [64]:
# Accuracy on the test split after discounting the duplicated rows.
# NOTE(review): `remove_id_from_test` is only assigned in the "Duplicate rows" section
# further down, so this cell cannot run in a fresh top-to-bottom execution.
# NOTE(review): subtracting len(remove_id_from_test) from the numerator counts every
# removed row as a correct prediction; TODO confirm, or filter `df` by id instead.
(sum(df["Label"] == df["prediction"]) - len(remove_id_from_test)) / (df.shape[0] - len(remove_id_from_test))

0.9046856227472968

## Duplicate rows

In [10]:
id_excel = pd.read_excel("../outputs/data_and_preds_2022-04-14.xlsx")["id"].values.tolist()

In [11]:
list(np.array(test_texts_idxs_2)[:, 1]) == id_excel

True

In [12]:
set.intersection(set(np.array(test_texts_idxs_2)[:, 1]), set(np.array(train_texts_idxs_2)[:, 1]))

set()

In [35]:
# Ids of rows whose `text` is duplicated, ordered by text.
# `duplicated(keep="first")` flags every copy except the first occurrence, so `first_pair`
# holds the LATER copies; `keep="last"` flags every copy except the last, so `last_pair`
# holds the EARLIER ones.
# NOTE(review): the two lists are presumably matched position by position afterwards, which
# only lines up if each duplicated text occurs exactly twice. TODO confirm.
first_pair = data[data[["text"]].duplicated(keep="first")].sort_values("text").id.values.tolist()
last_pair = data[data[["text"]].duplicated(keep="last")].sort_values("text").id.values.tolist()

In [39]:
pairs = [[first_pair[i], last_pair[i]] for i in range(len(first_pair))]
pairs

[[15058, 2247],
 [886, 855],
 [1658, 1653],
 [270, 269],
 [8027, 8026],
 [888, 887],
 [3733, 3732],
 [7759, 7758],
 [1968, 1967],
 [6104, 6095],
 [7982, 7861],
 [11343, 2715],
 [9919, 9705],
 [951, 933],
 [3124, 3117],
 [11376, 11368],
 [19910, 7489],
 [10388, 10130],
 [998, 997],
 [8034, 7955],
 [2530, 2529],
 [554, 545],
 [8926, 8925],
 [22303, 22298],
 [4438, 4432],
 [6020, 6015],
 [860, 828],
 [1243, 1141],
 [1164, 1163],
 [973, 972],
 [11053, 9867],
 [6744, 6400],
 [7632, 7631],
 [19117, 6778],
 [3607, 3590],
 [1054, 1011],
 [11955, 11945],
 [24835, 24800],
 [2110, 2109],
 [3218, 3217],
 [11342, 2714],
 [557, 539],
 [5744, 5742],
 [21625, 21619],
 [7201, 7179],
 [1804, 1788],
 [12017, 12002],
 [7984, 7862],
 [11374, 11367],
 [16850, 3637],
 [2449, 2448]]

In [40]:
train_idxs, val_idxs, test_idxs = list(np.array(train_texts_idxs_2)[:, 1]), list(np.array(val_texts_idxs_2)[:, 1]), list(np.array(test_texts_idxs_2)[:, 1])

In [48]:
def detect_duplicate_rows(train_idxs, test_idxs, pairs):
    """Report where each duplicated-text pair landed and collect held-out rows to drop.

    Args:
        train_idxs: ids of the rows in the training split.
        test_idxs: ids of the rows in the held-out split being checked (the
            messages always call it "test", whichever split is passed).
        pairs: sequence of two-element ``[id_a, id_b]`` items.

    Returns:
        tuple ``(remove_id_from_test, remove_test_indices)``: the held-out ids to
        drop, and the position of each of them in ``test_idxs`` (first occurrence),
        in the same order.
    """
    # Hash-based lookups instead of scanning the id lists for every pair.
    train_ids = set(train_idxs)
    test_positions = {}
    for position, test_id in enumerate(test_idxs):
        # Keep the first position of an id, matching list.index().
        test_positions.setdefault(test_id, position)

    remove_id_from_test = []
    for pair in pairs:
        if pair[0] in train_ids and pair[1] in train_ids:
            print(pair, " is all in train set!")
        elif pair[0] in train_ids and pair[1] in test_positions:
            print(f"{pair[0]} in train, {pair[1]} in test, please delete from test!!!")
            remove_id_from_test.append(pair[1])
        elif pair[1] in train_ids and pair[0] in test_positions:
            print(f"{pair[1]} in train, {pair[0]} in test, please delete from test!!!")
            remove_id_from_test.append(pair[0])
        elif pair[0] in test_positions and pair[1] in test_positions:
            # Both copies are held out: keep the first id of the pair, drop the second.
            print(f"{pair[0]} in test, {pair[1]} in test, please delete one of them from test!!!")
            remove_id_from_test.append(pair[1])

    remove_test_indices = [test_positions[remove_id] for remove_id in remove_id_from_test]
    return remove_id_from_test, remove_test_indices

In [49]:
# detect_duplicate_rows returns (ids, positions). Unpack both so `remove_id_from_test`
# really is the list of ids and `remove_test_indices` exists at notebook level for the
# cleaned test dataset built in the Inference section.
remove_id_from_test, remove_test_indices = detect_duplicate_rows(train_idxs, test_idxs, pairs)
remove_id_from_test

[15058, 2247]  is all in train set!
[886, 855]  is all in train set!
[1658, 1653]  is all in train set!
[270, 269]  is all in train set!
[8027, 8026]  is all in train set!
888 in train, 887 in test, please delete from test!!!
[7759, 7758]  is all in train set!
6104 in train, 6095 in test, please delete from test!!!
[11343, 2715]  is all in train set!
[9919, 9705]  is all in train set!
[951, 933]  is all in train set!
[3124, 3117]  is all in train set!
[11376, 11368]  is all in train set!
[10388, 10130]  is all in train set!
[998, 997]  is all in train set!
[8034, 7955]  is all in train set!
[2530, 2529]  is all in train set!
545 in train, 554 in test, please delete from test!!!
[8926, 8925]  is all in train set!
22303 in train, 22298 in test, please delete from test!!!
[4438, 4432]  is all in train set!
[6020, 6015]  is all in train set!
[860, 828]  is all in train set!
[1243, 1141]  is all in train set!
[1164, 1163]  is all in train set!
[973, 972]  is all in train set!
[11053, 9867] 

[887, 6095, 554, 22298, 1011, 3217, 21619, 7201, 1788, 3637]

In [52]:
# Same check for the validation split (the printed messages still say "test").
# Unpack the (ids, positions) tuple so `remove_id_from_val` is the list of ids.
remove_id_from_val, remove_val_indices = detect_duplicate_rows(train_idxs, val_idxs, pairs)
remove_id_from_val

[15058, 2247]  is all in train set!
[886, 855]  is all in train set!
[1658, 1653]  is all in train set!
[270, 269]  is all in train set!
[8027, 8026]  is all in train set!
3732 in train, 3733 in test, please delete from test!!!
[7759, 7758]  is all in train set!
1967 in train, 1968 in test, please delete from test!!!
[11343, 2715]  is all in train set!
[9919, 9705]  is all in train set!
[951, 933]  is all in train set!
[3124, 3117]  is all in train set!
[11376, 11368]  is all in train set!
19910 in train, 7489 in test, please delete from test!!!
[10388, 10130]  is all in train set!
[998, 997]  is all in train set!
[8034, 7955]  is all in train set!
[2530, 2529]  is all in train set!
[8926, 8925]  is all in train set!
[4438, 4432]  is all in train set!
[6020, 6015]  is all in train set!
[860, 828]  is all in train set!
[1243, 1141]  is all in train set!
[1164, 1163]  is all in train set!
[973, 972]  is all in train set!
[11053, 9867]  is all in train set!
[6744, 6400]  is all in train s

[3733, 1968, 7489, 6778, 539, 5742]

## Inference

In [66]:
model = AutoModelForSequenceClassification.from_pretrained("../experiments/results/berturk_128K/checkpoint-10000")

In [67]:
class HDVDatasetTest(torch.utils.data.Dataset):
    """Test-time dataset that keeps example ids and can skip selected positions.

    Args:
        texts_idxs: sequence of two-element ``(text, id)`` rows.
        labels: iterable of label strings, each either ``"not_hate"`` or ``"hate"``.
        tokenizer: callable invoked as ``tokenizer(texts, truncation=True, padding=True)``
            whose result exposes ``.items()`` with per-example sequences.
        remove_idxs: positions (not ids) in ``texts_idxs`` / ``labels`` to leave out.
            Defaults to an empty tuple, so the three-argument call used earlier in
            the notebook keeps working.

    Raises:
        KeyError: if a kept label is not a key of ``label_encodings``.
    """

    def __init__(self, texts_idxs, labels, tokenizer, remove_idxs=()):
        self.label_encodings = {"not_hate": 0, "hate": 1}
        self.rev_label_encodings = {0: "not_hate", 1: "hate"}

        # Set membership instead of scanning the removal list for every example.
        skipped = set(remove_idxs)
        # Split the rows in plain Python. Going through np.array() coerces mixed
        # (str, int) rows given as tuples/lists to a string dtype, which silently
        # turns the ids into strings.
        all_texts = [pair[0] for pair in texts_idxs]
        all_idxs = [pair[1] for pair in texts_idxs]
        print(len(all_idxs))  # size before removal
        self.texts = [text for i, text in enumerate(all_texts) if i not in skipped]
        self.idxs = [idx for i, idx in enumerate(all_idxs) if i not in skipped]
        self.encodings = tokenizer(self.texts, truncation=True, padding=True)
        self.labels = [self.label_encodings[label] for i, label in enumerate(labels) if i not in skipped]
        # Filled in by the caller after prediction, one class id per kept example.
        self.preds = []
        print(len(self.idxs))  # size after removal

    def __getitem__(self, idx):
        """Return one example as a dict of tensors, with the class id under ``'labels'``."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Number of kept examples, i.e. the number of kept labels."""
        return len(self.labels)

    def _get_preds_with_idx(self):
        """Return a DataFrame with columns ``idx`` and ``prediction`` (label strings).

        ``self.preds`` must hold one class id per kept id; class ids are mapped
        back to label names through ``rev_label_encodings``.
        """
        df_preds = pd.DataFrame(data={"idx": self.idxs, "prediction": self.preds})
        df_preds["prediction"] = df_preds["prediction"].map(self.rev_label_encodings)
        return df_preds

In [69]:
test_dataset_idx_cleaned = HDVDatasetTest(test_texts_idxs_2, test_labels_2, tokenizer, remove_test_indices)

2507
2497


In [75]:
trainer = Trainer(
    model=model,                                                         # the instantiated 🤗 Transformers model to be trained
    compute_metrics=compute_metrics
)

In [77]:
preds_dict_3 = trainer.predict(test_dataset_idx_cleaned)
predictions_3 = preds_dict_3.predictions
predictions_3 = np.argmax(predictions_3, axis=1)
print(f"Preds: {predictions_3}\n GT's: {preds_dict_3.label_ids}")
print(preds_dict_3.metrics)

***** Running Prediction *****
  Num examples = 2497
  Batch size = 8


Preds: [1 0 1 ... 1 0 1]
 GT's: [1 0 1 ... 1 0 1]
{'test_loss': 0.4224632978439331, 'test_precision': 0.8884644766997708, 'test_recall': 0.9266932270916335, 'test_accuracy': 0.9046856227472968, 'test_f1': 0.9071762870514819, 'test_runtime': 17.3161, 'test_samples_per_second': 144.201, 'test_steps_per_second': 18.076}
