# XLNet version 1

Hate speech identification project, D7047E <br>
Binary text classification task using pretrained XLNet models

In [1]:
""" 
%pip install numpy
%pip install torch
%pip install torchvision
%pip install sentencepiece
%pip install transformers
%pip install datasets
%pip install evaluate
%pip install accelerate
%pip install imbalanced-learn
"""

# PyTorch
import torch
import torchvision

# Hugging Face
from transformers import XLNetTokenizer, XLNetForSequenceClassification
from transformers import TrainingArguments, Trainer
from datasets import Dataset
import datasets
import evaluate

# Misc
from tqdm import tqdm, trange
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

# Set up loaders 
# Add the parent directory to the import path so that `import proj_dataset` can
# be resolved (presumably the module lives one level above this notebook).
import sys; sys.path.append("..")
import proj_dataset
# Raw string: the value keeps the doubled backslashes literally (Windows-style path).
proj_dataset.FOLDERSPATH = r'..\\datasets'
# NOTE(review): presumably an upper bound on rows read from the SOLID files —
# confirm against proj_dataset.
proj_dataset.SOLID_HARDLINECAP = 40_000

# Pick the compute device: CUDA when available, CPU otherwise.
if torch.cuda.is_available():
    device = torch.device("cuda")
    # Release cached GPU memory left over from earlier runs in this kernel.
    torch.cuda.empty_cache()
else:
    device = torch.device("cpu")
print(f"using {device}")

using cuda




In [2]:
# get_loaders() returns five values; only the first two are kept here.
# NOTE(review): `d` and `v` are iterated batch-wise further down (labels at index 1
# of each batch); which data split each one holds is not visible here — confirm
# against proj_dataset.
d,v,_,_,_ = proj_dataset.get_loaders()

Loaded 13240 lines from ..\\datasets\\OLID\\OLID_Tain.txt
Loaded 5852 lines from english_dataset.tsv
Loaded 1153 lines from hasoc2019_en_test-2919.tsv
Loaded 40000 lines from file_off.xlsx to ensure sufficient randomness, cap: 20000
Loaded 40000 lines from file_not.xlsx to ensure sufficient randomness, cap: 20000


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hanne\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


24891 60438 19891 24687 87417


In [3]:
# Location and file names of the two SOLID spreadsheets (offensive / not offensive).
solid_path = r'..\\datasets' + r'\\OffenseEval\\'
file_off = r'file_off.xlsx'
file_not = r'file_not.xlsx'

# Load both files with identical settings; the offensive file is read first,
# then the non-offensive one.
d_off, d_not = (
    proj_dataset._load_ds3_file(solid_path, file_name, nrows=40000, capsize=20000)
    for file_name in (file_off, file_not)
)

Loaded 40000 lines from file_off.xlsx to ensure sufficient randomness, cap: 20000
Loaded 40000 lines from file_not.xlsx to ensure sufficient randomness, cap: 20000


In [11]:
def afsdas(v, b):
    """Add the labels of one batch to the running tally ``v``.

    Args:
        v: dict mapping the string form of a label to its count; updated in place.
           Every label found in the batch must already be a key.
        b: one batch; ``b[1]`` is iterated and each element must offer ``.item()``.
    """
    batch_labels = b[1]
    for label in batch_labels:
        key = str(label.item())
        v[key] = v[key] + 1

def getTrainLabelDistrib(loader):
    """Print how many samples of class "0" and class "1" a loader yields.

    Args:
        loader: iterable of batches; each batch is handed to ``afsdas`` together
            with the running tally.
    """
    labels_dist = dict.fromkeys(("0", "1"), 0)
    for batch in loader:
        afsdas(labels_dist, batch)
    print(labels_dist)

# Print the label tally for both objects kept from get_loaders(); this relies on
# the batch-based getTrainLabelDistrib defined earlier in this cell.
getTrainLabelDistrib(d)
getTrainLabelDistrib(v)

{'0': 8000, '1': 8000}
{'0': 32000, '1': 32000}


In [6]:
# Constants and vars

# Directory handed to the Trainer as its output location.
path_output = r"trained\\"

# Reproducibility and tokenizer sizing
GLOBAL_SEED = 1337
TOKENIZER_MAX_LENGTH = 128  # 100 works with 1 epoch
torch.manual_seed(GLOBAL_SEED)

# Keyword arguments forwarded to every tokenizer call.
# Options that were tried and are currently left disabled:
#   add_special_tokens=True, return_tensors="pt", return_token_type_ids=False,
#   return_attention_mask=True, pad_to_max_length=False
tokenizer_config = dict(
    padding="max_length",
    truncation="longest_first",
    max_length=TOKENIZER_MAX_LENGTH,
)

In [7]:
# Load the tokenizer that belongs to the 'xlnet-base-cased' checkpoint.
# NOTE(review): `device` is passed through as an extra keyword argument; tokenizers
# do not normally take a device — confirm it has any effect, otherwise drop it.
tokenizer = XLNetTokenizer.from_pretrained(
    'xlnet-base-cased',
    device=device
)

def tokenize_examples(examples):
    """Tokenize the ``"text"`` entry of ``examples`` with the shared tokenizer settings.

    Args:
        examples: mapping with a ``"text"`` key (used below as the callback of a
            batched ``Dataset.map``).

    Returns:
        Whatever the tokenizer returns when called with ``tokenizer_config``.
    """
    texts = examples["text"]
    return tokenizer(texts, **tokenizer_config)

In [8]:
# Load and split dataset
def load_test_set(path=None):
    """Load an OLID-style TSV file and return it as a tokenized ``Dataset``.

    The file is read with five fixed column names (id, text, label and two unused
    columns). Because the names are supplied explicitly, the file's own header
    line arrives as data row 0 and is dropped. Labels equal to ``"OFF"`` become
    1, everything else becomes 0.

    Args:
        path: location of the TSV file. When omitted, the module-level
            ``path_tr`` is used, as before.
            NOTE(review): ``path_tr`` is not assigned in any cell visible here —
            define it or pass ``path`` explicitly.

    Returns:
        The tokenized ``Dataset`` with ``label`` and ``text`` columns plus the
        tokenizer outputs. Previously the dataset was only printed and then
        discarded, so callers had nothing to work with.
    """
    source = path_tr if path is None else path
    olid_frame = pd.read_csv(source, sep="\t", names=["id","text","label","other_1","other_2"])
    olid_frame = olid_frame.drop(axis=0, index=0) # Remove column names

    olid_dataset = Dataset.from_dict({
        "label": [1 if raw_label == "OFF" else 0 for raw_label in olid_frame["label"]],
        "text": olid_frame["text"].tolist(),
    })
    olid_dataset = olid_dataset.map(tokenize_examples, batched=True)
    print(olid_dataset)
    return olid_dataset

In [9]:
def getTrainLabelDistrib(loader):
    """Print how often each label occurs in ``loader``.

    Note: this shares its name with the batch-based variant defined in another
    cell of this notebook; whichever cell ran last decides which one is active.

    Args:
        loader: iterable of examples, each indexable with ``"label"`` and
            yielding a hashable label value.
    """
    labels_dist = {}
    for e in loader:
        lab = e["label"]
        # Count the first occurrence as 1; starting a new label at 0 made every
        # reported count one too low.
        labels_dist[lab] = labels_dist.get(lab, 0) + 1
    print(labels_dist)

In [10]:
# Print the label distribution of each OLID split.
# NOTE(review): none of olid_test / olid_train / olid_val / olid_train_os is assigned
# in the cells visible here, and the recorded run of this cell stopped with a
# NameError on olid_test. They have to be created before this cell can run.
# Presumably these calls expect the getTrainLabelDistrib variant that reads
# e["label"] — confirm.
getTrainLabelDistrib(olid_test)
getTrainLabelDistrib(olid_train)
getTrainLabelDistrib(olid_val)
getTrainLabelDistrib(olid_train_os)

NameError: name 'olid_test' is not defined

In [None]:
# Model 
# docs: https://huggingface.co/docs/transformers/model_doc/xlnet

# Load the pretrained checkpoint with a two-class classification head and
# move it to the selected device in one go.
xlnet_model = XLNetForSequenceClassification.from_pretrained(
    "xlnet-base-cased", num_labels=2
).to(device)

# Show the model configuration, then whether the first parameter sits on CUDA.
print(xlnet_model.config, next(xlnet_model.parameters()).is_cuda, sep="\n")

In [None]:
# Fine-tuning
# https://huggingface.co/docs/transformers/training

# Accuracy metric looked up by compute_model_metrics at call time.
metric = evaluate.load("accuracy")


def compute_model_metrics(eval_pred):
    """Turn an evaluation result into the metric dict reported by the Trainer.

    Args:
        eval_pred: pair ``(logits, labels)``; the class with the highest logit
            along axis 1 is taken as the prediction.

    Returns:
        The result of ``metric.compute`` for those predictions and labels.
    """
    logits, labels = eval_pred
    predicted_classes = np.argmax(logits, axis=1)
    return metric.compute(predictions=predicted_classes, references=labels)


device_batch_size = 32
grad_steps = 8
# One optimizer step consumes device_batch_size * grad_steps samples (per device);
# the extra factor 5 spreads roughly five log entries over one pass of the data.
# Clamp to at least 1: for a small dataset the floor division yields 0, and
# logging_steps=0 is not a valid step interval for TrainingArguments.
# NOTE(review): the size is taken from olid_train_os while the Trainer in this cell
# is given olid_train — confirm which dataset is intended.
calc_steps = max(1, len(olid_train_os) // (device_batch_size * grad_steps * 5))


tr_args = TrainingArguments(
    # What to run and where to write
    output_dir=path_output,
    do_train=True,
    do_eval=True,
    evaluation_strategy="epoch",
    save_strategy="no",

    # Logging / evaluation bookkeeping
    logging_steps=calc_steps,  # For larger size dataset
    eval_accumulation_steps=10,  # To prevent cuda OOM after training

    # Batch sizes and numeric precision
    per_device_train_batch_size=device_batch_size,
    per_device_eval_batch_size=device_batch_size,
    gradient_accumulation_steps=grad_steps,
    bf16=True,
    # Alternatives left disabled: fp16=True, use_cpu=True,
    # dataloader_pin_memory=True, dataloader_num_workers=8

    # Optimisation schedule
    num_train_epochs=10,
    weight_decay=0.1,
)


# Wire the model, the training arguments, the datasets and the metric callback
# into the Trainer.
# NOTE(review): olid_train and olid_val are not assigned in any cell visible here —
# make sure they exist before running this cell.
trainer = Trainer(
    model=xlnet_model,
    args=tr_args,
    train_dataset=olid_train,
    eval_dataset=olid_val,
    compute_metrics=compute_model_metrics
)
# Start fine-tuning; as the last expression of the cell, the return value is displayed.
trainer.train()

In [None]:
# Run the fine-tuned model on the test split and take the arg-max class per sample.
predictions = trainer.predict(olid_test)
print(predictions.predictions.shape, predictions.label_ids.shape) 
preds = np.argmax(predictions.predictions, axis=-1)

# Tally true vs. predicted classes; a heavily skewed prediction tally hints at a
# model that collapsed onto one class.
labels_t = {"0":0, "1":0}
labels_p = {"0":0, "1":0}
for true_label, predicted_label in zip(predictions.label_ids, preds):
    labels_t[str(true_label)] += 1
    labels_p[str(predicted_label)] += 1
print(f"True labels {labels_t}")
print(f"Predicted labels {labels_p}")

# Split the Trainer's log history into training-loss and validation-loss series.
loss_his = {"tr_loss":[], "val_loss":[]}

for e in trainer.state.log_history:
    if "loss" in e:
        loss_his["tr_loss"].append(e["loss"])
    elif "eval_loss" in e:
        loss_his["val_loss"].append(e["eval_loss"])

# Keep the combined test metrics under their own name. Rebinding the global
# `metric` would silently change what compute_model_metrics uses on any later
# trainer.evaluate()/predict() call, including a re-run of this cell.
test_metric = evaluate.combine(["accuracy", "f1", "precision", "recall", "BucketHeadP65/confusion_matrix"])
test_metric.compute(predictions=preds, references=predictions.label_ids)

In [None]:
# Training loss, one point per "loss" entry in the Trainer's log history.
# NOTE(review): those entries are written every `logging_steps`, so the x axis is
# probably a logging-step index rather than an epoch — confirm and relabel if so.
plt.plot(loss_his["tr_loss"], label="tr_loss")
plt.ylabel('loss')
plt.xlabel('epoch')
plt.ylim([0,2])
plt.legend()  # without a legend the curve label is never shown
plt.show()

# Validation loss, one point per "eval_loss" entry. The curve was previously
# labelled "tr_loss" by mistake.
plt.plot(loss_his["val_loss"], label="val_loss")
plt.ylabel('loss')
plt.xlabel('epoch')
plt.ylim([0,2])
plt.legend()
plt.show()