# XLNet version 1

Hate speech identification project, D7047E <br>
Binary text classification task using pretrained XLNet models

In [19]:
""" 
%pip install numpy
%pip install torch
%pip install torchvision
%pip install sentencepiece
%pip install transformers
%pip install datasets
%pip install evaluate
%pip install accelerate
"""

# PyTorch
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim.lr_scheduler import LambdaLR
import torchvision

# Hugging Face
from transformers import XLNetConfig, XLNetTokenizer, XLNetForSequenceClassification
from transformers import TrainingArguments, Trainer
from datasets import Dataset
import evaluate

# Misc
from tqdm import tqdm, trange
import sentencepiece as spm
import pandas as pd
import numpy as np
import os

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(f"using {device}")

using cuda:0


In [20]:
# Constants and vars

# aux files
path_tr = r"..\\OLID_Tain.txt" 
path_te_a = r"..\\OLID_TEST.txt" 
path_te_b = r"..\\OLID_TEST_B_ATUSER_URL_EmojiRemoved_Pedro.txt" 
path_te_c = r"..\\OLID_TEST_C_ATUSER_URL_EmojiRemoved_Pedro.txt" 
path_spm = r"data\\proj_xlnet" # +.model / +.vocab
path_output = r"trained\\"

# Constants
GLOBAL_SEED = 1337
TOKENIZER_MAX_LENGTH = 128

tokenizer_config = {
    "padding": "max_length", 
    "truncation": "longest_first",
    "max_length": TOKENIZER_MAX_LENGTH,
}

In [21]:
tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')

def tokenize_examples(examples):
    return tokenizer(examples["text"], **tokenizer_config)

def getDatasetElement(i):
    # Not nice to use globals for this, I know. 2LZY2FIX
    global olid_dataset
    return olid_dataset[i]["text"], olid_dataset[i]["label"]

In [22]:
# Load and split dataset
olid_dataset = pd.read_csv(path_tr, sep="\t", names=["id","text","label","other_1","other_2"])
olid_dataset = olid_dataset.drop(axis=0, index=0) # Remove column names
olid_dataset = olid_dataset.drop(axis=1, labels=["id", "other_1", "other_2"]).to_dict()
for i in trange(len(olid_dataset["label"])):
    olid_dataset["label"][i+1] = 1 if olid_dataset["label"][i+1] == "OFF" else 0 

olid_dataset = {
    "label": [y for y in olid_dataset["label"].values()],
    "text": [x for x in olid_dataset["text"].values()]     
}

olid_dataset = Dataset.from_dict(olid_dataset)
olid_dataset = olid_dataset.map(tokenize_examples, batched=True)
print(olid_dataset)

100%|██████████| 13240/13240 [00:00<00:00, 2219439.07it/s]
Map: 100%|██████████| 13240/13240 [00:02<00:00, 5337.18 examples/s]

Dataset({
    features: ['label', 'text', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 13240
})





In [23]:
def _get_split(s):
    return [int(x * olid_dataset.num_rows) for x in s]
    
split = _get_split([0.7, 0.15, 0.15]) # [int(hate_dataset.num_rows * 0.8), int(hate_dataset.num_rows * 0.2)]
#split = [1000,50,1000]
olid_train = olid_dataset.shuffle(seed=GLOBAL_SEED).select(range(split[0]))
olid_val = olid_dataset.shuffle(seed=GLOBAL_SEED).select(range(split[1]))
olid_test = olid_dataset.shuffle(seed=GLOBAL_SEED).select(range(split[2]))
print(olid_train)
print(olid_val)
print(olid_test)


Dataset({
    features: ['label', 'text', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 9268
})
Dataset({
    features: ['label', 'text', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 1986
})
Dataset({
    features: ['label', 'text', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 1986
})


In [24]:
# Model 
# docs: https://huggingface.co/docs/transformers/model_doc/xlnet

xlnet_model = XLNetForSequenceClassification.from_pretrained(
    "xlnet/xlnet-base-cased", 
    num_labels=2
)
xlnet_model.to(device)

# Note: cannot get custom configuration to work, maybe worth fixing down the line
# either that or review the vocab size. Our vocab is <20k but pretrained model is 32k.
#    -> hopefully not a problem (surely)
#xlnet_model = XLNetForSequenceClassification(model_config)

print(xlnet_model.config)
print(next(xlnet_model.parameters()).is_cuda)


Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet/xlnet-base-cased and are newly initialized: ['logits_proj.bias', 'logits_proj.weight', 'sequence_summary.summary.bias', 'sequence_summary.summary.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


XLNetConfig {
  "_name_or_path": "xlnet/xlnet-base-cased",
  "architectures": [
    "XLNetLMHeadModel"
  ],
  "attn_type": "bi",
  "bi_data": false,
  "bos_token_id": 1,
  "clamp_len": -1,
  "d_head": 64,
  "d_inner": 3072,
  "d_model": 768,
  "dropout": 0.1,
  "end_n_top": 5,
  "eos_token_id": 2,
  "ff_activation": "gelu",
  "initializer_range": 0.02,
  "layer_norm_eps": 1e-12,
  "mem_len": null,
  "model_type": "xlnet",
  "n_head": 12,
  "n_layer": 12,
  "pad_token_id": 5,
  "reuse_len": null,
  "same_length": false,
  "start_n_top": 5,
  "summary_activation": "tanh",
  "summary_last_dropout": 0.1,
  "summary_type": "last",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 250
    }
  },
  "transformers_version": "4.40.2",
  "untie_r": true,
  "use_mems_eval": true,
  "use_mems_train": false,
  "vocab_size": 32000
}

True


In [25]:
# Fine-tuning
# https://huggingface.co/docs/transformers/training

metric = evaluate.load("accuracy")
def compute_model_metrics(eval_pred):
    global metric
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=1)
    return metric.compute(predictions=preds, references=labels)

tr_args = TrainingArguments(
    do_train=True, do_eval=True, evaluation_strategy="epoch", output_dir=path_output,
    
    per_device_eval_batch_size=8,
    per_device_train_batch_size=8,
    num_train_epochs=1, 
    save_strategy="no",
)


class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")
        W = torch.tensor([
            1.0, 
            0.5,
        ])
        loss_fct = nn.CrossEntropyLoss(weight=W).to(device)
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1)).to(device)
        return (loss, outputs) if return_outputs else loss


#optim = torch.optim.Adam(xlnet_model.parameters(), lr=1e-3, weight_decay=0.2)
trainer = CustomTrainer(
    model=xlnet_model,
    args=tr_args,
    train_dataset=olid_train,
    eval_dataset=olid_val,
    compute_metrics=compute_model_metrics
)
trainer.train()

                                                 
  8%|▊         | 96/1159 [21:07<01:44, 10.14it/s] 

{'loss': 0.5334, 'grad_norm': 7.752789497375488, 'learning_rate': 2.842968075927524e-05, 'epoch': 0.43}


                                                 
  8%|▊         | 96/1159 [22:16<01:44, 10.14it/s]  

{'loss': 0.5244, 'grad_norm': 3.027865409851074, 'learning_rate': 6.859361518550475e-06, 'epoch': 0.86}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                                 

[A[A                                           
  8%|▊         | 96/1159 [22:48<01:44, 10.14it/s]  
[A
                                                 
100%|██████████| 1159/1159 [02:51<00:00,  6.77it/s]

{'eval_loss': 0.5219206809997559, 'eval_accuracy': 0.6666666666666666, 'eval_runtime': 9.8188, 'eval_samples_per_second': 202.266, 'eval_steps_per_second': 25.36, 'epoch': 1.0}
{'train_runtime': 171.2861, 'train_samples_per_second': 54.108, 'train_steps_per_second': 6.766, 'train_loss': 0.525686716601394, 'epoch': 1.0}





TrainOutput(global_step=1159, training_loss=0.525686716601394, metrics={'train_runtime': 171.2861, 'train_samples_per_second': 54.108, 'train_steps_per_second': 6.766, 'total_flos': 660067607721984.0, 'train_loss': 0.525686716601394, 'epoch': 1.0})

In [27]:
predictions = trainer.predict(olid_test)
print(predictions.predictions.shape, predictions.label_ids.shape)
print()
preds = np.argmax(predictions.predictions, axis=-1)

labels_t = {"0":0, "1":0}
labels_p = {"0":0, "1":0}
for i in range(len(predictions.predictions)):
    print(predictions.label_ids[i], preds[i], predictions.predictions[i])
    labels_t[str(predictions.label_ids[i])] += 1
    labels_p[str(preds[i])] += 1
print(f"True labels {labels_t}")
print(f"Predicted labels {labels_p}")

loss_his = []
for e in trainer.state.log_history:
    if "loss" in e:
        loss_his.append(e["loss"])


metric = evaluate.combine(["accuracy", "f1", "precision", "recall", "BucketHeadP65/confusion_matrix"])
metric.compute(predictions=preds, references=predictions.label_ids)

  _warn_prf(average, modifier, msg_start, len(result))
100%|██████████| 249/249 [00:09<00:00, 25.63it/s]


(1986, 2) (1986,)

0 0 [ 1.1506485 -0.4169079]
0 0 [ 1.1433214  -0.41283822]
0 0 [ 1.1531802  -0.42123997]
0 0 [ 1.1667421 -0.4306187]
1 0 [ 1.1464943  -0.41332233]
1 0 [ 1.1410435  -0.41381174]
0 0 [ 1.1439055  -0.41170654]
0 0 [ 1.141874   -0.41236493]
0 0 [ 1.1429522  -0.41249973]
0 0 [ 1.1422906 -0.4132429]
0 0 [ 1.1411703 -0.4101127]
0 0 [ 1.1754177  -0.43539184]
0 0 [ 1.1449523  -0.41567382]
1 0 [ 1.1426265  -0.41286656]
0 0 [ 1.1530157  -0.41930217]
0 0 [ 1.1422724 -0.4126671]
1 0 [ 1.1448321 -0.4141724]
0 0 [ 1.1387514  -0.41027337]
0 0 [ 1.1414804 -0.4123168]
1 0 [ 1.1455171  -0.41421896]
0 0 [ 1.1506673 -0.4184029]
0 0 [ 1.1422057  -0.41300356]
1 0 [ 1.1403593  -0.41151404]
0 0 [ 1.1540713  -0.42033392]
1 0 [ 1.1415836  -0.41056406]
1 0 [ 1.1422755  -0.41363984]
0 0 [ 1.142489   -0.41343617]
0 0 [ 1.1630381  -0.42618954]
0 0 [ 1.1407737  -0.41111213]
0 0 [ 1.1564785  -0.42254362]
0 0 [ 1.1500138 -0.4177701]
1 0 [ 1.1503639  -0.41682702]
0 0 [ 1.1432141  -0.41346836]
0 0 [ 1.1

  _warn_prf(average, modifier, msg_start, len(result))


{'accuracy': 0.6666666666666666,
 'f1': 0.0,
 'precision': 0.0,
 'recall': 0.0,
 'confusion_matrix': array([[1324,    0],
        [ 662,    0]], dtype=int64)}