# XLNet version 1

Hate speech identification project, D7047E <br>
Binary text classification task using pretrained XLNet models

In [71]:
""" 
%pip install torch
%pip install torchvision
%pip install sentencepiece
%pip install transformers
"""

import torch
import torch.nn as nn
import torch.nn.functional as F

import torchvision

from transformers import XLNetModel, XLNetConfig, XLNetTokenizer, AutoTokenizer, XLNetForSequenceClassification

from tqdm import tqdm, trange
import sentencepiece as spm
import pandas as pd
import os

# Use the first CUDA device when torch reports one, otherwise fall back to the CPU.
device_name = "cuda:0" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
print(f"using {device}")

using cuda:0


In [138]:
# Constants and vars

# aux files
# Paths are assembled with os.path.join so they resolve with the separator of
# whatever platform the notebook runs on (the file names themselves are unchanged).
path_tr = os.path.join("..", "OLID_Tain_ATUSER_URL_EmojiRemoved_Pedro.txt")
path_te_a = os.path.join("..", "OLID_TEST_A_ATUSER_URL_EmojiRemoved_Pedro.txt")
path_te_b = os.path.join("..", "OLID_TEST_B_ATUSER_URL_EmojiRemoved_Pedro.txt")
path_te_c = os.path.join("..", "OLID_TEST_C_ATUSER_URL_EmojiRemoved_Pedro.txt")
path_temp_vocab_tsv = os.path.join("data", "temp_vocab.txt")
path_spm = os.path.join("data", "proj_xlnet") # +.model / +.vocab

# Model configs
VOCAB_SIZE = 20_000              # vocabulary size requested from SentencePiece
ALWAYS_OVERWRITE_VOCAB = True    # rebuild the intermediate vocab file on every run
# Two output classes, consistent with the num_labels=2 head loaded from the
# pretrained checkpoint and with the 0/1 class-index labels built from the data.
# (With num_labels=1 the transformers classification head computes a
# regression loss instead of cross-entropy.)
model_config = XLNetConfig(
    vocab_size = VOCAB_SIZE,
    num_labels = 2
)

In [134]:
# Load data, format, preprocess
#tokenizer = AutoTokenizer.from_pretrained("xlnet/xlnet-base-cased")

def loadVocabStringFromFile(filepath):
    """Collect the second tab-separated field of every data row into one string.

    The first line of the file is treated as a header and skipped. Every
    following line is split on tab characters and its second field is added
    to the result, followed by a single space. Lines that have no second
    field (for example blank lines) are skipped instead of raising.

    Args:
        filepath: Path to a UTF-8 encoded, tab-separated text file.

    Returns:
        str: The collected fields, each followed by one space; "" when the
        file has no usable data rows.
    """
    extra = {"encoding": "utf-8"}

    # First pass only counts lines so the progress bar can show a total.
    # The handle is closed deterministically by the context manager.
    with open(filepath, 'r', **extra) as trf:
        file_len = sum(1 for _ in trf)

    # Gather the pieces in a list and join once, rather than growing a string.
    pieces = []
    with open(filepath, 'r', **extra) as trf:
        for i, line in enumerate(tqdm(trf, total=file_len)):
            if i == 0:
                continue  # header row
            fields = line.split("\t")
            if len(fields) < 2:
                continue  # no second column on this line
            pieces.append(fields[1] + " ")
    return "".join(pieces)

# Load vocab
# Concatenate the text column of the train and test files into one intermediate
# help file. The target folder is created on demand, so it no longer has to be
# prepared by hand before running this cell.
if ALWAYS_OVERWRITE_VOCAB or not os.path.isfile(path_temp_vocab_tsv):
    print("Creating intermediate vocab help file")
    res_s = "".join(
        loadVocabStringFromFile(src_path)
        for src_path in (path_tr, path_te_a, path_te_b, path_te_c)
    )

    # Guard against an empty dirname (file placed directly in the working directory).
    vocab_dir = os.path.dirname(path_temp_vocab_tsv)
    if vocab_dir:
        os.makedirs(vocab_dir, exist_ok=True)

    with open(path_temp_vocab_tsv, 'w', encoding="utf-8") as tmpf:
        tmpf.write(res_s)
else:
    print("File already found")

# Train a SentencePiece model and wrap the resulting .model file in an XLNet tokenizer.
# NOTE(review): training reads path_tr directly, not the intermediate file at
# path_temp_vocab_tsv written above — confirm which input is intended.
spm_train_options = {
    "input": path_tr,
    "model_prefix": path_spm,
    "input_format": "text",
    "vocab_size": VOCAB_SIZE,
}
spm_tr = spm.SentencePieceTrainer.train(**spm_train_options)

spm_model_file = path_spm+".model"
tokenizer = XLNetTokenizer(vocab_file = spm_model_file)

Creating intermediate vocab help file


  0%|          | 0/13241 [00:00<?, ?it/s]

100%|██████████| 13241/13241 [00:00<00:00, 287952.27it/s]
100%|██████████| 861/861 [00:00<00:00, 287913.24it/s]
100%|██████████| 241/241 [00:00<00:00, 237841.71it/s]
100%|██████████| 214/214 [00:00<00:00, 108246.63it/s]


In [135]:
# Load and split dataset
# Read the training TSV, keep only the text and label columns, and turn the
# frame into a dict of {column: {row_id: value}} whose row ids start at 1.
df = pd.read_csv(path_tr, sep="\t", names=["id","text","label","other_1","other_2"])
df = df.drop(axis=0, index=0) # Remove column names
df = df.drop(axis=1, labels=["id", "other_1", "other_2"]).to_dict()

# Encode labels as class indices: "OFF" -> 1, anything else -> 0.
# Iterate over the actual row ids so every row is converted; the previous
# range(len - 1) loop stopped one short and left the last label as a string.
for row_id in tqdm(list(df["label"])):
    df["label"][row_id] = 1 if df["label"][row_id] == "OFF" else 0

def getExample(index):
    """Return one (text, label) pair from the module-level `df` dict.

    Args:
        index: Row id to fetch. Any value <= 0 is replaced by 1.

    Returns:
        tuple: The text stored under that row id and a one-element
        torch.LongTensor holding its label.
    """
    row_id = 1 if index <= 0 else index
    text = df["text"][row_id]
    label = torch.LongTensor([df["label"][row_id]])
    return text, label

# Display the first example as a quick sanity check of the loaded data.
getExample(1) # index starts at 1

  0%|          | 0/13239 [00:00<?, ?it/s]

100%|██████████| 13239/13239 [00:00<00:00, 2202808.26it/s]


('@USER She should ask a few native Americans what their take on this is .',
 tensor([1]))

In [149]:
# Model 
# docs: https://huggingface.co/docs/transformers/model_doc/xlnet

# Load the pretrained XLNet checkpoint with a two-class sequence-classification head.
pretrained_checkpoint = "xlnet/xlnet-base-cased"
model = XLNetForSequenceClassification.from_pretrained(pretrained_checkpoint, num_labels=2)

# NOTE(review): `tokenizer` is built from a locally trained SentencePiece model
# (VOCAB_SIZE = 20_000), while this checkpoint's config reports vocab_size 32000.
# Token ids from the custom tokenizer presumably do not line up with the
# pretrained embedding table — confirm, or use the checkpoint's own tokenizer.
# Building the model from the custom configuration does not work yet:
#model = XLNetForSequenceClassification(model_config)

print(model.config)

Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet/xlnet-base-cased and are newly initialized: ['logits_proj.bias', 'logits_proj.weight', 'sequence_summary.summary.bias', 'sequence_summary.summary.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


XLNetConfig {
  "_name_or_path": "xlnet/xlnet-base-cased",
  "architectures": [
    "XLNetLMHeadModel"
  ],
  "attn_type": "bi",
  "bi_data": false,
  "bos_token_id": 1,
  "clamp_len": -1,
  "d_head": 64,
  "d_inner": 3072,
  "d_model": 768,
  "dropout": 0.1,
  "end_n_top": 5,
  "eos_token_id": 2,
  "ff_activation": "gelu",
  "initializer_range": 0.02,
  "layer_norm_eps": 1e-12,
  "mem_len": null,
  "model_type": "xlnet",
  "n_head": 12,
  "n_layer": 12,
  "pad_token_id": 5,
  "reuse_len": null,
  "same_length": false,
  "start_n_top": 5,
  "summary_activation": "tanh",
  "summary_last_dropout": 0.1,
  "summary_type": "last",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 250
    }
  },
  "transformers_version": "4.40.1",
  "untie_r": true,
  "use_mems_eval": true,
  "use_mems_train": false,
  "vocab_size": 32000
}



In [159]:
# Smoke test: run one labelled example through the model and inspect the result.
ex_text, ex_label = getExample(0)  # an index of 0 is clamped to the first row
ex_input = tokenizer(ex_text, return_tensors="pt")
ex_input['labels'] = ex_label  # passing labels makes the model return a loss as well

output = model(**ex_input)
pred = output.logits.argmax()

print(output.logits)
print(output.loss.item())
print(pred)

tensor([[0.0619, 0.0903]], grad_fn=<AddmmBackward0>)
0.6790687441825867
tensor(1)


In [None]:
# Fine-tuning
