In [1]:
import torch
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, XLNetForSequenceClassification, AdamW, get_linear_schedule_with_warmup, DataCollatorWithPadding, RobertaForSequenceClassification
from torch.nn import BCEWithLogitsLoss
from sklearn.metrics import f1_score, accuracy_score
from datasets import load_dataset

2024-03-30 21:37:52.006204: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-30 21:37:52.006299: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-30 21:37:52.126996: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
dataset = load_dataset("ClaudiaRichard/mbti_classification_v2")
dataset 

Downloading and preparing dataset parquet/ClaudiaRichard--mbti_classification_v2 to /root/.cache/huggingface/datasets/parquet/ClaudiaRichard--mbti_classification_v2-d575ab717075612b/0.0.0/0b6d5799bb726b24ad7fc7be720c170d8e497f575d02d47537de9a5bac074901...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/10.1M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/2.72M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/4.06M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Dataset parquet downloaded and prepared to /root/.cache/huggingface/datasets/parquet/ClaudiaRichard--mbti_classification_v2-d575ab717075612b/0.0.0/0b6d5799bb726b24ad7fc7be720c170d8e497f575d02d47537de9a5bac074901. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['I/E', 'N/S', 'T/F', 'J/P', 'post'],
        num_rows: 95166
    })
    validation: Dataset({
        features: ['I/E', 'N/S', 'T/F', 'J/P', 'post'],
        num_rows: 25377
    })
    test: Dataset({
        features: ['I/E', 'N/S', 'T/F', 'J/P', 'post'],
        num_rows: 38067
    })
})

In [3]:
dataset['train'][:6]

{'I/E': [0, 0, 0, 0, 0, 1],
 'N/S': [0, 0, 0, 0, 0, 0],
 'T/F': [1, 1, 1, 1, 1, 0],
 'J/P': [0, 0, 0, 0, 0, 1],
 'post': ["Basically come up with three items you've determined that each type (or whichever types you want to do) would more than likely use, given each types' cognitive functions and whatnot, when left by...",
  'All things in moderation.  Sims is indeed a video game, and a good one at that. Note: a good one at that is somewhat subjective in that I am not completely promoting the death of any given Sim...',
  'It appears to be too late. :sad:',
  'Get high in backyard, roast and eat marshmellows in backyard while conversing over something intellectual, followed by massages and kisses.',
  "Banned for too many b's in that sentence. How could you! Think of the B!",
  "Sex can be boring if it's in the same position often. For example me and my girlfriend are currently in an environment where we have to creatively use cowgirl and missionary. There isn't enough..."]}

In [4]:
train_text = dataset['train']['post']

validation_text = dataset['validation']['post']

test_text = dataset['test']['post']

train_text[:3]

["Basically come up with three items you've determined that each type (or whichever types you want to do) would more than likely use, given each types' cognitive functions and whatnot, when left by...",
 'All things in moderation.  Sims is indeed a video game, and a good one at that. Note: a good one at that is somewhat subjective in that I am not completely promoting the death of any given Sim...',
 'It appears to be too late. :sad:']

In [5]:
label1 = dataset['train']['I/E']
label2 = dataset['train']['N/S']
label3 = dataset['train']['T/F']
label4 = dataset['train']['J/P']
train_labels = [[a, b, c, d] for a, b, c, d in zip(label1, label2, label3, label4)]
train_labels = [[float(element) for element in sublist] for sublist in train_labels]

label1 = dataset['validation']['I/E']
label2 = dataset['validation']['N/S']
label3 = dataset['validation']['T/F']
label4 = dataset['validation']['J/P']
validation_labels = [[a, b, c, d] for a, b, c, d in zip(label1, label2, label3, label4)]
validation_labels = [[float(element) for element in sublist] for sublist in validation_labels]

label1 = dataset['test']['I/E']
label2 = dataset['test']['N/S']
label3 = dataset['test']['T/F']
label4 = dataset['test']['J/P']
test_labels = [[a, b, c, d] for a, b, c, d in zip(label1, label2, label3, label4)]
test_labels = [[float(element) for element in sublist] for sublist in test_labels]

train_labels[:3]

[[0.0, 0.0, 1.0, 0.0], [0.0, 0.0, 1.0, 0.0], [0.0, 0.0, 1.0, 0.0]]

In [6]:
len(train_text), len(validation_text), len(test_text)

(95166, 25377, 38067)

In [7]:
class TextClassificationDataset(Dataset):
    def __init__(self, texts, labels, tokenizer):
            self.texts = texts
            self.labels = labels
            self.tokenizer = tokenizer
    def __len__(self):
        return len(self.texts)
    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer(text, return_tensors='pt', truncation=True, max_length=1024)
        return {'input_ids': encoding['input_ids'].flatten(),'token_type_ids': encoding['token_type_ids'].flatten() , 'attention_mask': encoding['attention_mask'].flatten(), 'labels': torch.tensor(label)}

In [8]:
batch_size = 8

In [9]:
checkpoint = "xlnet-large-cased"

tokenizer = AutoTokenizer.from_pretrained(checkpoint)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

train_dataset = TextClassificationDataset(train_text, train_labels, tokenizer)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=data_collator)

validation_dataset = TextClassificationDataset(validation_text, validation_labels, tokenizer)
validation_dataloader = DataLoader(validation_dataset, batch_size=batch_size, shuffle=True, collate_fn=data_collator)

test_dataset = TextClassificationDataset(test_text, test_labels, tokenizer)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True, collate_fn=data_collator)

config.json:   0%|          | 0.00/761 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/798k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.38M [00:00<?, ?B/s]

In [10]:
batch = next(iter(train_dataloader))
print({k: v.shape for k, v in batch.items()})

{'input_ids': torch.Size([8, 63]), 'token_type_ids': torch.Size([8, 63]), 'attention_mask': torch.Size([8, 63]), 'labels': torch.Size([8, 4])}


In [11]:
model = XLNetForSequenceClassification.from_pretrained("xlnet-large-cased", num_labels = 4, problem_type="multi_label_classification")

pytorch_model.bin:   0%|          | 0.00/1.44G [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-large-cased and are newly initialized: ['logits_proj.bias', 'logits_proj.weight', 'sequence_summary.summary.bias', 'sequence_summary.summary.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
torch.cuda.is_available()

True

In [13]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

XLNetForSequenceClassification(
  (transformer): XLNetModel(
    (word_embedding): Embedding(32000, 1024)
    (layer): ModuleList(
      (0-23): 24 x XLNetLayer(
        (rel_attn): XLNetRelativeAttention(
          (layer_norm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (ff): XLNetFeedForward(
          (layer_norm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
          (layer_1): Linear(in_features=1024, out_features=4096, bias=True)
          (layer_2): Linear(in_features=4096, out_features=1024, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
          (activation_function): GELUActivation()
        )
        (dropout): Dropout(p=0.1, inplace=False)
      )
    )
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (sequence_summary): SequenceSummary(
    (summary): Linear(in_features=1024, out_features=1024, bias=True)
    (activation): Tanh()
    (first_dropout): Identity()
  

In [14]:
epochs = 6

def loss_fn(outputs, targets):
    return torch.nn.BCEWithLogitsLoss()(outputs, targets)
optimizer = AdamW(model.parameters(), lr = 1e-5)    
total_steps = len(train_dataloader) * epochs
lr_scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps = 0, num_training_steps = total_steps)



In [15]:
from tqdm.auto import tqdm
from tqdm import trange


train_loss_set = []
val_loss_set = []

progress_bar = tqdm(range(total_steps))


for _ in trange(epochs, desc="Epoch"):

  # Training
  
  model.train()

  tr_loss = 0 
  nb_tr_examples, nb_tr_steps = 0, 0
  
  for batch in train_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    b_input_ids = batch['input_ids']
    b_token_types = batch['token_type_ids']
    b_input_mask = batch['attention_mask']
    b_labels = batch['labels']
    optimizer.zero_grad()

    # Forward pass for multilabel classification
    outputs = model(b_input_ids, token_type_ids=b_token_types, attention_mask=b_input_mask)
    logits = outputs[0]
    loss = loss_fn(logits.view(-1,4), b_labels.type_as(logits).view(-1,4)) #convert labels to float for calculation
    train_loss_set.append(loss.item())    

    
    loss.backward()
    optimizer.step()
    lr_scheduler.step()

    progress_bar.update(1)
    
    tr_loss += loss.item()
    nb_tr_examples += b_input_ids.size(0)
    nb_tr_steps += 1

  print("Train loss: {}".format(tr_loss/nb_tr_steps))

###############################################################################

  # Validation

  model.eval()

  logit_preds,true_labels,pred_labels,tokenized_texts = [],[],[],[]
  val_loss = 0
  nb_val_steps, nb_val_examples = 0, 0

  # Predict
  for batch in validation_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    b_input_ids = batch['input_ids']
    b_token_types = batch['token_type_ids']
    b_input_mask = batch['attention_mask']
    b_labels = batch['labels']
    
    with torch.no_grad():
      # Forward pass
      outs = model(b_input_ids, token_type_ids=b_token_types, attention_mask=b_input_mask)
      b_logit_pred = outs[0]
      pred_label = torch.sigmoid(b_logit_pred)

      loss = loss_fn(outs[0].view(-1,4), b_labels.type_as(outs[0]).view(-1,4)) #convert labels to float for calculation
      val_loss_set.append(loss.item())

      b_logit_pred = b_logit_pred.detach().cpu().numpy()
      pred_label = pred_label.to('cpu').numpy()
      b_labels = b_labels.to('cpu').numpy() 

    val_loss += loss.item()
    nb_val_examples += b_input_ids.size(0)
    nb_val_steps += 1

    tokenized_texts.append(b_input_ids)
    logit_preds.append(b_logit_pred)
    true_labels.append(b_labels)
    pred_labels.append(pred_label)
  
  print("Validation loss: {}".format(val_loss/nb_val_steps))

  # Flatten outputs
  pred_labels = [item for sublist in pred_labels for item in sublist]
  true_labels = [item for sublist in true_labels for item in sublist]

  # Calculate Accuracy
  threshold = 0.50
  pred_bools = [pl>threshold for pl in pred_labels]
  true_bools = [tl==1 for tl in true_labels]
  val_f1_accuracy = f1_score(true_bools,pred_bools,average='micro')*100
  val_flat_accuracy = accuracy_score(true_bools, pred_bools)*100

  print('F1 Validation Accuracy: ', val_f1_accuracy)
  print('Flat Validation Accuracy: ', val_flat_accuracy)

  0%|          | 0/71376 [00:00<?, ?it/s]


Epoch:   0%|          | 0/6 [00:00<?, ?it/s][A

Train loss: 0.5616598243411147
Validation loss: 0.5382737708399808



Epoch:  17%|█▋        | 1/6 [54:56<4:34:41, 3296.31s/it][A

F1 Validation Accuracy:  63.936024451939275
Flat Validation Accuracy:  27.253024392166136
Train loss: 0.5338169873116516
Validation loss: 0.5334311865112814



Epoch:  33%|███▎      | 2/6 [1:49:56<3:39:54, 3298.65s/it][A

F1 Validation Accuracy:  63.415698467244866
Flat Validation Accuracy:  28.789849075934903
Train loss: 0.5042057102890566
Validation loss: 0.5495820067942687



Epoch:  50%|█████     | 3/6 [2:45:01<2:45:04, 3301.44s/it][A

F1 Validation Accuracy:  64.8602701748421
Flat Validation Accuracy:  29.711943886196163
Train loss: 0.4561599604121554
Validation loss: 0.5962248531484544



Epoch:  67%|██████▋   | 4/6 [3:40:04<1:50:04, 3302.14s/it][A

F1 Validation Accuracy:  61.6482232455839
Flat Validation Accuracy:  28.714978129802578
Train loss: 0.3954785439632357
Validation loss: 0.6681481564585959



Epoch:  83%|████████▎ | 5/6 [4:35:11<55:03, 3303.86s/it]  [A

F1 Validation Accuracy:  60.69259223870819
Flat Validation Accuracy:  27.816526776214683
Train loss: 0.3408684359860895
Validation loss: 0.7428415512347349



Epoch: 100%|██████████| 6/6 [5:30:17<00:00, 3302.91s/it]

F1 Validation Accuracy:  60.87929358176951
Flat Validation Accuracy:  27.09540134767703





In [16]:
!pip install evaluate

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl.metadata (9.4 kB)
Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.1


In [17]:
import evaluate

accuracy_metric = evaluate.load("accuracy", "f1")
model.eval()
for batch in test_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    b_input_ids = batch['input_ids']
    b_token_types = batch['token_type_ids']
    b_input_mask = batch['attention_mask']
    b_labels = batch['labels']
    with torch.no_grad():
        outs = model(b_input_ids, token_type_ids=b_token_types, attention_mask=b_input_mask)
    b_logit_pred = outs[0]
    pred_label = torch.sigmoid(b_logit_pred)
    predictions = (pred_label >= 0.5).int().reshape(-1)
    
    accuracy_metric.add_batch(predictions=predictions, references=b_labels.int().reshape(-1))

accuracy_metric.compute()

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

{'accuracy': 0.7109110252975017}

In [18]:
# preds = []
# labels = []
# for batch in test_dataloader:
#     batch = {k: v.to(device) for k, v in batch.items()}
#     b_input_ids = batch['input_ids']
#     b_token_types = batch['token_type_ids']
#     b_input_mask = batch['attention_mask']
#     b_labels = batch['labels']
#     with torch.no_grad():
#         outs = model(b_input_ids, token_type_ids=b_token_types, attention_mask=b_input_mask)
#     b_logit_pred = outs[0]
#     pred_label = torch.sigmoid(b_logit_pred).cpu()
#     predictions = (pred_label >= 0.5).int().reshape(-1)
#     preds.append(predictions)
#     labels.append(b_labels)
    
# accuracy = accuracy_score(labels, preds)*100

In [19]:
# preds = []
# labels = []
# for batch in validation_dataloader:
#     batch = {k: v.to(device) for k, v in batch.items()}
#     b_input_ids = batch['input_ids']
#     b_token_types = batch['token_type_ids']
#     b_input_mask = batch['attention_mask']
#     b_labels = batch['labels']
#     with torch.no_grad():
#         outs = model(b_input_ids, token_type_ids=b_token_types, attention_mask=b_input_mask)
#     b_logit_pred = outs[0]
#     pred_label = torch.sigmoid(b_logit_pred).cpu()
#     predictions = (pred_label >= 0.5).int().reshape(-1)
#     preds.append(predictions)
#     labels.append(b_labels)
    
# accuracy = accuracy_score(labels, preds)*100

In [20]:
torch.save(model.state_dict(), "/kaggle/working/XLNet_classifier_mbti2.pth")