# Setup

In [1]:
# General imports.
from tqdm.notebook import tqdm

# Specific imports.
import pandas as pd

import torch
from torch import nn
from torch.nn import functional as F
from torch.optim import AdamW, SGD
from torch.utils.data import DataLoader
from transformers import get_scheduler, AutoModel, AutoTokenizer, AutoModelForSequenceClassification
from transformers import BertTokenizerFast, BertForSequenceClassification
from torch.profiler import profile, record_function, ProfilerActivity
from torch.utils.data import Subset
from sklearn.model_selection import train_test_split



In [2]:
data = pd.read_csv("/kaggle/input/mbti-tweets/cleaned_df.csv")
data = data.drop(columns=["Unnamed: 0.1", "Unnamed: 0"])

# Dataset

In [3]:
labels = ['intj', 'intp', 'entj', 'entp', 'infj', 'infp', 'enfj', 'enfp', 'istj', 'isfj', 'estj', 'esfj', 'istp', 'isfp', 'estp', 'esfp']
id2label = {id:label for id,label in enumerate(labels)}
label2id = {label:id for id,label in enumerate(labels)}

In [4]:
class CustomTextDataset(torch.utils.data.Dataset):
    def __init__(self, data, model_name="bert-base-uncased"):
        self.data = data
        self.tokenizer = BertTokenizerFast.from_pretrained(model_name)
        self.label = data['label'].apply(lambda l: label2id[l])

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        res = self.tokenizer(text=self.data.iloc[idx].get('cleaned_text'), 
                             padding='max_length', 
                             max_length=500, 
                             truncation=True, 
                             return_tensors='pt')
        #F.one_hot(torch.tensor(self.label[idx]), num_classes=16).to(torch.float)
        return {
            'input_ids': res["input_ids"].squeeze(), 
            'token_type_ids': res["token_type_ids"].squeeze(), 
            'attention_mask': res["attention_mask"].squeeze(),
            "labels": torch.tensor(self.label[idx])
        }

In [5]:
ds = CustomTextDataset(data)

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [6]:
BATCH_SIZE = 20

train_indices, test_indices = train_test_split(
    range(len(ds)),
    test_size=0.1,
)
train_split = Subset(ds, train_indices)
test_split = Subset(ds, test_indices)

train_batches = DataLoader(
    train_split, 
    batch_size=BATCH_SIZE, 
    shuffle=True,
    num_workers = 0,
    pin_memory = True,
    drop_last = True
)
test_batches = DataLoader(test_split, batch_size=BATCH_SIZE)

In [20]:
# ds = CustomTextDataset(data)
# train_loader = DataLoader(
#     ds, 
#     batch_size = 20, 
#     shuffle = True, 
#     num_workers = 0, 
#     pin_memory = True, 
#     drop_last = True,
# )

# Training

In [7]:
class ClassifierHead(nn.Module):
    def __init__(self, hidden_size, num_classes, seq_length):
        super().__init__()
        self.rnn = nn.RNN(768, hidden_size, batch_first=True)
        self.linear = nn.Linear(hidden_size*seq_length, num_classes)

    def forward(self, x):
        x, _ = self.rnn(x)
        x = x.reshape(x.shape[0], -1)
        x = self.linear(x)
        return x

class BERTWithClassifierHead(nn.Module):
    def __init__(self, num_classes):
        super().__init__()
        self.bert = AutoModel.from_pretrained("bert-base-uncased")
        # ClassifierHead is already defined.
        self.classifier = ClassifierHead(20, num_classes, 500)
        
    def forward(self, x):
        x = self.bert(**x)
        x = x.last_hidden_state
        x = self.classifier(x)
        return x

In [8]:
model = BERTWithClassifierHead(num_classes=16)
# model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels = 16, id2label=id2label, label2id=label2id)

Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [7]:
# device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# model.to(device)
# batch = next(iter(train_batches))
# X = {k: batch[k] for k in batch.keys() if k not in ["label"]}
# y = batch["label"]
# X = {k: v.to(device) for k, v in X.items()}
# y = y.to(device)
# output = model(**X).logits
# print(output.shape, y.shape)

torch.Size([25, 16]) torch.Size([25, 16])


In [9]:
epochs = 3
lr = 5e-5
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Define Loss
criterion = nn.CrossEntropyLoss()

# Define optimizer.
optimizer = AdamW(model.parameters(), lr=lr)
# optimizer = SGD(model.parameters(), lr=lr, momentum=0.9)

# Define LR Scheduler.
num_training_steps = epochs * len(train_batches)
lr_scheduler = get_scheduler(
    name="cosine", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)

model.to(device)

BERTWithClassifierHead(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elemen

In [39]:
# model.to(device)
# with profile(activities=[ProfilerActivity.CPU], profile_memory=True, record_shapes=True) as prof:
#     with record_function("model_inference"):
#         batch = next(iter(train_loader))
#         X = {k: batch[k] for k in batch.keys() if k not in ["label"]}
#         y = batch["label"]
#         X = {k: v.to(device) for k, v in X.items()}
#         y = y.to(device)
#         model(X)
# print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=10))
# print(prof.key_averages().table(sort_by="self_cpu_memory_usage", row_limit=10))

In [10]:
scaler = torch.cuda.amp.GradScaler()

progress_bar = tqdm(range(num_training_steps))

model.train()
for epoch in range(epochs):
    
    running_loss = 0.0
    for i, batch in enumerate(train_batches, 0):
        
        batch = {k: v.to(device) for k, v in batch.items()}
        # Unpack the dictionary.
        X = {k: batch[k] for k in batch.keys() if k not in "labels"}
        y = batch["labels"]

#         One step.
        with torch.autocast(device_type='cuda', dtype=torch.float16):
            outputs = model(X)
            loss = criterion(outputs, y)
            
#         loss.backward()
#         optimizer.step()

        scaler.scale(loss).backward()

        scaler.step(optimizer)
        lr_scheduler.step()
        optimizer.zero_grad()
        
        scaler.update()
        
        progress_bar.update(1)
    
        running_loss += loss.item()
        if i % 50 == 49:    # print every 45 mini-batches
            print(f'[{epoch + 1}, {i + 1:5d}] loss: {running_loss / 50:.3f}')
            running_loss = 0.0
            
print("Finished Training!")


  0%|          | 0/978 [00:00<?, ?it/s]

[1,    50] loss: 2.636
[1,   100] loss: 2.600
[1,   150] loss: 2.633
[1,   200] loss: 2.577
[1,   250] loss: 2.623
[1,   300] loss: 2.608
[2,    50] loss: 2.573
[2,   100] loss: 2.574
[2,   150] loss: 2.554
[2,   200] loss: 2.586
[2,   250] loss: 2.545
[2,   300] loss: 2.569
[3,    50] loss: 2.555
[3,   100] loss: 2.566
[3,   150] loss: 2.486
[3,   200] loss: 2.542
[3,   250] loss: 2.516
[3,   300] loss: 2.523
Finished Training!


In [29]:
scaler = torch.cuda.amp.GradScaler()

batch = next(iter(train_batches))
    
batch = {k: v.to(device) for k, v in batch.items()}
# Unpack the dictionary.
X = {k: batch[k] for k in batch.keys() if k not in "labels"}
y = batch["labels"]
for epoch in range(100):

#         One step.
    with torch.autocast(device_type='cuda', dtype=torch.float16):
        outputs = model(X)
        loss = criterion(outputs, y)
            
#         loss.backward()
#         optimizer.step()

    scaler.scale(loss).backward()

    scaler.step(optimizer)
    lr_scheduler.step()
    optimizer.zero_grad()

    scaler.update()

    progress_bar.update(1)

    print(f'{epoch}=', loss)
            
print("Finished Training!")

0= tensor(4.3928, device='cuda:0', grad_fn=<NllLossBackward0>)
1= tensor(3.9684, device='cuda:0', grad_fn=<NllLossBackward0>)
2= tensor(3.6470, device='cuda:0', grad_fn=<NllLossBackward0>)
3= tensor(3.3365, device='cuda:0', grad_fn=<NllLossBackward0>)
4= tensor(2.9921, device='cuda:0', grad_fn=<NllLossBackward0>)
5= tensor(2.5773, device='cuda:0', grad_fn=<NllLossBackward0>)
6= tensor(2.4415, device='cuda:0', grad_fn=<NllLossBackward0>)
7= tensor(2.4590, device='cuda:0', grad_fn=<NllLossBackward0>)
8= tensor(2.5209, device='cuda:0', grad_fn=<NllLossBackward0>)
9= tensor(2.5610, device='cuda:0', grad_fn=<NllLossBackward0>)
10= tensor(2.5553, device='cuda:0', grad_fn=<NllLossBackward0>)
11= tensor(2.5352, device='cuda:0', grad_fn=<NllLossBackward0>)
12= tensor(2.4210, device='cuda:0', grad_fn=<NllLossBackward0>)
13= tensor(2.4558, device='cuda:0', grad_fn=<NllLossBackward0>)
14= tensor(2.3696, device='cuda:0', grad_fn=<NllLossBackward0>)
15= tensor(2.3812, device='cuda:0', grad_fn=<NllLo

In [11]:
torch.save(model.state_dict(), '/kaggle/working/mbti.pth')

In [9]:
model = BERTWithClassifierHead(num_classes=16)
# model = AutoModelForSequenceClassification.from_pretrained('bert-base-cased', num_labels=16)
model.load_state_dict(torch.load('/kaggle/working/mbti.pth'))

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


<All keys matched successfully>

In [12]:
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
res = tokenizer(text="this is some text hello hello hello", 
                             padding='max_length', 
                             max_length=500, 
                             truncation=True, 
                             return_tensors='pt')


In [26]:
output = model(res)
print(output)

tensor([[ 0.8819,  1.4050,  0.1773,  0.3725,  1.2178,  1.7634,  0.0153,  0.8813,
         -0.2533, -0.4657, -1.7859, -1.7720, -0.3982,  0.2173, -1.5934, -0.8718]],
       grad_fn=<AddmmBackward0>)


In [29]:
m = nn.Softmax(dim=1)
scaled = m(output)
print(scaled)
prediction = torch.argmax(scaled)
prediction

tensor([[0.0927, 0.1564, 0.0458, 0.0557, 0.1297, 0.2238, 0.0390, 0.0927, 0.0298,
         0.0241, 0.0064, 0.0065, 0.0258, 0.0477, 0.0078, 0.0161]],
       grad_fn=<SoftmaxBackward0>)


tensor(5)

In [31]:
type(labels[prediction])

str

In [12]:
correct = 0
total = 0

with torch.no_grad():
    for batch in test_batches:
        X = {k: batch[k] for k in batch.keys() if k not in "label"}
        y = batch["label"]

        X = {k: v.to(device) for k, v in X.items()}
        y = y.to(device)

        outputs = model(**X).logits
        _, predicted = torch.max(outputs, 1)
        _, actual = torch.max(y, 1)

        total += actual.size(0)
        correct += (predicted == actual).sum().item()
    
print(f'Accuracy of the model: {100 * correct // total} %')


Accuracy of the model: 16 %


In [16]:
import evaluate

metric = evaluate.load("accuracy")
model.eval()
for batch in test_batches:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    actual = torch.argmax(batch['labels'], dim=-1)
    metric.add_batch(predictions=predictions, references=actual)

metric.compute()

{'accuracy': 0.21335168616655195}

tensor([ 1,  0,  3,  8,  7, 14,  5,  1,  4,  1,  4,  4,  5,  6,  0,  5,  0,  7,
         1,  6,  7, 10,  0, 14,  1], device='cuda:0')
