# BERTweet-Large

## Imports

In [1]:
import os
import random

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from tqdm.auto import tqdm
from transformers import AutoModel, AutoModelForSequenceClassification, AutoTokenizer

# Ensure deterministic behavior
seed = 12345678

random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

# Device configuration
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

import wandb
wandb.login()

import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

cuda:0


Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33malexlu314[0m. Use [1m`wandb login --relogin`[0m to force relogin


## Data

In [2]:
class AlzheimersTweetsDataset(Dataset):
    def __init__(self, root, transform=None, target_transform=None):
        self.tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-large", normalization=True, use_fast=False)
        self.transform = transform
        self.target_transform = target_transform

        self.df = pd.read_csv(root)
        self.length = len(self.df)

        self.tweets = self.tokenizer(self.df["tweet"].values.tolist(), padding=True, truncation=True, return_tensors='pt')["input_ids"]
        self.labels = torch.LongTensor(self.df["label"].values)

        if self.transform is not None:
            self.tweets = self.transform(self.tweets)

        if self.target_transform is not None:
            self.labels = self.target_transform(self.labels)
        
    def __len__(self):
        return self.length

    def __getitem__(self, idx):
        return self.tweets[idx], self.labels[idx]

In [3]:
def get_data(test=False):
    if test:
        return AlzheimersTweetsDataset("data/test.csv")
    
    trainset = AlzheimersTweetsDataset("data/train.csv")
    valset = AlzheimersTweetsDataset("data/val.csv")
    return trainset, valset

def make_loader(dataset, batch_size):
    loader = DataLoader(dataset=dataset, 
                        batch_size=batch_size, 
                        shuffle=True, 
                        pin_memory=True, num_workers=2)
    return loader

## Model

In [4]:
def get_model():
    return AutoModelForSequenceClassification.from_pretrained("vinai/bertweet-large")

# Training

In [5]:
def train(model, train_loader, val_loader, optimizer, config):    
    # Tell wandb to watch what the model gets up to: gradients, weights, and more!
    # wandb.watch(model, log="all", log_freq=10)

    # Run training and track with wandb
    example_ct = 0  # number of examples seen
    batch_ct = 0

    for epoch in tqdm(range(config.epochs)):
        model.train()
        train_correct, val_correct = 0, 0
        for batch, labels in train_loader:
            batch, labels = batch.to(device), labels.to(device)

            output = model(batch, labels=labels)
            loss = output.loss

            partial_loss = loss / config.accum
            partial_loss.backward()
            
            predicted = output.logits.argmax(dim=-1)
            train_correct += (predicted == labels).sum().item()

            example_ct += len(batch)
            batch_ct += 1

            if (batch_ct % config.accum == 0) or (batch_ct == len(train_loader)):
                optimizer.step()
                optimizer.zero_grad()

            if (batch_ct % (config.log_interval * config.accum)) == 0:
                wandb.log({"epoch": epoch, "loss": loss.item()}, step=example_ct)
                print(f"Loss after {str(example_ct).zfill(5)} examples: {loss:.3f}")

        with torch.no_grad():
            model.eval()
            for batch, labels in val_loader:
                batch, labels = batch.to(device), labels.to(device)
                output = model(batch, labels=labels)

                predicted = output.logits.argmax(dim=-1)
                val_correct += (predicted == labels).sum().item()
        
        train_accuracy = train_correct / len(train_loader.dataset)
        val_accuracy = val_correct / len(val_loader.dataset)

        wandb.log({"train_accuracy": train_accuracy, "val_accuracy": val_accuracy}, step=example_ct)
        print(f"Epoch {str(epoch).zfill(2)} Summary: (Train %: {train_accuracy:%}, Val%: {val_accuracy:%})")

        model.save_pretrained(os.path.join(wandb.run.dir, f"model_{epoch}"))

## Pipeline

In [6]:
def make(config):
    # Make the data
    train, val = get_data()
    train_loader = make_loader(train, batch_size=config.batch_size//config.accum)
    val_loader = make_loader(val, batch_size=config.batch_size//config.accum)

    # Make the model
    model = get_model().to(device)

    optimizer = torch.optim.AdamW(model.parameters(), lr=config.learning_rate)
    
    return model, train_loader, val_loader, optimizer


In [7]:
def model_pipeline(hyperparameters):
    assert hyperparameters["batch_size"] % hyperparameters["accum"] == 0

    # tell wandb to get started
    with wandb.init(project="Alzheimers", config=hyperparameters):
      # access all HPs through wandb.config, so logging matches execution!
      config = wandb.config

      # make the model, data, and optimization problem
      model, train_loader, val_loader, optimizer = make(config)
      print(model)

      # and use them to train the model
      train(model, train_loader, val_loader, optimizer, config)

    return model

In [9]:
config = {
    "epochs": 20,
    "batch_size": 32,
    "accum": 4,
    "learning_rate": 1e-5,
    "log_interval": 4
}

In [11]:
model = model_pipeline(config)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Some weights of the model checkpoint at vinai/bertweet-large were not used when initializing RobertaForSequenceClassification: ['lm_head.decoder.bias', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a Bert

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 1024, padding_idx=1)
      (position_embeddings): Embedding(514, 1024, padding_idx=1)
      (token_type_embeddings): Embedding(1, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-23): 24 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
 

  0%|          | 0/20 [00:00<?, ?it/s]

Loss after 00128 examples: 0.519
Loss after 00256 examples: 0.600
Loss after 00384 examples: 0.504
Loss after 00512 examples: 0.623
Loss after 00640 examples: 0.659
Loss after 00768 examples: 0.468
Loss after 00896 examples: 0.666
Loss after 01024 examples: 0.563
Loss after 01152 examples: 0.536
Loss after 01280 examples: 0.724
Loss after 01408 examples: 0.537
Loss after 01536 examples: 0.464
Loss after 01664 examples: 0.477
Loss after 01792 examples: 0.231
Loss after 01920 examples: 0.511
Loss after 02048 examples: 0.244
Loss after 02176 examples: 0.511
Loss after 02304 examples: 0.978
Loss after 02432 examples: 0.381
Loss after 02560 examples: 0.416
Loss after 02688 examples: 0.551
Loss after 02816 examples: 0.315
Loss after 02944 examples: 0.351
Loss after 03072 examples: 0.246
Loss after 03200 examples: 0.792
Loss after 03328 examples: 0.245
Loss after 03456 examples: 0.537
Loss after 03584 examples: 0.208
Loss after 03712 examples: 0.090
Loss after 03840 examples: 0.071
Loss after

wandb: Network error (ConnectionError), entering retry loop.


0,1
epoch,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▇▇▇▇▇▇▇███
loss,▅▃▁█▁▂▁▂▁▁▁▁▁▁▁▁▁▅▁▄▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_accuracy,▁▆▇▇████▇██████████
val_accuracy,▁▆█▆▇█▇▇▆█▇▇▇██▇▇██

0,1
epoch,19.0
loss,0.00056
train_accuracy,0.99769
val_accuracy,0.95379


RuntimeError: CUDA error: unspecified launch failure
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


# Final Test

In [8]:
def test(config, model):
    test = get_data(test=True)
    test_loader = make_loader(test, batch_size=config.batch_size//config.accum)

    correct = 0
    with torch.no_grad():
        for batch, labels in test_loader:
            batch, labels = batch.to(device), labels.to(device)
            output = model(batch, labels=labels)

            predicted = output.logits.argmax(dim=-1)
            correct += (predicted == labels).sum().item()
    
    accuracy = correct / len(test_loader.dataset)
    wandb.log({"test_accuracy": accuracy})
    print(f"Final Test Accuracy: {accuracy}")


In [10]:
model = AutoModelForSequenceClassification.from_pretrained("results/model_9/")