In [1]:
import os
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from datasets import load_dataset, Dataset, DatasetDict
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
from transformers import AutoModel, AutoTokenizer, AutoConfig

SEED = 234523

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Read the training CSV with 'train_idx' as the index and drop the
# 'label_text' column. The drop is chained (no inplace=True) so the cell
# builds `df` in a single expression.
df = pd.read_csv('./dataset/train.csv', index_col='train_idx').drop(columns=['label_text'])

# 75/25 split, seeded for reproducibility.
# The training frame is called `train_df` rather than `train` because this
# notebook later defines a function named `train`; sharing the name meant
# that re-running this cell after the function cell replaced the function
# with a DataFrame and broke the training call.
train_df, validation = train_test_split(df, test_size=0.25, random_state=SEED)

test = pd.read_csv('./dataset/test.csv')

# Wrap the three pandas frames as Hugging Face datasets.
dataset = DatasetDict({
    'train': Dataset.from_pandas(train_df),
    'validation': Dataset.from_pandas(validation),
    'test': Dataset.from_pandas(test),
})

In [3]:
# Checkpoint used for the tokenizer, the config and the encoder weights.
MODEL_NAME = 'cardiffnlp/twitter-roberta-base-sentiment-latest'

# Run on the GPU when one is visible to torch, otherwise on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
config = AutoConfig.from_pretrained(MODEL_NAME)
# AutoModel loads the bare encoder; the checkpoint's classifier weights are
# not used (see the warning printed below this cell).
pretrained_model = AutoModel.from_pretrained(MODEL_NAME)

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaModel: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [4]:
def tokenize(batch):
    """Tokenize `batch['text']` and attach the encoder inputs to `batch`.

    The text is truncated / padded to exactly 512 tokens by the
    notebook-level `tokenizer`. The resulting `input_ids` and
    `attention_mask` are stored under those keys and the same (mutated)
    mapping is returned, as expected by `Dataset.map`.
    """
    encoded = tokenizer(
        batch['text'],
        padding='max_length',
        truncation=True,
        max_length=512,
    )
    for key in ('input_ids', 'attention_mask'):
        batch[key] = encoded[key]
    return batch

# Add `input_ids` / `attention_mask` to every split.
dataset = dataset.map(tokenize)

# For each split: (columns exposed as torch tensors, columns to discard).
# The test split is formatted without a 'label' column.
split_layout = {
    'train': (['input_ids', 'attention_mask', 'label'], ['text', 'train_idx']),
    'validation': (['input_ids', 'attention_mask', 'label'], ['text', 'train_idx']),
    'test': (['input_ids', 'attention_mask'], ['text', 'test_idx']),
}

for split_name, (tensor_columns, unused_columns) in split_layout.items():
    dataset[split_name].set_format('pt', columns=tensor_columns)
    dataset[split_name] = dataset[split_name].remove_columns(unused_columns)

                                                                 

In [5]:
class SentimentAnalysis(nn.Module):
    """Encoder plus a small feed-forward classification head.

    The head is `Linear(hidden_size, 512) -> GELU -> Linear(512, num_labels)`
    applied to the encoder's `pooler_output`.

    Args:
        backbone: encoder module; calling it with `(input_ids, attention_mask)`
            must return an object with a `pooler_output` attribute. When
            omitted, the notebook-level `pretrained_model` is used. Note that
            this is the same module object for every instance built that way,
            so its weights are shared between such instances.
        hidden_size: width of `pooler_output` (default 768).
        num_labels: number of output classes (default 2).
    """

    def __init__(self, backbone=None, hidden_size=768, num_labels=2):
        super().__init__()
        self.roberta = pretrained_model if backbone is None else backbone
        self.fc1 = nn.Linear(hidden_size, 512)
        self.activation1 = nn.GELU()
        self.output = nn.Linear(512, num_labels)

    def forward(self, input_ids, attention_mask):
        """Return unnormalised class scores (logits) of shape (batch, num_labels).

        The logits are deliberately NOT passed through softmax: the notebook
        trains with `nn.CrossEntropyLoss`, which applies log-softmax itself,
        so a softmax here would normalise twice and flatten the gradients.
        `argmax` over the logits gives the same predicted class as `argmax`
        over the probabilities; apply `torch.softmax(logits, dim=-1)` at the
        call site when probabilities are needed.
        """
        pooled = self.roberta(input_ids, attention_mask).pooler_output
        hidden = self.activation1(self.fc1(pooled))
        return self.output(hidden)

In [6]:
BATCH_SIZE = 4
# Floor division: a trailing partial batch is not counted here.
STEPS_PER_EPOCH = len(dataset['train']) // BATCH_SIZE

# Only the training data is shuffled. Evaluation loaders keep the dataset
# order: shuffling does not help evaluation, and for the test split (whose
# 'test_idx' column has been removed) a shuffled loader would make it
# impossible to map predictions back to their rows.
train_loader = DataLoader(dataset['train'], batch_size=BATCH_SIZE, shuffle=True)
validation_loader = DataLoader(dataset['validation'], batch_size=BATCH_SIZE, shuffle=False)
test_loader = DataLoader(dataset['test'], batch_size=BATCH_SIZE, shuffle=False)

In [9]:
def progress_bar(b_idx, perc_steps=25):
    """Split a `perc_steps`-wide progress bar into done / remaining widths.

    Returns a tuple `(filled, remaining)` where `filled` is the truncated
    share of `perc_steps` corresponding to `b_idx / STEPS_PER_EPOCH` and
    `remaining` is `perc_steps - filled`.
    """
    filled = int(b_idx / STEPS_PER_EPOCH * perc_steps)
    return filled, perc_steps - filled

def train_metrics(b_idx, outputs, labels, y_true, y_pred):
    """Update the running label/prediction arrays and return the running F1.

    On the first batch (`b_idx == 0`) the accumulators are restarted from
    this batch alone; on later batches the batch's labels and argmax
    predictions are appended to `y_true` / `y_pred`.

    Returns:
        (f1 over everything accumulated so far, new y_true, new y_pred)
    """
    seen_true = labels.detach().cpu().numpy()
    seen_pred = torch.argmax(outputs, dim=1).detach().cpu().numpy()

    if b_idx != 0:
        # Extend what was accumulated during earlier batches of this epoch.
        seen_true = np.concatenate((y_true, seen_true))
        seen_pred = np.concatenate((y_pred, seen_pred))

    return f1_score(seen_true, seen_pred), seen_true, seen_pred

def compute_eta(b_idx, start_batch, batch_times, total_steps=None):
    """Estimate the time left in the current epoch.

    Args:
        b_idx: zero-based index of the batch that just finished.
        start_batch: `time.time()` value taken when that batch started.
        batch_times: durations (seconds) of previous batches; the duration
            of the current batch is appended to it.
        total_steps: number of steps the epoch is measured against.
            Defaults to the notebook-level `STEPS_PER_EPOCH`.

    Returns:
        (eta, batch_times) where `eta` is a display string and `batch_times`
        holds at most the 10 most recent durations.
    """
    if total_steps is None:
        total_steps = STEPS_PER_EPOCH

    batch_times.append(time.time() - start_batch)
    # Average over a sliding window of the 10 most recent batches. This also
    # covers the first batch (window of one) without a special case.
    batch_times = batch_times[-10:]
    batch_avg_time = sum(batch_times) / len(batch_times)

    # Never report a negative remaining time.
    seconds_left = max(total_steps - b_idx, 0) * batch_avg_time

    if seconds_left > 60:
        # Format by hand so that estimates of one hour or more keep their
        # hours component ('%M:%S' alone silently drops it).
        hours, remainder = divmod(int(seconds_left), 3600)
        minutes, seconds = divmod(remainder, 60)
        if hours:
            eta = f'{hours:02d}:{minutes:02d}:{seconds:02d}'
        else:
            eta = f'{minutes:02d}:{seconds:02d}'
    else:
        eta = f'{seconds_left:.2f} s'

    return eta, batch_times

def validation_step(model):
    """Score `model` on the notebook-level `validation_loader`.

    Puts the model in eval mode, runs every validation batch without
    gradient tracking, takes the argmax over dim 1 of the outputs as the
    prediction, and returns the F1 score over the whole validation set.
    The model is left in eval mode.
    """
    model.eval()
    predictions, targets = [], []

    with torch.no_grad():
        for batch in validation_loader:
            ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)

            scores = model(ids, mask)
            predictions.append(torch.argmax(scores, dim=1).detach().cpu().numpy())
            # Labels are read straight from the batch (never moved to `device`).
            targets.append(batch['label'].numpy())

    predictions = np.concatenate(predictions)
    targets = np.concatenate(targets)
    return f1_score(targets, predictions)

def time_epoch(start_epoch):
    """Return the whole seconds elapsed since `start_epoch`, formatted as '<n> s'."""
    elapsed_seconds = int(time.time() - start_epoch)
    return '{} s'.format(elapsed_seconds)

def save_model(model, exp):
    """Save `model.state_dict()` to `./models/<exp>.pt`.

    The target directory is created when missing, so that a checkpoint
    reached after a full training epoch is not lost to a missing
    `./models` folder.
    """
    checkpoint_path = f'./models/{exp}.pt'
    os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)
    torch.save(model.state_dict(), checkpoint_path)

def epoch_status(b_idx, outputs, labels, start_batch, model, loss, start_epoch, y_true, y_pred, batch_times, best_val_f1, exp):
    """Print one progress line for the batch that just finished.

    Updates the running training F1 and the ETA. On the last batch of the
    epoch it also runs validation, replaces the ETA with the elapsed epoch
    time, and saves the model under `exp` when the validation F1 improves
    on `best_val_f1`.

    Returns:
        (batch_times, y_true, y_pred, best_val_f1), each updated for the
        next call.
    """
    perc_epoch, perc_left = progress_bar(b_idx)
    accumulative_f1, y_true, y_pred = train_metrics(b_idx, outputs, labels, y_true, y_pred)
    eta, batch_times = compute_eta(b_idx, start_batch, batch_times)

    # Detect the end of the epoch from the loader itself. Comparing against
    # STEPS_PER_EPOCH (a floor division) only matches when the dataset has a
    # trailing partial batch; with an exact multiple of the batch size the
    # validation and checkpoint step would never run.
    is_last_batch = b_idx == len(train_loader) - 1

    # None (not False) marks "no validation this step", so a genuine
    # validation score of 0.0 is still reported.
    val_f1 = None
    if is_last_batch:
        val_f1 = validation_step(model)
        eta = time_epoch(start_epoch)
        if val_f1 > best_val_f1:
            save_model(model, exp)
            best_val_f1 = val_f1

    bar = '=' * perc_epoch + '>' + '.' * perc_left
    val_part = '' if val_f1 is None else f' - val_f1: {val_f1:.4f}'
    # Intermediate lines overwrite each other ('\r'); the final line of the
    # epoch ends with a newline so the next epoch header does not clobber it.
    print(
        f'{b_idx}/{STEPS_PER_EPOCH}\t[{bar}] - ETA: {eta} - loss: {loss.item():.4f} - f1: {accumulative_f1:.4f}{val_part}',
        end='\n' if is_last_batch else '\r',
    )
    return batch_times, y_true, y_pred, best_val_f1

In [13]:
def train(model, criterion, optimizer, epochs=10, experiment=None):
    """Train `model` on the notebook-level `train_loader`.

    Args:
        model: module called as `model(input_ids, attention_mask)`.
        criterion: loss called as `criterion(outputs, labels)`.
        optimizer: optimizer already bound to the model's parameters.
        epochs: number of passes over `train_loader`.
        experiment: name passed to `epoch_status` for checkpointing. When
            omitted, a timestamp taken at call time is used. (It used to be
            computed once, when the function was defined, so every run that
            relied on the default shared the same name.)

    Returns:
        The model as it is after the last epoch. Checkpoints of the best
        validation F1 are written by `epoch_status`, not returned here.
    """
    if experiment is None:
        experiment = str(time.time())

    model = model.to(device)

    best_val_f1 = 0.0
    for epoch in range(epochs):
        # Validation at the end of the previous epoch leaves eval mode on.
        model.train()

        print(f'Epoch {epoch+1}/{epochs}')
        start_epoch = time.time()
        batch_times = []  # durations of the most recent batches
        y_true, y_pred = [], []  # labels / predictions accumulated over the epoch

        for b_idx, batch in enumerate(train_loader):
            start_batch = time.time()

            input_ids, attention_masks = batch['input_ids'].to(device), batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            optimizer.zero_grad()

            outputs = model(input_ids, attention_masks)

            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            batch_times, y_true, y_pred, best_val_f1 = epoch_status(b_idx, outputs, labels, start_batch, model, loss, start_epoch, y_true, y_pred, batch_times, best_val_f1, experiment)

    return model

In [14]:
# Name under which the best checkpoint of this run is saved.
exp = 'testing'

model = SentimentAnalysis()
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=1e-5, weight_decay=0.01, amsgrad=True)

# Single-epoch run.
model = train(model, criterion, optimizer, epochs=1, experiment=exp)

Epoch 1/1