In [5]:
import pandas as pd
import torch
import torch.nn as nn
import numpy as np
from transformers import DistilBertTokenizerFast 

dataset = pd.read_csv("clean_COVIDSenti.csv")
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

def tokenize(tweet):
    tokenized = tokenizer(tweet, return_tensors='pt', padding="max_length", max_length = 47) #Max tweet token length is 47
    return tokenized

tweets, labels = dataset['tweet'], dataset['label'] + 1 #Labels need to be 0-indexed
tokenized_tweets = tweets.map(tokenize)
tokenized_tweets, labels = tokenized_tweets.to_list(), labels.to_list()
vocab_size = max(map(lambda x: torch.max(x['input_ids']), tokenized_tweets))
max_len = max(map(lambda x: len(x['input_ids']), tokenized_tweets))

#Determining correct backend
if torch.backends.mps.is_available():
    device = torch.device("mps")
    print("Training on Apple GPU")
elif torch.cuda.is_available():
    device = torch.device("cuda")
    print("Training on CUDA")
else:
    print ("MPS device not found.")

Training on Apple GPU


In [20]:
class PositionalEncoding(nn.Module):
    #Class used to encode positions
    def __init__(self, embedding_dim, max_len):
        super(PositionalEncoding, self).__init__()
        
        pe = torch.zeros(max_len, embedding_dim)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, embedding_dim, 2).float() * (-np.log(10000.0) / embedding_dim))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe) #Ensures that this positional encoding isn't updated by the optimizer

    def forward(self, x):
        return x + self.pe

class SentimentTransformer(nn.Module):
    def __init__(self, vocab_size, output_size, embedding_dim, encoder_layers, nhead = 4, dropout = 0.1):
        super().__init__()
        self.vocab_size = vocab_size
        self.output_size = output_size
        self.embedding_dim = embedding_dim
        self.encoder_layers = encoder_layers
        
        self.pe = PositionalEncoding(embedding_dim=embedding_dim, max_len=max_len)
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)
        self.encoder_layers = nn.ModuleList([
            nn.TransformerEncoderLayer(embedding_dim, nhead = nhead, dropout=dropout, batch_first=True)
            for _ in range(encoder_layers)
        ])
        
        self.lin = nn.Linear(embedding_dim, output_size)

    def forward(self, input_ids, attention_mask):
        attention_mask = ~attention_mask.bool()
        output = self.embedding(input_ids)
        output = self.pe(output)
        for layer in self.encoder_layers:
            output = layer(output, src_key_padding_mask = attention_mask)
        
        output = output.mean(axis = 1)
        output = self.lin(output)
        return output

In [22]:
from torch.utils.data import DataLoader, WeightedRandomSampler, Dataset, random_split

class TweetDataset(Dataset):
    def __init__(self, tweets, labels):
        self.x = tweets
        self.y = labels
        
    def __getitem__(self, index):
        # Check that x is really a dictionary before processing
        x = self.x[index]
        x = dict(x)
        x = {key: torch.squeeze(val, dim = 0) for key, val in x.items()}
        y = self.y[index]
        return (x, y)
    
    def __len__(self):
        return len(self.x)
    
folds = 5
early_stopping = 5 #Stop if 5 epochs without improvement on val
train_frac = 0.1
test_frac = 0.8
val_frac = 0.1
batch_size = 64
test_accuracies = []
data = TweetDataset(tokenized_tweets, labels)

#Instantiating model
dropout = 0.1
nhead = 4
output_size = 3
embedding_dim = 128
encoder_layers = 2

model = SentimentTransformer(vocab_size, output_size, embedding_dim, encoder_layers, nhead = nhead, dropout = dropout)


for fold in range(folds):
    print(f"FOLD {fold}")
    gen = torch.Generator().manual_seed(fold)
    train, val, test = random_split(data, lengths=[train_frac, val_frac, test_frac], generator=gen)
    
    #Dealing with imbalanced class weights for train dataset
    labels_for_counts = list(map(lambda x: x[-1], train))
    frequency = 1 / np.bincount(labels_for_counts)
    class_weights = torch.tensor(frequency, dtype=torch.float32)
    obs_weights = list(map(lambda x: class_weights[x[-1]], train))
        
    train_sampler = WeightedRandomSampler(weights = obs_weights, num_samples = len(obs_weights))
    train_loader = DataLoader(train, batch_size=batch_size, sampler = train_sampler) #Test with shuffle instead of sampler, maybe?
    val_loader = DataLoader(val, shuffle=False, batch_size=batch_size)
    test_loader = DataLoader(test, shuffle=False, batch_size=batch_size)
    
    #---- TRAINING ACTUAL MODEL FROM HERE ON OUT ----#
    model = model.to(device)
    model.train()

    lr = 0.001
    epoch = 0
    no_improvement = 0
    curr_acc = 0
    criterion = nn.CrossEntropyLoss() #Without softmax we use CEL
    optimizer = torch.optim.AdamW(model.parameters(), lr = lr)

    while no_improvement < early_stopping:
        epoch += 1
        print(f"Epoch {epoch}")
        
        #Training model layers
        for train_inputs, train_labels in train_loader:
            train_inputs['input_ids'], train_inputs['attention_mask'] = train_inputs['input_ids'].to(device), train_inputs['attention_mask'].to(device)
            train_labels = train_labels.to(device)
            
            model.zero_grad()
            with torch.autocast("mps"):
                output = model(**train_inputs)   
            loss = criterion(output, train_labels)
            loss.backward()
            optimizer.step()
        
        #Early stopping
        model.eval()
        correct = torch.tensor(0, device = device)
        incorrect = torch.tensor(0, device = device)
        
        for val_inputs, val_labels in val_loader:
            val_inputs['input_ids'], val_inputs['attention_mask'] = val_inputs['input_ids'].to(device), val_inputs['attention_mask'].to(device)
            val_labels = val_labels.to(device)
            probs = model(**val_inputs)
            preds = torch.argmax(probs, axis = 1)
            preds = preds.to(device)
            correct += (preds == val_labels).sum()
            incorrect += (preds != val_labels).sum()  
        
        accuracy = correct / (correct + incorrect)
        if accuracy > curr_acc:
            print(f"New accuracy has been reached: {accuracy}")
            curr_acc = accuracy
            no_improvement = 0
        else:
            no_improvement += 1
        
        model.train()
        
    model.eval()
    corrects = torch.zeros(3, device = device)
    incorrects = torch.zeros(3, device = device)
    
    correct = torch.tensor(0, device = device)
    incorrect = torch.tensor(0, device = device)
    
    #Getting test accuracy for CV purposes
    for test_inputs, test_labels in test_loader:
        test_inputs['input_ids'], test_inputs['attention_mask'] = test_inputs['input_ids'].to(device), test_inputs['attention_mask'].to(device)
        test_labels = test_labels.to(device)
        probs = model(**test_inputs)
        preds = torch.argmax(probs, axis = 1)
        preds = preds.to(device)
        correct += (preds == test_labels).sum()
        incorrect += (preds != test_labels).sum()  
        for val in range(3):
            corrects[val] += ((preds == test_labels) & (test_labels == val)).sum() 
            incorrects[val] += ((preds != test_labels) & (test_labels == val)).sum() 
    
    test_accuracy = correct / (correct + incorrect)
    test_accuracy_each = corrects / (corrects + incorrects)
    test_accuracies.append(test_accuracy)
    print(f"FOR FOLD {fold}, THE TEST ACCURACY WAS {test_accuracy}")
    print(f"FOR FOLD {fold}, THE ACCURACIES WERE {test_accuracy_each}")
    print("---------------------------------------")
        



FOLD 0
Epoch 1
New accuracy has been reached: 0.39711111783981323
Epoch 2
New accuracy has been reached: 0.7284444570541382
Epoch 3
New accuracy has been reached: 0.7400000095367432
Epoch 4
Epoch 5
New accuracy has been reached: 0.7876666784286499
Epoch 6
Epoch 7
Epoch 8
Epoch 9
New accuracy has been reached: 0.7888888716697693
Epoch 10
Epoch 11
New accuracy has been reached: 0.8009999990463257
Epoch 12
Epoch 13
Epoch 14
New accuracy has been reached: 0.8033333420753479
Epoch 15
Epoch 16
Epoch 17
New accuracy has been reached: 0.8053333163261414
Epoch 18
New accuracy has been reached: 0.8065555691719055
Epoch 19
New accuracy has been reached: 0.8098888993263245
Epoch 20
New accuracy has been reached: 0.8127777576446533
Epoch 21
Epoch 22
Epoch 23
Epoch 24
Epoch 25
FOR FOLD 0, THE TEST ACCURACY WAS 0.8012916445732117
FOR FOLD 0, THE ACCURACIES WERE tensor([0.6331, 0.8765, 0.4327], device='mps:0')
---------------------------------------
FOLD 1
Epoch 1
New accuracy has been reached: 0.7494

KeyboardInterrupt: 