In [74]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from functools import reduce
from sklearn.utils.class_weight import compute_class_weight
from transformers import DistilBertTokenizer, DistilBertModel, DistilBertForSequenceClassification 
from transformers import BertTokenizer, BertModel
from collections import Counter
import xgboost

dataset = pd.read_csv("clean_COVIDSenti.csv")
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

def tokenize(tweet):
    tokenized = tokenizer(tweet, padding="max_length", max_length = 47)
    return tokenized

tweets, labels = dataset['tweet'], dataset['label'] + 1
tokenized_tweets = tweets.map(tokenize)

In [None]:
#I'll eventually do CV and ES, but for now this is just train-test split!
from torch.utils.data import DataLoader, WeightedRandomSampler, Dataset
#Splitting into train, val, and test sets
split_frac = 0.9
test_split_idx = int(len(tokenized_tweets) * split_frac)
val_split_idx = int(test_split_idx * split_frac)
x_train, y_train = tokenized_tweets[:val_split_idx], labels[:val_split_idx]
x_val, y_val = tokenized_tweets[val_split_idx:test_split_idx], labels[val_split_idx:test_split_idx]
x_test, y_test = tokenized_tweets[test_split_idx:], labels[test_split_idx:]

#Turning into dataloaders
batch_size = 64

class TweetDataset(Dataset):
    def __init__(self, tweets, labels):
        self.x = tweets
        self.y = labels
        
    def __getitem__(self, index):
        x = self.x.iloc[index]
        x = {key: torch.tensor(val) for key, val in x.items()}
        y = self.y.iloc[index]
        return (x, y)
    
    def __len__(self):
        return len(self.x)

#Dealing with imbalanced class weights for train dataset
frequency = 1 / np.bincount(y_train)
class_weights = torch.tensor(frequency, dtype=torch.float32)
obs_weights = []
for val in y_train:
    obs_weights.append(class_weights[val])
obs_weights = torch.tensor(obs_weights)

#Sampling data and creating train/val/test sets
train_sampler = WeightedRandomSampler(weights = obs_weights, num_samples = len(obs_weights))
train_data = TweetDataset(x_train, y_train)
train_loader = DataLoader(train_data, batch_size=batch_size, sampler = train_sampler) #Test with shuffle instead of sampler, maybe?
val_data = TweetDataset(x_val, y_val)
test_data = TweetDataset(x_test, y_test)
val_loader = DataLoader(val_data, shuffle=False, batch_size=batch_size)
test_loader = DataLoader(test_data, shuffle=False, batch_size=batch_size)

In [None]:
#Begin work on actual model
if torch.backends.mps.is_available():
    device = torch.device("mps")
    print("Training on Apple GPU")
elif torch.cuda.is_available():
    device = torch.device("cuda")
    print("Training on CUDA")
else:
    print ("MPS device not found.")


class SentiBERT(nn.Module):
    def __init__(self):
        super().__init__()
        self.bert = DistilBertModel.from_pretrained('distilbert-base-uncased')
        self.softmax = nn.LogSoftmax(dim = 1)
        
    def forward(self, **inputs):
        output = self.bert(**inputs)['last_hidden_state']
        
        output = output.mean(axis = 1)
        output = self.lin1(output)
        output = self.softmax(output)
        return output

Training on Apple GPU


In [99]:
#model = SentiBERT()
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=3)
model = model.to(device)

#I originally tried just training these layers, but training the entire model worked out considerably better
layers = [model.classifier, model.pre_classifier, model.distilbert.transformer.layer[4], model.distilbert.transformer.layer[5]]

for param in model.parameters():
    param.requires_grad = True

model.train()

lr = 0.0001
epoch = 0
no_improvement = 0
curr_acc = 0
criterion = nn.CrossEntropyLoss() #Without softmax we use CEL
optimizer = torch.optim.Adam(model.parameters(), lr = lr)

while no_improvement < 5:
    epoch += 1
    print(f"Epoch {epoch}")
    for inputs, labels in train_loader:
        inputs['input_ids'], inputs['attention_mask'] = inputs['input_ids'].to(device), inputs['attention_mask'].to(device)
        labels = labels.to(device)
        
        model.zero_grad()
        with torch.autocast("mps"):
            output = model(**inputs)['logits']   
        loss = criterion(output, labels)
        loss.backward()
        optimizer.step()
    
    model.eval()
    correct = torch.tensor(0, device = device)
    incorrect = torch.tensor(0, device = device)
    
    for inputs, labels in val_loader:
        inputs['input_ids'], inputs['attention_mask'] = inputs['input_ids'].to(device), inputs['attention_mask'].to(device)
        labels = labels.to(device)
        probs = model(**inputs)['logits']
        preds = torch.argmax(probs, axis = 1)
        preds = preds.to(device)
        correct += (preds == labels).sum()
        incorrect += (preds != labels).sum()  
    
    accuracy = correct / (correct + incorrect)
    if accuracy > curr_acc:
        print(f"New accuracy has been reached: {accuracy}")
        curr_acc = accuracy
        no_improvement = 0
    else:
        no_improvement += 1
    
    model.train()

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1
New accuracy has been reached: 0.8799999952316284
Epoch 2
Epoch 3
Epoch 4
New accuracy has been reached: 0.8893827199935913
Epoch 5
New accuracy has been reached: 0.8928394913673401
Epoch 6
New accuracy has been reached: 0.896172821521759
Epoch 7
Epoch 8
Epoch 9
Epoch 10
Epoch 11


In [101]:
model.eval()
correct = torch.tensor(0, device = device)
incorrect = torch.tensor(0, device = device)

for inputs, labels in test_loader:
    inputs['input_ids'], inputs['attention_mask'] = inputs['input_ids'].to(device), inputs['attention_mask'].to(device)
    labels = labels.to(device)
    probs = model(**inputs)['logits']
    preds = torch.argmax(probs, axis = 1)
    preds = preds.to(device)
    correct += (preds == labels).sum()
    incorrect += (preds != labels).sum()  

accuracy = correct / (correct + incorrect)
print(accuracy)

tensor(0.8919, device='mps:0')
