In [1]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from functools import reduce
from autocorrect import Speller
from transformers import AutoTokenizer
from collections import Counter

tweets = pd.read_csv("clean_COVIDSenti.csv")
initial_word_dict = {}
speller = Speller(fast=True)
for tweet in tweets['tweet']:
    corrected_tweet = speller(tweet)
    for word in corrected_tweet.split(" "):
        initial_word_dict[word] = initial_word_dict.get(word, 0) + 1

vocab = sorted(initial_word_dict.keys(), key = initial_word_dict.get, reverse=True)
vocab_to_int = {word: idx for idx, word in enumerate(vocab, 1)}

tweets_int = []
for tweet in tweets['tweet']:
    corrected_tweet = speller(tweet)
    tweets_int.append([vocab_to_int[word] for word in corrected_tweet.split()])
    
max_length = max(len(x) for x in tweets_int)

def pad_features(reviews_ints, seq_length):
    ''' Return features of review_ints, where each review is padded with 0's 
        or truncated to the input seq_length. I changed this code from the basic code to pad AFTER
    '''
    ## getting the correct rows x cols shape
    features = np.zeros((len(reviews_ints), seq_length), dtype=int)
    
    ## for each review, I grab that review
    for i, row in enumerate(reviews_ints):
      features[i, :len(row)] = np.array(row)[:seq_length]
    
    return features

#Finished preprocessing tweets!
padded_tweets = pad_features(tweets_int, max_length)
sentiments = np.array(tweets['label']) + 1

In [2]:
from torch.utils.data import DataLoader, TensorDataset, WeightedRandomSampler
#Splitting into train, val, and test sets
split_frac = 0.9
test_split_idx = int(len(padded_tweets) * split_frac)
val_split_idx = int(test_split_idx * split_frac)
x_train, y_train = padded_tweets[:val_split_idx], sentiments[:val_split_idx]
x_val, y_val = padded_tweets[val_split_idx:test_split_idx], sentiments[val_split_idx:test_split_idx]
x_test, y_test = padded_tweets[test_split_idx:], sentiments[test_split_idx:]


#Turning into dataloaders
batch_size = 64

#Dealing with imbalanced class weights for train dataset
frequency = 1 / np.bincount(y_train)
class_weights = torch.tensor(frequency, dtype=torch.float32)
obs_weights = []
for val in y_train:
    obs_weights.append(class_weights[val])
obs_weights = torch.tensor(obs_weights)
train_sampler = WeightedRandomSampler(weights = obs_weights, num_samples = len(obs_weights))
train_data = TensorDataset(torch.from_numpy(x_train), torch.from_numpy(y_train))
train_loader = DataLoader(train_data, batch_size=batch_size, sampler = train_sampler)

val_data = TensorDataset(torch.from_numpy(x_val), torch.from_numpy(y_val))
val_loader = DataLoader(val_data, shuffle=False, batch_size=batch_size)
test_data = TensorDataset(torch.from_numpy(x_test), torch.from_numpy(y_test))
test_loader = DataLoader(test_data, shuffle=False, batch_size=batch_size)

#Get one batch for testing
dataiter = iter(train_loader)
sample_tweets, sample_labels = next(dataiter)

In [5]:
#Begin work on actual model
if torch.backends.mps.is_available():
    device = torch.device("mps")
    print("Training on Apple GPU")
elif torch.cuda.is_available():
    device = torch.device("cuda")
    print("Training on CUDA")
else:
    print ("MPS device not found.")

class PositionalEncoding(nn.Module):
    #Class used to encode positions
    def __init__(self, embedding_dim, max_len):
        super(PositionalEncoding, self).__init__()
        
        pe = torch.zeros(max_len, embedding_dim)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, embedding_dim, 2).float() * (-np.log(10000.0) / embedding_dim))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe) #Ensures that this positional encoding isn't updated by the optimizer

    def forward(self, x):
        return x + self.pe

class SentimentTransformer(nn.Module):
    def __init__(self, vocab_size, output_size, embedding_dim, encoder_layers, nhead = 4, dropout = 0.1):
        super().__init__()
        self.vocab_size = vocab_size
        self.output_size = output_size
        self.embedding_dim = embedding_dim
        self.encoder_layers = encoder_layers
        
        self.pe = PositionalEncoding(embedding_dim=embedding_dim, max_len=max_length)
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)
        self.encoder_layers = nn.ModuleList([
            nn.TransformerEncoderLayer(embedding_dim, nhead = nhead, dropout=dropout, batch_first=True)
            for _ in range(encoder_layers)
        ])
        
        self.lin = nn.Linear(embedding_dim, output_size)
        self.softmax = nn.LogSoftmax(dim = 1)

    def forward(self, input):
        mask = (input == 0) #Lets the encoder know which positions are 
        output = self.embedding(input)
        output = self.pe(output)
        for layer in self.encoder_layers:
            output = layer(output, src_key_padding_mask = mask)
        
        output = output.mean(axis = 1)
        output = self.lin(output)
        output = self.softmax(output)
        return output

#Instantiating model
vocab_size = len(vocab_to_int)
dropout = 0.1
nhead = 4
output_size = 3
embedding_dim = 128
encoder_layers = 2

Training on Apple GPU


In [None]:
#Training model
# loss and optimization functions
model = SentimentTransformer(vocab_size, output_size, embedding_dim, encoder_layers, nhead = nhead, dropout = dropout)
lr=0.001
counter = 0

weights = class_weights.to(device)
criterion = nn.NLLLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

#Early stopping
no_improvement = 0
epoch = 0
best_accuracy = 0

model.to(device)
model.train()
while no_improvement < 5:
    epoch += 1
    print(f"Epoch {epoch}")
    counter = 0
    for inputs, labels in train_loader:
        
        counter += 1
        #Pushing inputs to the correct device
        inputs = inputs.to(device)
        labels = labels.to(device)
        
        model.zero_grad()
        output = model(inputs)
        
        #Calculating loss
        loss = criterion(output, labels)
        loss.backward()
        optimizer.step()
        
        """if counter % 100 == 1:
            model.eval()
            number_correct = torch.argmax(output, axis = 1) == labels
            
            correct_0 = number_correct[labels == 0]
            correct_1 = number_correct[labels == 1]
            accuracy = number_correct.sum() / batch_size
            accs = []
            sizes = []
            for val in range(3):
                correct_val = number_correct[labels == val]
                acc = correct_val.sum() / len(correct_val)
                sizes.append(len(correct_val))
                accs.append(acc.item()) 
                
            model.train()
            print(f"Loss at epoch {epoch}, counter {counter}: {loss}, accuracy {accuracy.item()}, array {accs}, sizes {sizes}")"""
    
    correct = torch.tensor(0, device = device)
    incorrect = torch.tensor(0, device = device)
    for inputs, labels in val_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        probs = model(inputs)
        preds = torch.argmax(probs, axis = 1)
        preds = preds.to(device)
        correct += (preds == labels).sum()
        incorrect += (preds != labels).sum()  
    accuracy = correct / (correct + incorrect)
    
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        print("Validation set has new best accuracy", accuracy)
        no_improvement = 0
    else:
        no_improvement += 1  
    

Epoch 1
Validation set has new best accuracy tensor(0.8160, device='mps:0')
Epoch 2
Epoch 3
Epoch 4
Validation set has new best accuracy tensor(0.8491, device='mps:0')
Epoch 5
Epoch 6
Epoch 7
Epoch 8
Validation set has new best accuracy tensor(0.8642, device='mps:0')
Epoch 9
Epoch 10
Epoch 11
Epoch 12


In [10]:
correct = torch.tensor(0, device = device)
incorrect = torch.tensor(0, device = device)
for inputs, labels in test_loader:
    inputs, labels = inputs.to(device), labels.to(device)
    probs = model(inputs)
    preds = torch.argmax(probs, axis = 1)
    preds = preds.to(device)
    correct += (preds == labels).sum()
    incorrect += (preds != labels).sum()
    
print(f"Accuracy: {correct.item() / (correct.item() + incorrect.item())}")

Accuracy: 0.8671111111111112


In [None]:
torch.save(model, "models/SentiTrans.pt")