In [1]:
from google.colab import drive
drive.mount('/content/drive')
# some_file.py
%cd drive/My\ Drive/CS224u_Final_Project
!pwd

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/My Drive/CS224u_Final_Project
/content/drive/My Drive/CS224u_Final_Project


In [2]:
%load_ext autoreload
%autoreload 2
!pip3 install transformers



In [0]:
import transformers
from transformers import RobertaModel, RobertaTokenizer, AdamW, get_linear_schedule_with_warmup, RobertaConfig
import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
import os
from dataset_loader import COVID19TweetDataset
from final_classifier import FinalClassifier
from tqdm import tqdm
from sklearn.metrics import f1_score, precision_score, recall_score
from torch.utils.tensorboard import SummaryWriter
from pathlib import Path
import random

In [4]:
# TODO: Add tensorboard logging 
writer = SummaryWriter()

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print("Using Device: {}".format(device))
# Creating dataset
DATA_DIR = './data'
dataset_path_train = os.path.join(DATA_DIR, 'train.tsv')
dataset_path_test = os.path.join(DATA_DIR, 'test.tsv')
dataset_path_val = os.path.join(DATA_DIR, 'val.tsv')

tweets_train = pd.read_csv(dataset_path_train, sep='\t')
tweets_test = pd.read_csv(dataset_path_test, sep='\t')
tweets_val = pd.read_csv(dataset_path_val, sep='\t')

PRE_TRAINED_MODEL_NAME = 'roberta-base'
MAX_LEN = 200
BATCH_SIZE = 32
RANDOM_SEED = 42

tokenizer = RobertaTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)

Using Device: cuda:0


In [0]:
# Changing dataframes into dictionary of COVID19TweetDataset objects
def create_data_loader(data_set, tokenizer, max_len, batch_size):
    temp_data_set = COVID19TweetDataset(data_set['text.clean'].to_numpy(), data_set['expert'].to_numpy(),
                                    data_set['id'].to_numpy(), max_len, tokenizer)

    return DataLoader(temp_data_set, batch_size=batch_size)

# Creating data loaders
data_loader = {
                'train': create_data_loader(tweets_train, tokenizer, MAX_LEN, BATCH_SIZE),
                'val': create_data_loader(tweets_val, tokenizer, MAX_LEN, BATCH_SIZE),
                'test': create_data_loader(tweets_test, tokenizer, MAX_LEN, BATCH_SIZE)
            }


In [0]:
# Initializing model
roberta_config = RobertaConfig.from_pretrained(PRE_TRAINED_MODEL_NAME, output_hidden_states=True)
roberta_model = RobertaModel.from_pretrained(PRE_TRAINED_MODEL_NAME, config=roberta_config)

model = FinalClassifier(2, roberta_model)
model = model.to(device)

NUM_EPOCHS = 20
optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=0,
                                            num_training_steps=len(data_loader['train']) * NUM_EPOCHS)

class_weights = torch.FloatTensor([1.0, 2.088]).cuda()
criterion = nn.CrossEntropyLoss(weight=class_weights).to(device)

In [0]:
# Train loop
def train(num_epochs, model):
    history = []
    best_val_f1 = float('-inf')
    label_history = []
    tensorboard_time_train = 0
    tensorboard_time_val = 0

    for epoch in range(num_epochs):
        # Training
        model.train()
        train_loss_arr = []
        train_predicted_labels = []
        train_actual_labels = []
        train_tweet_ids = []

        for step, batch in enumerate(tqdm(data_loader['train'])):
            tensorboard_time_train += 1
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            tweet_ids = batch['tweet_ids']

            outputs = model(input_ids, attention_mask)
            _, predictions = torch.max(outputs, dim=1)
            loss = criterion(outputs, labels)
            train_actual_labels += list(labels.detach().cpu().view(-1).numpy())
            train_predicted_labels += list(predictions.detach().cpu().view(-1).numpy())
            train_tweet_ids += tweet_ids
            train_loss_arr.append(loss.item())

            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()
            if step % 10 == 0:
                print(f'Train Epoch = {epoch}, Step = {step}, Train Loss = {np.mean(train_loss_arr)}')
            
            writer.add_scalar('train_loss', np.mean(train_loss_arr), tensorboard_time_train)

        train_f1_score = f1_score(np.array(train_actual_labels), np.array(train_predicted_labels))
        train_acc = np.sum(np.array(train_actual_labels) == np.array(train_predicted_labels)) / len(train_set)
        writer.add_scalar('train_f1_score', train_f1_score, epoch)

        # Validation 
        model.eval()
        val_loss_arr = []
        val_predicted_labels = []
        val_actual_labels = []
        val_tweet_ids = []
        with torch.no_grad():
            for step, batch in enumerate(tqdm(data_loader['val'])):
                tensorboard_time_val += 1
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device)
                tweet_ids = batch['tweet_ids']

                outputs = model(input_ids, attention_mask)
                _, predictions = torch.max(outputs, dim=1)
                loss = criterion(outputs, labels)
                val_actual_labels += list(labels.detach().cpu().view(-1).numpy())
                val_predicted_labels += list(predictions.detach().cpu().view(-1).numpy())
                val_tweet_ids += tweet_ids

                val_loss_arr.append(loss.item())
                if step % 10 == 0:
                    print(f'Val Epoch = {epoch}, Step = {step}, Val Loss = {np.mean(val_loss_arr)}')
                
                writer.add_scalar('val_loss', np.mean(val_loss_arr), tensorboard_time_val)

            val_f1_score = f1_score(np.array(val_actual_labels), np.array(val_predicted_labels))
            val_acc = np.sum(np.array(val_actual_labels) == np.array(val_predicted_labels)) / len(val_set)
            writer.add_scalar('val_f1_score', val_f1_score, epoch)
        
        # If we get better validation f1, save the labels/tweet ids for error analysis
        if val_f1_score > best_val_f1:
            best_val_f1 = val_f1_score
            np.save(os.path.join(DATA_DIR, 'label_history.npy'), list(zip(val_actual_labels, 
                                                            val_predicted_labels, val_tweet_ids)))
            save(model, epoch, optimizer, np.mean(val_loss_arr), model_prefix='roberta_linear_baseline_model_weighted_loss')
        
        print(f'Epoch {epoch}')
        print('-' * 20)
        print(f'Train Loss = {np.mean(train_loss_arr)}, F-1 Score = {train_f1_score}, Acc = {train_acc}')
        print(f'Val Loss = {np.mean(val_loss_arr)}, F-1 Score = {val_f1_score}, Acc = {val_acc}')

        # Save history
        history.append([np.mean(train_loss_arr), np.mean(val_loss_arr), train_f1_score, val_f1_score])
        np.save(os.path.join(DATA_DIR, 'history.npy'), history)
        print("Best F-1 score on validation dataset is {}".format(best_val_f1))


def save(model, epoch, optimizer, loss, model_prefix='model_', root='/content/drive/My Drive/CS224u_Final_Project/.model'):
    path = Path(root) / (model_prefix + '.ep%d' % epoch)
    if not path.parent.exists():
        path.parent.mkdir()

    torch.save({'epoch': epoch, 'state_dict': model.state_dict(), 'optimizer': optimizer.state_dict(), 'loss': loss}, path)

In [8]:
# Call Train/Val Loop
print("Begin Training!")
train(NUM_EPOCHS, model)

  0%|          | 0/500 [00:00<?, ?it/s]

Begin Training!


  out = self.softmax(out)                          # BATCH_SIZE x 2
  0%|          | 1/500 [00:00<08:10,  1.02it/s]

Train Epoch = 0, Step = 0, Train Loss = 0.6860438585281372


  2%|▏         | 11/500 [00:07<05:28,  1.49it/s]

Train Epoch = 0, Step = 10, Train Loss = 0.6961346214467828


  4%|▍         | 21/500 [00:14<05:17,  1.51it/s]

Train Epoch = 0, Step = 20, Train Loss = 0.6641499201456705


  6%|▌         | 31/500 [00:20<05:12,  1.50it/s]

Train Epoch = 0, Step = 30, Train Loss = 0.6279888979850277


  8%|▊         | 41/500 [00:27<05:01,  1.52it/s]

Train Epoch = 0, Step = 40, Train Loss = 0.6194205981929127


 10%|█         | 51/500 [00:34<04:57,  1.51it/s]

Train Epoch = 0, Step = 50, Train Loss = 0.609858958160176


 12%|█▏        | 61/500 [00:40<04:53,  1.50it/s]

Train Epoch = 0, Step = 60, Train Loss = 0.5962320937484992


 14%|█▍        | 71/500 [00:47<04:40,  1.53it/s]

Train Epoch = 0, Step = 70, Train Loss = 0.581903767837605


 16%|█▌        | 81/500 [00:53<04:33,  1.53it/s]

Train Epoch = 0, Step = 80, Train Loss = 0.5825113039693715


 18%|█▊        | 91/500 [01:00<04:29,  1.52it/s]

Train Epoch = 0, Step = 90, Train Loss = 0.5799023155327682


 20%|██        | 101/500 [01:06<04:25,  1.50it/s]

Train Epoch = 0, Step = 100, Train Loss = 0.5752248507325012


 22%|██▏       | 111/500 [01:13<04:14,  1.53it/s]

Train Epoch = 0, Step = 110, Train Loss = 0.5690807047727946


 24%|██▍       | 121/500 [01:20<04:08,  1.53it/s]

Train Epoch = 0, Step = 120, Train Loss = 0.5650300132341621


 26%|██▌       | 131/500 [01:26<04:03,  1.52it/s]

Train Epoch = 0, Step = 130, Train Loss = 0.5591579775773842


 28%|██▊       | 141/500 [01:33<03:55,  1.53it/s]

Train Epoch = 0, Step = 140, Train Loss = 0.5557717452657983


 30%|███       | 151/500 [01:39<03:48,  1.53it/s]

Train Epoch = 0, Step = 150, Train Loss = 0.5534572279611171


 32%|███▏      | 161/500 [01:46<03:41,  1.53it/s]

Train Epoch = 0, Step = 160, Train Loss = 0.5498977668166901


 34%|███▍      | 171/500 [01:52<03:34,  1.53it/s]

Train Epoch = 0, Step = 170, Train Loss = 0.5477551162591454


 36%|███▌      | 181/500 [01:59<03:28,  1.53it/s]

Train Epoch = 0, Step = 180, Train Loss = 0.5455635588472061


 38%|███▊      | 191/500 [02:05<03:23,  1.52it/s]

Train Epoch = 0, Step = 190, Train Loss = 0.541462921971426


 40%|████      | 201/500 [02:12<03:15,  1.53it/s]

Train Epoch = 0, Step = 200, Train Loss = 0.5405542923146812


 42%|████▏     | 211/500 [02:19<03:11,  1.51it/s]

Train Epoch = 0, Step = 210, Train Loss = 0.5376183677341135


 44%|████▍     | 221/500 [02:25<03:03,  1.52it/s]

Train Epoch = 0, Step = 220, Train Loss = 0.5361383577547462


 46%|████▌     | 231/500 [02:32<02:58,  1.51it/s]

Train Epoch = 0, Step = 230, Train Loss = 0.5325822623777183


 48%|████▊     | 241/500 [02:38<02:52,  1.50it/s]

Train Epoch = 0, Step = 240, Train Loss = 0.5295129133952604


 50%|█████     | 251/500 [02:45<02:44,  1.51it/s]

Train Epoch = 0, Step = 250, Train Loss = 0.5263562492165432


 52%|█████▏    | 261/500 [02:51<02:36,  1.53it/s]

Train Epoch = 0, Step = 260, Train Loss = 0.5235731767283546


 54%|█████▍    | 271/500 [02:58<02:28,  1.54it/s]

Train Epoch = 0, Step = 270, Train Loss = 0.5212473877021747


 56%|█████▌    | 281/500 [03:05<02:22,  1.54it/s]

Train Epoch = 0, Step = 280, Train Loss = 0.5189556102947832


 58%|█████▊    | 291/500 [03:11<02:16,  1.54it/s]

Train Epoch = 0, Step = 290, Train Loss = 0.5171980176799486


 60%|██████    | 301/500 [03:18<02:10,  1.53it/s]

Train Epoch = 0, Step = 300, Train Loss = 0.515544342143195


 62%|██████▏   | 311/500 [03:24<02:04,  1.51it/s]

Train Epoch = 0, Step = 310, Train Loss = 0.5143679643942228


 64%|██████▍   | 321/500 [03:31<01:57,  1.53it/s]

Train Epoch = 0, Step = 320, Train Loss = 0.5127807017241683


 66%|██████▌   | 331/500 [03:37<01:51,  1.52it/s]

Train Epoch = 0, Step = 330, Train Loss = 0.5112255546081462


 68%|██████▊   | 341/500 [03:44<01:44,  1.52it/s]

Train Epoch = 0, Step = 340, Train Loss = 0.5110085872261405


 70%|███████   | 351/500 [03:50<01:36,  1.54it/s]

Train Epoch = 0, Step = 350, Train Loss = 0.5104842759095706


 72%|███████▏  | 361/500 [03:57<01:30,  1.53it/s]

Train Epoch = 0, Step = 360, Train Loss = 0.5093997088660824


 74%|███████▍  | 371/500 [04:03<01:24,  1.53it/s]

Train Epoch = 0, Step = 370, Train Loss = 0.5082359979898139


 76%|███████▌  | 381/500 [04:10<01:18,  1.52it/s]

Train Epoch = 0, Step = 380, Train Loss = 0.5064343977475104


 78%|███████▊  | 391/500 [04:17<01:11,  1.53it/s]

Train Epoch = 0, Step = 390, Train Loss = 0.504951516838025


 80%|████████  | 401/500 [04:23<01:05,  1.52it/s]

Train Epoch = 0, Step = 400, Train Loss = 0.5049975887498356


 82%|████████▏ | 411/500 [04:30<00:57,  1.54it/s]

Train Epoch = 0, Step = 410, Train Loss = 0.5051363802304233


 84%|████████▍ | 421/500 [04:36<00:51,  1.52it/s]

Train Epoch = 0, Step = 420, Train Loss = 0.5045444642563047


 86%|████████▌ | 431/500 [04:43<00:45,  1.52it/s]

Train Epoch = 0, Step = 430, Train Loss = 0.5044720318782081


 88%|████████▊ | 441/500 [04:49<00:38,  1.53it/s]

Train Epoch = 0, Step = 440, Train Loss = 0.5055789061549569


 90%|█████████ | 451/500 [04:56<00:32,  1.53it/s]

Train Epoch = 0, Step = 450, Train Loss = 0.5048433252158028


 92%|█████████▏| 461/500 [05:02<00:25,  1.55it/s]

Train Epoch = 0, Step = 460, Train Loss = 0.5034671441981177


 94%|█████████▍| 471/500 [05:09<00:18,  1.53it/s]

Train Epoch = 0, Step = 470, Train Loss = 0.5029557168863381


 96%|█████████▌| 481/500 [05:15<00:12,  1.53it/s]

Train Epoch = 0, Step = 480, Train Loss = 0.5021342729952132


 98%|█████████▊| 491/500 [05:22<00:05,  1.54it/s]

Train Epoch = 0, Step = 490, Train Loss = 0.5007279759876111


100%|██████████| 500/500 [05:28<00:00,  1.52it/s]


NameError: ignored