In [1]:
import numpy as np
import pandas as pd
import pickle


import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from torch.utils.data import TensorDataset, DataLoader
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence


import matplotlib.pyplot as plt
import gc

# Unique to this file
from transformers import AutoTokenizer, AutoModel
from transformers import BertForSequenceClassification
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup

from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score, accuracy_score
from train_test import get_device, compute_accuracy

In [2]:
# Report whether the MPS backend can be used in this environment and whether
# this PyTorch build includes it. To target it, use torch.device("mps").
mps_backend = torch.backends.mps
print(mps_backend.is_available())
print(mps_backend.is_built())

True
True


In [3]:
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification

# Short alias -> Hugging Face checkpoint id. Uncomment an entry to make it
# available under its alias.
model_map = dict(
    # tinybert="huawei-noah/TinyBERT_General_4L_312D",
    # covidbert="digitalepidemiologylab/covid-twitter-bert-v2",
    # distilbert="distilbert-base-uncased",
    # bertweet="vinai/bertweet-base",
    bertweetcovid="vinai/bertweet-covid19-base-uncased",
)

# Training hyper-parameters. Raise or lower batch_size to fit your GPU memory.
batch_size, total_epoch, learning_rate = 16, 10, 1e-5

# The number of entries in the pickled class map is used as the number of
# labels in the dataset.
# NOTE(review): pickle.load executes arbitrary code from the file; only load
# class maps that come from a trusted source.
with open('covid_dataset/5_class_map.pkl', 'rb') as f:
    labels = pickle.load(f)
labels_in_dst = len(labels)


In [4]:
# Load the preprocessed splits used to train the teacher model and reproduce
# the paper result.
import os
cwd = os.getcwd()

# Path template with one remaining '{}' slot for the split name.
dst_path = os.path.join(cwd, 'preprocessed_data/bertweetcovid-{}.dst')

train, val, test, unlabeled = (
    torch.load(dst_path.format(split_name))
    for split_name in ('train', 'val', 'test', '19k')
)

In [None]:
from train_test import train_paper

# Reproduce the paper results; the checkpoint path is passed as model_path.
train_paper(
    train, val, test,
    batch_size, total_epoch, labels_in_dst, learning_rate,
    model_path="bertweetcovid_paper.pth",
)


Some weights of the model checkpoint at vinai/bertweet-covid19-base-uncased were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.bias', 'roberta.pooler.dense.bias', 'lm_head.decoder.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'roberta.pooler.dense.weight', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/bertweet-covid19-base-unc

Epoch: 0


In [None]:
from train_test import test_ST

# Test-split performance of the checkpoint with the best validation weighted
# F1. Both return values are discarded here.
paper_checkpoint = "bertweetcovid_paper.pth"
_, _ = test_ST(paper_checkpoint, test, labels_in_dst, batch_size)

In [None]:
from train_test import test_ST

# Validation-split performance of the checkpoint with the best validation
# weighted F1. Both return values are discarded here.
paper_checkpoint = "bertweetcovid_paper.pth"
_, _ = test_ST(paper_checkpoint, val, labels_in_dst, batch_size)

In [None]:
def _evaluate(model, loader, device):
    """Run `model` over `loader` without gradients and gather logits and labels.

    Every batch from `loader` is unpacked as (input_ids, attention_mask, labels).

    Returns:
        (logits, labels): both concatenated over all batches and moved to CPU.
    """
    model.eval()
    batch_logits, batch_labels = [], []
    with torch.no_grad():
        for input_ids, attention_mask, labels in loader:
            output = model(input_ids.to(device),
                           attention_mask=attention_mask.to(device))
            batch_logits.append(output.logits.detach().cpu())
            batch_labels.append(labels.detach().cpu())
    return torch.cat(batch_logits), torch.cat(batch_labels)


def train_teacher(train, val, test, batch_size, total_epoch, labels_in_dst, learning_rate, self_train, model_path):
    """Fine-tune the `bertweetcovid` checkpoint, optionally with per-example loss weights.

    Args:
        train: dataset whose items unpack to
            (input_ids, attention_mask, labels, sample_weight).
        val, test: datasets whose items unpack to
            (input_ids, attention_mask, labels).
        batch_size: batch size for all three data loaders.
        total_epoch: number of training epochs.
        labels_in_dst: number of target classes.
        learning_rate: learning rate handed to AdamW.
        self_train: if False, use plain mean cross-entropy. If True, scale
            every example's cross-entropy by its `sample_weight` before
            averaging.
        model_path: where the checkpoint with the best validation F1
            (last value returned by `compute_accuracy`) is written.

    Side effects:
        Prints the per-epoch training loss and validation metrics, saves the
        best checkpoint to `model_path`, and prints the metrics of the final
        (last-epoch) model on `test` once training has finished.
    """
    trainloader = DataLoader(train, shuffle=True, batch_size=batch_size)
    valloader = DataLoader(val, shuffle=False, batch_size=batch_size)
    testloader = DataLoader(test, shuffle=False, batch_size=batch_size)

    device = get_device()

    model = AutoModelForSequenceClassification.from_pretrained(
        model_map['bertweetcovid'],
        num_labels=labels_in_dst,
        return_dict=True,
    )
    model = model.to(device)
    gc.collect()

    optimizer = AdamW(model.parameters(), lr=learning_rate)

    sum_loss = []   # mean training loss of every finished epoch
    best_f1 = -1    # best validation F1 seen so far (-1 => nothing saved yet)

    for epoch in range(total_epoch):
        print('Epoch:', epoch)
        model.train()
        train_loss = []

        for input_ids, attention_mask, labels, sample_weight in trainloader:
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            labels = labels.to(device)

            optimizer.zero_grad()
            logits = model(input_ids, attention_mask=attention_mask).logits
            if not self_train:
                loss = F.cross_entropy(logits, labels)
            else:
                # Keep one loss value per example so that each can be scaled
                # by its own weight before the batch is averaged.
                per_example_loss = F.cross_entropy(
                    logits.view(-1, labels_in_dst),
                    labels.view(-1),
                    reduction='none',
                )
                loss = torch.mean(per_example_loss * sample_weight.to(device))

            loss.backward()
            optimizer.step()
            train_loss.append(loss.item())

        # Every entry of train_loss is already a batch mean, so average over
        # the number of batches (not the number of examples).
        sum_loss.append(sum(train_loss) / max(len(train_loss), 1))
        print('Loss: {:.4f}'.format(sum_loss[-1]))

        # Validation: the last value returned by compute_accuracy is used for
        # model selection (the weighted F1, per the original author's note).
        val_logits, val_labels = _evaluate(model, valloader, device)
        acc, precision, recall, f1_macro, val_f1 = compute_accuracy(val_logits, val_labels)
        print("validation performance: ", acc, precision, recall, f1_macro, val_f1)

        # Save the best model seen so far.
        if val_f1 > best_f1:
            best_f1 = val_f1
            torch.save({
                        'epoch': epoch,
                        'model_state_dict': model.state_dict(),
                        'optimizer_state_dict': optimizer.state_dict(),
                        'best_f1': best_f1
                        }, model_path)

    # Test the final (last-epoch) model once, after all epochs have run.
    test_logits, test_labels = _evaluate(model, testloader, device)
    acc, precision, recall, f1_macro, f1_weighted = compute_accuracy(test_logits, test_labels)
    print("test model performance after all epochs: ", acc, precision, recall, f1_macro, f1_weighted)

#     last_epoch_model_path = "hardlabel_lastepoch_" + model_path
#     torch.save({
#                 'epoch': epoch,
#                 'model_state_dict': model.state_dict(),
#                 'optimizer_state_dict': optimizer.state_dict(),
#                 'best_f1': best_f1
#                 }, last_epoch_model_path)


In [None]:
# Load the splits used to train the teacher model for self-training.
# train_st.dst is a TensorDataset that additionally assigns every tweet a
# weight of 1; that is its only difference from train.dst.

# Path template with one remaining '{}' slot for the split name.
dst_path = os.path.join(cwd, 'preprocessed_data/bertweetcovid-{}.dst')

train, val, test, unlabeled = (
    torch.load(dst_path.format(split_name))
    for split_name in ('train_st', 'val', 'test', '19k')
)

In [None]:
# Run the paper checkpoint over the unlabeled pool and keep both values that
# test_ST returns.
pred_labels, pred_probs = test_ST(
    "bertweetcovid_paper.pth",
    unlabeled,
    labels_in_dst,
    batch_size=batch_size,
)

In [None]:
# Display the second value returned by test_ST for the unlabeled pool
# (presumably per-class probabilities - confirm against train_test.test_ST).
pred_probs

In [None]:
# Cleaned unlabeled tweets; print the row count as a quick sanity check.
df_19k = pd.read_csv('covid_dataset/splits/19k_unlabeled.csv_cleaned.csv')
print(df_19k.shape[0])

In [None]:
# df_temp = pd.DataFrame(pred_probs, columns=['class0', 'class1', 'class2', 'class3', 'class4'])
# print(len(df_temp))

# df_19k = pd.concat([df_19k, df_temp], axis=1)
# print(df_19k.columns)
# df_19k.to_csv('covid_dataset/splits/19k_bertweetcovid_probs.csv')

In [None]:
# df_19k_probs = pd.read_csv('covid_dataset/splits/19k_bertweetcovid_probs.csv')
# print(df_19k.columns)


In [None]:
# The teacher's class scores for the unlabeled tweets are cached in a CSV so
# that inference does not have to be repeated. If the cache is missing (e.g.
# on a fresh checkout), build it from the in-memory `df_19k` (unlabeled
# tweets) and `pred_probs` (output of test_ST) instead of failing.
probs_csv = 'covid_dataset/splits/19k_bertweetcovid_probs.csv'
prob_columns = ['class0', 'class1', 'class2', 'class3', 'class4']

try:
    df_19k = pd.read_csv(probs_csv)
except FileNotFoundError:
    assert len(df_19k) == len(pred_probs), (
        "df_19k and pred_probs must describe the same tweets; "
        "re-run the cells that load the unlabeled CSV and call test_ST"
    )
    pd.concat(
        [df_19k, pd.DataFrame(pred_probs, columns=prob_columns)],
        axis=1,
    ).to_csv(probs_csv)
    # Read the file back so both code paths yield the same frame layout.
    df_19k = pd.read_csv(probs_csv)

pred_probs = df_19k[prob_columns].to_numpy()
pred_probs

In [None]:
# Hard-label the unlabeled tweets the teacher scores highest: for every class
# take the TOP_K_PER_CLASS rows with the highest score for that class.
# Pandas is used for the selection; selecting directly from the TensorDataset
# might avoid re-cleaning and re-tokenizing the chosen tweets.
TOP_K_PER_CLASS = 500

# This cell shrinks df_19k, so running it twice would misalign df_19k with
# pred_probs and silently pick the wrong tweets.
assert len(df_19k) == len(pred_probs), (
    "df_19k is out of sync with pred_probs; re-run the cell that loads "
    "19k_bertweetcovid_probs.csv before running this cell again"
)

row_chunks, label_chunks = [], []
for class_id in range(labels_in_dst):
    top_rows = pred_probs[:, class_id].argsort()[-TOP_K_PER_CLASS:]
    row_chunks.append(top_rows)
    label_chunks.append(np.full(len(top_rows), class_id))

candidates = pd.DataFrame({
    'row': np.concatenate(row_chunks),
    'Label': np.concatenate(label_chunks),
})
candidates['confidence'] = pred_probs[candidates['row'].to_numpy(),
                                      candidates['Label'].to_numpy()]
print(len(candidates))

# A tweet can rank in the top K of more than one class. Keep it once, with
# the class it scores highest for, so the training set never contains the
# same tweet under conflicting labels. sort_index() restores class order.
best_per_row = candidates.groupby('row')['confidence'].idxmax()
selected = candidates.loc[best_per_row].sort_index()

top500_index = selected['row'].to_numpy()
print(len(top500_index))

# Positional indexing throughout, so the result does not depend on df_19k
# having a default RangeIndex.
df_unlabeled_selected = df_19k.iloc[top500_index].copy()
df_unlabeled_selected['Label'] = selected['Label'].to_numpy()
pred_probs_top500 = pred_probs[top500_index]

print(len(df_19k))
df_19k = df_19k.drop(index=df_19k.index[top500_index]).reset_index(drop=True)
print(len(df_19k))
print(pred_probs_top500.shape)
                                    
    

In [None]:
# Alias -> (Hugging Face checkpoint id, sequence length). The second element
# is later passed to the tokenizer as max_length.
models = dict(
    # tinybert=("huawei-noah/TinyBERT_General_4L_312D", 512),
    # covidbert=("digitalepidemiologylab/covid-twitter-bert-v2", 512),
    # distilbert=("distilbert-base-uncased", 512),
    # bertweet=("vinai/bertweet-base", 130),
    bertweetcovid=("vinai/bertweet-covid19-base-uncased", 130),
)

# Texts and hard labels of the selected (pseudo-labeled) tweets.
data_x = df_unlabeled_selected['clean_tweet'].tolist()
data_y = df_unlabeled_selected['Label'].tolist()

# Texts and labels of the original labeled training split.
df_train = pd.read_csv('covid_dataset/splits/train.csv_cleaned.csv')
x_train, y_train = df_train['clean_tweet'].tolist(), df_train['Label'].tolist()

print(len(data_x))
print(len(x_train), len(y_train))





In [None]:
# Augmented training set: the labeled tweets followed by the pseudo-labeled ones.
new_train_x = x_train + data_x
new_train_y = y_train + data_y
print(len(new_train_x), len(new_train_y))

# Tokenize with every configured model; new_train_dataset ends up holding the
# dataset for the last entry of `models`.
for model_type, (model_name, sequence_length) in models.items():
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False, normalization=True)

    tokenized_x = tokenizer(
        new_train_x,
        padding=True,
        truncation=True,
        max_length=sequence_length,
        return_tensors='pt',
    )
    x_input_ids, x_attention_mask = tokenized_x['input_ids'], tokenized_x['attention_mask']
    labels = torch.tensor(new_train_y, dtype=torch.long)

    # Every example, including the hard-labeled unlabeled tweets, gets weight 1.
    sample_weights = torch.ones(len(new_train_y), dtype=torch.float32)

    new_train_dataset = TensorDataset(x_input_ids, x_attention_mask, labels, sample_weights)
    
    

In [None]:
# Train five teachers on the augmented set; each run writes its best
# checkpoint to a file prefixed with the run index.
batch_size, total_epoch = 16, 10
for i in range(5):
    train_teacher(
        new_train_dataset, val, test,
        batch_size, total_epoch, labels_in_dst, learning_rate,
        self_train=False,
        model_path=f"{i}bertweetcovid_ST_1iter_500each16batch10epochs.pth",
    )