In [3]:
import os
import pickle
import numpy as np
import pandas as pd



import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from torch.utils.data import TensorDataset, DataLoader, Subset
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence


import matplotlib.pyplot as plt
import gc

# Unique to this file
from transformers import AutoTokenizer, AutoModel
from transformers import BertForSequenceClassification
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup

from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score, accuracy_score


from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification

from train_test import get_device, compute_accuracy


# Depending on your GPU you can either increase or decrease this value
batch_size = 16
total_epoch = 10
learning_rate = 1e-5
iter_num = 1

# Find out how many labels are in the dataset
with open('covid_dataset/5_class_map.pkl','rb') as f:
    labels = pickle.load(f)
labels_in_dst = len(labels)



model_map= {
    #"tinybert": "huawei-noah/TinyBERT_General_4L_312D",
    #"covidbert": "digitalepidemiologylab/covid-twitter-bert-v2",
    #"distilbert": "distilbert-base-uncased",
    #"bertweet": "vinai/bertweet-base",
    "bertweetcovid": "vinai/bertweet-covid19-base-uncased"
}
# train teacher model for self training
# the train_st.dst is TensorDataset that assign each tweet a weight 1, that's the only difference with train.dst

cwd = os.getcwd()

dst_path = 'preprocessed_data/{}.dst'
dst_path = os.path.join(cwd,'preprocessed_data/{}-{{}}.dst'.format("bertweetcovid"))

train = torch.load(dst_path.format('train'))
val = torch.load(dst_path.format('val'))
test = torch.load(dst_path.format('test'))
unlabeled = torch.load(dst_path.format('19k'))




In [4]:
input_ids, masks, _ = zip(*unlabeled)

input_ids = torch.stack(input_ids)
masks = torch.stack(masks)

df_19k = pd.read_csv('covid_dataset/splits/19k_bertweetcovid_probs.csv')

pred_probs = df_19k[['class0', 'class1', 'class2', 'class3', 'class4']].to_numpy()

paper_pred_prob = torch.tensor(pred_probs, dtype=torch.float32)

unlabeled_prob = TensorDataset(input_ids, masks, paper_pred_prob )
print(unlabeled_prob.__len__())

19591


In [5]:
# cross entropy loss for two probability distribution
# https://discuss.pytorch.org/t/how-should-i-implement-cross-entropy-loss-with-continuous-target-outputs/10720/18
def cross_entropy(pred, soft_targets):
    logsoftmax = nn.LogSoftmax(dim=1)
    return torch.mean(torch.sum(- soft_targets * logsoftmax(pred), 1))

#assuming pred and soft_targets are both Variables with shape (batchsize, num_of_classes), 
#each row of pred is predicted logits and each row of soft_targets is a discrete distribution.


# class CrossEntropyLossForSoftTarget(nn.Module):
#     def __init__(self, T=20):
#         super(CrossEntropyLossForSoftTarget, self).__init__()
#         self.T = T
#         self.softmax = nn.Softmax(dim=-1)
#         self.logsoftmax = nn.LogSoftmax(dim=-1)
#     def forward(self, y_pred, y_gt):
#         y_pred_soft = y_pred.div(self.T)
#         y_gt_soft = y_gt.div(self.T)
#         return -(y_gt_soft)*self.logsoftmax(y_pred_soft).mean().mul(self.T*self.T)

class CrossEntropyLossForSoftTarget(nn.Module):
    '''
    y_pred: logits from student model
    y_gt:   logits from teacher model
    '''
    def __init__(self, T=20):
        super(CrossEntropyLossForSoftTarget, self).__init__()
        self.T = T
        self.softmax = nn.Softmax(dim=-1)
        self.logsoftmax = nn.LogSoftmax(dim=1)
    def forward(self, y_pred, y_gt):
        y_gt_soft = y_gt.div(self.T)
        y_pred_soft = y_pred.div(self.T)
#         return (-(y_gt_soft)*self.logsoftmax(y_pred_soft)).mean().mul(self.T*self.T)
        return torch.mean(torch.sum(- self.softmax(y_gt_soft) * self.logsoftmax(y_pred_soft), 1)).mul(self.T*self.T)

In [14]:
import time
import datetime
def format_time(elapsed):
    '''
    Takes a time in seconds and returns a string hh:mm:ss
    '''
    # Round to the nearest second.
    elapsed_rounded = int(round((elapsed)))

    # Format as hh:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))

# Function to calculate the accuracy of our predictions vs labels
def train_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

In [22]:
def train_distill_student(teacher_path, student_path, train, unlabel, val, test, batch_size, batch_size_unlabel, total_epoch, labels_in_dst, learning_rate, T=10, unlabel_iter=None):
    global model_map

    trainloader = DataLoader(train, shuffle=True, batch_size=batch_size)
    valloader = DataLoader(val, shuffle=False, batch_size=batch_size)
    unlabel_loader = DataLoader(unlabel, shuffle=True, batch_size=batch_size_unlabel)
    
    
    
    device = get_device()
    
    teacher = AutoModelForSequenceClassification.from_pretrained(model_map['bertweetcovid'], 
                                                          num_labels=labels_in_dst,
                                                          return_dict=True)
    
    checkpoint = torch.load(teacher_path)
    teacher.load_state_dict(checkpoint['model_state_dict'])
    teacher = teacher.to(device)
    teacher.eval()
    

    model = AutoModelForSequenceClassification.from_pretrained(model_map['bertweetcovid'], 
                                                          num_labels=labels_in_dst,
                                                          return_dict=True)
    
#     checkpoint = torch.load(student_path)
#     model.load_state_dict(checkpoint['model_state_dict'])
    
    save_model_path = "student_" + student_path
    model = model.to(device)
    gc.collect()

    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

    sum_loss = []
    sum_val = []
    
    val_f1_average = []
    
    

    for epoch in range(0, total_epoch):
        print('Epoch:', epoch)
        
        train_loss = []
        soft_all_loss = []
        valid_loss = []
        
        t0 = time.time()
        model.train()
        

        get_next_fnt = iter(unlabel_loader) # get one mini batch at a time
        unlabel_num_batchs = len(unlabel) // batch_size_unlabel + 1
        print(unlabel_num_batchs)
        
        step = 0
        total_train_accuracy = 0
        ## miniminze loss on labeled train per minibatch
        for input_ids, attention_mask, labels in trainloader:
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            labels = labels.to(device)
            
            optimizer.zero_grad()
            output1 = model(input_ids, attention_mask=attention_mask)
            logits = output1.logits
            loss = F.cross_entropy(logits, labels)
            
            loss.backward()
            optimizer.step()
            train_loss.append(loss.item())

            
            # Move logits and labels to CPU
            logits = logits.detach().cpu().numpy()
            label_ids = labels.to('cpu').numpy()

            # Calculate the accuracy for this batch of test sentences, and
            # accumulate it over all batches.
            b_accu = train_accuracy(logits, label_ids)
            total_train_accuracy += b_accu
            
        
            # minimize soft loss on unlabeled per mini batch,batch size is train batch size * 2
            unlabel_input_ids, unlabel_attention_mask, unlabel_labels = next(get_next_fnt)
            unlabel_input_ids = unlabel_input_ids.to(device)
            unlabel_attention_mask = unlabel_attention_mask.to(device)
            unlabel_labels = unlabel_labels.to(device) # teacher predicted probabilities
            
            optimizer.zero_grad()
            output2 = model(unlabel_input_ids, attention_mask=unlabel_attention_mask)

            logits2 = output2.logits
            
            teacher_output = teacher(unlabel_input_ids, attention_mask=unlabel_attention_mask)

            loss_fnt = CrossEntropyLossForSoftTarget(T) # T temprature, to smooth probabilities

            soft_loss = loss_fnt(logits2, teacher_output.logits)


            soft_loss.backward()
            optimizer.step()
            soft_all_loss.append(soft_loss.item())
            
            step += 1

            # Progress update every 40 batches.
            if step % 40 == 0 and not step == 0:
                # Calculate elapsed time in minutes.
                elapsed = format_time(time.time() - t0)

                # Report progress.
                print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(trainloader), elapsed))

        # Report the final accuracy for this  run.
        avg_train_accuracy = total_train_accuracy / len(trainloader)
        print("Train Accuracy: {0:.4f}".format(avg_train_accuracy))
        
        
        # continue minimize soft loss on all rest of unlabeled
        unlabel_remain_batchs = unlabel_num_batchs - step
        
        # None is default, will use all unlabeled data, 
        # if 100 us passed then only 100 additional mini-batches of unlabeled data will be used.
        if unlabel_iter is not None: 
            unlabel_remain_batchs = unlabel_iter
            
        
        for i in range(unlabel_remain_batchs):
            try:
                unlabel_input_ids, unlabel_attention_mask, unlabel_labels = next(get_next_fnt)
            except StopIteration:
                get_next_fnt = iter(unlabel_loader) # get one mini batch at a time
                unlabel_input_ids, unlabel_attention_mask, unlabel_labels = next(get_next_fnt)
            
            unlabel_input_ids = unlabel_input_ids.to(device)
            unlabel_attention_mask = unlabel_attention_mask.to(device)
            unlabel_labels = unlabel_labels.to(device) # teacher predicted probabilities

            optimizer.zero_grad()
            output2 = model(unlabel_input_ids, attention_mask=unlabel_attention_mask)

            logits2 = output2.logits
            
            teacher_output = teacher(unlabel_input_ids, attention_mask=unlabel_attention_mask)

            loss_fnt = CrossEntropyLossForSoftTarget(T) # T temprature, to smooth probabilities
            soft_loss = loss_fnt(logits2, teacher_output.logits)
  

            soft_loss.backward()
            optimizer.step()
            soft_all_loss.append(soft_loss.item())
            
            if i % 40 == 0 and not i == 0:
                # Calculate elapsed time in minutes.
                elapsed = format_time(time.time() - t0)

                # Report progress.
                print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(i+step, len(unlabel_loader), elapsed))
        average_train_loss = np.sum(train_loss)/len(train)
        average_soft_loss = np.sum(soft_all_loss)/len(unlabel)
        print('Train labeled Loss: {:.4f}'.format(average_train_loss))
        print('Soft labeled Loss: {:.4f}'.format(average_soft_loss))
    
        sum_loss.append(average_train_loss + average_soft_loss)  
        print('Total Loss: {:.4f}'.format(sum_loss[epoch-1]))

#       evaluation part 
        model.eval()
        with torch.no_grad():
            predictions = []
            true_labels = []
            for input_ids, attention_mask, labels in valloader:
                input_ids = input_ids.to(device)
                attention_mask = attention_mask.to(device)
                labels = labels.to(device)
                output = model(input_ids, attention_mask=attention_mask)
                predictions.append(output.logits.clone().detach())
                true_labels.append(labels.clone().detach())
            predictions = torch.cat(predictions)
            true_labels = torch.cat(true_labels)
            predictions = predictions.cpu()
            true_labels = true_labels.cpu()

            # val_f1 is weighted f1 
            acc, precision, recall, f1_macro, val_f1  = compute_accuracy(predictions, true_labels)
            print("validation performance at epoch: ", epoch, acc, precision, recall, f1_macro, val_f1)
            
            
            best_f1 = max(val_f1_average, default=-1)
            best_model_state = ''
            # Save the best model seen so far
            if val_f1 > best_f1:
                best_f1 = val_f1
                torch.save({
                            'epoch': epoch,
                            'model_state_dict': model.state_dict(),
                            'optimizer_state_dict': optimizer.state_dict(),
                            'best_f1': best_f1
                            }, save_model_path)
#                 torch.save(model, save_model_path)
            
            val_f1_average.append(val_f1)
    
        # test
        model.eval()
        testloader = DataLoader(test, shuffle=False, batch_size=batch_size)

        with torch.no_grad():
            predictions = []
            true_labels = []
            pred_prob = []
            for input_ids, attention_mask, labels in testloader:
                input_ids = input_ids.to(device)
                attention_mask = attention_mask.to(device)
                labels = labels.to(device)

                output = model(input_ids, attention_mask=attention_mask)
                logits = output.logits.clone().detach()

                predictions.append(logits)
                true_labels.append(labels.clone().detach())

                softmax = torch.nn.Softmax(dim=1)
                prob_batch = softmax(logits)
                prob_batch = prob_batch.cpu().numpy()
                pred_prob.append(prob_batch)

            predictions = torch.cat(predictions)
            true_labels = torch.cat(true_labels)
            predictions = predictions.cpu()
            true_labels = true_labels.cpu()

            flat_prob = np.concatenate(pred_prob, axis=0)

            pred_labels = np.argmax(flat_prob, axis=1).flatten()

            acc, precision, recall,f1_macro, f1_score  = compute_accuracy(predictions, true_labels)

            print("test model performance at epoch : ", epoch, acc, precision, recall,f1_macro, f1_score)

In [23]:
total_epoch = 3
batch_size = 16
batch_size_unlabel = 16 * 2
T = 1
unlabel_iter = None # use all unlabeled data
model_name = "paper_student_distillv2_errortrial_epoch" + str(total_epoch) + "_batchunlabel" + str(batch_size_unlabel) + "_T" + str(T) + "_unlabeliter" + str(unlabel_iter)


train_distill_student("bertweetcovid_paper.pth", model_name, train, unlabeled_prob, val, test, batch_size, batch_size_unlabel, total_epoch, labels_in_dst, learning_rate, T, unlabel_iter)

Some weights of the model checkpoint at vinai/bertweet-covid19-base-uncased were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.bias', 'roberta.pooler.dense.bias', 'lm_head.decoder.bias', 'roberta.pooler.dense.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/bertweet-covid19-base-unc

Epoch: 0
613
  Batch    40  of     89.    Elapsed: 0:00:50.
  Batch    80  of     89.    Elapsed: 0:01:39.
Train Accuracy: 0.5969
  Batch   129  of    613.    Elapsed: 0:02:29.
  Batch   169  of    613.    Elapsed: 0:03:06.
  Batch   209  of    613.    Elapsed: 0:03:43.
  Batch   249  of    613.    Elapsed: 0:04:21.
  Batch   289  of    613.    Elapsed: 0:04:58.
  Batch   329  of    613.    Elapsed: 0:05:35.
  Batch   369  of    613.    Elapsed: 0:06:12.
  Batch   409  of    613.    Elapsed: 0:06:50.
  Batch   449  of    613.    Elapsed: 0:07:27.
  Batch   489  of    613.    Elapsed: 0:08:05.
  Batch   529  of    613.    Elapsed: 0:08:42.
  Batch   569  of    613.    Elapsed: 0:09:20.
  Batch   609  of    613.    Elapsed: 0:09:58.
Train labeled Loss: 0.0711
Soft labeled Loss: 0.0198
Total Loss: 0.0910
validation performance at epoch:  0 tensor(0.8407) 0.8033248252506547 0.8076860391831243 0.8000721629782517 0.8374656995772355
test model performance at epoch :  0 tensor(0.7805) 0.748779

In [25]:
total_epoch = 5
batch_size = 16
batch_size_unlabel = 16 * 2
T = 1 # temperature
unlabel_iter = None
model_name = "paper_student_distillv2_errortrial_epoch" + str(total_epoch) + "_batchunlabel" + str(batch_size_unlabel) + "_T" + str(T) + "_unlabeliter" + str(unlabel_iter)


for i in range(5):
    train_distill_student("bertweetcovid_paper.pth", str(i)+model_name, train, unlabeled_prob, val, test, batch_size, batch_size_unlabel, total_epoch, labels_in_dst, learning_rate, T, unlabel_iter)

Some weights of the model checkpoint at vinai/bertweet-covid19-base-uncased were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.bias', 'roberta.pooler.dense.bias', 'lm_head.decoder.bias', 'roberta.pooler.dense.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/bertweet-covid19-base-unc

Epoch: 0
613
  Batch    40  of     89.    Elapsed: 0:00:51.
  Batch    80  of     89.    Elapsed: 0:01:40.
Train Accuracy: 0.5590
  Batch   129  of    613.    Elapsed: 0:02:30.
  Batch   169  of    613.    Elapsed: 0:03:07.
  Batch   209  of    613.    Elapsed: 0:03:44.
  Batch   249  of    613.    Elapsed: 0:04:22.
  Batch   289  of    613.    Elapsed: 0:05:00.
  Batch   329  of    613.    Elapsed: 0:05:37.
  Batch   369  of    613.    Elapsed: 0:06:15.
  Batch   409  of    613.    Elapsed: 0:06:52.
  Batch   449  of    613.    Elapsed: 0:07:29.
  Batch   489  of    613.    Elapsed: 0:08:06.
  Batch   529  of    613.    Elapsed: 0:08:43.
  Batch   569  of    613.    Elapsed: 0:09:21.
  Batch   609  of    613.    Elapsed: 0:09:58.
Train labeled Loss: 0.0741
Soft labeled Loss: 0.0205
Total Loss: 0.0946
validation performance at epoch:  0 tensor(0.8358) 0.7840564990564991 0.7772699562122968 0.776386993539179 0.832122829400619
test model performance at epoch :  0 tensor(0.7764) 0.74531703

Some weights of the model checkpoint at vinai/bertweet-covid19-base-uncased were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.bias', 'roberta.pooler.dense.bias', 'lm_head.decoder.bias', 'roberta.pooler.dense.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/bertweet-covid19-base-unc

Epoch: 0
613
  Batch    40  of     89.    Elapsed: 0:00:53.
  Batch    80  of     89.    Elapsed: 0:01:43.
Train Accuracy: 0.5969
  Batch   129  of    613.    Elapsed: 0:02:33.
  Batch   169  of    613.    Elapsed: 0:03:10.
  Batch   209  of    613.    Elapsed: 0:03:48.
  Batch   249  of    613.    Elapsed: 0:04:26.
  Batch   289  of    613.    Elapsed: 0:05:04.
  Batch   329  of    613.    Elapsed: 0:05:42.
  Batch   369  of    613.    Elapsed: 0:06:20.
  Batch   409  of    613.    Elapsed: 0:06:58.
  Batch   449  of    613.    Elapsed: 0:07:36.
  Batch   489  of    613.    Elapsed: 0:08:14.
  Batch   529  of    613.    Elapsed: 0:08:52.
  Batch   569  of    613.    Elapsed: 0:09:30.
  Batch   609  of    613.    Elapsed: 0:10:08.
Train labeled Loss: 0.0700
Soft labeled Loss: 0.0196
Total Loss: 0.0896
validation performance at epoch:  0 tensor(0.8456) 0.8153807140951995 0.7987106404267037 0.8000248226950355 0.8425776578361842
test model performance at epoch :  0 tensor(0.7805) 0.790892

Some weights of the model checkpoint at vinai/bertweet-covid19-base-uncased were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.bias', 'roberta.pooler.dense.bias', 'lm_head.decoder.bias', 'roberta.pooler.dense.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/bertweet-covid19-base-unc

Epoch: 0
613
  Batch    40  of     89.    Elapsed: 0:00:53.
  Batch    80  of     89.    Elapsed: 0:01:42.
Train Accuracy: 0.6011
  Batch   129  of    613.    Elapsed: 0:02:32.
  Batch   169  of    613.    Elapsed: 0:03:10.
  Batch   209  of    613.    Elapsed: 0:03:47.
  Batch   249  of    613.    Elapsed: 0:04:25.
  Batch   289  of    613.    Elapsed: 0:05:03.
  Batch   329  of    613.    Elapsed: 0:05:41.
  Batch   369  of    613.    Elapsed: 0:06:19.
  Batch   409  of    613.    Elapsed: 0:06:57.
  Batch   449  of    613.    Elapsed: 0:07:34.
  Batch   489  of    613.    Elapsed: 0:08:12.
  Batch   529  of    613.    Elapsed: 0:08:50.
  Batch   569  of    613.    Elapsed: 0:09:28.
  Batch   609  of    613.    Elapsed: 0:10:05.
Train labeled Loss: 0.0706
Soft labeled Loss: 0.0199
Total Loss: 0.0905
validation performance at epoch:  0 tensor(0.8309) 0.7971451763630534 0.7782560071983478 0.7799081687739078 0.8278067539326468
test model performance at epoch :  0 tensor(0.7642) 0.738229

Some weights of the model checkpoint at vinai/bertweet-covid19-base-uncased were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.bias', 'roberta.pooler.dense.bias', 'lm_head.decoder.bias', 'roberta.pooler.dense.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/bertweet-covid19-base-unc

Epoch: 0
613
  Batch    40  of     89.    Elapsed: 0:00:52.
  Batch    80  of     89.    Elapsed: 0:01:41.
Train Accuracy: 0.6016
  Batch   129  of    613.    Elapsed: 0:02:31.
  Batch   169  of    613.    Elapsed: 0:03:09.
  Batch   209  of    613.    Elapsed: 0:03:46.
  Batch   249  of    613.    Elapsed: 0:04:24.
  Batch   289  of    613.    Elapsed: 0:05:02.
  Batch   329  of    613.    Elapsed: 0:05:40.
  Batch   369  of    613.    Elapsed: 0:06:18.
  Batch   409  of    613.    Elapsed: 0:06:56.
  Batch   449  of    613.    Elapsed: 0:07:34.
  Batch   489  of    613.    Elapsed: 0:08:12.
  Batch   529  of    613.    Elapsed: 0:08:50.
  Batch   569  of    613.    Elapsed: 0:09:28.
  Batch   609  of    613.    Elapsed: 0:10:05.
Train labeled Loss: 0.0699
Soft labeled Loss: 0.0198
Total Loss: 0.0897
validation performance at epoch:  0 tensor(0.8333) 0.800781529420368 0.7914113703537109 0.7882765590167627 0.8294675518497188
test model performance at epoch :  0 tensor(0.7927) 0.7792451

Some weights of the model checkpoint at vinai/bertweet-covid19-base-uncased were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.bias', 'roberta.pooler.dense.bias', 'lm_head.decoder.bias', 'roberta.pooler.dense.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/bertweet-covid19-base-unc

Epoch: 0
613
  Batch    40  of     89.    Elapsed: 0:00:52.
  Batch    80  of     89.    Elapsed: 0:01:42.
Train Accuracy: 0.5890
  Batch   129  of    613.    Elapsed: 0:02:31.
  Batch   169  of    613.    Elapsed: 0:03:09.
  Batch   209  of    613.    Elapsed: 0:03:46.
  Batch   249  of    613.    Elapsed: 0:04:24.
  Batch   289  of    613.    Elapsed: 0:05:02.
  Batch   329  of    613.    Elapsed: 0:05:39.
  Batch   369  of    613.    Elapsed: 0:06:17.
  Batch   409  of    613.    Elapsed: 0:06:55.
  Batch   449  of    613.    Elapsed: 0:07:33.
  Batch   489  of    613.    Elapsed: 0:08:11.
  Batch   529  of    613.    Elapsed: 0:08:48.
  Batch   569  of    613.    Elapsed: 0:09:26.
  Batch   609  of    613.    Elapsed: 0:10:04.
Train labeled Loss: 0.0714
Soft labeled Loss: 0.0198
Total Loss: 0.0912
validation performance at epoch:  0 tensor(0.8358) 0.7952876761191369 0.7881619391221115 0.7874453255653717 0.8325861267829522
test model performance at epoch :  0 tensor(0.7886) 0.755603

In [37]:
_, _ = test_ST("student_4paper_student_distillv2_errortrial_epoch3_batchunlabel32_T10_unlabeliterNone", test,labels_in_dst, batch_size)

Some weights of the model checkpoint at vinai/bertweet-covid19-base-uncased were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.bias', 'roberta.pooler.dense.bias', 'lm_head.decoder.bias', 'roberta.pooler.dense.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/bertweet-covid19-base-unc

test performance:  tensor(0.7805) 0.758503526150585 0.722299196431732 0.7336381500446072 0.7774962922731367


In [26]:
total_epoch = 3
batch_size = 16
batch_size_unlabel = 16 * 2
T = 10 # temperature
unlabel_iter = None
model_name = "paper_student_distillv2_errortrial_epoch" + str(total_epoch) + "_batchunlabel" + str(batch_size_unlabel) + "_T" + str(T) + "_unlabeliter" + str(unlabel_iter)

for i in range(5):
    train_distill_student("bertweetcovid_paper.pth", str(i)+model_name, train, unlabeled_prob, val, test, batch_size, batch_size_unlabel, total_epoch, labels_in_dst, learning_rate, T, unlabel_iter)

Some weights of the model checkpoint at vinai/bertweet-covid19-base-uncased were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.bias', 'roberta.pooler.dense.bias', 'lm_head.decoder.bias', 'roberta.pooler.dense.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/bertweet-covid19-base-unc

Epoch: 0
613
  Batch    40  of     89.    Elapsed: 0:00:53.
  Batch    80  of     89.    Elapsed: 0:01:43.
Train Accuracy: 0.5941
  Batch   129  of    613.    Elapsed: 0:02:33.
  Batch   169  of    613.    Elapsed: 0:03:11.
  Batch   209  of    613.    Elapsed: 0:03:49.
  Batch   249  of    613.    Elapsed: 0:04:26.
  Batch   289  of    613.    Elapsed: 0:05:04.
  Batch   329  of    613.    Elapsed: 0:05:42.
  Batch   369  of    613.    Elapsed: 0:06:19.
  Batch   409  of    613.    Elapsed: 0:06:57.
  Batch   449  of    613.    Elapsed: 0:07:35.
  Batch   489  of    613.    Elapsed: 0:08:13.
  Batch   529  of    613.    Elapsed: 0:08:50.
  Batch   569  of    613.    Elapsed: 0:09:28.
  Batch   609  of    613.    Elapsed: 0:10:06.
Train labeled Loss: 0.0700
Soft labeled Loss: 4.9743
Total Loss: 5.0443
validation performance at epoch:  0 tensor(0.8407) 0.8157270497847747 0.8028947284274771 0.8042081154809088 0.8375303880774996
test model performance at epoch :  0 tensor(0.7724) 0.789001

Some weights of the model checkpoint at vinai/bertweet-covid19-base-uncased were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.bias', 'roberta.pooler.dense.bias', 'lm_head.decoder.bias', 'roberta.pooler.dense.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/bertweet-covid19-base-unc

Epoch: 0
613
  Batch    40  of     89.    Elapsed: 0:00:52.
  Batch    80  of     89.    Elapsed: 0:01:41.
Train Accuracy: 0.5911
  Batch   129  of    613.    Elapsed: 0:02:31.
  Batch   169  of    613.    Elapsed: 0:03:09.
  Batch   209  of    613.    Elapsed: 0:03:47.
  Batch   249  of    613.    Elapsed: 0:04:25.
  Batch   289  of    613.    Elapsed: 0:05:02.
  Batch   329  of    613.    Elapsed: 0:05:40.
  Batch   369  of    613.    Elapsed: 0:06:18.
  Batch   409  of    613.    Elapsed: 0:06:55.
  Batch   449  of    613.    Elapsed: 0:07:33.
  Batch   489  of    613.    Elapsed: 0:08:11.
  Batch   529  of    613.    Elapsed: 0:08:49.
  Batch   569  of    613.    Elapsed: 0:09:27.
  Batch   609  of    613.    Elapsed: 0:10:04.
Train labeled Loss: 0.0694
Soft labeled Loss: 4.9747
Total Loss: 5.0441
validation performance at epoch:  0 tensor(0.8333) 0.797063492063492 0.7639149954120806 0.7746705544908812 0.8294530187127204
test model performance at epoch :  0 tensor(0.7724) 0.7402760

Some weights of the model checkpoint at vinai/bertweet-covid19-base-uncased were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.bias', 'roberta.pooler.dense.bias', 'lm_head.decoder.bias', 'roberta.pooler.dense.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/bertweet-covid19-base-unc

Epoch: 0
613
  Batch    40  of     89.    Elapsed: 0:00:52.
  Batch    80  of     89.    Elapsed: 0:01:42.
Train Accuracy: 0.5962
  Batch   129  of    613.    Elapsed: 0:02:31.
  Batch   169  of    613.    Elapsed: 0:03:09.
  Batch   209  of    613.    Elapsed: 0:03:47.
  Batch   249  of    613.    Elapsed: 0:04:25.
  Batch   289  of    613.    Elapsed: 0:05:02.
  Batch   329  of    613.    Elapsed: 0:05:40.
  Batch   369  of    613.    Elapsed: 0:06:18.
  Batch   409  of    613.    Elapsed: 0:06:56.
  Batch   449  of    613.    Elapsed: 0:07:33.
  Batch   489  of    613.    Elapsed: 0:08:11.
  Batch   529  of    613.    Elapsed: 0:08:49.
  Batch   569  of    613.    Elapsed: 0:09:27.
  Batch   609  of    613.    Elapsed: 0:10:05.
Train labeled Loss: 0.0697
Soft labeled Loss: 4.9746
Total Loss: 5.0443
validation performance at epoch:  0 tensor(0.8358) 0.8048113144155135 0.805001475424735 0.7991765106091255 0.8328191302367517
test model performance at epoch :  0 tensor(0.7764) 0.7704986

Some weights of the model checkpoint at vinai/bertweet-covid19-base-uncased were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.bias', 'roberta.pooler.dense.bias', 'lm_head.decoder.bias', 'roberta.pooler.dense.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/bertweet-covid19-base-unc

Epoch: 0
613
  Batch    40  of     89.    Elapsed: 0:00:52.
  Batch    80  of     89.    Elapsed: 0:01:42.
Train Accuracy: 0.5627
  Batch   129  of    613.    Elapsed: 0:02:31.
  Batch   169  of    613.    Elapsed: 0:03:09.
  Batch   209  of    613.    Elapsed: 0:03:47.
  Batch   249  of    613.    Elapsed: 0:04:25.
  Batch   289  of    613.    Elapsed: 0:05:03.
  Batch   329  of    613.    Elapsed: 0:05:41.
  Batch   369  of    613.    Elapsed: 0:06:18.
  Batch   409  of    613.    Elapsed: 0:06:56.
  Batch   449  of    613.    Elapsed: 0:07:34.
  Batch   489  of    613.    Elapsed: 0:08:12.
  Batch   529  of    613.    Elapsed: 0:08:50.
  Batch   569  of    613.    Elapsed: 0:09:28.
  Batch   609  of    613.    Elapsed: 0:10:05.
Train labeled Loss: 0.0721
Soft labeled Loss: 4.9748
Total Loss: 5.0468
validation performance at epoch:  0 tensor(0.8382) 0.8076193661411052 0.8062261851685257 0.8006667686970717 0.8348045467549329
test model performance at epoch :  0 tensor(0.7846) 0.763883

Some weights of the model checkpoint at vinai/bertweet-covid19-base-uncased were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.bias', 'roberta.pooler.dense.bias', 'lm_head.decoder.bias', 'roberta.pooler.dense.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/bertweet-covid19-base-unc

Epoch: 0
613
  Batch    40  of     89.    Elapsed: 0:00:52.
  Batch    80  of     89.    Elapsed: 0:01:41.
Train Accuracy: 0.5658
  Batch   129  of    613.    Elapsed: 0:02:31.
  Batch   169  of    613.    Elapsed: 0:03:09.
  Batch   209  of    613.    Elapsed: 0:03:46.
  Batch   249  of    613.    Elapsed: 0:04:24.
  Batch   289  of    613.    Elapsed: 0:05:02.
  Batch   329  of    613.    Elapsed: 0:05:40.
  Batch   369  of    613.    Elapsed: 0:06:17.
  Batch   409  of    613.    Elapsed: 0:06:55.
  Batch   449  of    613.    Elapsed: 0:07:33.
  Batch   489  of    613.    Elapsed: 0:08:11.
  Batch   529  of    613.    Elapsed: 0:08:49.
  Batch   569  of    613.    Elapsed: 0:09:27.
  Batch   609  of    613.    Elapsed: 0:10:04.
Train labeled Loss: 0.0702
Soft labeled Loss: 4.9748
Total Loss: 5.0450
validation performance at epoch:  0 tensor(0.8407) 0.8025035035998821 0.7900429766495508 0.791652760797733 0.8378400048753742
test model performance at epoch :  0 tensor(0.7805) 0.7585035