In [None]:
## Mount Drive into Colab
# Makes the user's Google Drive available under /content/drive in the Colab runtime.
# NOTE(review): presumably the data/result folders configured further down live on this
# mount — confirm the paths before running.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# this notebook has code from https://github.com/DhavalTaunk08/NLP_scripts/blob/master/sentiment_analysis_using_roberta.ipynb
!pip install transformers==3.0.2
!pip install sklearn
import torch
import pandas as pd
import numpy as np
from sklearn.metrics import precision_recall_fscore_support, accuracy_score, classification_report, confusion_matrix, brier_score_loss
import csv
import random
from random import shuffle
import seaborn as sns
import transformers
import json
from tqdm import tqdm
from torch.nn import functional as F
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaModel, RobertaTokenizer
import logging
logging.basicConfig(level=logging.ERROR)

In [None]:
# Choose the compute device: the GPU when CUDA is available, the CPU otherwise.
from torch import cuda
if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [None]:
# Defining some key variables that will be used later on in the training
MAX_LEN = 256  # max_length handed to the tokenizer for every text (see SentimentData)
TRAIN_BATCH_SIZE = 8  # batch size of the training DataLoader
VALID_BATCH_SIZE = 4  # batch size of the evaluation DataLoaders
LEARNING_RATE = 1e-05  # learning rate passed to torch.optim.Adam
EPOCHS = 3  # number of passes over the training set each time a model is fitted
# NOTE(review): `truncation` and `do_lower_case` are passed to from_pretrained here; whether
# the RoBERTa tokenizer acts on them is not visible in this notebook — confirm.
tokenizer = RobertaTokenizer.from_pretrained('roberta-base', truncation=True, do_lower_case=True)
# Fix the torch RNG seed so runs are repeatable.
torch.manual_seed(3)

In [None]:
# Seed Python's `random` module (used for the initial sample and for random sampling).
random.seed(41)

# define parameters

# Location and file names of the train / validation / test CSV files.
data_folder = '.../usAirline/data/'
dataToTrain = 'usAirline_train.csv'
dataToVal = 'usAirline_val.csv'
dataToTest ='usAirline_test.csv'

# Feature file names; they are passed to Data(...) which does not read them.
train_feat = 'X_train.npy'
val_feat = 'X_val.npy'
test_feat = 'X_test.npy'

# Column names: text column, gold-label column and per-item ID column.
txt = 'text'
goldLabel = 'airline_sentiment'
iID = 'itemID'
dfColumns = [txt, goldLabel]

# AL parameters
al_strategies           = ['uncertainty']                                  # sampling strategies to run, one full AL run per entry
minimum_training_items  = 3                                                # minimum number of training items before we first train a model
alBatchNum              = 88                                                # define the total number of batches in active learning pipeline
alBatchSize             = 100                                               # define the size of one batch in active learning pipeline

# AL batches after which the test-set class probabilities are dumped to CSV.
controlList2 = [25, 50, 88]
#cost-based parameters
# Value assigned by calculate_value to a rejected (Vr), correct (Vc) and wrong (Vw) prediction.
Vr = 0.0
Vc = 1.0
Vw_list = [0, -0.1, -0.2, -0.3, -0.4, -0.5, -0.6, -0.7, -0.8, -0.9, -1.0, -2.0, -4.0, -8.0, -10.0, -100.0]

# Candidate confidence thresholds searched by find_optimum_confidence_threshold (0 to 1 in 0.01 steps).
confT_list = list(np.arange(0, 1.01, 0.01))

# Names used to build result file names.
modelName = 'roberta'
datasetName = 'usAirline'

res_path = '.../AL/res/usAirline/roberta/' # specify the path to keep results
logfile_name = "{}_{}_rnd41_".format(datasetName,modelName)   

In [None]:
class SentimentData(Dataset):
    """Torch dataset that tokenizes one text per item and pairs it with its label.

    Args:
        dataframe: pandas DataFrame holding at least the text and label columns.
        tokenizer: object exposing ``encode_plus`` (a Hugging Face tokenizer).
        max_len: sequence length every text is padded / truncated to.
        txt: name of the text column in ``dataframe``.
        goldLabel: name of the label column in ``dataframe``.
    """

    def __init__(self, dataframe, tokenizer, max_len, txt, goldLabel):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = self.data[txt]
        self.targets = self.data[goldLabel]
        self.max_len = max_len

    def __len__(self):
        """Number of items (rows) in the dataset."""
        return len(self.text)

    def __getitem__(self, index):
        """Return the encoded text and label of the row at position ``index``.

        Returns:
            dict with long tensors ``ids``, ``mask`` and ``token_type_ids`` and a
            float tensor ``targets``.
        """
        # Positional access: a DataLoader hands out positions 0..len-1, which only
        # coincide with index labels when the frame has a default RangeIndex.
        text = str(self.text.iloc[index])
        # Collapse runs of whitespace into single spaces.
        text = " ".join(text.split())

        # Explicit padding/truncation replaces the deprecated `pad_to_max_length`
        # flag and the implicit truncation it relied on.
        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets.iloc[index], dtype=torch.float)
        }

# Keyword arguments for the DataLoaders: neither loader shuffles and both load
# batches in the main process (num_workers=0); only the batch size differs.
train_params = dict(batch_size=TRAIN_BATCH_SIZE, shuffle=False, num_workers=0)

test_params = dict(batch_size=VALID_BATCH_SIZE, shuffle=False, num_workers=0)

In [None]:
class Data():
    """One CSV split (train / val / test) with item IDs, texts and gold labels.

    Attributes set by ``__init__``:
        data: the full DataFrame, with an added ``itemID`` column (0..n-1).
        poolDataEmb: numpy array of the texts (column ``txt``).
        poolGoldLabels: numpy array of the gold labels (column ``goldLabel``).
        mClass: list of the distinct gold labels.
    """

    def __init__(self, filename, setN, feat):
        """Read ``filename`` and build the pool arrays.

        ``setN`` and ``feat`` are accepted for interface compatibility but are not used.
        The column names come from the module-level ``txt`` and ``goldLabel``.
        """
        # each dataset will have a pool of data, together with their IDs and gold labels
        self.poolData = np.array([])
        self.poolGoldLabels = np.array([])

        # NOTE: rows with missing values are kept (no dropna is applied).
        dt = pd.read_csv(filename)
        dt = dt.reset_index(drop=True)
        dt['itemID'] = np.arange(dt.shape[0])
        y = dt[goldLabel].values
        X = np.array(dt[txt].tolist())

        self.data = dt
        self.poolDataEmb = X
        self.poolGoldLabels = y
        self.mClass = list(set(self.poolGoldLabels.tolist()))

    def setStartState(self, nStart):
        ''' This function creates the initial training set which contains an equal number of samples per class
        Input:
        nStart -- number of labelled datapoints (size of training set); each class
                  contributes nStart // number_of_classes randomly drawn items
        Sets:
        self.labeledSet   -- the sampled rows (index reset)
        self.unlabeledSet -- all remaining rows (index reset)
        Raises:
        ValueError (from random.sample) if a class has fewer items than requested
        '''
        self.nStart = nStart
        data = self.data.copy()
        # Use this object's own class list rather than a module-level global, so the
        # method works regardless of what other cells have defined.
        per_class = nStart // len(self.mClass)
        # get predefined points so that all classes are represented and initial classifier could be trained.
        sampledIndices = []
        for cls in self.mClass:
            indices = np.where(self.poolGoldLabels == cls)[0].tolist()
            sampledIndices += random.sample(indices, per_class)

        self.labeledSet = data.iloc[sampledIndices].reset_index(drop=True)
        self.unlabeledSet = data.drop(sampledIndices).reset_index(drop=True)

def cost_based_threshold(k):
    """Return the confidence threshold ``k / (k + 1)`` for the cost ratio ``k``."""
    return k / (k + 1)

def calculate_value(y_hat_proba, y, t, Vr, Vc, Vw):
    """Average value of a reject-option classifier at confidence threshold ``t``.

    A row is predicted as its most probable class when that probability is
    strictly greater than ``t``; otherwise the row is rejected.

    Args:
        y_hat_proba: per-item class probabilities, one row per item.
        y: gold labels, one per item.
        t: confidence threshold.
        Vr: value of a rejected item.
        Vc: value of a correct prediction.
        Vw: value of a wrong prediction.

    Returns:
        (value, numOfRejectedSamples, numOfWrongPredictions) where ``value`` is the
        sum of per-item values divided by ``len(y)``.
    """
    # float64 keeps the comparison with `t` exact for lower-precision inputs.
    probs = np.asarray(y_hat_proba, dtype=np.float64)
    if probs.ndim == 1:
        # A flat input is treated as one single-class row per element.
        probs = probs.reshape(-1, 1)
    y = np.asarray(y)

    # argmax returns the first maximum; -1 marks a rejected item.
    y_pred = np.where(probs.max(axis=1) > t, probs.argmax(axis=1), -1)

    rejected = y_pred == -1
    wrong = (y_pred != y) & ~rejected

    # Explicit float dtype: with an integer Vc the array would be integer-typed
    # and fractional Vw / Vr values would be silently truncated.
    value_vector = np.full(y_pred.shape[0], Vc, dtype=np.float64)
    value_vector[wrong] = Vw
    # loss due to asking humans
    value_vector[rejected] = Vr
    value = np.sum(value_vector) / len(y)

    numOfRejectedSamples = np.count_nonzero(rejected)
    numOfWrongPredictions = np.count_nonzero(wrong)
    return value, numOfRejectedSamples, numOfWrongPredictions

def find_optimum_confidence_threshold(y_hat_proba, y, t_list, Vr, Vc, Vw):
    """Search ``t_list`` for the threshold that maximises ``calculate_value``.

    Returns:
        (optimumT, cost_list): ``cost_list`` maps each threshold, formatted as a
        string, to its value; ``optimumT`` is the smallest threshold that reaches
        the maximum value.
    """
    cost_list = {
        "{}".format(t): calculate_value(y_hat_proba, y, t, Vr, Vc, Vw)[0]
        for t in t_list
    }
    best_value = max(cost_list.values())
    # Several thresholds can tie for the best value: keep the lowest confidence.
    optimumT = min(float(key) for key, val in cost_list.items() if val == best_value)

    return optimumT, cost_list

def uncertainty_sampling(distances, number):
    """Select the ``number`` items with the smallest margin (highest uncertainty).

    For a 1-D input the margin is the absolute value of each entry. For a 2-D
    input it is the difference between the two largest values of each row.

    Returns:
        indices of the selected items, smallest margin first
    """
    if len(distances.shape) >= 2:
        top_two = np.sort(distances, 1)[:, -2:]
        margin = top_two[:, 1] - top_two[:, 0]
    else:
        margin = abs(distances)
    return np.argsort(margin)[0:number]

def certainty_sampling(distances, number):
    """Select the ``number`` items with the largest margin (lowest uncertainty).

    For a 1-D input the margin is the absolute value of each entry. For a 2-D
    input it is the difference between the two largest values of each row.

    Returns:
        indices of the selected items, largest margin first
    """
    if len(distances.shape) >= 2:
        top_two = np.sort(distances, 1)[:, -2:]
        margin = top_two[:, 1] - top_two[:, 0]
    else:
        margin = abs(distances)
    descending = np.argsort(margin)[::-1]
    return descending[0:number]

def random_sampling(dataIds, nQuery):
    """Draw ``nQuery`` distinct entries of ``dataIds`` uniformly at random.

    Positions are drawn with Python's ``random`` module and used to index
    ``dataIds``, which must therefore support indexing with a list of positions.
    """
    positions = random.sample(range(len(dataIds)), nQuery)
    return dataIds[positions]

def threshold_oriented_sampling(probs, number, t):
    """Select the ``number`` items whose top class probability is closest to ``t``.

    Returns:
        indices of the selected items, closest to the threshold first
    """
    gap_to_threshold = np.array(list(map(lambda row: abs(np.amax(row) - t), probs)))
    ranking = np.argsort(gap_to_threshold)
    return ranking[0:number]

def tos_below(probs, number, t):
    """Threshold-oriented sampling restricted to items at or below the threshold.

    Among the items whose top class probability is not greater than ``t``,
    select the ``number`` items whose top probability is closest to ``t``.

    Returns:
        indices of the selected items, closest to the threshold first
    """
    top_conf = np.asarray([np.amax(row) for row in probs], dtype=np.float64)
    ranked = np.argsort(np.abs(top_conf - t), kind='stable')
    # Filter with a mask so the closest-first order is kept; np.setdiff1d would
    # return the surviving indices sorted by index value and lose the ranking.
    ranked_not_accepted = ranked[~(top_conf[ranked] > t)]
    return ranked_not_accepted[0:number]

def tos_above(probs, number, t):
    """Threshold-oriented sampling restricted to items above the threshold.

    Among the items whose top class probability is greater than ``t``, select
    the ``number`` items whose top probability is closest to ``t``.

    Returns:
        indices of the selected items, closest to the threshold first
    """
    top_conf = np.asarray([np.amax(row) for row in probs], dtype=np.float64)
    ranked = np.argsort(np.abs(top_conf - t), kind='stable')
    # Filter with a mask so the closest-first order is kept; np.setdiff1d would
    # return the surviving indices sorted by index value and lose the ranking.
    ranked_not_rejected = ranked[~(top_conf[ranked] <= t)]
    return ranked_not_rejected[0:number]

In [None]:
#load datasets
# Training pool: read the CSV, publish its class list and draw the initial labelled set.
pool = Data(data_folder + dataToTrain, 'train', train_feat)
# Module-level class list; RobertaClass reads it to size its output layer.
mClass =  pool.mClass
pool.setStartState(minimum_training_items)

# Validation and test splits, plus their gold labels as numpy arrays.
validation = Data(data_folder + dataToVal, 'val', val_feat)
validation_data = validation.data
y_val = np.array(validation_data[goldLabel].tolist())
test = Data(data_folder + dataToTest, 'test', test_feat)
test_data = test.data
y_test = np.array(test_data[goldLabel].tolist())

# The CSV files are read a second time here, keeping only the text and label
# columns, to build the tokenizing datasets and their loaders.
trainD = pd.read_csv(data_folder + dataToTrain)
trainD = trainD[[txt, goldLabel]]
valD = pd.read_csv(data_folder + dataToVal)
valD = valD[[txt, goldLabel]]
validation_set = SentimentData(valD, tokenizer, MAX_LEN, txt, goldLabel)
validation_loader = DataLoader(validation_set, **test_params)

testD = pd.read_csv(data_folder + dataToTest)
testD = testD[[txt, goldLabel]]
testing_set = SentimentData(testD, tokenizer, MAX_LEN, txt, goldLabel)
testing_loader = DataLoader(testing_set, **test_params)

In [None]:
class RobertaClass(torch.nn.Module):
    """Pretrained ``roberta-base`` encoder followed by a two-layer classification head.

    The output layer has one unit per entry of the module-level ``mClass`` list.
    """

    def __init__(self):
        super().__init__()
        self.l1 = RobertaModel.from_pretrained("roberta-base")
        # The torch seed is reset before each head layer is created, so the head
        # starts from the same random weights every time the class is instantiated.
        torch.manual_seed(3)
        self.pre_classifier = torch.nn.Linear(768, 768)
        torch.manual_seed(3)
        self.classifier = torch.nn.Linear(768, len(mClass))

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Return the classifier outputs (one score per class) for a batch."""
        encoder_out = self.l1(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
        # First element of the encoder output, taken at sequence position 0.
        first_token = encoder_out[0][:, 0]
        hidden = torch.nn.functional.relu(self.pre_classifier(first_token))
        # No dropout is applied between the two head layers.
        return self.classifier(hidden)

def calcuate_accuracy(preds, targets):
    """Return the number of positions where ``preds`` equals ``targets`` (a count, not a ratio)."""
    return (preds == targets).sum().item()

In [None]:
# Loading the model
# Instantiate the RoBERTa classifier and move it to the selected device.
model = RobertaClass()
model.to(device)

# Creating the loss function and optimizer
# train() reads `loss_function` as a global; Adam updates all model parameters
# with the configured LEARNING_RATE.
loss_function = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

In [None]:
# Training function: one pass over `train_loader`, fine-tuning `model` in place.
def train(epoch, model, train_loader, optim):
    """Train ``model`` for one epoch and print running / final loss and accuracy.

    Args:
        epoch: epoch number, used only in the printed summary.
        model: module called as ``model(ids, mask, token_type_ids)``.
        train_loader: iterable of batches with keys ``ids``, ``mask``,
            ``token_type_ids`` and ``targets``.
        optim: optimizer stepping the parameters of ``model``.

    Relies on the module-level ``device`` and ``loss_function``. Returns None.
    """
    running_loss = 0
    correct = 0
    steps = 0
    seen = 0
    model.train()
    for step, batch in tqdm(enumerate(train_loader, 0)):
        ids = batch['ids'].to(device, dtype=torch.long)
        mask = batch['mask'].to(device, dtype=torch.long)
        token_type_ids = batch['token_type_ids'].to(device, dtype=torch.long)
        targets = batch['targets'].to(device, dtype=torch.long)

        outputs = model(ids, mask, token_type_ids)
        loss = loss_function(outputs, targets)
        running_loss += loss.item()
        # Predicted class = index of the largest output per row.
        _, predicted = torch.max(outputs.data, dim=1)
        correct += calcuate_accuracy(predicted, targets)

        steps += 1
        seen += targets.size(0)

        # Progress report on the first step and every 5000 steps after it.
        if step % 5000 == 0:
            print(f"Training Loss per 5000 steps: {running_loss / steps}")
            print(f"Training Accuracy per 5000 steps: {(correct * 100) / seen}")

        optim.zero_grad()
        loss.backward()
        optim.step()

    epoch_loss = running_loss / steps
    epoch_accu = (correct * 100) / seen
    print(f'The Total Accuracy for Epoch {epoch}: {epoch_accu}')
    print(f"Training Loss Epoch: {epoch_loss}")
    print(f"Training Accuracy Epoch: {epoch_accu}")

def valid(model, test_loader):
    """Run ``model`` over ``test_loader`` and collect class probabilities and labels.

    Args:
        model: module called as ``model(ids, mask, token_type_ids)``.
        test_loader: iterable of batches with keys ``ids``, ``mask``,
            ``token_type_ids`` and ``targets``.

    Returns:
        (probabilities, labels): the softmax of the model outputs, one row per
        item, and the matching targets as an integer array. Both are empty
        arrays when the loader yields no batches.

    Relies on the module-level ``device``.
    """
    model.eval()
    prob_batches, label_batches = [], []
    # Inference only: no_grad avoids building the autograd graph for every batch.
    with torch.no_grad():
        for _, data in tqdm(enumerate(test_loader, 0)):
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
            targets = data['targets'].to(device, dtype = torch.long)

            # Move the outputs to the CPU before the softmax.
            outputs = model(ids, mask, token_type_ids).detach().cpu()
            if outputs.dim() == 1:
                # A model that drops the batch dimension still yields one row.
                outputs = outputs.unsqueeze(0)
            # Explicit class dimension; the implicit-dim form of softmax is deprecated.
            prob_batches.append(F.softmax(outputs, dim=1).numpy())
            label_batches.append(targets.cpu().numpy())

    if not prob_batches:
        return np.array([]), np.array([])
    # Concatenate once at the end instead of re-copying the arrays every batch.
    return np.concatenate(prob_batches, axis=0), np.concatenate(label_batches, axis=0)

In [None]:
# Active-learning driver: one complete run per sampling strategy.
#
# Batch 0 trains on the initial labelled set; each later batch first queries
# `batchSize` items from the unlabeled pool with the current model, moves them to
# the training set and then trains a fresh model. After every batch the model is
# evaluated on train / validation / test and one CSV row per Vw is appended.
for al_strategy in al_strategies:

    poolData = pool.data.copy()
    # Keep the labelled set ordered by item ID. Texts and labels are both taken from
    # this frame below, so they stay aligned row by row; the initial set used to pair
    # pool-ordered texts with sampling-ordered labels.
    training_data = pool.labeledSet.sort_values(iID).reset_index(drop=True)
    unlabeled_data = pool.unlabeledSet.copy()
    batchSize = alBatchSize

    poolDataEmb_val = validation.poolDataEmb
    poolDataEmb_test = test.poolDataEmb

    #Start active learning
    sampleIds = []
    samplingRanks = []
    samplesDict = {}
    # Item IDs added at each batch; batch 0 holds the initial labelled set.
    samplesDict[0] = pool.labeledSet[iID].tolist()

    # Result file: header first, then one row per (batch, Vw).
    rv_path = res_path + logfile_name + al_strategy + "_value.csv"
    with open(rv_path, 'w') as f:
        c = 'batch, Vr, Vc, Vw, k, t_cal, t_opt_val, t_opt_train, t_opt_test, value_test, rej_test, wrong_test, value_train, rej_train, wrong_train, value_test_opt, rej_test_opt, wrong_test_opt, value_train_opt, rej_train_opt, wrong_train_opt, value_test_opt_test, rej_test_opt_test, wrong_test_opt_test'
        f.write(c + '\n')
    # 24 comma-separated fields, matching the header above.
    row_template = ', '.join(['{}'] * 24) + '\n'

    for alBatch in range(0, alBatchNum + 1, 1):

        if alBatch > 0:
            sampledIndices = []

            # Score the remaining unlabeled items with the model of the previous batch.
            unlabeled_set = SentimentData(unlabeled_data[dfColumns].reset_index(drop=True), tokenizer, MAX_LEN, txt, goldLabel)
            unlabeled_loader = DataLoader(unlabeled_set, **test_params)
            probs_unlabeled, labels_unlabeled = valid(model, unlabeled_loader)

            # The last batch takes everything that is left in the pool.
            if alBatch == alBatchNum:
                batchSize = len(unlabeled_data[iID].values)
                print("alBatchSize changed to: ", batchSize)

            if al_strategy == 'uncertainty':
                idx = uncertainty_sampling(probs_unlabeled, batchSize)
                sampledIndices = unlabeled_data.iloc[idx][iID].tolist()
            elif al_strategy == 'certainty':
                idx = certainty_sampling(probs_unlabeled, batchSize)
                sampledIndices = unlabeled_data.iloc[idx][iID].tolist()
            else:
                # 'random' and any unrecognised strategy name fall back to random sampling
                sampledIndices = random_sampling(unlabeled_data[iID].values, batchSize)
            sampleIds.extend(sampledIndices)

            sampledSet = poolData.loc[poolData[iID].isin(sampledIndices)]
            samplesDict[alBatch] = sampledIndices

            # Move the queried items from the unlabeled pool to the training set.
            training_data = pd.concat([training_data, sampledSet], axis=0).sort_values(iID).reset_index(drop=True)
            unlabeled_data = unlabeled_data.loc[~unlabeled_data[iID].isin(sampledIndices)].reset_index(drop=True)

        training_set = SentimentData(training_data[dfColumns].reset_index(drop=True), tokenizer, MAX_LEN, txt, goldLabel)
        training_loader = DataLoader(training_set, **train_params)

        # Every batch (including batch 0 of every strategy) trains a fresh model, so
        # no weights carry over from a previous batch or a previous strategy.
        model = RobertaClass()
        model.to(device)
        optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

        #fine-tune the model
        for epoch in range(EPOCHS):
            train(epoch, model, training_loader, optimizer)

        # NOTE: despite the `logits_` names these hold softmax probabilities (see valid()).
        trainingTest_loader = DataLoader(training_set, **test_params)
        logits_train, train_labels = valid(model, trainingTest_loader)
        logits_val, y_val = valid(model, validation_loader)
        logits_test, y_test = valid(model, testing_loader)

        # Dump the test-set probabilities at the configured checkpoints.
        if alBatch in controlList2:
            col = [str(i) for i in range(logits_test.shape[1])]
            df_lgt = pd.DataFrame(logits_test, columns = col)
            df_lgt['y'] = y_test
            df_lgt.to_csv(res_path + logfile_name + al_strategy + '_b_{}_logits.csv'.format(alBatch))

        with open(rv_path, 'a') as f:
            for Vw in Vw_list:
                # Threshold derived from the cost ratio k = -Vw / Vc.
                k = (-1)*(Vw / Vc)
                t = cost_based_threshold(k)

                value_test, rej_test, wrong_test = calculate_value(logits_test, y_test, t, Vr, Vc, Vw)
                value_train, rej_train, wrong_train = calculate_value(logits_train, train_labels, t, Vr, Vc, Vw)

                # Threshold tuned on the validation set, applied to test and train.
                t_opt, cost_list = find_optimum_confidence_threshold(logits_val, y_val, confT_list, Vr, Vc, Vw)
                value_test_opt, rej_test_opt, wrong_test_opt = calculate_value(logits_test, y_test, t_opt, Vr, Vc, Vw)
                value_train_opt, rej_train_opt, wrong_train_opt  = calculate_value(logits_train, train_labels, t_opt, Vr, Vc, Vw)

                # Thresholds tuned directly on train and on test, for reference.
                t_opt_train, cost_list_ = find_optimum_confidence_threshold(logits_train, train_labels, confT_list, Vr, Vc, Vw)
                t_opt_test, cost_list_ = find_optimum_confidence_threshold(logits_test, y_test, confT_list, Vr, Vc, Vw)
                value_test_opt_test, rej_test_opt_test, wrong_test_opt_test = calculate_value(logits_test, y_test, t_opt_test, Vr, Vc, Vw)

                res_i = row_template.format(alBatch, Vr, Vc, Vw, k, t, t_opt, t_opt_train, t_opt_test, value_test, rej_test, wrong_test, value_train, rej_train, wrong_train, value_test_opt, rej_test_opt, wrong_test_opt, value_train_opt, rej_train_opt, wrong_train_opt, value_test_opt_test, rej_test_opt_test, wrong_test_opt_test)
                f.write(res_i)

    # One column per batch listing the item IDs added in that batch.
    samplesDict_df = pd.DataFrame({batch: pd.Series(ids) for batch, ids in samplesDict.items()})
    samplesDict_df.to_csv(res_path + logfile_name + al_strategy + "_sampledItems.csv")

    output_model_file = res_path + logfile_name + al_strategy + '_roberta_sentiment.bin'
    output_vocab_file = './'

    # Saves the whole model object of the final batch, plus the tokenizer vocabulary.
    model_to_save = model
    torch.save(model_to_save, output_model_file)
    tokenizer.save_vocabulary(output_vocab_file)
    print('All files saved')
