In [None]:
!pip install transformers
!pip install sklearn
!pip install netcal

In [None]:
import bisect
import os
import random
from random import shuffle

import numpy as np
import pandas as pd

from netcal.metrics import ECE
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer, CountVectorizer
from sklearn.metrics import precision_recall_fscore_support, brier_score_loss, confusion_matrix
from sklearn.neural_network import MLPClassifier

In [None]:
# Colab only: delete this cell if you do not run the notebook on Google Colab.
# Mounts Google Drive at /content/drive; the result and dataset paths in the
# configuration cell start with 'drive/My Drive/'.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Experiment configuration: set every value in this cell for your dataset, then run the notebook.

# ---- locations ----
# folder in which the result files are written
res_path = 'drive/My Drive/Colab Notebooks/deepALForCalibration/res/withCost/'
# folder that holds the dataset files
data_folder = 'drive/My Drive/Colab Notebooks/deepALForCalibration/datasets/binary/disaster_relevance/'
# file names of the training, validation and test data
dataToTrain = '2_train_indexed_disaster_relevance_binary.csv'
dataToVal = '2_val_indexed_disaster_relevance_binary.csv'
dataToTest = '2_test_indexed_disaster_relevance_binary.csv'
# name used for the result files
logfile_name = "2-disasterRelevance-MLP3"

# ---- active learning ----
# strategies to run, one after the other: 'random', 'uncertainty' and/or 'diversity'
al_strategies = ['random', 'uncertainty', 'diversity']
# minimum number of training items before the first model is trained
minimum_training_items = 157
# total number of batches in the active learning pipeline
alBatchNum = 10
# size of one batch in the active learning pipeline
alBatchSize = 740
# maximum number of TF-IDF features
maxTfIdfFeat = 1024

# ---- costs ----
# cost of a false positive
cfpList = [1]
# cost of a false negative
cfnList = [1, 10, 100, 1000]
# cost of asking a human for the label
chList = [1, 10, 100, 1000]

# ---- column names of the CSV files used in the experiments ----
# column with the ID of each item (used during active learning)
iID = 'itemID'
# column with the gold labels of the data
goldLabel = 'crowd_label'
# column with the text/content of each item
txt = 'text'
# column with the gold labels of the test data
testGoldLabel = 'gold_label'

# ---- model ----
# choose the model parameters that best fit your data
model = MLPClassifier(
    hidden_layer_sizes=(100, 100, 100, 100, 100),
    max_iter=500,
    alpha=0.05,
    activation='tanh',
    solver='sgd',
)

In [None]:
# Empty placeholders for the feature matrices of the three splits;
# poolDataEmb_val and poolDataEmb_test are overwritten later in the notebook.
poolDataEmb_train = np.array([])
poolDataEmb_val = np.array([])
poolDataEmb_test = np.array([])

# Full paths of the three CSV files. os.path.join yields the same string as plain
# concatenation when data_folder ends with '/', and still produces a valid path
# when the trailing separator is missing.
unlabeled_data_dir = os.path.join(data_folder, dataToTrain)
validation_data_dir = os.path.join(data_folder, dataToVal)
test_data_dir = os.path.join(data_folder, dataToTest)

In [None]:
class DiversitySampling():
    """Model-based outlier sampling for active learning.

    Every unlabeled item is scored by where the model's per-class outputs
    (``predict_proba``) fall inside the sorted outputs the same model produces
    on validation data. Items whose outputs rank lowest on average are
    returned as outliers.
    """

    def __init__(self, verbose):
        """Store the verbosity flag; progress messages are printed when it is truthy."""
        self.verbose = verbose

    def get_validation_rankings(self, model, validation_data, val_emb):
        """Return the sorted validation outputs of the model, one list per output column.

        Keyword arguments:
            model -- fitted model exposing ``predict_proba``
            validation_data -- held out data; not read here, kept so existing callers keep working
            val_emb -- feature matrix of the validation data, passed to ``model.predict_proba``

        Returns a list with one entry per output column of ``predict_proba``;
        each entry is the ascending list of that column's values over all
        validation items.
        """
        # Get per-neuron scores from validation data
        if self.verbose:
            print("Getting neuron activation scores from validation data")

        pred = np.asarray(model.predict_proba(val_emb))

        # Transposing turns "one row per item" into "one row per output column";
        # each column is then rank-ordered on its own.
        return [sorted(column.tolist()) for column in pred.T]

    def get_rank(self, value, rankings):
        """Return the position of ``value`` inside ``rankings`` as a fraction of its length.

        Keyword arguments:
            value -- the value for which we want the rank
            rankings -- ascending list of reference values

        Returns 0 when ``value`` is below every reference value, 1.0 when it is
        greater than or equal to the largest one, and otherwise the index of
        the closest reference value not greater than ``value`` plus the linear
        interpolation towards the next one, divided by ``len(rankings)``.
        An empty ``rankings`` raises ZeroDivisionError.
        """
        # first index whose reference value is strictly greater than `value`
        index = bisect.bisect_right(rankings, value)

        if index >= len(rankings):
            index = len(rankings)  # maximum: ranking = 1
        elif index > 0:
            # linear interpolation between the two closest reference values;
            # diff is never 0 here because rankings[index - 1] <= value < rankings[index]
            diff = rankings[index] - rankings[index - 1]
            perc = value - rankings[index - 1]
            linear = perc / diff
            index = float(index - 1) + linear

        absolute_ranking = index / len(rankings)

        return absolute_ranking

    def get_model_outliers(self, dataPool, model, unlabeled_data, unl_emb, validation_data, val_emb, number):
        """Get model outliers from unlabeled data.

        Keyword arguments:
            dataPool -- DataFrame with all pool items; rows are looked up by the ``iID`` column
            model -- current Machine Learning model for this task (must expose ``predict_proba``)
            unlabeled_data -- DataFrame of items without a label, row-aligned with ``unl_emb``
            unl_emb -- feature matrix of ``unlabeled_data``
            validation_data -- held out data, forwarded to ``get_validation_rankings``
            val_emb -- feature matrix of the validation data
            number -- number of items to sample

        The score of an item is ``1 - mean(rank of each model output)``, where
        the ranks come from ``get_rank`` against the validation outputs, so the
        highest scores belong to the items with the lowest average rank.

        Returns at most ``number`` rows, highest score first. Each row is a flat
        list holding the values of the matching ``dataPool`` row followed by the
        score as last element.
        """
        # Get per-neuron scores from validation data
        validation_rankings = self.get_validation_rankings(model, validation_data, val_emb)

        # Iterate over unlabeled items
        if self.verbose:
            print("Getting rankings for unlabeled data")

        outliers = []
        pred = model.predict_proba(unl_emb)

        # IDs are read through the configured ID column name (iID), not a fixed attribute name
        item_ids = unlabeled_data[iID].tolist()

        for item_id, neuron_outputs in zip(item_ids, pred):
            ranks = [
                self.get_rank(output, ranking)
                for output, ranking in zip(neuron_outputs, validation_rankings)
            ]
            avgRank = 1 - (sum(ranks) / len(neuron_outputs))  # average rank
            # assign() returns a new frame, so the score is never written onto a slice of dataPool
            row = dataPool.loc[dataPool[iID] == item_id].assign(avgRank=avgRank)
            outliers.append(row.values.flatten().tolist())

        outliers.sort(reverse=True, key=lambda x: x[-1])
        return outliers[:number]

In [None]:
def random_sampling(dataIds, nQuery):
    '''Randomly sample item IDs without replacement.

    dataIds -- array-like of candidate item IDs
    nQuery  -- number of IDs to draw; when it exceeds the number of candidates,
               every candidate is returned (in random order) instead of raising,
               in line with the other sampling strategies which return the whole
               pool when it is smaller than the requested batch

    Returns a numpy array with the selected IDs.
    '''
    dataIds = np.asarray(dataIds)
    nQuery = min(nQuery, len(dataIds))
    query_idx = random.sample(range(len(dataIds)), nQuery)
    # explicit integer dtype keeps the indexing valid when nothing is drawn
    selectedIndex = dataIds[np.asarray(query_idx, dtype=int)]
    return selectedIndex
        
def uncertainty_sampling(model, unl_emb, number):
    '''Select the items the model is least confident about.

    model   -- fitted model exposing predict_proba
    unl_emb -- feature matrix of the unlabeled items
    number  -- number of items to select

    The uncertainty of an item is 1 minus its highest class probability.
    Returns the row positions (into unl_emb) of the `number` most uncertain
    items, ordered from less to more uncertain. All positions are returned when
    `number` exceeds the number of items, and an empty array when `number` is
    not positive.
    '''
    pred = np.asarray(model.predict_proba(unl_emb))
    uncertainty_scores = 1 - pred.max(axis=1)
    score_indices = np.argsort(uncertainty_scores)
    if number <= 0:
        # a slice starting at -0 would select everything instead of nothing
        return score_indices[:0]
    return score_indices[-number:]

In [None]:
## Feature Preparation
def prepare_features(X_train, min_df=2, max_features=None, ngram_range=(1, 3)):
    """Fit a TF-IDF vectorizer on `X_train` and return its dense feature matrix.

    X_train      -- iterable of raw text documents
    min_df       -- forwarded to TfidfVectorizer(min_df=...)
    max_features -- forwarded to TfidfVectorizer(max_features=...)
    ngram_range  -- forwarded to TfidfVectorizer(ngram_range=...)

    Returns the result of `fit_transform(X_train)` converted to a dense array.

    NOTE(review): a new vectorizer is fitted on every call and then discarded,
    so matrices returned by separate calls are built from separate vocabularies.
    """
    # The three on/off switches are passed as real booleans: scikit-learn's
    # parameter validation no longer accepts integers for boolean parameters
    # in recent releases.
    tfidf = TfidfVectorizer(min_df=min_df, max_features=max_features,
                strip_accents='unicode', analyzer='word', token_pattern=r'\w{1,}',
                ngram_range=ngram_range, use_idf=True, smooth_idf=True, sublinear_tf=True,
                stop_words=None, lowercase=False)

    X_train_tfidf = tfidf.fit_transform(X_train).toarray()
    return X_train_tfidf

class Data():
    """One CSV dataset: the cleaned frame, its TF-IDF features and its labels.

    Attributes set by __init__:
        data           -- DataFrame read from the file, rows with missing values dropped, index reset
        poolDataEmb    -- TF-IDF feature matrix of the `txt` column, row-aligned with `data`
        poolGoldLabels -- values of the `goldLabel` column, row-aligned with `data`
        mClass         -- list of the distinct labels found in `poolGoldLabels`

    Attributes set by setStartState:
        nStart, labeledSet, unlabeledSet

    NOTE(review): every instance fits its own TF-IDF vocabulary through
    prepare_features, so the feature columns of two Data objects (for example
    training and test data) are not guaranteed to describe the same terms.
    """

    def __init__(self, filename):
        """Read `filename` with pandas and build features and labels from it."""
        # each dataset will have a pool of data, together with their IDs and gold labels
        self.poolData = np.array([])
        self.poolGoldLabels = np.array([])

        dt = pd.read_csv(filename)
        dt = dt.dropna()
        dt = dt.reset_index(drop=True)
        y = dt[goldLabel].values
        # min_df=1 keeps exactly the same vocabulary as min_df=0 (every vocabulary
        # term occurs in at least one document) and is a value that scikit-learn's
        # parameter validation accepts.
        X = prepare_features(dt[txt].tolist(), min_df=1, max_features=maxTfIdfFeat, ngram_range=(1, 3))
        self.data = dt
        self.poolDataEmb = X
        self.poolGoldLabels = y
        self.mClass = list(set(self.poolGoldLabels.tolist()))

    def setStartState(self, nStart):
        ''' This function creates the initial training set which contains an equal number of samples per class
        Input:
        nStart -- number of labelled datapoints (size of training set); each class
                  contributes nStart // (number of classes) randomly drawn items

        Sets `labeledSet` (the drawn items) and `unlabeledSet` (all other items).
        Both keep the row order of `data` and get a fresh 0..n-1 index.
        Raises ValueError (from random.sample) when a class has fewer items than requested.
        '''
        self.nStart = nStart
        data = self.data.copy()
        # use this dataset's own class list rather than a notebook-level variable
        per_class = nStart // len(self.mClass)
        # get predefined points so that all classes are represented and initial classifier could be trained.
        sampledIndices = []
        for cls in self.mClass:
            indices = np.flatnonzero(self.poolGoldLabels == cls).tolist()
            sampledIndices.extend(random.sample(indices, per_class))

        # Keep the labelled rows in pool order, so that labels taken from labeledSet
        # line up with features that are selected from the pool by row position.
        sampledIndices.sort()

        self.labeledSet = data.iloc[sampledIndices].reset_index(drop=True)
        self.unlabeledSet = data.drop(sampledIndices).reset_index(drop=True)

# Expected Calibration Error of a set of predictions
def ece_score(y_true, y_prob, n_bins=10):
    """Return the value of netcal's ECE metric, built with `n_bins`, for the given predictions.

    y_true -- ground-truth labels
    y_prob -- predicted probabilities
    n_bins -- passed as the only argument to the ECE constructor
    """
    # note the argument order of measure(): predictions first, labels second
    return ECE(n_bins).measure(y_prob, y_true)

# cost-sensitive decision: label automatically or hand the item to a human
def classify(probs, goldLabels):
    """Turn class probabilities into decisions using the notebook-level costs cfp, cfn and ch.

    probs      -- per-item class probabilities; column 1 is read as the probability of the positive class
    goldLabels -- gold label of every item, used as the answer whenever an item is deferred

    For an item with positive probability p the expected cost of answering
    "positive" is cfp*(1-p) and of answering "negative" is cfn*p. The cheaper
    answer is given automatically only if it is strictly cheaper than the other
    answer and strictly cheaper than ch; otherwise (including exact ties) the
    item is deferred and receives its gold label.

    Returns (y_clf, y_pred): y_clf is a list with 1, 0 or -1 (deferred) per item,
    y_pred is a numpy array with the final label per item.
    """
    decisions = []
    labels = []
    for position, item_probs in enumerate(probs):
        positive_prob = item_probs[1]
        cost_if_positive = cfp * (1 - positive_prob)
        cost_if_negative = cfn * positive_prob

        if cost_if_positive < cost_if_negative and cost_if_positive < ch:
            decision, label = 1, 1
        elif cost_if_negative < cost_if_positive and cost_if_negative < ch:
            decision, label = 0, 0
        else:
            decision, label = -1, goldLabels[position]

        decisions.append(decision)
        labels.append(label)
    return decisions, np.array(labels)

# average cost per item of a set of decisions
def calculateCost(fp, fn, y_clf):
    """Return (cost, uc) for one evaluated split.

    fp    -- number of false positives
    fn    -- number of false negatives
    y_clf -- list of decisions in which -1 marks an item that was handed to a human

    uc is the number of -1 entries in y_clf. cost weights fp, fn and uc with the
    notebook-level costs cfp, cfn and ch and divides the sum by len(y_clf).
    """
    deferred = y_clf.count(-1)
    total = cfp*fp + cfn*fn + ch*deferred
    return total / len(y_clf), deferred

# score the trained model on the training, validation and test splits and log one result row
def evaluate(train_data, train_labels, poolDataEmb_val, validation_data, poolDataEmb_test, test_data, mClass, sampledIndices, res_path, alBatch):
    """Evaluate the notebook-level `model` on three splits and append one row to a result file.

    train_data, train_labels -- features and labels of the current training set
    poolDataEmb_val          -- features of the validation split
    validation_data          -- validation DataFrame; labels are read from its `goldLabel` column
    poolDataEmb_test         -- features of the test split
    test_data                -- test DataFrame; labels are read from its `testGoldLabel` column
    mClass                   -- list of classes; two classes select 'binary' averaging, otherwise 'macro'
    sampledIndices           -- IDs sampled in this iteration, logged as a space separated string
    res_path                 -- path of the result file, opened in append mode
    alBatch                  -- iteration number, printed and logged

    Per split the row holds precision, recall, F0.1, F1, F10 and ECE; for binary
    tasks additionally the Brier score, the average cost and the number of
    deferred items. Values are written in the order train, validation, test.
    The validation F1, precision and recall are also printed.
    """
    def _probabilities(features):
        return np.array(model.predict_proba(features))

    probs_train = _probabilities(train_data)
    y_clf_train, y_pred_train = classify(probs_train, train_labels)

    probs_val = _probabilities(poolDataEmb_val)
    val_labels = np.array(validation_data[goldLabel].tolist())
    y_clf_val, y_pred_val = classify(probs_val, val_labels)

    probs_test = _probabilities(poolDataEmb_test)
    test_labels = np.array(test_data[testGoldLabel].tolist())
    y_clf_test, y_pred_test = classify(probs_test, test_labels)

    # binary or multi class classification
    average = 'binary' if len(mClass) == 2 else 'macro'

    sampledItems = ''.join(str(e)+' ' for e in sampledIndices)

    # (labels, probabilities, final labels, decisions) in logging order: train, validation, test
    splits = [
        (train_labels, probs_train, y_pred_train, y_clf_train),
        (val_labels, probs_val, y_pred_val, y_clf_val),
        (test_labels, probs_test, y_pred_test, y_clf_test),
    ]

    def _prf(labels, y_pred, beta):
        return precision_recall_fscore_support(labels, y_pred, average=average, beta=beta)

    # per split: [precision, recall, F0.1, F1, F10, ECE]
    per_split = []
    for labels, probs, y_pred, _ in splits:
        pre, rec, f1, _ = _prf(labels, y_pred, 1)
        ece = ece_score(labels, probs)
        f01 = _prf(labels, y_pred, 0.1)[2]
        f10 = _prf(labels, y_pred, 10)[2]
        per_split.append([pre, rec, f01, f1, f10, ece])

    if average == 'binary':
        briers = [brier_score_loss(labels, probs[:,1]) for labels, probs, _, _ in splits]

        error_counts = []
        for labels, _, y_pred, _ in splits:
            tn, fp, fn, tp = confusion_matrix(labels, y_pred).ravel()
            error_counts.append((fp, fn))

        costs = [calculateCost(fp, fn, split[3]) for (fp, fn), split in zip(error_counts, splits)]

        # binary rows carry three extra values per split: Brier score, cost, deferred items
        per_split = [scores + [brier, cost, uc]
                     for scores, brier, (cost, uc) in zip(per_split, briers, costs)]

    pre_val, rec_val, f1_val = per_split[1][0], per_split[1][1], per_split[1][3]
    print(
        'Iteration: {}. F1: {:1.3f}, Precision: {:1.3f}, Recall: {:1.3f}'.
        format(alBatch, f1_val, pre_val, rec_val))

    # print to result file
    fields = [alBatch, sampledItems]
    for scores in per_split:
        fields.extend(scores)
    with open(res_path, 'a') as f:
        f.write(', '.join('{}'.format(value) for value in fields) + '\n')

In [None]:
#load datasets
pool = Data(unlabeled_data_dir)
mClass =  pool.mClass
pool.setStartState(minimum_training_items)

validation = Data(validation_data_dir)
validation_data = validation.data
test = Data(test_data_dir)
test_data = test.data

poolData = pool.data
poolDataEmb_val = validation.poolDataEmb
poolDataEmb_test = test.poolDataEmb

# Item IDs are the only link between a data frame row and its feature vector,
# so they have to identify exactly one pool row.
assert poolData[iID].is_unique, "item IDs of the training pool must be unique"

# item ID -> row position of that item in pool.poolDataEmb
pool_position_by_id = pd.Series(np.arange(len(poolData)), index=poolData[iID].values)

# position of the ID inside the flat rows returned by DiversitySampling.get_model_outliers
# (the values of the pool columns followed by the score)
id_column = poolData.columns.get_loc(iID)


def _features_for(frame):
    """Return the pool feature rows of the items in `frame`, in the row order of `frame`.

    Looking the rows up by ID keeps features and labels aligned whatever the
    order of `frame` is.
    """
    positions = pool_position_by_id.loc[frame[iID].values].to_numpy(dtype=int)
    return pool.poolDataEmb[positions]


# header of the result files; binary tasks log Brier score, cost and deferred items in addition
if len(mClass) == 2:
    log_header = 'alBatch, sampledIndices, pre_train, rec_train, f01_train, f1_train, f10_train, ece_train, brier_train, cost_train, uc_train, pre_val, rec_val, f01_val, f1_val, f10_val, ece_val, brier_val, cost_val, uc_val, pre_test, rec_test, f01_test, f1_test, f10_test, ece_test, brier_test, cost_test, uc_test'
else:
    log_header = 'alBatch, sampledIndices, pre_train, rec_train, f01_train, f1_train, f10_train, ece_train, pre_val, rec_val, f01_val, f1_val, f10_val, ece_val, pre_test, rec_test, f01_test, f1_test, f10_test, ece_test'

# One full active learning run per (strategy, cost setting).
# cfp, cfn and ch are read as notebook-level variables by classify() and calculateCost().
for al_strategy in al_strategies:
    for cfp in cfpList:
        for cfn in cfnList:
            for ch in chList:
                # every run starts from the same labelled seed set
                training_data = pool.labeledSet
                unlabeled_data = pool.unlabeledSet

                train_data = _features_for(training_data)
                train_labels = np.array(training_data[goldLabel].tolist())

                #Start active learning
                sampleIds = []
                samplingRanks = []

                log_name = al_strategy + '_' + logfile_name + "_cfp_{}_cfn_{}_ch_{}.csv".format(cfp, cfn, ch)

                # create log file
                log_path = os.path.join(res_path, log_name)
                with open(log_path, 'w') as f:
                    f.write(log_header + '\n')

                model.fit(train_data, train_labels)
                evaluate(train_data, train_labels, poolDataEmb_val, validation_data, poolDataEmb_test, test_data, mClass, [], log_path, 0)

                for alBatch in range(alBatchNum):
                    if len(unlabeled_data) == 0:
                        print('Unlabeled pool is empty, stopping after {} batches'.format(alBatch))
                        break

                    # the last batch may be smaller than alBatchSize
                    batch_size = min(alBatchSize, len(unlabeled_data))
                    sampledIndices = []

                    unl_dataEmb = _features_for(unlabeled_data)

                    if al_strategy == 'diversity':
                        strategy = DiversitySampling(True)
                        sampledItems = strategy.get_model_outliers(poolData, model, unlabeled_data, unl_dataEmb, validation_data, poolDataEmb_val, number=batch_size)

                        for outlier in sampledItems:
                            samplingRanks.append(outlier[-1])
                            sampleIds.append(outlier[id_column])
                            sampledIndices.append(outlier[id_column])

                    elif al_strategy == 'uncertainty':
                        # idx holds row positions inside unl_dataEmb / unlabeled_data
                        idx = uncertainty_sampling(model, unl_dataEmb, batch_size)
                        sampledIndices = unlabeled_data[iID].iloc[idx].tolist()
                        sampleIds.extend(sampledIndices)
                    else:
                        # 'random', which is also the default for unknown strategy names
                        sampledIndices = random_sampling(unlabeled_data[iID].values, batch_size)
                        sampleIds.extend(sampledIndices)

                    # move the sampled items from the unlabeled set to the training set
                    sampledSet = poolData.loc[poolData[iID].isin(sampledIndices)]
                    training_data = pd.concat([training_data, sampledSet], axis=0)
                    training_data = training_data.sort_values(iID).reset_index(drop=True)
                    unlabeled_data = unlabeled_data.loc[~unlabeled_data[iID].isin(sampledIndices)].reset_index(drop=True)

                    train_data = _features_for(training_data)
                    train_labels = np.array(training_data[goldLabel].tolist())

                    model.fit(train_data, train_labels)
                    evaluate(train_data, train_labels, poolDataEmb_val, validation_data, poolDataEmb_test, test_data, mClass, sampledIndices, log_path, alBatch + 1)