In [None]:
!pip install sklearn
!pip install sentence-transformers

In [None]:
## Mount Drive into Colab
# Requires the `google.colab` package, i.e. a Colab runtime. The argument is the
# mount point passed to `drive.mount`.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Import libraries
import numpy as np
import pandas as pd 
import csv
import random
from random import shuffle
from scipy.special import softmax
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from sklearn.linear_model import LogisticRegression
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer, CountVectorizer

In [None]:
# Seed Python's `random` module, which drives `random.sample` in the seed-set draw and in
# random sampling. NumPy's generator is not seeded here.
random.seed(41)

# ---- Dataset parameters ----
datasetName = 'usAirline'                                                           # name of the dataset (used in result paths and file names)
data_folder = '.../usAirline/data/'     # folder that holds the three CSV files below (placeholder path, edit before running)
dataToTrain = 'usAirline_train.csv'                                                 # training / pool file name
dataToVal = 'usAirline_val.csv'                                                     # validation file name
dataToTest ='usAirline_test.csv'                                                    # test file name
txt = 'text'                                                                        # column that holds the raw text
# NOTE(review): calculate_value and the ECE code compare argmax column indices of predict_proba
# against the values of this column, so it presumably has to contain integer class ids
# 0..C-1 rather than label strings -- confirm against the CSV files.
goldLabel = 'airline_sentiment'                                                     # column that holds the ground-truth label

# ---- Active-learning parameters ----
al_strategies           = ['uncertainty', 'certainty', 'random', 'tos', 'tos-below'] # strategies to run; names starting with "tos" are routed to run_tos, all others to run_sota_AL
minimum_training_items  = 3                                                          # size of the initial labelled set; split evenly across classes with integer division
alBatchNum              = 88                                                         # number of active-learning batches; the last batch takes every item still unlabelled
alBatchSize             = 100                                                        # number of items queried per batch (except the last one)
controlList2 = [25, 50, 88]                                                          # batches after which the test-set predicted probabilities are written to a separate CSV


# ---- Encoder and classifier ----
encName = 'tfidf'                                                                    # pick either 'tfidf' or 'mpnet' as the encoder
model = LogisticRegression(max_iter=1000)                                            # classifier; must provide fit / predict / predict_proba
modelName = 'LogReg'                                                                 # classifier name used in output file names


# ---- Bookkeeping ----
# Name of the per-item id column. Data.__init__ creates a column literally called 'itemID',
# so this value must stay in sync with it.
iID = 'itemID'                                                                              # do not change this column
res_path = '.../AL/res/{}/'.format(datasetName) # folder that receives all result files (placeholder path, edit before running)
# Common prefix of every output file. The "rnd41" part is a literal and does not follow
# the value passed to random.seed above.
logfile_name = "{}_{}_rnd41_{}_".format(datasetName,modelName,encName)       

In [None]:
# Cost-based parameters, consumed by calculate_value:
#   Vr -- value assigned to a rejected prediction (confidence not above the threshold)
#   Vc -- value assigned to an accepted, correct prediction
#   Vw -- value assigned to an accepted, wrong prediction (one experiment per entry of Vw_list)
# For every Vw the run functions derive k = -Vw / Vc and the threshold t = k / (k + 1).
Vr = 0.0
Vc = 1.0
Vw_list = [0, -0.1, -0.2, -0.3, -0.4, -0.5, -0.6, -0.7, -0.8, -0.9, -1.0, -2.0, -4.0, -8.0, -10.0, -100.0]
# Candidate confidence thresholds searched by find_optimum_confidence_threshold: roughly
# 0.00 .. 1.00 in steps of 0.01 (np.arange with a float step, so values carry rounding noise).
confT_list = list(np.arange(0, 1.01, 0.01))

In [None]:
# Create the text encoder selected by `encName`.
if encName == 'tfidf':
    # Word 1- to 3-gram TF-IDF restricted to 1024 features.
    # min_df is the integer 1 and the idf/tf switches are real booleans: scikit-learn releases
    # with parameter validation reject the integer min_df=0 and integer flags such as use_idf=1.
    # min_df=1 keeps every term, exactly like min_df=0, because any vocabulary term occurs in
    # at least one document.
    encoder = TfidfVectorizer(min_df=1, max_features=1024, strip_accents='unicode', analyzer='word',
                              token_pattern=r'\w{1,}', ngram_range=(1, 3),
                              use_idf=True, smooth_idf=True, sublinear_tf=True,
                              stop_words='english', lowercase=False)
elif encName == 'mpnet':
    # Pre-trained sentence embedding model, loaded by name.
    encoder = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')
else:
    raise Exception("The encoder name provided can't be recognised. You should pick either tfidf or mpnet as the text encoder")

In [None]:
class CELoss(object):
    """Shared binning machinery for calibration-error metrics.

    Subclasses set ``self.n_bins`` and then call ``compute_bin_boundaries``,
    ``get_probabilities`` and ``compute_bins`` (in that order) to fill the per-bin
    arrays ``bin_prop``, ``bin_acc``, ``bin_conf`` and ``bin_score``.
    """

    def compute_bin_boundaries(self, probabilities = np.array([])):
        """Set ``self.bin_lowers`` / ``self.bin_uppers``.

        With no ``probabilities`` the bins are ``self.n_bins`` equal-width intervals on
        [0, 1]. Otherwise the lower edges are taken from the sorted probabilities at
        every ``int(n_data / n_bins)``-th position and the last upper edge is 1.0.
        ``n_data`` is ``self.n_data`` when it has already been set (see
        ``get_probabilities``) and the number of supplied probabilities otherwise.
        """
        if probabilities.size == 0:
            # uniform bin spacing
            bin_boundaries = np.linspace(0, 1, self.n_bins + 1)
        else:
            # Fall back to the input size so this branch also works before
            # get_probabilities has been called.
            n_data = getattr(self, 'n_data', probabilities.size)
            bin_n = int(n_data / self.n_bins)   # number of items per bin

            probabilities_sort = np.sort(probabilities)
            lower_edges = probabilities_sort[np.arange(self.n_bins) * bin_n]
            bin_boundaries = np.append(lower_edges, 1.0)

        self.bin_lowers = bin_boundaries[:-1]
        self.bin_uppers = bin_boundaries[1:]

    def get_probabilities(self, output, labels, logits):
        """Store probabilities, labels and the derived per-item statistics.

        ``output`` is passed through a row-wise softmax when ``logits`` is true and used
        as-is otherwise. Sets ``probabilities``, ``labels``, ``confidences`` (row maximum),
        ``predictions`` (row argmax), ``accuracies`` (prediction == label) and the sizes
        ``n_data`` / ``n_class`` that ``binary_matrices`` relies on.
        """
        # If not probabilities apply softmax!
        if logits:
            self.probabilities = softmax(output, axis=1)
        else:
            self.probabilities = output

        self.labels = labels
        self.confidences = np.max(self.probabilities, axis=1)
        self.predictions = np.argmax(self.probabilities, axis=1)
        # Predictions are column indices, so `labels` must use the same integer coding.
        self.accuracies = np.equal(self.predictions, labels)

        # Previously these were never assigned, which made binary_matrices fail.
        shape = np.shape(self.probabilities)
        self.n_data = shape[0]
        self.n_class = shape[1]

    def binary_matrices(self):
        """Build ``self.acc_matrix``: per item and class, whether the one-hot prediction
        agrees with the one-hot label. Requires ``get_probabilities`` to have run and
        ``self.labels`` to hold integer column indices.
        """
        idx = np.arange(self.n_data)
        # one-hot encodings of the predicted and of the true class
        pred_matrix = np.zeros([self.n_data, self.n_class])
        label_matrix = np.zeros([self.n_data, self.n_class])
        pred_matrix[idx, self.predictions] = 1
        label_matrix[idx, self.labels] = 1

        self.acc_matrix = np.equal(pred_matrix, label_matrix)

    def compute_bins(self, index = None):
        """Fill the per-bin statistics.

        With ``index`` left at ``None`` the maximum-probability confidences and the
        top-1 accuracies are binned. With an integer ``index`` the probabilities of that
        class column and the matching ``acc_matrix`` column are binned instead
        (``binary_matrices`` must have been called first).

        Bins are the half-open intervals (lower, upper]; empty bins keep all-zero entries.
        """
        self.bin_prop = np.zeros(self.n_bins)
        self.bin_acc = np.zeros(self.n_bins)
        self.bin_conf = np.zeros(self.n_bins)
        self.bin_score = np.zeros(self.n_bins)

        if index is None:
            confidences = self.confidences
            accuracies = self.accuracies
        else:
            confidences = self.probabilities[:, index]
            accuracies = self.acc_matrix[:, index]

        for i, (bin_lower, bin_upper) in enumerate(zip(self.bin_lowers, self.bin_uppers)):
            # Calculated |confidence - accuracy| in each bin
            in_bin = np.greater(confidences, bin_lower.item()) * np.less_equal(confidences, bin_upper.item())
            self.bin_prop[i] = np.mean(in_bin)

            if self.bin_prop[i].item() > 0:
                self.bin_acc[i] = np.mean(accuracies[in_bin])
                self.bin_conf[i] = np.mean(confidences[in_bin])
                self.bin_score[i] = np.abs(self.bin_conf[i] - self.bin_acc[i])

class MaxProbCELoss(CELoss):
    """Calibration statistics binned on the maximum predicted probability."""

    def loss(self, output, labels, n_bins = 15, logits = True):
        """Populate the per-bin arrays for ``output`` / ``labels``.

        Uses ``n_bins`` equal-width bins; ``output`` is soft-maxed first when ``logits``
        is true. Nothing is returned -- subclasses read the bin arrays afterwards.
        """
        self.n_bins = n_bins
        base = super()
        base.compute_bin_boundaries()                      # uniform bins on [0, 1]
        base.get_probabilities(output, labels, logits)     # confidences / accuracies
        base.compute_bins()                                # per-bin proportion, accuracy, confidence, gap

#http://people.cs.pitt.edu/~milos/research/AAAI_Calibration.pdf
class ECELoss(MaxProbCELoss):
    """Expected Calibration Error: per-bin |confidence - accuracy| weighted by bin size."""

    def loss(self, output, labels, n_bins = 15, logits = False):
        """Return the ECE of ``output`` against ``labels``.

        ``output`` is treated as probabilities unless ``logits`` is true; ``n_bins``
        equal-width confidence bins are used.
        """
        super().loss(output, labels, n_bins, logits)
        bin_weights, bin_gaps = self.bin_prop, self.bin_score
        return np.dot(bin_weights, bin_gaps)

In [None]:
def uncertainty_sampling(model, unl_emb, number):
    """Pick the ``number`` items the model is least sure about (smallest margin).

    The margin of an item is the gap between its two largest predicted class
    probabilities; if ``predict_proba`` returns a 1-D array its absolute value is
    used instead.

    Args:
      model: fitted classifier exposing ``predict_proba``.
      unl_emb: feature rows of the unlabelled items.
      number: how many items to select.
    Returns:
      row positions (into ``unl_emb``) of the selected items, smallest margin first.
    """
    proba = model.predict_proba(unl_emb)

    if proba.ndim >= 2:
        # after an ascending sort the last two columns are runner-up and winner
        top_two = np.sort(proba, 1)[:, -2:]
        margin = top_two[:, 1] - top_two[:, 0]
    else:
        margin = abs(proba)

    return np.argsort(margin)[0:number]

def certainty_sampling(model, unl_emb, number):
    """Pick the ``number`` items the model is most sure about (largest margin).

    The margin of an item is the gap between its two largest predicted class
    probabilities; if ``predict_proba`` returns a 1-D array its absolute value is
    used instead.

    Args:
      model: fitted classifier exposing ``predict_proba``.
      unl_emb: feature rows of the unlabelled items.
      number: how many items to select.
    Returns:
      row positions (into ``unl_emb``) of the selected items, largest margin first.
    """
    proba = model.predict_proba(unl_emb)

    if proba.ndim >= 2:
        # after an ascending sort the last two columns are runner-up and winner
        top_two = np.sort(proba, 1)[:, -2:]
        margin = top_two[:, 1] - top_two[:, 0]
    else:
        margin = abs(proba)

    descending = np.argsort(margin)[::-1]
    return descending[0:number]

def random_sampling(dataIds, nQuery):
    """Draw ``nQuery`` distinct entries of ``dataIds`` uniformly at random.

    Positions are drawn with ``random.sample`` (so the ``random`` module seed controls
    the result) and then used to index ``dataIds``, which therefore has to support
    list-of-positions indexing (e.g. a NumPy array).
    """
    positions = random.sample(range(len(dataIds)), nQuery)
    return dataIds[positions]

def threshold_oriented_sampling(model, unl_emb, number, t):
    """Pick the ``number`` items whose top predicted probability lies closest to ``t``.

    Args:
      model: fitted classifier exposing ``predict_proba``.
      unl_emb: feature rows of the unlabelled items.
      number: how many items to select.
      t: confidence threshold the selection is centred on.
    Returns:
      row positions (into ``unl_emb``), closest to the threshold first.
    """
    top_confidence = np.amax(model.predict_proba(unl_emb), axis=1)
    distance_to_t = np.abs(top_confidence - t)
    return np.argsort(distance_to_t)[:number]

def tos_below(model, unl_emb, number, t):
    """Pick up to ``number`` items at or below the threshold that lie closest to it.

    Items whose top predicted probability is strictly greater than ``t`` count as
    accepted and are never selected. The remaining items are ranked by
    ``|top probability - t|`` and the closest ones are returned, so fewer than
    ``number`` positions come back when not enough items are below the threshold.

    Args:
      model: fitted classifier exposing ``predict_proba``.
      unl_emb: feature rows of the unlabelled items.
      number: maximum number of items to select.
      t: confidence threshold.
    Returns:
      row positions (into ``unl_emb``), closest to the threshold first.
    """
    top_confidence = np.amax(model.predict_proba(unl_emb), axis=1)
    ranked = np.argsort(np.abs(top_confidence - t))
    # Filter with a mask instead of np.setdiff1d: setdiff1d returns its result sorted by
    # value, which threw the margin ranking away and selected the lowest positions instead.
    not_accepted = ~(top_confidence[ranked] > t)
    return ranked[not_accepted][:number]

def prepare_features(tfidf, X_train, setN):
    """Turn texts into a dense feature array with the given vectoriser.

    For ``setN == 'train'`` the vectoriser is fitted on ``X_train`` and then applied;
    for any other value the already-fitted vectoriser is only applied.
    """
    vectorise = tfidf.fit_transform if setN == 'train' else tfidf.transform
    return vectorise(X_train).toarray()

class Data():
    """One data split (pool / validation / test): raw frame, encoded features and labels.

    Attributes set by ``__init__``:
      data           -- the CSV as a DataFrame with an added positional 'itemID' column
      poolDataEmb    -- encoded text, one row per row of ``data``
      poolGoldLabels -- values of the gold-label column
      mClass         -- sorted list of the distinct gold labels
    ``setStartState`` additionally sets ``nStart``, ``labeledSet`` and ``unlabeledSet``.
    """

    def __init__(self, encoder, filename, setN, encoderName):
        """Read ``filename`` and encode its text column.

        Args:
          encoder: TF-IDF vectoriser or sentence encoder (must match ``encoderName``).
          filename: path of the CSV file; it must contain the columns named by the
            module-level ``txt`` and ``goldLabel`` settings.
          setN: split name; 'train' makes the TF-IDF vectoriser fit before transforming.
          encoderName: 'tfidf' selects ``prepare_features``; anything else calls
            ``encoder.encode``.
        """
        # each dataset will have a pool of data, together with their IDs and gold labels
        self.poolData = np.array([])
        self.poolGoldLabels = np.array([])

        dt = pd.read_csv(filename)
        #dt = dt.dropna()     # uncomment this if your data has none values
        dt = dt.reset_index(drop=True)
        dt['itemID'] = np.arange(dt.shape[0])   # item id == row position
        y = dt[goldLabel].values
        # Use the encoder name passed in by the caller rather than the global setting.
        if encoderName == 'tfidf':
            X = prepare_features(encoder, dt[txt].tolist(), setN)
        else:
            X = encoder.encode(dt[txt].tolist())
        self.data = dt
        self.poolDataEmb = X
        self.poolGoldLabels = y
        # Sorted so that the class order (and with it the seed-set draw below) is the same
        # on every run; plain set() iteration order is not reproducible for string labels.
        self.mClass = sorted(set(self.poolGoldLabels.tolist()))

    def setStartState(self, nStart):
        """Create the initial labelled set with the same number of items per class.

        ``nStart // number_of_classes`` items are drawn per class with ``random.sample``.
        The drawn rows become ``self.labeledSet`` and all other rows
        ``self.unlabeledSet``; both keep ascending 'itemID' order so that they line up
        with rows of ``poolDataEmb`` selected in that order.

        Input:
        nStart -- number of labelled datapoints (size of training set)
        """
        self.nStart = nStart
        data = self.data.copy()
        per_class = nStart // len(self.mClass)
        # get predefined points so that all classes are represented and initial classifier could be trained.
        sampledIndices = []
        for cls in self.mClass:
            indices = np.where(self.poolGoldLabels == cls)[0].tolist()
            sampledIndices = sampledIndices + random.sample(indices, per_class)
        sampledIndices.sort()

        self.labeledSet = data.iloc[sampledIndices].reset_index(drop=True)
        self.unlabeledSet = data.drop(sampledIndices).reset_index(drop=True)

In [None]:
def evaluate_model(model, X_train, y_train, X_val, y_val, X_test, y_test, rp_path, batch):
    """Score ``model`` on the train, validation and test splits and log one CSV row.

    For each split accuracy, precision, recall, F1 and ECE are computed. Precision,
    recall and F1 use 'binary' averaging when ``y_test`` contains exactly two distinct
    labels and 'macro' averaging otherwise. The row
    ``batch, <5 train metrics>, <5 val metrics>, <5 test metrics>`` is appended to
    ``rp_path``.

    Returns:
      the ``predict_proba`` outputs for train, validation and test (in that order).
    """
    # check if binary or multi class classification
    average = 'binary' if len(set(y_test)) == 2 else 'macro'

    ece = ECELoss()
    row = [batch]
    probabilities = []
    for X, y in ((X_train, y_train), (X_val, y_val), (X_test, y_test)):
        y_pred = model.predict(X)
        proba = model.predict_proba(X)

        acc = accuracy_score(y, y_pred)
        pre, rec, f1, _ = precision_recall_fscore_support(y, y_pred, average=average, beta=1)
        # predict_proba already yields probabilities, hence logits=False
        row += [acc, pre, rec, f1, ece.loss(proba, y, logits=False)]
        probabilities.append(proba)

    with open(rp_path, 'a') as f:
        f.write(', '.join(['{}'] * len(row)).format(*row) + '\n')

    return tuple(probabilities)

def cost_based_threshold(k):
    """Map the cost ratio ``k`` to the confidence threshold ``k / (k + 1)``."""
    return k / (k + 1)

def calculate_value(y_hat_proba, y, t, Vr, Vc, Vw):
    """Average value of accepting or rejecting predictions at confidence threshold ``t``.

    A prediction is accepted when its top probability is strictly greater than ``t``;
    the predicted class is then the argmax column index, which is compared with ``y``
    (so ``y`` must use the same integer coding). Each item contributes ``Vc`` if
    accepted and correct, ``Vw`` if accepted and wrong and ``Vr`` if rejected.

    Args:
      y_hat_proba: 2-D array-like of class probabilities, one row per item.
      y: true labels, one per row.
      t: confidence threshold.
      Vr, Vc, Vw: value of a rejected / correct / wrong prediction.
    Returns:
      (mean value per item, number of rejected items, number of accepted wrong items).
      For empty input the mean is NaN and both counts are 0.
    """
    proba = np.asarray(y_hat_proba)
    y = np.asarray(y)
    if proba.size == 0:
        return float('nan'), 0, 0

    top_confidence = np.amax(proba, axis=1)
    # -1 marks a rejected item (handed to a human)
    y_pred = np.where(top_confidence > t, np.argmax(proba, axis=1), -1)

    rejected = y_pred == -1
    wrong = (y_pred != y) & ~rejected

    # now lets compute the actual value of each prediction
    # Explicit float dtype: np.full otherwise inherits the dtype of Vc, and an integer Vc
    # would silently truncate fractional Vw / Vr values on assignment.
    value_vector = np.full(y_pred.shape[0], Vc, dtype=float)
    value_vector[wrong] = Vw
    # loss due to asking humans
    value_vector[rejected] = Vr
    value = np.sum(value_vector) / len(y)

    numOfRejectedSamples = np.count_nonzero(rejected)
    numOfWrongPredictions = np.count_nonzero(wrong)
    return value, numOfRejectedSamples, numOfWrongPredictions

def find_optimum_confidence_threshold(y_hat_proba, y, t_list, Vr, Vc, Vw):
    """Search ``t_list`` for the threshold with the highest average value.

    Every candidate is scored with ``calculate_value``; among the candidates that
    reach the maximum the smallest threshold is returned.

    Returns:
      (best threshold as float, dict mapping ``"{}".format(t)`` to the value at ``t``).
    """
    cost_list = {
        "{}".format(t): calculate_value(y_hat_proba, y, t, Vr, Vc, Vw)[0]
        for t in t_list
    }

    # find t values with maximum value
    best_value = max(cost_list.values())
    # pick the one with the lowest confidence
    optimumT = min(float(key) for key, val in cost_list.items() if val == best_value)

    return optimumT, cost_list

In [None]:
def run_sota_AL(model,al_strategy,pool,mClass,validation,validation_data,y_val,test,test_data,y_test,alBatchSize,res_path,logfile_name,Vw_list):
    """Run one active-learning experiment with a threshold-independent query strategy.

    The model is fitted on the initial labelled set, then for ``alBatchNum`` batches
    new items are queried, added to the training set and the model is refitted. After
    every fit the performance row (via ``evaluate_model``) and one value row per entry
    of ``Vw_list`` are appended to the log files.

    Args:
      model: classifier with fit / predict / predict_proba.
      al_strategy: 'uncertainty', 'certainty' or 'random'; any other name falls back
        to random sampling.
      pool: ``Data`` object of the training pool with ``setStartState`` already called.
      mClass, validation_data, test_data: accepted for call compatibility, not used.
      validation, test: ``Data`` objects providing ``poolDataEmb``.
      y_val, y_test: gold labels of the validation / test split.
      alBatchSize: items queried per batch (the last batch takes all remaining items).
      res_path, logfile_name: folder and file-name prefix of the output files.
      Vw_list: values of a wrong prediction to evaluate.

    Reads the module-level settings iID, goldLabel, alBatchNum, controlList2,
    confT_list, Vr and Vc. Writes ``*_perf.csv``, ``*_value.csv``,
    ``*_b_<batch>_logits.csv`` (for batches in controlList2) and ``*_sampledItems.csv``.
    """
    poolData = pool.data.copy()
    # Embedding rows are selected in poolData (ascending item id) order, so the training
    # frame has to be in the same order for its labels to line up with those rows. The
    # initial labelled set used to be left in sampling order, which paired embeddings with
    # the wrong labels for the very first fit.
    training_data = pool.labeledSet.sort_values(iID).reset_index(drop=True)
    unlabeled_data = pool.unlabeledSet.copy()
    batchSize = alBatchSize

    poolDataEmb_val = validation.poolDataEmb
    poolDataEmb_test = test.poolDataEmb

    def embeddings_of(frame):
        """Pool embedding rows of the items listed in ``frame``, in poolData row order."""
        return pool.poolDataEmb[poolData.index[poolData[iID].isin(frame[iID].values)].tolist()]

    def log_values(batch, logits_train, logits_val, logits_test, train_labels):
        """Append one value row per Vw for the model state reached after ``batch``."""
        with open(rv_path, 'a') as f:
            for Vw in Vw_list:
                k = (-1)*(Vw / Vc)
                t = cost_based_threshold(k)

                # value at the cost-derived threshold
                test_cal = calculate_value(logits_test, y_test, t, Vr, Vc, Vw)
                train_cal = calculate_value(logits_train, train_labels, t, Vr, Vc, Vw)

                # value at the threshold tuned on the validation split
                t_opt, _ = find_optimum_confidence_threshold(logits_val, y_val, confT_list, Vr, Vc, Vw)
                test_opt = calculate_value(logits_test, y_test, t_opt, Vr, Vc, Vw)
                train_opt = calculate_value(logits_train, train_labels, t_opt, Vr, Vc, Vw)

                # thresholds tuned directly on train / test, and the test value at the latter
                t_opt_train, _ = find_optimum_confidence_threshold(logits_train, train_labels, confT_list, Vr, Vc, Vw)
                t_opt_test, _ = find_optimum_confidence_threshold(logits_test, y_test, confT_list, Vr, Vc, Vw)
                test_opt_test = calculate_value(logits_test, y_test, t_opt_test, Vr, Vc, Vw)

                row = (batch, Vr, Vc, Vw, k, t, t_opt, t_opt_train, t_opt_test,
                       *test_cal, *train_cal, *test_opt, *train_opt, *test_opt_test)
                f.write(', '.join(['{}'] * len(row)).format(*row) + '\n')

    train_data = embeddings_of(training_data)
    train_labels = np.array(training_data[goldLabel].tolist())

    #Start active learning
    samplesDict = {0: pool.labeledSet[iID].tolist()}

    # create log file
    rp_path = res_path + logfile_name + al_strategy + "_perf.csv"
    with open(rp_path, 'w') as f:
        c = 'batch, acc_train, pre_train, rec_train, f1_train, ece_train, acc_val, pre_val, rec_val, f1_val, ece_val, acc_test, pre_test, rec_test, f1_test, ece_test'
        f.write(c + '\n')

    rv_path = res_path + logfile_name + al_strategy + "_value.csv"
    with open(rv_path, 'w') as f:
        c = 'batch, Vr, Vc, Vw, k, t_cal, t_opt_val, t_opt_train, t_opt_test, value_test, rej_test, wrong_test, value_train, rej_train, wrong_train, value_test_opt, rej_test_opt, wrong_test_opt, value_train_opt, rej_train_opt, wrong_train_opt, value_test_opt_test, rej_test_opt_test, wrong_test_opt_test'
        f.write(c + '\n')

    model.fit(train_data, train_labels)
    logits_train, logits_val, logits_test = evaluate_model(model, train_data, train_labels, poolDataEmb_val, y_val, poolDataEmb_test, y_test, rp_path, 0)
    log_values(0, logits_train, logits_val, logits_test, train_labels)

    for alBatch in range(1, alBatchNum + 1, 1):
        unl_dataEmb = embeddings_of(unlabeled_data)

        if alBatch == alBatchNum:
            # last batch: label everything that is left
            batchSize = len(unlabeled_data[iID].values)
            print("alBatchSize changed to: ", batchSize)

        if al_strategy == 'uncertainty':
            idx = uncertainty_sampling(model, unl_dataEmb, batchSize)
            sampledIndices = unlabeled_data.loc[idx][iID].tolist()
        elif al_strategy == 'certainty':
            # This branch was missing, so 'certainty' runs silently used random sampling.
            idx = certainty_sampling(model, unl_dataEmb, batchSize)
            sampledIndices = unlabeled_data.loc[idx][iID].tolist()
        else:
            # 'random' and, as the default, any unrecognised strategy name
            sampledIndices = random_sampling(unlabeled_data[iID].values, batchSize)

        sampledSet = poolData.loc[poolData[iID].isin(sampledIndices)]
        samplesDict[alBatch] = sampledIndices

        # move the queried items from the unlabelled to the labelled set
        training_data = pd.concat([training_data, sampledSet], axis=0).sort_values(iID).reset_index(drop=True)
        unlabeled_data = unlabeled_data.loc[~unlabeled_data[iID].isin(sampledIndices)].reset_index(drop=True)

        train_data = embeddings_of(training_data)
        train_labels = np.array(training_data[goldLabel].tolist())

        model.fit(train_data, train_labels)

        logits_train, logits_val, logits_test = evaluate_model(model, train_data, train_labels, poolDataEmb_val, y_val, poolDataEmb_test, y_test, rp_path, alBatch)

        if alBatch in controlList2:
            # keep the raw test probabilities of selected batches for later analysis
            col = [str(i) for i in range(logits_test.shape[1])]
            df_lgt = pd.DataFrame(logits_test, columns = col)
            df_lgt['y'] = y_test
            df_lgt.to_csv(res_path + logfile_name + al_strategy + '_b_{}_logits.csv'.format(alBatch))

        log_values(alBatch, logits_train, logits_val, logits_test, train_labels)

    # one column per batch; batches may differ in length, hence the Series wrapping
    samplesDict_df = pd.DataFrame({batch: pd.Series(ids) for batch, ids in samplesDict.items()})
    samplesDict_df.to_csv(res_path + logfile_name + al_strategy + "_sampledItems.csv")


def run_tos(model,al_strategy,pool,mClass,validation,validation_data,y_val,test,test_data,y_test,alBatchSize,res_path,logfile_name,Vw_list):
    """Run threshold-oriented active learning, once per entry of ``Vw_list``.

    For every ``Vw`` the cost ratio ``k = -Vw / Vc`` and the threshold
    ``t = k / (k + 1)`` are derived. The model is fitted on the initial labelled set,
    evaluated (batch 0), and then ``alBatchNum`` batches are queried with
    ``threshold_oriented_sampling`` (``al_strategy == 'tos'``) or ``tos_below`` (any
    other name), refitting and logging after each batch.

    Args:
      model: classifier with fit / predict / predict_proba.
      al_strategy: 'tos' or another name starting the tos-below variant.
      pool: ``Data`` object of the training pool with ``setStartState`` already called.
      mClass, validation_data, test_data: accepted for call compatibility, not used.
      validation, test: ``Data`` objects providing ``poolDataEmb``.
      y_val, y_test: gold labels of the validation / test split.
      alBatchSize: items queried per batch (the last batch asks for all remaining items).
      res_path, logfile_name: folder and file-name prefix of the output files.
      Vw_list: values of a wrong prediction; one independent run each.

    Reads the module-level settings iID, goldLabel, alBatchNum, controlList2,
    confT_list, Vr and Vc. Per ``k`` it writes ``*_<k>_perf.csv``, ``*_<k>_value.csv``,
    ``*__b_<batch>_k_<k>_logits.csv`` (for batches in controlList2) and
    ``*__k_<k>_sampledItems.csv``.
    """
    poolData = pool.data.copy()
    # Embedding rows are selected in poolData (ascending item id) order, so the training
    # frame has to be in the same order for its labels to line up with those rows. The
    # initial labelled set used to be left in sampling order, which paired embeddings with
    # the wrong labels for the very first fit.
    training_data_first = pool.labeledSet.sort_values(iID).reset_index(drop=True)
    unlabeled_data_first = pool.unlabeledSet

    poolDataEmb_val = validation.poolDataEmb
    poolDataEmb_test = test.poolDataEmb

    def embeddings_of(frame):
        """Pool embedding rows of the items listed in ``frame``, in poolData row order."""
        return pool.poolDataEmb[poolData.index[poolData[iID].isin(frame[iID].values)].tolist()]

    def value_row(batch, Vw, k, t, logits_train, logits_val, logits_test, train_labels):
        """Format the value-log row for one batch of the run belonging to ``Vw``."""
        # value at the cost-derived threshold
        test_cal = calculate_value(logits_test, y_test, t, Vr, Vc, Vw)
        train_cal = calculate_value(logits_train, train_labels, t, Vr, Vc, Vw)

        # value at the threshold tuned on the validation split
        t_opt, _ = find_optimum_confidence_threshold(logits_val, y_val, confT_list, Vr, Vc, Vw)
        test_opt = calculate_value(logits_test, y_test, t_opt, Vr, Vc, Vw)
        train_opt = calculate_value(logits_train, train_labels, t_opt, Vr, Vc, Vw)

        # thresholds tuned directly on train / test, and the test value at the latter
        t_opt_train, _ = find_optimum_confidence_threshold(logits_train, train_labels, confT_list, Vr, Vc, Vw)
        t_opt_test, _ = find_optimum_confidence_threshold(logits_test, y_test, confT_list, Vr, Vc, Vw)
        test_opt_test = calculate_value(logits_test, y_test, t_opt_test, Vr, Vc, Vw)

        row = (batch, Vr, Vc, Vw, k, t, t_opt, t_opt_train, t_opt_test,
               *test_cal, *train_cal, *test_opt, *train_opt, *test_opt_test)
        return ', '.join(['{}'] * len(row)).format(*row) + '\n'

    train_data_first = embeddings_of(training_data_first)
    train_labels_first = np.array(training_data_first[goldLabel].tolist())

    for Vw in Vw_list:
        k = (-1)*(Vw / Vc)
        t = cost_based_threshold(k)

        rp_path = res_path + logfile_name + al_strategy + '_' + str(k) + "_perf.csv"
        rv_path = res_path + logfile_name + al_strategy + '_' + str(k) + "_value.csv"

        with open(rp_path, 'w') as f:
            c = 'batch, acc_train, pre_train, rec_train, f1_train, ece_train, acc_val, pre_val, rec_val, f1_val, ece_val, acc_test, pre_test, rec_test, f1_test, ece_test'
            f.write(c + '\n')

        # Every run starts from the model fitted on the initial labelled set. Without this
        # refit all runs after the first began sampling with the model left over from the
        # final batch of the previous run.
        model.fit(train_data_first, train_labels_first)
        logits_train, logits_val, logits_test = evaluate_model(model, train_data_first, train_labels_first, poolDataEmb_val, y_val, poolDataEmb_test, y_test, rp_path, 0)

        with open(rv_path, 'w') as f:
            c = 'batch, Vr, Vc, Vw, k, t_cal, t_opt_val, t_opt_train, t_opt_test, value_test, rej_test, wrong_test, value_train, rej_train, wrong_train, value_test_opt, rej_test_opt, wrong_test_opt, value_train_opt, rej_train_opt, wrong_train_opt, value_test_opt_test, rej_test_opt_test, wrong_test_opt_test'
            f.write(c + '\n')
            f.write(value_row(0, Vw, k, t, logits_train, logits_val, logits_test, train_labels_first))

        #Start active learning
        samplesDict = {0: pool.labeledSet[iID].tolist()}
        unlabeled_data = unlabeled_data_first.copy()
        training_data = training_data_first.copy()
        batchSize = alBatchSize

        for alBatch in range(1, alBatchNum + 1, 1):
            unl_dataEmb = embeddings_of(unlabeled_data)

            if alBatch == alBatchNum:
                # last batch: ask for everything that is left
                batchSize = len(unlabeled_data[iID].values)
                print("alBatchSize changed to: ", batchSize)

            if al_strategy == 'tos':
                idx = threshold_oriented_sampling(model, unl_dataEmb, batchSize, t)
            else:
                idx = tos_below(model, unl_dataEmb, batchSize, t)
            sampledIndices = unlabeled_data.loc[idx][iID].tolist()
            sampledSet = poolData.loc[poolData[iID].isin(sampledIndices)]
            samplesDict[alBatch] = sampledIndices

            # move the queried items from the unlabelled to the labelled set
            training_data = pd.concat([training_data, sampledSet], axis=0).sort_values(iID).reset_index(drop=True)
            unlabeled_data = unlabeled_data.loc[~unlabeled_data[iID].isin(sampledIndices)].reset_index(drop=True)

            train_data = embeddings_of(training_data)
            train_labels = np.array(training_data[goldLabel].tolist())

            model.fit(train_data, train_labels)

            logits_train, logits_val, logits_test = evaluate_model(model, train_data, train_labels, poolDataEmb_val, y_val, poolDataEmb_test, y_test, rp_path, alBatch)

            if alBatch in controlList2:
                # keep the raw test probabilities of selected batches for later analysis
                col = [str(i) for i in range(logits_test.shape[1])]
                df_lgt = pd.DataFrame(logits_test, columns = col)
                df_lgt['y'] = y_test
                df_lgt.to_csv(res_path + logfile_name + al_strategy + '_' + '_b_{}_k_{}_logits.csv'.format(alBatch, k))

            with open(rv_path, 'a') as f:
                f.write(value_row(alBatch, Vw, k, t, logits_train, logits_val, logits_test, train_labels))

        # one column per batch; batches may differ in length, hence the Series wrapping
        samplesDict_df = pd.DataFrame({batch: pd.Series(ids) for batch, ids in samplesDict.items()})
        samplesDict_df.to_csv(res_path + logfile_name + al_strategy + '_' + '_k_{}_sampledItems.csv'.format(k))

In [None]:
# Load and encode the three splits once. None of this depends on the strategy, so it is
# done outside the loop instead of re-reading and re-encoding the data for every strategy.
pool = Data(encoder, data_folder + dataToTrain, 'train', encName)
mClass = pool.mClass

validation = Data(encoder, data_folder + dataToVal, 'val', encName)
validation_data = validation.data
y_val = np.array(validation_data[goldLabel].tolist())

test = Data(encoder, data_folder + dataToTest, 'test', encName)
test_data = test.data
y_test = np.array(test_data[goldLabel].tolist())

# Run every configured strategy. The initial labelled set is still drawn anew for each
# strategy, in the same order of random draws as before.
for al_strategy in al_strategies:
    pool.setStartState(minimum_training_items)

    if al_strategy.startswith("tos"):
        run_tos(model,al_strategy,pool,mClass,validation,validation_data,y_val,test,test_data,y_test,alBatchSize,res_path,logfile_name,Vw_list)
    else:
        run_sota_AL(model,al_strategy,pool,mClass,validation,validation_data,y_val,test,test_data,y_test,alBatchSize,res_path,logfile_name,Vw_list)