In [None]:
# Install the notebook's extra dependencies: a pinned small-text pre-release (with its
# "transformers" extra) and scikit-multilearn, whose `skmultilearn` package is imported below
# for IterativeStratification. NOTE(review): scikit-multilearn is not version-pinned.
!pip install --upgrade small-text[transformers]==1.0.0b3
! pip install scikit-multilearn

import time
import pickle
import pandas as pd
import torch
import numpy as np
from scipy import sparse
from transformers import AutoTokenizer

from small_text.integrations.transformers.datasets import TransformersDataset
from small_text.active_learner import PoolBasedActiveLearner
from small_text.initialization import random_initialization_balanced, random_initialization, random_initialization_stratified
from small_text.integrations.transformers import TransformerModelArguments
from small_text.integrations.transformers.classifiers.factories import TransformerBasedClassificationFactory
from small_text.query_strategies import PredictionEntropy, RandomSampling, ContrastiveActiveLearning
from small_text.integrations.transformers import TransformerModelArguments

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score, classification_report
from sklearn.model_selection import StratifiedKFold
from skmultilearn.model_selection import IterativeStratification

In [None]:
# Show which GPU(s), if any, this runtime exposes before the experiments are started.
!nvidia-smi -L

Connect to drive, read dataset csv for the Cycling Dialogues from "./drive/My Drive/EGOV-2022/dataset.csv" and transform the dataset.

In [None]:
# Mount Google Drive at /content/drive so that files stored on Drive can be read by later cells.
# NOTE(review): the dataset path used later is relative ("./drive/My Drive/..."), which presumably
# relies on the working directory being /content — confirm when running outside default Colab.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Alternative local copy of the data (kept for reference):
#df_raddialoge = pd.read_csv("raddialog-thematic-eng.csv")
df_raddialoge = pd.read_csv("./drive/My Drive/EGOV-2022/dataset.csv")

# Replace missing values by empty strings, then join title and text into a single input column.
df_raddialoge.fillna('', inplace=True)
df_raddialoge['title_text'] = df_raddialoge['title'] + " " + df_raddialoge['text']

# One dataframe per city, selected via the 'dataset' column (codes B, E and M).
df_b, df_e, df_m = (
    df_raddialoge[df_raddialoge['dataset'] == city_code]
    for city_code in ('B', 'E', 'M')
)

# Raw input documents per city.
X_b, X_e, X_m = (city_df['title_text'].tolist() for city_df in (df_b, df_e, df_m))

# --- single label preparation ---
# The encoder is re-fit for every city (B, then E, then M), so the integer codes are only
# comparable between cities if the same set of categories occurs in each of them, and `le`
# itself ends up fitted on city M.
le = LabelEncoder()
y_b, y_e, y_m = (
    le.fit_transform(city_df['main_category_level1'])
    for city_df in (df_b, df_e, df_m)
)

# Number of distinct main categories observed in city B.
num_classes = len(np.unique(y_b))

# --- multi label preparation ---
# One indicator column per level-1 category.
multi_cols = [
    'level1_traffic_lights',
    'level1_lighting',
    'level1_signage',
    'level1_bicycle_parking',
    'level1_obstacles',
    'level1_cycling_traffic_management',
    'level1_cycle_path_quality',
    'level1_misc',
]

y_b_multi, y_e_multi, y_m_multi = (
    city_df[multi_cols].to_numpy() for city_df in (df_b, df_e, df_m)
)

Data preparation for BERT

In [None]:
# Name of the pretrained checkpoint; it is used for the tokenizer here and is also read by the
# cross-validation function when it builds the classifier.
transformer_model_name = 'deepset/gbert-base'

# Case is preserved during tokenization (do_lower_case=False).
tokenizer = AutoTokenizer.from_pretrained(
    transformer_model_name,
    do_lower_case=False,
)

In [None]:
def get_transformers_dataset(tokenizer, data, labels, max_length=256, multi_label=False):
    """Tokenize documents and bundle them with their labels as a TransformersDataset.

    Parameters
    ----------
    tokenizer
        Object providing ``encode_plus`` (here: the tokenizer created via AutoTokenizer).
    data
        Iterable of documents to encode.
    labels
        Indexable container; ``labels[i]`` is stored next to the i-th document.
    max_length : int
        Every document is padded / truncated to this many tokens.
    multi_label : bool
        Passed through to ``TransformersDataset``. The stored label itself is
        ``labels[i]`` in either mode.

    Returns
    -------
    TransformersDataset
        Built from one ``(input_ids, attention_mask, label)`` tuple per document.
    """

    def _encode(text):
        # Fixed-length encoding returned as PyTorch tensors.
        return tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            padding='max_length',
            max_length=max_length,
            return_attention_mask=True,
            return_tensors='pt',
            truncation='longest_first'
        )

    records = []
    for position, text in enumerate(data):
        encoding = _encode(text)
        records.append((encoding['input_ids'],
                        encoding['attention_mask'],
                        labels[position]))

    return TransformersDataset(records, multi_label=multi_label)

Set up Active Learner

In [None]:
# simulates an initial labeling to warm-start the active learning process
def initialize_active_learner(active_learner, y_train, x_train):
    """Warm-start the active learner with 20 randomly drawn, already labelled samples.

    Parameters
    ----------
    active_learner
        Learner whose ``initialize_data`` is called with the drawn indices and their labels.
    y_train
        Labels of the training pool; indexed with the drawn indices.
    x_train
        Training pool handed to ``random_initialization`` to draw the indices from.

    Returns
    -------
    The indices of the initially labelled samples.
    """
    seed_indices = random_initialization(x_train, n_samples=20)
    seed_labels = y_train[seed_indices]
    active_learner.initialize_data(seed_indices, seed_labels)
    return seed_indices


Active Learning Loop

In [None]:
def evaluate(active_learner, train, test):
    """Print train/test scores plus a classification report and return the test score.

    The values printed as "accuracy" are micro-averaged F1 scores computed with
    ``f1_score(..., average='micro')``.

    Parameters
    ----------
    active_learner
        Learner whose current ``classifier`` is used for prediction.
    train, test
        Datasets exposing their true labels as ``.y``.

    Returns
    -------
    The micro-averaged F1 score on ``test``.
    """
    classifier = active_learner.classifier
    predicted_train = classifier.predict(train)
    predicted_test = classifier.predict(test)

    test_score = f1_score(test.y, predicted_test, average='micro')
    train_score = f1_score(train.y, predicted_train, average='micro')

    print('Train accuracy: {:.2f}'.format(train_score))
    print('Test accuracy: {:.2f}'.format(test_score))

    print(classification_report(test.y, predicted_test))

    return test_score

In [None]:
# run 5-fold CV
def AlCV(X_in, y_in, d, qs, bs, non_active, multi, split_start):
    """Run a simulated pool-based active learning experiment with 5-fold cross-validation.

    For every fold a fresh classifier factory and active learner are created, 20 samples
    are labelled at random, and then up to 30 query rounds are simulated by looking up the
    true labels of the queried samples. The last round always requests whatever is still
    unlabelled, so the final evaluation of each fold is on the fully labelled training set.
    Per-fold results, timings and label pools are collected in a dict and printed; nothing
    is returned.

    Parameters
    ----------
    X_in : list
        Documents (raw text).
    y_in : array-like
        Labels aligned with ``X_in``: a 1-D array of class ids, or a 2-D indicator matrix.
    d
        Dataset code. Not used inside this function.
    qs
        Query strategy instance; the same instance is used for every fold.
    bs : int
        Passed to the classifier as ``mini_batch_size``.
    non_active : bool
        If True, only one query round is run, which labels the whole remaining pool
        at once (full supervision).
    multi : bool
        If True, folds are built with ``IterativeStratification``, otherwise with
        ``StratifiedKFold``. NOTE(review): the flag is not forwarded to the dataset or
        the classifier (both are created in single-label mode) — confirm before
        running real multi-label experiments.
    split_start : int
        1-based number of the first fold to run; folds with a smaller number are skipped.
    """
    labels = np.asarray(y_in)

    # Size the classifier from the labels that were actually passed in instead of relying on
    # a notebook-level value computed for one particular city.
    if labels.ndim == 2:
        n_classes = labels.shape[1]
    else:
        n_classes = np.unique(labels).shape[0]

    # Fall back to the CPU when no GPU is available instead of failing at model creation.
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    if multi:
        splitter = IterativeStratification(n_splits=5, order=1)
    else:
        splitter = StratifiedKFold(n_splits=5)

    for split, (train_index, test_index) in enumerate(splitter.split(X_in, labels), start=1):

        print("Split", split)

        if split < split_start:
            continue

        # prepare dataset
        X_train = [X_in[index] for index in train_index]
        X_test = [X_in[index] for index in test_index]
        y_train, y_test = labels[train_index], labels[test_index]

        train = get_transformers_dataset(tokenizer, X_train, y_train)
        test = get_transformers_dataset(tokenizer, X_test, y_test)

        # init the model and query strategy
        transformer_model = TransformerModelArguments(transformer_model_name)
        clf_factory = TransformerBasedClassificationFactory(
            transformer_model,
            num_classes=n_classes,
            kwargs={
                'device': device,
                'mini_batch_size': bs,
                'class_weight': 'balanced',  # for single label
            })

        active_learner = PoolBasedActiveLearner(clf_factory, qs, train, reuse_model=True)
        indices_labeled = initialize_active_learner(active_learner, train.y, train.x)

        # Number of query rounds: enough rounds of 20 to cover the rest of the pool
        # (ceiling division done in integers), capped at 30.
        num_samples = 20
        num_queries = min(-(-(len(X_train) - num_samples) // num_samples), 30)

        if non_active:
            num_queries = 1

        # time stamps
        query_times = []
        user_simulation_times = []
        update_times = []
        evaluation_times = []
        overall_iteration_times = []

        # results per iteration, starting with the warm-start model
        results = [evaluate(active_learner, train[indices_labeled], test)]

        # labels in the pool per iteration
        labels_in_pool = [train.y[indices_labeled]]

        for i in range(num_queries):

            start_time = time.time()

            # Every round labels 20 samples, except the last one, which takes all samples
            # that are still unlabelled at that point.
            if i == num_queries - 1:
                num_samples = len(X_train) - (num_queries * num_samples)

            indices_queried = active_learner.query(num_samples=num_samples)

            query_time = time.time()
            query_times.append(query_time - start_time)

            # Simulate user interaction here. Replace this for real-world usage.
            y_queried = train.y[indices_queried]

            user_simulation_time = time.time()
            user_simulation_times.append(user_simulation_time - query_time)

            # Return the labels for the current query to the active learner.
            active_learner.update(y_queried)

            indices_labeled = np.concatenate([indices_queried, indices_labeled])

            update_time = time.time()
            update_times.append(update_time - user_simulation_time)

            print('---------------')
            print(f'Iteration #{i} ({len(indices_labeled)} samples)')
            results.append(evaluate(active_learner, train[indices_labeled], test))

            labels_in_pool.append(train.y[indices_labeled])

            evaluation_time = time.time()
            evaluation_times.append(evaluation_time - update_time)

            overall_iteration_times.append(evaluation_time - start_time)

        summary = {
            'results': results,
            'query_times': query_times,
            'user_simulation_times': user_simulation_times,
            'update_times': update_times,
            'evaluation_times': evaluation_times,
            'overall_iteration_times': overall_iteration_times,
            'labels_in_pool': labels_in_pool
        }

        print(summary)

Example usage for an Active Learning experiment:
AlCV(X_dataset, y_dataset, dataset_code, query_strategy, batch_size, non_active, multi, split_start):


*   non_active: if True perform full supervision experiment, else if False perform Active Learning experiment.
*   multi: if True perform multi-label prediction, else if False perform single-label prediction.
*   split_start: number (starting at 1) of the first cross-validation split to run; splits with a smaller number are skipped, so 1 runs all five splits.



In [None]:
# Active learning run on dataset "B": random sampling, mini-batch size 2, single-label, all folds.
AlCV(
    X_b,
    y_b,
    "B",
    qs=RandomSampling(),
    bs=2,
    non_active=False,
    multi=False,
    split_start=1,
)