In [1]:
# import os
# os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

In [2]:
import argparse

import baal
from baal import ActiveLearningDataset
from baal.active import get_heuristic, ActiveLearningLoop
from baal.bayesian.dropout import MCDropoutModule
from copy import deepcopy

import numpy as np

import segmentation_models_pytorch as smp

from PIL import Image
from pprint import pprint

import torch
from torch import nn
from torch.nn import functional as F

from torchvision.transforms import transforms
from torch.cuda.amp import GradScaler, autocast

import torch
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pathlib
import os
import torchtext
from torch.utils.data import DataLoader, random_split
from tqdm import tqdm
import random
import pandas as pd


from tqdm import tqdm

from typing import List




In [3]:
debug=False

print("Pytorch: \t\t", torch.__version__)
if not debug and torch.cuda.is_available():
    device='cuda'
    print('GPU:         \t\t', torch.cuda.get_device_name(0))
    print('Memory Usage:\t',
        round(torch.cuda.memory_allocated(0)/1024**3,1), 'GB / ',
        round(torch.cuda.memory_reserved(0)/1024**3,1), 'GB')
    
    torch.backends.cudnn.benchmark = True
else:
    print("GPU is **not available**")
    device='cpu'

Pytorch: 		 1.13.0+cu116
GPU:         		 NVIDIA A100 80GB PCIe MIG 2g.20gb
Memory Usage:	 0.0 GB /  0.0 GB


In [4]:
# Initialization
%load_ext autoreload
%autoreload 2
import sys
import os
import pathlib

# To be able to reference packages/modules in this repository, this
# relative path must be added to the python path. Your notebook may be 
# in a different folder, so modify this variable to point to the src 
# folder.
proj_notebooks_root = pathlib.Path().absolute()
proj_root_path = proj_notebooks_root.parent
data_path = proj_notebooks_root / "data"

if proj_root_path not in sys.path:
    sys.path.insert(0, proj_root_path.as_posix())
    print("Updated Python Path")

print(f"Project Root Path: {proj_root_path}")
print(f"Project Source Root Path: {proj_notebooks_root}")
print(f"Project Data Path: {data_path}")

Updated Python Path
Project Root Path: /home/default/workspace
Project Source Root Path: /home/default/workspace/ActiveLearning
Project Data Path: /home/default/workspace/ActiveLearning/data


In [5]:
data_dir = pathlib.Path('data/aclImdb_v1/')

if data_dir.exists():
    print("Found the 'aclImdb_v1' dataset.")
else:
    print("Downloading the 'aclImdb_v1' dataset.")
    dataset_url = "http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
    !wget {dataset_url} -P {data_dir.parent}
    !mkdir {data_dir}
    !tar xf {data_dir.parent / "aclImdb_v1.tar.gz"} -C {data_dir}
    !mv {data_dir / 'aclImdb' / 'train' / 'unsup'} {data_dir / 'unsup'}
# data_dir = data_dir / 'aclImdb'


Found the 'aclImdb_v1' dataset.


In [6]:
def cleanup(data):
    return data.replace('<br />', '')

class ArrayDataset(torch.utils.data.Dataset):
    def __init__(self, array):
        self.array = array

    def __len__(self):
        return len(self.array)

    def __getitem__(self, index):
        res = self.array[index]
        return cleanup(open(res[0]).read()), res[1]
    
    def split(self, p=0.5):
        count = len(self.array)
        index = np.arange(count)
        first = int(count * p)
        return [
            ArrayDataset(self.array[index[:first]], 
                    image_transforms=self.image_transforms), 
            ArrayDataset(self.array[index[first:]], 
                    image_transforms=self.image_transforms)
        ]
    
    def split_count(self, first):
        return [
            ArrayDataset(self.array[:first]), 
            ArrayDataset(self.array[first:])
        ]

def get_datasets(initial_pool, path):
#     test_dir = data_dir / 'test'
    train_dir = path / 'train'
    print(train_dir)
    files = [y for y in (train_dir).glob('*/*')] 
    data = np.array([(i, i.parent.stem) for id, i in enumerate(files)])
    print(len(data), "examples found")

    dataset = ArrayDataset(data)

    test_set, active_set = dataset.split_count(500)
    print("Active Set: ", len(active_set))
    print("Test Set: ", len(test_set))
    
    active_set = ActiveLearningDataset(active_set)
    
    active_set.label_randomly(initial_pool)
    return active_set, test_set

# get_datasets(100, data_dir)

In [7]:
from collections import Counter, OrderedDict
from torchtext.data.utils import get_tokenizer
from torch import nn
from functools import partial

def collate_unlabelled_batch(text_pipeline, batch):
     indices_list, text_list = [], []
     for (text, index) in batch:
          input = text_pipeline(text)
          indices_list.append(index)
          
          a = torch.tensor(input[:sequence_max_length], dtype=torch.int64)
          b = torch.zeros(max(0, sequence_max_length - len(input)), dtype=torch.int64)
          text_list.append(torch.cat((a,b)))
               
     text_list = torch.stack(text_list)
     return text_list, indices_list
    

def collate_batch(text_pipeline, label_pipeline, batch):
     label_list, text_list = [], []
     for (text, label) in batch:
          input = text_pipeline(text)
          label_list.append(torch.tensor([label_pipeline(label)], dtype=torch.float32))
          
          a = torch.tensor(input[:sequence_max_length], dtype=torch.int64)
          b = torch.zeros(max(0, sequence_max_length - len(input)), dtype=torch.int64)
          text_list.append(torch.cat((a,b)))
               
     label_list = torch.stack(label_list)
     text_list = torch.stack(text_list)
     return text_list, label_list

class ClassifyNet(nn.Module):

    def __init__(self, vocab_size, embed_dim, num_class):
        super().__init__()
        self.emb = nn.Embedding(vocab_size, embed_dim)
        # self.pool = nn.AvgPool1d(3, stride=2)
        self.fc = nn.Linear(embed_dim, num_class)
        self.dropout = nn.Dropout(0.2)
        self.dropout2 = nn.Dropout(0.2)

    def forward(self, x):
        x = self.emb(x)
        # print(x.shape)
        x = self.dropout(x)
        x = torch.mean(x, 1)
        x = self.dropout2(x)
        # print(x.shape)
        x = self.fc(x)

        return torch.sigmoid(x)

def train_loop(dataloader, model, loss_fn, optimizer, history=None):
    size = len(dataloader.dataset)
    total_loss, accuracy = 0, 0
    for batch, (X, y) in enumerate(dataloader):
        X = X.to(device)
        y = y.to(device)
        
        #FORWARD PASS
        pred = model(X)
        loss = loss_fn(pred, y)

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        accuracy += (torch.round(pred) == y).type(torch.float).sum().item()

        # if batch % 100 == 0:
        #     loss, current = loss.item(), batch * len(X)
        #     print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")
    accuracy /= size
    total_loss /= size
    if history is not None:
        history['train_loss'].append(total_loss)
        history['train_accuracy'].append(accuracy)
    return loss, accuracy


def test_loop(dataloader, model, loss_fn, history):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    loss, accuracy = 0, 0

    with torch.no_grad():
        for X, y in dataloader:
            X = X.to(device)
            y = y.to(device)
            
            pred = model(X)
            
            loss += loss_fn(pred, y).item()
            accuracy += (torch.round(pred) == y).type(torch.float).sum().item()

    loss /= num_batches
    accuracy /= size

    history['test_loss'].append(loss)
    history['test_accuracy'].append(accuracy)
    return loss, accuracy


def train_model(model_wrapper, loss_fn, train_dataset, test_dataset,epochs=40):
    history = {
        'train_loss':[],
        'train_accuracy':[],
        'test_loss':[],
        'test_accuracy':[]
    }
    
    train_dataloader = DataLoader(
        train_dataset,
        shuffle=True,
        num_workers=8,
        batch_size=p_batch_size,
        collate_fn=partial(collate_batch, text_pipeline, label_pipeline))

    test_dataloader = DataLoader(
        test_dataset, 
        num_workers=8,
        batch_size=p_batch_size, 
        collate_fn=partial(collate_batch, text_pipeline, label_pipeline))

    vocab_size = len(vocab)

    optimizer = torch.optim.Adam(model_wrapper.model.parameters(), lr=4e-3)

    for t in tqdm(range(epochs), bar_format="{elapsed} Elapsed | {percentage:3.0f}% done |{bar}| {n_fmt}/{total_fmt} [{remaining} remaining | {rate_fmt}{postfix}]", unit="epoch", total=epochs):
        train_loop(train_dataloader, model_wrapper.model, loss_fn, optimizer, history)
        test_loop(test_dataloader, model_wrapper.model, loss_fn, history)

    
    return model, history

In [8]:
p_active_learning_steps = 3

p_initial_pool = 300
p_query_size = 300
p_query_interations = 20

p_reduce="sum"

p_learning_epochs=30
p_batch_size = 32
p_learning_rate = 0.001

p_classes = 1

vocab_size = 10000
emsize = 16
sequence_max_length=500

use_cuda = torch.cuda.is_available()
# use_cuda = False
print("Use Cuda:", use_cuda)

Use Cuda: True


In [9]:
tokenizer = get_tokenizer('basic_english')
def yield_tokens(data_iter):
    for text, _ in data_iter:
        yield tokenizer(text)

def build_vocab_from_iterator(iterator, min_freq: int = 1, specials = [], special_first: bool = True, vocab_size = None):
    counter = Counter()
    for tokens in iterator:
        counter.update(tokens)

    if specials is not None:
        for tok in specials:
            del counter[tok]

    sorted_by_freq_tuples = sorted(counter.items(), key=lambda x: x[0])
    sorted_by_freq_tuples.sort(key=lambda x: x[1], reverse=True)
    
    if vocab_size is not None:
        sorted_by_freq_tuples = sorted_by_freq_tuples[:vocab_size]

    ordered_dict = OrderedDict(sorted_by_freq_tuples)

    if specials is not None:
        if special_first:
            specials = specials[::-1]
        for symbol in specials:
            ordered_dict.update({symbol: min_freq})
            ordered_dict.move_to_end(symbol, last=not special_first)

    word_vocab = torchtext.vocab.vocab(ordered_dict, min_freq=min_freq)
    return word_vocab


# We will use the FocalLoss
if p_classes > 1:
    criterion = FocalLoss(gamma=2, alpha=0.25)
else:
    criterion = nn.BCELoss()

model = ClassifyNet(vocab_size, emsize, p_classes)

# This will enable Dropout at test time.
model = MCDropoutModule(model)

# Put everything on GPU.
if use_cuda:
    model.cuda()

# Keep a copy of the original weights
initial_weights = deepcopy(model.state_dict())

# Add metrics
model = baal.ModelWrapper(model, 
                          criterion, 
                          replicate_in_memory=False)

In [10]:
def get_probabilities(pool, **kwargs):
    return model.predict_on_dataset(pool, 
                                  collate_fn = partial(collate_batch, text_pipeline, label_pipeline), 
                                  **kwargs)

In [11]:
heuristics = ['random', 'entropy', 'bald']
all_histories = []
for h in heuristics:
    print("="*20)
    print(f"Heuristic: {h}")
    print("="*20)
    if h == 'batch_bald':
        heuristic = get_heuristic(h, num_samples=1000)
    else:
        heuristic = get_heuristic(h)

    active_set, test_set = get_datasets(p_initial_pool, data_dir)

    vocab = build_vocab_from_iterator(yield_tokens(active_set), specials=["<pad>", "<unk>"], vocab_size=10000-2)
    vocab.set_default_index(vocab["<unk>"])
    text_pipeline = lambda x: vocab(tokenizer(x))
    label_pipeline = lambda x: 1 if x == 'pos' else 0

    # The ALLoop is in charge of predicting the uncertainty and
    loop = ActiveLearningLoop(
        active_set,
        get_probabilities,
        heuristic=heuristic,
        query_size=p_query_size,
        # Instead of predicting on the entire pool, only a subset is used
        max_sample=2000,
        batch_size=p_batch_size,
        iterations=p_query_interations,
        use_cuda=use_cuda,
    )
    history = []
    all_histories.append(history)
    for epoch in range(p_active_learning_steps):
        print(f"Step: {epoch + 1}")
        # Following Gal et al. 2016, we reset the weights.
        model.load_state_dict(initial_weights)
        # Train 50 epochs before sampling.

        print(f"Training Model with {len(active_set)} examples...")
        model, stats = train_model(model, criterion, 
                                   active_set, test_set, 
                                   epochs=p_learning_epochs)
        history.append(stats)

        print("Querying...")

        should_continue = loop.step()

        print("-"*20)

        if not should_continue:
            break

Heuristic: random
data/aclImdb_v1/train
25000 examples found
Active Set:  24500
Test Set:  500
Step: 1
Training Model with 300 examples...


00:46 Elapsed | 100% done |██████████| 30/30 [00:00 remaining |  1.56s/epoch]


Querying...
[1415865-MainThread] [baal.modelwrapper:predict_on_dataset_generator:232] [2m2022-12-16T15:57:04.990178Z[0m [[32m[1minfo     [0m] [1mStart Predict                 [0m [36mdataset[0m=[35m2000[0m
100%|██████████| 63/63 [00:00<00:00, 75.22it/s] 
--------------------
Step: 2
Training Model with 600 examples...


00:49 Elapsed | 100% done |██████████| 30/30 [00:00 remaining |  1.66s/epoch]


Querying...
[1415865-MainThread] [baal.modelwrapper:predict_on_dataset_generator:232] [2m2022-12-16T15:57:55.802004Z[0m [[32m[1minfo     [0m] [1mStart Predict                 [0m [36mdataset[0m=[35m2000[0m
100%|██████████| 63/63 [00:00<00:00, 84.56it/s] 
--------------------
Step: 3
Training Model with 900 examples...


00:54 Elapsed | 100% done |██████████| 30/30 [00:00 remaining |  1.83s/epoch]


Querying...
[1415865-MainThread] [baal.modelwrapper:predict_on_dataset_generator:232] [2m2022-12-16T15:58:51.902469Z[0m [[32m[1minfo     [0m] [1mStart Predict                 [0m [36mdataset[0m=[35m2000[0m
100%|██████████| 63/63 [00:00<00:00, 82.64it/s] 
--------------------
Heuristic: entropy
data/aclImdb_v1/train
25000 examples found
Active Set:  24500
Test Set:  500
Step: 1
Training Model with 300 examples...


00:46 Elapsed | 100% done |██████████| 30/30 [00:00 remaining |  1.56s/epoch]


Querying...
[1415865-MainThread] [baal.modelwrapper:predict_on_dataset_generator:232] [2m2022-12-16T15:59:42.297811Z[0m [[32m[1minfo     [0m] [1mStart Predict                 [0m [36mdataset[0m=[35m2000[0m
100%|██████████| 63/63 [00:00<00:00, 77.13it/s] 
--------------------
Step: 2
Training Model with 600 examples...


00:51 Elapsed | 100% done |██████████| 30/30 [00:00 remaining |  1.70s/epoch]


Querying...
[1415865-MainThread] [baal.modelwrapper:predict_on_dataset_generator:232] [2m2022-12-16T16:00:34.665043Z[0m [[32m[1minfo     [0m] [1mStart Predict                 [0m [36mdataset[0m=[35m2000[0m
100%|██████████| 63/63 [00:00<00:00, 85.44it/s] 
--------------------
Step: 3
Training Model with 900 examples...


00:56 Elapsed | 100% done |██████████| 30/30 [00:00 remaining |  1.87s/epoch]


Querying...
[1415865-MainThread] [baal.modelwrapper:predict_on_dataset_generator:232] [2m2022-12-16T16:01:31.918916Z[0m [[32m[1minfo     [0m] [1mStart Predict                 [0m [36mdataset[0m=[35m2000[0m
100%|██████████| 63/63 [00:00<00:00, 79.03it/s] 
--------------------
Heuristic: bald
data/aclImdb_v1/train
25000 examples found
Active Set:  24500
Test Set:  500
Step: 1
Training Model with 300 examples...


00:47 Elapsed | 100% done |██████████| 30/30 [00:00 remaining |  1.57s/epoch]


Querying...
[1415865-MainThread] [baal.modelwrapper:predict_on_dataset_generator:232] [2m2022-12-16T16:02:22.709903Z[0m [[32m[1minfo     [0m] [1mStart Predict                 [0m [36mdataset[0m=[35m2000[0m
100%|██████████| 63/63 [00:00<00:00, 83.55it/s] 
--------------------
Step: 2
Training Model with 600 examples...


00:51 Elapsed | 100% done |██████████| 30/30 [00:00 remaining |  1.71s/epoch]


Querying...
[1415865-MainThread] [baal.modelwrapper:predict_on_dataset_generator:232] [2m2022-12-16T16:03:15.056456Z[0m [[32m[1minfo     [0m] [1mStart Predict                 [0m [36mdataset[0m=[35m2000[0m
100%|██████████| 63/63 [00:00<00:00, 77.61it/s] 
--------------------
Step: 3
Training Model with 900 examples...


00:54 Elapsed | 100% done |██████████| 30/30 [00:00 remaining |  1.82s/epoch]


Querying...
[1415865-MainThread] [baal.modelwrapper:predict_on_dataset_generator:232] [2m2022-12-16T16:04:10.971196Z[0m [[32m[1minfo     [0m] [1mStart Predict                 [0m [36mdataset[0m=[35m2000[0m
100%|██████████| 63/63 [00:00<00:00, 83.15it/s] 
--------------------


In [12]:
len(all_histories[-1]), all_histories[-1][0]

(3,
 {'train_loss': [0.023267762263615925,
   0.023034343520800273,
   0.022995293935139972,
   0.022856536706288656,
   0.022935722271601358,
   0.0228342866897583,
   0.022800857027371724,
   0.02287408709526062,
   0.02254206379254659,
   0.022580652634302777,
   0.022379831671714784,
   0.02225578745206197,
   0.02237004299958547,
   0.02214676102002462,
   0.02201523224512736,
   0.021775400241216023,
   0.021246384580930075,
   0.021196983257929482,
   0.02100347101688385,
   0.020669724345207214,
   0.02063179572423299,
   0.019891916513442992,
   0.019621530175209047,
   0.01929427186648051,
   0.01903814673423767,
   0.018251535892486574,
   0.017969027757644654,
   0.017528752982616424,
   0.016769630114237467,
   0.016886346340179444],
  'train_accuracy': [0.45,
   0.55,
   0.5333333333333333,
   0.5466666666666666,
   0.55,
   0.5466666666666666,
   0.5433333333333333,
   0.5533333333333333,
   0.57,
   0.55,
   0.5633333333333334,
   0.5733333333333334,
   0.54,
   0.57666

In [13]:
len(all_histories), len(all_histories[0]), all_histories[0][0].keys()

(3,
 3,
 dict_keys(['train_loss', 'train_accuracy', 'test_loss', 'test_accuracy']))

In [None]:
from datetime import datetime
timestamp = datetime.now().strftime("%Y%m%d%H%M")
results_dir = pathlib.Path('results')

timestamp, results_dir.exists()

In [None]:
import json
with open(results_dir / f'{timestamp}-log.txt', 'w') as logfile:
    logfile.write(json.dumps(all_histories))

In [None]:
import matplotlib
import matplotlib.pyplot as plt

def plotHistory(history):
#     metrics = ['train_loss', 'train_precision', 'train_recall', 'train_f1score',
#            'test_loss','test_precision','test_recall', 'test_f1score']
    metrics = list(history[0].keys())
    gradients = ['Greys', 'Purples', 'Blues', 'Greens', 'Oranges', 'Reds',
                          'YlOrBr', 'YlOrRd', 'OrRd', 'PuRd', 'RdPu', 'BuPu',
                          'GnBu', 'PuBu', 'YlGnBu', 'PuBuGn', 'BuGn', 'YlGn']
    norm = matplotlib.colors.Normalize(vmin=0, vmax=len(history))
    rows = 2
    cols = round(len(metrics)/rows)

    fig, axes = plt.subplots(rows, cols, figsize=(12,8), sharey = True)

    for index, metric in enumerate(metrics):
        cmap = matplotlib.cm.get_cmap(gradients[index])

        ax = axes[index//cols,index%cols]
        for i, val in enumerate(history):
            ax.plot(val[metric], color=cmap(norm(i)))
        ax.set_title(metric)

print("For Heuristic: ", heuristics[0])
plotHistory(all_histories[0])
plt.savefig(results_dir / f'{timestamp}-{heuristics[0]}.png', bbox_inches='tight')




In [None]:
print("For Heuristic: ", heuristics[1])
plotHistory(all_histories[1])
plt.savefig(results_dir / f'{timestamp}-{heuristics[1]}.png', bbox_inches='tight')


In [None]:
print("For Heuristic: ", heuristics[2])
plotHistory(all_histories[2])
plt.savefig(results_dir / f'{timestamp}-{heuristics[2]}.png', bbox_inches='tight')


In [None]:
print("For Heuristic: ", heuristics[3])
plotHistory(all_histories[3])
plt.savefig(results_dir / f'{timestamp}-{heuristics[3]}.png', bbox_inches='tight')