In [1]:
# Initialization
%load_ext autoreload
%autoreload 2
import sys
import os
import pathlib

# To be able to reference packages/modules in this repository, this
# relative path must be added to the python path. Your notebook may be 
# in a different folder, so modify this variable to point to the src 
# folder.
# Resolve the notebook folder (the current working directory), the repository
# root one level above it, and the data folder that sits inside the notebook folder.
proj_notebooks_root = pathlib.Path().absolute()
proj_root_path = proj_notebooks_root.parent
data_path = proj_notebooks_root / "data"

# sys.path holds plain strings, so the membership test must use the string form
# of the path. Testing the Path object itself never matches, which would prepend
# a duplicate entry every time this cell is re-run.
if proj_root_path.as_posix() not in sys.path:
    sys.path.insert(0, proj_root_path.as_posix())
    print("Updated Python Path")

print(f"Project Root Path: {proj_root_path}")
print(f"Project Source Root Path: {proj_notebooks_root}")
print(f"Project Data Path: {data_path}")

Updated Python Path
Project Root Path: /home/default/workspace
Project Source Root Path: /home/default/workspace/ActiveLearning
Project Data Path: /home/default/workspace/ActiveLearning/data


In [2]:
from PIL import Image
import random
import numpy as np
import matplotlib.pyplot as plt
import torch
import torchvision
from torchvision import transforms
from PIL import Image
from torch.cuda.amp import GradScaler, autocast
import torchvision
from tqdm import tqdm
# from torchinfo import summary
from torch.utils.data import DataLoader
import time

# Set to True to force CPU execution even when a CUDA device is present.
debug = False

print("Pytorch: \t\t", torch.__version__)
if debug or not torch.cuda.is_available():
    # CPU fallback (also taken when debug is on).
    device = 'cpu'
    print("GPU is **not available**")
else:
    device = 'cuda'
    print('GPU:         \t\t', torch.cuda.get_device_name(0))
    # Allocated / reserved memory on device 0, reported in GiB.
    print('Memory Usage:\t',
          round(torch.cuda.memory_allocated(0) / 2**30, 1), 'GB / ',
          round(torch.cuda.memory_reserved(0) / 2**30, 1), 'GB')

    # Let cuDNN benchmark and pick convolution algorithms.
    torch.backends.cudnn.benchmark = True

Pytorch: 		 1.13.0+cu116
GPU:         		 NVIDIA A100 80GB PCIe MIG 2g.20gb
Memory Usage:	 0.0 GB /  0.0 GB




In [3]:
# !pip install kaggle
# ! cp {proj_notebooks_root / 'kaggle.json'} ~/.kaggle/kaggle.json
# ! chmod 600 ~/.kaggle/kaggle.json


In [6]:
# base_dir = data_path / 'crack_segmentation_dataset'
# if not base_dir.exists():
#     ! kaggle datasets download -p {data_path.as_posix()} -d lakshaymiddha/crack-segmentation-dataset
#     ! unzip -q {data_path / 'crack-segmentation-dataset.zip'} -d {data_path}
#     ! rm {data_path / 'crack-segmentation-dataset.zip'} 
# else:
#     print("Found dataset at ", base_dir.as_posix())

# Folder the rest of the notebook reads the dataset from.
base_dir = data_path / 'ConglomerateConcreteCrackDataset'
if not base_dir.exists():
    # Download and unpack only when the dataset folder is not there yet.
    dataset_url = 'https://data.lib.vt.edu/ndownloader/articles/16625056/versions/1'

    # wget saves the archive under the last URL component, i.e. a file named '1'
    # inside data_path; that outer archive contains a second, inner zip.
    ! wget {dataset_url} -P {data_path}
    ! unzip -q {data_path / '1'} -d {data_path}
    ! unzip -q {data_path / 'Conglomerate\ Concrete\ Crack\ Detection.zip'} -d {data_path}
    # Rename the extracted folder to base_dir (a name without spaces) and keep
    # the README next to the data.
    ! mv {data_path/'Conglomerate\ Concrete\ Crack\ Detection'} {base_dir}
    ! mv {data_path / 'README_congl_dataset.rtf'}  {data_path/'ConglomerateConcreteCrackDataset'}     
    # Remove both archives once everything is extracted.
    ! rm {data_path / 'Conglomerate\ Concrete\ Crack\ Detection.zip'}
    ! rm {data_path / '1'}
else:
    print("Found dataset at ", base_dir.as_posix())


--2022-12-15 20:58:57--  https://data.lib.vt.edu/ndownloader/articles/16625056/versions/1
Resolving data.lib.vt.edu (data.lib.vt.edu)... 52.214.33.39, 63.32.251.213, 2a05:d018:1f4:d003:9b76:e1dc:c675:2471, ...
Connecting to data.lib.vt.edu (data.lib.vt.edu)|52.214.33.39|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1028474638 (981M) [application/zip]
Saving to: ‘/home/default/workspace/ActiveLearning/data/1’


2022-12-15 20:59:51 (18.5 MB/s) - ‘/home/default/workspace/ActiveLearning/data/1’ saved [1028474638/1028474638]



In [None]:
def cleanup(data):
    """Return `data` with every literal '<br />' occurrence removed."""
    line_break_tag = '<br />'
    stripped = data.replace(line_break_tag, '')
    return stripped

class ArrayDataset(torch.utils.data.Dataset):
    """Labelled segmentation dataset backed by a 2-D array.

    Each row of ``array`` is unpacked as ``(id, image path, mask path)``;
    column 0 holds the sample id used by ``indices`` and ``pop``.

    Args:
        array: 2-D array of rows as described above.
        image_transforms: optional callable applied to the RGB image only.
        both_transforms: optional callable applied to the image and to the
            mask tensor (two separate calls).
    """

    def __init__(self, array, image_transforms=None, both_transforms=None):
        self.array = array

        self.image_transforms = image_transforms
        self.both_transforms = both_transforms
        # The mask is always turned into a tensor; nothing else is mask-specific.
        self.segment_transforms = transforms.Compose([
            transforms.ToTensor()
        ])

    def __len__(self):
        """Number of rows currently held."""
        return len(self.array)

    def __getitem__(self, index):
        """Load and transform the ``(image, mask)`` pair stored at row ``index``."""
        _, image_path, segment_path = self.array[index]

        # Open the files in context managers so their handles are released
        # immediately; ``convert`` returns an independent in-memory image, which
        # stays usable after the file is closed.
        with Image.open(image_path) as raw_image:
            image = raw_image.convert('RGB')
        with Image.open(segment_path) as raw_segment:
            segment = raw_segment.convert('L')
        segment = self.segment_transforms(segment)

        if self.image_transforms is not None:
            image = self.image_transforms(image)

        if self.both_transforms is not None:
            # NOTE: two independent calls -- a *random* transform placed here
            # would not be applied identically to the image and its mask.
            image = self.both_transforms(image)
            segment = self.both_transforms(segment)

        # Diagnostic only: report samples that deviate from the 448x448 layout.
        if image.shape != (3, 448, 448):
            print(f"Image shape is {image.shape}")
        if segment.shape != (1, 448, 448):
            print(f"Segment shape is {segment.shape}")

        return image, segment

    def _subset(self, rows):
        """Wrap ``rows`` in a new dataset that shares this dataset's transforms.

        The result is always a plain ``ArrayDataset``, even when called on a
        subclass: the notebook pops rows from an ``UnlabelledDataset``, appends
        the mask column with ``addLabels`` and then reads them as labelled
        samples.
        """
        return ArrayDataset(rows,
                            image_transforms=self.image_transforms,
                            both_transforms=self.both_transforms)

    def indices(self):
        """Return the id column (column 0) of the backing array."""
        return self.array[:, 0]

    def split(self, p=0.5):
        """Split the rows, in their current order, into two datasets.

        The first ``int(len(self) * p)`` rows go to the first dataset and the
        remainder to the second. No shuffling is performed.
        """
        first = int(len(self.array) * p)
        return [
            self._subset(self.array[:first].copy()),
            self._subset(self.array[first:].copy()),
        ]

    def pop(self, indices):
        """Remove the rows whose id is in ``indices`` and return them as a dataset."""
        selected = np.isin(self.array[:, 0], indices)
        removed = self._subset(self.array[selected])
        self.array = self.array[~selected]
        return removed

    def addLabels(self, labels):
        """Append ``labels`` as a new last column (one entry per row); returns ``self``."""
        self.array = np.hstack([self.array, np.expand_dims(labels, axis=1)])
        return self

    def addData(self, otherArrayDataset):
        """Append the rows of ``otherArrayDataset``; returns ``self``.

        Returning ``self`` mirrors ``addLabels`` so both can be chained; callers
        that ignore the return value are unaffected.
        """
        self.array = np.append(self.array, otherArrayDataset.array, axis=0)
        return self


class UnlabelledDataset(ArrayDataset):
    """Pool of not-yet-labelled images stored as ``(id, image path)`` rows."""

    def __getitem__(self, index):
        """Return ``(transformed image, id)`` for the row at ``index``."""
        sample_id, image_path = self.array[index]

        # Force three channels, as the labelled dataset does, so a 3-channel
        # normalisation works for grayscale/RGBA files too. The context manager
        # releases the file handle right after the pixels are copied.
        with Image.open(image_path) as raw_image:
            image = raw_image.convert('RGB')

        if self.image_transforms is not None:
            image = self.image_transforms(image)

        if self.both_transforms is not None:
            image = self.both_transforms(image)

        return image, sample_id


class OracleDataset():
    """Lookup table answering label queries from a 2-D ``(id, label)`` array."""

    def __init__(self, array):
        self.array = array

    def query(self, indices):
        """Return column 1 of every row whose id (column 0) is in ``indices``.

        Results follow the row order of the stored array, not the order of
        ``indices``.
        """
        ids = self.array[:, 0]
        wanted = np.isin(ids, indices)
        return self.array[wanted, 1]


def init():
    """Build the unlabelled pool and the label oracle from the training split.

    Images are read from ``base_dir/train/images`` and every image must have a
    same-named mask in ``base_dir/train/masks``.

    Returns:
        (UnlabelledDataset, OracleDataset): the pool holds ``(id, image path)``
        rows, the oracle holds ``(id, mask path)`` rows; ids are shared.

    Raises:
        FileNotFoundError: if no images are found or a mask is missing.
    """
    X_dir = base_dir/'train'/'images'
    y_dir = base_dir/'train'/'masks'

    # glob() yields files in filesystem-dependent order; sorting makes the
    # id -> file assignment reproducible between runs and machines.
    files = sorted(X_dir.glob('*'))
    if not files:
        raise FileNotFoundError(f"No training images found in {X_dir}")

    missing = [f.name for f in files if not (y_dir / f.name).exists()]
    if missing:
        raise FileNotFoundError(
            f"{len(missing)} image(s) have no mask in {y_dir}, e.g. {missing[0]}")

    # dtype=object keeps the ids as ints and the paths as Path objects.
    data = np.array([(id, i, (y_dir / i.name)) for id, i in enumerate(files)],
                    dtype=object)
    oracle_data = data[:, [0, 2]]       # (id, mask path)
    unlabelled_data = data[:, [0, 1]]   # (id, image path)
    data_transforms = {
        # Empty on purpose: no transform is currently shared by image and mask.
        'both': transforms.Compose([
    #         transforms.CenterCrop((630, 1024))
        ]),
        'images': transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
        ]),
    }
    oracle_dataset = OracleDataset(oracle_data)
    unlabelled_dataset = UnlabelledDataset(unlabelled_data,
                              image_transforms=data_transforms['images'],
                              both_transforms=data_transforms['both'])
    return unlabelled_dataset, oracle_dataset

def random_query(dataset, count, model=None, device=None):
    """Pick up to ``count`` distinct sample ids uniformly at random.

    Args:
        dataset: object exposing ``indices()`` that returns the candidate ids.
        count: number of ids requested; capped at the number of candidates.
        model, device: unused; accepted so every query strategy shares one
            call signature.

    Returns:
        Array of selected ids without repetitions.
    """
    pool = dataset.indices()
    # np.random.choice samples WITH replacement by default, which can return the
    # same id several times and silently label fewer samples than requested.
    count = min(count, len(pool))
    return np.random.choice(pool, count, replace=False)


In [None]:
class SegModel(torch.nn.Module):
    """FCN-ResNet50 segmentation network returning only the main ('out') head.

    The segmentation head is created without pretrained weights; the ResNet-50
    backbone is initialised from ``ResNet50_Weights.DEFAULT``. No activation is
    applied in ``forward`` (the softmax below is left disabled).
    """

    def __init__(self, numclasses):
        super().__init__()

        backbone_weights = torchvision.models.ResNet50_Weights.DEFAULT
        self.fcn = torchvision.models.segmentation.fcn_resnet50(
            weights=None,
            weights_backbone=backbone_weights,
            num_classes=numclasses,
            aux_loss=False)
        # self.softmax = torch.nn.Softmax(0)

    def forward(self, x):
        outputs = self.fcn(x)
        # x = self.softmax(x)
        return outputs['out']

def get_eval_metrics(pred, gold, threshold = 0.5):
    """Count the raw ingredients of precision/recall for a batch.

    ``pred`` is binarised with ``pred > threshold`` and ``gold`` is cast to
    integers.

    Returns:
        (hits, shots, targets): positions positive in both tensors, positions
        predicted positive, and positions positive in ``gold``.
    """
    predicted_mask = (pred > threshold).long()
    target_mask = gold.long()
    overlap = predicted_mask * target_mask  # 1 only where both are positive
    return (overlap.sum().item(),
            predicted_mask.sum().item(),
            target_mask.sum().item())

def train_loop(dataloader, model, loss_fn, optimizer, scaler, history=None, lr_sched=None):
    """Train ``model`` for one epoch with mixed precision and log epoch statistics.

    Args:
        dataloader: yields ``(X, y)`` batches.
        model: network to train; switched to train mode here.
        loss_fn: callable ``loss_fn(output, y)`` returning a scalar tensor.
        optimizer: optimizer stepped once per batch through ``scaler``.
        scaler: gradient scaler exposing ``scale``/``step``/``update``.
        history: optional dict of lists; when given, the ``train_*`` entries
            (loss, hits, shots, targets, precision, recall, f1score) each
            receive one value for this epoch.
        lr_sched: optional scheduler stepped after every batch.
    """
    model.train()
    loss_sum = 0
    # Epoch-wide counters; they must exist before the first `+=` below.
    total_hits = 0
    total_shots = 0
    total_targets = 0
    bar_format = "{l_bar}{bar} {elapsed}<{remaining} {rate_fmt}{postfix}"

    with tqdm(dataloader, unit="batch", bar_format=bar_format) as tepoch:
        for X, y in tepoch:
            X = X.to(device)
            y = y.to(device)
            optimizer.zero_grad()

            # Runs the forward pass with autocasting.
            with autocast():
                output = model(X)
                loss = loss_fn(output, y)

            # The network emits raw logits (it is trained with a logits-based
            # loss), so squash them to probabilities before thresholding at 0.5.
            hits, shots, targets = get_eval_metrics(torch.sigmoid(output.detach()), y)
            total_hits += hits
            total_shots += shots
            total_targets += targets

            # Scale the loss and backpropagate outside autocast; the step is
            # skipped by the scaler if the gradients contain infs or NaNs.
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            # Updates the scale for next iteration.
            scaler.update()

            batch_loss = loss.item()
            loss_sum += batch_loss

            postfix = {'loss': round(batch_loss, 4)}
            if lr_sched is not None:
                lr_sched.step()
                # Only report a learning rate when there is a scheduler to ask.
                postfix['lr'] = round(lr_sched.get_last_lr()[0], 4)
            tepoch.set_postfix(**postfix)

    if history is None:
        return

    num_batches = len(dataloader)
    history['train_loss'].append(loss_sum / num_batches if num_batches else 0)
    history['train_hits'].append(total_hits)
    history['train_shots'].append(total_shots)
    history['train_targets'].append(total_targets)

    # Each ratio falls back to 0 when its denominator is 0 (nothing predicted,
    # no positive targets, or no overlap at all).
    precision = total_hits / total_shots if total_shots else 0
    recall = total_hits / total_targets if total_targets else 0
    if precision + recall:
        f1score = (2 * precision * recall) / (precision + recall)
    else:
        f1score = 0

    history['train_precision'].append(precision)
    history['train_recall'].append(recall)
    history['train_f1score'].append(f1score)


def test_loop(dataloader, model, loss_fn, history):
    """Evaluate ``model`` on ``dataloader`` and append the results to ``history``.

    Args:
        dataloader: yields ``(X, y)`` batches.
        model: network to evaluate; switched to eval mode here.
        loss_fn: callable ``loss_fn(output, y)`` returning a scalar tensor.
        history: dict of lists; the ``test_*`` entries (loss, hits, shots,
            targets, precision, recall, f1score) each receive one value.
    """
    # Evaluate in inference mode so normalisation/dropout layers neither use
    # per-batch statistics nor get updated by the evaluation data.
    model.eval()

    num_batches = len(dataloader)
    loss = 0
    total_hits = 0
    total_shots = 0
    total_targets = 0
    with torch.no_grad():
        for X, y in dataloader:
            X = X.to(device)
            y = y.to(device)

            output = model(X)

            loss += loss_fn(output, y).item()
            # The network emits raw logits (it is trained with a logits-based
            # loss), so squash them to probabilities before thresholding at 0.5.
            hits, shots, targets = get_eval_metrics(torch.sigmoid(output), y)
            total_hits += hits
            total_shots += shots
            total_targets += targets

    loss = loss / num_batches if num_batches else 0
    history['test_loss'].append(loss)
    history['test_hits'].append(total_hits)
    history['test_shots'].append(total_shots)
    history['test_targets'].append(total_targets)

    # Each ratio falls back to 0 when its denominator is 0 (nothing predicted,
    # no positive targets, or no overlap at all).
    precision = total_hits / total_shots if total_shots else 0
    recall = total_hits / total_targets if total_targets else 0
    if precision + recall:
        f1score = (2 * precision * recall) / (precision + recall)
    else:
        f1score = 0

    history['test_precision'].append(precision)
    history['test_recall'].append(recall)
    history['test_f1score'].append(f1score)


def train_model(train_dataset, test_dataset,epochs=40):
    """Train a fresh single-class ``SegModel`` and evaluate it after every epoch.

    Uses the notebook-level ``batch_size`` and ``device``.

    Args:
        train_dataset: dataset of ``(image, mask)`` samples used for training.
        test_dataset: dataset of ``(image, mask)`` samples used for evaluation.
        epochs: number of training epochs (also sizes the one-cycle schedule).

    Returns:
        (model, history): the trained model and a dict with one list per
        ``train_*`` / ``test_*`` statistic, one entry per epoch.
    """
    metric_names = ['loss', 'hits', 'shots', 'targets',
                    'precision', 'recall', 'f1score']
    history = {f'{split}_{name}': []
               for split in ('train', 'test')
               for name in metric_names}

    train_dataloader = DataLoader(train_dataset,
                            batch_size=batch_size,
                            shuffle=True,
                            prefetch_factor=2,
                            pin_memory=True,
                            num_workers=8)

    test_dataloader = DataLoader(test_dataset,
                            batch_size=batch_size,
                            shuffle=False,
                            num_workers=8)

    model = SegModel(1)
    model.to(device)

    loss_fn = torch.nn.BCEWithLogitsLoss()

    params = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.SGD(params, lr=0.00001, momentum=0.9)

    lr_sched = torch.optim.lr_scheduler.OneCycleLR(optimizer,
                       max_lr=0.01,
                       steps_per_epoch=len(train_dataloader),
                       epochs=epochs)

    # train_loop expects a gradient scaler as its fifth argument; without one
    # the history dict would be taken for the scaler and the scheduler for the
    # history. Keywords keep the remaining arguments unambiguous.
    scaler = GradScaler()

    for t in tqdm(range(epochs), bar_format="{elapsed} Elapsed | {percentage:3.0f}% done |{bar}| {n_fmt}/{total_fmt} [{remaining} remaining | {rate_fmt}{postfix}]", unit="epoch", total=epochs):
        train_loop(train_dataloader, model, loss_fn, optimizer, scaler,
                   history=history, lr_sched=lr_sched)
        test_loop(test_dataloader, model, loss_fn, history)

    return model, history

In [None]:

# unlabelled_dataset, oracle = init()
# print(f"Starting with unlabelled dataset of size {len(unlabelled_dataset)}")

# chosen_indices = random_query(unlabelled_dataset, p)

# train_dataset, test_dataset =  unlabelled_dataset.pop(chosen_indices)\
#                                     .addLabels(oracle.query(chosen_indices))\
#                                     .split(p=0.5)

# train_dataloader = DataLoader(train_dataset,
#                         batch_size=batch_size,
#                         shuffle=True,
#                         num_workers=1)

# for _ in train_dataloader:
#     pass
# batch = next(iter(train_dataloader))
# tensors = batch[0]
# segments = batch[1]

# # tensors, metadata
# y = int(len(tensors)/2+0.5); x = 2
# fig = plt.figure(figsize=(8*x, 5*y))

# for i in range(min(len(tensors)//2, 3)):
# #     print(y*100 + x*10 + i)
#     ax = fig.add_subplot(y , x , (i*2) + 1)
#     image = tensors[i].permute(1,2,0).cpu().numpy()
#     image = (image - [image[:,:,0].min(), image[:,:,1].min(), image[:,:,2].min()])
#     image = image/([image[:,:,0].max(), image[:,:,1].max(), image[:,:,2].max()])
#     ax.set_axis_off()
#     ax.imshow(image)
#     plt.tight_layout()
    
#     ax = fig.add_subplot(y , x , (i*2) + 2)
#     image = segments[i].permute(1,2,0).cpu().numpy()
#     image = np.squeeze(image)
# #     image = np.atleast_3d(image)
#     ax.set_axis_off()
#     ax.imshow(image)
#     plt.tight_layout()
    


In [1]:

# Experiment configuration
batch_size = 16     # Reduce this if you get memory errors
epochs = 30

K = 300             # Total budget for labelling
p = int(K * .4)     # Number of examples to begin with
n = 3               # Number of query rounds after the initial one

# Split the remaining budget evenly over the n rounds; whatever integer
# division leaves over is added to the first round so the sizes sum to K - p.
ks = [(K - p) // n for _ in range(n)]
ks[0] = ks[0] + (K - p - sum(ks))

ks

[60, 60, 60]

In [None]:
# Baseline run: every labelling round draws its samples at random.
history = []

unlabelled_dataset, oracle = init()
print(f"Starting with unlabelled dataset of size {len(unlabelled_dataset)}")

# Initial round: label p random samples, then halve them into train and test.
chosen_indices = random_query(unlabelled_dataset, p)

train_dataset, test_dataset = (
    unlabelled_dataset.pop(chosen_indices)
    .addLabels(oracle.query(chosen_indices))
    .split(p=0.5)
)
print(f"Randomly sampled train_dataset of size {len(train_dataset)} and test_dataset of size {len(test_dataset)}")

print(f"Training new model ...")
model, stats = train_model(train_dataset, test_dataset, epochs=epochs)
history.append(stats)

# Follow-up rounds: move k newly labelled samples from the pool into the
# training set and retrain from scratch; the test set stays fixed.
for k in ks:
    print(f"Quering {k} items from the from  ...")
    chosen_indices = random_query(unlabelled_dataset, k)

    newData = (
        unlabelled_dataset.pop(chosen_indices)
        .addLabels(oracle.query(chosen_indices))
    )
    train_dataset.addData(newData)
    print(f"New train_dataset size: {len(train_dataset)}.")
    print(f"Training new model ...")

    model, stats = train_model(train_dataset, test_dataset, epochs=epochs)
    history.append(stats)
    


#~ 1 minute per training


In [None]:

# One curve per labelling round in `history`. The per-epoch statistics contain
# loss, precision, recall and F1 but no accuracy entry, so the right-hand
# panels show the F1 score.
fig, axes = plt.subplots(2, 2, figsize=(8, 8))
panels = [
    ('train_loss', 'Loss', 'Training Loss'),
    ('train_f1score', 'F1', 'Training F1 Score'),
    ('test_loss', 'Loss', 'Test Loss'),
    ('test_f1score', 'F1', 'Test F1 Score'),
]
for ax, (key, label, title) in zip(axes.flat, panels):
    for round_no, round_stats in enumerate(history, start=1):
        ax.plot(round_stats[key], label=f'{label} {round_no}')
    ax.set_xlabel('Epoch')
    ax.legend(loc='lower right')
    ax.set_title(title)


In [None]:
def binary_least_confidence_query(dataset, query_size, model, device, deterministic=True):
    """Select the ``query_size`` samples the model is least confident about.

    Confidence of one output element is ``|p - 0.5| * 2`` where ``p`` is the
    sigmoid of the model output; a sample's confidence is the mean over all of
    its output elements (all pixels for a segmentation map).

    Args:
        dataset: yields ``(image, id)`` pairs.
        query_size: number of ids to return.
        model: the network, or a dict holding it under the key ``'model'``. It
            is expected to return logits and is switched to eval mode here.
        device: device the batches are moved to.
        deterministic: currently unused; kept so existing calls keep working.

    Returns:
        NumPy array with the ids of the least confident samples, least
        confident first.
    """
    # Accept the bare network as well as the {'model': network} wrapper.
    net = model['model'] if isinstance(model, dict) else model
    net.eval()

    confidences = []
    indices = []

    # Default collation is sufficient: every item is an (image tensor, id) pair.
    ul_dataloader = DataLoader(
        dataset,
        num_workers=8,
        shuffle=False,
        batch_size=batch_size)

    with torch.no_grad():
        for X, idx in tqdm(ul_dataloader):
            X = X.to(device)
            probabilities = torch.sigmoid(net(X))
            confidence = torch.abs(probabilities - 0.5) * 2
            # Collapse everything but the batch dimension into one score per sample.
            confidence = confidence.reshape(confidence.shape[0], -1).mean(dim=1)

            confidences.extend(confidence.cpu().tolist())
            indices.extend(torch.as_tensor(idx).tolist())

    conf = np.asarray(confidences)
    ind = np.asarray(indices)
    sorted_pool = np.argsort(conf)
    # Return the indices corresponding to the lowest `query_size` confidences
    return ind[sorted_pool[:query_size]]


# Least-confidence run: random initial round, then every further round labels
# the samples chosen by binary_least_confidence_query.
history = []

unlabelled_dataset, oracle = init()
print(f"Starting with unlabelled_dataset of size {len(unlabelled_dataset)}")

# Initial round: label p random samples, then halve them into train and test.
chosen_indices = random_query(unlabelled_dataset, p)

train_dataset, test_dataset = (
    unlabelled_dataset.pop(chosen_indices)
    .addLabels(oracle.query(chosen_indices))
    .split(p=0.5)
)
print(f"Starting with randomly sampled train_dataset of size {len(train_dataset)} and test_dataset of size {len(test_dataset)}")

print(f"Training new model ...")
model, stats = train_model(train_dataset, test_dataset, epochs=epochs)
history.append(stats)

# Follow-up rounds: the latest model picks k samples, they are labelled and
# added to the training set, and a new model is trained; the test set is fixed.
for k in ks:
    print(f"Quering {k} items from the from  ...")
    chosen_indices = binary_least_confidence_query(unlabelled_dataset, k, model=model, device=device)

    newData = (
        unlabelled_dataset.pop(chosen_indices)
        .addLabels(oracle.query(chosen_indices))
    )
    train_dataset.addData(newData)
    print(f"New train_dataset size: {len(train_dataset)}.")
    print(f"Training new model ...")

    model, stats = train_model(train_dataset, test_dataset, epochs=epochs)
    history.append(stats)
    


#~ 1 minute per training


In [None]:

# One curve per labelling round in `history`. The per-epoch statistics contain
# loss, precision, recall and F1 but no accuracy entry, so the right-hand
# panels show the F1 score.
fig, axes = plt.subplots(2, 2, figsize=(8, 8))
panels = [
    ('train_loss', 'Loss', 'Training Loss'),
    ('train_f1score', 'F1', 'Training F1 Score'),
    ('test_loss', 'Loss', 'Test Loss'),
    ('test_f1score', 'F1', 'Test F1 Score'),
]
for ax, (key, label, title) in zip(axes.flat, panels):
    for round_no, round_stats in enumerate(history, start=1):
        ax.plot(round_stats[key], label=f'{label} {round_no}')
    ax.set_xlabel('Epoch')
    ax.legend(loc='lower right')
    ax.set_title(title)


In [None]:
def variance(t):
    """Return the variance of ``t`` along dimension 1 (``torch.var`` defaults)."""
    return t.var(dim=1)

In [None]:
def klDivergence(t):
    """Sum over dimension 1 of ``t * log(t / mean)``.

    ``mean`` is the mean of ``t`` along dimension 1, broadcast back through a
    trailing axis.
    """
    row_mean = t.mean(dim=1).unsqueeze(-1)
    log_ratio = (t / row_mean).log()
    return (t * log_ratio).sum(dim=1)

In [None]:
def entropy(t):
    """Shannon entropy, in nats, along dimension 1: ``-sum(t * log(t))``.

    Larger values mean more uncertainty. Entries equal to 0 contribute 0
    instead of NaN.
    """
    # xlogy(t, t) is t * log(t) with the 0 * log(0) case defined as 0; the
    # leading minus turns the (non-positive) sum into the entropy itself.
    return -torch.xlogy(t, t).sum(dim=1)

In [None]:
# !pip install baal

In [None]:
from baal.bayesian.dropout import MCDropoutModule

In [None]:
def binary_disagreement_bald(unlabelled_dataset, k, model, disagreement=entropy, device=device, iterations=20):
    """Select the ``k`` samples with the highest disagreement across stochastic passes.

    Every batch is run ``iterations`` times through the model wrapped in
    ``MCDropoutModule``. For each output element the ``iterations`` sigmoid
    probabilities form one row that is scored by ``disagreement``; a sample's
    score is the mean over all of its output elements.

    Args:
        unlabelled_dataset: yields ``(image, id)`` pairs.
        k: number of ids to return.
        model: the network, or a dict holding it under the key ``'model'``. It
            is expected to return logits and is switched to eval mode here.
        disagreement: callable mapping an ``(N, iterations)`` tensor to ``N``
            scores. Scores are ranked in descending order, so it must return
            larger values for more uncertain rows.
            NOTE(review): confirm the default follows that sign convention.
        device: device the batches are moved to.
        iterations: number of stochastic forward passes per batch.

    Returns:
        1-D tensor with the ids of the ``k`` highest-scoring samples.
    """
    # Accept the bare network as well as the {'model': network} wrapper.
    net = model['model'] if isinstance(model, dict) else model
    # Eval mode for everything except dropout -- assumes MCDropoutModule keeps
    # dropout stochastic regardless of the module mode (TODO confirm).
    net.eval()

    scores = []
    indices = []

    # Default collation is sufficient: every item is an (image tensor, id) pair.
    ul_dataloader = DataLoader(
        unlabelled_dataset,
        num_workers=8,
        shuffle=False,
        batch_size=batch_size)

    with MCDropoutModule(net) as mcdropout_model:
        with torch.no_grad():
            for X, ind in ul_dataloader:
                X = X.to(device)
                # Stack the passes on a new last axis: (..., iterations).
                samples = torch.stack(
                    [torch.sigmoid(mcdropout_model(X)) for _ in range(iterations)],
                    dim=-1)
                # One row per output element, one column per pass.
                per_element = disagreement(samples.reshape(-1, iterations))
                # Average the element scores belonging to the same sample.
                per_sample = per_element.reshape(X.shape[0], -1).mean(dim=1)

                scores.extend(per_sample.cpu().tolist())
                indices.extend(torch.as_tensor(ind).tolist())

    if not scores:
        return torch.tensor([], dtype=torch.long)

    scores = torch.tensor(scores)
    indices = torch.tensor(indices)
    sorted_pool = torch.argsort(scores, descending=True)
    # Keep the k highest scores (slicing with [k:] would return everything else).
    return indices[sorted_pool[:k]]


# MC-dropout disagreement run: random initial round, then every further round
# labels the samples chosen by binary_disagreement_bald.
history = []

unlabelled_dataset, oracle = init()
print(f"Starting with unlabelled_dataset of size {len(unlabelled_dataset)}")

# Initial round: label p random samples, then halve them into train and test.
chosen_indices = random_query(unlabelled_dataset, p)

train_dataset, test_dataset = (
    unlabelled_dataset.pop(chosen_indices)
    .addLabels(oracle.query(chosen_indices))
    .split(p=0.5)
)
print(f"Starting with randomly sampled train_dataset of size {len(train_dataset)} and test_dataset of size {len(test_dataset)}")

print(f"Training new model ...")
model, stats = train_model(train_dataset, test_dataset, epochs=epochs)
history.append(stats)

# Follow-up rounds: the latest model picks k samples, they are labelled and
# added to the training set, and a new model is trained; the test set is fixed.
for k in ks:
    print(f"Quering {k} items from the from  ...")
    chosen_indices = binary_disagreement_bald(unlabelled_dataset, k, model)

    newData = (
        unlabelled_dataset.pop(chosen_indices)
        .addLabels(oracle.query(chosen_indices))
    )
    train_dataset.addData(newData)
    print(f"New train_dataset size: {len(train_dataset)}.")
    print(f"Training new model ...")

    model, stats = train_model(train_dataset, test_dataset, epochs=epochs)
    history.append(stats)


In [None]:

# One curve per labelling round in `history`. The per-epoch statistics contain
# loss, precision, recall and F1 but no accuracy entry, so the right-hand
# panels show the F1 score.
fig, axes = plt.subplots(2, 2, figsize=(8, 8))
panels = [
    ('train_loss', 'Loss', 'Training Loss'),
    ('train_f1score', 'F1', 'Training F1 Score'),
    ('test_loss', 'Loss', 'Test Loss'),
    ('test_f1score', 'F1', 'Test F1 Score'),
]
for ax, (key, label, title) in zip(axes.flat, panels):
    for round_no, round_stats in enumerate(history, start=1):
        ax.plot(round_stats[key], label=f'{label} {round_no}')
    ax.set_xlabel('Epoch')
    ax.legend(loc='lower right')
    ax.set_title(title)
