In [4]:
import numpy as np 
import pandas as pd 
import slidingwindow as sw 
import skimage 
from skimage import io 
from pandas import DataFrame
import matplotlib.pyplot as plt
%matplotlib inline

import os 
import re #2 
import random #3
from time import time #4

from utils import *

import torch # 7
from torch.utils.data  import Dataset # 8

from fastai2.data.core import DataLoaders # 5
from fastai2.vision.all import * #6
from fastai2.vision.widgets import *
from fastai2.data.external import untar_data,URLs
from fastai2.data.transforms import get_image_files # 1


In [2]:
# Root folder holding the competition CSV files (layout used on gcloud).
BASE_FOLDER = "/project/data/"

# Directory with the raw training images, wrapped in a Path for the file helpers below.
image_dir = '/home/abharani/data/train_images/'
path = Path(image_dir)

# Training metadata table shipped alongside the images.
train = pd.read_csv(os.path.join(BASE_FOLDER, 'train.csv'))

1. Wrap all image pre-processing (resize, conversion to tensor, division by 255, and reordering of the channels) into one step using a helper function.
2. Read the label for an image from its file name (or generate it via the image_id taken from fname).


#### Generate list of files at image dir, pick random indexes and perform split for train and validation set.

In [None]:
# Work on the first 200 image files only.
files = get_image_files(path)[:200]
n_files = len(files)

# Shuffle the indices, then send the first 80% to training and the rest to validation.
idxs = np.random.permutation(range(n_files))
cut = int(0.8 * n_files)
train_files, valid_files = files[idxs[:cut]], files[idxs[cut:]]

print("Training set images {}, Validation set images {}".format(len(train_files), len(valid_files)))

#### Let's check the unique labels in the dataset and the distribution of each label

In [None]:
# Collect every distinct value that label_func3 produces over the sampled files.
labels = list(set(map(label_func3, files)))
print("distinct labels {}".format(len(labels)))

## Approach I  - Purely Pytorch 
Following from https://dev.fast.ai/tutorial.siamese


####  We can use the files above to create a Dataset

In [None]:
class BiopsyDataset(Dataset):
    """Map-style dataset yielding `(processed_image, label_tensor)` pairs.

    Args:
        files: Indexable, sized collection of image file paths.
        is_valid: Flag marking the validation split. It is stored on the
            instance but not otherwise used by this class.
    """

    def __init__(self, files, is_valid=False):
        self.files = files
        self.is_valid = is_valid

    def __getitem__(self, i):
        """Pre-process the `i`-th file and return it together with its label.

        The image comes from `pre_process_image(file_path)` and the label from
        `label_func3(file_path)`; the label is wrapped in a `torch.long` tensor.
        Both helpers are defined outside this class. The elapsed
        pre-processing time is printed on every call.
        """
        # Imported locally under an unambiguous name: the notebook binds the
        # name `time` with `from time import time`, so `time.time()` only works
        # if a later star-import happens to rebind `time` to the module.
        from time import perf_counter

        file_path = self.files[i]
        tic = perf_counter()
        processed_image = pre_process_image(file_path)
        toc = perf_counter()
        print("Time took to pre-process {} secs".format(toc - tic))
        label = label_func3(file_path)
        y_tensor = torch.tensor(label, dtype=torch.long)
        return (processed_image, y_tensor)

    def __len__(self):
        """Number of files in this dataset."""
        return len(self.files)
    
    
# One dataset per split.
train_ds: Dataset = BiopsyDataset(train_files)
valid_ds: Dataset = BiopsyDataset(valid_files, is_valid=True)

# Sanity check: fetch samples 0..3 and print the image shape and label of each.
for i in range(len(train_ds)):
    image, target = train_ds[i]
    print(i, image.shape, target)

    if i < 3:
        continue
    plt.show()
    break
    

#### Create DataLoaders with the following factory method DataLoaders

We can change batch-size depending upon gpu

In [None]:
# Bundle the two datasets into fastai DataLoaders (batch size 5, four workers).
dls = DataLoaders.from_dsets(
    train_ds,
    valid_ds,
    bs=5,
    num_workers=4,
)

#### to use the GPU and inspect one batch of data

In [None]:
# Move the DataLoaders to the GPU, then draw a single batch for inspection.
dls = dls.cuda()
b = dls.one_batch()

##### Create cnn_learner using pre-trained resnet50 model

In [None]:
# ResNet-50 backbone with a 6-output head, cross-entropy loss, accuracy as the metric.
learn = cnn_learner(
    dls,
    resnet50,
    metrics=[accuracy],
    n_out=6,
    loss_func=F.cross_entropy,
)
# Fine-tune, passing 10 as the epoch count.
learn.fine_tune(10)

### End of Approach I
What is a bit annoying is that we have to rewrite everything that is already in fastai if we want to normalize our images or apply data augmentation.

### Approach II - Fastai
Following from https://dev.fast.ai/tutorial.siamese

A dataset like the one above can easily be converted into a fastai Transform by just renaming the `__getitem__` function to `encodes`.

So three things changed:

1. the __len__ disappeared, we won't need it
2. `__getitem__` became `encodes`
3. we return TensorImage for our images

We still wrap all pre-processing of the image (resize, conversion to tensor, division by 255, and reordering of the channels) into one step using a helper function,
and still read the label for the image from the file name (or generate it via the image_id taken from fname).

In [None]:
class BiopsyTransform(Transform):
    """fastai Transform that maps an integer index to an `(image, label)` pair.

    Args:
        files: Indexable collection of image file paths.
        is_valid: Flag marking the validation split; stored but not used here.
    """

    def __init__(self, files, is_valid=False):
        self.files, self.is_valid = files, is_valid

    def encodes(self, i):
        """Return `(TensorImage, label_tensor)` for the `i`-th file.

        The image is whatever `pre_process_image` returns, wrapped in
        `TensorImage`; the label is `label_func3(file_path)` as a
        `torch.long` tensor.
        """
        file_path = self.files[i]
        processed = pre_process_image(file_path)
        label = torch.tensor(label_func3(file_path), dtype=torch.long)
        return (TensorImage(processed), label)
    

##### How do we build a dataset with this? We will use TfmdLists. It's just an object that lazily applies a collection of Transforms on a list. Here since our transform takes integers, we will pass simple ranges for this list. 

In [None]:
# The transform takes integer indices, so each TfmdLists is built over a plain range.
train_tfm = BiopsyTransform(train_files)
valid_tfm = BiopsyTransform(valid_files, is_valid=True)

train_tl = TfmdLists(range(len(train_files)), train_tfm)
valid_tl = TfmdLists(range(len(valid_files)), valid_tfm)

##### Then, when we create a DataLoader, we can add any transform we like.


In [None]:
# Per-item transforms, applied before collation.
item_tfms = [Resize(224), ToTensor]
# Per-batch transforms: ImageNet normalisation plus fastai's default augmentations.
# NOTE(review): Resize(224) is listed here as well as in the item transforms — confirm both are intended.
batch_tfms = [Resize(224), Normalize.from_stats(*imagenet_stats), *aug_transforms()]

dls = DataLoaders.from_dsets(
    train_tl,
    valid_tl,
    bs=5,
    num_workers=4,
    after_item=item_tfms,
    after_batch=batch_tfms,
)
dls = dls.cuda()

# Pull one batch and show the input shape next to the targets.
b = dls.one_batch()
print(b[0].shape, b[1])

In [None]:
# for i, sample in enumerate(dls):
#     print(sample)
# dls.show_batch()

##### Create cnn_learner using pre-trained resnet50 model

In [None]:
# Same learner as in Approach I, now fed by the transform-based DataLoaders:
# ResNet-50, six outputs, cross-entropy loss, accuracy metric.
learner_kwargs = dict(metrics=[accuracy], n_out=6, loss_func=F.cross_entropy)
learn = cnn_learner(dls, resnet50, **learner_kwargs)
learn.fine_tune(10)

In [None]:
# Build an interpretation object from the trained learner and plot its confusion matrix.
interp = ClassificationInterpretation.from_learner(learn)
interp.plot_confusion_matrix()

In [None]:
# Export under the exact file name that the inference cell later loads
# ('export_resnet50.pkl'); a bare export() would write the default
# 'export.pkl' instead and the subsequent load_learner call would not find it.
learn.export('export_resnet50.pkl')

In [None]:
# NOTE(review): this rebinds `path`, which earlier held the training-image
# directory, to the current working directory.
path = Path()
# List the .pkl files found there.
path.ls(file_exts='.pkl')

In [None]:
# Load the exported learner named export_resnet50.pkl from `path` for inference.
export_file = path/'export_resnet50.pkl'
learn_inf = load_learner(export_file)

### Approach III 
Using fastai on processed images

In [12]:
from torchvision.transforms import Compose, ToTensor, Resize
from torch.utils.tensorboard import SummaryWriter
from torchvision.datasets import ImageFolder
from torch.utils.data import DataLoader, Dataset, sampler,Subset,WeightedRandomSampler
from sklearn.model_selection import train_test_split
from collections import defaultdict 

In [10]:
def train_val_dataset(dataset, val_split=0.25, generate_small=False, random_state=None):
    """Randomly split `dataset` into a training and a held-out `Subset`.

    Args:
        dataset: Any sized, indexable dataset.
        val_split: Passed to `train_test_split` as `test_size`.
        generate_small: When True, keep only the first 200 training indices
            and the first 50 held-out indices (fewer if the split is smaller).
        random_state: Passed through to `train_test_split`. Supply an int to
            make the split reproducible; the default `None` keeps the original
            unseeded behaviour.

    Returns:
        dict with keys `'train'` and `'test'`, each a `Subset` of `dataset`.
    """
    train_idx, val_idx = train_test_split(
        list(range(len(dataset))), test_size=val_split, random_state=random_state
    )
    if generate_small:
        print("Generating Small Train, Test Dataset")
        train_idx, val_idx = train_idx[:200], val_idx[:50]
    return {'train': Subset(dataset, train_idx), 'test': Subset(dataset, val_idx)}

In [11]:
# torchvision's transforms are referenced through an explicit alias. In this
# notebook the bare name `Normalize` is fastai's Transform (it is the object
# `Normalize.from_stats` is called on in Approach II), not torchvision's.
# NOTE(review): fastai's Normalize is expected to pass the plain tensors
# produced by ToTensor through without normalising them — confirm.
from torchvision import transforms as tv_transforms

image_transform = tv_transforms.Compose([
    tv_transforms.Resize((224, 224)),
    tv_transforms.ToTensor(),
    # Per-channel mean / std applied to the tensor image.
    tv_transforms.Normalize(mean=[0.485, 0.456, 0.406],
                            std=[0.229, 0.224, 0.225]),
])
dataset = ImageFolder('/home/abharani/cs231n_project/glued_images/data',
                      transform=image_transform)

print(len(dataset))

# Two-stage split: hold out 10% of everything as the test set, then hold out
# 25% of the remainder as the validation set.
dataset_final = {}
datasets_created_trial_1 = train_val_dataset(dataset, val_split=0.10, generate_small=False)
datasets_created_trial_2 = train_val_dataset(datasets_created_trial_1['train'], val_split=0.25, generate_small=False)

dataset_final['test'] = datasets_created_trial_1['test']
dataset_final['train'] = datasets_created_trial_2['train']
dataset_final['val'] = datasets_created_trial_2['test']


print("Train set size {}".format(len(dataset_final['train'])))
print("Validation set size {}".format(len(dataset_final['val'])))
print("Test set size {}".format(len(dataset_final['test'])))

10615
Train set size 7164
Validation set size 2389
Test set size 1062


In [13]:
def _subset_targets(ds):
    """Return the class label of every item in `ds`, in item order.

    Nested `Subset`s are resolved through their `indices` down to a base
    dataset exposing `targets`, so no image has to be loaded. A base dataset
    without `targets` is iterated as a fallback.
    """
    if isinstance(ds, Subset):
        parent_targets = _subset_targets(ds.dataset)
        return [parent_targets[i] for i in ds.indices]
    if hasattr(ds, 'targets'):
        return list(ds.targets)
    return [label for _, label in ds]


# Labels of the training split and the number of samples per class.
train_labels = _subset_targets(dataset_final['train'])

count_dict = defaultdict(int)
for label in train_labels:
    count_dict[label] += 1

print("Distribution of classes: \n", count_dict)

target_list = torch.tensor(train_labels)

# bincount puts the count of label c at position c, so indexing the weights by
# label below is correct. (Taking count_dict.values() instead would follow the
# order in which labels were first seen and pair weights with the wrong class.)
class_count = torch.bincount(target_list)
class_weights = 1. / class_count.float()

# One weight per training sample: the inverse frequency of its class.
class_weights_all = class_weights[target_list]

weighted_sampler = WeightedRandomSampler(
    weights=class_weights_all,
    num_samples=len(class_weights_all),
    replacement=True
)

train_loader = DataLoader(dataset_final['train'], batch_size=32, sampler=weighted_sampler, num_workers=4, pin_memory=True)
# The sampler above draws indices in range(len(train split)); it must not be
# reused for the validation split, which has a different length and should be
# evaluated once per sample, in order.
val_loader = DataLoader(dataset_final['val'], batch_size=32, shuffle=False, num_workers=4, pin_memory=True)

# NOTE(review): the traceback at the end of the notebook ("'dict' object has
# no attribute 'after_batch'") suggests this plain dict is later handed to an
# API that expects a fastai DataLoaders object — confirm at the call site.
dataloaders = {'train': train_loader, 'val': val_loader}

Distribution of classes: 
 defaultdict(<class 'int'>, {1: 1811, 0: 1993, 2: 906, 5: 848, 4: 804, 3: 802})


AttributeError: 'dict' object has no attribute 'after_batch'