Using reference from https://discuss.pytorch.org/t/balanced-sampling-between-classes-with-torchvision-dataloader/2703/2

In [43]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
from torchvision.transforms import Compose, ToTensor, Resize
from torch.utils.tensorboard import SummaryWriter
from torchvision.datasets import ImageFolder
from torch.utils.data import DataLoader, Dataset, sampler,Subset,WeightedRandomSampler


from torchvision import transforms, datasets

import logging
import numpy as np
from utils import *


import time
import copy

import skimage 
from skimage import io 
import logging
from PIL import Image

import os 
from os.path import splitext
from os import listdir

import numpy as np
import pandas as pd
from pandas import DataFrame

from torchvision.transforms import Compose, ToTensor, Resize, ToPILImage
from PIL import Image
Image.MAX_IMAGE_PIXELS = 1000000000  # incase PIL gives error


import torch
from torchvision.datasets import ImageFolder
from torch.utils.data import Subset
from sklearn.model_selection import train_test_split
from torchvision.transforms import Compose, ToTensor, Resize
from torch.utils.data import DataLoader

In [2]:
def train_val_dataset(dataset, val_split=0.25, generate_small=False):
    train_idx, val_idx = train_test_split(list(range(len(dataset))), test_size=val_split)
    datasets_created = {}
    if generate_small:
        print("Generating Small Train, Test Dataset")
        datasets_created['train'] = Subset(dataset, train_idx[0:200])
        datasets_created['test'] = Subset(dataset, val_idx[0:50])
    else:
        datasets_created['train'] = Subset(dataset, train_idx)
        datasets_created['test'] = Subset(dataset, val_idx)        
    return datasets_created

In [3]:
dataset = ImageFolder('/home/abharani/cs231n_project/glued_images/data', 
                      transform=Compose([Resize((224,224)),ToTensor(), 
                                 Normalize(mean=[0.485, 0.456, 0.406],
                                 std=[0.229, 0.224, 0.225])]))

print(len(dataset))

dataset_final = {}
datasets_created_trial_1 = train_val_dataset(dataset,val_split=0.10,generate_small=False)
datasets_created_trial_2 = train_val_dataset(datasets_created_trial_1['train'],val_split=0.25,generate_small=False)

dataset_final['test'] = datasets_created_trial_1['test']
dataset_final['train'] = datasets_created_trial_2['train']
dataset_final['val'] = datasets_created_trial_2['test']


print("Train set size {}".format(len(dataset_final['train'])))
print("Validation set size {}".format(len(dataset_final['val'])))
print("Test set size {}".format(len(dataset_final['test'])))

10615
Train set size 7164
Validation set size 2389
Test set size 1062


## Approach I

In [6]:
# len(dataset_final['train'].classes)
# dataset.class_to_idx
# idx2class = {v: k for k, v in dataset.class_to_idx.items()}

# def get_class_distribution(dataset_obj):
#     count_dict = {k:0 for k,v in dataset_obj.class_to_idx.items()}
    
#     for element in dataset_obj:
#         y_lbl = element[1]
#         y_lbl = idx2class[y_lbl]
#         count_dict[y_lbl] += 1
            
#     return count_dict


In [37]:
# len(dataset_final['train'])

from collections import defaultdict 
count_dict = defaultdict(int)
target_list = []

#Generate target_list of all labels and count dict of all classes

for i, (image, label) in enumerate(dataset_final['train']):
    
    count_dict[label] += 1
    target_list.append(label)
    
#     print(i, image.shape,sample)
    
#     if i== 3:
#         break


count_dict
print("Distribution of classes: \n", count_dict)

class_count = [i for i in count_dict.values()]
class_weights = 1./torch.tensor(class_count, dtype=torch.float) 
class_weights

Distribution of classes: 
 defaultdict(<class 'int'>, {1: 1811, 0: 1993, 2: 906, 5: 848, 4: 804, 3: 802})


tensor([0.0006, 0.0005, 0.0011, 0.0012, 0.0012, 0.0012])

In [39]:
target_list = torch.tensor(target_list)
target_list

tensor([1, 0, 1,  ..., 5, 5, 5])

In [40]:
target_list = target_list[torch.randperm(len(target_list))]

In [41]:
class_weights_all = class_weights[target_list]
class_weights_all

tensor([0.0012, 0.0011, 0.0006,  ..., 0.0012, 0.0006, 0.0006])

In [44]:
weighted_sampler = WeightedRandomSampler(
    weights=class_weights_all,
    num_samples=len(class_weights_all),
    replacement=True
)

In [45]:
train_loader = DataLoader(dataset_final['train'],batch_size=32, sampler = weighted_sampler, num_workers=4, pin_memory=True)

In [46]:
next(iter(train_loader))
x,y = next(iter(train_loader))

In [48]:
y.shape

torch.Size([32])

## Approach II

In [4]:
def make_weights_for_balanced_classes(images, nclasses):                        
    count = [0] * nclasses                                                      
    for item in images:                                                         
        count[item[1]] += 1                                                     
    weight_per_class = [0.] * nclasses                                      
    N = float(sum(count))                                                   
    for i in range(nclasses):                                                   
        weight_per_class[i] = N/float(count[i])                                 
    weight = [0] * len(images)                                              
    for idx, val in enumerate(images):                                          
        weight[idx] = weight_per_class[val[1]]                                  
    return weight       

In [None]:
# For unbalanced dataset we create a weighted sampler                       
weights = make_weights_for_balanced_classes(dataset.imgs, len(dataset.classes))                                                                
weights = torch.DoubleTensor(weights)                                       
sampler = torch.utils.data.sampler.WeightedRandomSampler(weights, len(weights))    



In [None]:
# train_loader = torch.utils.data.DataLoader(dataset_train, batch_size=32,                               
#                                                              sampler = sampler, num_workers=4, pin_memory=True)     

dataloaders = {x:DataLoader(dataset_final[x],batch_size=32, sampler = sampler, num_workers=4, pin_memory=True) for x in ['train','val']}
train_loader = dataloaders['train']
val_loader = dataloaders['val']

In [None]:
next(iter(train_loader))
x,y = next(iter(train_loader))
# print(x.shape, y)