In [1]:
# References on handling class imbalance with PyTorch samplers (WeightedRandomSampler):
# https://androidkt.com/deal-with-an-imbalanced-dataset-using-weightedrandomsampler-in-pytorch/
# https://towardsdatascience.com/pytorch-basics-sampling-samplers-2a0f29f0bf2a

In [1]:
import numpy as np
import pandas as pd
import torch
import torchvision
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torchvision import transforms, utils, datasets
from torch.utils.data import Dataset, DataLoader, random_split, SubsetRandomSampler, WeightedRandomSampler, SequentialSampler, RandomSampler
from sklearn.model_selection import train_test_split
from collections import Counter

In [2]:
# Add the parent directory to the import path so the project-local `utils` package resolves.
import sys
sys.path.append('../')
# NOTE(review): the star import hides which names this notebook takes from utils.dataset;
# later cells use label_id, PapsClsDataset and train_transforms, which presumably come
# from here — confirm and import them explicitly.
from utils.dataset import *

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
import seaborn as sns
sns.set_style('darkgrid')

# Seed the NumPy and PyTorch global random number generators.
np.random.seed(0)
torch.manual_seed(0)

<torch._C.Generator at 0x7fb8a5e19890>

In [4]:
# Load the training annotation table and preview its first rows.
train_csv_path = '../lbp_data/train.csv'
train = pd.read_csv(train_csv_path)
train.head(5)


Unnamed: 0,file_name,task,label,xmin,ymin,w,h,occluded,des,cell_type,bbox,area,ratio,label_det,label_cls,label_hpv
0,patch_images/2021.01.14/LBC424-20210111(1)/LBC...,[AS6] LBC424,AS,1337,102,256,136,0,,ASC-US,"[1337, 102, 256, 136]",186,1.371989,ASC-US,ASC-US,0
1,patch_images/2021.01.14/LBC424-20210111(1)/LBC...,[AS6] LBC424,AS,220,619,166,169,0,,ASC-US,"[220, 619, 166, 169]",167,0.991085,ASC-US,ASC-US,0
2,patch_images/2021.01.14/LBC424-20210111(1)/LBC...,[AS6] LBC424,AS,658,1747,191,166,0,,ASC-US,"[658, 1747, 191, 166]",178,1.072661,ASC-US,ASC-US,0
3,patch_images/2021.01.14/LBC424-20210111(1)/LBC...,[AS6] LBC424,AS,1571,365,136,146,0,,ASC-US,"[1571, 365, 136, 146]",140,0.965146,ASC-US,ASC-US,0
4,patch_images/2021.01.14/LBC424-20210111(1)/LBC...,[AS6] LBC424,AS,1044,1698,258,204,0,,ASC-US,"[1044, 1698, 258, 204]",229,1.124591,ASC-US,ASC-US,0


In [5]:
# Show the raw (string) classification label column.
train['label_cls']

0           ASC-US
1           ASC-US
2           ASC-US
3           ASC-US
4           ASC-US
           ...    
17823    Carcinoma
17824    Carcinoma
17825    Carcinoma
17826    Carcinoma
17827    Carcinoma
Name: label_cls, Length: 17828, dtype: object

In [6]:
# Tally how many rows carry each classification label.
label_column = train['label_cls']
count = Counter(label_column)
count

Counter({'ASC-US': 8523,
         'LSIL': 4066,
         'Carcinoma': 502,
         'ASC-H': 892,
         'HSIL': 1103,
         'Negative': 2742})

In [7]:
# Per-class row counts, in the order the class names are listed here.
# NOTE(review): this order is not the integer label-id order shown after the label_id
# conversion below (e.g. 'Carcinoma' is at position 2 here but is counted under id 4),
# so this array must not be indexed by label id.
class_names = ['ASC-US', 'LSIL', 'Carcinoma', 'ASC-H', 'HSIL', 'Negative']
class_count = np.array([count[name] for name in class_names])
class_count

array([8523, 4066,  502,  892, 1103, 2742])

In [8]:
# Convert the string class names in `label_cls` to integer ids with the project helper
# `label_id`, then recount per id.
# The column is overwritten in place, so guard on dtype: when it already holds integers
# (i.e. this cell is being re-run) skip the conversion instead of feeding ids back into
# `label_id`.
if not pd.api.types.is_integer_dtype(train['label_cls']):
    train['label_cls'] = train['label_cls'].apply(label_id)
count = Counter(train['label_cls'])
count

Counter({0: 8523, 1: 4066, 4: 502, 3: 892, 2: 1103, 5: 2742})

In [9]:
# Per-class row counts indexed by integer label id (position i holds the count for id i).
# The id range is derived from the data instead of hard-coding six classes; an id that
# never occurs gets 0 because Counter returns 0 for missing keys.
num_classes = max(count) + 1
class_count = np.array([count[class_id] for class_id in range(num_classes)])
class_count

array([8523, 4066, 1103,  892,  502, 2742])

In [10]:
# Class weight = square root of the inverse class frequency, so rarer classes get
# larger weights.
inverse_frequency = 1. / class_count
weight = np.sqrt(inverse_frequency)
weight

array([0.01083188, 0.01568254, 0.0301101 , 0.03348248, 0.04463218,
       0.01909705])

In [11]:
# Look up every row's class weight with a single vectorised NumPy index (instead of a
# Python-level loop over all rows), then wrap the float64 result as a tensor.
label_ids = train['label_cls'].to_numpy(dtype=np.int64)
samples_weight = torch.from_numpy(weight[label_ids])

In [12]:
# Inspect the per-row weight tensor built from the class weights.
samples_weight

tensor([0.0108, 0.0108, 0.0108,  ..., 0.0446, 0.0446, 0.0446],
       dtype=torch.float64)

In [13]:
# Show the label ids of the last ten rows.
last_ten_labels = train['label_cls'].iloc[-10:]
print(list(last_ten_labels))

[4, 4, 4, 4, 4, 4, 4, 4, 4, 4]


In [14]:
# Number of per-row weights (samples_weight is one-dimensional).
samples_weight.shape[0]

17828

In [15]:
# Draw as many indices per epoch as there are rows, with replacement, proportionally
# to each row's weight.
sampler = WeightedRandomSampler(
    weights=samples_weight,
    num_samples=len(samples_weight),
    replacement=True,
)

In [3]:
# Reload the annotation table and wrap it in the project's classification dataset.
# NOTE(review): the "(17828, 16)" shown under this cell is presumably printed by the
# PapsClsDataset constructor — confirm.
train_csv_path = '../lbp_data/train.csv'
train = pd.read_csv(train_csv_path)
train_dataset = PapsClsDataset(
    train,
    transform=train_transforms,
)

(17828, 16)


In [4]:
# Rows per label id as held by the dataset's dataframe.
train_dataset.df['label_cls'].value_counts()

0    8523
1    4066
5    2742
2    1103
3     892
4     502
Name: label_cls, dtype: int64

In [5]:
# Label ids in the dataset's row order, as a NumPy array.
train_dataset.df['label_cls'].values

array([3, 2, 2, ..., 0, 4, 1])

In [6]:
# Tally rows per label id in the dataset's dataframe.
dataset_labels = train_dataset.df['label_cls']
count = Counter(dataset_labels)
count

Counter({3: 892, 2: 1103, 5: 2742, 0: 8523, 4: 502, 1: 4066})

In [7]:
# Per-class row counts indexed by integer label id (position i holds the count for id i).
# The id range is derived from the data instead of hard-coding six classes; an id that
# never occurs gets 0 because Counter returns 0 for missing keys.
num_classes = max(count) + 1
class_count = np.array([count[class_id] for class_id in range(num_classes)])
class_count

array([8523, 4066, 1103,  892,  502, 2742])

In [8]:
# Class weight = square root of the inverse class frequency, so rarer classes get
# larger weights.
# Alternative that was tried: the plain inverse frequency, i.e. 1. / class_count.
inverse_frequency = 1. / class_count
weight = np.sqrt(inverse_frequency)
weight

array([0.01083188, 0.01568254, 0.0301101 , 0.03348248, 0.04463218,
       0.01909705])

In [9]:
# Look up every row's class weight with a single vectorised NumPy index (instead of a
# Python-level loop over all rows), wrap the float64 result as a tensor, and build a
# sampler that draws one index per dataset row with probability proportional to it.
label_ids = train_dataset.df['label_cls'].to_numpy(dtype=np.int64)
samples_weight = torch.from_numpy(weight[label_ids])
sampler = WeightedRandomSampler(samples_weight, len(samples_weight))

In [10]:
# Inspect the per-row weight tensor handed to the sampler.
samples_weight

tensor([0.0335, 0.0301, 0.0301,  ..., 0.0108, 0.0446, 0.0157],
       dtype=torch.float64)

In [11]:
# Batches are drawn through the weighted sampler; `shuffle` is left unset because
# DataLoader does not allow it together with an explicit `sampler`.
trainLoader = DataLoader(
    train_dataset,
    batch_size=16,
    num_workers=8,
    sampler=sampler,
)

In [12]:
# Pull one batch from the weighted loader and tally its class ids to eyeball the balance.
# A dedicated name is used so the dataset-wide `count` (the input to `class_count`) is not
# overwritten by a 16-sample tally, and `.tolist()` makes the Counter keys plain Python ints
# rather than NumPy scalars.
images, boxes, labels = next(iter(trainLoader))
batch_count = Counter(labels[0].tolist())
batch_count

Counter({1: 4, 0: 5, 5: 4, 3: 2, 4: 1})

In [13]:
# Print the raw `labels` object of the batch (the tally above reads its element 0).
print(labels)

[tensor([1, 0, 0, 5, 5, 1, 0, 5, 3, 1, 3, 4, 1, 0, 0, 5])]


In [26]:
# Disabled experiment kept as a bare string literal (the cell only echoes the text).
# NOTE(review): the loop unpacks each batch as a single `target`, whereas the batch cell
# above unpacks three items (images, boxes, labels) — it presumably needs updating before
# being re-enabled; otherwise delete this cell.
'''
for i, (target) in enumerate(trainLoader) :
    
    count=Counter(target.numpy())
    print(count)
'''

'\nfor i, (target) in enumerate(trainLoader) :\n    \n    count=Counter(target.numpy())\n    print(count)\n'

In [27]:
# Toy check: draw 50 indices with replacement from three items weighted 1.0 / 0.9 / 0.5.
toy_sampler = WeightedRandomSampler(weights=[1., 0.9, 0.5], num_samples=50, replacement=True)
targets = [index for index in toy_sampler]
print(targets)

[0, 2, 1, 0, 2, 1, 2, 2, 0, 2, 1, 1, 0, 0, 2, 0, 0, 0, 1, 0, 2, 1, 1, 1, 1, 2, 1, 1, 0, 1, 0, 2, 0, 2, 0, 1, 1, 2, 0, 1, 0, 0, 1, 2, 0, 0, 0, 2, 1, 0]


In [28]:
# Tally how often each index was drawn.
count = Counter()
count.update(targets)
count

Counter({0: 20, 2: 13, 1: 17})

In [29]:
# Toy check without replacement: 10 distinct indices out of 15 items whose weights
# repeat the pattern 1.0 / 0.9 / 0.5 five times.
repeated_weights = [1., 0.9, 0.5] * 5
no_repeat_sampler = WeightedRandomSampler(repeated_weights, num_samples=10, replacement=False)
targets = list(no_repeat_sampler)
print(targets)

[14, 6, 9, 1, 3, 13, 11, 7, 12, 4]
