In [1]:
# https://androidkt.com/deal-with-an-imbalanced-dataset-using-weightedrandomsampler-in-pytorch/
# https://towardsdatascience.com/pytorch-basics-sampling-samplers-2a0f29f0bf2a

In [2]:
import numpy as np
import pandas as pd
import torch
import torchvision
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torchvision import transforms, utils, datasets
from torch.utils.data import Dataset, DataLoader, random_split, SubsetRandomSampler, WeightedRandomSampler, SequentialSampler, RandomSampler
from sklearn.model_selection import train_test_split
from collections import Counter

In [3]:
train = pd.read_csv('../lbp_data/train.csv')
train.head()


Unnamed: 0,file_name,task,label,xmin,ymin,w,h,occluded,des,cell_type,bbox,area,ratio,label_det,label_cls,label_hpv
0,patch_images/2021.01.14/LBC424-20210111(1)/LBC...,[AS6] LBC424,AS,1337,102,256,136,0,,ASC-US,"[1337, 102, 256, 136]",186,1.371989,ASC-US,ASC-US,0
1,patch_images/2021.01.14/LBC424-20210111(1)/LBC...,[AS6] LBC424,AS,220,619,166,169,0,,ASC-US,"[220, 619, 166, 169]",167,0.991085,ASC-US,ASC-US,0
2,patch_images/2021.01.14/LBC424-20210111(1)/LBC...,[AS6] LBC424,AS,658,1747,191,166,0,,ASC-US,"[658, 1747, 191, 166]",178,1.072661,ASC-US,ASC-US,0
3,patch_images/2021.01.14/LBC424-20210111(1)/LBC...,[AS6] LBC424,AS,1571,365,136,146,0,,ASC-US,"[1571, 365, 136, 146]",140,0.965146,ASC-US,ASC-US,0
4,patch_images/2021.01.14/LBC424-20210111(1)/LBC...,[AS6] LBC424,AS,1044,1698,258,204,0,,ASC-US,"[1044, 1698, 258, 204]",229,1.124591,ASC-US,ASC-US,0


In [4]:
[ idx for idx in SequentialSampler(train.file_name.values)][:10]

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

In [5]:
[ idx for idx in RandomSampler(train.file_name.values)][:10]

[6397, 876, 4478, 17591, 2392, 8167, 11759, 15616, 14565, 8136]

In [6]:
numSample=1000
batch_size=100

sample=torch.FloatTensor(numSample,10)


zero=np.zeros(int(numSample * 0.8),dtype=np.int32)
one=np.ones(int(numSample * 0.2),dtype=np.int32)

target=np.hstack((zero,one))

dataset=sample.numpy()

#split dataset into tran and test set
x_train,x_test,y_train,y_test= train_test_split(dataset,
                                                target,
                                                test_size=0.25,
                                                random_state=42,
                                                stratify=target,
                                                shuffle=True)

In [7]:
count=Counter(y_train)

In [8]:
count

Counter({1: 150, 0: 600})

In [9]:
class_count=np.array([count[0],count[1]])
class_count

array([600, 150])

In [10]:
weight=1./class_count
weight

array([0.00166667, 0.00666667])

In [11]:
samples_weight = np.array([weight[t] for t in y_train])
samples_weight=torch.from_numpy(samples_weight)
# samples_weight

In [12]:
sampler = WeightedRandomSampler(samples_weight, len(samples_weight))

In [13]:
trainDataset = torch.utils.data.TensorDataset(torch.FloatTensor(x_train), torch.LongTensor(y_train.astype(int)))
validDataset = torch.utils.data.TensorDataset(torch.FloatTensor(x_test), torch.LongTensor(y_test.astype(int)))


trainLoader = torch.utils.data.DataLoader(dataset = trainDataset, batch_size=batch_size, num_workers=1, sampler = sampler)
testLoader = torch.utils.data.DataLoader(dataset = validDataset, batch_size=batch_size, shuffle=False, num_workers=1)

In [14]:
for i, (data, target) in enumerate(trainLoader) :
    count=Counter(target.numpy())
    print('batch {} 0/1 {} {}'.format(i, count[0], count[1]))

batch 0 0/1 43 57
batch 1 0/1 52 48
batch 2 0/1 47 53
batch 3 0/1 56 44
batch 4 0/1 45 55
batch 5 0/1 49 51
batch 6 0/1 54 46
batch 7 0/1 19 31


In [15]:
# https://towardsdatascience.com/pytorch-basics-sampling-samplers-2a0f29f0bf2a

In [16]:
np.random.seed(0)
torch.manual_seed(0)

<torch._C.Generator at 0x7fa624b8f8f0>

In [17]:
%matplotlib inline
import seaborn as sns
sns.set_style('darkgrid')

In [18]:
import sys
sys.path.append('../')
from utils.dataset import *

In [19]:
train = pd.read_csv('../lbp_data/train.csv')

In [20]:
# train_dataset = PapsClsDataset(train)

In [21]:
# [ idx for idx in SequentialSampler(train_dataset)][:10]

In [22]:
# [ idx for idx in RandomSampler(train_dataset)][:10]

In [23]:
train.label_cls

0           ASC-US
1           ASC-US
2           ASC-US
3           ASC-US
4           ASC-US
           ...    
17823    Carcinoma
17824    Carcinoma
17825    Carcinoma
17826    Carcinoma
17827    Carcinoma
Name: label_cls, Length: 17828, dtype: object

In [24]:
count=Counter(train.label_cls)
count

Counter({'ASC-US': 8523,
         'LSIL': 4066,
         'Carcinoma': 502,
         'ASC-H': 892,
         'HSIL': 1103,
         'Negative': 2742})

In [25]:
class_count=np.array([count['ASC-US'],count['LSIL'], count['Carcinoma'],count['ASC-H'], count['HSIL'],count['Negative']])
class_count

array([8523, 4066,  502,  892, 1103, 2742])

In [26]:
train.label_cls = train.label_cls.apply(lambda x : label_id(x))
count=Counter(train.label_cls)
count

Counter({0: 8523, 1: 4066, 4: 502, 3: 892, 2: 1103, 5: 2742})

In [27]:
class_count=np.array([count[0],count[1], count[2],count[3], count[4],count[5]])
class_count

array([8523, 4066, 1103,  892,  502, 2742])

In [28]:
weight=1./class_count
weight

array([0.00011733, 0.00024594, 0.00090662, 0.00112108, 0.00199203,
       0.0003647 ])

In [29]:
samples_weight = np.array([weight[t] for t in train.label_cls])
samples_weight=torch.from_numpy(samples_weight)

In [30]:
samples_weight

tensor([0.0001, 0.0001, 0.0001,  ..., 0.0020, 0.0020, 0.0020],
       dtype=torch.float64)

In [31]:
sampler = WeightedRandomSampler(samples_weight, len(samples_weight))

In [38]:
train = pd.read_csv('../lbp_data/train.csv')
train_dataset = PapsClsDataset(train, transform=train_transforms)

(17828, 16)


In [39]:
train_dataset.df.label_cls.value_counts()

0    8523
1    4066
5    2742
2    1103
3     892
4     502
Name: label_cls, dtype: int64

In [40]:
trainLoader = torch.utils.data.DataLoader(train_dataset, batch_size=32, num_workers=4, sampler = sampler)

In [41]:
images, bbox, labels = next(iter(trainLoader))

In [42]:
print(labels)

[tensor([5, 0, 0, 3, 1, 0, 2, 1, 5, 0, 3, 1, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0,
        1, 1, 1, 0, 5, 3, 0, 0])]


In [46]:
for i, (data, bbox, target) in enumerate(trainLoader) :
    count=Counter(target[0].numpy())
    print(count)

Counter({0: 13, 1: 9, 5: 7, 2: 2, 3: 1})
Counter({0: 17, 1: 9, 2: 4, 3: 1, 5: 1})
Counter({0: 12, 1: 9, 5: 6, 3: 3, 2: 2})
Counter({0: 17, 1: 8, 5: 4, 2: 2, 3: 1})
Counter({0: 15, 1: 7, 5: 5, 2: 4, 3: 1})
Counter({1: 12, 0: 9, 5: 8, 4: 2, 2: 1})
Counter({0: 18, 5: 8, 1: 5, 2: 1})
Counter({0: 14, 1: 10, 5: 3, 3: 3, 2: 2})
Counter({0: 13, 5: 9, 3: 6, 1: 2, 2: 1, 4: 1})
Counter({0: 19, 5: 6, 1: 3, 3: 2, 4: 1, 2: 1})
Counter({0: 12, 5: 8, 1: 8, 2: 2, 3: 2})
Counter({0: 20, 1: 6, 5: 4, 2: 2})
Counter({0: 18, 1: 6, 5: 4, 3: 3, 2: 1})
Counter({1: 13, 0: 8, 5: 5, 3: 3, 2: 2, 4: 1})
Counter({0: 16, 2: 4, 3: 4, 1: 4, 5: 3, 4: 1})
Counter({0: 19, 1: 4, 3: 3, 5: 3, 4: 2, 2: 1})


KeyboardInterrupt: 

In [None]:
train.label_cls.value_counts()

In [None]:
train_dataset.df.label_cls.value_counts()