# Prepare Data

In [34]:
import os
from collections import Counter
from glob import glob

import numpy as np
import torch
from imblearn import over_sampling,under_sampling
from PIL import Image
from torch.utils.data import DataLoader,Dataset
from torchsampler import ImbalancedDatasetSampler

In [4]:
# Root folder holding one sub-folder per class. Set the ARTWORK_DATA_DIR environment
# variable to point somewhere else without editing the notebook.
path = os.environ.get('ARTWORK_DATA_DIR',
                      '/Users/Aymanjabri/notebooks/Artwork/data/images/images')
# Without recursive=True, '**' behaves like '*', so this matches the entries that sit
# directly inside each class folder (the image files).
x = glob(path+'/**/*')

# One class per sub-folder; stray files lying next to the class folders are ignored.
# NOTE(review): glob returns entries in arbitrary order, so the class -> index mapping
# may differ between machines — confirm before reusing indices across environments.
classes = [os.path.basename(i) for i in glob(path+'/**') if os.path.isdir(i)]
# Derive the label range from the folders actually found instead of hard-coding 50.
targets = np.arange(len(classes))
class_to_idx=dict(zip(classes,targets))

In [5]:
# Class name of every image = name of the folder that directly contains it.
d = [os.path.split(os.path.split(image_path)[0])[1] for image_path in x]

In [6]:
# Translate each class name into its integer label.
y = list(map(class_to_idx.__getitem__, d))

In [10]:
# Number of images per class label, ordered by label.
balance = {label: count for label, count in sorted(Counter(y).items())}
balance

{0: 84,
 1: 128,
 2: 702,
 3: 43,
 4: 291,
 5: 99,
 6: 259,
 7: 49,
 8: 194,
 9: 255,
 10: 90,
 11: 119,
 12: 181,
 13: 81,
 14: 87,
 15: 31,
 16: 134,
 17: 188,
 18: 311,
 19: 73,
 20: 239,
 21: 164,
 22: 81,
 23: 126,
 24: 47,
 25: 91,
 26: 139,
 27: 70,
 28: 88,
 29: 117,
 30: 877,
 31: 59,
 32: 193,
 33: 186,
 34: 120,
 35: 439,
 36: 24,
 37: 336,
 38: 102,
 39: 141,
 40: 67,
 41: 55,
 42: 137,
 43: 171,
 44: 109,
 45: 262,
 46: 143,
 47: 70,
 48: 66,
 49: 328}

In [30]:
# Look the extremes up by class index (the dict key) rather than by position in the
# value list: the two only coincide while every class index is present in `balance`,
# which stops being true as soon as one class has no samples.
balanceList = list(balance.values())
max_idx = int(max(balance, key=balance.get))  # class index with the most samples
min_idx = int(min(balance, key=balance.get))  # class index with the fewest samples
max_idx,min_idx

(30, 36)

In [31]:
# First line: the class with the most samples; second line: the class with the fewest.
print(classes[max_idx], classes[min_idx], sep='\n')

Vincent_van_Gogh
Jackson_Pollock


# Balance the Data

### 1. Balance the sample using Random Over Sampling

Let's balance the data by randomly duplicating paintings from the minority classes until every class has as many samples as the majority class. (Random over-sampling repeats existing images; it does not generate new ones.)

In this case the max number of samples across classes is 877 paintings by Van Gogh 

In [7]:
# Fixed seed so the randomly duplicated samples are the same on every run.
over = over_sampling.RandomOverSampler(random_state=42)

In [8]:
# The resamplers expect a 2-D feature matrix, so store the paths as a single column.
X = np.reshape(np.array(x), (-1, 1))

In [9]:
# Resample the (path, label) pairs; the counts displayed in the next cell show every
# class ending up with the majority-class sample count.
X_resampled,y_resampled=over.fit_resample(X,y)

In [10]:
# Per-class sample counts after over-sampling, ordered by class index.
sorted(Counter(y_resampled).items())

[(0, 877),
 (1, 877),
 (2, 877),
 (3, 877),
 (4, 877),
 (5, 877),
 (6, 877),
 (7, 877),
 (8, 877),
 (9, 877),
 (10, 877),
 (11, 877),
 (12, 877),
 (13, 877),
 (14, 877),
 (15, 877),
 (16, 877),
 (17, 877),
 (18, 877),
 (19, 877),
 (20, 877),
 (21, 877),
 (22, 877),
 (23, 877),
 (24, 877),
 (25, 877),
 (26, 877),
 (27, 877),
 (28, 877),
 (29, 877),
 (30, 877),
 (31, 877),
 (32, 877),
 (33, 877),
 (34, 877),
 (35, 877),
 (36, 877),
 (37, 877),
 (38, 877),
 (39, 877),
 (40, 877),
 (41, 877),
 (42, 877),
 (43, 877),
 (44, 877),
 (45, 877),
 (46, 877),
 (47, 877),
 (48, 877),
 (49, 877)]

In [11]:
# Dataset size after over-sampling vs. the original number of images.
len(X_resampled),len(x)

(43850, 8446)

The over-sampled data `X_resampled` and `y_resampled` now has the majority-class sample count (877) for every class.

### 2. Balance the sample using Random *Under* Sampling

We can also balance the data by reducing every class to the size of the minority class, which in this case is 'Jackson_Pollock' with only 24 paintings:

In [12]:
# Fixed seed so the same samples are kept on every run.
under = under_sampling.RandomUnderSampler(random_state=42)

In [13]:
# The resamplers expect a 2-D feature matrix, so store the paths as a single column.
X = np.reshape(np.array(x), (-1, 1))

In [14]:
# Resample the (path, label) pairs; the counts displayed in the next cell show every
# class reduced to the minority-class sample count.
X_u,y_u=under.fit_resample(X,y)

In [15]:
# Per-class sample counts after under-sampling, ordered by class index.
sorted(Counter(y_u).items())

[(0, 24),
 (1, 24),
 (2, 24),
 (3, 24),
 (4, 24),
 (5, 24),
 (6, 24),
 (7, 24),
 (8, 24),
 (9, 24),
 (10, 24),
 (11, 24),
 (12, 24),
 (13, 24),
 (14, 24),
 (15, 24),
 (16, 24),
 (17, 24),
 (18, 24),
 (19, 24),
 (20, 24),
 (21, 24),
 (22, 24),
 (23, 24),
 (24, 24),
 (25, 24),
 (26, 24),
 (27, 24),
 (28, 24),
 (29, 24),
 (30, 24),
 (31, 24),
 (32, 24),
 (33, 24),
 (34, 24),
 (35, 24),
 (36, 24),
 (37, 24),
 (38, 24),
 (39, 24),
 (40, 24),
 (41, 24),
 (42, 24),
 (43, 24),
 (44, 24),
 (45, 24),
 (46, 24),
 (47, 24),
 (48, 24),
 (49, 24)]

In [16]:
# Dataset size after under-sampling vs. the original number of images.
len(X_u),len(x)

(1200, 8446)

The new data `X_u` and `y_u` now has the same number of samples across classes.

### 3. Balance the sample using torchsampler

Experimenting with a new sampler that I found on GitHub.

The claim is that it doesn't augment the data but rather samples it in a balanced way. That being said, it means we need to create the `DataLoader` and then iterate through it to see if it really works.

P.S it doesn't work with `TensorData`

In [32]:
# Peek at the first five image paths and their integer labels.
x[:5],y[:5]

(['/Users/Aymanjabri/notebooks/Artwork/data/images/images/Piet_Mondrian/Piet_Mondrian_38.jpg',
  '/Users/Aymanjabri/notebooks/Artwork/data/images/images/Piet_Mondrian/Piet_Mondrian_10.jpg',
  '/Users/Aymanjabri/notebooks/Artwork/data/images/images/Piet_Mondrian/Piet_Mondrian_11.jpg',
  '/Users/Aymanjabri/notebooks/Artwork/data/images/images/Piet_Mondrian/Piet_Mondrian_39.jpg',
  '/Users/Aymanjabri/notebooks/Artwork/data/images/images/Piet_Mondrian/Piet_Mondrian_13.jpg'],
 [0, 0, 0, 0, 0])

In [35]:
class ArtworkSet(Dataset):
    """Map-style dataset that loads artwork images from disk on demand.

    Args:
        x: image locations. Either a flat sequence of path strings (such as the
            list produced by ``glob``) or a column array of shape ``(n, 1)``
            (such as the arrays built with ``reshape(-1,1)`` for the resamplers).
        y: integer class label for every entry of ``x``.
        class_to_idx: mapping from class name to integer label.
        classes: list of class names.
        transform: optional callable applied to the RGB PIL image.

    ``Image`` (from PIL) must be importable in the enclosing namespace.
    """
    def __init__(self,x,y,class_to_idx,classes,transform=None):
        self.classes = classes
        self.class_to_idx = class_to_idx
        self.paths = x
        self.targets = y
        self.transform=transform

    def __len__(self):
        """Number of samples (one per label)."""
        return len(self.targets)

    def get_labels(self):
        """Return the label of every sample, in dataset order.

        NOTE(review): recent torchsampler releases appear to call this hook on
        custom datasets — confirm against the installed version.
        """
        return self.targets

    def _path_at(self,idx):
        """Return the file path of sample ``idx`` for both supported layouts of ``x``."""
        entry = self.paths[idx]
        # A flat list yields the path itself; indexing it with [0] would return only
        # its first character. A column array yields a 1-element row instead.
        if isinstance(entry,(str,os.PathLike)):
            return entry
        return entry[0]

    def __getitem__(self,idx):
        """Load sample ``idx`` and return ``(image, label)``."""
        label = self.targets[idx]
        # convert() loads the pixel data, so the file can be closed right away.
        # Converting every mode (not only greyscale) keeps the channel count at 3,
        # which batching in a DataLoader relies on.
        with Image.open(self._path_at(idx)) as source:
            img = source.convert('RGB')
        if self.transform is not None:
            img = self.transform(img)
        return img,label

In [36]:
# Wrap the original (unbalanced) paths and labels; no transform is applied.
trainset = ArtworkSet(x, y, class_to_idx=class_to_idx, classes=classes)

In [38]:
# NOTE(review): this cell raises NotImplementedError (see the output below). Presumably
# the installed torchsampler does not know how to read the labels of a custom Dataset
# — TODO confirm against the torchsampler version in use.
trainloader = DataLoader(trainset,batch_size=50,
                         sampler=ImbalancedDatasetSampler(trainset))

NotImplementedError: 

### 4. Adjust loss function weights

An easy way is to adjust the loss function so that it over-penalizes errors in predicting minority classes.

`CrossEntropyLoss` already has a built-in `weight` argument that we can use.

In [72]:
# Number of classes (one per class folder).
len(classes)

50

In [74]:
# Share of the dataset (in percent) that each class currently holds, indexed by label.
curr_weights = np.bincount(y) / len(y) * 100
curr_weights

array([ 0.99455363,  1.5155103 ,  8.31162681,  0.50911674,  3.44541795,
        1.1721525 ,  3.06654037,  0.58015629,  2.2969453 ,  3.01918068,
        1.06559318,  1.40895098,  2.14302628,  0.95903386,  1.03007341,
        0.36703765,  1.58654985,  2.22590575,  3.68221643,  0.86431447,
        2.82974189,  1.94174757,  0.95903386,  1.49183045,  0.55647644,
        1.0774331 ,  1.64574947,  0.8287947 ,  1.04191333,  1.38527113,
       10.38361354,  0.69855553,  2.28510538,  2.20222591,  1.42079091,
        5.19772673,  0.28415818,  3.97821454,  1.20767227,  1.66942932,
        0.79327492,  0.65119583,  1.62206962,  2.02462704,  1.29055174,
        3.10206015,  1.69310916,  0.8287947 ,  0.781435  ,  3.88349515])

In [77]:
# In a perfectly balanced dataset each class holds 100/len(classes) percent of the
# samples (2% for the 50 classes here). Dividing that target share by the current share
# gives a weight above 1 for under-represented classes and below 1 for over-represented ones.
weights = torch.tensor((100 / len(classes)) / curr_weights)
weights

tensor([2.0110, 1.3197, 0.2406, 3.9284, 0.5805, 1.7063, 0.6522, 3.4473, 0.8707,
        0.6624, 1.8769, 1.4195, 0.9333, 2.0854, 1.9416, 5.4490, 1.2606, 0.8985,
        0.5432, 2.3140, 0.7068, 1.0300, 2.0854, 1.3406, 3.5940, 1.8563, 1.2153,
        2.4131, 1.9195, 1.4438, 0.1926, 2.8631, 0.8752, 0.9082, 1.4077, 0.3848,
        7.0383, 0.5027, 1.6561, 1.1980, 2.5212, 3.0713, 1.2330, 0.9878, 1.5497,
        0.6447, 1.1813, 2.4131, 2.5594, 0.5150], dtype=torch.float64)

In [78]:
import torch.nn as nn
# `weights` was built from a float64 NumPy array, while a model's logits are float32 by
# default; the loss expects both in the same dtype, so cast the weights to float32.
loss_fn = nn.CrossEntropyLoss(weight=weights.float())