# Load Datasets

In [1]:
import pandas as pd

# Build one labelled frame (`df`) from the train and test label files.
# Each row keeps the originating split in `folder`, the image id and its level.
label_cols = ['folder', 'image', 'level']

# A few images are dropped from each label file. The reason is not recorded in
# this notebook (presumably unreadable image files; TODO confirm).
# `~isin(...)` is the idiomatic negation, and `.assign` returns a new frame so
# the `folder` column is never written onto a filtered slice
# (which raises SettingWithCopyWarning).
df_train = pd.read_csv('../data/trainLabels.csv')
df_train = df_train[~df_train.image.isin(['492_right'])].assign(folder='train')

# We already have the targets for the test set, so we use them as well.
df_test = pd.read_csv('../data/testLabels.csv')
df_test = df_test[~df_test.image.isin(['25313_right', '27096_right'])].assign(folder='test')

# ignore_index=True gives the combined frame a clean 0..n-1 index instead of
# repeating the row labels of the two source files.
df = pd.concat([df_train[label_cols], df_test[label_cols]], ignore_index=True)



# Calc Balance

In [2]:
import numpy as np
from sklearn.utils import class_weight

# Balanced weight per level for the full labelled set.
# `classes` and `y` are passed by keyword: recent scikit-learn releases accept
# them as keyword-only arguments, and the keyword form also works on older
# releases. `np.unique` returns the sorted distinct levels as an ndarray, which
# is the type `classes` is documented to take.
class_w = class_weight.compute_class_weight(
    class_weight='balanced',
    classes=np.unique(df.level),
    y=df.level,
)

# Balance the Dataset

In [3]:
# Display the balanced weights computed above (one entry per level, in sorted level order).
class_w

array([0.27149569, 2.85895246, 1.34882908, 8.50014375, 9.26844305])

Apenas a classe 0 tem peso menor que 1, sendo assim, iremos diminuir seu volume aplicando downsample e vamos manter as demais classes sem alteração.
Será necessário recalcular os pesos para a utilização no hiperparâmetro class_weight do Keras.

In [4]:
from sklearn.utils import resample

# Target row count per level: balanced weight times the current level count.
# (Per scikit-learn's documented 'balanced' formula this is the same value for
# every level: total rows / number of levels.)
level_counts = df.level.value_counts().sort_index().values
n_samples = class_w * level_counts

# Downsample level 0 without replacement to its target size.
# Upsampling levels 1-4 with replacement was considered but is not applied;
# those levels are kept unchanged.
is_level0 = df.level == 0
df0 = resample(
    df[is_level0],
    replace=False,
    n_samples=int(n_samples[0]),
    random_state=2019,
)
df_others = df[~is_level0]

df_balanced = pd.concat([df0, df_others])

# Split Data into Train & Test

In [8]:
from sklearn.model_selection import train_test_split

# 70/30 split of the balanced frame; the fixed random_state keeps the split repeatable.
df_train, df_test = train_test_split(
    df_balanced,
    test_size=0.3,
    random_state=1407,
)

# Report the size of each split (train first, then test).
for split_frame in (df_train, df_test):
    print(split_frame.shape)

(28767, 3)
(12330, 3)


# Calc Balance for Train Data

Estes pesos precisam ser colocados no hiperparâmetro class_weight do Keras.

In [12]:
import numpy as np
from sklearn.utils import class_weight

# Balanced weights recomputed on the training split only; these are the values
# meant for Keras' class_weight hyperparameter.
# `classes` and `y` are passed by keyword because recent scikit-learn releases
# accept them as keyword-only arguments; `np.unique` supplies the sorted
# distinct levels as an ndarray.
train_class_w = class_weight.compute_class_weight(
    class_weight='balanced',
    classes=np.unique(df_train.level),
    y=df_train.level,
)

train_class_w

array([0.46182373, 1.33303985, 0.62618633, 3.90590631, 4.31936937])

# Move Files to Train & Test Dirs

In [9]:
import os
import shutil

# Move every image of the training split from ../data/all/<level>/ into
# ../data/train/<level>/, keeping the '<folder>_<image>.jpeg' file name.
for _, row in df_train.iterrows():
    level_dir = str(row.level)
    file_name = row.folder + '_' + row.image + '.jpeg'
    dst_dir = os.path.join('../data/train', level_dir)
    # Create the level folder on first use so the move does not fail when the
    # destination tree has not been prepared by hand.
    os.makedirs(dst_dir, exist_ok=True)
    shutil.move(os.path.join('../data/all', level_dir, file_name),
                os.path.join(dst_dir, file_name))
    

In [10]:
import os
import shutil

# Move every image of the test split from ../data/all/<level>/ into
# ../data/test/<level>/, keeping the '<folder>_<image>.jpeg' file name.
for _, row in df_test.iterrows():
    level_dir = str(row.level)
    file_name = row.folder + '_' + row.image + '.jpeg'
    dst_dir = os.path.join('../data/test', level_dir)
    # Create the level folder on first use so the move does not fail when the
    # destination tree has not been prepared by hand.
    os.makedirs(dst_dir, exist_ok=True)
    shutil.move(os.path.join('../data/all', level_dir, file_name),
                os.path.join(dst_dir, file_name))

# Create a Small Dataset to Train

In [2]:
import os
import random
import shutil

# Copy a fixed-size random subset of every level folder of the train and test
# splits into ../data/small_train and ../data/small_test.
base_dir = '../data/'
sample_size = 200          # images copied per level and split
rng = random.Random(2019)  # dedicated seeded generator: same subset on every run

for d in ['train', 'test']:
    for c in range(0, 5):
        src_dir = os.path.join(base_dir, d, str(c))
        dst_dir = os.path.join(base_dir, 'small_' + d, str(c))
        os.makedirs(dst_dir, exist_ok=True)
        # os.listdir returns entries in arbitrary order; sorting makes the
        # seeded sample reproducible across machines and runs.
        files = sorted(os.listdir(src_dir))
        # random.sample raises ValueError when asked for more items than exist,
        # so take at most the number of files available.
        sample = rng.sample(files, min(sample_size, len(files)))
        for f in sample:
            shutil.copyfile(os.path.join(src_dir, f), os.path.join(dst_dir, f))