In [1]:
import numpy as np
import pandas as pd
import torch
from matplotlib import pyplot as plt
from mpl_toolkits.axes_grid1 import ImageGrid
import seaborn as sns
from tqdm import tqdm
from torch import nn
from sklearn.metrics import accuracy_score
from umap import UMAP

In [2]:
%matplotlib inline

In [3]:
parameters = context.catalog.load('parameters')

2020-12-25 14:17:20,021 - kedro.io.data_catalog - INFO - Loading data from `parameters` (MemoryDataSet)...


In [4]:
cv_results = context.catalog.load('cv_results')

2020-12-25 14:17:20,063 - kedro.io.data_catalog - INFO - Loading data from `cv_results` (PickleDataSet)...


In [8]:
print(f"Accuracy: {cv_results['summary']['test_accuracy_mean']:0.4f} +- {cv_results['summary']['test_accuracy_std']:0.4f}")

Accuracy: 0.8746 +- 0.0038


In [None]:
print(f"F1-score: {cv_results['f1_score_mean']:0.4f} +- {cv_results['f1_score_std']:0.4f}")

In [5]:
train_lmdb = context.catalog.load('train_lmdb')

2020-12-23 16:00:39,682 - kedro.io.data_catalog - INFO - Loading data from `train_lmdb` (KedroImageLMDBDataSet)...


In [15]:
labels = train_lmdb.labels
sources = train_lmdb.sources

In [16]:
pd.Series(train_lmdb.labels).value_counts(normalize=True, sort=False)

0    0.056623
1    0.132007
2    0.114575
3    0.587118
4    0.109676
dtype: float64

In [17]:
original_class_distribution = pd.Series(train_lmdb.labels[train_lmdb.sources == 'train_2020']).value_counts(normalize=True, sort=False).values
original_class_distribution

array([0.05075952, 0.10231362, 0.11152138, 0.61500351, 0.12040196])

In [85]:
pd.Series(train_lmdb.labels).value_counts(normalize=True, sort=False)[[0, 1, 2, 3, 4]].values

array([0.05662312, 0.13200668, 0.11457542, 0.58711834, 0.10967644])

In [86]:
indices_2020 = np.argwhere(sources == 'train_2020')
indices_2019 = np.argwhere(sources == 'train_2019')

len(indices_2020), len(indices_2019)

(21395, 4937)

In [87]:
probs = np.array([original_class_distribution[labels[i]] for i in indices_2019])

In [88]:
probs

array([[0.05075952],
       [0.05075952],
       [0.05075952],
       ...,
       [0.12040196],
       [0.12040196],
       [0.12040196]])

In [89]:
probs = probs / probs.sum()

In [90]:
resampled_indices_2019 = np.random.choice(indices_2019, len(indices_2019), p=probs)

ValueError: a must be 1-dimensional

In [91]:
len(resampled_indices_2019) == len(indices_2019)

True

In [92]:
pd.Series(labels[resampled_indices_2019]).value_counts(normalize=True, sort=False)[[0, 1, 2, 3, 4]]

0    0.013774
1    0.072919
2    0.040916
3    0.851327
4    0.021065
dtype: float64

In [93]:
from sklearn.model_selection import StratifiedKFold

In [94]:
cv = StratifiedKFold(n_splits=parameters['cv_splits'], random_state=parameters['seed'])



In [95]:
np.random.shuffle(indices_2020)

In [96]:
indices_2020

array([[20035],
       [12244],
       [13863],
       ...,
       [ 5527],
       [13207],
       [19278]])

In [103]:
splits = []
for (train_2019_idx, test_2019_idx), (train_2020_idx, test_2020_idx) in zip(cv.split(indices_2019, labels[indices_2020][:len(indices_2019)]),
                                                                            cv.split(indices_2020, labels[indices_2020])):
    train_idx = np.concatenate([train_2019_idx, train_2020_idx])
    test_idx = test_2020_idx

    splits.append((train_idx, test_idx))

In [105]:
splits[0]

(array([ 1171,  1179,  1184, ..., 21392, 21393, 21394]),
 array([   0,    1,    2, ..., 5576, 5582, 5595]))

In [106]:
pd.Series(labels[splits[0][0]]).value_counts(normalize=True, sort=False)[[0, 1, 2, 3, 4]]

0    0.051448
1    0.101529
2    0.111302
3    0.614645
4    0.121076
dtype: float64

In [107]:
pd.Series(labels[splits[0][1]]).value_counts(normalize=True, sort=False)[[0, 1, 2, 3, 4]]

0    0.047672
1    0.101701
2    0.114414
3    0.616751
4    0.119462
dtype: float64