# Data extraction

If this is being run locally and not on the kaggle platform, please run `data_download.sh`.

In [1]:
from os import walk, listdir
from os.path import isfile, join

# this will only work after deleting the zip file.
for (dirpath, dirnames, filenames) in walk('./data'):
    patients = dirnames
    break
print (patients)
    
files = []
for i in patients:
    for j in ['/0/', '/1/']:
        path = './data/' + i + j
        these_files = [(path+f) for f in listdir(path) if isfile(join(path, f))]
        files += these_files

['16555', '9174', '14078', '12870', '10260', '12947', '12898', '10257', '12949', '13106', '12868', '10279', '9135', '14153', '9125', '14209', '9124', '12930', '12900', '9261', '9291', '10300', '13689', '8914', '9035', '9321', '14210', '14190', '9255', '9250', '15513', '10306', '10301', '9319', '9075', '15839', '15472', '14191', '12911', '9076', '16014', '8975', '13462', '12948', '8918', '8980', '16568', '13022', '9177', '13591', '8867', '12929', '12907', '12891', '9181', '10261', '13404', '14156', '12884', '12895', '13459', '8955', '14079', '12826', '15902', '15634', '16896', '12910', '12242', '12931', '9381', '9322', '13617', '9344', '14189', '12877', '14154', '12905', '12751', '9029', '8950', '8959', '8917', '9173', '13023', '15840', '10274', '12876', '9176', '9265', '12822', '13458', '9037', '15510', '9347', '12818', '13402', '12951', '12901', '12892', '12750', '15471', '10253', '9258', '9324', '9382', '12955', '9383', '9228', '12626', '12749', '12810', '12954', '13692', '9290', '10

In [2]:
import numpy as np
from matplotlib.image import imread

In [3]:
imgs_0 = []
imgs_1 = []
for i in files:
    if int(i[-5]) == 0:
        imgs_0.append(imread(i))
    else:
        imgs_1.append(imread(i))
imgs_0 = np.array(imgs_0)
imgs_1 = np.array(imgs_1)

In [4]:
imgs_0.shape, imgs_1.shape

((198738,), (78786,))

## On data selection
This dataset is very unbalanced. Efforts made to:
- idc vs. non-idc is a 50/50 split
- train vs. test is a 65/35 split

### 50/50 split of IDC vs Non-IDC

In [6]:
non_idc_n = imgs_0.shape[0]
n_idc_n = imgs_1.shape[0]
non_idc_to_delete = imgs_0.shape[0] - imgs_1.shape[0]
# not inspected I know, but the dataset shows more non-idc than idc

In [7]:
print(np.random.choice(imgs_0.shape[0], non_idc_to_delete, replace = False).shape)

(119952,)


In [8]:
idx = np.random.choice(imgs_0.shape[0], non_idc_to_delete, replace = False)
imgs_0 = np.delete(imgs_0, idx, axis=0)
imgs_0.shape

(78786,)

### 65/35 split of training and test

In [9]:
# shuffle and split
# shuffle
np.random.shuffle(imgs_0)
np.random.shuffle(imgs_1)

# split
split_idx = int(np.floor(0.65 * imgs_0.shape[0]))  # idc and non idc share the same size

train_0 = imgs_0[:split_idx]
train_1 = imgs_1[:split_idx]

test_0 = imgs_0[split_idx:]
test_1 = imgs_1[split_idx:]

In [10]:
(train_0.shape, test_0.shape, train_1.shape, test_1.shape)

((51210,), (27576,), (51210,), (27576,))

Cool! We have the dataset prepared (do not read 'cleaned') and wrapped in numpy arrays. Now we only have to create one-hot encodings.

### Formatting for `tensorflow`

In [12]:
train_0_labels = np.zeros(train_0.shape[0])
test_0_labels = np.zeros(test_0.shape[0])

train_1_labels = np.ones(train_1.shape[0])
test_1_labels = np.ones(test_1.shape[0])

train = np.append(train_0, train_1, axis=0)
train_labels = np.append(train_0_labels, train_1_labels, axis=0)

test = np.append(test_0, test_1, axis=0)
test_labels = np.append(test_0_labels, test_1_labels, axis=0)

def shuffle(a, b):
    permutation = np.random.permutation(a.shape[0])
    return a[permutation], b[permutation]

train, train_labels = shuffle(train, train_labels)
test, test_labels = shuffle(test, test_labels)

def get_one_hot(targets, nb_classes):
    # https://stackoverflow.com/a/42874726
    return np.eye(nb_classes)[np.array(targets.astype(np.int32)).reshape(-1)]

train_labels = get_one_hot(train_labels, 2)
test_labels = get_one_hot(test_labels, 2)

In [13]:
import pickle
with open('dataset.npys', 'wb') as handle:
    pickle.dump((train, train_labels, test, test_labels), handle)

MemoryError: 