In [8]:
import pandas as pd
import numpy as np
import datetime

# used to access folder structures
import os

# used to open images
import PIL

# Graphs, visualizations
import matplotlib.pyplot as plt
from IPython.display import display, HTML
import scipy

import pickle
from tqdm import tqdm


from sklearn.model_selection import train_test_split

In [3]:
base_path = "IDC_regular_ps50_idx5/"
folder = os.listdir(base_path)
print("No. of Patients total:",len(folder))

# Each entry of `folder` is treated as one patient directory that holds two
# class subfolders named 0 and 1; add up the number of files in all of them.
total_images = sum(
    len(os.listdir(base_path + patient_id + '/' + str(label) + '/'))
    for patient_id in folder
    for label in (0, 1)
)

print("Total Images in dataset: ", total_images )

No. of Patients total: 279
Total Images in dataset:  277524


In [4]:
# Build two dataframes from the folder layout <base_path>/<patient_id>/<class>/<image>:
#   data        - one row per image patch: patient id, path to the image, target label
#   patientData - one row per patient: patient id, number of positive and negative patches
#
# Rows are collected in plain Python lists and each dataframe is constructed once
# at the end. Writing single cells through chained indexing
# (data.iloc[k]["path"] = ...) is not guaranteed by pandas to reach the original
# frame (it silently does nothing under copy-on-write) and is very slow for
# hundreds of thousands of rows. As a side effect the count/label columns now
# get an integer dtype instead of object.
patch_rows = []
patient_rows = []

# Iterate over all patients (1 folder = 1 patient)
for patient_id in folder:
    patient_path = base_path + patient_id

    # Number of patches found per class for this patient (0 = negative, 1 = positive)
    patch_counts = {0: 0, 1: 0}

    # Iterate over the two subfolders with the negative and positive patches
    for c in [0, 1]:
        class_path = patient_path + "/" + str(c) + "/"
        subfiles = os.listdir(class_path)
        patch_counts[c] = len(subfiles)

        # One row per image in the subfolder
        for image_name in subfiles:
            patch_rows.append((patient_id, class_path + image_name, c))

    patient_rows.append((patient_id, patch_counts[1], patch_counts[0]))

data = pd.DataFrame(patch_rows, columns=["patient_id", "path", "target"])
patientData = pd.DataFrame(patient_rows, columns=["patient_id", "nrPos", "nrNeg"])

In [5]:
# Cache both dataframes on disk as pickle files so they can be reloaded
# without walking the image folders again.
for frame, pickle_name in ((data, 'data.pkl'), (patientData, 'patientData.pkl')):
    frame.to_pickle(pickle_name)

In [9]:
# Load every image patch listed in `data` into memory.
#   X_data  - list with one (50, 50, 3) array per patch
#   y_data  - list with the matching target label per patch
#   resized - number of patches whose shape was not (50, 50, 3) on disk
X_data = []
y_data = []
resized = 0

# Walk the two needed columns directly instead of data.iterrows(), which
# builds a new Series object for every single row.
patches = zip(data['path'], data['target'])

for path, target in tqdm(patches, total=data.shape[0]):
    # The context manager closes the underlying image file deterministically
    # instead of leaving that to the garbage collector.
    with PIL.Image.open(path) as image:
        npImage = np.asarray(image)

        # Resize images with format different than our 50x50 patches
        if npImage.shape != (50, 50, 3):
            resized += 1
            # convert("RGB") guarantees three channels; resize() alone only
            # changes width and height and would keep e.g. a grayscale patch
            # two-dimensional, leaving X_data ragged.
            npImage = np.asarray(image.convert("RGB").resize((50, 50)))

    X_data.append(npImage)
    y_data.append(target)


# The arrays built here are only used to report the stacked shapes;
# X_data and y_data themselves stay plain lists.
print('X_data shape: ', np.array(X_data).shape)
print('y_data shape: ', np.array(y_data).shape)

print('In total %d patches had to be resized, since the format differed from 50x50'%resized)

100%|█████████████████████████████████████████████████████████████████████████| 277524/277524 [44:47<00:00, 103.26it/s]


X_data shape:  (277524, 50, 50, 3)
y_data shape:  (277524,)
In total 2302 patches had to be resized, since the format differed from 50x50


In [10]:
# Persist the loaded patches and their labels as pickle files.
# Each file is opened in a `with` block so it is flushed and closed as soon as
# its dump finishes; without that the handles are never closed explicitly and
# the last file may stay open (and partially buffered) for the kernel lifetime.
with open("x.pickle", "wb") as open_file:
    pickle.dump(X_data, open_file)

with open("y.pickle", "wb") as open_file:
    pickle.dump(y_data, open_file)

In [13]:
# Preview only the first patch; echoing the whole list would print every
# loaded array into the cell output. Slicing keeps this safe for an empty list.
X_data[:1]

[array([[[226, 164, 206],
         [224, 154, 196],
         [225, 175, 211],
         ...,
         [240, 221, 237],
         [232, 184, 214],
         [243, 213, 235]],
 
        [[217, 142, 188],
         [221, 130, 179],
         [224, 150, 196],
         ...,
         [227, 170, 204],
         [229, 180, 215],
         [236, 212, 232]],
 
        [[237, 178, 212],
         [229, 157, 199],
         [218, 125, 175],
         ...,
         [221, 184, 217],
         [190, 153, 193],
         [227, 164, 208]],
 
        ...,
 
        [[217, 145, 192],
         [214, 129, 184],
         [212, 129, 183],
         ...,
         [194, 122, 185],
         [204, 143, 193],
         [189, 129, 188]],
 
        [[218, 144, 192],
         [213, 128, 185],
         [208, 121, 171],
         ...,
         [136,  79, 145],
         [184, 111, 174],
         [188, 112, 176]],
 
        [[212, 125, 181],
         [211, 136, 181],
         [220, 162, 206],
         ...,
         [127,  90, 152],
  