# Data preparation

In [2]:
from pathlib import Path
import numpy as np 
import pandas as pd
import pickle
from skimage import io
from tqdm.notebook import tqdm
# Register the progress_apply / progress_map methods on pandas objects.
# pandas() is called on the class: instantiating tqdm() first would create an
# empty, never-closed progress bar whose only effect is a stray "0it" output.
tqdm.pandas()
import os

0it [00:00, ?it/s]

## 1. Extracting paths & metadata into a dataframe

In [3]:
# Extract and store all image paths into a list
def store_image_paths(input_path = Path.cwd()/'input/breast-histopathology-images'):
    """Collect the paths of all PNG files stored two levels below ``input_path``.

    Files are matched with the glob pattern ``*/*/*.png`` relative to
    ``input_path``.

    Parameters
    ----------
    input_path : str or pathlib.Path
        Root directory to search. The default is built from the working
        directory at the time this function is defined.

    Returns
    -------
    list of pathlib.Path
        The matching paths in sorted order (empty if nothing matches).
    """
    # glob() yields entries in an arbitrary, filesystem-dependent order. Sorting
    # makes the list - and any row index later derived from it - identical from
    # run to run, which seeded sampling on that index relies on.
    return sorted(Path(input_path).glob('*/*/*.png'))


# Extract all metadata into a dictionary
def store_metadata(image_paths) -> dict:
    """Parse the metadata encoded in each image file name.

    File names are expected to have five underscore-separated fields, e.g.
    ``10253_idx5_x1001_y1001_class0.png``: patient id, an ``idx`` field
    (not used), x coordinate, y coordinate and class.

    Parameters
    ----------
    image_paths : sequence of str or pathlib.Path
        Paths of the image files. Must support ``len()``, which is used as the
        progress-bar total.

    Returns
    -------
    dict
        Column-oriented dictionary with the keys ``'path'``, ``'patient_id'``,
        ``'x_coord'``, ``'y_coord'`` and ``'target'``. ``'path'`` holds the
        inputs unchanged; every other value is kept as a string.

    Raises
    ------
    IndexError
        If a file name has fewer fields than the layout described above.
    """
    path_data = {'path':[],'patient_id':[],'x_coord':[] ,'y_coord':[],'target':[]}
    for image_path in tqdm(image_paths, total = len(image_paths)):

        # Path.name applies the path separator of the running OS, so no manual
        # '/' vs '\\' switch is needed. The split gives a list such as
        # ['10253', 'idx5', 'x1001', 'y1001', 'class0.png']
        fields = Path(image_path).name.split('_')

        path_data['path'].append(image_path)
        path_data['patient_id'].append(fields[0])
        path_data['x_coord'].append(fields[2][1:])  # drop the leading 'x'
        path_data['y_coord'].append(fields[3][1:])  # drop the leading 'y'
        path_data['target'].append(fields[4][5])    # the character right after 'class'

    return path_data

In [4]:
# Gather every image path, then parse the metadata encoded in the file names
image_paths = store_image_paths()
path_data = store_metadata(image_paths)

# Build the dataframe; the parsed fields are strings, so cast them to integers
df_total = pd.DataFrame(path_data).astype(
    dict.fromkeys(["patient_id", "x_coord", "y_coord", "target"], int)
)

  0%|          | 0/277524 [00:00<?, ?it/s]

In [5]:
# Print a summary of the dataframe: number of rows, columns, dtypes, non-null counts and memory usage
df_total.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 277524 entries, 0 to 277523
Data columns (total 5 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   path        277524 non-null  object
 1   patient_id  277524 non-null  int64 
 2   x_coord     277524 non-null  int64 
 3   y_coord     277524 non-null  int64 
 4   target      277524 non-null  int64 
dtypes: int64(4), object(1)
memory usage: 10.6+ MB


## 2. Extracting patch arrays

In [6]:
def get_img_arrays(df):
    """Load every image listed in ``df['path']`` and attach it as a flat array.

    Two columns are added to ``df`` **in place**:

    * ``img_array``   - the array returned by ``io.imread`` for the row's path,
      flattened to one dimension. No grayscale option is passed to ``imread``,
      so this call does not request a grayscale conversion.
    * ``array_shape`` - the number of elements in that flattened array.

    ``progress_apply`` is used for both steps, so tqdm's pandas integration
    must have been registered before calling this function.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``'path'`` column of readable image paths.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, now holding the two extra columns.
    """
    # Only the 'path' column is needed, so apply over that Series instead of
    # building a row object for every record with axis=1.
    df['img_array'] = df['path'].progress_apply(lambda image_path: io.imread(image_path).flatten())
    # Length of each flattened array; later cells filter on this value.
    df['array_shape'] = df['img_array'].progress_apply(lambda flat_array: flat_array.shape[0])
    return df

# get_img_arrays adds its columns to the frame it receives and returns that same
# object, so df_img and df_total are two names for one dataframe
df_img = get_img_arrays(df = df_total)

  0%|          | 0/277524 [00:00<?, ?it/s]

  0%|          | 0/277524 [00:00<?, ?it/s]

## 3. Removing outliers

In [7]:
# get all images that do not have an array shape of 7500
weird_imgs = df_img[df_img['array_shape'] != 7500] 

# drop images using indices of the filter
df_img.drop(weird_imgs.index,inplace=True) 

## 4. Selecting sample for model training

In [8]:
# quantity of cancerous and healthy data to be used for model training
size_cancerous_sample = 75000
size_healthy_sample = 75000

np.random.seed(0)
cancerous_indexes = np.random.choice(df_img[df_img['target']==1].index, size=size_cancerous_sample, replace=False)
healthy_indexes = np.random.choice(df_img[df_img['target']==0].index, size=size_healthy_sample, replace=False)
all_indexes = np.concatenate((cancerous_indexes, healthy_indexes))

sample_df = df_img.loc[all_indexes,:]

## 5. Saving data as a pickle file

In [10]:
# to_pickle does not create missing parent directories; make sure ./data exists
# so the save cannot fail after the long-running cells above have completed
Path("./data").mkdir(parents=True, exist_ok=True)
sample_df.to_pickle("./data/images.pkl")