In [2]:
# Imports

# File handling
import os
import pathlib
from zipfile import ZipFile
import splitfolders

# Visualization
import  matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Data and maths
import numpy as np
import pandas as pd
import random

# ML
import tensorflow as tf
from tensorflow import keras
from keras.utils import image_dataset_from_directory
from keras.preprocessing.image import ImageDataGenerator

from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
#from keras.callbacks import CSVLogger # if problems with SAVING model

# Metrics
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [5]:
def data_split(dataframes, val_split=0.1, random_state=None):
    '''
    (list, float, int) -> df

    Takes a list of df's (dataframes) and a float as input. Returns 1 df combining all df's with a new 'SPLIT' column,
    indicating whether a row is marked for training ('train'), testing ('test') or validation ('valid').

    dataframes: list of dataframes; each df is split on its own, so the proportions are applied per df
    val_split: desired portion of validation data, between 0 and 1 (default = 0.1)
    random_state: optional int seed (or np.random.RandomState) that makes the split reproducible
                  (default = None: a different split on every call)

    Returns a new df with the rows grouped as train, then test, then valid, shuffled within each group and
    re-indexed from 0. The input dataframes are not modified. An empty list gives an empty df that only has
    the 'SPLIT' column.

    Raises ValueError if val_split is not between 0 and 1.

    Note: the train-test split is a constant 80/20 of all data not in the validation set, no matter the value of val_split.
    '''
    if not 0 <= val_split <= 1:
        raise ValueError(f'val_split must be between 0 and 1, got {val_split!r}')

    # One generator for the whole call, so a single seed fixes every shuffle below.
    if isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    train_share = 0.8 # portion of the non-validation rows that goes to training
    pieces = {'train': [], 'test': [], 'valid': []}

    for df in dataframes:
        df = df.sample(frac=1, random_state=rng).reset_index(drop=True) # shuffled copy, input untouched
        n_val = int(round(val_split * len(df)))
        n_train = int(train_share * (len(df) - n_val)) # rounded down; the remainder goes to test

        # The rows are already in random order, so consecutive slices are random subsets.
        # Slicing with the exact counts (no "- 1") keeps the requested validation size and,
        # when n_val is 0, avoids a negative slice end that would select all but the last row.
        pieces['valid'].append(df.iloc[:n_val])
        pieces['train'].append(df.iloc[n_val:n_val + n_train])
        pieces['test'].append(df.iloc[n_val + n_train:])

    labelled = []
    for label in ('train', 'test', 'valid'):
        # Concatenate once per set instead of growing a frame inside the loop above.
        non_empty = [piece for piece in pieces[label] if len(piece) > 0]
        if non_empty:
            subset = pd.concat(non_empty, ignore_index=True)
            labelled.append(subset.sample(frac=1, random_state=rng).assign(SPLIT=label))

    if not labelled:
        return pd.DataFrame({'SPLIT': pd.Series(dtype='object')})

    return pd.concat(labelled, ignore_index=True) # combine all 3 sets back in 1 df

In [7]:
# Load the metadata sheet of every class and add 2 columns containing the file paths of its images and masks.

def load_class_metadata(class_name):
    '''
    (str) -> df

    Reads 'data/<class_name>.metadata.xlsx' and returns it with two extra columns, PATH_IMAGES and PATH_MASKS,
    built from the class folder name and the 'FILE NAME' column of the sheet.

    class_name: name of the class; used both for the metadata file name and for the class folder in the paths
    '''
    # Build the path with pathlib so the notebook does not depend on Windows-style backslashes.
    metadata = pd.read_excel(pathlib.Path('data') / f'{class_name}.metadata.xlsx')

    # NOTE(review): these paths start with '/' while the metadata is read from the relative 'data' folder —
    # confirm that the leading slash is intended before using the columns to open files.
    return metadata.assign(
        PATH_IMAGES=lambda x: '/data/image_data/' + class_name + '/' + x['FILE NAME'],
        PATH_MASKS=lambda x: '/data/mask_data/' + class_name + '/' + x['FILE NAME'],
    )

normal_df = load_class_metadata('Normal')
covid_df = load_class_metadata('COVID')
opacity_df = load_class_metadata('Lung_Opacity')
pneumonia_df = load_class_metadata('Viral Pneumonia')

# One-off export of the split table; kept commented out so re-running the notebook does not overwrite data.csv.
#df = data_split([normal_df, covid_df, opacity_df, pneumonia_df], 0.1)[['FILE NAME', 'PATH_IMAGES', 'PATH_MASKS', 'SPLIT']]
#df.to_csv('data.csv', index=False)

In [8]:
# Load the split table from data.csv (presumably the file written by the commented-out export step — confirm).
df = pd.read_csv('data.csv')
# Bare expression on the last line so the notebook renders the frame as the cell output.
df

Unnamed: 0,FILE NAME,PATH_IMAGES,PATH_MASKS,SPLIT
0,NORMAL-4744,/data/image_data/Normal/NORMAL-4744,/data/mask_data/Normal/NORMAL-4744,train
1,Lung_Opacity-186,/data/image_data/Lung_Opacity/Lung_Opacity-186,/data/mask_data/Lung_Opacity/Lung_Opacity-186,train
2,COVID-867,/data/image_data/COVID/COVID-867,/data/mask_data/COVID/COVID-867,train
3,COVID-647,/data/image_data/COVID/COVID-647,/data/mask_data/COVID/COVID-647,train
4,NORMAL-7502,/data/image_data/Normal/NORMAL-7502,/data/mask_data/Normal/NORMAL-7502,train
...,...,...,...,...
21160,Lung_Opacity-2794,/data/image_data/Lung_Opacity/Lung_Opacity-2794,/data/mask_data/Lung_Opacity/Lung_Opacity-2794,valid
21161,NORMAL-5298,/data/image_data/Normal/NORMAL-5298,/data/mask_data/Normal/NORMAL-5298,valid
21162,NORMAL-5276,/data/image_data/Normal/NORMAL-5276,/data/mask_data/Normal/NORMAL-5276,valid
21163,COVID-986,/data/image_data/COVID/COVID-986,/data/mask_data/COVID/COVID-986,valid


In [13]:
# image data: split the files under data/image_data/ into the "images" output folder.
# No ratio or seed is passed, so splitfolders' defaults apply.
# NOTE(review): move=True presumably relocates the source files instead of copying them, which would make this
# cell non-repeatable without restoring data/image_data/ first — confirm against the split-folders docs.
splitfolders.ratio(
    'data/image_data/',
    output="images",
    move=True,
)

Copying files: 21165 files [01:19, 266.50 files/s]


In [14]:
# mask data: split the files under data/mask_data/ into the "masks" output folder.
# No ratio or seed is passed, so splitfolders' defaults apply.
# NOTE(review): nothing in this call ties the mask split to the image split; matching assignments presumably
# rely on the default seed and identical file names in both trees — verify before pairing images with masks.
splitfolders.ratio(
    'data/mask_data/',
    output="masks",
    move=True,
)

Copying files: 21165 files [01:11, 294.94 files/s]
