In [1]:
import numpy as np
import os
import matplotlib.pyplot as plt
import glob
from sklearn.utils import shuffle
from sklearn.model_selection import KFold
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder

In [2]:
# Dataset location and class names.
# DATASET_PATH is the dataset root; every entry of LABELS is used as a
# sub-folder name under it when the image files are collected.
# NOTE(review): "Abnotmalities" looks like a misspelling of "Abnormalities", but
# this string is a real path — only change it together with the folder on disk.
DATASET_PATH = "LungAbnotmalities_Dataset"
LABELS = ["Cardiomegaly", "Nodule_Mass", "Pneumothorax"]

In [3]:
def getAllFilePath(PATH, labels=None, test_counts=None):
    """Collect per-class ``.jpg`` paths and split them into train and test sets.

    For every class, the files found under ``PATH/<label>`` (searched
    recursively) are sorted by path. The first ``test_counts[label]`` paths
    become test samples and the remainder become training samples, so the
    split is deterministic for a given directory content.

    Parameters
    ----------
    PATH : str
        Dataset root directory containing one sub-folder per class.
    labels : sequence of str, optional
        Class sub-folder names, processed in the given order. Defaults to the
        notebook-level ``LABELS``.
    test_counts : dict, optional
        Number of images to hold out for testing, keyed by label. Defaults to
        40 / 40 / 37 for Cardiomegaly / Nodule_Mass / Pneumothorax.

    Returns
    -------
    train_images_path, train_labels, test_images_path, test_labels : np.ndarray
        Four 1-D arrays. Each ``*_labels`` array is aligned element-wise with
        the matching ``*_images_path`` array.

    Raises
    ------
    ValueError
        If a label has no entry in ``test_counts``.
    """
    if labels is None:
        labels = LABELS
    if test_counts is None:
        test_counts = {
            "Cardiomegaly": 40,
            "Nodule_Mass": 40,
            "Pneumothorax": 37,
        }

    train_paths, train_lbls = [], []
    test_paths, test_lbls = [], []

    for label in labels:
        # Fail loudly instead of silently reusing the previous label's count.
        if label not in test_counts:
            raise ValueError("No test-set size configured for label {!r}".format(label))
        n_test = test_counts[label]

        # Match on the extension itself; a substring test would also pick up
        # names such as "scan.jpg.bak".
        label_paths = sorted(
            os.path.join(root, filename)
            for root, _dirs, filenames in os.walk(PATH + "/" + label)
            for filename in filenames
            if filename.endswith(".jpg")
        )

        # First n_test sorted paths are held out; everything else is training data.
        held_out, remaining = label_paths[:n_test], label_paths[n_test:]
        test_paths.extend(held_out)
        test_lbls.extend([label] * len(held_out))
        train_paths.extend(remaining)
        train_lbls.extend([label] * len(remaining))

    # Convert once at the end instead of re-allocating arrays on every label.
    train_images_path = np.array(train_paths).reshape(-1)
    test_images_path = np.array(test_paths).reshape(-1)
    train_labels = np.array(train_lbls).reshape(-1)
    test_labels = np.array(test_lbls).reshape(-1)

    return train_images_path, train_labels, test_images_path, test_labels


# Build the fixed train/test split: image paths plus their class labels.
train_images_path, train_labels, test_images_path, test_labels = getAllFilePath(DATASET_PATH)

In [4]:
# Sanity check: number of training image paths collected.
train_images_path.shape

(468,)

In [5]:
def encodeLabels(data_labels):
    """Encode class labels as ordinal codes and as one-hot vectors.

    Parameters
    ----------
    data_labels : array-like
        Class labels, either 1-D or shaped ``(n_samples, 1)``. The input is
        treated as a single label column.

    Returns
    -------
    all_data_labels_ordinal : np.ndarray, shape (n_samples, 1)
        Output of ``OrdinalEncoder.fit_transform`` on the label column.
    all_data_labels_onehot : np.ndarray, shape (n_samples, n_classes)
        Dense output of ``OneHotEncoder.fit_transform`` on the label column.
    """
    # Both encoders expect a 2-D column; reshape once so they always see the
    # same input (1-D input previously broke the ordinal encoder only).
    label_column = np.asarray(data_labels).reshape(-1, 1)

    # Ordinal Encoder
    all_data_labels_ordinal = OrdinalEncoder().fit_transform(label_column)

    # One-Hot Encoder. The default encoder returns a sparse matrix; densifying
    # it here avoids the `sparse=` keyword, which newer scikit-learn releases
    # no longer accept (it was renamed to `sparse_output=`).
    all_data_labels_onehot = OneHotEncoder().fit_transform(label_column).toarray()

    return all_data_labels_ordinal, all_data_labels_onehot

# Encode train and test labels in one pass so both share the same
# class -> code mapping, then cut the encoded arrays at the train/test boundary.
num_train_labels = len(train_labels)
all_data_labels = np.concatenate((train_labels, test_labels), axis=0).reshape(-1, 1)
all_data_labels_ordinal, all_data_labels_onehot = encodeLabels(all_data_labels)

# Rows [0, num_train_labels) are training samples; the rest are test samples.
train_rows = slice(0, num_train_labels)
test_rows = slice(num_train_labels, None)

# Ordinal codes come back as a column; flatten them to 1-D.
train_labels_ord = all_data_labels_ordinal[train_rows].ravel()
test_labels_ord = all_data_labels_ordinal[test_rows].ravel()

# One-hot rows keep their 2-D (n_samples, n_classes) layout.
train_labels_onehot = all_data_labels_onehot[train_rows]
test_labels_onehot = all_data_labels_onehot[test_rows]



In [6]:
# Shuffle Data Train
# A fixed seed keeps the shuffle - and therefore every K-Fold sheet derived
# from it - identical on each Restart & Run All.
# Note: this cell overwrites its inputs, so re-running it on its own applies
# the permutation a second time.
SHUFFLE_SEED = 42
train_images_path, train_labels, train_labels_ord, train_labels_onehot = shuffle(
    train_images_path,
    train_labels,
    train_labels_ord,
    train_labels_onehot,
    random_state=SHUFFLE_SEED,
)

### Initialize the pandas Excel writer

In [7]:
# All splits are written to a single workbook inside the dataset folder.
# The writer is created here and stays open across the following cells; the
# workbook is saved when writer_excel.close() is called in a later cell.
excel_filename = DATASET_PATH + "/Split_LungAbnotmalities_Dataset.xlsx"
writer_excel = pd.ExcelWriter(excel_filename, engine='xlsxwriter')

# K-Fold Process

In [8]:

def _build_split_frame(images_path, labels, labels_ord, labels_onehot):
    """Assemble one data split into the DataFrame layout used for every sheet."""
    columns = {
        "images_path": images_path,
        "labels": labels,
        "labels_ordinal": labels_ord,
    }
    # One column per one-hot position, numbered from 1 to match the sheet schema.
    for class_idx in range(labels_onehot.shape[1]):
        column_name = "labels_one_hot_state_{}".format(class_idx + 1)
        columns[column_name] = labels_onehot[:, class_idx]
    return pd.DataFrame(columns)


def kfold_save_excel(n_splits):
    """Write K-Fold train/validation splits of the training data to Excel.

    Reads the notebook-level arrays ``train_images_path``, ``train_labels``,
    ``train_labels_ord`` and ``train_labels_onehot`` and writes two sheets per
    fold, ``fold_<i>_train`` and ``fold_<i>_val`` (``i`` starting at 1), into
    the notebook-level ``writer_excel``. The writer is not closed here.

    Parameters
    ----------
    n_splits : int
        Number of folds, passed to ``KFold``.

    Notes
    -----
    ``KFold`` is used without shuffling, so the folds are contiguous slices of
    the arrays in their current order.
    """
    kf = KFold(n_splits=n_splits)

    for fold, (train_idx, val_idx) in enumerate(kf.split(train_images_path), start=1):
        # Split Data into Train and Validation
        train_df = _build_split_frame(
            train_images_path[train_idx],
            train_labels[train_idx],
            train_labels_ord[train_idx],
            train_labels_onehot[train_idx],
        )
        val_df = _build_split_frame(
            train_images_path[val_idx],
            train_labels[val_idx],
            train_labels_ord[val_idx],
            train_labels_onehot[val_idx],
        )
        # Save each DataFrame to its own sheet, named after the fold number.
        train_df.to_excel(writer_excel, sheet_name="fold_{}_train".format(fold))
        val_df.to_excel(writer_excel, sheet_name="fold_{}_val".format(fold))

### Save Train and Validation (K-Fold) Dataset to Excel

In [9]:
# Write 5 folds: sheets fold_1_train / fold_1_val through fold_5_train / fold_5_val.
kfold_save_excel(5)

### Save Test Dataset to Excel

In [10]:
# Sanity check: number of held-out test samples.
test_labels_ord.shape

(117,)

In [11]:
# Held-out test split, laid out with the same columns as the fold sheets:
# path, label, ordinal code, then one column per one-hot position.
test_df = pd.DataFrame(
    {
        "images_path": test_images_path,
        "labels": test_labels,
        "labels_ordinal": test_labels_ord,
    }
).assign(
    labels_one_hot_state_1=test_labels_onehot[:, 0],
    labels_one_hot_state_2=test_labels_onehot[:, 1],
    labels_one_hot_state_3=test_labels_onehot[:, 2],
)

# The test split gets its own sheet in the shared workbook.
test_df.to_excel(writer_excel, sheet_name="test")

### Save All Datasets to Excel

In [12]:
# Final step: close the writer so the workbook with all fold and test sheets
# is saved and the file handle is released.
writer_excel.close()

In [13]:
# Inspect the training labels in their current (shuffled) order.
train_labels

array(['Cardiomegaly', 'Nodule_Mass', 'Pneumothorax', 'Pneumothorax',
       'Cardiomegaly', 'Nodule_Mass', 'Cardiomegaly', 'Nodule_Mass',
       'Cardiomegaly', 'Pneumothorax', 'Cardiomegaly', 'Pneumothorax',
       'Nodule_Mass', 'Pneumothorax', 'Nodule_Mass', 'Nodule_Mass',
       'Pneumothorax', 'Cardiomegaly', 'Pneumothorax', 'Nodule_Mass',
       'Cardiomegaly', 'Cardiomegaly', 'Cardiomegaly', 'Pneumothorax',
       'Cardiomegaly', 'Nodule_Mass', 'Cardiomegaly', 'Pneumothorax',
       'Nodule_Mass', 'Cardiomegaly', 'Cardiomegaly', 'Nodule_Mass',
       'Nodule_Mass', 'Pneumothorax', 'Pneumothorax', 'Pneumothorax',
       'Cardiomegaly', 'Nodule_Mass', 'Cardiomegaly', 'Cardiomegaly',
       'Cardiomegaly', 'Cardiomegaly', 'Pneumothorax', 'Pneumothorax',
       'Cardiomegaly', 'Pneumothorax', 'Nodule_Mass', 'Nodule_Mass',
       'Nodule_Mass', 'Nodule_Mass', 'Pneumothorax', 'Cardiomegaly',
       'Cardiomegaly', 'Nodule_Mass', 'Cardiomegaly', 'Pneumothorax',
       'Nodule_Mass', '

In [14]:
# Image class from the PIL package, used to preview one training sample
from PIL import Image

# Index of the training sample to preview
sample_idx = 50
sample_path = train_images_path[sample_idx]

# Map a .npy path back to its source image; a .jpg path passes through unchanged
im = Image.open(sample_path.replace('.npy', '.jpg').replace('npy_images', 'images'))

im.show()
print(sample_path)
print(train_labels[sample_idx])

LungAbnotmalities_Dataset/Pneumothorax/images/00025787_050.jpg
Pneumothorax
