# In [None]:
import glob
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from PIL import Image, ImageOps

# Class sub-folders; a folder's position in this list is used as its integer label.
folders = [
    'Uninfected',
    'Parasitized',
]

# Dataset root: the 'cell_images' directory inside the current working directory.
img_dir = os.path.join(os.getcwd(), 'cell_images')

def _load_images(entries, img_dir, folders, size=(50, 50)):
    """Load, greyscale, pad and normalise the images listed in *entries*.

    Args:
        entries: iterable of ``(filename, label)`` pairs; ``label`` indexes
            into *folders* to find the sub-folder that holds the file.
        img_dir: root directory containing the class sub-folders.
        folders: class sub-folder names, indexed by label.
        size: ``(width, height)`` of the padded output image.

    Returns:
        ``(images, labels)`` where ``images`` is a list of 2-D float arrays
        scaled to ``[0, 1]`` and ``labels`` is the matching list of labels.
    """
    images = []
    labels = []
    for fname, label in entries:
        # the context manager closes the file handle as soon as we are done
        with Image.open(os.path.join(img_dir, folders[label], fname)) as img:
            # greyscale and pad while this is still a PIL image; both
            # operations are PIL-only and fail on a NumPy array
            grey = img.convert('L')
            padded = ImageOps.pad(grey, size, color='black', centering=(0.5, 0.5))
        # normalise last: 8-bit grey values 0..255 -> floats 0..1
        images.append(np.array(padded) / 255.)
        labels.append(label)
    return images, labels


# splitting value eg. 80 (train), input folder path, output *.csv tables
def load_train_test(split_ratio, img_dir, folders):
    """Split the images of each class folder into train/test sets and load them.

    For every folder in *folders* the ``*.png`` files are shuffled with
    NumPy's global RNG (call ``np.random.seed`` beforehand for a reproducible
    split) and divided per class, so both sets keep the class proportions.
    The folder's position in *folders* is its integer label.

    Side effects: writes the partition as headerless ``filename,label`` rows
    to ``test.csv`` and ``train.csv`` in the current working directory, and
    prints two summary counts.

    Args:
        split_ratio: percentage (0-100) of each class that goes to training,
            e.g. ``80`` for an 80/20 train/test split.
        img_dir: root directory containing one sub-folder per class.
        folders: class sub-folder names; at least two are expected by the
            summary output.

    Returns:
        ``(x_train, y_train, x_test, y_test)``: the ``x_*`` lists hold 50x50
        greyscale images as float arrays in ``[0, 1]``; the ``y_*`` lists hold
        the matching integer labels.

    Raises:
        ValueError: if *split_ratio* is outside ``[0, 100]``.
    """
    if not 0 <= split_ratio <= 100:
        raise ValueError(
            'split_ratio must be a percentage between 0 and 100, got %r' % (split_ratio,)
        )

    list_test = []
    list_train = []

    for label, folder in enumerate(folders):
        # sort first so that a seeded shuffle does not depend on glob order
        fnames = sorted(
            os.path.basename(path)
            for path in glob.glob(os.path.join(img_dir, folder, '*.png'))
        )

        # number of files of this class that go to the test set
        test_split = int(round(len(fnames) * (100 - split_ratio) / 100))

        np.random.shuffle(fnames)

        list_test.extend((fn, label) for fn in fnames[:test_split])
        list_train.extend((fn, label) for fn in fnames[test_split:])

    # save the partition as csv; explicit columns keep an empty split
    # two-column instead of producing a shapeless frame
    columns = ['filename', 'label']
    pd.DataFrame(list_test, columns=columns).to_csv('test.csv', index=False, header=False)
    pd.DataFrame(list_train, columns=columns).to_csv('train.csv', index=False, header=False)

    # load images and labels
    x_train, y_train = _load_images(list_train, img_dir, folders)
    x_test, y_test = _load_images(list_test, img_dir, folders)

    # NOTE(review): the first line reports the class-0 count of the TRAINING
    # set, the second the class-1 count of the TEST set. Output is kept as it
    # was; confirm whether both were meant to describe the same set.
    print(folders[0]+':'+str(y_train.count(0)))
    print(folders[1]+':'+str(y_test.count(1)))

    return x_train, y_train, x_test, y_test
    

# Build the 80/20 train/test partition and load both sets:
#   - the partition is saved to the csv tables train.csv and test.csv
#   - images and labels are loaded
#   - rgb images are converted to greyscale
#   - images are padded and downsized to 50x50 (wxh)
x_train, y_train, x_test, y_test = load_train_test(
    split_ratio=80,
    img_dir=img_dir,
    folders=folders,
)