split for inova crohns vs cincinnati crohns

In [10]:
import pandas as pd
import numpy as np
import os
import shutil
import random
from sklearn import model_selection

In [11]:
def test_train_split_patches(source, dest = None, copy = False):
    
    '''
    Performs test train split on PATCHED images. Test size 0.25, train size 0.75.
    
    Inputs: 
        source: Path of directory where patched images are located
        dest: Path of directory to copy split images to
        copy: If true, copies the patched images over
        
    Output:
        dataframe with image label column, and test/train label column
    '''
    
    # Get list of participant IDs
    files = os.listdir(source) # get list of image files
    labels = [label.split('__')[0] for label in files] # extract only patient labels from files
    labels = np.unique(labels)
    
    # Test train split
    split = model_selection.train_test_split(labels)
    train, val = split[0], split[1]
    
    # Split image files into test, train
    train_files = []
    for label in train:
        train_files += [file for file in files if label in file]
    val_files = []
    for label in val:
        val_files += [file for file in files if label in file]
        
    train_df = pd.DataFrame(train_files, columns = ['image'])
    train_df['label'] = 'train'
    
    val_df = pd.DataFrame(val_files, columns = ['image'])
    val_df['label'] = 'val'
    
    
    # Extract group from path
    group = source.split('/')[-1]
    
    # Copy files to new directory
    if copy == True:
        # Copy files over
        for file in train_files:
            shutil.copy(source+file, dest+'train/'+group)
        print('Successfully copied files to', dest+'train/'+group)

        for file in val_files:
            shutil.copy(source+file, dest+'val/'+group)
        print('Successfully copied files to', dest+'val/'+group)
    
    out_df = pd.concat([train_df, val_df])
    
    return out_df

In [12]:
# Set path to images
inova_path = '/sfs/qumulo/qproject/GutIntelligenceLab/msds/data/INOVA/patched/Crohns'
cinc_path  = '/sfs/qumulo/qproject/GutIntelligenceLab/msds/data/Cincinnati/patched/Crohns'

# I was running into issues where the split included a '.ipynb_chekpoints' folder.
# Creates file list and removes the '.ipynb_chekpoints' folder
inova_imgs = [i for i in os.listdir(inova_path) if 'ipynb' not in i]
cinc_imgs  = [i for i in os.listdir(cinc_path) if 'ipynb' not in i]

np.random.seed(69)
inova_split = test_train_split_patches(inova_path)
cinc_split  = test_train_split_patches(cinc_path)

In [13]:
def test_train_split_WSIs(image_list, dest = None, copy = False):

    '''
    Performs test train split on unpatched whole slide images
    
    Inputs: 
        source: list of image filenames 
        dest: Path of directory to copy split images to
        copy: If true, copies the patched images over
    '''
    
    # Perform test train split on files
    split = model_selection.train_test_split(image_list)

    train_files = split[0]
    val_files = split[1]

    if copy == True:
        # Copy files over
        for file in train_files:
            shutil.copy(source+file, dest+'train/'+group)
        print('Successfully copied files to', dest+'train/'+group)

        for file in train_files:
            shutil.copy(source+file, dest+'val/'+group)
        print('Successfully copied files to', dest+'val/'+group)

    # Strip file extention from image files
    train_labels = [label.split('.')[0] for label in train_files] 
    val_labels = [label.split('.')[0] for label in val_files]
    
    # Put into dataframes
    train_df = pd.DataFrame(train_labels, columns = ['label'])
    train_df['set'] = 'train'
    
    val_df = pd.DataFrame(val_labels, columns = ['label'])
    val_df['set'] = 'val'
    
    out_df = pd.concat([train_df, val_df])
    
    return out_df

In [14]:
def extract_patches(wsi_labels, patch_labels):
    patch_list = []
    for i in wsi_labels:
        new = [patch for patch in patch_labels if i in patch]
        patch_list = patch_list + new
    return patch_list

### Perform test/train split on whole slide image labels

In [8]:
path = '/sfs/qumulo/qproject/GutIntelligenceLab/msds/data/INOVA/'
len(os.listdir(os.path.join(path, 'patched', 'Crohns')))

55448

In [15]:
# Set path to images
inova_path = '/sfs/qumulo/qproject/GutIntelligenceLab/msds/data/INOVA/unpatched/Crohns'
cinc_path1  = '/sfs/qumulo/qproject/GutIntelligenceLab/msds/data/Cincinnati/unpatched/B1'
cinc_path2  = '/sfs/qumulo/qproject/GutIntelligenceLab/msds/data/Cincinnati/unpatched/B2'
cinc_path3  = '/sfs/qumulo/qproject/GutIntelligenceLab/msds/data/Cincinnati/unpatched/B2_B3'
cinc_path4  = '/sfs/qumulo/qproject/GutIntelligenceLab/msds/data/Cincinnati/unpatched/B3'

# I was running into issues where the split included a '.ipynb_chekpoints' folder.
# Creates file list and removes the '.ipynb_chekpoints' folder
inova_imgs = [i for i in os.listdir(inova_path) if 'ipynb' not in i]
cinc_imgs1  = [i for i in os.listdir(cinc_path1) if 'ipynb' not in i]
cinc_imgs2  = [i for i in os.listdir(cinc_path2) if 'ipynb' not in i]
cinc_imgs3  = [i for i in os.listdir(cinc_path3) if 'ipynb' not in i]
cinc_imgs4  = [i for i in os.listdir(cinc_path4) if 'ipynb' not in i]
cinc_imgs = cinc_imgs1+cinc_imgs2+cinc_imgs3+cinc_imgs4

np.random.seed(69)
inova_split = test_train_split_WSIs(inova_imgs)
cinc_split  = test_train_split_WSIs(cinc_imgs)

In [16]:
len(inova_split)

260

In [17]:
len(cinc_split)

181

### Using split from WSIs, extract correct patched file labels

In [18]:
inova_train = list(inova_split[inova_split.set == 'train'].label)
inova_test = list(inova_split[inova_split.set == 'val'].label)
cinc_train = list(cinc_split[cinc_split.set == 'train'].label)
cinc_test = list(cinc_split[cinc_split.set == 'val'].label)

# Set paths
inova_path = '/sfs/qumulo/qproject/GutIntelligenceLab/msds/data/INOVA/patched/Crohns'
cinc_path  = '/sfs/qumulo/qproject/GutIntelligenceLab/msds/data/Cincinnati/patched/Crohns'

# Creates file list and removes the '.ipynb_chekpoints' folder
inova_patches = [i for i in os.listdir(inova_path) if 'ipynb' not in i]
cinc_patches  = [i for i in os.listdir(cinc_path) if 'ipynb' not in i]
cinc_patches  = random.sample(cinc_patches, len(inova_patches)) # to have equal dataset sizes

# Split the patches
inova_train_patches = extract_patches(inova_train, inova_patches)
inova_test_patches = extract_patches(inova_test, inova_patches)
cinc_train_patches = extract_patches(cinc_train, cinc_patches)
cinc_test_patches = extract_patches(cinc_test, cinc_patches)

# Put into dataframes
inova_train_df = pd.DataFrame(extract_patches(inova_train, inova_patches), columns = ['label'])
inova_train_df['loc'] = 0
inova_train_df['set'] = 'train'
inova_val_df =  pd.DataFrame(extract_patches(inova_test, inova_patches), columns = ['label'])
inova_val_df['loc'] = 0
inova_val_df['set'] = 'val'
cinc_train_df =  pd.DataFrame(extract_patches(cinc_train, cinc_patches), columns = ['label'])
cinc_train_df['loc'] = 1
cinc_train_df['set'] = 'train'
cinc_val_df =  pd.DataFrame(extract_patches(cinc_test, cinc_patches), columns = ['label'])
cinc_val_df['loc'] = 1
cinc_val_df['set'] = 'val'

inova_vs_cinc_train = pd.concat([inova_train_df,cinc_train_df])
inova_vs_cinc_val = pd.concat([inova_val_df,cinc_val_df])

### Export to CSV

In [22]:
out = '/sfs/qumulo/qproject/GutIntelligenceLab/bwl3xy/data/inova_vs_cinc_crohns'
os.chdir(out)

inova_vs_cinc_train.to_csv(os.path.join(out,'inova_vs_cinc_train.csv'), index = False, header=False)
inova_vs_cinc_val.to_csv(os.path.join(out,'inova_vs_cinc_val.csv'), index = False, header = False)
# inova_vs_cinc.to_csv(os.path.join(out,'inova_vs_cinc.csv'), index = False)
# inova_train_df.to_csv(os.path.join(out,'inova_train.csv'), index = False)
# inova_val_df.to_csv('inova_val.csv', index = False)
# cinc_train_df.to_csv('cinc_train.csv', index = False)
# cinc_val_df.to_csv('cinc_val.csv', index = False)

### Import back in to do some data exploration

In [3]:
os.chdir('/sfs/qumulo/qproject/GutIntelligenceLab/bwl3xy/data/inova_vs_cinc_crohns/')

In [4]:
train = pd.read_csv('inova_vs_cinc_train.csv', header=None)
val = pd.read_csv('inova_vs_cinc_val.csv', header=None)

In [5]:
inova_train = len(train[train[1] == 0])/(len(train[train[1] == 0])+len(train[train[1] == 1]))
inova_val = len(val[val[1] == 0])/(len(val[val[1] == 0])+len(val[val[1] == 1]))
cinc_train = len(train[train[1] == 1])/(len(train[train[1] == 0])+len(train[train[1] == 1]))
cinc_val = len(val[val[1] == 1])/(len(val[val[1] == 0])+len(val[val[1] == 1]))

In [7]:
train[train[1]==0]

Unnamed: 0,0,1,2
0,INCR0077_A_003__5120_12288.jpg,0,train
1,INCR0077_A_003__14336_9216.jpg,0,train
2,INCR0077_A_003__13312_9728.jpg,0,train
3,INCR0077_A_003__2560_9728.jpg,0,train
4,INCR0077_A_003__9728_5632.jpg,0,train
...,...,...,...
2538,INCR0080_1_D_003__3080_7680.jpg,0,train
2539,INCR0080_1_D_003__10248_12800.jpg,0,train
2540,INCR0080_1_D_003__10760_8192.jpg,0,train
2541,INCR0080_1_D_003__10248_7168.jpg,0,train
