# Step 1: Preprocessing
This notebook creates the dataset folders (i.e., Train, Valid, and Test) and copies (or moves) the files from the original dataset into them.
<br>
<br>
**Requirements**
- Download the dataset into the same folder ("unZipDatafolder").
- Unzip every zip file.
- Remove the zip files such that there are only unzipped folders left.

## Setup

In [None]:
## Import required libraries
import os
import pandas as pd
from pathlib import Path
import shutil

## Remember the directory the notebook is running from; every output path
## below is built relative to it
cwd = os.getcwd()
print(f'Current working directory = {cwd}')

In [None]:
# Names of the output folders
dataFolder = 'data'  # root folder that holds the three split folders
trainFolder, validFolder, testFolder = 'Train', 'Valid', 'Test'
# Names of the sub-folders created inside each split folder
subFolders = [
    '1_esophagus',
    '2_stomach',
    '3_small_bowel',
    '4_colon',
]

In [None]:
# Location of the original (unzipped) dataset, plus the CSV files that list
# the files belonging to the training, validation, and testing sets
unZipDatafolder = r'E:\Brown_GI_VCE_Dataset\Unzipped_images' # Insert your folder name in this line for unzipped dataset
csvFolder = os.path.join(cwd, 'csv')  # the three list files sit in <cwd>/csv
listPathTrain = os.path.join(csvFolder, 'path_train.csv')
listPathValid = os.path.join(csvFolder, 'path_valid.csv')
listPathTest = os.path.join(csvFolder, 'path_test.csv')
for csvPath in (listPathTrain, listPathValid, listPathTest):
    print(csvPath)

In [None]:
# Build the paths of the data folder (under the current working directory)
# and of the train/valid/test folders inside it
dataPath = os.path.join(cwd, dataFolder)
trainPath, validPath, testPath = (
    os.path.join(dataPath, splitName)
    for splitName in (trainFolder, validFolder, testFolder)
)
print(f'Path for dataFolder = {dataPath}')
print(f'Path for trainFolder = {trainPath}')
print(f'Path for validFolder = {validPath}')
print(f'Path for testFolder = {testPath}')

## Create folders and sub-folders
Create the training, validation, and testing folders, each with the same set of sub-folders, so the dataset can be separated into the three sets.

In [None]:
# Create folders
def createFolder(folderName, folderPath):
    """Create the directory ``folderPath`` only if it does not exist yet.

    Prints a one-line status message that starts with ``folderName``.

    Args:
        folderName: Label used in the printed message only.
        folderPath: Path of the directory to create.

    Raises:
        FileExistsError: If ``folderPath`` exists but is not a directory.
    """
    if os.path.isdir(folderPath):
        print(folderName +' already exists.')
        return
    # makedirs (instead of mkdir) also creates missing parent directories, so
    # the call no longer fails when the parent folder has not been made yet;
    # exist_ok covers the directory appearing between the check and the call.
    os.makedirs(folderPath, exist_ok=True)
    print(folderName +' has been created.')
        
# Create sub-folders
def createSubfolders(subFolders, mainfolderPath):
    """Create each named sub-folder directly under ``mainfolderPath``.

    Every name in ``subFolders`` is joined onto ``mainfolderPath`` and handed
    to ``createFolder``, which creates it only if it does not exist yet.
    """
    for subName in subFolders:
        createFolder(subName, os.path.join(mainfolderPath, subName))
        
# The data folder comes first because the split folders live inside it
createFolder('dataFolder', dataPath)
# Each split folder is created and then filled with its sub-folders
for splitLabel, splitPath in (
    ('trainFolder', trainPath),
    ('validFolder', validPath),
    ('testFolder', testPath),
):
    createFolder(splitLabel, splitPath)
    createSubfolders(subFolders, splitPath)

## Copy (or move) files into the created folders

In [None]:
# Copy/move files to training folder
# The CSV is read for its 'organ' and 'path' columns, one row per file.
dfTrain = pd.read_csv(listPathTrain)
n_dfTrain = len(dfTrain)
for organ, relPath in zip(dfTrain['organ'], dfTrain['path']):
    # Collapse the '/<organ>/' part of the path into '-', so the file name
    # keeps what precedes it as a prefix and duplicated filenames are avoided
    # (presumably paths look like <sample_id>/<organ>/<file> - verify against the CSV)
    flatName = relPath.replace('/' + organ + '/', '-')
    srcFile = Path(unZipDatafolder, relPath)
    dstFile = Path(trainPath, organ, flatName)
    shutil.copy(srcFile, dstFile) # can change to shutil.move for moving the files instead of copying them

In [None]:
# Copy/move files to validation folder
# The CSV is read for its 'organ' and 'path' columns, one row per file.
dfValid = pd.read_csv(listPathValid)
n_dfValid = len(dfValid)
for organ, relPath in zip(dfValid['organ'], dfValid['path']):
    # Collapse the '/<organ>/' part of the path into '-', so the file name
    # keeps what precedes it as a prefix and duplicated filenames are avoided
    # (presumably paths look like <sample_id>/<organ>/<file> - verify against the CSV)
    flatName = relPath.replace('/' + organ + '/', '-')
    print(flatName)
    srcFile = Path(unZipDatafolder, relPath)
    dstFile = Path(validPath, organ, flatName)
    shutil.copy(srcFile, dstFile) # can change to shutil.move for moving the files instead of copying them

In [None]:
# Copy/move files to testing folder
# The test CSV has no header row; its first column holds the sample folder names.
dfTest = pd.read_csv(listPathTest, header=None)
testSamples = dfTest[0].values.tolist()

for sampleId in testSamples:
    for organ in subFolders:
        print(sampleId, organ)
        # Every file found in <dataset>/<sample>/<organ> is copied
        organFiles = os.listdir(os.path.join(unZipDatafolder, sampleId, organ))
        print(organFiles)
        for fileName in organFiles:
            srcFile = os.path.join(unZipDatafolder, sampleId, organ, fileName)
            # Prefix the file name with the sample name to avoid duplicated filenames
            prefixedName = sampleId + '-' + fileName
            dstFile = os.path.join(testPath, organ, prefixedName)
            print(srcFile)
            print(dstFile)
            shutil.copy(srcFile, dstFile) # can change to shutil.move for moving the files instead of copying them