In this notebook, we gather all the OASIS-1 data but keep only the entries that explicitly have CDR==0.
The code in this notebook will uncompress the tar.gz files and drop the uncompressed folders into scratch folders.
It will also delete each tar.gz file, since it is no longer needed once its uncompressed contents are available. Additionally, the uncompressed folders whose ID is not in the CDR==0 list are deleted.

In [None]:
# Import standard Python distribution libraries
import tarfile
import os
import re
import subprocess #To run 'wget' shell command from within Python natively
import pickle

# Additional libraries 
import pandas as pd
from tqdm import tqdm

In [None]:
# Recover the base paths (home, scratch, shared) under which all files are stored.
# They were saved to this pickle file in Step 0.

with open('pickledHomeScratchShared.pickle', "rb") as pickleFile:
    storedPaths = pickle.load(pickleFile)
# The pickle holds exactly three entries, in this order.
baseHomePath, baseScratchPath, baseSharedPath = storedPaths

In [None]:
# Download OASIS-1 data labels into a CSV and load into Pandas dataframe
labelsCsvPath = '{}/milestone_II_project/data/oasis_labelled_data/oasis_1_labelled_data.csv'.format(baseHomePath)
# check=True makes a failed download raise here, instead of surfacing later
# as a confusing CSV parsing error on an empty or partial file.
subprocess.run(
    ["wget", "-O", labelsCsvPath, 'https://www.oasis-brains.org/files/oasis_cross-sectional.csv'],
    check=True,
)
oasis_1_datatable = pd.read_csv(labelsCsvPath)

# Filter once and reuse the result for both the preview and the ID list.
cdrZeroRows = oasis_1_datatable[oasis_1_datatable['CDR'] == 0]
# Display Images which have CDR==0
display(cdrZeroRows)
# Store list of the IDs (kept as a NumPy array, as before)
cdrZeroList = cdrZeroRows['ID'].values

# Create the tar_gz_files directory if it does not exist
tarGzDir = "{}/data/tar_gz_files".format(baseScratchPath)
if not os.path.exists(tarGzDir):
    print('tar_gz_files does not exist.Creating directory...')
    # Same effect as `mkdir -p`, without spawning a shell command.
    os.makedirs(tarGzDir, exist_ok=True)

# Create list of tar.gz files to download
oasis1_targz_files = ['oasis_cross-sectional_disc{}.tar.gz'.format(i) for i in range(1, 13)]  # 12 files in total
oasis1_targz_files

In [None]:
# Function to find all tar.gz files in a directory and then uncompress them to another directory.
def uncompressTar(datadir):
    """Extract every ``*tar.gz`` archive found under *datadir* into *datadir*.

    The directory tree rooted at *datadir* is walked recursively. For each
    archive, the common leading path of all its members is taken as the
    archive's base directory. If ``datadir/<base directory>`` already exists,
    the archive is assumed to be extracted already and is skipped; otherwise
    it is extracted into *datadir*.

    Args:
        datadir: Directory that is searched for archives and that also
            receives the extracted content.

    Returns:
        A list with one base-directory name per archive found (in walk
        order), whether or not that archive was extracted on this call. The
        name is an empty string when the members share no leading directory.
    """
    tarGzPattern = re.compile(r"^.+tar\.gz$")  # raw string: "\." is a regex escape, not a string escape
    baseDirs = []
    for root, _dirs, files in os.walk(datadir):
        for file in files:
            if not tarGzPattern.match(file):
                continue
            fullFilePath = os.path.join(root, file)  # Get full file path
            # The context manager closes the archive even if reading or
            # extracting raises.
            with tarfile.open(fullFilePath, 'r:gz') as tar:
                memberNames = tar.getnames()
                # commonpath compares whole path components. commonprefix
                # compares characters, so "top/a.txt" and "top/ab.txt" would
                # give the bogus base "top/a".
                try:
                    baseDir = os.path.commonpath(memberNames)
                except ValueError:
                    # Empty archive, or a mix of absolute and relative names:
                    # there is no common base directory.
                    baseDir = ""
                baseDir = baseDir.rstrip("/")  # Remove trailing slashes from the base directory
                baseDirs.append(baseDir)
                uncompressedDirPath = datadir + '/' + baseDir
                if os.path.isdir(uncompressedDirPath):  # if folder exists, do not try and uncompress and simply warn user
                    print("The folder {} exists. Hence skipping uncompressing".format(uncompressedDirPath))
                else:
                    print("The folder does not exist. Proceeding with uncompressing")
                    # NOTE(review): extractall trusts the member paths inside the
                    # archive; only use this on archives from a trusted source.
                    tar.extractall(datadir)
    return baseDirs

### Download all the OASIS-1 files and only keep those that are needed (i.e. CDR == 0).

In [None]:
# Code to get all the tar.gz files, uncompress them and only keep the ones which are in the labels we need.
# A set gives constant-time membership tests; it is built once, outside the loops.
cdrZeroIds = set(cdrZeroList)
for tarFile in tqdm(oasis1_targz_files):
    saveFilePath = "{}/data/tar_gz_files/{}".format(baseScratchPath, tarFile)
    # 1. Download the tar.gz file (skipped if it already exists) and uncompress it
    if not os.path.exists(saveFilePath):
        url = "https://download.nrg.wustl.edu/data/{}".format(tarFile)
        result = subprocess.run(["wget", "-O", saveFilePath, url])
        if result.returncode != 0:
            # A failed transfer can leave an empty or partial output file.
            # Remove it so a re-run downloads again instead of treating the
            # broken file as already downloaded.
            if os.path.exists(saveFilePath):
                os.remove(saveFilePath)
            raise RuntimeError(
                "Download of {} failed (wget exit code {})".format(url, result.returncode)
            )
    baseDirs = uncompressTar("{}/data".format(baseScratchPath))  # Function returns base directories where uncompressed files will save to
    # 2. Code to iterate through base directories where tar files landed and delete those not needed
    for baseDir in baseDirs:
        if not baseDir:
            # An empty base directory would make folderPath the data root itself,
            # and the pruning below would delete tar_gz_files and every other disc.
            continue
        folderPath = "{}/data/{}".format(baseScratchPath, baseDir)
        mriImgFolders = [f for f in os.listdir(folderPath) if not os.path.isfile(os.path.join(folderPath, f))]
        for mriImgFolder in mriImgFolders:  # Iterate through each mriImgFolder and remove files not needed
            if mriImgFolder not in cdrZeroIds:  # Remove those folders where CDR not equal to zero.
                subprocess.run(["rm", "-rf", folderPath + '/' + mriImgFolder])
    # 3. Remove the tar.gz file to clean up space before next file is downloaded
    if os.path.exists(saveFilePath):
        os.remove(saveFilePath)