In this notebook, we will gather all the OASIS-2 data.
The code in this notebook will uncompress the tar.gz files and drop the uncompressed folders into scratch folders.
It will also clean up any tar.gz files since they are no longer needed once the uncompressed files are available. 

In [1]:
# Import standard Python distribution libraries
import tarfile
import os
import re
import subprocess
import pickle

# Additional libraries 
import pandas as pd
from tqdm import tqdm

### Download each of the OASIS-2 tar.gz files 
This data is provided directly by the OASIS website, following the instructions at [this link](https://www.oasis-brains.org/#data). Once there, navigate to 'OASIS-2'; its 'Download instructions' reference the links used below.

In [2]:
# Recover the base paths (home, scratch, shared) under which files will be stored.
# They are read back from the pickle file created in Step 0.
# NOTE: pickle.load can execute arbitrary code, so only load a pickle file you created yourself.

with open('pickledHomeScratchShared.pickle', "rb") as f:
    unpickledPaths = pickle.load(f)

# The pickle holds exactly three entries, unpacked in this order.
baseHomePath, baseScratchPath, baseSharedPath = unpickledPaths

In [3]:
# Create the tar_gz_files folder (where the downloaded archives are saved) if it does not exist.
# os.makedirs is used instead of shelling out to `mkdir -p`: it needs no external
# binary and raises an OSError when the folder cannot be created, whereas the exit
# status of a subprocess call made without check=True is silently ignored.
tarGzDir = "{}/data/tar_gz_files".format(baseScratchPath)
if not os.path.exists(tarGzDir):
    print('tar_gz_files does not exist.Creating directory...')
    # exist_ok tolerates the folder appearing between the check above and this call.
    os.makedirs(tarGzDir, exist_ok=True)

In [4]:
# Function to find all tar.gz files in a directory tree and uncompress them into that directory.
def uncompressTar(datadir):
    """Extract every ``.tar.gz`` archive found under ``datadir`` into ``datadir``.

    The directory tree rooted at ``datadir`` is walked and each file whose name
    ends in ``.tar.gz`` is opened. The archive's base directory is the deepest
    path shared by all of its members. If that directory already exists inside
    ``datadir`` the archive is assumed to be extracted already and is skipped;
    otherwise the archive is extracted into ``datadir``.

    Parameters
    ----------
    datadir : str
        Directory that is searched (recursively) for archives and that
        receives the extracted content.

    Returns
    -------
    list of str
        One base directory (relative to ``datadir``) per archive found, in the
        order the archives were visited. The entry is an empty string when the
        members of an archive share no common directory.

    Raises
    ------
    tarfile.TarError
        If a matching file cannot be read as a gzip-compressed tar archive, or
        if the extraction filter rejects a member.
    """
    baseDirs = list()
    for root, dirs, files in os.walk(datadir):
        for file in files:
            if not file.endswith(".tar.gz"):
                continue
            fullFilePath = os.path.join(root, file)  # Get full file path
            # The context manager closes the archive even if reading or extracting fails.
            with tarfile.open(fullFilePath, 'r:gz') as tar:
                memberNames = tar.getnames()  # Names of all the 'members' in the archive
                # commonpath compares whole path components; a character-wise
                # prefix would turn "top/a1" and "top/a2" into the bogus "top/a".
                try:
                    baseDir = os.path.commonpath(memberNames) if memberNames else ""
                except ValueError:
                    # Raised when absolute and relative member names are mixed.
                    baseDir = ""
                baseDirs.append(baseDir)
                uncompressedDirPath = os.path.join(datadir, baseDir)
                # An empty baseDir would make the path equal to datadir itself,
                # which always exists, so it must never count as "already extracted".
                if baseDir and os.path.isdir(uncompressedDirPath):
                    print("The folder {} exists. Hence skipping uncompressing".format(uncompressedDirPath))
                else:
                    print("The folder does not exist. Proceeding with uncompressing")
                    if hasattr(tarfile, "data_filter"):
                        # Refuse members that would be written outside datadir.
                        tar.extractall(datadir, filter="data")
                    else:
                        # Older Python without extraction filters.
                        tar.extractall(datadir)
    return baseDirs

### Download all the OASIS-2 files and uncompress

In [None]:
# Download each tar.gz archive (if needed), uncompress it, then delete the archive.
tarGzDir = "{}/data/tar_gz_files".format(baseScratchPath)
for tarFile in tqdm(['OAS2_RAW_PART1.tar.gz', 'OAS2_RAW_PART2.tar.gz']):
    saveFilePath = "{}/{}".format(tarGzDir, tarFile)
    # 1. Download the tar.gz file (skipped if it already exists) and uncompress it
    if not os.path.exists(saveFilePath):
        url = "https://download.nrg.wustl.edu/data/{}".format(tarFile)
        # Download under a temporary name and rename only on success. Otherwise an
        # interrupted download would leave a truncated archive behind that the
        # existence check above would treat as complete on the next run.
        partialFilePath = saveFilePath + ".part"
        # check=True raises CalledProcessError when wget exits with an error.
        result = subprocess.run(["wget", "-O", partialFilePath, url], check=True)
        os.replace(partialFilePath, saveFilePath)
    baseDirs = uncompressTar("{}/data".format(baseScratchPath)) # Function returns base directories where uncompressed files will save to
    # 2. Remove the tar.gz file to clean up space before next file is downloaded
    if os.path.exists(saveFilePath):
        os.remove(saveFilePath)

### Download the spreadsheets containing labels

In [None]:
# Download the OASIS-2 labels spreadsheet and load it into a pandas DataFrame
labelsUrl = 'https://www.oasis-brains.org/files/oasis_longitudinal_demographics.xlsx'
labelsPath = '{}/milestone_II_project/data/oasis_labelled_data/oasis_2_labelled_data.xlsx'.format(baseHomePath)

# wget -O does not create missing parent folders, so make sure the target folder exists.
os.makedirs(os.path.dirname(labelsPath), exist_ok=True)
# check=True raises CalledProcessError if the download fails, instead of letting
# read_excel fail later on an empty or partial file.
subprocess.run(["wget", "-O", labelsPath, labelsUrl], check=True)
oasis_2_datatable = pd.read_excel(labelsPath)

# Display Pandas DataFrame as a table
display(oasis_2_datatable)