In [24]:
import numpy as np
import seaborn as sns
import numpy as np #Library for numerical calculations
from scipy import stats #Library for statistical analysis 
import pandas as pd
import warnings
def ignore_warn(*args, **kwargs):
    """No-op stand-in for ``warnings.warn``: accepts any arguments and discards them."""
    return None


# Rebind the module attribute so calls made through `warnings.warn` after this
# point do nothing (code holding its own reference to the original is unaffected).
warnings.warn = ignore_warn
import os
import scipy.io

In [113]:
def make_directory_tree(tree, output_dir):
    """
    Create the output directory tree structure specified by `tree` in `output_dir`

    Directories that already exist are reused silently; nothing inside them is
    touched here, but callers that write into them may overwrite existing files.

    Parameters
    -----
    tree : list of paths to create under `output_dir`

    output_dir : path to root of output directory tree

    Raises
    -----
    OSError : if a path cannot be created (for example it already exists as a
        regular file, or permissions are insufficient)
    """

    for d in tree:
        # exist_ok=True already tolerates pre-existing directories, so the only
        # OSError that can escape is a genuine failure and is left to propagate.
        os.makedirs(os.path.join(output_dir, d), exist_ok=True)


def combine_reports(files_to_combine, outpath, is_mat=False,is_train=True):
    """
    Produce a combined file from all the input files and saves it in outpath

    For `.mat` inputs the variable named 'trainData' (or 'testData' when
    `is_train` is False) is read from every file and the arrays are
    concatenated along their last axis. For text inputs every file is read as
    a header-less CSV, the rows are stacked in input order and written out
    with a single column named 'Values'.

    Parameters
    -----
    files_to_combine : list of paths to input files of the same type

    outpath : path at which to save the combined dataset; a trailing ".txt"
        extension is replaced by ".csv"

    is_mat : True if the inputs are MATLAB `.mat` files, False for text files

    is_train : selects the variable name used for `.mat` files
        ('trainData' if True, 'testData' otherwise); ignored for text files
    """

    if outpath.endswith(".txt"):
        # Swap only the trailing extension: str.replace would also rewrite any
        # ".txt" occurring earlier in the path (e.g. inside a directory name).
        outpath = outpath[:-len(".txt")] + ".csv"

    print("Creating file: ",outpath)

    if is_mat:
        key = 'trainData' if is_train else 'testData'
        arrays = [scipy.io.loadmat(file)[key] for file in files_to_combine]
        merged_array = np.concatenate(arrays, axis=-1)
        scipy.io.savemat(outpath, {key: merged_array})
    else:
        dfs = [pd.read_csv(file, header=None) for file in files_to_combine]
        df = pd.concat(dfs, ignore_index=True)
        # Assigning a one-element list requires the inputs to be single-column.
        df.columns = ['Values']
        df.to_csv(outpath, index=False)


def get_datasets(data_dir):
    """
    Get all dataset filenames (".txt" and ".mat" files) from the specified directory

    Only regular files directly inside `data_dir` are considered. The result
    is sorted so the order is the same on every platform; `os.listdir` itself
    makes no ordering guarantee, and callers that index the result by position
    need a stable order.

    Parameters
    -----
    data_dir : path to directory containing the files

    Returns
    -----
    ret : alphabetically sorted list containing dataset filenames
    """

    wanted_suffixes = (".txt", ".mat")
    return sorted(
        name for name in os.listdir(data_dir)
        if name.endswith(wanted_suffixes)
        and os.path.isfile(os.path.join(data_dir, name))
    )


def run_data_aggregation(input_dir, output_dir, number_of_subjects=15, number_of_sessions=3):
    """
    Iterate over the input directories and generate the new aggregated files in the output_dir

    The input is read from `input_dir/SBJxx/Syy/Train` and
    `input_dir/SBJxx/Syy/Test`; for every subject the files of all sessions
    are combined into `output_dir/SBJxx/Train` and `output_dir/SBJxx/Test`.
    The set of filenames is taken from the first subject's first session
    ("SBJ01/S01") and the same names are looked up in every other session.

    Files are identified by their position in the alphabetically sorted
    listing: Train -> [data (.mat), events, labels, targets],
    Test -> [runs per block, data (.mat), events].

    Parameters
    -----
    input_dir : path to directory containing input data

    output_dir : output directory for storing processed data

    number_of_subjects : number of subject folders (SBJ01..SBJnn) to process

    number_of_sessions : number of session folders (S01..Snn) per subject

    Raises
    -----
    IndexError : if the template session folders hold fewer dataset files
        than the layout above requires
    """
    # (position in sorted listing, is_mat) for every file that gets combined
    train_layout = ((0, True), (1, False), (2, False), (3, False))
    test_layout = ((0, False), (1, True), (2, False))

    subject_names = ["SBJ"+str(i).zfill(2) for i in range(1,number_of_subjects+1)]
    session_names = ["S"+str(i).zfill(2) for i in range(1,number_of_sessions+1)]

    # Sort locally so the positional layout does not depend on the arbitrary
    # order in which the filesystem happens to list directory entries.
    train_files = sorted(get_datasets(os.path.join(input_dir,"SBJ01/S01/Train")))
    test_files = sorted(get_datasets(os.path.join(input_dir,"SBJ01/S01/Test")))
    if len(train_files) < len(train_layout) or len(test_files) < len(test_layout):
        raise IndexError(
            "Expected at least {} Train and {} Test dataset files in SBJ01/S01, found {} and {}".format(
                len(train_layout), len(test_layout), len(train_files), len(test_files)))

    #Creating folders for each subject
    make_directory_tree(subject_names,output_dir)
    for subject_name in subject_names:
        subject_output_folder = os.path.join(output_dir,subject_name)
        print("Currently processing : ",subject_output_folder)
        #Making train and test folders for each subject
        make_directory_tree(['Train','Test'],subject_output_folder)

        subject_input_folder = os.path.join(input_dir,subject_name)
        session_folders = [os.path.join(subject_input_folder,session_name)
                           for session_name in session_names]
        train_output_folder = os.path.join(subject_output_folder,"Train")
        test_output_folder = os.path.join(subject_output_folder,"Test")

        for index, is_mat in train_layout:
            filename = train_files[index]
            session_paths = [os.path.join(folder,"Train",filename) for folder in session_folders]
            combine_reports(session_paths,os.path.join(train_output_folder,filename),is_mat=is_mat)

        for index, is_mat in test_layout:
            filename = test_files[index]
            session_paths = [os.path.join(folder,"Test",filename) for folder in session_folders]
            combine_reports(session_paths,os.path.join(test_output_folder,filename),
                            is_mat=is_mat,is_train=False)

In [114]:
# Combine every subject's per-session files found under subjects_pre into one
# set of Train/Test files per subject under subjects_post.
run_data_aggregation("medicon_data/subjects_pre", "medicon_data/subjects_post")

Currently processing :  medicon_data/subjects_post\SBJ01
Creating file:  medicon_data/subjects_post\SBJ01\Train\trainData.mat
Creating file:  medicon_data/subjects_post\SBJ01\Train\trainEvents.csv
Creating file:  medicon_data/subjects_post\SBJ01\Train\trainLabels.csv
Creating file:  medicon_data/subjects_post\SBJ01\Train\trainTargets.csv
Creating file:  medicon_data/subjects_post\SBJ01\Test\runs_per_block.csv
Creating file:  medicon_data/subjects_post\SBJ01\Test\testData.mat
Creating file:  medicon_data/subjects_post\SBJ01\Test\testEvents.csv
Currently processing :  medicon_data/subjects_post\SBJ02
Creating file:  medicon_data/subjects_post\SBJ02\Train\trainData.mat
Creating file:  medicon_data/subjects_post\SBJ02\Train\trainEvents.csv
Creating file:  medicon_data/subjects_post\SBJ02\Train\trainLabels.csv
Creating file:  medicon_data/subjects_post\SBJ02\Train\trainTargets.csv
Creating file:  medicon_data/subjects_post\SBJ02\Test\runs_per_block.csv
Creating file:  medicon_data/subjects_

In [115]:
# Sanity check on SBJ01: compare the shapes of one original session (S01) with
# the shapes of the combined files. With 3 sessions combined, the new files
# are expected to be 3 times the size of a single session.
old_train_folder = "medicon_data/subjects_pre/SBJ01/S01/Train"

# No ".mat" suffix in the name: this relies on loadmat's default appendmat=True.
mat = scipy.io.loadmat(os.path.join(old_train_folder,"trainData"))
data = mat['trainData']
print("Old mat train file shape: ",data.shape)

# The original .txt files are read without a header row.
train_events = pd.read_csv(os.path.join(old_train_folder,"trainEvents.txt"),header=None)
print("Old train events shape: ",train_events.shape)

train_labels = pd.read_csv(os.path.join(old_train_folder,"trainLabels.txt"),header=None)
print("Old train labels shape: ",train_labels.shape)

train_targets = pd.read_csv(os.path.join(old_train_folder,"trainTargets.txt"),header=None)
print("Old train targets shape: ",train_targets.shape)

# From here on the same variable names are rebound to the combined data, so
# after this cell they refer to the new files only.
new_train_folder = "medicon_data/subjects_post/SBJ01/Train"

mat = scipy.io.loadmat(os.path.join(new_train_folder,"trainData"))
data = mat['trainData']
print("New mat train file shape: ",data.shape)

# The combined CSVs are read with pandas' default header handling (no header=None).
train_events = pd.read_csv(os.path.join(new_train_folder,"trainEvents.csv"))
print("New train events shape: ",train_events.shape)

train_labels = pd.read_csv(os.path.join(new_train_folder,"trainLabels.csv"))
print("New train labels shape: ",train_labels.shape)

train_targets = pd.read_csv(os.path.join(new_train_folder,"trainTargets.csv"))
print("New train targets shape: ",train_targets.shape)

Old mat train file shape:  (8, 350, 1600)
Old train events shape:  (1600, 1)
Old train labels shape:  (20, 1)
Old train targets shape:  (1600, 1)
New mat train file shape:  (8, 350, 4800)
New train events shape:  (4800, 1)
New train labels shape:  (60, 1)
New train targets shape:  (4800, 1)


In [116]:
# Show the combined SBJ01 train events frame loaded in the previous cell.
print(train_events)

      Values
0          7
1          3
2          4
3          1
4          6
...      ...
4795       6
4796       7
4797       2
4798       5
4799       3

[4800 rows x 1 columns]
