# Merging Outputs

This notebook merges the outputs of the BIDS validator run on individual subjects.

In [8]:
import os
import glob
import json
import datetime
import pandas as pd

In [17]:
def parse_validator(path):
    """Flatten a BIDS-validator JSON report into one row per affected file.

    The report is expected to contain ``data['issues']['warnings']`` and
    ``data['issues']['errors']``, each a list of issue dicts. For every entry
    in an issue's ``files`` list one row is produced; the issue-level fields
    are repeated on each of those rows.

    Parameters
    ----------
    path : str
        Path to the JSON file to read.

    Returns
    -------
    pandas.DataFrame
        Columns ``files``, ``type``, ``severity``, ``description``, ``code``
        and ``url``, with a fresh 0..n-1 index. Warnings come first, then
        errors. An issue with no ``files`` entries contributes no rows. When
        no rows are produced the frame is empty but still has all six columns.

    Raises
    ------
    KeyError
        If ``issues``, ``issues.warnings`` or ``issues.errors`` is missing.
    """
    columns = ['files', 'type', 'severity', 'description', 'code', 'url']

    def get_nested(dct, *keys):
        """Walk ``keys`` into nested containers; None if any step fails."""
        for key in keys:
            try:
                dct = dct[key]
            except (KeyError, TypeError):
                return None
        return dct

    def parse_issue(issue_dict):
        """Return one record (dict) per file listed under a single issue."""
        shared = {
            # Missing fields default to '' so the column never mixes '' and None.
            'type': issue_dict.get('key', ''),
            'severity': issue_dict.get('severity', ''),
            'description': issue_dict.get('reason', ''),
            'code': issue_dict.get('code', ''),
            'url': issue_dict.get('helpUrl', ''),
        }
        # `or []` also covers an explicit null `files` value.
        return [
            {'files': get_nested(entry, 'file', 'relativePath'), **shared}
            for entry in (issue_dict.get('files') or [])
        ]

    with open(path, 'r') as read_file:
        data = json.load(read_file)

    issues = data['issues']

    # Collect plain records and build the frame once; DataFrame.append was
    # removed in pandas 2.0 and growing a frame row-by-row is quadratic.
    records = []
    for issue in list(issues['warnings']) + list(issues['errors']):
        records.extend(parse_issue(issue))

    return pd.DataFrame(records, columns=columns)

# Notebook-wide store of merged validator results, keyed by dataset name.
issues = {}

## PNC

In [18]:
# Dataset label: key into `issues`, and (lower-cased) part of the input filename.
dataset = 'PNC'
path = '/cbica/projects/RBC/flywheel_curation/RBC/PennLINC/Validation/CUBIC_Curation/{}_issues.json'.format(dataset.lower())
# The pattern has no wildcard, so this matches at most the single file named above.
all_files = glob.glob(path)
print(all_files)
li = []  # one parsed DataFrame per matched report

for filename in all_files:
    df = parse_validator(filename)
    # NOTE(review): for this path, element 7 of the '/'-split is the directory
    # name 'Validation', so every row receives the same `subject` value.
    # Confirm which path component (if any) is meant to identify the subject.
    df['subject'] = filename.split('/')[7]
    li.append(df)
    
# NOTE: pd.concat raises ValueError if `li` is empty, i.e. the glob matched nothing.
issues[dataset] = pd.concat(li, axis=0, ignore_index=True)
print('Last Run:', datetime.datetime.now())
# Cell output: count of non-null `files` entries per (severity, type) pair.
issues[dataset][['severity', 'type', 'files']].groupby(['severity', 'type']).agg(['count'])

['/cbica/projects/RBC/flywheel_curation/RBC/PennLINC/Validation/CUBIC_Curation/pnc_issues.json']
Last Run: 2020-09-29 15:32:49.097678


Unnamed: 0_level_0,Unnamed: 1_level_0,files
Unnamed: 0_level_1,Unnamed: 1_level_1,count
severity,type,Unnamed: 2_level_2
error,DWI_MISSING_BVAL,3
error,DWI_MISSING_BVEC,2
error,ECHO_TIME1-2_NOT_DEFINED,17
error,ECHO_TIME_MUST_DEFINE,10
error,EMPTY_FILE,7
error,NIFTI_TOO_SMALL,1
error,NOT_INCLUDED,5636
error,NO_VALID_DATA_FOUND_FOR_SUBJECT,1
error,REPETITION_TIME_MUST_DEFINE,15
error,TASK_NAME_MUST_DEFINE,15


In [19]:
# Write the merged table for the current `dataset` to CSV without the row index.
# The output filename uses `dataset` as-is, whereas the input filename used dataset.lower().
issues[dataset].to_csv('/cbica/projects/RBC/flywheel_curation/RBC/PennLINC/Validation/CUBIC_Curation/{}_validation.csv'.format(dataset), index=False)

## HBN

In [9]:
# Dataset label: key into `issues`, and (lower-cased) part of the input filename.
dataset = 'HBN'
path = '/cbica/projects/RBC/flywheel_curation/RBC/PennLINC/Validation/CUBIC_Curation/{}_issues.json'.format(dataset.lower())
# The pattern has no wildcard, so this matches at most the single file named above.
all_files = glob.glob(path)
print(all_files)
li = []  # one parsed DataFrame per matched report

for filename in all_files:
    df = parse_validator(filename)
    # NOTE(review): for this path, element 7 of the '/'-split is the directory
    # name 'Validation', so every row receives the same `subject` value.
    # Confirm which path component (if any) is meant to identify the subject.
    df['subject'] = filename.split('/')[7]
    li.append(df)
    
# NOTE: pd.concat raises ValueError if `li` is empty, i.e. the glob matched nothing.
issues[dataset] = pd.concat(li, axis=0, ignore_index=True)
print('Last Run:', datetime.datetime.now())
# Cell output: count of non-null `files` entries per (severity, type) pair.
issues[dataset][['severity', 'type', 'files']].groupby(['severity', 'type']).agg(['count'])

Last Run: 2020-09-25 09:39:52.537517


Unnamed: 0_level_0,Unnamed: 1_level_0,files
Unnamed: 0_level_1,Unnamed: 1_level_1,count
severity,type,Unnamed: 2_level_2
error,DWI_MISSING_BVAL,2.0
error,DWI_MISSING_BVEC,3.0
error,ECHO_TIME1-2_NOT_DEFINED,1.0
error,EMPTY_FILE,39.0
error,INTERNAL ERROR,0.0
error,NIFTI_TOO_SMALL,36.0
error,NOT_INCLUDED,4.0
error,PHASE_ENCODING_DIRECTION_MUST_DEFINE,31.0
error,REPETITION_TIME_MUST_DEFINE,192.0
error,TASK_NAME_MUST_DEFINE,192.0


In [21]:
# Write the merged table for the current `dataset` to CSV without the row index.
# The output filename uses `dataset` as-is, whereas the input filename used dataset.lower().
issues[dataset].to_csv('/cbica/projects/RBC/flywheel_curation/RBC/PennLINC/Validation/CUBIC_Curation/{}_validation.csv'.format(dataset), index=False)

## NKI

In [22]:
# Dataset label: key into `issues`, and the name of the directory that is searched.
dataset = 'NKI'
# Unlike a fixed filename, this pattern matches every `*_issues.json` in the dataset directory.
path = '/cbica/projects/RBC/flywheel_curation/RBC/PennLINC/Validation/{}/*_issues.json'.format(dataset)
all_files = glob.glob(path)
print(all_files)
li = []  # one parsed DataFrame per matched report

for filename in all_files:
    df = parse_validator(filename)
    # NOTE(review): for paths matching this pattern, element 7 of the '/'-split
    # is the directory name 'Validation' (element 8 is the dataset directory),
    # so every row receives the same `subject` value. Confirm the intended index.
    df['subject'] = filename.split('/')[7]
    li.append(df)
    
# NOTE: pd.concat raises ValueError if `li` is empty, i.e. the glob matched nothing.
issues[dataset] = pd.concat(li, axis=0, ignore_index=True)
print('Last Run:', datetime.datetime.now())
# Cell output: count of non-null `files` entries per (severity, type) pair.
issues[dataset][['severity', 'type', 'files']].groupby(['severity', 'type']).agg(['count'])

['/cbica/projects/RBC/flywheel_curation/RBC/PennLINC/Validation/NKI/nki_issues.json']
Last Run: 2020-09-29 15:34:17.729585


Unnamed: 0_level_0,Unnamed: 1_level_0,files
Unnamed: 0_level_1,Unnamed: 1_level_1,count
severity,type,Unnamed: 2_level_2
error,EMPTY_FILE,2.0
error,NOT_INCLUDED,202.0
error,SESSION_VALUE_CONTAINS_ILLEGAL_CHARACTER,101.0
SLICE_TIMING_NOT_DEFINED,10,
SUSPICIOUSLY_LONG_EVENT_DESIGN,1,
SUSPICIOUSLY_SHORT_EVENT_DESIGN,2661,


For code 1 (`NOT_INCLUDED`), we confirmed that all 104 items are compressed TSV files ending in `physio.tsv.gz`; hence they are not included in the BIDS spec.

These same items are also flagged under code 64 (`SESSION_VALUE_CONTAINS_ILLEGAL_CHARACTER`), and all of those instances are physio files.

In [23]:
# Write the merged table for the current `dataset` to CSV without the row index.
# NOTE(review): the reports were read from .../Validation/NKI/, but the CSV is
# written under .../Validation/CUBIC_Curation/ — confirm this is the intended location.
issues[dataset].to_csv('/cbica/projects/RBC/flywheel_curation/RBC/PennLINC/Validation/CUBIC_Curation/{}_validation.csv'.format(dataset), index=False)