# Merging Outputs

This notebook merges the outputs of the BIDS validator run on individual subjects.

In [1]:
import os
import glob
import json
import datetime
import pandas as pd

In [2]:
def parse_validator(path):
    """Flatten a BIDS validator JSON report into a DataFrame, one row per affected file.

    The JSON at ``path`` must contain an ``issues`` object with ``warnings`` and
    ``errors`` lists. Warnings are emitted first, then errors, each in file order.

    Parameters
    ----------
    path : str
        Location of the validator's JSON output.

    Returns
    -------
    pandas.DataFrame
        Columns ``files``, ``type``, ``severity``, ``description``, ``code`` and
        ``url``. ``files`` holds each entry's ``file.relativePath`` (``None`` when
        that lookup fails); the remaining columns repeat the issue's ``key``,
        ``severity``, ``reason``, ``code`` and ``helpUrl`` (``''`` when absent).
        An issue whose ``files`` list is empty or missing contributes no rows.
        When nothing is reported the frame is empty but still carries the columns.

    Raises
    ------
    KeyError
        If ``issues``, ``warnings`` or ``errors`` is missing from the report.
    """

    columns = ['files', 'type', 'severity', 'description', 'code', 'url']

    def get_nested(dct, *keys):
        """Walk ``keys`` into nested containers; return None as soon as a step fails."""
        for key in keys:
            try:
                dct = dct[key]
            except (KeyError, TypeError):
                return None
        return dct

    def parse_issue(issue_dict):
        """Map one validator issue onto the output column names."""
        return {
            # `or []` also covers an explicit JSON null, which cannot be iterated.
            'files': [get_nested(x, 'file', 'relativePath') for x in issue_dict.get('files') or []],
            # The original read `.get('key' '')`: adjacent literals concatenate, so the
            # default was silently None instead of '' like the other fields.
            'type': issue_dict.get('key', ''),
            'severity': issue_dict.get('severity', ''),
            'description': issue_dict.get('reason', ''),
            'code': issue_dict.get('code', ''),
            'url': issue_dict.get('helpUrl', ''),
        }

    with open(path, 'r', encoding='utf-8') as read_file:
        data = json.load(read_file)

    report_issues = data['issues']

    # Collect per-issue frames and concatenate once: DataFrame.append no longer
    # exists in pandas 2.x and re-copied the whole table on every call.
    frames = []
    for group in ('warnings', 'errors'):
        for issue in report_issues[group]:
            parsed = parse_issue(issue)
            # The scalar fields broadcast against `files`, so an empty list would
            # yield a zero-row frame anyway; skip it up front.
            if parsed['files']:
                frames.append(pd.DataFrame(parsed))

    if not frames:
        return pd.DataFrame(columns=columns)

    return pd.concat(frames, ignore_index=True)

# Accumulator for the merged issue tables; the dataset cells below store one DataFrame per dataset name.
issues = dict()

## PNC

In [3]:
dataset = 'PNC'
path = '/cbica/projects/RBC/flywheel_curation/RBC/PennLINC/Validation/CUBIC_Curation/{}_issues.json'.format(dataset.lower())
all_files = glob.glob(path)
print(all_files)

# Stop with an explicit message when no report matches; otherwise pd.concat
# below fails with an opaque "No objects to concatenate" ValueError.
if not all_files:
    raise FileNotFoundError('No validator output found matching: {}'.format(path))

li = []

for filename in all_files:
    df = parse_validator(filename)
    # NOTE(review): for the path pattern above, component 7 of the split is the
    # directory name 'Validation', not a subject ID. Confirm what `subject`
    # should hold before relying on this column.
    df['subject'] = filename.split('/')[7]
    li.append(df)

# Merge every parsed report for this dataset into a single table.
issues[dataset] = pd.concat(li, axis=0, ignore_index=True)
print('Last Run:', datetime.datetime.now())
# Number of non-null file entries per (severity, type, code) combination.
issues[dataset][['severity', 'type', 'files', 'code']].groupby(['severity', 'type', 'code']).agg(['count'])

['/cbica/projects/RBC/flywheel_curation/RBC/PennLINC/Validation/CUBIC_Curation/pnc_issues.json']
Last Run: 2020-10-13 01:22:26.441702


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,files
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,count
severity,type,code,Unnamed: 3_level_2
error,DWI_MISSING_BVAL,33,1
error,DWI_MISSING_BVEC,32,1
error,ECHO_TIME1-2_NOT_DEFINED,15,17
error,NOT_INCLUDED,1,5650
warning,EVENTS_TSV_MISSING,25,3067
warning,INCONSISTENT_PARAMETERS,39,1597
warning,MISSING_MAGNITUDE1_FILE,92,17
warning,NO_AUTHORS,113,0
warning,README_FILE_MISSING,101,0


The missing BVAL and BVEC errors (codes 33 and 32) come from a single confirmed subject whose DICOMs contain neither file. Additionally, 17 subjects have only a phasediff file and no magnitude fieldmaps, which accounts for code 15. Lastly, the `NOT_INCLUDED` errors refer to the ASL files, which are not in the BIDS spec. PNC is complete on CUBIC!

In [4]:
# Save the merged issue table for the current `dataset` as CSV (row index omitted).
validation_csv = '/cbica/projects/RBC/flywheel_curation/RBC/PennLINC/Validation/CUBIC_Curation/{}_validation.csv'.format(dataset)
issues[dataset].to_csv(validation_csv, index=False)

## HBN

In [5]:
dataset = 'HBN'
path = '/cbica/projects/RBC/flywheel_curation/RBC/PennLINC/Validation/CUBIC_Curation/{}_issues.json'.format(dataset.lower())
all_files = glob.glob(path)
print(all_files)

# Stop with an explicit message when no report matches; otherwise pd.concat
# below fails with an opaque "No objects to concatenate" ValueError.
if not all_files:
    raise FileNotFoundError('No validator output found matching: {}'.format(path))

li = []

for filename in all_files:
    df = parse_validator(filename)
    # NOTE(review): for the path pattern above, component 7 of the split is the
    # directory name 'Validation', not a subject ID. Confirm what `subject`
    # should hold before relying on this column.
    df['subject'] = filename.split('/')[7]
    li.append(df)

# Merge every parsed report for this dataset into a single table.
issues[dataset] = pd.concat(li, axis=0, ignore_index=True)
print('Last Run:', datetime.datetime.now())
# Number of non-null file entries per (severity, type, code) combination.
issues[dataset][['severity', 'type', 'files', 'code']].groupby(['severity', 'type', 'code']).agg(['count'])

['/cbica/projects/RBC/flywheel_curation/RBC/PennLINC/Validation/CUBIC_Curation/hbn_issues.json']
Last Run: 2020-10-13 01:22:26.963124


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,files
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,count
severity,type,code,Unnamed: 3_level_2
warning,EVENTS_TSV_MISSING,25,8627
warning,INCONSISTENT_PARAMETERS,39,621
warning,NO_AUTHORS,113,0
warning,README_FILE_MISSING,101,0


In [6]:
# Save the merged issue table for the current `dataset` as CSV (row index omitted).
validation_csv = '/cbica/projects/RBC/flywheel_curation/RBC/PennLINC/Validation/CUBIC_Curation/{}_validation.csv'.format(dataset)
issues[dataset].to_csv(validation_csv, index=False)

## NKI

In [7]:
dataset = 'NKI'
path = '/cbica/projects/RBC/flywheel_curation/RBC/PennLINC/Validation/CUBIC_Curation/{}_issues.json'.format(dataset.lower())
all_files = glob.glob(path)
print(all_files)

# Stop with an explicit message when no report matches; otherwise pd.concat
# below fails with an opaque "No objects to concatenate" ValueError.
if not all_files:
    raise FileNotFoundError('No validator output found matching: {}'.format(path))

li = []

for filename in all_files:
    df = parse_validator(filename)
    # NOTE(review): for the path pattern above, component 7 of the split is the
    # directory name 'Validation', not a subject ID. Confirm what `subject`
    # should hold before relying on this column.
    df['subject'] = filename.split('/')[7]
    li.append(df)

# Merge every parsed report for this dataset into a single table.
issues[dataset] = pd.concat(li, axis=0, ignore_index=True)
print('Last Run:', datetime.datetime.now())
# Number of non-null file entries per (severity, type, code) combination.
issues[dataset][['severity', 'type', 'files', 'code']].groupby(['severity', 'type', 'code']).agg(['count'])

['/cbica/projects/RBC/flywheel_curation/RBC/PennLINC/Validation/CUBIC_Curation/nki_issues.json']
Last Run: 2020-10-13 01:22:27.419300


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,files
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,count
severity,type,code,Unnamed: 3_level_2
error,EMPTY_FILE,99,1
error,NOT_INCLUDED,1,7135
error,SESSION_VALUE_CONTAINS_ILLEGAL_CHARACTER,63,102


For code 1 (`NOT_INCLUDED`), it's confirmed that all 204 items are compressed tsv files, ending in `physio.tsv.gz`, hence they are not included in the BIDS spec.

These same files also trigger code 63 (`SESSION_VALUE_CONTAINS_ILLEGAL_CHARACTER`); every instance of that error is a physio file.

In [8]:
# Save the merged issue table for the current `dataset` as CSV (row index omitted).
validation_csv = '/cbica/projects/RBC/flywheel_curation/RBC/PennLINC/Validation/CUBIC_Curation/{}_validation.csv'.format(dataset)
issues[dataset].to_csv(validation_csv, index=False)