# Merging Outputs

This notebook merges the outputs of the BIDS validator, which was run on each subject individually, into a single table per dataset.

In [2]:
import os
import glob
import json
import datetime
import pandas as pd

In [3]:
def parse_validator(path):
    """Flatten one BIDS-validator ``issues.json`` file into a DataFrame.

    The file is expected to hold a top-level ``issues`` object containing
    ``warnings`` and ``errors`` lists. Each issue contributes one row per
    entry in its ``files`` list, so an issue without any ``files`` entries
    contributes no rows.

    Parameters
    ----------
    path : str
        Path of the JSON file to read.

    Returns
    -------
    pandas.DataFrame
        Columns ``files``, ``type``, ``severity``, ``description``, ``code``
        and ``url``. Rows for warnings come first, followed by rows for
        errors, each in the order found in the file. ``files`` holds the
        entry's ``file.relativePath`` (``None`` when that lookup fails); the
        remaining fields fall back to ``''`` when the issue lacks the
        corresponding key. When nothing is reported the frame is empty but
        still carries these columns.

    Raises
    ------
    KeyError
        If ``issues``, ``warnings`` or ``errors`` is absent from the file.
    """
    columns = ['files', 'type', 'severity', 'description', 'code', 'url']

    def get_nested(dct, *keys):
        """Walk ``keys`` into nested dicts; return None if any step fails."""
        for key in keys:
            try:
                dct = dct[key]
            except (KeyError, TypeError):
                return None
        return dct

    def parse_issue(issue_dict):
        """Return one record per file attached to a single issue."""
        shared = {
            # The comma matters here: "'key' ''" would concatenate to 'key'
            # and silently make the default None instead of ''.
            'type': issue_dict.get('key', ''),
            'severity': issue_dict.get('severity', ''),
            'description': issue_dict.get('reason', ''),
            'code': issue_dict.get('code', ''),
            'url': issue_dict.get('helpUrl', ''),
        }
        return [
            dict(shared, files=get_nested(entry, 'file', 'relativePath'))
            for entry in issue_dict.get('files', [])
        ]

    with open(path, 'r') as read_file:
        data = json.load(read_file)

    issues = data['issues']

    # Gather plain records and build the frame once: DataFrame.append was
    # removed in pandas 2.0 and re-copied the whole frame on every call.
    records = []
    for group in ('warnings', 'errors'):
        for issue in issues[group]:
            records.extend(parse_issue(issue))

    return pd.DataFrame(records, columns=columns)

# Merged issue tables keyed by dataset name; each dataset section below stores its table here.
issues = {}

## PNC

In [4]:
dataset = 'PNC'
# One issues.json per subject directory (sub-*) under this dataset's validation folder.
path = '/storage/ttapera/RBC/data/validation/{}/sub-*/issues.json'.format(dataset)
# glob returns matches in arbitrary order; sort so the merged table (and the
# CSV written from it) has a reproducible row order.
all_files = sorted(glob.glob(path))

# Fail with a message naming the pattern instead of pandas' generic
# "No objects to concatenate" when nothing matched.
if not all_files:
    raise FileNotFoundError('No issues.json files matched {}'.format(path))

li = []

for filename in all_files:
    df = parse_validator(filename)
    # The subject label is the name of the directory containing issues.json,
    # read from the path itself rather than from a fixed '/'-split position.
    df['subject'] = os.path.basename(os.path.dirname(filename))
    li.append(df)

issues[dataset] = pd.concat(li, axis=0, ignore_index=True)
print('Last Run:', datetime.datetime.now())
# 'count' tallies non-null file paths per (severity, type) pair.
issues[dataset][['severity', 'type', 'files']].groupby(['severity', 'type']).agg(['count'])

Last Run: 2020-09-24 19:09:47.945094


Unnamed: 0_level_0,Unnamed: 1_level_0,files
Unnamed: 0_level_1,Unnamed: 1_level_1,count
severity,type,Unnamed: 2_level_2
error,DWI_MISSING_BVAL,1
error,DWI_MISSING_BVEC,1
error,ECHO_TIME1-2_NOT_DEFINED,17
warning,EVENTS_TSV_MISSING,3067
warning,INCONSISTENT_PARAMETERS,1421
warning,MISSING_MAGNITUDE1_FILE,17
warning,NO_AUTHORS,0
warning,README_FILE_MISSING,0


In [5]:
# Write the merged table for the current dataset to its validation folder.
validation_csv = '/storage/ttapera/RBC/PennLINC/Validation/{}/validation.csv'.format(dataset)
issues[dataset].to_csv(validation_csv, index=False)

## HBN

In [6]:
dataset = 'HBN'
# One issues.json per subject directory (sub-*) under this dataset's validation folder.
path = '/storage/ttapera/RBC/data/validation/{}/sub-*/issues.json'.format(dataset)
# glob returns matches in arbitrary order; sort so the merged table (and the
# CSV written from it) has a reproducible row order.
all_files = sorted(glob.glob(path))

# Fail with a message naming the pattern instead of pandas' generic
# "No objects to concatenate" when nothing matched.
if not all_files:
    raise FileNotFoundError('No issues.json files matched {}'.format(path))

li = []

for filename in all_files:
    df = parse_validator(filename)
    # The subject label is the name of the directory containing issues.json,
    # read from the path itself rather than from a fixed '/'-split position.
    df['subject'] = os.path.basename(os.path.dirname(filename))
    li.append(df)

issues[dataset] = pd.concat(li, axis=0, ignore_index=True)
print('Last Run:', datetime.datetime.now())
# 'count' tallies non-null file paths per (severity, type) pair.
issues[dataset][['severity', 'type', 'files']].groupby(['severity', 'type']).agg(['count'])

Last Run: 2020-09-24 19:10:11.029821


Unnamed: 0_level_0,Unnamed: 1_level_0,files
Unnamed: 0_level_1,Unnamed: 1_level_1,count
severity,type,Unnamed: 2_level_2
error,NOT_INCLUDED,98
warning,EVENTS_TSV_MISSING,8622
warning,INCONSISTENT_PARAMETERS,38
warning,NO_AUTHORS,0
warning,README_FILE_MISSING,0


In [7]:
# Write the merged table for the current dataset to its validation folder.
validation_csv = '/storage/ttapera/RBC/PennLINC/Validation/{}/validation.csv'.format(dataset)
issues[dataset].to_csv(validation_csv, index=False)

## NKI

In [12]:
dataset = 'NKI'
# One issues.json per subject directory (sub-*) under this dataset's validation folder.
path = '/storage/ttapera/RBC/data/validation/{}/sub-*/issues.json'.format(dataset)
# glob returns matches in arbitrary order; sort so the merged table (and the
# CSV written from it) has a reproducible row order.
all_files = sorted(glob.glob(path))

# Fail with a message naming the pattern instead of pandas' generic
# "No objects to concatenate" when nothing matched.
if not all_files:
    raise FileNotFoundError('No issues.json files matched {}'.format(path))

li = []

for filename in all_files:
    df = parse_validator(filename)
    # The subject label is the name of the directory containing issues.json,
    # read from the path itself rather than from a fixed '/'-split position.
    df['subject'] = os.path.basename(os.path.dirname(filename))
    li.append(df)

issues[dataset] = pd.concat(li, axis=0, ignore_index=True)
print('Last Run:', datetime.datetime.now())
# 'count' tallies non-null file paths per (severity, type) pair.
issues[dataset][['severity', 'type', 'files']].groupby(['severity', 'type']).agg(['count'])

Last Run: 2020-09-24 09:16:58.123098


Unnamed: 0_level_0,Unnamed: 1_level_0,files
Unnamed: 0_level_1,Unnamed: 1_level_1,count
severity,type,Unnamed: 2_level_2
error,EVENTS_COLUMN_DURATION,14825
error,EVENTS_COLUMN_ONSET,14825
error,NOT_INCLUDED,104
error,SESSION_VALUE_CONTAINS_ILLEGAL_CHARACTER,104
warning,CUSTOM_COLUMN_WITHOUT_DESCRIPTION,14825
warning,NO_AUTHORS,0
warning,README_FILE_MISSING,0


For code 1 (`NOT_INCLUDED`), we confirmed that all 104 flagged items are compressed TSV files ending in `physio.tsv.gz`; hence they are not included in the BIDS spec.

The same files are also reported under code 64 (`SESSION_VALUE_CONTAINS_ILLEGAL_CHARACTER`), so all of those errors are likewise accounted for by the physio files.

In [8]:
issues[dataset].to_csv('/storage/ttapera/RBC/PennLINC/Validation/{}/validation.csv'.format(dataset), index=False)