# Fixing HRC

We've just downloaded HRC data from globus, and the first output of the validator is ready. In this notebook we're going to fix the errors.

In [1]:
import pandas as pd

In [2]:
validation = pd.read_csv('/cbica/projects/RBC/RBC_RAWDATA/bidsdatasets/hrc_2021-01-27_validation.csv')

In [3]:
validation.groupby(['severity', 'type', 'code']).agg(['count'])

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,files,description,url
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,count,count,count
severity,type,code,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
error,DATASET_DESCRIPTION_JSON_MISSING,57,0,1,1
error,JSON_SCHEMA_VALIDATION_ERROR,55,25,25,25
error,NOT_INCLUDED,1,2037,2037,2037
error,NO_VALID_DATA_FOUND_FOR_SUBJECT,67,11,11,11
error,TASK_NAME_MUST_DEFINE,50,874,874,874
warning,README_FILE_MISSING,101,0,1,1


We'll start by dealing with the NOT_INCLUDED error; these all have the incorrect order of key-value pairs.

In [5]:
import os
import glob

root_dir = '/cbica/projects/RBC/RBC_RAWDATA/bidsdatasets/HRC/'

rec_files = []
for filename in glob.iglob(root_dir + '**/**/**/**', recursive=True):
     if os.path.isfile(filename) and 'rec-' in filename:
            rec_files.append(filename)

In [7]:
len(rec_files)

40720

In [8]:
rec_files[:10]

['/cbica/projects/RBC/RBC_RAWDATA/bidsdatasets/HRC/sub-10703/ses-1/anat/sub-10703_ses-1_run-1_rec-refaced_T1w.nii.gz',
 '/cbica/projects/RBC/RBC_RAWDATA/bidsdatasets/HRC/sub-10703/ses-1/anat/sub-10703_ses-1_run-1_rec-refaced_T1w.json',
 '/cbica/projects/RBC/RBC_RAWDATA/bidsdatasets/HRC/sub-10703/ses-2/anat/sub-10703_ses-2_run-1_rec-refaced_T1w.nii.gz',
 '/cbica/projects/RBC/RBC_RAWDATA/bidsdatasets/HRC/sub-10703/ses-2/anat/sub-10703_ses-2_run-1_rec-refaced_T1w.json',
 '/cbica/projects/RBC/RBC_RAWDATA/bidsdatasets/HRC/sub-10753/ses-1/anat/sub-10753_ses-1_run-1_rec-refaced_T1w.json',
 '/cbica/projects/RBC/RBC_RAWDATA/bidsdatasets/HRC/sub-10753/ses-1/anat/sub-10753_ses-1_run-1_rec-refaced_T1w.nii.gz',
 '/cbica/projects/RBC/RBC_RAWDATA/bidsdatasets/HRC/sub-20294/ses-1/anat/sub-20294_ses-1_run-1_rec-refaced_T1w.json',
 '/cbica/projects/RBC/RBC_RAWDATA/bidsdatasets/HRC/sub-20294/ses-1/anat/sub-20294_ses-1_run-2_rec-refaced_T1w.json',
 '/cbica/projects/RBC/RBC_RAWDATA/bidsdatasets/HRC/sub-202

In [39]:
def reorder_bids(oldpath, apply=False):
    
    print(oldpath)
    
    if not os.path.exists(oldpath):
        return None
    directory = os.path.dirname(oldpath)
    fname = os.path.basename(oldpath)
    entities = fname.split("_")
    #print(entities)
    
    rec_index = [i for i, s in enumerate(entities) if 'refaced' in s][0]
    ses_index = [i for i, s in enumerate(entities) if 'ses-' in s][0]
    run_index = [i for i, s in enumerate(entities) if 'run-' in s][0]

    assert rec_index
    assert run_index
    assert rec_index > run_index
    #print(rec_index, ses_index, run_index)
    
    # rec should go after ses and before run
    entities[run_index], entities[rec_index] = entities[rec_index], entities[run_index]
    
    #print(entities)
    
    newname = '_'.join(entities)
    #print(newname)
    
    newpath = directory + '/' + newname
    print(newpath)
    
    if apply:
        os.rename(oldpath, newpath)

In [40]:
reorder_bids(rec_files[0], apply=True)

/cbica/projects/RBC/RBC_RAWDATA/bidsdatasets/HRC/sub-10703/ses-1/anat/sub-10703_ses-1_run-1_rec-refaced_T1w.nii.gz


In [41]:
[reorder_bids(x, apply=True) for x in rec_files[1:]]

/cbica/projects/RBC/RBC_RAWDATA/bidsdatasets/HRC/sub-10703/ses-1/anat/sub-10703_ses-1_run-1_rec-refaced_T1w.json
/cbica/projects/RBC/RBC_RAWDATA/bidsdatasets/HRC/sub-10703/ses-2/anat/sub-10703_ses-2_run-1_rec-refaced_T1w.nii.gz
/cbica/projects/RBC/RBC_RAWDATA/bidsdatasets/HRC/sub-10703/ses-2/anat/sub-10703_ses-2_run-1_rec-refaced_T1w.json
/cbica/projects/RBC/RBC_RAWDATA/bidsdatasets/HRC/sub-10753/ses-1/anat/sub-10753_ses-1_run-1_rec-refaced_T1w.json
/cbica/projects/RBC/RBC_RAWDATA/bidsdatasets/HRC/sub-10753/ses-1/anat/sub-10753_ses-1_run-1_rec-refaced_T1w.nii.gz
/cbica/projects/RBC/RBC_RAWDATA/bidsdatasets/HRC/sub-20294/ses-1/anat/sub-20294_ses-1_run-1_rec-refaced_T1w.json
/cbica/projects/RBC/RBC_RAWDATA/bidsdatasets/HRC/sub-20294/ses-1/anat/sub-20294_ses-1_run-2_rec-refaced_T1w.json
/cbica/projects/RBC/RBC_RAWDATA/bidsdatasets/HRC/sub-20294/ses-1/anat/sub-20294_ses-1_run-1_rec-refaced_T1w.nii.gz
/cbica/projects/RBC/RBC_RAWDATA/bidsdatasets/HRC/sub-20294/ses-1/anat/sub-20294_ses-1_run-

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,

Let's see how that worked:

In [42]:
validation2 = pd.read_csv('/cbica/projects/RBC/RBC_RAWDATA/bidsdatasets/hrc_2021-01-27_02:53:25_validation.csv')

In [43]:
validation2.groupby(['severity', 'type', 'code']).agg(['count'])

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,files,description,url
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,count,count,count
severity,type,code,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
error,DATASET_DESCRIPTION_JSON_MISSING,57,0,1,1
error,JSON_SCHEMA_VALIDATION_ERROR,55,25,25,25
error,NOT_INCLUDED,1,1,1,1
error,TASK_NAME_MUST_DEFINE,50,874,874,874
warning,README_FILE_MISSING,101,0,1,1


Only 1 file still under `NOT INCLUDED`, and it's the README! Perfect.

We can move on to the json error issue:

In [45]:
# test to see if the json can be opened:

import json

path = root_dir + 'sub-20765/ses-2/func/sub-20765_ses-2_task-rest_run-1_bold.json'

with open(path, 'r') as read_file:
        data = json.load(read_file)

In [46]:
data

{'Modality': 'MR',
 'MagneticFieldStrength': 1.5,
 'ImagingFrequency': 63.8303,
 'Manufacturer': 'GE',
 'PulseSequenceName': 'epi',
 'InternalPulseSequenceName': 'EPI',
 'ManufacturersModelName': 'Signa HDxt',
 'InstitutionName': 'INRAD - HC - FMUSP',
 'DeviceSerialNumber': '0000000000035925',
 'StationName': 'INRADRM2',
 'PatientPosition': 'HFS',
 'ProcedureStepDescription': 'RM CRANIO',
 'SoftwareVersions': '15\\LX\\MR Software release:15.0_M4A_0947.a',
 'MRAcquisitionType': '2D',
 'SeriesDescription': 'RESTING STATE fMRI',
 'ProtocolName': 'RESTING STATE fMRI',
 'ScanningSequence': 'EP\\GR',
 'SequenceVariant': 'SS',
 'ScanOptions': 'MP_GEMS\\EPI_GEMS\\ACC_GEMS',
 'ImageType': ['ORIGINAL', 'PRIMARY', 'OTHER'],
 'SeriesNumber': 7,
 'AcquisitionTime': '09:51:45.000000',
 'AcquisitionNumber': 1,
 'TriggerDelayTime': 1924,
 'SliceThickness': 4,
 'SpacingBetweenSlices': 4.5,
 'SAR': 0.0175701,
 'EchoTime': 0.03,
 'RepetitionTime': 2,
 'FlipAngle': 80,
 'PhaseEncodingPolarityGE': 'Flipped

Negative slice timing... That could be a problem. Otherwise, we can move on to the `TaskName` issue. We just have to add this into the json sidecar.

In [50]:
task_files = validation2[validation2.code == 50]['files'].values

In [54]:
task_files = list(task_files)

In [56]:
task_files

['/sub-10001/ses-1/func/sub-10001_ses-1_task-rest_run-1_bold.nii.gz',
 '/sub-10001/ses-2/func/sub-10001_ses-2_task-rest_run-1_bold.nii.gz',
 '/sub-10612/ses-1/func/sub-10612_ses-1_task-rest_run-1_bold.nii.gz',
 '/sub-10615/ses-1/func/sub-10615_ses-1_task-rest_run-1_bold.nii.gz',
 '/sub-10615/ses-2/func/sub-10615_ses-2_task-rest_run-1_bold.nii.gz',
 '/sub-10617/ses-2/func/sub-10617_ses-2_task-rest_run-1_bold.nii.gz',
 '/sub-10618/ses-1/func/sub-10618_ses-1_task-rest_run-1_bold.nii.gz',
 '/sub-10618/ses-2/func/sub-10618_ses-2_task-rest_run-1_bold.nii.gz',
 '/sub-10620/ses-1/func/sub-10620_ses-1_task-rest_run-1_bold.nii.gz',
 '/sub-10620/ses-2/func/sub-10620_ses-2_task-rest_run-1_bold.nii.gz',
 '/sub-10621/ses-1/func/sub-10621_ses-1_task-rest_run-1_bold.nii.gz',
 '/sub-10622/ses-1/func/sub-10622_ses-1_task-rest_run-1_bold.nii.gz',
 '/sub-10624/ses-2/func/sub-10624_ses-2_task-rest_run-1_bold.nii.gz',
 '/sub-10627/ses-1/func/sub-10627_ses-1_task-rest_run-1_bold.nii.gz',
 '/sub-10627/ses-2/f

In [57]:
task_jsons = [x.replace("nii.gz", "json") for x in task_files]

In [58]:
task_jsons

['/sub-10001/ses-1/func/sub-10001_ses-1_task-rest_run-1_bold.json',
 '/sub-10001/ses-2/func/sub-10001_ses-2_task-rest_run-1_bold.json',
 '/sub-10612/ses-1/func/sub-10612_ses-1_task-rest_run-1_bold.json',
 '/sub-10615/ses-1/func/sub-10615_ses-1_task-rest_run-1_bold.json',
 '/sub-10615/ses-2/func/sub-10615_ses-2_task-rest_run-1_bold.json',
 '/sub-10617/ses-2/func/sub-10617_ses-2_task-rest_run-1_bold.json',
 '/sub-10618/ses-1/func/sub-10618_ses-1_task-rest_run-1_bold.json',
 '/sub-10618/ses-2/func/sub-10618_ses-2_task-rest_run-1_bold.json',
 '/sub-10620/ses-1/func/sub-10620_ses-1_task-rest_run-1_bold.json',
 '/sub-10620/ses-2/func/sub-10620_ses-2_task-rest_run-1_bold.json',
 '/sub-10621/ses-1/func/sub-10621_ses-1_task-rest_run-1_bold.json',
 '/sub-10622/ses-1/func/sub-10622_ses-1_task-rest_run-1_bold.json',
 '/sub-10624/ses-2/func/sub-10624_ses-2_task-rest_run-1_bold.json',
 '/sub-10627/ses-1/func/sub-10627_ses-1_task-rest_run-1_bold.json',
 '/sub-10627/ses-2/func/sub-10627_ses-2_task-res

In [59]:
def insert_taskname(js):
    
    with open(js, 'r') as read_file:
        data = json.load(read_file)
    
    data.update({"TaskName": "rest"})
    
    with open(js, 'w') as outfile:
        json.dump(data, outfile)

In [63]:
[insert_taskname(root_dir + x[1:]) for x in task_jsons]

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,

This should fix everything except the slice timing:

In [64]:
validation3 = pd.read_csv('/cbica/projects/RBC/RBC_RAWDATA/bidsdatasets/hrc_2021-01-27_03:20:38_validation.csv')

In [66]:
validation3.groupby(['severity', 'type', 'code']).agg(['count'])

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,files,description,url
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,count,count,count
severity,type,code,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
error,JSON_SCHEMA_VALIDATION_ERROR,55,25,25,25
warning,NO_AUTHORS,113,0,1,1
