# Create OAI Dataframes

This notebook reads all of the SAS data files into dataframes and saves them as pickle files so that other notebooks can load them faster.

In [None]:
import os
import pandas as pd
import pickle
import pyreadstat
from tqdm import tqdm

In [None]:
# Constants: location of the SAS files, and a mapping from visit prefix to a
# short visit label.
data_dir = '../data/structured_data/'
visits = {
    'P02': 'IEI',
    'P01': 'SV',
    'V00': 'EV',
    'V01': '12m',
    'V02': '18m',
    'V03': '24m',
    'V04': '30m',
    'V05': '36m',
    'V06': '48m',
    'V07': '60m',
    'V08': '72m',
    'V09': '84m',
    'V10': '96m',
    'V11': '108m',
    'V99': "Outcomes",
}

In [None]:
# Names of every entry in data_dir whose filename contains '.sas7bdat'
all_files = [name for name in os.listdir(data_dir) if '.sas7bdat' in name]

In [None]:
def create_df(prefix, files, index=None):
    """Read all files matching a prefix, join them column-wise, and pickle the result.

    Every name in `files` that starts with `prefix` is read (in sorted order)
    from `data_dir` with pyreadstat, using 'formats.sas7bcat' from the same
    directory as the format catalog. The resulting dataframes are concatenated
    along axis=1, the combined shape is printed, and the dataframe is written
    to 'pkl/<prefix>_values.pkl'.

    Args:
        prefix: Filename prefix selecting which entries of `files` to read;
            also used to name the output pickle file.
        files: Iterable of filenames, relative to `data_dir`.
        index: Optional value passed to `DataFrame.set_index` on each file's
            dataframe before concatenation. A falsy value (the default, None)
            leaves each dataframe's index untouched.

    Returns:
        None. The combined dataframe is only written to disk.

    Raises:
        ValueError: If no name in `files` starts with `prefix`.
    """
    fileset = sorted(x for x in files if x.startswith(prefix))
    if not fileset:
        # pd.concat would raise an opaque "No objects to concatenate" here;
        # fail early with a message that names the offending prefix instead.
        raise ValueError("No files found with prefix '" + prefix + "'")

    frames = []
    for filename in fileset:
        frame, _ = pyreadstat.read_file_multiprocessing(
            pyreadstat.read_sas7bdat,
            data_dir + filename,
            catalog_file=data_dir + 'formats.sas7bcat',
            num_processes=6,
            user_missing=True,
        )
        frames.append(frame.set_index(index) if index else frame)

    # axis=1 places the columns of each file side by side, aligned on the index.
    combined = pd.concat(frames, axis=1)
    print(prefix + ' - ' + str(combined.shape))

    # Make sure the output directory exists, and close the file handle
    # deterministically once the pickle has been written.
    os.makedirs('pkl', exist_ok=True)
    with open('pkl/' + prefix + '_values.pkl', 'wb') as handle:
        pickle.dump(combined, handle)

# Every filename prefix handled (or deliberately left unhandled) by the cells
# below, kept here as a reference list.
all_prefix_list = [
    'Biospec_fnih_joco_demographics',
    'Clinical_fnih',
    'acceldatabyday',
    'acceldatabymin',
    'accelerometry',
    'allclinical',
    'biomarkers',
    'biospec_fnih_joco_assays',
    'biospec_fnih_labcorp',
    'boneancillarystudy',
    'enrollees',
    'flxr_kneealign_cooke',
    'flxr_kneealign_duryea',
    'kmri_fnih_boneshape_imorphics',
    'kmri_fnih_qcart_Chondrometrics',
    'kmri_fnih_qcart_biomediq',
    'kmri_fnih_sbp_qmetrics',
    'kmri_fnih_sq_moaks_bicl',
    'kmri_poma_incoa_moaks_bicl',
    'kmri_poma_tkr_chondrometrics',
    'kmri_poma_tkr_moaks_bicl',
    'kmri_qcart_eckstein',
    'kmri_qcart_link',
    'kmri_qcart_vs',
    'kmri_sq_bicl',
    'kmri_sq_blksbml_bicl',
    'kmri_sq_moaks_bicl',
    'kmri_sq_worms_link',
    'kxr_fnih_bti_duke',
    'kxr_fta_duryea',
    'kxr_qjsw_duryea',
    'kxr_qjsw_rel_duryea',
    'kxr_sq_bu',
    'kxr_sq_rel_bu',
    'measinventory',
    'mif',
    'mri',
    'outcomes',
    'sageancillarystudy',
    'sageancillarystudy_formats',
    'subjectchar',
    'xray',
]

## Files indexed by 'ID'

In [None]:
# Prefixes whose files are re-indexed on the 'ID' column before joining.
prefix_list = [
    'Clinical_fnih',
    'allclinical',
    'biomarkers',
    'biospec_fnih_labcorp',
    'boneancillarystudy',
    'enrollees',
    'kmri_fnih_boneshape_imorphics',
    'kmri_fnih_qcart_Chondrometrics',
    'kmri_fnih_qcart_biomediq',
    'kmri_fnih_sbp_qmetrics',
    'kmri_fnih_sq_moaks_bicl',
    'kmri_poma_incoa_moaks_bicl',
    'kmri_qcart_link',
    'kmri_qcart_vs',
    'kmri_sq_bicl',
    'kmri_sq_blksbml_bicl',
    'kmri_sq_worms_link',
    'kxr_fnih_bti_duke',
    'sageancillarystudy.sas7bdat',
    'subjectchar',
]

# Build and pickle one dataframe per prefix.
for prefix in prefix_list:
    create_df(prefix, all_files, index='ID')

## Files indexed by 'id'

In [None]:
# Prefixes whose files are re-indexed on the lowercase 'id' column before joining.
prefix_list = [
    'kmri_poma_tkr_chondrometrics',
    'kmri_poma_tkr_moaks_bicl',
    'measinventory',
    'outcomes',
]

# Build and pickle one dataframe per prefix.
for prefix in prefix_list:
    create_df(prefix, all_files, index='id')

## Files with no index

In [None]:
# Prefixes whose files are joined as read, without setting an index.
prefix_list = [
    'acceldatabyday',
    'acceldatabymin',
    'accelerometry',
    'flxr_kneealign_cooke',
    'flxr_kneealign_duryea',
    'kmri_qcart_eckstein',
    'kmri_sq_moaks_bicl',
    'kxr_fta_duryea',
    'kxr_qjsw_duryea',
    'kxr_qjsw_rel_duryea',
    'kxr_sq_bu',
    'kxr_sq_rel_bu',
    'mif',
    'mri',
    'xray',
]

# Build and pickle one dataframe per prefix.
for prefix in prefix_list:
    create_df(prefix, all_files)

## Files indexed by 'SpecID'

In [None]:
# Prefixes whose files are re-indexed on the 'SpecID' column before joining.
prefix_list = [
    'Biospec_fnih_joco_demographics',
    'biospec_fnih_joco_assays',
]

# Build and pickle one dataframe per prefix.
for prefix in prefix_list:
    create_df(prefix, all_files, index='SpecID')

## Unknown
TODO: Figure out what 'sageancillarystudy_formats.sas7bdat' is. It may be a misnamed sas7bcat file; it does not appear to be a standard data file.