# Create Data Specific Dataframes
Each data set is examined and concatenated in a way that makes the most sense for that set. Further, column types are set for more efficient storage.

## Imports / Definitions

In [None]:
import os
import pandas as pd
import pickle
import pyreadstat
from tqdm import tqdm
import datetime
import re

In [None]:
# Constants
# Directory the SAS files (and the formats catalog) are read from
data_dir = '../data/structured_data/'
# Three-character visit code -> short label
visits = {
    'P02': 'IEI',
    'P01': 'SV',
    'V00': 'EV',
    'V01': '12m',
    'V02': '18m',
    'V03': '24m',
    'V04': '30m',
    'V05': '36m',
    'V06': '48m',
    'V07': '60m',
    'V08': '72m',
    'V09': '84m',
    'V10': '96m',
    'V11': '108m',
    'V99': "Outcomes",
}

In [None]:
def remove_prefix(text, prefix): # replaced by str.removeprefix in Python 3.9
    """Return ``text`` without a leading ``prefix``; unchanged when it does not start with it."""
    return text[len(prefix):] if text.startswith(prefix) else text

def remove_suffix(text, suffix): # replaced by str.removesuffix in Python 3.9
    """Return ``text`` without a trailing ``suffix``; unchanged when it does not end with it.

    An empty ``suffix`` leaves ``text`` untouched, matching ``str.removesuffix``.
    """
    # Guard the empty suffix: text[:-0] is text[:0], which would wipe the whole string.
    if suffix and text.endswith(suffix):
        return text[:-len(suffix)]
    return text

# Clean visit prefixes
def remove_visit_prefixes(str_list):
    """Return a copy of ``str_list`` with three-character visit prefixes removed.

    A name carries a visit prefix when it starts with v/V/p/P, two digits and
    then a non-digit character (e.g. ``V00AGE`` -> ``AGE``). Other names are
    returned unchanged.
    """
    # Raw string: '\d', '\D' and '\S' are invalid escape sequences in a plain literal.
    return [s[3:] if re.match(r"^[vVpP]\d\d\D\S*", s) else s for s in str_list]

# Return a list of all unique prefixes
def collect_prefixes(str_list):
    """Return the distinct three-character visit prefixes found in ``str_list``.

    A name contributes its first three characters when it starts with v/V/p/P,
    two digits and then a non-digit character. The result is sorted so callers
    process visits in a reproducible order.
    """
    # Raw string: '\d', '\D' and '\S' are invalid escape sequences in a plain literal.
    return sorted({s[:3] for s in str_list if re.match(r"^[vVpP]\d\d\D\S*", s)})

# Check that all the columns are the same between files
def column_uniformity_check(prefix):
    """Print a column comparison for every file in ``all_files`` starting with ``prefix``.

    For each file (in sorted name order) the shape is printed. The column names
    of the first file, with visit prefixes removed, become the reference set
    (compared in upper case). For later files, any difference from the
    reference is printed and that file becomes the new reference; differences
    that vanish after upper-casing are reported as case-only. The total row
    count over all files is printed last.
    """
    matching = sorted(f for f in all_files if f.startswith(prefix))

    row_total = 0
    reference = {}
    for fname in matching:
        frame, _ = pyreadstat.read_file_multiprocessing(
            pyreadstat.read_sas7bdat,
            data_dir + fname,
            catalog_file=data_dir + 'formats.sas7bcat',
            num_processes=6,
            user_missing=True,
        )
        row_total += frame.shape[0]
        print(fname + ': ' + str(frame.shape))

        names = remove_visit_prefixes(list(frame.columns))
        if not reference:
            # First file: show its names and remember them upper-cased
            print(names)
            reference = {name.upper() for name in names}
            continue

        if not (reference ^ set(names)):
            # Identical to the reference, nothing to report
            continue

        # Weed out differences that come solely from SAS case insensitivity
        upper_names = {name.upper() for name in names}
        delta = reference ^ upper_names
        if delta:
            # Display any new or missing elements, then compare against this file from now on
            print(list(delta))
            reference = upper_names
        else:
            print('Names only differ by case')

    print('Total rows: ' + str(row_total))

In [None]:
# Create a single dataframe for all variables across a given fileset
def create_df(prefix, files):
    """Stack every SAS file whose name starts with ``prefix`` into one long dataframe.

    Column names are upper-cased (SAS is case insensitive and the data is
    inconsistent). Each file is then split into one slice per visit prefix
    found among its columns. A slice holds the file's non-prefixed columns
    (e.g. ID, VERSION) plus that visit's columns with the prefix removed, and
    a ``Visit`` column containing the prefix. Files without any
    visit-prefixed column contribute no rows.

    Args:
        prefix: Leading part of the file names that make up the set.
        files: Iterable of candidate file names located in ``data_dir``.

    Returns:
        DataFrame whose leading columns follow the order of the first file
        (prefix-stripped, with ``Visit`` in second position); columns seen
        only in later files are placed after them. Row indices are kept as
        read from each file, so they repeat.

    Raises:
        ValueError: If no name in ``files`` starts with ``prefix``.
    """
    fileset = sorted(x for x in files if x.startswith(prefix))
    if not fileset:
        raise ValueError('No files found with prefix: ' + prefix)

    cols = None   # column order, taken from the first file in the set
    frames = []   # one dataframe per (file, visit); concatenated once at the end
    for filename in fileset:
        tmp_df, _ = pyreadstat.read_file_multiprocessing(pyreadstat.read_sas7bdat, data_dir + filename, catalog_file=data_dir + 'formats.sas7bcat', num_processes=6, user_missing=True)
        # Move all column names to uppercase (SAS is case insensitive and the data is inconsistent)
        tmp_df = tmp_df.rename(columns={c: c.upper() for c in tmp_df.columns})  # Keeping all OAI provided variables uppercase

        columns = list(tmp_df.columns)
        stripped = remove_visit_prefixes(columns)
        if cols is None:
            cols = list(dict.fromkeys(stripped))  # Also drops duplicate names
            cols.insert(1, 'Visit')  # Variables added by this code are camel case

        # A name that stripping leaves unchanged carries no visit prefix
        shared_vars = [c for c, s in zip(columns, stripped) if c == s]
        prefixed_vars = [c for c, s in zip(columns, stripped) if c != s]

        # For each visit
        file_visits = sorted(collect_prefixes(columns))
        print('Visits: ' + str(file_visits))
        for visit in file_visits:
            visit_only = [v for v in prefixed_vars if v.startswith(visit)]
            visit_vars = shared_vars + visit_only
            print(visit_vars)
            # rename() returns a new frame, so the insert below never touches tmp_df
            visit_df = tmp_df[visit_vars].rename(columns={v: v[len(visit):] for v in visit_only})
            # A stripped name may collide with a shared one; keep the first occurrence
            visit_df = visit_df.loc[:, ~visit_df.columns.duplicated()]
            visit_df.insert(1, 'Visit', visit)
            frames.append(visit_df)

    if not frames:
        return pd.DataFrame(columns=cols)

    # Single concat instead of repeated DataFrame.append (removed in pandas 2.0)
    master_df = pd.concat(frames, sort=False)
    ordered = cols + [c for c in master_df.columns if c not in cols]
    return master_df.reindex(columns=ordered)

In [None]:
# Inspect values and types
def show_vals_types(df):
    """Print, for every column of ``df``, its unique values or a short summary.

    Columns with at most 20 unique values are listed in full. Larger columns
    print the number of unique values and their maximum. When date or float
    values are present, only those values are counted and compared. Missing
    values are ignored when taking the maximum, and ``n/a`` is printed when no
    maximum can be computed. A warning is appended when a column holds more
    than one Python type.
    """
    for col in df.columns:
        vals = pd.unique(df[col])
        types = {type(v) for v in vals}

        types_warning = ''
        if len(types) > 1:
            types_warning = '\t' + str(len(types)) + ' types found.'

        if len(vals) <= 20:
            print(col + ':\t' + str(list(vals)) + types_warning)
            continue

        # issubclass also recognises numpy floats and datetime/Timestamp values,
        # which an exact `float in types` / `datetime.date in types` test misses
        if any(issubclass(t, datetime.date) for t in types):
            vals = [v for v in vals if isinstance(v, datetime.date)]
        elif any(issubclass(t, float) for t in types):
            vals = [v for v in vals if isinstance(v, float)]

        # NaN never compares greater than anything, so leaving it in makes
        # max() depend on where it sits in the sequence
        comparable = [v for v in vals if not pd.isna(v)]
        try:
            max_val = str(max(comparable)) if comparable else 'n/a'
        except TypeError:
            # Values of unrelated types (e.g. str and int) cannot be ordered
            max_val = 'n/a'
        print(col + ':\tNot a limited set: ' + str(len(vals)) + '\t(Max: ' + max_val + ')' + types_warning)

## Look at the filesets

In [None]:
# All SAS files
# Match on the extension itself so that names which merely contain
# '.sas7bdat' somewhere in the middle are not picked up
all_files = [x for x in os.listdir(data_dir) if x.endswith('.sas7bdat')]

In [None]:
# How many files are there?
# (bare expression so the notebook displays the number of entries in all_files)
len(all_files)

In [None]:
# How many sets?
from string import digits

# Drop extensions (the helper defined in this notebook is remove_suffix)
tmp = [remove_suffix(f, '.sas7bdat') for f in all_files]
# Drop visit suffixes: deleting every digit leaves one name per file set
strip_digits = str.maketrans('', '', digits)
tmp = {f.translate(strip_digits) for f in tmp}
len(tmp)

## acceldatabyday

In [None]:
# Compare column names across all files of this set before merging them
prefix = 'acceldatabyday'
column_uniformity_check(prefix)

In [None]:
# Build the combined dataframe for this file set and report its shape
tmp_df = create_df(prefix, all_files)
print(tmp_df.shape)

In [None]:
# Columns grouped by the storage type they are converted to
cat_cols = ['Visit', 'VERSION', 'PAWEEKDAY']
int_cols = [
    'ID', 'PASTUDYDAY', 'VDAYSEQUENCE', 'PAMONTH',
    'DAYMODMINT', 'DAYMODMINF', 'DAYMODMINS',
    'DAYVIGMINT', 'DAYVIGMINF', 'DAYVIGMINS',
    'DAYMVMINT', 'DAYMVMINF', 'DAYMVMINS',
    'DAYCNT', 'DAYLTMINT', 'DAYLTMINF', 'DAYLTMINS',
    'DAYMVBOUTMINT', 'DAYMVBOUTMINF', 'DAYMVBOUTMINS',
    'DAYVBOUTMINT', 'DAYVBOUTMINF', 'DAYVBOUTMINS',
]

tmp_df = tmp_df.astype(dict.fromkeys(cat_cols, 'category'))
# Alternatives tried for ID: astype(int) and astype('UInt32').
# to_numeric with downcast='unsigned' was observed to end up smaller than UInt32.
for col in int_cols:
    tmp_df[col] = pd.to_numeric(tmp_df[col], downcast='unsigned')
tmp_df['WEARHR'] = pd.to_numeric(tmp_df['WEARHR'], downcast='float')

In [None]:
# Review the resulting dtypes, then the unique values / value types per column
print(tmp_df.dtypes)
print()
show_vals_types(tmp_df)

In [None]:
# Persist the typed dataframe; the context manager closes the file even if pickling fails
with open(prefix + '_values.pkl', 'wb') as pkl_file:
    pickle.dump(tmp_df, pkl_file)

## acceldatabymin

In [None]:
# Compare column names across all files of this set before merging them
prefix = 'acceldatabymin'
column_uniformity_check(prefix)

In [None]:
# Build the combined dataframe for this file set and report its shape
tmp_df = create_df(prefix, all_files)
print(tmp_df.shape)

In [None]:
# Columns grouped by the storage type they are converted to
cat_cols = ['Visit', 'VERSION', 'PAWEEKDAY']
int_cols = ['ID', 'PASTUDYDAY', 'MINSEQUENCE', 'SUSPECTMINUTE', 'PAMONTH']

tmp_df = tmp_df.astype(dict.fromkeys(cat_cols, 'category'))
# Let pandas pick the downcast numeric dtype for each column
for col in int_cols:
    tmp_df[col] = pd.to_numeric(tmp_df[col], downcast='unsigned')
tmp_df['MINCNT'] = pd.to_numeric(tmp_df['MINCNT'], downcast='float')

In [None]:
# Per-column memory footprint; deep=True asks pandas to also measure object contents
tmp_df.memory_usage(deep=True)

In [None]:
# Review the resulting dtypes, then the unique values / value types per column
print(tmp_df.dtypes)
print()
show_vals_types(tmp_df)

In [None]:
# Persist the typed dataframe; the context manager closes the file even if pickling fails
with open(prefix + '_values.pkl', 'wb') as pkl_file:
    pickle.dump(tmp_df, pkl_file)

## accelerometry

In [None]:
# Compare column names across all files of this set before merging them
prefix = 'accelerometry'
column_uniformity_check(prefix)

In [None]:
# Build the combined dataframe for this file set and report its shape
tmp_df = create_df(prefix, all_files)
print(tmp_df.shape)

In [None]:
# Columns grouped by the storage type they are converted to
cat_cols = ['Visit', 'VERSION', 'ADHHS8', 'ADHHSD8', 'APASTAT']
int_cols = ['ID']

tmp_df = tmp_df.astype(dict.fromkeys(cat_cols, 'category'))
# Let pandas pick the downcast unsigned dtype for each integer column
for col in int_cols:
    tmp_df[col] = pd.to_numeric(tmp_df[col], downcast='unsigned')

In [None]:
# Review the resulting dtypes, then the unique values / value types per column
print(tmp_df.dtypes)
print()
show_vals_types(tmp_df)

Need to revisit and further clean up the data.

In [None]:
# Persist the typed dataframe; the context manager closes the file even if pickling fails
with open(prefix + '_values.pkl', 'wb') as pkl_file:
    pickle.dump(tmp_df, pkl_file)

## allclinical

In [None]:
# Compare column names across all files of this set before merging them
prefix = 'allclinical'
column_uniformity_check(prefix)

In [None]:
# Build the combined dataframe for this file set and report its shape
tmp_df = create_df(prefix, all_files)
print(tmp_df.shape)

## Load x-ray data and examine it

In [None]:
xray_df = create_df('xray', all_files)

# 'FileSet' is not listed: create_df only yields upper-case SAS names plus
# 'Visit', and astype raises KeyError for a name that is not a column.
cat_cols = [
    'Visit', 'VERSION', 'ACCEPT',
    'ALIGN', 'CENTER', 'DEPICT', 'EXAMTP',
    'EXPOSE', 'MOTION', 'POSITN', 'XNDREAS',
    'XRCOMP', 'XRSIDE'
]

xray_df = xray_df.astype({col: 'category' for col in cat_cols})
# Two-step cast: plain int first, then the pandas 'UInt32' dtype
xray_df = xray_df.astype({'ID': int})
xray_df = xray_df.astype({'ID': 'UInt32'})

In [None]:
# Persist the typed dataframe; the context manager closes the file even if pickling fails
with open('xray_values.pkl', 'wb') as pkl_file:
    pickle.dump(xray_df, pkl_file)

In [None]:
# Report the dataframe shape, then the unique values / value types per column
print('DF Size: ' + str(xray_df.shape))
show_vals_types(xray_df)

In [None]:
# Frequency of each XRCOMP value
xray_df.XRCOMP.value_counts()

In [None]:
# Frequency of each EXAMTP value, followed by the sum of those frequencies
examtp_counts = xray_df.EXAMTP.value_counts()
print(examtp_counts)
print('Total: ' + str(examtp_counts.sum()))

In [None]:
# Frequency of each ACCEPT value
xray_df.ACCEPT.value_counts()

In [None]:
# ACCEPT frequencies restricted to rows whose EXAMTP is the bilateral PA fixed flexion knee exam and whose XRCOMP is '1: Yes'
xray_df[(xray_df.EXAMTP=='Bilateral PA Fixed Flexion Knee') & (xray_df.XRCOMP=='1: Yes')].ACCEPT.value_counts()

## MRI

In [None]:
# Build the combined dataframe for the 'mri' file set and report its shape
mri_df = create_df('mri', all_files)
print(mri_df.shape)

In [None]:
# 'FileSet' is not listed: create_df only yields upper-case SAS names plus
# 'Visit', and astype raises KeyError for a name that is not a column.
cat_cols = [
    'Visit', 'VERSION', 'MEXAMTP',
    'MNDREAS', 'MRCOMP', 'MRSIDE', 'QCRESLT', 'SCNUPGR',
    'MRMARK', 'CLUPGR', 'MQCCMNT', 'MQCFLAG'
]

mri_df = mri_df.astype({col: 'category' for col in cat_cols})
# Two-step cast: plain int first, then the pandas 'UInt32' dtype
mri_df = mri_df.astype({'ID': int})
mri_df = mri_df.astype({'ID': 'UInt32'})

In [None]:
# Display the dtype of every column after conversion
mri_df.dtypes

In [None]:
# Persist the typed dataframe; the context manager closes the file even if pickling fails
with open('mri_values.pkl', 'wb') as pkl_file:
    pickle.dump(mri_df, pkl_file)

In [None]:
# Display the column names of the combined MRI dataframe
mri_df.columns

In [None]:
# Number of rows per visit prefix
mri_df.Visit.value_counts()

In [None]:
# Frequency of each MRCOMP value
mri_df.MRCOMP.value_counts()

In [None]:
# Report the dataframe shape, then the unique values / value types per column
print('DF Size: ' + str(mri_df.shape) + '\n')
show_vals_types(mri_df)

## MIF

In [None]:
# Compare column names across all files whose name starts with 'mif'
column_uniformity_check('mif')

In [None]:
# Build the combined dataframe for the 'mif' file set and report its shape
tmp_df = create_df('mif', all_files)
print(tmp_df.shape)

In [None]:
# Columns stored as pandas categoricals
cat_cols = [
    'Visit', 'VERSION', 'MIFNAME', 'FRMCODE',
    'MIFFREQ', 'MIFDUR', 'MIFUSE', 'INGNAME',
]

tmp_df = tmp_df.astype(dict.fromkeys(cat_cols, 'category'))
# Two-step cast for ID: plain int first, then the pandas 'UInt32' dtype
for id_dtype in (int, 'UInt32'):
    tmp_df = tmp_df.astype({'ID': id_dtype})

It isn't clear what to do with INGCODE: the values appear to be nine digits followed by '.0', except for some values labelled 'M'.

In [None]:
# Review the resulting dtypes, then the unique values / value types per column
print(tmp_df.dtypes)
print()
show_vals_types(tmp_df)

## kxr_sq_rel_bu

In [None]:
# Compare column names across all files of this set before merging them
prefix = 'kxr_sq_rel_bu'
column_uniformity_check(prefix)

In [None]:
# Build the combined dataframe for this file set and report its shape
tmp_df = create_df(prefix, all_files)
print(tmp_df.shape)

In [None]:
# Columns stored as pandas categoricals ('VERSION' is listed once; BARCDBU is left out for now)
cat_cols = [
    'Visit', 'VERSION', 'SIDE', 'READPRJ', # 'BARCDBU',
    'XROSFM', 'XRSCFM', 'XRCYFM', 'XRJSM',
    'XRCHM', 'XROSTM', 'XRSCTM', 'XRCYTM', 'XRATTM',
    'XRKL', 'XROSFL', 'XRSCFL', 'XRCYFL', 'XRJSL',
    'XRCHL', 'XROSTL', 'XRSCTL', 'XRCYTL', 'XRATTL',
    'XRNWKL2', 'XRNW2N'
]

tmp_df = tmp_df.astype({col: 'category' for col in cat_cols})
# Two-step cast: plain int first, then the pandas 'UInt32' dtype
tmp_df = tmp_df.astype({'ID': int})
tmp_df = tmp_df.astype({'ID': 'UInt32'})

Note that XRJSM, XRJSL are floats being treated as categorical. Doing so for now to save memory, until I look into their meaning.

In [None]:
# Review the resulting dtypes, then the unique values / value types per column
print(tmp_df.dtypes)
print()
show_vals_types(tmp_df)

## kxr_sq_bu

In [None]:
# Compare column names across all files of this set before merging them
prefix = 'kxr_sq_bu'
column_uniformity_check(prefix)

In [None]:
# Build the combined dataframe for this file set and report its shape
tmp_df = create_df(prefix, all_files)
print(tmp_df.shape)

In [None]:
# Columns stored as pandas categoricals ('VERSION' is listed once; BARCDBU is left out for now)
cat_cols = [
    'Visit', 'VERSION', 'SIDE', 'READPRJ', # 'BARCDBU',
    'XROSFM', 'XRSCFM', 'XRCYFM', 'XRJSM',
    'XRCHM', 'XROSTM', 'XRSCTM', 'XRCYTM', 'XRATTM',
    'XRKL', 'XROSFL', 'XRSCFL', 'XRCYFL', 'XRJSL',
    'XRCHL', 'XROSTL', 'XRSCTL', 'XRCYTL', 'XRATTL',
    'XRNWKL2', 'XRNW2N'
]

tmp_df = tmp_df.astype({col: 'category' for col in cat_cols})
# Two-step cast: plain int first, then the pandas 'UInt32' dtype
tmp_df = tmp_df.astype({'ID': int})
tmp_df = tmp_df.astype({'ID': 'UInt32'})

In [None]:
# Review the resulting dtypes, then the unique values / value types per column
print(tmp_df.dtypes)
print()
show_vals_types(tmp_df)

## kxr_qjsw_rel_duryea

In [None]:
# Use this section's own file set; the prefix previously pointed at kxr_sq_bu
prefix = 'kxr_qjsw_rel_duryea'
column_uniformity_check(prefix)

In [None]:
tmp_df = create_df(prefix, all_files)
print(tmp_df.shape)

# NOTE(review): this column list matches the kxr_sq_bu list verbatim and may
# not describe this dataset - confirm the intended categorical columns.
cat_cols = [
    'Visit', 'VERSION', 'SIDE', 'READPRJ', # 'BARCDBU',
    'XROSFM', 'XRSCFM', 'XRCYFM', 'XRJSM',
    'XRCHM', 'XROSTM', 'XRSCTM', 'XRCYTM', 'XRATTM',
    'XRKL', 'XROSFL', 'XRSCFL', 'XRCYFL', 'XRJSL',
    'XRCHL', 'XROSTL', 'XRSCTL', 'XRCYTL', 'XRATTL',
    'XRNWKL2', 'XRNW2N'
]

# astype raises KeyError for names that are not columns, so convert only those present
missing_cols = [col for col in cat_cols if col not in tmp_df.columns]
if missing_cols:
    print('Skipping missing categorical columns: ' + str(missing_cols))
tmp_df = tmp_df.astype({col: 'category' for col in cat_cols if col in tmp_df.columns})
tmp_df = tmp_df.astype({'ID': int})
tmp_df = tmp_df.astype({'ID': 'UInt32'})

print(tmp_df.dtypes)
print()
show_vals_types(tmp_df)

# Save this section's dataframe (mri_df was being written here by mistake)
with open(prefix + '_values.pkl', 'wb') as pkl_file:
    pickle.dump(tmp_df, pkl_file)

## kxr_qjsw_duryea

In [None]:
# Compare column names across all files of this set before merging them
prefix = 'kxr_qjsw_duryea'
column_uniformity_check(prefix)

In [None]:
tmp_df = create_df(prefix, all_files)
print(tmp_df.shape)

# NOTE(review): this column list matches the kxr_sq_bu list verbatim and may
# not describe this dataset - confirm the intended categorical columns.
cat_cols = [
    'Visit', 'VERSION', 'SIDE', 'READPRJ', # 'BARCDBU',
    'XROSFM', 'XRSCFM', 'XRCYFM', 'XRJSM',
    'XRCHM', 'XROSTM', 'XRSCTM', 'XRCYTM', 'XRATTM',
    'XRKL', 'XROSFL', 'XRSCFL', 'XRCYFL', 'XRJSL',
    'XRCHL', 'XROSTL', 'XRSCTL', 'XRCYTL', 'XRATTL',
    'XRNWKL2', 'XRNW2N'
]

# astype raises KeyError for names that are not columns, so convert only those present
missing_cols = [col for col in cat_cols if col not in tmp_df.columns]
if missing_cols:
    print('Skipping missing categorical columns: ' + str(missing_cols))
tmp_df = tmp_df.astype({col: 'category' for col in cat_cols if col in tmp_df.columns})
tmp_df = tmp_df.astype({'ID': int})
tmp_df = tmp_df.astype({'ID': 'UInt32'})

print(tmp_df.dtypes)
print()
show_vals_types(tmp_df)

# Save this section's dataframe (mri_df was being written here by mistake)
with open(prefix + '_values.pkl', 'wb') as pkl_file:
    pickle.dump(tmp_df, pkl_file)

## kxr_fta_duryea

In [None]:
# Compare column names across all files of this set before merging them
prefix = 'kxr_fta_duryea'
column_uniformity_check(prefix)

In [None]:
tmp_df = create_df(prefix, all_files)
print(tmp_df.shape)

# NOTE(review): this column list matches the kxr_sq_bu list verbatim and may
# not describe this dataset - confirm the intended categorical columns.
cat_cols = [
    'Visit', 'VERSION', 'SIDE', 'READPRJ', # 'BARCDBU',
    'XROSFM', 'XRSCFM', 'XRCYFM', 'XRJSM',
    'XRCHM', 'XROSTM', 'XRSCTM', 'XRCYTM', 'XRATTM',
    'XRKL', 'XROSFL', 'XRSCFL', 'XRCYFL', 'XRJSL',
    'XRCHL', 'XROSTL', 'XRSCTL', 'XRCYTL', 'XRATTL',
    'XRNWKL2', 'XRNW2N'
]

# astype raises KeyError for names that are not columns, so convert only those present
missing_cols = [col for col in cat_cols if col not in tmp_df.columns]
if missing_cols:
    print('Skipping missing categorical columns: ' + str(missing_cols))
tmp_df = tmp_df.astype({col: 'category' for col in cat_cols if col in tmp_df.columns})
tmp_df = tmp_df.astype({'ID': int})
tmp_df = tmp_df.astype({'ID': 'UInt32'})

print(tmp_df.dtypes)
print()
show_vals_types(tmp_df)

# Save this section's dataframe (mri_df was being written here by mistake)
with open(prefix + '_values.pkl', 'wb') as pkl_file:
    pickle.dump(tmp_df, pkl_file)

## kmri_sq_moaks_bicl

In [None]:
# Compare column names across all files of this set before merging them
prefix = 'kmri_sq_moaks_bicl'
column_uniformity_check(prefix)

In [None]:
tmp_df = create_df(prefix, all_files)
print(tmp_df.shape)

# NOTE(review): this column list matches the kxr_sq_bu list verbatim and may
# not describe this dataset - confirm the intended categorical columns.
cat_cols = [
    'Visit', 'VERSION', 'SIDE', 'READPRJ', # 'BARCDBU',
    'XROSFM', 'XRSCFM', 'XRCYFM', 'XRJSM',
    'XRCHM', 'XROSTM', 'XRSCTM', 'XRCYTM', 'XRATTM',
    'XRKL', 'XROSFL', 'XRSCFL', 'XRCYFL', 'XRJSL',
    'XRCHL', 'XROSTL', 'XRSCTL', 'XRCYTL', 'XRATTL',
    'XRNWKL2', 'XRNW2N'
]

# astype raises KeyError for names that are not columns, so convert only those present
missing_cols = [col for col in cat_cols if col not in tmp_df.columns]
if missing_cols:
    print('Skipping missing categorical columns: ' + str(missing_cols))
tmp_df = tmp_df.astype({col: 'category' for col in cat_cols if col in tmp_df.columns})
tmp_df = tmp_df.astype({'ID': int})
tmp_df = tmp_df.astype({'ID': 'UInt32'})

print(tmp_df.dtypes)
print()
show_vals_types(tmp_df)

# Save this section's dataframe (mri_df was being written here by mistake)
with open(prefix + '_values.pkl', 'wb') as pkl_file:
    pickle.dump(tmp_df, pkl_file)

## kmri_qcart_eckstein

In [None]:
# Compare column names across all files of this set before merging them
prefix = 'kmri_qcart_eckstein'
column_uniformity_check(prefix)

In [None]:
tmp_df = create_df(prefix, all_files)
print(tmp_df.shape)

# NOTE(review): this column list matches the kxr_sq_bu list verbatim and may
# not describe this dataset - confirm the intended categorical columns.
cat_cols = [
    'Visit', 'VERSION', 'SIDE', 'READPRJ', # 'BARCDBU',
    'XROSFM', 'XRSCFM', 'XRCYFM', 'XRJSM',
    'XRCHM', 'XROSTM', 'XRSCTM', 'XRCYTM', 'XRATTM',
    'XRKL', 'XROSFL', 'XRSCFL', 'XRCYFL', 'XRJSL',
    'XRCHL', 'XROSTL', 'XRSCTL', 'XRCYTL', 'XRATTL',
    'XRNWKL2', 'XRNW2N'
]

# astype raises KeyError for names that are not columns, so convert only those present
missing_cols = [col for col in cat_cols if col not in tmp_df.columns]
if missing_cols:
    print('Skipping missing categorical columns: ' + str(missing_cols))
tmp_df = tmp_df.astype({col: 'category' for col in cat_cols if col in tmp_df.columns})
tmp_df = tmp_df.astype({'ID': int})
tmp_df = tmp_df.astype({'ID': 'UInt32'})

print(tmp_df.dtypes)
print()
show_vals_types(tmp_df)

# Save this section's dataframe (mri_df was being written here by mistake)
with open(prefix + '_values.pkl', 'wb') as pkl_file:
    pickle.dump(tmp_df, pkl_file)

## flxr_kneealign_duryea

In [None]:
# Compare column names across all files of this set before merging them
prefix = 'flxr_kneealign_duryea'
column_uniformity_check(prefix)

In [None]:
tmp_df = create_df(prefix, all_files)
print(tmp_df.shape)

# NOTE(review): this column list matches the kxr_sq_bu list verbatim and may
# not describe this dataset - confirm the intended categorical columns.
cat_cols = [
    'Visit', 'VERSION', 'SIDE', 'READPRJ', # 'BARCDBU',
    'XROSFM', 'XRSCFM', 'XRCYFM', 'XRJSM',
    'XRCHM', 'XROSTM', 'XRSCTM', 'XRCYTM', 'XRATTM',
    'XRKL', 'XROSFL', 'XRSCFL', 'XRCYFL', 'XRJSL',
    'XRCHL', 'XROSTL', 'XRSCTL', 'XRCYTL', 'XRATTL',
    'XRNWKL2', 'XRNW2N'
]

# astype raises KeyError for names that are not columns, so convert only those present
missing_cols = [col for col in cat_cols if col not in tmp_df.columns]
if missing_cols:
    print('Skipping missing categorical columns: ' + str(missing_cols))
tmp_df = tmp_df.astype({col: 'category' for col in cat_cols if col in tmp_df.columns})
tmp_df = tmp_df.astype({'ID': int})
tmp_df = tmp_df.astype({'ID': 'UInt32'})

print(tmp_df.dtypes)
print()
show_vals_types(tmp_df)

# Save this section's dataframe (mri_df was being written here by mistake)
with open(prefix + '_values.pkl', 'wb') as pkl_file:
    pickle.dump(tmp_df, pkl_file)

## flxr_kneealign_cooke

In [None]:
# Compare column names across all files of this set before merging them
prefix = 'flxr_kneealign_cooke'
column_uniformity_check(prefix)

In [None]:
tmp_df = create_df(prefix, all_files)
print(tmp_df.shape)

# NOTE(review): this column list matches the kxr_sq_bu list verbatim and may
# not describe this dataset - confirm the intended categorical columns.
cat_cols = [
    'Visit', 'VERSION', 'SIDE', 'READPRJ', # 'BARCDBU',
    'XROSFM', 'XRSCFM', 'XRCYFM', 'XRJSM',
    'XRCHM', 'XROSTM', 'XRSCTM', 'XRCYTM', 'XRATTM',
    'XRKL', 'XROSFL', 'XRSCFL', 'XRCYFL', 'XRJSL',
    'XRCHL', 'XROSTL', 'XRSCTL', 'XRCYTL', 'XRATTL',
    'XRNWKL2', 'XRNW2N'
]

# astype raises KeyError for names that are not columns, so convert only those present
missing_cols = [col for col in cat_cols if col not in tmp_df.columns]
if missing_cols:
    print('Skipping missing categorical columns: ' + str(missing_cols))
tmp_df = tmp_df.astype({col: 'category' for col in cat_cols if col in tmp_df.columns})
tmp_df = tmp_df.astype({'ID': int})
tmp_df = tmp_df.astype({'ID': 'UInt32'})

print(tmp_df.dtypes)
print()
show_vals_types(tmp_df)

# Save this section's dataframe (mri_df was being written here by mistake)
with open(prefix + '_values.pkl', 'wb') as pkl_file:
    pickle.dump(tmp_df, pkl_file)