In [1]:
import glob
import gzip
import io
import tarfile
from collections import Counter
from collections import defaultdict
from datetime import datetime

import pandas as pd
from dateutil.relativedelta import relativedelta

In [2]:
# Updates a dictionary from a single patient json
# Reason code that marks an encounter as a CHF hospitalization
# (presumably the SNOMED CT code for chronic congestive heart failure -- confirm).
_CHF_REASON_CODE = "88805009"
# Encounter class codes that count as "hospitalized".
_HOSPITAL_CLASS_CODES = ("IMP", "EMER")
# Errors raised when an optional field is absent, empty or null in a resource.
_MISSING_FIELD_ERRORS = (KeyError, IndexError, TypeError)
# resourceType -> (field holding the date string, field holding the codeable concept)
_DATED_CODE_FIELDS = {
    "Observation": ("effectiveDateTime", "code"),
    "Condition": ("onsetDateTime", "code"),
    "Immunization": ("occurrenceDateTime", "vaccineCode"),
    "MedicationRequest": ("authoredOn", "medicationCodeableConcept"),
}


def _encounter_reason_code(resource):
    """Return the first reason code of an Encounter resource, or None if it has none."""
    try:
        return resource['reasonCode'][0]['coding'][0]['code']
    except _MISSING_FIELD_ERRORS:
        return None


def dataPipe(training_data, patient):
    """Add one patient's dated codes and CHF hospitalization dates to ``training_data``.

    Parameters
    ----------
    training_data : dict
        Mutated in place. ``training_data[patient_id]`` is (re)assigned a dict
        with the keys ``'chf_first_discharge'``, ``'chf_rehosp'`` and ``'codes'``.
    patient : mapping
        Patient bundle where ``patient['entry']`` is an indexable iterable of
        dicts, each holding a ``'resource'`` dict. Works with the DataFrame
        produced by ``pd.read_json`` as well as with a plain dict. The first
        entry's resource supplies the patient id.

    Returns
    -------
    None

    Raises
    ------
    KeyError
        If a handled resource lacks one of the mandatory fields read below
        (e.g. an Encounter without ``period`` or a Condition without
        ``onsetDateTime``).

    Notes
    -----
    * Dates are stored as the first 10 characters (``YYYY-MM-DD``) of the
      resource's timestamp string.
    * ``'codes'`` is a one-element list wrapping the list of ``[date, code]``
      pairs; consumers read it as ``codes[0]``.
    * Encounters without a ``reasonCode`` are recorded under their ``type``
      code. This now also applies to inpatient/emergency encounters, which
      were previously dropped from the code list altogether.
    * NOTE(review): ``'chf_rehosp'`` stores the *end* date of the second CHF
      hospitalization, not its start -- confirm this is the intended
      "re-hospitalization date".
    """
    patient_id = patient['entry'][0]['resource']['id']
    # List of [date, code] pairs, in bundle order
    patient_data = []
    # First discharge and re-hosp date
    patient_label = {'chf_first_discharge': None, 'chf_rehosp': None}

    for entry in patient['entry']:
        resource = entry['resource']
        resource_type = resource['resourceType']

        if resource_type == "Encounter":
            class_code = resource['class']['code']
            reason_code = _encounter_reason_code(resource)

            # Get chf hospitalization dates: only the first two CHF stays are kept
            if class_code in _HOSPITAL_CLASS_CODES and reason_code == _CHF_REASON_CODE:
                end_date = resource['period']['end'][0:10]
                if patient_label['chf_first_discharge'] is None:
                    patient_label['chf_first_discharge'] = end_date
                elif patient_label['chf_rehosp'] is None:
                    patient_label['chf_rehosp'] = end_date

            # Get codes: fall back to the encounter type when no reason is given
            start_date = resource['period']['start'][0:10]
            if reason_code is None:
                reason_code = resource['type'][0]['coding'][0]['code']
            patient_data.append([start_date, reason_code])

        elif resource_type == "Procedure":
            # Procedures carry a period rather than a single timestamp
            start_date = resource['performedPeriod']['start'][0:10]
            reason_code = resource['code']['coding'][0]['code']
            patient_data.append([start_date, reason_code])

        elif resource_type in _DATED_CODE_FIELDS:
            date_field, concept_field = _DATED_CODE_FIELDS[resource_type]
            start_date = resource[date_field][0:10]
            reason_code = resource[concept_field]['coding'][0]['code']
            patient_data.append([start_date, reason_code])

    # patient_data and patient_label are added to training_data
    training_data[patient_id] = dict(patient_label)
    training_data[patient_id]['codes'] = [patient_data]

In [3]:
# Constructs then returns training_data
def getPatientData(tfile, max_patients=1000):
    """Stream gzipped patient JSON files out of an open tar archive into a dict.

    Parameters
    ----------
    tfile : tarfile.TarFile
        Open archive whose regular-file members are gzip-compressed patient
        JSON documents.
    max_patients : int or None, optional
        Stop after this many patients have been processed. Defaults to 1000,
        the limit that used to be hard-coded; pass ``None`` to read the whole
        archive.

    Returns
    -------
    dict
        ``training_data`` as filled in by ``dataPipe`` (one key per patient id).

    Notes
    -----
    Prints the running patient count at 1, 100, 1000, 10000 and 100000 as a
    coarse progress indicator.
    """
    progress_marks = {1, 100, 1000, 10000, 100000}
    training_data = {}
    processed = 0

    for member in tfile:
        # Directory entries (such as the archive's top-level folder) carry no
        # payload -- extractfile() returns None for them -- so skip anything
        # that is not a regular file instead of assuming only the first
        # member is a directory.
        if not member.isfile():
            continue

        # Extract and decompress json
        with tfile.extractfile(member) as compressed:
            expanded = gzip.decompress(compressed.read())

        # pd.read_json expects a path or file-like object, so wrap the raw bytes
        dataPipe(training_data, pd.read_json(io.BytesIO(expanded)))

        # TarFile caches every TarInfo it has seen; empty the cache to save on RAM
        tfile.members = []

        processed += 1
        if processed in progress_marks:
            print(processed)
        if max_patients is not None and processed >= max_patients:
            break

    return training_data

In [4]:
# Converts a DataFrame of codes into monthly buckets
def frameToBuckets(df_codes, training_data):
    """Turn each patient's dated codes into a month-by-code indicator frame.

    Parameters
    ----------
    df_codes : pandas.DataFrame
        One row per patient with an ``'id'`` column and a ``'codes'`` column.
        ``row['codes'][0]`` must be the list of ``[date, code]`` pairs
        (dates as ``YYYY-MM-DD`` strings), in any order.
    training_data : dict
        Maps patient id to a dict that contains ``'chf_rehosp'``
        (``YYYY-MM-DD`` string or ``None``).

    Returns
    -------
    list of pandas.DataFrame
        One frame per row of ``df_codes``, in the same order. Row 0 is the
        most recent month that has any code inside the window, and each
        following row is one month earlier, down to the earliest month with a
        code. Months in between without codes are all-zero rows. Code columns
        hold 1.0/0.0 and are sorted in descending order. The last column,
        ``'chf_rehosp'``, is 1.0 for patients with a recorded
        re-hospitalization and 0.0 otherwise.

    Notes
    -----
    * The window ends at the re-hospitalization date, or at the patient's
      latest record date when there is none, and starts two years earlier
      (both ends inclusive).
    * The label is taken from ``'chf_rehosp'``. It used to be derived from
      ``'chf_first_discharge'``, which marked every CHF patient as
      re-hospitalized.
    * Months are bucketed with integer month numbers rather than a pandas
      frequency alias, so the result does not depend on which alias spelling
      the installed pandas accepts.
    * NOTE(review): frames are not padded to a fixed 24 rows, so row ``k`` is
      "k months before the patient's last active month", not "k months
      before the window end" -- confirm that is what the model expects.
    """
    lookback = pd.DateOffset(years=2)
    frames = []

    for _, row in df_codes.iterrows():
        patient_id = row['id']
        rehosp = training_data[patient_id]['chf_rehosp']

        events = pd.DataFrame(row['codes'][0], columns=['date', 'codes'])
        events['date'] = pd.to_datetime(events['date'])

        # Get range of re-hospitalization date (or most current date) and prior 2 years.
        # Records are not guaranteed to be in date order, so take the maximum
        # rather than the last element.
        if rehosp is not None:
            end_range = pd.to_datetime(rehosp, format='%Y-%m-%d')
        elif not events.empty:
            end_range = events['date'].max()
        else:
            end_range = None  # nothing to bucket

        buckets = []
        if end_range is not None and not events.empty:
            start_range = end_range - lookback
            in_range = events[events['date'].between(start_range, end_range)]

            # Group codes by month; a dict per month also de-duplicates codes
            month_numbers = in_range['date'].dt.year * 12 + in_range['date'].dt.month
            codes_by_month = {}
            for month, code in zip(month_numbers, in_range['codes']):
                codes_by_month.setdefault(int(month), {})[code] = 1.0

            # Newest month first, keeping empty months so the row index counts months back
            if codes_by_month:
                newest, oldest = max(codes_by_month), min(codes_by_month)
                buckets = [codes_by_month.get(month, {})
                           for month in range(newest, oldest - 1, -1)]

        # Create patient_frame
        patient_frame = pd.DataFrame(buckets).fillna(0).sort_index(ascending=False, axis=1)
        patient_frame['chf_rehosp'] = 1.0 if rehosp is not None else 0.0

        frames.append(patient_frame)

    return frames

In [5]:
# Constructs and returns training_data
def getPatientDataTemp(pattern='../data/fhir/*'):
    """Build ``training_data`` from patient JSON files already extracted to disk.

    Parameters
    ----------
    pattern : str, optional
        Glob pattern selecting the patient JSON files. Defaults to the
        location that used to be hard-coded.

    Returns
    -------
    dict
        ``training_data`` as filled in by ``dataPipe`` (one key per patient id).
    """
    training_data = {}
    # glob returns files in arbitrary filesystem order; sort so that the
    # patient order (and every positional lookup downstream) is reproducible.
    for path in sorted(glob.glob(pattern)):
        dataPipe(training_data, pd.read_json(path))
    return training_data

In [6]:
# Open the tar archive of gzipped patient files for reading.
# The handle stays open for the rest of the notebook; getPatientData streams from it.
fileName = "synthea-100k-MA-json-gz.tar"
tfile = tarfile.open(name=fileName, mode="r")

In [7]:
# Build training_data from the patient files already extracted to disk.
# To stream patients straight from the tar archive instead, use:
# training_data = getPatientData(tfile)
training_data = getPatientDataTemp()

In [8]:
# Construct DataFrames from training_data: patients start out as columns,
# so transpose to one row per patient and move the id out of the index
df_all = pd.DataFrame.from_dict(training_data)
df_all = df_all.T.reset_index().rename(columns={'index':'id'})

# Select columns by name rather than by position, so the split does not
# silently break if the key order inside training_data ever changes
LABEL_COLUMNS = ['id', 'chf_first_discharge', 'chf_rehosp']
CODE_COLUMNS = ['id', 'codes']

# Separate hospitalization label from codes
df_all_hosp = df_all[LABEL_COLUMNS]
df_all_codes = df_all[CODE_COLUMNS]

# Separate chf from general populace: keep patients with a CHF hospitalization on record
df_chf = df_all[df_all['chf_first_discharge'].notna()]
df_chf_hosp = df_chf[LABEL_COLUMNS]
df_chf_codes = df_chf[CODE_COLUMNS]

display(df_chf)


Unnamed: 0,id,chf_first_discharge,chf_rehosp,codes
0,ec3c191c-81f3-4d2f-ac27-52125c0fb43d,2013-01-24,2013-01-25,"[[[1959-08-30, 162673000], [1959-08-30, 596210..."
2,c3ac4f32-9d24-4b40-9362-6f79645c3ff7,2010-12-31,2011-01-01,"[[[1949-01-17, 75498004], [1949-01-17, 4005500..."
3,bfdc96ef-c8f0-424b-9a2d-367ebd9478bd,1964-03-15,1964-03-16,"[[[1929-10-14, 162673000], [1929-10-14, 596210..."
4,ab714f8c-4f3c-4fe4-988d-ec3e67f2ed5f,2019-07-16,2019-07-17,"[[[1959-09-22, 162673000], [1959-09-22, 157770..."


In [12]:
# Convert each CHF patient's codes into a frame of monthly buckets
# (bucket_frame is a list with one DataFrame per row of df_chf_codes)
bucket_frame = frameToBuckets(df_chf_codes, training_data)

In [10]:
# Inspect the monthly buckets of the first CHF patient (row 0 is the most recent month)
display(bucket_frame[0])


Unnamed: 0,88805009,866414,85354-9,8302-2,746030,72514-3,72166-2,6768-6,6299-2,6298-4,...,18262-6,1751-7,1742-6,1719286,162673000,140,10834-0,10509002,10230-1,chf_rehosp
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,...,0.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
6,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
7,0.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
