In [0]:
# !wget http://public.gi.ucsc.edu/~rcurrie/synthea-100k-MA-json-gz.tar

In [0]:
import glob
import tarfile, gzip
import pandas as pd
import os as os
from collections import Counter
from collections import defaultdict
from datetime import datetime
from dateutil.relativedelta import relativedelta
import time

In [0]:
# Updates a dictionary from a single patient json
def dataPipe(training_data, patient):
    if patient['entry'][0]['resource']['resourceType'] != "Patient":
      return

    id = patient['entry'][0]['resource']['id']
    # List of dates and codes
    patient_data = []
    # First discharge and re-hosp date
    patient_label = {}
    patient_label['chf_first_discharge'] = None
    patient_label['chf_rehosp'] = None
    
    for entry in patient['entry']:
        resource_type = entry['resource']['resourceType']
        
        if resource_type == "Encounter":
            class_code = entry['resource']['class']['code']
            # Get chf hospitalization dates
            if class_code == "IMP" or class_code == "EMER": # hospitalized
                try:
                    reason_code = entry['resource']['reasonCode'][0]['coding'][0]['code']
                except:
                    reason_code = entry['resource']['type'][0]['coding'][0]['code']
                start_date = entry['resource']['period']['start'][0:10]
                end_date = entry['resource']['period']['end'][0:10]
                if reason_code == "88805009":
                    if patient_label['chf_first_discharge'] is None:
                        patient_label['chf_first_discharge'] = end_date
                    #Check if rehosp date is greater than 29 days from first discharge
                    elif patient_label['chf_rehosp'] is None and (pd.date_range(patient_label['chf_first_discharge'], start_date).shape[0] > 29):
                        patient_label['chf_rehosp'] = start_date
                else:
                    patient_data.append([start_date, reason_code])
            # Get codes
            start_date = entry['resource']['period']['start'][0:10]
            try:
                reason_code = entry['resource']['reasonCode'][0]['coding'][0]['code']
            except:
                reason_code = entry['resource']['type'][0]['coding'][0]['code']
            patient_data.append([start_date, reason_code])
        
        elif resource_type == "Observation":
            start_date = entry['resource']['effectiveDateTime'][0:10]
            reason_code = entry['resource']['code']['coding'][0]['code']
            patient_data.append([start_date, reason_code])
        
        elif resource_type == "Procedure":
            start_date = entry['resource']['performedPeriod']['start'][0:10]
            reason_code = entry['resource']['code']['coding'][0]['code']
            patient_data.append([start_date, reason_code])
        
        elif resource_type == "Condition":
            start_date = entry['resource']['onsetDateTime'][0:10]
            reason_code = entry['resource']['code']['coding'][0]['code']
            patient_data.append([start_date, reason_code])
        
        elif resource_type == "Immunization":
            start_date = entry['resource']['occurrenceDateTime'][0:10]
            reason_code = entry['resource']['vaccineCode']['coding'][0]['code']
            patient_data.append([start_date, reason_code])
        
        elif resource_type == "MedicationRequest":
            start_date = entry['resource']['authoredOn'][0:10]
            reason_code = entry['resource']['medicationCodeableConcept']['coding'][0]['code']
            patient_data.append([start_date, reason_code])
    
    
    # patient_data and patient_label are added to training_data
    training_data[id] = {}
    training_data[id].update(patient_label)
    training_data[id]['codes'] = []
    training_data[id]['codes'].append(patient_data)

In [0]:
# Constructs then returns training_data
def getPatientData(tfile):
    training_data = {}
    n=0
    for member in tfile:
        #Skip directory labeled at the top
        if n == 0:
            n = 1
            continue
            
        # Extract and decompress json
        f = tfile.extractfile(member)
        content=f.read()
        try:
          expanded = gzip.decompress(content)
        except:
          continue
        # Updates training_data from patient json
        dataPipe(training_data, pd.read_json(expanded))
        # Empties list to save on RAM
        tfile.members = []

    return training_data

In [0]:
# Converts a DataFrame of codes into monthly buckets
def frameToBuckets(df_codes, training_data):
    frames = []
    for index, row in df_codes.iterrows():
        buckets = []
        id = row['id']
        
        # Get range of re-hospitalization date (or most current date) and prior 2 years
        try:
            end_range = datetime.strptime(training_data[id]['chf_rehosp'], '%Y-%m-%d').date()
        except:
            end_range = datetime.strptime(row['codes'][0][-1][0], '%Y-%m-%d').date()
        start_range = end_range - relativedelta(years=2)
        
        # Set DataFrame to range
        df = pd.DataFrame(row['codes'][0]).rename(columns={0:'date', 1:'codes'})
        df['date'] = pd.to_datetime(df['date'])
        df = df[df['date'].between(start_range, end_range)]
        df = df.set_index('date')
        
        # Group codes by month
        df = df.groupby(pd.Grouper(freq='M'))
        df = df.aggregate(lambda x: tuple(x))

        # Reverse DataFrame and drop dates so the index represents each month
        df = df.iloc[::-1].reset_index()

        # Each month's codes gets added to buckets
        for codes in df['codes']:
            code_dict = {}
            for code in codes:
                code_dict[code] = 1.0
            buckets.append(code_dict)

        # Create patient_frame
        patient_frame = pd.DataFrame.from_dict(buckets).fillna(0).sort_index(ascending=False, axis=1)
        # Append zero-filled rows so all frames match in row size
        for _ in range(25 - len(patient_frame)):
            patient_frame.loc[len(patient_frame)] = 0
        # Add re-hosp column
        if training_data[id]['chf_rehosp'] is None:
            patient_frame['chf_rehosp'] = 0.0
        else:
            patient_frame['chf_rehosp'] = 1.0

        frames.append(patient_frame)
        
    return frames

In [0]:
start_time = time.time()

# Open Files
fileName = "synthea-100k-MA-json-gz.tar"
tfile = tarfile.open(fileName)

In [0]:
training_data = getPatientData(tfile)

In [0]:
# Construct DataFrames from training_data
df_all = pd.DataFrame.from_dict(training_data)
df_all = df_all.T.reset_index().rename(columns={'index':'id'})
 
# Separate chf from general populace
df_chf = df_all[df_all.chf_first_discharge.notnull()]
df_chf_codes = df_chf.iloc[:, [0,3]]

# display(df_chf_codes)
display(df_chf)

Unnamed: 0,id,chf_first_discharge,chf_rehosp,codes
8,784f07df-663a-42f2-863a-4222b6de8ffe,2017-10-19,,"[[[1960-05-06, 185349003], [1985-03-27, 239873..."
47,5464c695-732f-46c3-8ebe-3af67248dbaf,2016-09-01,,"[[[1978-04-03, 162673000], [1978-04-03, 596210..."
101,88805406-2b6e-4fae-b8b2-4829752f0271,2017-02-22,,"[[[1981-09-23, 162673000], [1981-09-23, 157770..."
137,9a2a4280-3613-4c2b-827f-00ee5d589a4f,2005-12-30,,"[[[1941-11-27, 72892002], [1941-11-27, 1916900..."
171,4abe1ebf-68c2-4085-8d60-be76ea4b6c9d,2013-08-12,2016-07-27,"[[[1940-02-15, 410620009], [1940-02-15, 162864..."
...,...,...,...,...
114302,70a6846d-2c6b-4412-8e61-8d2e98c6e633,2006-02-10,,"[[[1912-06-03, 185347001], [1912-06-17, 185347..."
114316,07fe287e-bd25-4c63-b4fc-f35e48a4c063,2002-03-19,,"[[[1983-03-10, 162673000], [1983-03-10, 162864..."
114317,5bbff0d2-cf54-4eca-9a7c-b33131123199,1990-08-02,1991-01-29,"[[[1944-04-02, 30832001], [1944-04-02, 3083200..."
114339,143b2f0e-bd4e-4175-9979-ca17b664399f,2014-05-03,,"[[[1991-08-15, 185349003], [1991-08-15, 162864..."


In [0]:
# Convert DataFrame codes to monthly bucket
bucket_frames = frameToBuckets(df_chf_codes, training_data)

'datetime.date' is coerced to a datetime. In the future pandas will
not coerce, and a TypeError will be raised. To retain the current
behavior, convert the 'datetime.date' to a datetime with
'pd.Timestamp'.
  lmask = self >= left
'datetime.date' is coerced to a datetime. In the future pandas will
not coerce, and a TypeError will be raised. To retain the current
behavior, convert the 'datetime.date' to a datetime with
'pd.Timestamp'.
  rmask = self <= right


In [0]:
all_columns = []
for frame in bucket_frames:
    all_columns.extend(x for x in frame.columns.tolist() if not x in all_columns)
final_frames = []
for df in bucket_frames:
    cols = df.columns.tolist()
    cols.extend(x for x in all_columns if not x in cols)
    df = df.reindex(columns=sorted(cols, reverse=False), fill_value=0)
    final_frames.append(df)

In [0]:
display(final_frames[0])

Unnamed: 0,10230-1,104091002,104326007,1043400,104375008,104435004,10480-2,1049221,1049635,105078,10509002,105585,106892,108290001,10834-0,1094107,109838007,1100184,110030002,112790001,113,11466000,117010004,117015009,118001005,1190795,121,1225002,122548005,1234995,124171000119105,126906006,127013003,127783003,128613002,133,140,14768001,14959-1,15081005,...,85339-0,85343-2,85344-0,85352-3,85354-9,855332,85548006,856980,856987,857005,86013001,860975,861467,865098,866414,87433001,88039007,88040-1,8867-4,88805009,895994,896209,897718,90226004,904419,90470006,90560007,90781000119102,91602002,92691004,9279-1,93761005,94260004,95417003,966222,97331000119101,996740,997223,999967,chf_rehosp
0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,1.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0,0.0,0,...,0,0,0,0,0.0,0,0,0,0,0,0,0,0,0,0.0,0,0,0,0.0,1.0,0,0,0,0,0,0,0,0,0,0,0.0,0,0,0,0,0,0,0,0,0.0
1,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0,0.0,0,...,0,0,0,0,0.0,0,0,0,0,0,0,0,0,0,0.0,0,0,0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0.0,0,0,0,0,0,0,0,0,0.0
2,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1.0,1.0,0,1.0,0,...,0,0,0,0,1.0,0,0,0,0,0,0,0,0,0,0.0,0,0,0,1.0,0.0,0,0,0,0,0,0,0,0,0,0,1.0,0,0,0,0,0,0,0,0,0.0
3,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0,0.0,0,...,0,0,0,0,0.0,0,0,0,0,0,0,0,0,0,0.0,0,0,0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0.0,0,0,0,0,0,0,0,0,0.0
4,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0,0.0,0,...,0,0,0,0,0.0,0,0,0,0,0,0,0,0,0,0.0,0,0,0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0.0,0,0,0,0,0,0,0,0,0.0
5,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0,0.0,0,...,0,0,0,0,0.0,0,0,0,0,0,0,0,0,0,0.0,0,0,0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0.0,0,0,0,0,0,0,0,0,0.0
6,1.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0,0.0,0,...,0,0,0,0,0.0,0,0,0,0,0,0,0,0,0,1.0,0,0,0,0.0,1.0,0,0,0,0,0,0,0,0,0,0,0.0,0,0,0,0,0,0,0,0,0.0
7,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0,0.0,0,...,0,0,0,0,0.0,0,0,0,0,0,0,0,0,0,0.0,0,0,0,0.0,1.0,0,0,0,0,0,0,0,0,0,0,0.0,0,0,0,0,0,0,0,0,0.0
8,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0,0.0,0,...,0,0,0,0,0.0,0,0,0,0,0,0,0,0,0,0.0,0,0,0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0.0,0,0,0,0,0,0,0,0,0.0
9,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0,0.0,0,...,0,0,0,0,0.0,0,0,0,0,0,0,0,0,0,0.0,0,0,0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0.0,0,0,0,0,0,0,0,0,0.0


In [0]:
# # Export csv's
for i in range(len(final_frames)):
    final_frames[i].to_csv('csv/patient' + str(i) + '.csv')

In [0]:
print("--- %s seconds ---" % (time.time() - start_time))

--- 4970.646319389343 seconds ---


In [0]:
!zip -r /content/csv.zip /content/csv

  adding: content/csv/ (stored 0%)
  adding: content/csv/patient1752.csv (deflated 91%)
  adding: content/csv/patient839.csv (deflated 92%)
  adding: content/csv/patient1364.csv (deflated 92%)
  adding: content/csv/patient3106.csv (deflated 92%)
  adding: content/csv/patient1309.csv (deflated 92%)
  adding: content/csv/patient2012.csv (deflated 92%)
  adding: content/csv/patient2647.csv (deflated 91%)
  adding: content/csv/patient4063.csv (deflated 92%)
  adding: content/csv/patient363.csv (deflated 91%)
  adding: content/csv/patient3651.csv (deflated 92%)
  adding: content/csv/patient4688.csv (deflated 92%)
  adding: content/csv/patient34.csv (deflated 92%)
  adding: content/csv/patient2475.csv (deflated 91%)
  adding: content/csv/patient3546.csv (deflated 91%)
  adding: content/csv/patient4746.csv (deflated 91%)
  adding: content/csv/patient3884.csv (deflated 91%)
  adding: content/csv/patient3060.csv (deflated 92%)
  adding: content/csv/patient414.csv (deflated 91%)
  adding: conten

In [0]:
#!rm csv/*