In [0]:
import glob
import json
import os as os
import shutil
import tarfile
import zipfile
from datetime import datetime
from itertools import chain

import numpy as np
import pandas as pd
from dateutil.relativedelta import relativedelta
from gensim.models import Word2Vec

# Notebook-wide pandas display settings: no cap on displayed rows, at most 16 columns.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', 16)

In [0]:
class CHFDataset:
    """Collects each patient's coded events up to the first congestive-heart-failure encounter."""

    def __init__(self):
        # Replaced by an Embeddings instance once set_dataset() has run.
        self.emb = None

    def set_dataset(self, path, sample_size=None, additional_path=None):
        """Parse the patient archives and store the result in ``self.emb``.

        For every patient yielded by ``ExtractJsonFile`` a record
        ``{'codes': [[start_date, code], ...], 'chf': start_date or None}`` is built.
        Events are appended in iteration order until the first bundle whose
        encounter type is IMP, EMER or AMB and whose code is the CHF diagnosis;
        that bundle's start date becomes the label and the remaining events of
        the patient are not read.

        Args:
            path: forwarded to ``ExtractJsonFile`` as the primary archive.
            sample_size: forwarded to ``ExtractJsonFile``.
            additional_path: forwarded to ``ExtractJsonFile``.
        """
        onset_code = "Chronic congestive heart failure (disorder)"
        onset_encounter_types = ("IMP", "EMER", "AMB")
        patients = {}

        for json_patient in ExtractJsonFile(path, sample_size, additional_path):
            patient_id = json_patient['entry'][0]['resource']['id']
            record = {'codes': [], 'chf': None}
            patients[patient_id] = record

            for bundle in ExtractEntry(json_patient):
                is_onset = (bundle.get('encounter_type') in onset_encounter_types
                            and bundle['code'] == onset_code)
                if is_onset:
                    # The label is still unset here: it is only ever assigned
                    # immediately before leaving the loop.
                    record['chf'] = bundle['start_date']
                    break

                record['codes'].append([bundle['start_date'], bundle['code']])

        self.emb = Embeddings(patients)

In [0]:
class MYOINFDataset:
    """Collects each patient's coded events up to the first myocardial-infarction encounter."""

    def __init__(self):
        # Replaced by an Embeddings instance once set_dataset() has run.
        self.emb = None

    def set_dataset(self, path, sample_size=None, additional_path=None):
        """Parse the patient archives and store the result in ``self.emb``.

        For every patient yielded by ``ExtractJsonFile`` a record
        ``{'codes': [[start_date, code], ...], 'myoinf': start_date or None}`` is
        built. Events are appended in iteration order until the first bundle
        whose encounter type is IMP, EMER or AMB and whose code is
        "Myocardial Infarction"; that bundle's start date becomes the label and
        the remaining events of the patient are not read.

        Args:
            path: forwarded to ``ExtractJsonFile`` as the primary archive.
            sample_size: forwarded to ``ExtractJsonFile``.
            additional_path: forwarded to ``ExtractJsonFile``.
        """
        onset_code = "Myocardial Infarction"
        onset_encounter_types = ("IMP", "EMER", "AMB")
        patients = {}

        for json_patient in ExtractJsonFile(path, sample_size, additional_path):
            patient_id = json_patient['entry'][0]['resource']['id']
            record = {'codes': [], 'myoinf': None}
            patients[patient_id] = record

            for bundle in ExtractEntry(json_patient):
                is_onset = (bundle.get('encounter_type') in onset_encounter_types
                            and bundle['code'] == onset_code)
                if is_onset:
                    # The label is still unset here: it is only ever assigned
                    # immediately before leaving the loop.
                    record['myoinf'] = bundle['start_date']
                    break

                record['codes'].append([bundle['start_date'], bundle['code']])

        self.emb = Embeddings(patients)

In [0]:
class ExtractJsonFile:
    """Iterable over the JSON documents stored in one or two ``.tar.gz`` archives.

    Each archive member is parsed with ``pandas.read_json`` and yielded as a
    DataFrame. Members of ``path`` come first (at most ``sample_size`` of them
    when it is set); if ``additional_path`` is given, the same number of
    members is then read from that archive.
    """

    def __init__(self, path, sample_size=None, additional_path=None):
        """
        Args:
            path: primary gzip-compressed tar archive.
            sample_size: stop after this many documents from ``path``;
                ``None`` reads the whole archive.
            additional_path: optional second archive, capped at the number of
                documents actually read from ``path``.
        """
        self.path = path
        self.sample_size = sample_size
        self.additional_path = additional_path

    def __iter__(self):
        primary_count = 0
        for frame in self._read_archive(self.path, self.sample_size):
            primary_count += 1
            yield frame

        # Read an equal number of files to that of the target path. When the
        # primary archive produced nothing, "equal" means zero: without this
        # guard the cap would never be reached and the whole additional
        # archive would be read.
        if self.additional_path is None or primary_count == 0:
            return

        yield from self._read_archive(self.additional_path, primary_count,
                                      progress_offset=primary_count)

    @staticmethod
    def _read_archive(archive_path, limit, progress_offset=0):
        """Yield one DataFrame per readable member of ``archive_path``, stopping after ``limit``.

        Progress is printed each time the running count reaches a power of two;
        ``progress_offset`` is added to the printed number.
        """
        processed, next_report = 0, 1

        with tarfile.open(archive_path, "r:gz") as archive:
            for member in archive:
                if member.isdir():
                    continue

                payload = archive.extractfile(member)
                if payload is None:
                    # Not a regular file or link (e.g. FIFO/device entry): nothing to parse.
                    continue
                with payload:
                    frame = pd.read_json(payload)
                yield frame

                processed += 1
                if processed == next_report:
                    print("Processed " + str(next_report + progress_offset) + " files")
                    next_report *= 2
                if processed == limit:
                    break
                    

In [0]:
class ExtractEntry:
    """Iterable over the coded events found in one patient's ``entry`` list.

    Only Encounter, Procedure, Condition and CarePlan resources produce
    events; every other resource type is skipped. Each event is a dict with
    the keys ``start_date``, ``end_date``, ``code`` and ``encounter_type``.
    ``end_date`` and ``encounter_type`` are only set for Encounter resources
    and are ``None`` otherwise. Dates are the first 10 characters of the
    source timestamp string.
    """

    def __init__(self, json_patient):
        """
        Args:
            json_patient: mapping-like object whose ``'entry'`` item is an
                iterable of ``{'resource': {...}}`` dicts.
        """
        self.json_patient = json_patient
        # Display names that are never yielded as events.
        self.filter = {
            "Allergic disorder initial assessment", "Encounter for 'check-up'", "Encounter for check up (procedure)",
            "Encounter for symptom", "Encounter for problem", "Encounter for problem (procedure)", "Emergency Encounter",
            "Emergency room admission (procedure)", "General examination of patient (procedure)", "Outpatient procedure",
            "Urgent care clinic (procedure)", "Well child visit (procedure)", "Medication Reconciliation (procedure)",
        }

    def __iter__(self):
        for entry in self.json_patient['entry']:
            resource = entry['resource']
            resource_type = resource['resourceType']

            codes = []
            # Reset per entry so a value can never leak in from a previous resource.
            start_date, end_date, encounter_type = None, None, None

            if resource_type == "Encounter":
                period = resource['period']
                start_date = period['start'][0:10]
                end_date = period['end'][0:10]
                encounter_type = resource['class']['code']
                try:
                    codes.append(resource['reasonCode'][0]['coding'][0]['display'])
                except (KeyError, IndexError, TypeError):
                    # No usable reason recorded: fall back to the encounter type's display name.
                    codes.append(resource['type'][0]['coding'][0]['display'])

            elif resource_type == "Procedure":
                start_date = resource['performedPeriod']['start'][0:10]
                codes.append(resource['code']['coding'][0]['display'])

            elif resource_type == "Condition":
                start_date = resource['onsetDateTime'][0:10]
                codes.append(resource['code']['coding'][0]['display'])

            elif resource_type == "CarePlan":
                start_date = resource['period']['start'][0:10]
                try:
                    activity = resource['activity']
                except KeyError:
                    # A care plan without activities contributes no events.
                    continue
                for plan in activity:
                    codes.append(plan['detail']['code']['coding'][0]['display'])

            for code in codes:
                if code not in self.filter:
                    yield {'start_date': start_date, 'end_date': end_date, 'code': code, 'encounter_type': encounter_type}
                

In [0]:
class Embeddings:
    """Builds monthly one-hot and word-embedding features from a patient event dictionary.

    ``dataset`` maps a patient id to a dict holding ``'codes'`` (a list of
    ``[date_string, code]`` pairs) and a label entry (a ``'%Y-%m-%d'`` date
    string, or ``None``). Results are stored on the instance:

    * ``onehot``  - list of DataFrames, one per patient, one row per month.
    * ``wordemb`` - list of 2-element object arrays ``[monthly_vectors, label]``.
    """

    def __init__(self, dataset):
        self.dataset = dataset
        self.onehot = None
        self.wordemb = None

    def generate_onehot(self, label, window_range, predict_range, step_size=0):
        """Populate ``self.onehot`` with one indicator DataFrame per patient.

        Args:
            label: key of the label entry inside each patient's record.
            window_range: number of months in the observation window.
            predict_range: months between the end of the window and the anchor date.
            step_size: extra months added to ``predict_range``.
        """
        df_dataset = self._dataset_frame()
        offset = step_size + predict_range

        self.onehot = []
        for monthly, patient_id in self.get_df_from_range(df_dataset, label, window_range, offset):
            # One dict per month: {code: 1} for every code seen in that month.
            indicator_rows = [dict.fromkeys(codes, 1) for codes in monthly['codes']]
            indicators = pd.DataFrame.from_dict(indicator_rows).fillna(0)

            monthly = monthly.join(indicators).drop(columns=['codes'])
            monthly[label] = self._label_value(patient_id, label)
            self.onehot.append(monthly)

        self.normalize(label)

    def generate_wordemb(self, label, window_range, predict_range, step_size=0):
        """Populate ``self.wordemb`` with averaged Word2Vec vectors per patient and month.

        A skip-gram Word2Vec model is trained on one "sentence" per patient
        (all codes inside the window, in month order). Each month is then
        represented by the mean of its codes' vectors; a month without codes
        is represented by an empty tuple.

        Arguments have the same meaning as in ``generate_onehot``.
        """
        df_dataset = self._dataset_frame()
        offset = step_size + predict_range

        patient_frames, sentences = [], []
        for monthly, patient_id in self.get_df_from_range(df_dataset, label, window_range, offset):
            sentences.append(list(chain.from_iterable(monthly["codes"])))
            monthly[label] = self._label_value(patient_id, label)
            patient_frames.append(monthly)

        self.wordemb = []
        if not patient_frames:
            # Nothing to train on; Word2Vec cannot build a vocabulary from no sentences.
            return

        # NOTE(review): `size=` is the gensim 3.x keyword (4.x renamed it to
        # `vector_size`) - confirm against the installed gensim version.
        model = Word2Vec(sentences, size=100, window=2, min_count=1, workers=2, sg=1)
        # Look vectors up through `.wv`; indexing the model object directly is deprecated.
        vectors = model.wv

        # Average each month's embeddings per patient
        for monthly in patient_frames:
            monthly_vectors = []
            for codes in monthly['codes']:
                total = None
                for code in codes:
                    vector = np.array(vectors[code])
                    total = vector if total is None else np.add(total, vector)

                if total is not None:
                    monthly_vectors.append(np.divide(total, len(codes)))
                else:
                    # TODO: append a zero vector of the embedding size instead of an empty tuple
                    monthly_vectors.append(())

            self.wordemb.append(self._pack_sample(monthly_vectors, monthly[label].iloc[0]))

    def get_df_from_range(self, df, label, window_range, offset):
        """Yield ``(monthly_df, patient_id)`` for every patient with events inside the window.

        The window ends ``offset`` months before the anchor date (the label
        date when it parses, otherwise the date of the patient's last event)
        and spans ``window_range`` months. Events inside the window are grouped
        by month end into tuples of codes; months without events hold ``()``.
        Patients without a usable anchor date or without events in the window
        are skipped.

        Args:
            df: frame with an ``'id'`` and a ``'codes'`` column, one row per patient.
            label: key of the label entry inside each patient's record.
            window_range: number of months (rows) to keep.
            offset: months between the anchor date and the end of the window.
        """
        # Offset object instead of the string alias 'M', which newer pandas deprecates.
        month_end = pd.offsets.MonthEnd()

        # yields one patient's df at a time
        for _, row in df.iterrows():
            patient_id = row['id']
            events = row['codes']

            end_range = self._anchor_date(self.dataset[patient_id][label], events)
            if end_range is None:
                continue

            end_range = end_range - relativedelta(months=offset)
            start_range = end_range - relativedelta(months=window_range)

            # Set DataFrame to range
            events_df = pd.DataFrame(events).rename(columns={0: 'date', 1: 'codes'})
            try:
                events_df['date'] = pd.to_datetime(events_df['date'])
            except (KeyError, ValueError, TypeError):
                # No events at all (no 'date' column) or an unparseable date: skip the patient.
                continue
            in_window = events_df['date'].between(start_range, end_range)
            events_df = events_df[in_window].set_index('date')

            if len(events_df) == 0:
                continue

            # Group codes by month
            monthly = events_df.groupby(pd.Grouper(freq=month_end)).aggregate(lambda x: tuple(x))

            # Fill in missing months
            window_months = pd.date_range(start_range + relativedelta(months=1),
                                          end_range + relativedelta(months=1),
                                          freq=month_end)
            monthly = monthly.reindex(window_months)
            monthly['codes'] = monthly['codes'].apply(lambda x: x if pd.notnull(x) else ())

            # remove excess months: keep only the last `window_range` rows
            if len(monthly) > window_range:
                monthly = monthly.iloc[len(monthly) - window_range:]

            yield monthly.reset_index(drop=True), patient_id

    def normalize(self, training_label):
        """Give every frame in ``self.onehot`` the same columns.

        Columns are the sorted union of all frames' columns with
        ``training_label`` moved to the end; columns a frame did not have are
        filled with 0. ``self.onehot`` is replaced by the new frames.

        Raises:
            KeyError: if there are frames but none contains ``training_label``.
        """
        # dict keys give an order-preserving, constant-time "seen" set.
        seen = {}
        for frame in self.onehot:
            for column in frame.columns.tolist():
                seen.setdefault(column, None)

        if self.onehot and training_label not in seen:
            raise KeyError(training_label)

        ordered = sorted(column for column in seen if column != training_label)
        ordered.append(training_label)

        self.onehot = [frame.reindex(columns=ordered, fill_value=0) for frame in self.onehot]

    def shuffleColumns(self, dfs, training_label, num_shuffled):
        """Return ``num_shuffled`` copies of ``dfs`` with randomly permuted feature columns.

        For each copy the columns of the first frame are shuffled with
        ``np.random.shuffle``, ``training_label`` is moved back to the end, and
        every frame of the copy is reindexed to that column order.

        Args:
            dfs: non-empty list of DataFrames sharing the same columns.
            training_label: name of the label column, kept last.
            num_shuffled: number of shuffled copies to produce.

        Returns:
            list of lists of DataFrames.
        """
        dfs_shuffled = []
        for _ in range(num_shuffled):
            df_shuffled = dfs.copy()
            # makes column labels the first row. (numpy only works with numbered columns so this preserves our label names)
            df_shuffled[0] = pd.DataFrame(np.vstack([df_shuffled[0].columns, df_shuffled[0]]))
            # randomize columns using numpy
            arr = df_shuffled[0].to_numpy()
            np.random.shuffle(arr.T)
            # convert back to pandas dataframe
            df_shuffled[0] = pd.DataFrame(arr)
            df_shuffled[0].columns = df_shuffled[0].iloc[0]
            df_shuffled[0] = df_shuffled[0].drop(df_shuffled[0].index[0]).reset_index(drop=True)
            # move training_label to end of dataframe
            col = df_shuffled[0][training_label]
            df_shuffled[0].drop(labels=[training_label], axis=1, inplace=True)
            df_shuffled[0][training_label] = col
            # reindex all dfs on df_shuffled[0]
            for i in range(len(df_shuffled)):
                df_shuffled[i] = df_shuffled[i].reindex(df_shuffled[0].columns, axis=1)

            dfs_shuffled.append(df_shuffled)

        return dfs_shuffled

    def _dataset_frame(self):
        """Return ``self.dataset`` as a frame with one row per patient and an ``'id'`` column."""
        return pd.DataFrame.from_dict(self.dataset).T.reset_index().rename(columns={'index': 'id'})

    def _label_value(self, patient_id, label):
        """Return 1 when the patient's label entry is set, else 0."""
        return 1 if self.dataset[patient_id][label] is not None else 0

    @staticmethod
    def _parse_day(text):
        """Parse a ``'%Y-%m-%d'`` string into a pandas Timestamp."""
        return pd.to_datetime(datetime.strptime(text, '%Y-%m-%d').date())

    @staticmethod
    def _anchor_date(label_date, events):
        """Return the label date, else the last event's date, else ``None``."""
        try:  # via label
            return Embeddings._parse_day(label_date)
        except (TypeError, ValueError):
            pass
        try:  # via latest date
            return Embeddings._parse_day(events[-1][0])
        except (IndexError, KeyError, TypeError, ValueError):
            return None

    @staticmethod
    def _pack_sample(monthly_vectors, label_value):
        """Bundle a patient's monthly vectors and label into a 2-element object array.

        Built explicitly because the two elements have different shapes, and
        implicit creation of such ragged arrays is rejected by newer NumPy.
        """
        packed = np.empty(2, dtype=object)
        packed[0] = monthly_vectors
        packed[1] = label_value
        return packed
  

In [0]:
class OSUtil:
    """Filesystem helpers for exporting, re-importing and archiving generated features.

    Implemented with the Python standard library rather than shell commands,
    so paths containing spaces or shell metacharacters are handled safely.
    """

    @staticmethod
    def import_onehot_csv(path):
        """Load every readable file directly inside ``path`` as a DataFrame.

        Files are read in sorted name order with ``'Unnamed: 0'`` as the index
        column. A file that cannot be read is reported with ``print`` and skipped.

        Returns:
            list of DataFrames.
        """
        dfs_onehot = []
        for file in sorted(glob.glob(path + "/*")):
            try:
                dfs_onehot.append(pd.read_csv(file, index_col='Unnamed: 0'))
            except Exception as e:
                print(e)
                continue

        return dfs_onehot

    @staticmethod
    def export_csv(path, dfs, overwrite=False):
        """Write ``dfs[i]`` to ``<path>/patient<i>.csv``.

        Args:
            path: output directory, created (with parents) when missing.
            dfs: sequence of DataFrames.
            overwrite: when True, delete existing ``*.csv`` files in ``path`` first.
        """
        OSUtil._prepare_dir(path, overwrite, "*.csv")
        for i in range(len(dfs)):
            dfs[i].to_csv(path + '/patient' + str(i) + '.csv')

    @staticmethod
    def export_npy(path, nps, overwrite=False):
        """Save ``nps[i]`` with ``np.save`` as ``<path>/matrix<i>.npy``.

        Args:
            path: output directory, created (with parents) when missing.
            nps: sequence of arrays.
            overwrite: when True, delete existing ``*.npy`` files in ``path`` first.
        """
        # np.save appends ".npy", so those are the stale files to clear.
        OSUtil._prepare_dir(path, overwrite, "*.npy")
        for i in range(len(nps)):
            np.save(path + '/matrix' + str(i), nps[i])

    @staticmethod
    def zip_folder(input_path, output_path):
        """Create the zip archive ``output_path`` from ``input_path``.

        ``.zip`` is appended to ``output_path`` when it does not already end
        with it. Files are stored under the path they were given with (leading
        separators removed by ``zipfile``). An existing archive is replaced.

        Raises:
            FileNotFoundError: if ``input_path`` does not exist.
        """
        if not output_path.endswith(".zip"):
            output_path = output_path + ".zip"
        if not os.path.exists(input_path):
            raise FileNotFoundError(input_path)

        archive_abs = os.path.abspath(output_path)
        with zipfile.ZipFile(output_path, "w", zipfile.ZIP_DEFLATED) as archive:
            if os.path.isfile(input_path):
                archive.write(input_path)
                return
            for root, _dirs, files in os.walk(input_path):
                for name in sorted(files):
                    member = os.path.join(root, name)
                    # Never add the archive to itself when it lives inside input_path.
                    if os.path.abspath(member) == archive_abs:
                        continue
                    archive.write(member)

    @staticmethod
    def unzip_folder(input_path, output_path="."):
        """Extract the zip archive ``input_path`` into the directory ``output_path``.

        ``.zip`` is appended to ``input_path`` when it does not already end with it.
        """
        if not input_path.endswith(".zip"):
            input_path = input_path + ".zip"
        with zipfile.ZipFile(input_path) as archive:
            archive.extractall(output_path)

    @staticmethod
    def delete_folder(path):
        """Remove ``path`` recursively; a missing path is silently ignored."""
        if os.path.isdir(path) and not os.path.islink(path):
            shutil.rmtree(path)
        elif os.path.lexists(path):
            os.remove(path)

    @staticmethod
    def _prepare_dir(path, overwrite, pattern):
        """Ensure ``path`` exists and, when ``overwrite`` is set, delete files matching ``pattern``."""
        os.makedirs(path, exist_ok=True)
        if overwrite:
            for stale in glob.glob(os.path.join(glob.escape(path), pattern)):
                os.remove(stale)

In [0]:
def sum_codes(onehot, label):
    """Total every column of the one-hot frames, separately for each label class.

    A frame belongs to the "0" class when the first value of its ``label``
    column equals 0 and to the "1" class otherwise. The number of frames in
    each class is printed.

    Args:
        onehot: iterable of DataFrames that all contain the column ``label``.
        label: name of the label column.

    Returns:
        ``(zero_totals, one_totals)``: two Series of column sums sorted in
        descending order. A class without frames yields an empty Series.
    """
    df_no_label, df_label = [], []
    for df in onehot:
        # Positional access: the first row need not carry the index label 0.
        if df[label].iloc[0] == 0:
            df_no_label.append(df)
        else:
            df_label.append(df)

    def _column_totals(frames):
        # Add up the per-frame column sums; a column missing from one frame counts as 0.
        if not frames:
            return pd.Series(dtype=float)
        totals = frames[0].sum()
        for frame in frames[1:]:
            totals = totals.add(frame.sum(), fill_value=0)
        return totals

    sum_zero_label = _column_totals(df_no_label)
    sum_one_label = _column_totals(df_label)

    print("0 labels:", len(df_no_label), "\n1 labels:", len(df_label))
    return sum_zero_label.sort_values(ascending=False), sum_one_label.sort_values(ascending=False)


### Congestive Heart Failure: pre-processing & embedding generation

In [0]:
# Empty CHF dataset holder; chf.emb stays None until set_dataset() is called.
chf = CHFDataset()

In [0]:
# Parse at most 1000 patient files from the archive and wrap them in chf.emb.
# NOTE(review): the archive path is hard-coded and absolute - adjust it for your environment.
chf.set_dataset("/content/drive/Shared drives/Anthem Project/Data/synthea-data-general.tar.gz", sample_size=1000)

In [0]:
# Monthly one-hot frames per patient: a 24-month window that ends 1 month before the anchor date.
chf.emb.generate_onehot(label="chf", window_range=24, predict_range=1)

In [0]:
# Same window as the one-hot features, but each month becomes the average Word2Vec vector of its codes.
chf.emb.generate_wordemb(label="chf", window_range=24, predict_range=1)

In [193]:
# Inspect the one-hot frame stored at index 1 (one row per month, label column last).
display(chf.emb.onehot[1])

Unnamed: 0,Acute bacterial sinusitis (disorder),Acute bronchitis (disorder),Acute viral pharyngitis (disorder),Administration of intravenous fluids,Admission to orthopedic department,Admission to surgical department,Alcohol-free diet,Alcoholism counseling,...,"Vaccination for diphtheria, pertussis, and tetanus",Vasectomy,Viral sinusitis (disorder),Whiplash injury to neck,X-ray or wrist,low salt diet education,physical exercise,chf
0,0,0.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0
1,0,0.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0
2,0,0.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0
3,0,0.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0
4,0,0.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0
5,0,0.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0
6,0,0.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0
7,0,0.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0
8,0,0.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0
9,0,0.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0


In [190]:
# Inspect the word-embedding sample stored at index 1: an object array [monthly vectors, label].
display(chf.emb.wordemb[1])

array([list([array([-0.97376156,  0.42644697, -0.18664764,  0.44022125,  0.99464387,
        0.66931194,  0.9994124 ,  0.5218683 ,  0.18924333,  0.920966  ,
       -0.15240845,  0.6238289 , -1.2213124 ,  0.27423763, -0.14442196,
        0.18469092,  0.5671648 , -1.1593103 , -0.3935447 , -0.16390122,
       -0.09582812,  0.73851573,  0.69044405, -0.24226102, -0.08175816,
        0.59284395, -1.0286157 ,  1.0107305 ,  0.50056183,  0.33053395,
        0.56174576, -0.324557  , -0.17795071, -0.34581536, -0.10981272,
        0.12090482,  0.30611783,  1.1266242 ,  0.09032709, -0.27274016,
       -0.39443216,  0.07478089,  0.48144168, -1.2242945 , -0.7228542 ,
       -0.40861362,  0.25929478, -0.7137217 , -0.1328745 , -0.6278158 ,
        0.74507195, -0.34576586,  0.59866333, -0.32803553, -0.09299324,
       -0.10727149, -0.8427776 ,  0.26417014, -0.02304317,  0.30485347,
       -1.0091257 ,  0.53707075, -0.26012805, -0.2977462 , -0.46183568,
        0.87602246,  0.20717111, -0.5111076 ,  0.94

In [183]:
# Per-code totals for patients without (first Series) and with (second Series) the CHF label.
sum_codes(chf.emb.onehot, 'chf')

0 labels: 710 
1 labels: 129


(Combined chemotherapy and radiation therapy (procedure)                             660.0
 Normal pregnancy                                                                    386.0
 Auscultation of the fetal heart                                                     363.0
 Evaluation of uterine fundal height                                                 363.0
 Hyperlipidemia                                                                      293.0
 Electrical cardioversion                                                            278.0
 Viral sinusitis (disorder)                                                          192.0
 Renal dialysis (procedure)                                                          173.0
 Primary small cell malignant neoplasm of lung, TNM stage 1 (disorder)               164.0
 Hearing examination (procedure)                                                     159.0
 High resolution computed tomography of chest without contrast (procedure)           159.0

### Myocardial Infarction: pre-processing & embedding generation



In [0]:
# Empty myocardial-infarction dataset holder; myoinf.emb stays None until set_dataset() is called.
myoinf = MYOINFDataset()

In [0]:
# Parse at most 1000 patient files from the same archive used for the CHF dataset.
# NOTE(review): the archive path is hard-coded and absolute - adjust it for your environment.
myoinf.set_dataset("/content/drive/Shared drives/Anthem Project/Data/synthea-data-general.tar.gz", sample_size=1000)

In [0]:
# Monthly one-hot frames per patient: a 24-month window that ends 1 month before the anchor date.
myoinf.emb.generate_onehot(label="myoinf", window_range=24, predict_range=1)

In [0]:
# Same window as the one-hot features, but each month becomes the average Word2Vec vector of its codes.
myoinf.emb.generate_wordemb(label="myoinf", window_range=24, predict_range=1)

### Export embeddings and compress

In [0]:
# One file per patient: one-hot frames as CSV, word-embedding samples via np.save.
# Note that the "*-wordemb" folders carry a "csv-" prefix even though export_npy writes NumPy files.
OSUtil.export_csv("csv-chf-onehot", chf.emb.onehot)
OSUtil.export_npy("csv-chf-wordemb", chf.emb.wordemb)

OSUtil.export_csv("csv-myoinf-onehot", myoinf.emb.onehot)
OSUtil.export_npy("csv-myoinf-wordemb", myoinf.emb.wordemb)

In [0]:
# Compress each export folder into "<folder>.zip" (zip_folder appends the suffix).
OSUtil.zip_folder('csv-chf-onehot', 'csv-chf-onehot')
OSUtil.zip_folder('csv-chf-wordemb', 'csv-chf-wordemb')

OSUtil.zip_folder('csv-myoinf-onehot', 'csv-myoinf-onehot')
OSUtil.zip_folder('csv-myoinf-wordemb', 'csv-myoinf-wordemb')