In [11]:
# Clear all user-defined names from the interactive namespace; -f skips the confirmation prompt.
%reset -f
from IPython.core.display import display, HTML
# Inject CSS so the notebook container uses 95% of the browser width.
display(HTML("<style>.container { width:95% !important; }</style>"))

In [12]:
# %load main.py

In [17]:
#%%writefile main.py

import numpy as np
import pandas as pd
import pickle
import os
import json
import sklearn
from sklearn import metrics

def saveobj(datain,namein='output'):
    """
    Purpose:
        Pickle an object and write it to disk
    Input:
        datain: Object to serialize
        namein: Destination path, used verbatim (no extension is appended)
    Output:
        None
    Comments:
        The highest pickle protocol available to the running interpreter is used
    """
    with open(namein, 'wb') as handle:
        pickler = pickle.Pickler(handle, protocol=pickle.HIGHEST_PROTOCOL)
        pickler.dump(datain)

def loadobj(namein):
    """
    Purpose:
        Read a pickled object back from disk
    Input:
        namein: Path of the pickle file; relative paths are resolved against the
                directory that contains this module
    Output:
        The deserialized object
    Comments:
        Unpickling can execute arbitrary code, so only load files from trusted sources
    """
    target = os.path.join(os.path.dirname(__file__), namein)

    with open(target, 'rb') as handle:
        return pickle.Unpickler(handle).load()

def savejson(datain,namein='output'):
    """
    Purpose:
        Serialize an object as JSON and write it to disk
    Input:
        datain: JSON-serializable object
        namein: Destination path WITHOUT extension; '.json' is appended
    Output:
        None
    """
    destination = namein+'.json'
    with open(destination, 'w') as handle:
        # stream the encoded chunks straight to the file
        for chunk in json.JSONEncoder().iterencode(datain):
            handle.write(chunk)

def loadjson(namein):
    """
    Purpose:
        Read a JSON file from disk
    Input:
        namein: Path of the file WITHOUT extension; '.json' is appended
    Output:
        The decoded JSON content
    """
    source = namein+'.json'
    with open(source, 'r') as handle:
        text = handle.read()
    return json.loads(text)

def getmappingfile():
    """
    Purpose:
        Load the bidirectional icd mapping table (icd9 to icd10 and icd10 to icd9 combined) as a pandas dataframe
        Mapping is based on Center for Medicare and Medicaid Services General Equivalence Mappings version 2018 [https://www.cms.gov/Medicare/Coding/ICD10/2018-ICD-10-CM-and-GEMs.html]
    Input:
        None
    Output:
        Dataframe unpickled from 'mapdf.pickle', which must sit next to this module
    Comments:
        A diagnosis code may be mapped to none, one, or multiple diagnosis codes
        Columns for mapping file are as follows:
            source: source icd code
            target: converted icd code
            flag: GEM flag (Approximate, No Map, Combination, Scenario, Choice List)
            sourcetype: icd9 or icd10
            targettype: icd10 or icd9
    """
    module_dir = os.path.dirname(__file__)
    pickle_path = os.path.join(module_dir, 'mapdf.pickle')
    mapping = pd.read_pickle(pickle_path)
    return mapping

def getelixhauser_comorbidities():
    """
    Purpose:
        Load the Elixhauser comorbidity definitions.
        Code lists follow Quan et al (2005):
            Quan H, Sundararajan V, Halfon P, Fong A, Burnand B, Luthi JC, Saunders LD, Beck CA, Feasby TE, Ghali WA.
            Coding algorithms for defining comorbidities in ICD-9-CM and ICD-10 administrative data.
            Medical care. 2005 Nov 1:1130-9.
        Scores follow van Walraven et al (2009):
            van Walraven C, Austin PC, Jennings A, Quan H, Forster AJ.
            A modification of the Elixhauser comorbidity measures into a point system for hospital death using administrative data.
            Medical care. 2009 Jun 1:626-33.
    Input:
        N/A
    Output:
        Dictionary read from 'elixhauser_comorbidities.json'; for each comorbidity it holds
        the icd9 codes, the icd10 codes, and the comorbidity index score
    Comments:
        The file name is passed to loadjson as-is, i.e. it is not resolved against this module's directory
    """
    comorbidities = loadjson('elixhauser_comorbidities')
    return comorbidities

def checkcols(datain,idcolumn,icdcolumn):
    """
    Purpose:
        Verify that both the id column and the icd column exist in the dataframe
    Input:
        datain: diagnosis code dataframe in long format
        idcolumn: name of the patient/encounter identifier column
        icdcolumn: name of the icd code column
    Output:
        True when both columns are present, False otherwise
    Comments:
        The icd column is checked before the id column; the first problem found is
        printed and ends the check
    """
    for label, column in (('icd', icdcolumn), ('id', idcolumn)):
        if column is None:
            print('please specify the ' + label + ' column')
            return False
        if column not in list(datain.columns):
            print('column ['+str(column)+'] not found in columns: '+str(datain.columns))
            return False
    return True

def cleanicd(dfin,icdcolumn):
    """
    Purpose:
        Strip whitespace and periods from icd codes
    Input:
        dfin: diagnosis code dataframe in long format (the icd column is overwritten in place)
        icdcolumn: name of the icd code column
    Output:
        The same dataframe with the cleaned icd code column
    Comments:
        Assuming checkcols has already been performed prior to running this function
        Values are cast to str first, so missing values become the literal strings 'nan'/'None'
    """
    # One explicit regex pass: '.' is matched literally inside the character class and
    # every whitespace character (not just ' ') is removed. Being explicit about
    # regex=True avoids relying on the pandas-version-dependent default.
    dfin[icdcolumn] = dfin[icdcolumn].astype(str).str.replace(r'[.\s]', '', regex=True)
    return dfin

def icdconv(datain, idcolumn=None, icdcolumn=None, typecol='ICDtype'):
    """
    Purpose:
        Convert icd9 to icd10 codes, as well as icd10 to icd9 codes, and return a diagnosis dataframe that includes both the original codes and the converted codes in long format
    Input:
        datain: diagnosis code dataframe in long format
        idcolumn: name of the patient/encounter identifier column
        icdcolumn: name of the icd code column
        typecol: name of the output column that indicates whether a code is icd9 or icd10
    Output:
        Dataframe with columns [idcolumn, icdcolumn, typecol]: the de-duplicated original
        codes followed by the converted codes.
        None if datain is not a dataframe or a required column is missing.
    Comments:
        Codes that are not found in the mapping file are printed and kept in the output
        with a missing type
    """
    # Validate the input before loading the (potentially large) mapping file.
    if not isinstance(datain, pd.DataFrame):
        print('datain must be a pandas dataframe, got: ' + str(type(datain)))
        return None
    if not checkcols(datain, idcolumn, icdcolumn):
        return None

    mapdf = getmappingfile()

    # Work on a copy of the two relevant columns so the caller's dataframe is untouched.
    merged = cleanicd(datain.loc[:, [idcolumn, icdcolumn]].copy(), icdcolumn)
    merged = merged.merge(
        mapdf.loc[:, ['source', 'target', 'sourcetype', 'targettype']],
        left_on=icdcolumn,
        right_on='source',
        how='left')

    # Rows without a source type did not match any code in the mapping file.
    for icdcode in merged.loc[merged['sourcetype'].isnull(), icdcolumn]:
        print('Underspecified or unidentifiable diagnosis code: ' + str(icdcode))

    dataout_converted = merged.loc[merged['target'].notnull(),
                                   [idcolumn, 'target', 'targettype']].copy()
    dataout_converted.columns = [idcolumn, icdcolumn, typecol]

    # One input code can fan out to several targets; keep each original code once.
    dataout_original = merged.loc[:, [idcolumn, icdcolumn, 'sourcetype']].drop_duplicates()
    dataout_original.columns = [idcolumn, icdcolumn, typecol]

    return pd.concat([dataout_original, dataout_converted], axis=0)
    
    

    
    
def icdtophenotype(datain,idcolumn=None,icdcolumn=None,featurematrix=False):
    """
    Purpose:
        Label icd codes with Elixhauser comorbidities (Quan et al. 2005)
    Input:
        datain: diagnosis code dataframe in long format
        idcolumn: name of the patient/encounter identifier column
        icdcolumn: name of the icd code column
        featurematrix: False returns the long-format dataframe with an added 'comorbidity'
                       column; True returns one row per id with one indicator column per
                       comorbidity
    Output:
        Long-format or feature-matrix dataframe (see featurematrix), or None when a
        required column is missing
    Comments:
        Matching is by code prefix. A code that matches several comorbidities keeps the
        one processed last.
    """
    definitions = getelixhauser_comorbidities()

    if not checkcols(datain, idcolumn, icdcolumn):
        return None

    labelled = cleanicd(datain.copy(), icdcolumn)
    labelled['comorbidity'] = None

    for como, definition in definitions.items():
        print('Currently processing: '+como+'                                                         ',end="\r",flush=True)

        prefixes = tuple(definition['icd9'] + definition['icd10'])
        hits = labelled[icdcolumn].str.startswith(prefixes, na=False)
        labelled.loc[hits, 'comorbidity'] = como

    if featurematrix:
        # one indicator column per comorbidity, collapsed to one row per id
        indicators = pd.get_dummies(labelled['comorbidity'])
        combined = pd.concat([labelled[idcolumn], indicators], axis=1)
        return combined.groupby(idcolumn).max()

    return labelled
    
    

def comorbidityindex(datain,scorecol='Elixhauser_Comorbidity_Score',scoreonly=False):
    """
    Purpose:
        Return Elixhauser Index Score dataframe
    Input:
        datain: phenotype featurematrix (every column must be a comorbidity name known to
                the Elixhauser definitions), or None
        scorecol: name of the Elixhauser Index Score column in the returned dataframe
        scoreonly: if True, only returns the Elixhauser Index Score, if False, returns the per-comorbidity weighted columns in addition to the Elixhauser Index Score
    Output:
        Elixhauser Index Score dataframe, or None when datain is None
    Comments:
        The input dataframe is not modified
    """
    # The None check has to come before .copy(), otherwise None raises AttributeError.
    if datain is None:
        return None

    dataout=datain.copy()

    elixhauser_comorbidities=getelixhauser_comorbidities()
    for como in list(dataout.columns):
        # Replace the column instead of multiplying in place so boolean indicator
        # columns are cleanly upcast to numeric scores.
        dataout[como]=dataout[como]*elixhauser_comorbidities[como]['score']

    dataout[scorecol]=dataout.sum(axis=1)
    if scoreonly:
        return dataout.loc[:,[scorecol]]

    return dataout
    

def comorbiditypipeline(datain, idcolumn=None, icdcolumn=None,scorecol='Elixhauser_Comorbidity_Score'):
    """
    Purpose:
        Run the full pipeline: icd conversion, comorbidity feature matrix, Elixhauser Index Score
    Input:
        datain: diagnosis code dataframe in long format
        idcolumn: name of the patient/encounter identifier column
        icdcolumn: name of the icd code column
        scorecol: name of the Elixhauser Index Score column in the returned dataframe
    Output:
        Comorbidity feature matrix (one row per id) with the score column appended,
        or None when the input fails validation in an earlier step
    Comments:
        N/A
    """
    convdiagdf=icdconv(datain, idcolumn=idcolumn, icdcolumn=icdcolumn, typecol='ICDtype')
    # icdconv signals invalid input with None; stop here instead of crashing downstream.
    if convdiagdf is None:
        return None

    phenodf=icdtophenotype(convdiagdf,idcolumn=idcolumn,icdcolumn=icdcolumn,featurematrix=True)
    if phenodf is None:
        return None

    comodf=comorbidityindex(phenodf,scorecol=scorecol,scoreonly=True)
    return pd.concat([phenodf,comodf],axis=1)

#NDC to RxNORM to Drug Class
def ndc2rxcui(df_med,col_ndc='ndc'):
    """
    Purpose:
        Look up the RxCUI history of every distinct NDC code through the RxNav REST API
    Input:
        df_med: medication dataframe
        col_ndc: name of the NDC column in df_med
    Output:
        Dataframe with columns ndc, rxcui, start, end (one row per history entry);
        rows with blank or missing values are dropped. The columns exist even when
        nothing could be mapped.
    Comments:
        One HTTP request is sent per distinct NDC, throttled to about 20 requests per second
        Requests time out after 30 seconds
    """
    # Imported locally so the function also works when the module header lacks these imports.
    import time
    import requests

    print('Converting NDC to RXCUI')
    output=[]
    ndclist=df_med[col_ndc].unique()
    lenndc = len(ndclist)
    for i, curndc in enumerate(ndclist, start=1):
        print('{}/{}, {:.2f}% complete'.format(i,lenndc,i/lenndc*100), end='\r', flush=True)
        # Let requests build the query string so the code is URL-encoded properly.
        response=requests.get('https://rxnav.nlm.nih.gov/REST/ndcstatus.json',
                              params={'ndc':str(curndc)},timeout=30)
        r=response.json().get('ndcStatus') or {}
        if 'ndcHistory' in r:
            for entry in r['ndcHistory']:
                output.append({
                    'ndc':curndc,
                    'rxcui':entry['activeRxcui'],
                    # '01' is appended before parsing; presumably the API returns YYYYMM - TODO confirm
                    'start':pd.to_datetime(entry['startDate']+'01'),
                    'end':pd.to_datetime(entry['endDate']+'01'),
                })
        else:
            print('NDC code [{}] was not able to be mapped to rxcui'.format(curndc))
        time.sleep(1/20)
    # Fix the column set so an empty result still has the columns callers index into.
    output=pd.DataFrame(output,columns=['ndc','rxcui','start','end'])
    output=output.replace({r'^\s*$':None}, regex=True).dropna()
    return output
def rxcui2class(df_mapin,getname=True):
    """
    Purpose:
        Look up the drug classes of every distinct RxCUI through the RxNav RxClass REST API
    Input:
        df_mapin: dataframe with an 'rxcui' column
        getname: True collects class names, False collects class ids
    Output:
        Dataframe with one row per mapped rxcui: an 'rxcui' column plus one column per
        '<relaSource>_<classType>' combination holding a set of class names/ids.
        The 'rxcui' column exists even when nothing could be mapped.
    Comments:
        One HTTP request is sent per distinct rxcui, throttled to about 20 requests per second
        Requests time out after 30 seconds
    """
    # Imported locally so the function also works when the module header lacks these imports.
    import time
    import requests

    print('Converting rxcui to drug class')
    rxcuilist=df_mapin['rxcui'].unique()
    lenrxcui=len(rxcuilist)
    output=[]

    identifier='className' if getname else 'classId'

    for i, currxcui in enumerate(rxcuilist, start=1):
        print('{}/{}, {:.2f}% complete'.format(i,lenrxcui,i/lenrxcui*100), end='\r', flush=True)
        # Let requests build the query string so the identifier is URL-encoded properly.
        r=requests.get('https://rxnav.nlm.nih.gov/REST/rxclass/class/byRxcui.json',
                       params={'rxcui':str(currxcui)},timeout=30).json()

        if 'rxclassDrugInfoList' in r:
            tempdict={'rxcui':currxcui}
            for curclass in r['rxclassDrugInfoList']['rxclassDrugInfo']:
                concept=curclass['rxclassMinConceptItem']
                classtype=curclass['relaSource']+'_'+concept['classType']
                tempdict.setdefault(classtype,set()).add(concept[identifier])
            output.append(tempdict)
        else:
            print('rxcui [{}] was not able to be mapped to drug class'.format(currxcui))
        time.sleep(1/20)

    if not output:
        # Keep the join key so a later merge on 'rxcui' still works.
        return pd.DataFrame(columns=['rxcui'])
    return pd.DataFrame(output)
def ndc2class(df_med,col_ndc='ndc',getname=True,indexcol='ROWID',timecol='TIME'):
    """
    Purpose:
        Annotate a medication dataframe with rxcui and drug class information (NDC to RxNorm to drug class)
    Input:
        df_med: medication dataframe
        col_ndc: name of the NDC column
        getname: True collects class names, False collects class ids
        indexcol: name of the column that uniquely identifies a medication row
        timecol: name of the medication timestamp column
    Output:
        df_med with the mapping columns appended, sorted by indexcol
    Comments:
        A row is matched to the rxcui whose [start, end] interval contains its timestamp;
        if several match, the one with the latest end date is kept. Rows without such a
        match fall back to the rxcui with the latest end date for that NDC.
    """
    map1=ndc2rxcui(df_med,col_ndc=col_ndc)
    map2=rxcui2class(map1,getname=getname)

    newmed=df_med.copy()

    #merge maps to go from ndc to rxnorm to class
    fullmap=map1.merge(map2,how='left',on='rxcui')

    #merge medication dataframe to full map
    temp=newmed.merge(fullmap,how='left',left_on=col_ndc,right_on='ndc')

    #filter based on start and end date
    temp=temp.loc[(temp[timecol]>=temp['start']) & (temp[timecol]<=temp['end']),:]

    #if multiple rxcui per ndc, take the one with the later end date
    temp=temp.sort_values('end')
    temp=temp.drop_duplicates(subset=list(df_med.columns),keep='last')

    #if no rxcui exist for ndc given the time range, try to just take the latest rxcui
    latestmap=fullmap.sort_values(by='end').drop_duplicates(subset='ndc',keep='last')
    unmatched=newmed.loc[~newmed[indexcol].isin(temp[indexcol]),:]
    # Join on the caller-supplied NDC column; the previous hard-coded 'NDC' ignored col_ndc.
    fallback=unmatched.merge(latestmap,how='left',left_on=col_ndc,right_on='ndc')

    temp=pd.concat([temp,fallback],axis=0)
    temp=temp.sort_values(by=indexcol)
    return temp

# similar to sklearn.model_selection.cross_val_score but supports multiple scoring functions
def cvmultiscore(model,x,y,scorelist=[sklearn.metrics.accuracy_score,sklearn.metrics.brier_score_loss,sklearn.metrics.f1_score,sklearn.metrics.log_loss,sklearn.metrics.precision_score,sklearn.metrics.recall_score,sklearn.metrics.roc_auc_score],cv=5):
    """
    Purpose:
        Cross-validate a binary classifier with several scoring functions and plot the ROC curve of every fold
    Input:
        model: estimator exposing fit, predict and predict_proba
        x: feature dataframe (indexed positionally with .iloc)
        y: target series (indexed positionally with .iloc)
        scorelist: scoring functions called as score(y_true, y_pred); the default list is never modified
        cv: number of stratified, shuffled folds
    Output:
        Dataframe with one row per fold and one column per scoring function (named by __name__)
    Comments:
        Train ROC curves are drawn as blue dashed lines, test ROC curves as red dashed lines
        The folds are shuffled without a fixed seed, so results vary between runs
    """
    # `import sklearn` alone does not guarantee the model_selection submodule is loaded.
    from sklearn.model_selection import StratifiedKFold

    # NOTE(review): `plt` (matplotlib.pyplot) is not imported in the visible part of this
    # file; it must be provided by the surrounding module/notebook.
    plt.rcParams['figure.figsize']=[5.,5.]
    splitter=StratifiedKFold(n_splits=cv,shuffle=True)
    output=[]
    for train,test in splitter.split(x,y):

        model.fit(x.iloc[train],y.iloc[train])

        trainypred=model.predict_proba(x.iloc[train])[:,1]
        fpr, tpr, _ = sklearn.metrics.roc_curve(y.iloc[train],trainypred)
        plt.plot(fpr,tpr,'b--')

        ytest=y.iloc[test]
        ypred=model.predict_proba(x.iloc[test])[:,1]
        ylabel=None  # hard predictions, computed only if a metric needs them
        tempdict={}
        for score in scorelist:
            try:
                tempdict[score.__name__]=score(ytest, ypred)
            except ValueError:
                # Label-based metrics (accuracy, f1, precision, recall) reject
                # continuous probabilities; score them on hard predictions instead.
                if ylabel is None:
                    ylabel=model.predict(x.iloc[test])
                tempdict[score.__name__]=score(ytest, ylabel)
        fpr, tpr, _ = sklearn.metrics.roc_curve(ytest,ypred)
        plt.plot(fpr,tpr,'r--')

        output.append(tempdict)
    plt.show()
    return pd.DataFrame(output)


# USE CASES

In [281]:
import pandas as pd
import numpy as np
import os
import requests
import time

#load icd 9 to icd 10 mapping file
#https://www.cms.gov/Medicare/Coding/ICD10/2018-ICD-10-CM-and-GEMs.html
def load_icd9to10(path=None):
    """
    Purpose:
        Load the icd9 to icd10 General Equivalence Mapping file
    Input:
        path: location of the GEM file; defaults to '2018_I9gem.txt' in the current working directory
    Output:
        Dataframe with string columns icd9, icd10, flag
    Comments:
        Everything is read as str so leading zeros in codes and flags are preserved
    """
    if path is None:
        path = os.path.join(os.getcwd(),'2018_I9gem.txt')
    # sep=r'\s+' is the documented replacement for the deprecated delim_whitespace=True
    icd9to10 = pd.read_csv(path,sep=r'\s+',header=None,dtype=str)
    icd9to10.columns=['icd9','icd10','flag']
    return icd9to10

#load icd 10 to icd 9 mapping file
#https://www.cms.gov/Medicare/Coding/ICD10/2018-ICD-10-CM-and-GEMs.html
def load_icd10to9(path=None):
    """
    Purpose:
        Load the icd10 to icd9 General Equivalence Mapping file
    Input:
        path: location of the GEM file; defaults to '2018_I10gem.txt' in the current working directory
    Output:
        Dataframe with string columns icd10, icd9, flag
    Comments:
        Everything is read as str so leading zeros in codes and flags are preserved
    """
    if path is None:
        path = os.path.join(os.getcwd(),'2018_I10gem.txt')
    # sep=r'\s+' is the documented replacement for the deprecated delim_whitespace=True
    icd10to9 = pd.read_csv(path,sep=r'\s+',header=None,dtype=str)
    icd10to9.columns=['icd10','icd9','flag']
    return icd10to9

#convert icd 9 to 10, or 10 to 9
#https://www.cms.gov/Medicare/Coding/ICD10/2018-ICD-10-CM-and-GEMs.html
def icdconvert(df,col_icd='icd',icd_version=9):
    """
    Purpose:
        Convert icd9 codes to icd10 (icd_version=9) or icd10 codes to icd9 (icd_version=10)
        using the CMS General Equivalence Mappings
    Input:
        df: diagnosis dataframe
        col_icd: name of the icd code column in df
        icd_version: version of the codes in col_icd, 9 or 10
    Output:
        df left-joined with the mapping: one row per (input row, mapped code) with the
        converted code in a column named 'icd10' or 'icd9'. None for an invalid icd_version.
    Comments:
        A code may map to several codes (row is repeated) or to none (converted code is missing)
    """
    if icd_version==9:
        source,target='icd9','icd10'
        df_gem=load_icd9to10()
    elif icd_version==10:
        source,target='icd10','icd9'
        df_gem=load_icd10to9()
    else:
        print('invalid icd version {}, please set icd_version to 9 or 10'.format(icd_version))
        return None

    print('Comment: because of the discrepancy between icd9 and icd10, diagnosis codes may be mapped to many codes or no codes')

    output=df.merge(df_gem,how='left',left_on=col_icd,right_on=source)
    print('{:.2f}% mapped'.format(output[target].notnull().mean()*100))

    # Drop the helper columns brought in by the mapping file. The source column is only
    # dropped when it is a separate column, i.e. not the caller's own icd column.
    if 'flag' in output.columns:
        output=output.drop('flag',axis=1)
    if col_icd!=source and source in output.columns:
        output=output.drop(source,axis=1)

    return output

# load Elixhauser comorbidities mapping file
#Quan H, Sundararajan V, Halfon P, Fong A, Burnand B, Luthi JC, Saunders LD, Beck CA, Feasby TE, Ghali WA. 
#Coding algorithms for defining comorbidities in ICD-9-CM and ICD-10 administrative data. 
#Medical care. 2005 Nov 1:1130-9.
def loadelixcomo():
    """
    Purpose:
        Load the Elixhauser comorbidity mapping table
    Input:
        None
    Output:
        Content of 'Elixhauser_Comorbidities.csv' (relative to the current working
        directory) without its first column
    """
    raw = pd.read_csv('Elixhauser_Comorbidities.csv')
    # the first column is discarded
    elixcomo = raw.iloc[:,1:]
    return elixcomo

#convert icd 9 or 10 to Elixhauser Comorbidities
def icdtoelixcomo(df,col_icd):
    """
    Purpose:
        Tag each diagnosis row with an Elixhauser comorbidity and its score
    Input:
        df: diagnosis dataframe; it is modified in place (columns 'ElixComo' and
            'ElixComoScore' are created or reset)
        col_icd: name of the icd code column
    Output:
        The same dataframe object, with both columns filled in wherever a code starts
        with one of the comorbidity's codes
    Comments:
        A code that matches several comorbidities keeps the one processed last
    """
    reference = loadelixcomo()
    df['ElixComo']=None
    df['ElixComoScore']=None
    for como in reference['Comorbidity'].unique():
        prefixes = tuple(reference.loc[reference['Comorbidity']==como,'ICD'])
        hits = df[col_icd].str.startswith(prefixes,na=False)
        df.loc[hits,'ElixComo']=como
        # the score of the first listed row of the comorbidity is used
        score = reference.loc[reference.Comorbidity==como,'Score'].values[0]
        df.loc[hits,'ElixComoScore']=score
    return df
  

# score patients based on elixhauser comorbidities
#van Walraven C, Austin PC, Jennings A, Quan H, Forster AJ. 
#A modification of the Elixhauser comorbidity measures into a point system for hospital death using administrative data. 
#Medical care. 2009 Jun 1:626-33.
def elixcomoscore(df,col_icd,col_id):
    """
    Purpose:
        Compute one Elixhauser score per id
    Input:
        df: diagnosis dataframe (tagged in place by icdtoelixcomo)
        col_icd: name of the icd code column
        col_id: name of the patient/encounter identifier column
    Output:
        Dataframe with columns [col_id, 'ElixComoScore']
    Comments:
        Each comorbidity counts once per id; ids without any comorbidity get a score of 0
    """
    tagged = icdtoelixcomo(df,col_icd)
    matched = tagged.loc[tagged['ElixComo'].notnull(),[col_id,'ElixComo','ElixComoScore']]
    # count every (id, comorbidity) pair only once before summing
    per_id = matched.drop_duplicates().groupby(col_id)['ElixComoScore'].sum()
    scores = pd.DataFrame(per_id).reset_index()
    all_ids = df.loc[:,[col_id]].drop_duplicates()
    return scores.merge(all_ids,how='outer',left_on=col_id,right_on=col_id).fillna(0.)
    
    
# load mapping file from icd9 to Chronic Condition Indicator (CCI)
#https://www.hcup-us.ahrq.gov/toolssoftware/chronic/chronic.jsp
def load_cci9(path=None):
    """
    Purpose:
        Load the icd9 to Chronic Condition Indicator (CCI) mapping file
    Input:
        path: location of the CCI csv; defaults to 'cci2015.csv' in the current working directory
    Output:
        Dataframe with the file's columns (the column containing 'CATEGORY DESCRIPTION' is
        renamed to 'CHRONIC') plus 'BODY SYSTEM DESCRIPTION'
    Comments:
        The first line of the file is skipped
        Single quotes around headers and values are removed, blank values become NaN,
        and spaces are removed from the icd codes
    """
    if path is None:
        path = os.path.join(os.getcwd(),'cci2015.csv')
    cci9 = pd.read_csv(path,skiprows=1)
    cci9.columns = [i.strip('\'') for i in cci9.columns]

    for col in cci9.columns:
        cci9[col] = cci9[col].str.strip('\'')
    cci9 = cci9.replace(r'^\s*$', np.nan, regex=True)
    cci9.columns=[i.replace('CATEGORY DESCRIPTION','CHRONIC') for i in cci9.columns]

    dict_bodysystem=[
        ('1' ,'Infectious and parasitic disease'),
        ('2' ,'Neoplasms'),
        ('3' ,'Endocrine, nutritional, and metabolic diseases and immunity disorders'),
        ('4' ,'Diseases of blood and blood-forming organs'),
        ('5' ,'Mental disorders'),
        ('6' ,'Diseases of the nervous system and sense organs'),
        ('7' ,'Diseases of the circulatory system'),
        ('8' ,'Diseases of the respiratory system'),
        ('9' ,'Diseases of the digestive system'),
        ('10','Diseases of the genitourinary system'),
        ('11','Complications of pregnancy, childbirth, and the puerperium'),
        ('12','Diseases of the skin and subcutaneous tissue'),
        ('13','Diseases of the musculoskeletal system'),
        ('14','Congenital anomalies'),
        ('15','Certain conditions originating in the perinatal period'),
        ('16','Symptoms, signs, and ill-defined conditions'),
        ('17','Injury and poisoning'),
        # label corrected to 'health services' so it matches the icd10 loader
        ('18','Factors influencing health status and contact with health services'),
    ]

    cci9 = cci9.merge(pd.DataFrame(dict_bodysystem,columns=['BODY SYSTEM','BODY SYSTEM DESCRIPTION']),how='left',on='BODY SYSTEM')

    cci9['ICD-9-CM CODE'] = cci9['ICD-9-CM CODE'].str.replace(' ','',regex=False)

    return cci9

#convert icd9 to CCI
def icd9tocci(df,col_icd='icd9'):
    """
    Purpose:
        Attach Chronic Condition Indicator (CCI) information to icd9 codes
    Input:
        df: diagnosis dataframe
        col_icd: name of the icd9 code column in df
    Output:
        df left-joined with the CCI table on the icd9 code
    """
    lookup = load_cci9()
    merged = df.merge(lookup,how='left',left_on=col_icd,right_on='ICD-9-CM CODE')
    return merged
    
    
    

# load mapping file from icd10 to Chronic Condition Indicator (CCI)
#https://www.hcup-us.ahrq.gov/toolssoftware/chronic_icd10/chronic_icd10.jsp
def load_cci10():
    """
    Purpose:
        Load the icd10 to Chronic Condition Indicator (CCI) mapping file
    Input:
        None ('cci_icd10cm_2019_1.csv' is read from the current working directory)
    Output:
        Dataframe with the file's columns ('CHRONIC INDICATOR' renamed to 'CHRONIC')
        plus 'BODY SYSTEM DESCRIPTION'
    Comments:
        Single quotes around headers and values are removed, blank values become NaN,
        and spaces are removed from the icd codes
    """
    source = os.path.join(os.getcwd(),'cci_icd10cm_2019_1.csv')
    cci10 = pd.read_csv(source)

    # headers and values are wrapped in single quotes in the raw file
    cci10.columns = [name.strip('\'') for name in cci10.columns]
    for name in cci10.columns:
        cci10.loc[:,name] = cci10[name].str.strip('\'')
    cci10 = cci10.replace(r'^\s*$', np.nan, regex=True)
    cci10 = cci10.rename(columns=lambda name: name.replace('CHRONIC INDICATOR','CHRONIC'))

    bodysystems = {
        '1': 'Infectious and parasitic disease',
        '2': 'Neoplasms',
        '3': 'Endocrine, nutritional, and metabolic diseases and immunity disorders',
        '4': 'Diseases of blood and blood-forming organs',
        '5': 'Mental disorders',
        '6': 'Diseases of the nervous system and sense organs',
        '7': 'Diseases of the circulatory system',
        '8': 'Diseases of the respiratory system',
        '9': 'Diseases of the digestive system',
        '10': 'Diseases of the genitourinary system',
        '11': 'Complications of pregnancy, childbirth, and the puerperium',
        '12': 'Diseases of the skin and subcutaneous tissue',
        '13': 'Diseases of the musculoskeletal system',
        '14': 'Congenital anomalies',
        '15': 'Certain conditions originating in the perinatal period',
        '16': 'Symptoms, signs, and ill-defined conditions',
        '17': 'Injury and poisoning',
        '18': 'Factors influencing health status and contact with health services',
    }
    lookup = pd.DataFrame(list(bodysystems.items()),columns=['BODY SYSTEM','BODY SYSTEM DESCRIPTION'])

    cci10 = cci10.merge(lookup,how='left',on='BODY SYSTEM')

    cci10.loc[:,'ICD-10-CM CODE'] = cci10['ICD-10-CM CODE'].str.replace(' ','')

    return cci10

#convert icd10 to CCI
def icd10tocci(df,col_icd='icd10'):
    """
    Purpose:
        Attach Chronic Condition Indicator (CCI) information to icd10 codes
    Input:
        df: diagnosis dataframe
        col_icd: name of the icd10 code column in df
    Output:
        df left-joined with the CCI table on the icd10 code
    """
    lookup = load_cci10()
    merged = df.merge(lookup,how='left',left_on=col_icd,right_on='ICD-10-CM CODE')
    return merged

# load mapping file from icd9 to Clinical Classification Software (CCS)
#https://www.hcup-us.ahrq.gov/toolssoftware/ccs/ccs.jsp
def load_ccs9():
    """
    Purpose:
        Load the icd9 to Clinical Classification Software (CCS) mapping and attach the category labels
    Input:
        None ('$dxref 2015.csv' and 'dxlabel 2015.csv' are read from the current working directory)
    Output:
        Dataframe built from the first four columns of the reference file, with the
        category label from the label file exposed as 'CCS CATEGORY DESCRIPTION'
    Comments:
        The statement order matters: the header row is only promoted after the quotes have been stripped
    """
    ccs9 = pd.read_csv(os.path.join(os.getcwd(),'$dxref 2015.csv'))
    # Move whatever read_csv placed in the index back into regular columns.
    # NOTE(review): presumably the first line of the file has fewer fields than the data rows - confirm against the file
    ccs9 = ccs9.reset_index()
    # Remove the single quotes that wrap the values of every column.
    for col in ccs9.columns:
        ccs9.loc[:,col]=ccs9[col].str.strip('\'')
    # The first data row holds the real column names: promote it to the header, then drop it.
    ccs9.columns=ccs9.iloc[0,:]
    ccs9 = ccs9.iloc[1:,:]
    # Blank / whitespace-only cells become NaN so rows without a code can be filtered out.
    ccs9 = ccs9.replace(r'^\s*$', np.nan, regex=True)
    ccs9 = ccs9.loc[ccs9['ICD-9-CM CODE'].notnull(),:]
    # Remove the spaces inside codes and categories so they can be used as join keys.
    ccs9.loc[:,'ICD-9-CM CODE'] = ccs9['ICD-9-CM CODE'].str.replace(' ','')
    ccs9.loc[:,'CCS CATEGORY'] = ccs9['CCS CATEGORY'].str.replace(' ','')
    # Only the first four columns are kept.
    ccs9 = ccs9.iloc[:,0:4]    
    ccs9_labels = pd.read_csv(os.path.join(os.getcwd(),'dxlabel 2015.csv'))
    ccs9 = ccs9.merge(ccs9_labels,how='left',left_on='CCS CATEGORY',right_on='CCS DIAGNOSIS CATEGORIES')
    # Replace the reference file's own description with the label from the label file.
    ccs9.drop('CCS CATEGORY DESCRIPTION',axis=1,inplace=True)
    ccs9.drop('CCS DIAGNOSIS CATEGORIES',axis=1,inplace=True)
    ccs9.columns = [i.replace('CCS DIAGNOSIS CATEGORIES LABELS','CCS CATEGORY DESCRIPTION') for i in ccs9.columns]
    return ccs9

#https://www.hcup-us.ahrq.gov/toolssoftware/ccs/ccs.jsp
def icd9toccs(df,col_icd='icd9'):
    """
    Purpose:
        Attach Clinical Classification Software (CCS) categories to icd9 codes
    Input:
        df: diagnosis dataframe
        col_icd: name of the icd9 code column in df
    Output:
        df left-joined with the CCS table; the table's own code column is dropped
        unless it is the caller's icd column
    """
    key = 'ICD-9-CM CODE'
    merged = df.merge(load_ccs9(),how='left',left_on=col_icd,right_on=key)
    if col_icd == key:
        return merged
    merged.drop(key,axis=1,inplace=True)
    return merged

# load mapping file from icd10 to Clinical Classification Software (CCS)
#https://www.hcup-us.ahrq.gov/toolssoftware/ccs10/ccs10.jsp
def load_ccs10():
    """
    Purpose:
        Load the icd10 to Clinical Classification Software (CCS) mapping file
    Input:
        None ('ccs_dx_icd10cm_2019_1.csv' is read from the current working directory)
    Output:
        Dataframe with the first four columns of the file
    Comments:
        Single quotes around headers and values are removed, blank values become NaN,
        and spaces are removed from the icd codes
    """
    source = os.path.join(os.getcwd(),'ccs_dx_icd10cm_2019_1.csv')
    table = pd.read_csv(source)
    table.columns = [name.strip('\'') for name in table.columns]
    for name in table.columns:
        table.loc[:,name] = table[name].str.strip('\'')
    table = table.replace(r'^\s*$', np.nan, regex=True)
    table.loc[:,'ICD-10-CM CODE'] = table['ICD-10-CM CODE'].str.replace(' ','')
    # only the first four columns are returned
    return table.iloc[:,0:4]
    
def icd10toccs(df,col_icd='icd10'):
    """
    Purpose:
        Attach Clinical Classification Software (CCS) categories to icd10 codes
    Input:
        df: diagnosis dataframe
        col_icd: name of the icd10 code column in df
    Output:
        df left-joined with the CCS table; the table's own code column is dropped
        unless it is the caller's icd column
    """
    key = 'ICD-10-CM CODE'
    merged = df.merge(load_ccs10(),how='left',left_on=col_icd,right_on=key)
    if col_icd == key:
        return merged
    merged.drop(key,axis=1,inplace=True)
    return merged

def parsediag(dfin,col_icd,col_id,icd_version):
    """
    Purpose:
        Annotate a diagnosis dataframe with CCS category, chronic indicator and Elixhauser comorbidity
    Input:
        dfin: diagnosis dataframe (not modified)
        col_icd: name of the icd code column
        col_id: name of the patient/encounter identifier column (not used by the body)
        icd_version: version of the codes in col_icd, 9 or 10
    Output:
        Copy of dfin with the columns 'CCS CATEGORY', 'CCS CATEGORY DESCRIPTION',
        'CHRONIC', 'ElixComo' and 'ElixComoScore' appended; None for an invalid icd_version
    """
    annotated = dfin.copy()

    if icd_version not in (9, 10):
        print('ERROR: Please set icd_version to 9 or 10')
        return None

    # pick the converters that belong to the icd version
    if icd_version==9:
        toccs, tocci = icd9toccs, icd9tocci
    else:
        toccs, tocci = icd10toccs, icd10tocci

    annotated = toccs(annotated,col_icd=col_icd)
    annotated = tocci(annotated,col_icd=col_icd)
    annotated = icdtoelixcomo(annotated,col_icd=col_icd)

    added = ['CCS CATEGORY','CCS CATEGORY DESCRIPTION','CHRONIC','ElixComo','ElixComoScore']
    return annotated.loc[:,list(dfin.columns)+added]

# onehotifying columns
def onehotify(df,col_id,col_val):
    """
    Purpose:
        Turn a long-format (id, value) dataframe into a one-hot feature matrix
    Input:
        df: long-format dataframe
        col_id: name of the identifier column
        col_val: name of the column whose distinct values become indicator columns
    Output:
        Dataframe indexed by col_id with one indicator column per distinct value; an
        indicator is set when the id has at least one row with that value
    """
    ids = df.loc[:,[col_id]]
    indicators = pd.get_dummies(df[col_val])
    combined = pd.concat([ids,indicators],axis=1)
    return combined.groupby(col_id).max()

def ndc2rxcui(df_med,col_ndc='ndc'):
    """
    Purpose:
        Look up the RxCUI history of every distinct NDC code through the RxNav REST API
    Input:
        df_med: medication dataframe
        col_ndc: name of the NDC column in df_med
    Output:
        Dataframe with columns ndc, rxcui, start, end (one row per history entry);
        rows with blank or missing values are dropped. The columns exist even when
        nothing could be mapped.
    Comments:
        One HTTP request is sent per distinct NDC, throttled to about 20 requests per second
        Requests time out after 30 seconds
    """
    # Imported locally so the function is self-contained.
    import time
    import requests

    print('Converting NDC to RXCUI')
    output=[]
    ndclist=df_med[col_ndc].unique()
    lenndc = len(ndclist)
    for i, curndc in enumerate(ndclist, start=1):
        print('{}/{}, {:.2f}% complete'.format(i,lenndc,i/lenndc*100), end='\r', flush=True)
        # Let requests build the query string so the code is URL-encoded properly.
        response=requests.get('https://rxnav.nlm.nih.gov/REST/ndcstatus.json',
                              params={'ndc':str(curndc)},timeout=30)
        r=response.json().get('ndcStatus') or {}
        if 'ndcHistory' in r:
            for entry in r['ndcHistory']:
                output.append({
                    'ndc':curndc,
                    'rxcui':entry['activeRxcui'],
                    # '01' is appended before parsing; presumably the API returns YYYYMM - TODO confirm
                    'start':pd.to_datetime(entry['startDate']+'01'),
                    'end':pd.to_datetime(entry['endDate']+'01'),
                })
        else:
            print('NDC code [{}] was not able to be mapped to rxcui'.format(curndc))
        time.sleep(1/20)
    # Fix the column set so an empty result still has the columns callers index into.
    output=pd.DataFrame(output,columns=['ndc','rxcui','start','end'])
    output=output.replace({r'^\s*$':None}, regex=True).dropna()
    return output
def rxcui2class(df_mapin,getname=True):
    """
    Purpose:
        Look up the drug classes of every distinct RxCUI through the RxNav RxClass REST API
    Input:
        df_mapin: dataframe with an 'rxcui' column
        getname: True collects class names, False collects class ids
    Output:
        Dataframe with one row per mapped rxcui: an 'rxcui' column plus one column per
        '<relaSource>_<classType>' combination holding a set of class names/ids.
        The 'rxcui' column exists even when nothing could be mapped.
    Comments:
        One HTTP request is sent per distinct rxcui, throttled to about 20 requests per second
        Requests time out after 30 seconds
    """
    # Imported locally so the function is self-contained.
    import time
    import requests

    print('Converting rxcui to drug class')
    rxcuilist=df_mapin['rxcui'].unique()
    lenrxcui=len(rxcuilist)
    output=[]

    identifier='className' if getname else 'classId'

    for i, currxcui in enumerate(rxcuilist, start=1):
        print('{}/{}, {:.2f}% complete'.format(i,lenrxcui,i/lenrxcui*100), end='\r', flush=True)
        # Let requests build the query string so the identifier is URL-encoded properly.
        r=requests.get('https://rxnav.nlm.nih.gov/REST/rxclass/class/byRxcui.json',
                       params={'rxcui':str(currxcui)},timeout=30).json()

        if 'rxclassDrugInfoList' in r:
            tempdict={'rxcui':currxcui}
            for curclass in r['rxclassDrugInfoList']['rxclassDrugInfo']:
                concept=curclass['rxclassMinConceptItem']
                classtype=curclass['relaSource']+'_'+concept['classType']
                tempdict.setdefault(classtype,set()).add(concept[identifier])
            output.append(tempdict)
        else:
            print('rxcui [{}] was not able to be mapped to drug class'.format(currxcui))
        time.sleep(1/20)

    if not output:
        # Keep the join key so a later merge on 'rxcui' still works.
        return pd.DataFrame(columns=['rxcui'])
    return pd.DataFrame(output)
def ndc2class(df_med,col_ndc='ndc',getname=True,indexcol='ROWID',timecol='TIME'):
    """
    Purpose:
        Annotate a medication dataframe with rxcui and drug class information (NDC to RxNorm to drug class)
    Input:
        df_med: medication dataframe
        col_ndc: name of the NDC column
        getname: True collects class names, False collects class ids
        indexcol: name of the column that uniquely identifies a medication row
        timecol: name of the medication timestamp column
    Output:
        df_med with the mapping columns appended, sorted by indexcol
    Comments:
        A row is matched to the rxcui whose [start, end] interval contains its timestamp;
        if several match, the one with the latest end date is kept. Rows without such a
        match fall back to the rxcui with the latest end date for that NDC.
    """
    map1=ndc2rxcui(df_med,col_ndc=col_ndc)
    map2=rxcui2class(map1,getname=getname)

    newmed=df_med.copy()

    #merge maps to go from ndc to rxnorm to class
    fullmap=map1.merge(map2,how='left',on='rxcui')

    #merge medication dataframe to full map
    temp=newmed.merge(fullmap,how='left',left_on=col_ndc,right_on='ndc')

    #filter based on start and end date
    temp=temp.loc[(temp[timecol]>=temp['start']) & (temp[timecol]<=temp['end']),:]

    #if multiple rxcui per ndc, take the one with the later end date
    temp=temp.sort_values('end')
    temp=temp.drop_duplicates(subset=list(df_med.columns),keep='last')

    #if no rxcui exist for ndc given the time range, try to just take the latest rxcui
    latestmap=fullmap.sort_values(by='end').drop_duplicates(subset='ndc',keep='last')
    unmatched=newmed.loc[~newmed[indexcol].isin(temp[indexcol]),:]
    # Join on the caller-supplied NDC column; the previous hard-coded 'NDC' ignored col_ndc.
    fallback=unmatched.merge(latestmap,how='left',left_on=col_ndc,right_on='ndc')

    temp=pd.concat([temp,fallback],axis=0)
    temp=temp.sort_values(by=indexcol)
    return temp

In [279]:
#helper function for loading diagdf

def loaddiagdf(path=r'C:\Users\Sean Yu\Documents\WUSTL\i2bmi\TESTING\lstm\mimictest\mixicdtest.csv'):
    """
    Purpose:
        Load the mixed icd9/icd10 test diagnosis file and normalize its version and code columns
    Input:
        path: location of the csv; defaults to the original hard-coded test file
    Output:
        Dataframe without the file's first column, where
            ICDX_VERSION_NO has a trailing '-CM' removed
            ICDX_DIAGNOSIS_CODE has periods removed and surrounding spaces stripped
    Comments:
        Both columns are read as str so purely numeric codes keep their leading zeros
    """
    diagdf = pd.read_csv(path,dtype={'ICDX_VERSION_NO':str,'ICDX_DIAGNOSIS_CODE':str}).iloc[:,1:]
    # Remove the '-CM' suffix only; str.strip('-CM') would strip any of the characters
    # '-', 'C', 'M' from BOTH ends of the value.
    diagdf['ICDX_VERSION_NO'] = diagdf['ICDX_VERSION_NO'].str.replace(r'-CM$','',regex=True)
    # regex=False makes the period a literal regardless of the pandas version
    diagdf['ICDX_DIAGNOSIS_CODE'] = diagdf['ICDX_DIAGNOSIS_CODE'].str.replace('.','',regex=False).str.strip(' ')
    return diagdf


In [280]:
# ICD 9 to 10

#https://www.cms.gov/Medicare/Coding/ICD10/2018-ICD-10-CM-and-GEMs.html
def load_icd9to10(path=None):
    """
    Purpose:
        Load the icd9 to icd10 General Equivalence Mapping file
    Input:
        path: location of the GEM file; defaults to '2018_I9gem.txt' in the current working directory
    Output:
        Dataframe with string columns icd9, icd10, flag
    Comments:
        Everything is read as str so leading zeros in codes and flags are preserved
    """
    if path is None:
        path = os.path.join(os.getcwd(),'2018_I9gem.txt')
    # sep=r'\s+' is the documented replacement for the deprecated delim_whitespace=True
    icd9to10 = pd.read_csv(path,sep=r'\s+',header=None,dtype=str)
    icd9to10.columns=['icd9','icd10','flag']
    return icd9to10

#https://www.cms.gov/Medicare/Coding/ICD10/2018-ICD-10-CM-and-GEMs.html
def load_icd10to9(path=None):
    """
    Purpose:
        Load the icd10 to icd9 General Equivalence Mapping file
    Input:
        path: location of the GEM file; defaults to '2018_I10gem.txt' in the current working directory
    Output:
        Dataframe with string columns icd10, icd9, flag
    Comments:
        Everything is read as str so leading zeros in codes and flags are preserved
    """
    if path is None:
        path = os.path.join(os.getcwd(),'2018_I10gem.txt')
    # sep=r'\s+' is the documented replacement for the deprecated delim_whitespace=True
    icd10to9 = pd.read_csv(path,sep=r'\s+',header=None,dtype=str)
    icd10to9.columns=['icd10','icd9','flag']
    return icd10to9

#https://www.cms.gov/Medicare/Coding/ICD10/2018-ICD-10-CM-and-GEMs.html
def icdconvert(df,col_icd='icd',icd_version=9):
    """
    Purpose:
        Convert icd9 codes to icd10 (icd_version=9) or icd10 codes to icd9 (icd_version=10)
        using the CMS General Equivalence Mappings
    Input:
        df: diagnosis dataframe
        col_icd: name of the icd code column in df
        icd_version: version of the codes in col_icd, 9 or 10
    Output:
        df left-joined with the mapping: one row per (input row, mapped code) with the
        converted code in a column named 'icd10' or 'icd9'. None for an invalid icd_version.
    Comments:
        A code may map to several codes (row is repeated) or to none (converted code is missing)
    """
    if icd_version==9:
        source,target='icd9','icd10'
        df_gem=load_icd9to10()
    elif icd_version==10:
        source,target='icd10','icd9'
        df_gem=load_icd10to9()
    else:
        print('invalid icd version {}, please set icd_version to 9 or 10'.format(icd_version))
        return None

    print('Comment: because of the discrepancy between icd9 and icd10, diagnosis codes may be mapped to many codes or no codes')

    output=df.merge(df_gem,how='left',left_on=col_icd,right_on=source)
    print('{:.2f}% mapped'.format(output[target].notnull().mean()*100))

    # Drop the helper columns brought in by the mapping file. The source column is only
    # dropped when it is a separate column, i.e. not the caller's own icd column.
    if 'flag' in output.columns:
        output=output.drop('flag',axis=1)
    if col_icd!=source and source in output.columns:
        output=output.drop(source,axis=1)

    return output


#diagdf = pd.read_csv(r'C:\Users\Sean Yu\Documents\WUSTL\i2bmi\TESTING\lstm\mimictest\DIAGNOSES_ICD.csv')
#icdconvert(diagdf,col_icd='ICD9_CODE',col_pid='HADM_ID').head()

# Load the mixed icd9/icd10 test diagnoses.
diagdf = loaddiagdf()

# Convert each icd version separately (rows are selected by ICDX_VERSION_NO) and show the first rows of each result.
display(icdconvert(diagdf.loc[diagdf.ICDX_VERSION_NO=='9'],col_icd = 'ICDX_DIAGNOSIS_CODE',icd_version=9).head())
display(icdconvert(diagdf.loc[diagdf.ICDX_VERSION_NO=='10'],col_icd = 'ICDX_DIAGNOSIS_CODE',icd_version=10).head())

Comment: because of the discrepancy between icd9 and icd10, diagnosis codes may be mapped to many codes or no codes
97.70% mapped


Unnamed: 0,ICDX_VERSION_NO,ICDX_DIAGNOSIS_CODE,DESCRIPTION,icd10
0,9,36911,1 EYE-SEV/OTH-BLIND NOS,H5410
1,9,94810,10-19% BDY BRN/3 DEG NOS,T3110
2,9,94810,10-19% BDY BRN/3 DEG NOS,T3210
3,9,94214,1ST DEG BURN BACK,T2113XA
4,9,94214,1ST DEG BURN BACK,T2153XA


Comment: because of the discrepancy between icd9 and icd10, diagnosis codes may be mapped to many codes or no codes
99.03% mapped


Unnamed: 0,ICDX_VERSION_NO,ICDX_DIAGNOSIS_CODE,DESCRIPTION,icd9
0,10,G912,(Idiopathic) normal pressure hydrocephalus,3315
1,10,Z3A10,10 weeks gestation of pregnancy,NoDx
2,10,Z3A11,11 weeks gestation of pregnancy,NoDx
3,10,Z3A12,12 weeks gestation of pregnancy,NoDx
4,10,Z3A13,13 weeks gestation of pregnancy,NoDx


In [422]:
# Elixhauser Comorbidities from ICD 9 or 10
def loadelixcomo():
    """
    Purpose:
        Load the Elixhauser comorbidity mapping table
    Input:
        None
    Output:
        Content of 'Elixhauser_Comorbidities.csv' (relative to the current working
        directory) without its first column
    """
    raw = pd.read_csv('Elixhauser_Comorbidities.csv')
    # the first column is discarded
    elixcomo = raw.iloc[:,1:]
    return elixcomo

def icdtoelixcomo(df,col_icd):
    """
    Purpose:
        Tag each diagnosis row with an Elixhauser comorbidity and its score
    Input:
        df: diagnosis dataframe; it is modified in place (columns 'ElixComo' and
            'ElixComoScore' are created or reset)
        col_icd: name of the icd code column
    Output:
        The same dataframe object, with both columns filled in wherever a code starts
        with one of the comorbidity's codes
    Comments:
        A code that matches several comorbidities keeps the one processed last
    """
    reference = loadelixcomo()
    df['ElixComo']=None
    df['ElixComoScore']=None
    for como in reference['Comorbidity'].unique():
        prefixes = tuple(reference.loc[reference['Comorbidity']==como,'ICD'])
        hits = df[col_icd].str.startswith(prefixes,na=False)
        df.loc[hits,'ElixComo']=como
        # the score of the first listed row of the comorbidity is used
        score = reference.loc[reference.Comorbidity==como,'Score'].values[0]
        df.loc[hits,'ElixComoScore']=score
    return df
  

def elixcomoscore(df,col_icd,col_id):
    """
    Purpose: 
        Sum Elixhauser comorbidity scores per identifier
    Input:
        df: Dataframe holding one diagnosis code per row
        col_icd: Name of the column holding the icd code
        col_id: Name of the column holding the identifier to aggregate on
    Output:
        Dataframe with one row per identifier and its summed 'ElixComoScore'
    Comments:
        Each (identifier, comorbidity, score) combination is counted once
        Identifiers without any matched comorbidity are kept with a score of 0
    """
    labelled = icdtoelixcomo(df,col_icd)

    # Keep matched rows only, reduced to distinct (id, comorbidity, score) triples
    matched = labelled['ElixComo'].notnull()
    distinct = labelled.loc[matched,[col_id,'ElixComo','ElixComoScore']].drop_duplicates()

    totals = distinct.groupby(col_id)['ElixComoScore'].sum().to_frame().reset_index()

    # Outer-merge against every identifier so unmatched ones appear, then zero-fill
    every_id = df.loc[:,[col_id]].drop_duplicates()
    return totals.merge(every_id,how='outer',on=col_id).fillna(0.)
    
# Demo: the CSV locations below are absolute paths on the author's machine
# and must be changed to run this cell elsewhere.
diagdf = pd.read_csv(r'C:\Users\Sean Yu\Documents\WUSTL\i2bmi\TESTING\lstm\mimictest\DIAGNOSES_ICD.csv')
d_diagdf = pd.read_csv(r'C:\Users\Sean Yu\Documents\WUSTL\i2bmi\TESTING\lstm\mimictest\D_ICD_DIAGNOSES.csv')
# Attach the code dictionary columns to each diagnosis row (ROW_ID is dropped from both sides first)
diagdf = diagdf.drop('ROW_ID',axis=1).merge(d_diagdf.drop('ROW_ID',axis=1),how='left',on='ICD9_CODE')

# Preview the per-admission Elixhauser score
elixcomoscore(diagdf,'ICD9_CODE','HADM_ID').head()
#diagdf = icdtoelixcomo(diagdf,'ICD9_CODE')
#diagdf = diagdf.loc[diagdf.ElixComo.notnull(),:]

#onehotify(diagdf,col_id='HADM_ID',col_val='ElixComo')
#diagdf.loc[:,'ElixComoScore'] = pd.to_numeric(diagdf['ElixComoScore'])
#diagdf.pivot_table(index='HADM_ID',columns='ElixComo',values='ElixComoScore').fillna(0)

#diagdf = loaddiagdf()
#diagdf = diagdf.loc[diagdf['ICDX_VERSION_NO']=='10',:]
#diagdf = icdtoelixcomo(diagdf,col_icd='ICDX_DIAGNOSIS_CODE')

Unnamed: 0,HADM_ID,ElixComoScore
0,100375,12.0
1,100969,10.0
2,101361,20.0
3,102203,22.0
4,103379,16.0


In [444]:
# Chronic Condition Indicator (CCI) from ICD 9

#https://www.hcup-us.ahrq.gov/toolssoftware/chronic/chronic.jsp
def load_cci9():
    """
    Purpose: 
        Load the ICD-9-CM Chronic Condition Indicator (CCI) table
    Input:
        None
    Output:
        Dataframe of the cleaned CCI file plus a 'BODY SYSTEM DESCRIPTION' column
    Comments:
        Reads 'cci2015.csv' from the current working directory, skipping its first line
        Single quotes are stripped from column names and values; blank values become NaN
        The 'CATEGORY DESCRIPTION' column is renamed to 'CHRONIC'
        Spaces are removed from 'ICD-9-CM CODE'
        Body system labels are identical to the ones used by load_cci10 so both versions can be combined
    """
    cci9 = pd.read_csv(os.path.join(os.getcwd(),'cci2015.csv'),skiprows=1)
    cci9.columns = [i.strip('\'') for i in cci9.columns]
    
    for col in cci9.columns:
        cci9.loc[:,col] = cci9[col].str.strip('\'')
    cci9 = cci9.replace(r'^\s*$', np.nan, regex=True)
    cci9.columns=[i.replace('CATEGORY DESCRIPTION','CHRONIC') for i in cci9.columns]

    # Body system number (kept as a string to match the file's values) -> label
    dict_bodysystem=[
        ('1' ,'Infectious and parasitic disease'),
        ('2' ,'Neoplasms'),
        ('3' ,'Endocrine, nutritional, and metabolic diseases and immunity disorders'),
        ('4' ,'Diseases of blood and blood-forming organs'),
        ('5' ,'Mental disorders'),
        ('6' ,'Diseases of the nervous system and sense organs'),
        ('7' ,'Diseases of the circulatory system'),
        ('8' ,'Diseases of the respiratory system'),
        ('9' ,'Diseases of the digestive system'),
        ('10','Diseases of the genitourinary system'),
        ('11','Complications of pregnancy, childbirth, and the puerperium'),
        ('12','Diseases of the skin and subcutaneous tissue'),
        ('13','Diseases of the musculoskeletal system'),
        ('14','Congenital anomalies'),
        ('15','Certain conditions originating in the perinatal period'),
        ('16','Symptoms, signs, and ill-defined conditions'),
        ('17','Injury and poisoning'),
        # Was 'health service'; corrected so the label matches the ICD-10 table
        ('18','Factors influencing health status and contact with health services'),
    ]
    
    cci9 = cci9.merge(pd.DataFrame(dict_bodysystem,columns=['BODY SYSTEM','BODY SYSTEM DESCRIPTION']),how='left',on='BODY SYSTEM')
    
    cci9.loc[:,'ICD-9-CM CODE'] = cci9['ICD-9-CM CODE'].str.replace(' ','')
    
    return cci9

def icd9tocci(df,col_icd='icd9'):
    """
    Purpose: 
        Attach Chronic Condition Indicator (CCI) columns to ICD-9 diagnosis rows
    Input:
        df: Dataframe holding one icd9 code per row
        col_icd: Name of the column in df that holds the icd9 code
    Output:
        df left-merged with the table returned by load_cci9
    Comments:
        Rows whose code is absent from the CCI table are kept with missing CCI values
    """
    lookup = load_cci9()
    return pd.merge(df,lookup,how='left',left_on=col_icd,right_on='ICD-9-CM CODE')

# Demo: the CSV locations below are absolute paths on the author's machine
# and must be changed to run this cell elsewhere.
diagdf = pd.read_csv(r'C:\Users\Sean Yu\Documents\WUSTL\i2bmi\TESTING\lstm\mimictest\DIAGNOSES_ICD.csv')
d_diagdf = pd.read_csv(r'C:\Users\Sean Yu\Documents\WUSTL\i2bmi\TESTING\lstm\mimictest\D_ICD_DIAGNOSES.csv')
diagdf = diagdf.drop('ROW_ID',axis=1).merge(d_diagdf.drop('ROW_ID',axis=1),how='left',on='ICD9_CODE')

# The converter defined in this cell is icd9tocci; the previous call used the
# name icdtocci9, which is not defined here and raises NameError on a fresh run.
icd9tocci(diagdf,col_icd='ICD9_CODE').head()

Unnamed: 0,SUBJECT_ID,HADM_ID,SEQ_NUM,ICD9_CODE,SHORT_TITLE,LONG_TITLE,ICD-9-CM CODE,ICD-9-CM CODE DESCRIPTION,CHRONIC,BODY SYSTEM,BODY SYSTEM DESCRIPTION
0,10032,140372,3,5070,Food/vomit pneumonitis,Pneumonitis due to inhalation of food or vomitus,5070,FOOD/VOMIT PNEUMONITIS,0,8,Diseases of the respiratory system
1,10032,140372,4,42830,Diastolc hrt failure NOS,"Diastolic heart failure, unspecified",42830,UNSPECIFIED DIASTOLIC HEART FAILURE (Begin 2002),1,7,Diseases of the circulatory system
2,10032,140372,5,4280,CHF NOS,"Congestive heart failure, unspecified",4280,CONGESTIVE HEART FAILURE,1,7,Diseases of the circulatory system
3,10032,140372,6,2851,Ac posthemorrhag anemia,Acute posthemorrhagic anemia,2851,AC POSTHEMORRHAG ANEMIA,0,4,Diseases of blood and blood-forming organs
4,10032,140372,7,2765,,,2765,HYPOVOLEMIA (End 2005),0,3,"Endocrine, nutritional, and metabolic diseases..."


In [442]:
# Chronic Condition Indicator (CCI) from ICD 10

#https://www.hcup-us.ahrq.gov/toolssoftware/chronic_icd10/chronic_icd10.jsp
def load_cci10():
    """
    Purpose: 
        Load the ICD-10-CM Chronic Condition Indicator (CCI) table
    Input:
        None
    Output:
        Dataframe of the cleaned CCI file plus a 'BODY SYSTEM DESCRIPTION' column
    Comments:
        Reads 'cci_icd10cm_2019_1.csv' from the current working directory
        Single quotes are stripped from column names and values; blank values become NaN
        The 'CHRONIC INDICATOR' column is renamed to 'CHRONIC'
        Spaces are removed from 'ICD-10-CM CODE'
    """
    csv_path = os.path.join(os.getcwd(),'cci_icd10cm_2019_1.csv')
    table = pd.read_csv(csv_path)

    # Names and values are wrapped in single quotes in the file
    table.columns = [name.strip('\'') for name in table.columns]
    for name in table.columns:
        table[name] = table[name].str.strip('\'')

    table = table.replace(r'^\s*$', np.nan, regex=True)
    table.columns = [name.replace('CHRONIC INDICATOR','CHRONIC') for name in table.columns]

    # Labels listed in body system order; system numbers are the strings '1'..'18'
    labels = [
        'Infectious and parasitic disease',
        'Neoplasms',
        'Endocrine, nutritional, and metabolic diseases and immunity disorders',
        'Diseases of blood and blood-forming organs',
        'Mental disorders',
        'Diseases of the nervous system and sense organs',
        'Diseases of the circulatory system',
        'Diseases of the respiratory system',
        'Diseases of the digestive system',
        'Diseases of the genitourinary system',
        'Complications of pregnancy, childbirth, and the puerperium',
        'Diseases of the skin and subcutaneous tissue',
        'Diseases of the musculoskeletal system',
        'Congenital anomalies',
        'Certain conditions originating in the perinatal period',
        'Symptoms, signs, and ill-defined conditions',
        'Injury and poisoning',
        'Factors influencing health status and contact with health services',
    ]
    body_systems = pd.DataFrame(
        [(str(number),label) for number,label in enumerate(labels,start=1)],
        columns=['BODY SYSTEM','BODY SYSTEM DESCRIPTION'],
    )

    table = table.merge(body_systems,how='left',on='BODY SYSTEM')
    table['ICD-10-CM CODE'] = table['ICD-10-CM CODE'].str.replace(' ','')
    return table

def icd10tocci(df,col_icd='icd10'):
    """
    Purpose: 
        Attach Chronic Condition Indicator (CCI) columns to ICD-10 diagnosis rows
    Input:
        df: Dataframe holding one icd10 code per row
        col_icd: Name of the column in df that holds the icd10 code
    Output:
        df left-merged with the table returned by load_cci10
    Comments:
        Rows whose code is absent from the CCI table are kept with missing CCI values
    """
    lookup = load_cci10()
    return pd.merge(df,lookup,how='left',left_on=col_icd,right_on='ICD-10-CM CODE')



# Demo: keep only the rows flagged as ICD-10 (the version flag is compared as the string '10')
# and preview the CCI columns attached by icd10tocci.
diagdf=loaddiagdf()
diagdf=diagdf.loc[diagdf['ICDX_VERSION_NO']=='10',:]
#load_cci10()
#diagdf.merge(load_cci10(),how='left',left_on='ICDX_DIAGNOSIS_CODE',right_on='ICD-10-CM CODE').head()
icd10tocci(diagdf,col_icd='ICDX_DIAGNOSIS_CODE').head()

Unnamed: 0,ICDX_VERSION_NO,ICDX_DIAGNOSIS_CODE,DESCRIPTION,ICD-10-CM CODE,ICD-10-CM CODE DESCRIPTION,CHRONIC,BODY SYSTEM,BODY SYSTEM DESCRIPTION
0,10,G912,(Idiopathic) normal pressure hydrocephalus,G912,(Idiopathic) normal pressure hydrocephalus,1,6,Diseases of the nervous system and sense organs
1,10,Z3A10,10 weeks gestation of pregnancy,Z3A10,10 weeks gestation of pregnancy,0,18,Factors influencing health status and contact ...
2,10,Z3A11,11 weeks gestation of pregnancy,Z3A11,11 weeks gestation of pregnancy,0,18,Factors influencing health status and contact ...
3,10,Z3A12,12 weeks gestation of pregnancy,Z3A12,12 weeks gestation of pregnancy,0,18,Factors influencing health status and contact ...
4,10,Z3A13,13 weeks gestation of pregnancy,Z3A13,13 weeks gestation of pregnancy,0,18,Factors influencing health status and contact ...


In [343]:
# Clinical Classification Software (CCS) from ICD 9

#https://www.hcup-us.ahrq.gov/toolssoftware/ccs/ccs.jsp
def load_ccs9():
    """
    Purpose: 
        Load the ICD-9-CM to Clinical Classification Software (CCS) lookup table with category labels
    Input:
        None
    Output:
        Dataframe built from the first four columns of the reference file, with the category label
        from the label file exposed as 'CCS CATEGORY DESCRIPTION'
    Comments:
        Reads '$dxref 2015.csv' and 'dxlabel 2015.csv' from the current working directory
        Rows without an 'ICD-9-CM CODE' are dropped
        Spaces are removed from 'ICD-9-CM CODE' and 'CCS CATEGORY'
    """
    ccs9 = pd.read_csv(os.path.join(os.getcwd(),'$dxref 2015.csv'))
    # Move whatever read_csv placed in the index back into ordinary columns
    # NOTE(review): presumably the file starts with a title line, so the real header ends up in the first data row - confirm against the file
    ccs9 = ccs9.reset_index()
    # Values are wrapped in single quotes in the file
    for col in ccs9.columns:
        ccs9.loc[:,col]=ccs9[col].str.strip('\'')
    # Promote the first row to column names, then drop it from the data
    ccs9.columns=ccs9.iloc[0,:]
    ccs9 = ccs9.iloc[1:,:]
    # Blank / whitespace-only cells become NaN
    ccs9 = ccs9.replace(r'^\s*$', np.nan, regex=True)
    ccs9 = ccs9.loc[ccs9['ICD-9-CM CODE'].notnull(),:]
    ccs9.loc[:,'ICD-9-CM CODE'] = ccs9['ICD-9-CM CODE'].str.replace(' ','')
    ccs9.loc[:,'CCS CATEGORY'] = ccs9['CCS CATEGORY'].str.replace(' ','')
    # Only the first four columns are kept
    ccs9 = ccs9.iloc[:,0:4]    
    ccs9_labels = pd.read_csv(os.path.join(os.getcwd(),'dxlabel 2015.csv'))
    ccs9 = ccs9.merge(ccs9_labels,how='left',left_on='CCS CATEGORY',right_on='CCS DIAGNOSIS CATEGORIES')
    # Replace the reference file's own description with the label file's text:
    # drop the former and the duplicate join key, then rename the label column
    ccs9.drop('CCS CATEGORY DESCRIPTION',axis=1,inplace=True)
    ccs9.drop('CCS DIAGNOSIS CATEGORIES',axis=1,inplace=True)
    ccs9.columns = [i.replace('CCS DIAGNOSIS CATEGORIES LABELS','CCS CATEGORY DESCRIPTION') for i in ccs9.columns]
    return ccs9

#https://www.hcup-us.ahrq.gov/toolssoftware/ccs/ccs.jsp
def icd9toccs(df,col_icd='icd9'):
    """
    Purpose: 
        Attach Clinical Classification Software (CCS) columns to ICD-9 diagnosis rows
    Input:
        df: Dataframe holding one icd9 code per row
        col_icd: Name of the column in df that holds the icd9 code
    Output:
        df left-merged with the table returned by load_ccs9
    Comments:
        The lookup's own 'ICD-9-CM CODE' column is removed unless it is the column that was merged on
    """
    lookup_key = 'ICD-9-CM CODE'
    merged = pd.merge(df,load_ccs9(),how='left',left_on=col_icd,right_on=lookup_key)
    if col_icd==lookup_key:
        return merged
    # The lookup key only duplicates col_icd at this point
    return merged.drop(lookup_key,axis=1)

def onehotify(df,col_id,col_val):
    """
    Purpose: 
        Build one indicator column per distinct value, aggregated per identifier
    Input:
        df: Dataframe with one (identifier, value) pair per row
        col_id: Name of the identifier column to group on
        col_val: Name of the column whose distinct values become indicator columns
    Output:
        Dataframe indexed by col_id with one column per distinct value of col_val
    Comments:
        The per-identifier maximum is taken, so an indicator is set when any row of that identifier carries the value
    """
    identifiers = df.loc[:,[col_id]]
    indicators = pd.get_dummies(df[col_val])
    combined = pd.concat([identifiers,indicators],axis=1)
    return combined.groupby(col_id).max()

# Demo: the CSV location below is an absolute path on the author's machine.
diagdf = pd.read_csv(r'C:\Users\Sean Yu\Documents\WUSTL\i2bmi\TESTING\lstm\mimictest\DIAGNOSES_ICD.csv')
#onehotify(icd9toccs(diagdf,col_icd='ICD9_CODE'),'HADM_ID','CCS CATEGORY DESCRIPTION').mean().sort_values(ascending=False)
# Preview the row-level CCS mapping, then the per-admission indicator matrix
# (icd9toccs is called twice, so the lookup files are read twice)
display(icd9toccs(diagdf,col_icd='ICD9_CODE').head())
display(onehotify(icd9toccs(diagdf,col_icd='ICD9_CODE'),col_id='HADM_ID',col_val='CCS CATEGORY DESCRIPTION').head())

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,SEQ_NUM,ICD9_CODE,CCS CATEGORY,ICD-9-CM CODE DESCRIPTION,CCS CATEGORY DESCRIPTION
0,112565,10032,140372,3,5070,129,FOOD/VOMIT PNEUMONITIS-,Aspiration pneumonitis; food/vomitus
1,112566,10032,140372,4,42830,108,UNSPECIFIED DIASTOLIC HEART FAILURE (Begin 2002),Congestive heart failure; nonhypertensive
2,112567,10032,140372,5,4280,108,CONGESTIVE HEART FAILURE,Congestive heart failure; nonhypertensive
3,112568,10032,140372,6,2851,60,AC POSTHEMORRHAG ANEMIA-,Acute posthemorrhagic anemia
4,112569,10032,140372,7,2765,55,HYPOVOLEMIA (End 2005),Fluid and electrolyte disorders


Unnamed: 0_level_0,Abdominal hernia,Abdominal pain,Acute and unspecified renal failure,Acute cerebrovascular disease,Acute myocardial infarction,Acute posthemorrhagic anemia,Administrative/social admission,Alcohol-related disorders,Allergic reactions,Anal and rectal conditions,...,Skull and face fractures,Spinal cord injury,Spondylosis; intervertebral disc disorders; other back problems,Substance-related disorders,Suicide and intentional self-inflicted injury,Syncope,Thyroid disorders,Tuberculosis,Urinary tract infections,Varicose veins of lower extremity
HADM_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100375,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
100969,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
101361,0,0,1,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
102203,0,0,1,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
103379,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0


In [357]:
# Clinical Classification Software (CCS) from ICD 10

#https://www.hcup-us.ahrq.gov/toolssoftware/ccs10/ccs10.jsp


def load_ccs10():
    """
    Purpose: 
        Load the ICD-10-CM to Clinical Classification Software (CCS) lookup table
    Input:
        None
    Output:
        Dataframe holding the first four columns of the cleaned file
    Comments:
        Reads 'ccs_dx_icd10cm_2019_1.csv' from the current working directory
        Single quotes are stripped from column names and values; blank values become NaN
        Spaces are removed from 'ICD-10-CM CODE'
    """
    csv_path = os.path.join(os.getcwd(),'ccs_dx_icd10cm_2019_1.csv')
    table = pd.read_csv(csv_path)

    # Names and values are wrapped in single quotes in the file
    table.columns = [name.strip('\'') for name in table.columns]
    for name in table.columns:
        table[name] = table[name].str.strip('\'')

    table = table.replace(r'^\s*$', np.nan, regex=True)
    table['ICD-10-CM CODE'] = table['ICD-10-CM CODE'].str.replace(' ','')

    # Any column after the fourth is discarded
    return table.iloc[:,0:4]
    
def icd10toccs(df,col_icd='icd10'):
    """
    Purpose: 
        Attach Clinical Classification Software (CCS) columns to ICD-10 diagnosis rows
    Input:
        df: Dataframe holding one icd10 code per row
        col_icd: Name of the column in df that holds the icd10 code
    Output:
        df left-merged with the table returned by load_ccs10
    Comments:
        The lookup's own 'ICD-10-CM CODE' column is removed unless it is the column that was merged on
    """
    lookup_key = 'ICD-10-CM CODE'
    merged = pd.merge(df,load_ccs10(),how='left',left_on=col_icd,right_on=lookup_key)
    if col_icd==lookup_key:
        return merged
    # The lookup key only duplicates col_icd at this point
    return merged.drop(lookup_key,axis=1)

    

# Demo: keep only the rows flagged as ICD-10 (version flag compared as the string '10')
# and preview the CCS columns attached by icd10toccs.
diagdf =loaddiagdf()
diagdf=diagdf.loc[diagdf['ICDX_VERSION_NO']=='10',:]
display(icd10toccs(diagdf,col_icd='ICDX_DIAGNOSIS_CODE').head())

Unnamed: 0,ICDX_VERSION_NO,ICDX_DIAGNOSIS_CODE,DESCRIPTION,CCS CATEGORY,ICD-10-CM CODE DESCRIPTION,CCS CATEGORY DESCRIPTION
0,10,G912,(Idiopathic) normal pressure hydrocephalus,95,(Idiopathic) normal pressure hydrocephalus,Other nervous system disorders
1,10,Z3A10,10 weeks gestation of pregnancy,259,10 weeks gestation of pregnancy,Residual codes; unclassified
2,10,Z3A11,11 weeks gestation of pregnancy,259,11 weeks gestation of pregnancy,Residual codes; unclassified
3,10,Z3A12,12 weeks gestation of pregnancy,259,12 weeks gestation of pregnancy,Residual codes; unclassified
4,10,Z3A13,13 weeks gestation of pregnancy,259,13 weeks gestation of pregnancy,Residual codes; unclassified


In [450]:
# Demo input: the CSV locations below are absolute paths on the author's machine.
diagdf = pd.read_csv(r'C:\Users\Sean Yu\Documents\WUSTL\i2bmi\TESTING\lstm\mimictest\DIAGNOSES_ICD.csv')
d_diagdf = pd.read_csv(r'C:\Users\Sean Yu\Documents\WUSTL\i2bmi\TESTING\lstm\mimictest\D_ICD_DIAGNOSES.csv')
# Attach the code dictionary columns to each diagnosis row (ROW_ID is dropped from both sides first)
diagdf = diagdf.drop('ROW_ID',axis=1).merge(d_diagdf.drop('ROW_ID',axis=1),how='left',on='ICD9_CODE')

# Everything together

def parsediag(dfin,col_icd,col_id,icd_version):
    """
    Purpose: 
        Annotate diagnosis rows with CCS category, chronic indicator, and Elixhauser comorbidity in one call
    Input:
        dfin: Dataframe holding one diagnosis code per row
        col_icd: Name of the column in dfin that holds the icd code
        col_id: Name of the identifier column (accepted but not used by this function)
        icd_version: 9 or 10, the icd version of the codes in col_icd
    Output:
        Dataframe with the columns of dfin followed by
        'CCS CATEGORY', 'CCS CATEGORY DESCRIPTION', 'CHRONIC', 'ElixComo', 'ElixComoScore'
        None (after printing an error message) when icd_version is neither 9 nor 10
    Comments:
        dfin itself is not modified; all work happens on a copy
    """
    working = dfin.copy()

    # Pick the version-specific CCS and CCI converters together
    if icd_version==9:
        toccs, tocci = icd9toccs, icd9tocci
    elif icd_version==10:
        toccs, tocci = icd10toccs, icd10tocci
    else:
        print('ERROR: Please set icd_version to 9 or 10')
        return None

    working = toccs(working,col_icd=col_icd)
    working = tocci(working,col_icd=col_icd)
    working = icdtoelixcomo(working,col_icd=col_icd)

    # Keep the original columns plus the selected annotation columns only
    annotation_cols = ['CCS CATEGORY','CCS CATEGORY DESCRIPTION','CHRONIC','ElixComo','ElixComoScore']
    return working.loc[:,list(dfin.columns)+annotation_cols]

# Demo: annotate the ICD-9 diagnosis table loaded earlier in this cell
parsediag(diagdf,col_icd='ICD9_CODE',col_id='HADM_ID',icd_version=9)


Unnamed: 0,SUBJECT_ID,HADM_ID,SEQ_NUM,ICD9_CODE,SHORT_TITLE,LONG_TITLE,CCS CATEGORY,CCS CATEGORY DESCRIPTION,CHRONIC,ElixComo,ElixComoScore
0,10032,140372,3,5070,Food/vomit pneumonitis,Pneumonitis due to inhalation of food or vomitus,129,Aspiration pneumonitis; food/vomitus,0,,
1,10032,140372,4,42830,Diastolc hrt failure NOS,"Diastolic heart failure, unspecified",108,Congestive heart failure; nonhypertensive,1,Congestive Heart Failure,7
2,10032,140372,5,4280,CHF NOS,"Congestive heart failure, unspecified",108,Congestive heart failure; nonhypertensive,1,Congestive Heart Failure,7
3,10032,140372,6,2851,Ac posthemorrhag anemia,Acute posthemorrhagic anemia,60,Acute posthemorrhagic anemia,0,,
4,10032,140372,7,2765,,,55,Fluid and electrolyte disorders,0,Fluid and Electrolyte Disorders,5
5,10032,140372,8,2875,Thrombocytopenia NOS,"Thrombocytopenia, unspecified",62,Coagulation and hemorrhagic disorders,1,Coagulopathy,3
6,10032,140372,9,5849,Acute kidney failure NOS,"Acute kidney failure, unspecified",157,Acute and unspecified renal failure,0,,
7,10033,157235,1,24220,Tox multnod goit no cris,Toxic multinodular goiter without mention of t...,48,Thyroid disorders,1,,
8,10033,157235,2,51919,Trachea & bronch dis NEC,Other diseases of trachea and bronchus,134,Other upper respiratory disease,0,,
9,10033,157235,3,5849,Acute kidney failure NOS,"Acute kidney failure, unspecified",157,Acute and unspecified renal failure,0,,


In [148]:
# NDC to RXNORM to DRUG CLASS

def ndc2rxcui(df_med,col_ndc='ndc'):
    """
    Purpose: 
        Map every distinct NDC code to its RXCUI history using the RxNav ndcstatus REST service
    Input:
        df_med: Dataframe holding medication rows
        col_ndc: Name of the column in df_med that holds the NDC code
    Output:
        Dataframe with one row per (ndc, history entry) and columns ndc, rxcui, start, end
    Comments:
        One HTTP request is sent per distinct NDC code, with a short pause between requests
        start/end are built from the service's startDate/endDate with '01' appended as the day
        Rows with a blank or missing value are dropped
        NDC codes without any history are reported on stdout and skipped
    """
    print('Converting NDC to RXCUI')
    unique_ndcs=df_med[col_ndc].unique()
    total=len(unique_ndcs)
    records=[]
    for position,code in enumerate(unique_ndcs,start=1):
        print('{}/{}, {:.2f}% complete'.format(position,total,position/total*100), end='\r', flush=True)
        response=requests.get('https://rxnav.nlm.nih.gov/REST/ndcstatus.json?ndc='+str(code))
        status=response.json()['ndcStatus']
        if 'ndcHistory' not in status:
            print('NDC code [{}] was not able to be mapped to rxcui'.format(code))
        else:
            for entry in status['ndcHistory']:
                records.append({
                    'ndc':code,
                    'rxcui':entry['activeRxcui'],
                    'start':pd.to_datetime(entry['startDate']+'01'),
                    'end':pd.to_datetime(entry['endDate']+'01'),
                })
        # Pause between requests
        time.sleep(1/20)
    mapping=pd.DataFrame(records)
    # Blank strings count as missing so that dropna removes them
    mapping=mapping.replace({r'^\s*$':None}, regex=True)
    return mapping.dropna()
def rxcui2class(df_mapin,getname=True):
    """
    Purpose: 
        Map every distinct RXCUI to its drug classes using the RxNav rxclass REST service
    Input:
        df_mapin: Dataframe with an 'rxcui' column (e.g. the output of ndc2rxcui)
        getname: If True collect class names ('className'), otherwise class ids ('classId')
    Output:
        Dataframe with one row per mapped rxcui; besides 'rxcui' there is one column per
        '<relaSource>_<classType>' combination, each holding a set of class names or ids
    Comments:
        One HTTP request is sent per distinct rxcui, with a short pause between requests
        rxcui values without class information are reported on stdout and skipped
    """
    print('Converting rxcui to drug class')
    unique_rxcuis=df_mapin['rxcui'].unique()
    total=len(unique_rxcuis)
    identifier='className' if getname else 'classId'

    rows=[]
    for position,rxcui in enumerate(unique_rxcuis,start=1):
        print('{}/{}, {:.2f}% complete'.format(position,total,position/total*100), end='\r', flush=True)
        payload=requests.get('https://rxnav.nlm.nih.gov/REST/rxclass/class/byRxcui.json?rxcui='+str(rxcui)).json()

        if 'rxclassDrugInfoList' not in payload:
            print('rxcui [{}] was not able to be mapped to drug class'.format(rxcui))
        else:
            row={'rxcui':rxcui}
            for info in payload['rxclassDrugInfoList']['rxclassDrugInfo']:
                # Column key combines the relation source with the class type
                column=info['relaSource']+'_'+info['rxclassMinConceptItem']['classType']
                row.setdefault(column,set()).add(info['rxclassMinConceptItem'][identifier])
            rows.append(row)
        # Pause between requests
        time.sleep(1/20)
    return pd.DataFrame(rows)
def ndc2class(df_med,col_ndc='ndc',getname=True,indexcol='ROWID',timecol='TIME'):
    """
    Purpose: 
        Attach drug class columns to medication rows by mapping NDC to RXCUI to drug class
    Input:
        df_med: Dataframe holding medication rows
        col_ndc: Name of the column in df_med that holds the NDC code
        getname: If True collect class names, otherwise class ids (passed to rxcui2class)
        indexcol: Name of a column in df_med that uniquely identifies each row
        timecol: Name of the datetime column used to select the rxcui valid at that time
    Output:
        Dataframe with the rows of df_med plus the mapping columns, sorted by indexcol
    Comments:
        For each row the rxcui whose [start, end] range contains timecol is used;
        when several qualify, the one with the latest end date is kept
        Rows without an rxcui valid at timecol fall back to the rxcui with the latest end date for that NDC
        Rows whose NDC cannot be mapped at all are kept with missing mapping columns
    """
    map1=ndc2rxcui(df_med,col_ndc=col_ndc)
    map2=rxcui2class(map1,getname=getname)

    newmed=df_med.copy()
    
    #merge maps to go from ndc to rxnorm to class
    fullmap=map1.merge(map2,how='left',on='rxcui')
    
    #merge medication dataframe to full map
    temp=newmed.merge(fullmap,how='left',left_on=col_ndc,right_on='ndc')
    
    #filter based on start and end date
    temp=temp.loc[(temp[timecol]>=temp.start) & (temp[timecol] <=temp.end),:]
    
    #if multiple rxcui per ndc, take the one with the later end date
    temp=temp.sort_values('end')
    temp=temp.drop_duplicates(subset=list(df_med.columns),keep='last')

    #if no rxcui exist for ndc given the time range, try to just take the latest rxcui
    latestmap=fullmap.sort_values(by='end').drop_duplicates(subset='ndc',keep='last')
    unmatched=newmed.loc[~newmed[indexcol].isin(temp[indexcol]),:]
    #merge on col_ndc (previously hard-coded to 'NDC', which broke any other column name)
    fallback=unmatched.merge(latestmap,how='left',left_on=col_ndc,right_on='ndc')
    temp=pd.concat([temp,fallback],axis=0)
    temp=temp.sort_values(by=indexcol)
    return temp


# Demo: the CSV location below is an absolute path on the author's machine.
# Every column is read as object (string) so NDC codes are not parsed as numbers.
meddf=pd.read_csv(r'C:\Users\Sean Yu\Documents\WUSTL\i2bmi\TESTING\lstm\mimictest\PRESCRIPTIONS.csv',dtype=object)
# Keep rows with an NDC that is present and not '0', and whose DRUG_TYPE is 'MAIN'
meddf=meddf.loc[(meddf.NDC.notnull()) & (meddf.DRUG_TYPE=='MAIN') & (meddf.NDC!='0'),:]
meddf.STARTDATE = pd.to_datetime(meddf.STARTDATE)
meddf.ENDDATE = pd.to_datetime(meddf.ENDDATE)
meddf=meddf.loc[:,['ROW_ID','SUBJECT_ID','HADM_ID','STARTDATE','ENDDATE','DRUG','NDC']]
# Assign an arbitrary fixed time to every row; ndc2class uses it to pick the rxcui valid at that date
meddf['TIME']=pd.to_datetime('2011-01-01')

# Sends one web request per distinct NDC and per distinct rxcui
output=ndc2class(meddf,col_ndc='NDC',indexcol='ROW_ID',timecol='TIME')

Converting NDC to RXCUI
NDC code [15127020017] was not able to be mapped to rxcui
NDC code [17314931102] was not able to be mapped to rxcui
NDC code [66591018442] was not able to be mapped to rxcui
NDC code [17191003500] was not able to be mapped to rxcui
NDC code [00338101102] was not able to be mapped to rxcui
NDC code [50383068304] was not able to be mapped to rxcui
NDC code [00338040360] was not able to be mapped to rxcui
NDC code [00245008201] was not able to be mapped to rxcui
NDC code [37205014472] was not able to be mapped to rxcui
NDC code [55499120401] was not able to be mapped to rxcui
NDC code [49452477503] was not able to be mapped to rxcui
NDC code [11980002515] was not able to be mapped to rxcui
NDC code [00074729501] was not able to be mapped to rxcui
NDC code [16837085839] was not able to be mapped to rxcui
NDC code [62174057751] was not able to be mapped to rxcui
NDC code [66689036430] was not able to be mapped to rxcui
NDC code [00065041435] was not able to be mapped

In [149]:
# Display the medication table returned by ndc2class in the previous cell
output

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,STARTDATE,ENDDATE,DRUG,NDC,TIME,end,ndc,...,FDASPL_MOA,FDASPL_PE,FMTSME_TC,MEDRT_CHEM,MEDRT_DISEASE,MEDRT_MOA,MEDRT_PE,MEDRT_PK,MESH_MESHPA,VA_VA
945,1093789,43748,121860,2179-04-17,2179-04-23,Influenza Virus Vaccine,58160087546,2011-01-01,NaT,,...,,,,,,,,,,
946,1093790,43748,121860,2179-04-17,2179-04-23,Senna,00904516561,2011-01-01,2018-11-01,00904516561,...,,,,{Senna Extract},"{Colic, Constipation}",{Unknown Cellular or Molecular Interaction},"{Increased Large Intestinal Muscle Tone, Incre...",,{Cathartics},{STIMULANT LAXATIVES}
944,1093791,43748,121860,2179-04-17,2179-04-19,Docusate Sodium (Liquid),00121054410,2011-01-01,2018-11-01,00121054410,...,,,,{Dioctyl Sulfosuccinic Acid},"{Abdominal Pain, Drug Hypersensitivity, Nausea...",{Surfactant Activity},{Stimulation Large Intestine Fluid/Electrolyte...,,,{STOOL SOFTENER}
18260,1093792,43748,121860,2179-04-17,2179-04-23,Simvastatin,51079045620,2011-01-01,2018-11-01,51079045620,...,{Hydroxymethylglutaryl-CoA Reductase Inhibitors},,,{Simvastatin},"{Hypercholesterolemia, Drug Hypersensitivity, ...",{Hydroxymethylglutaryl-CoA Reductase Inhibitors},,,"{Hypolipidemic Agents, Hydroxymethylglutaryl-C...",{ANTILIPEMIC AGENTS}
18252,1093793,43748,121860,2179-04-17,2179-04-17,Zolpidem Tartrate,60505260400,2011-01-01,2018-11-01,60505260400,...,{GABA A Agonists},{Central Nervous System Depression},,{Pyridines},"{Drug Hypersensitivity, Sleep Initiation and M...","{GABA A Modulators, GABA A Agonists}",{Decreased Central Nervous System Organized El...,,,"{SEDATIVES/HYPNOTICS,OTHER}"
18262,1093794,43748,121860,2179-04-17,2179-04-23,FoLIC Acid,00182050789,2011-01-01,2013-09-01,00182050789,...,,,,{Folic Acid},"{Anemia, Iron-Deficiency, Drug Hypersensitivit...",{Enzyme Interactions},{Cellular Activity Alteration},,"{Hematinics, Vitamin B Complex}",{FOLIC ACID/LEUCOVORIN}
18259,1093795,43748,121860,2179-04-17,2179-04-21,Acetaminophen,00182844789,2011-01-01,2013-01-01,00182844789,...,,,,{Acetaminophen},"{Drug Hypersensitivity, Glucosephosphate Dehyd...",{Prostaglandin Receptor Antagonists},"{Hypothalamic Endocrine Activity Alteration, P...","{Renal Excretion, Hepatic Metabolism}","{Antipyretics, Analgesics, Non-Narcotic}",{NON-OPIOID ANALGESICS}
18266,1093796,43748,121860,2179-04-17,2179-04-23,Amiodarone,51079090620,2011-01-01,2018-11-01,51079090620,...,"{Cytochrome P450 2C9 Inhibitors, Cytochrome P4...",,{Anti-arrhythmic Agent},"{Fluoroquinolones, Ritonavir, Amiodarone, Cisa...","{Tachycardia, Supraventricular, Lactation, Bra...","{Potassium Channel Interactions, Adrenergic be...","{Negative Chronotropy, Ventricular Repolarizat...",,"{Cytochrome P-450 CYP3A Inhibitors, Sodium Cha...",{ANTIARRHYTHMICS}
18267,1093797,43748,121860,2179-04-17,2179-04-23,Pantoprazole,00008084199,2011-01-01,2018-11-01,00008084199,...,{Proton Pump Inhibitors},,,{2-Pyridinylmethylsulfinylbenzimidazoles},"{Esophagitis, Drug Hypersensitivity, Zollinger...",{Proton Pump Inhibitors},{Inhibition Gastric Acid Secretion},{Hepatic Metabolism},,"{GASTRIC MEDICATIONS,OTHER}"
18254,1094506,43748,121860,2179-04-17,2179-04-18,Zolpidem Tartrate,60505260400,2011-01-01,2018-11-01,60505260400,...,{GABA A Agonists},{Central Nervous System Depression},,{Pyridines},"{Drug Hypersensitivity, Sleep Initiation and M...","{GABA A Modulators, GABA A Agonists}",{Decreased Central Nervous System Organized El...,,,"{SEDATIVES/HYPNOTICS,OTHER}"
