In [48]:
import pandas as pd
import numpy as np
import plotly.express as px
from mylib import flatten, EurekaUtil
from IPython.display import display, HTML


def printmd(string):
    display(string)

%load_ext autoreload
%autoreload 2

pd.set_option('display.max_columns', 50)
pd.set_option('display.max_rows', 500)

medications_csv_path = "/Users/bubbles/src/python/OpenEpic/data/medications_2021-12-10.csv"
rxnorm_file = "/Users/bubbles/src/python/OpenEpic/libraries/RxTerms202203/RxTerms202203.txt"
rxnorm_ingredients_file = "/Users/bubbles/src/python/OpenEpic/libraries/RxTerms202203/RxTermsIngredients202203.txt"


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [49]:
## Load rxnorm_ingredients_file
print("Reading rxnorm file...")
eureka_util = EurekaUtil(rxnorm_ingredients_file)
print("Done.  List df_rxnormIngr headings:", list(df_rxnormIngr))
print("")
print("Reading medications_csv...")
## READ MEDICATIONS_CSV AND SUMMARIZE
df_meds = pd.read_csv(medications_csv_path)
print("Done")
print("")
print('Unique patients: ', len(df_meds['user_id'].unique()))
print('Unique meds: ', len(df_meds['rxcui'].unique()))
print(df_meds['medication_name'].value_counts())
print('\n')
print('Total records: ', len(df_meds))
print(list(df_meds))

Reading rxnorm file...
Done.  List df_rxnormIngr headings: ['RXCUI', 'INGREDIENT', 'ING_RXCUI']

Reading medications_csv...



Columns (16,20) have mixed types. Specify dtype option on import or set low_memory=False.



Done

Unique patients:  37073
Unique meds:  5403
Atorvastatin (Oral Pill)      46183
Levothyroxine (Oral Pill)     44222
Lisinopril (Oral Pill)        33204
ZyrTEC (Oral Pill)            29905
Aspirin (Oral Pill)           25636
                              ...  
allerfy nasal spray               1
prestic                           1
progesterone\nprogesterone        1
pantropazole                      1
Cloidogrel                        1
Name: medication_name, Length: 14897, dtype: int64


Total records:  2235683
['user_id', 'site', 'submitted_at', 'custom_entry', 'medication_name', 'brand_name', 'full_generic_name', 'product_name', 'sxdg_name', 'rxcui', 'generic_rxcui', 'sxdg_rxcui', 'route', 'dosage_form', 'strength', 'currently_taking', 'as_needed', 'frequency_number', 'frequency_every', 'frequency_unit', 'frequency_times_per_unit', 'variable_basis', 'reason']


In [50]:
df_meds.head(3)

Unnamed: 0,user_id,site,submitted_at,custom_entry,medication_name,brand_name,full_generic_name,product_name,sxdg_name,rxcui,generic_rxcui,sxdg_rxcui,route,dosage_form,strength,currently_taking,as_needed,frequency_number,frequency_every,frequency_unit,frequency_times_per_unit,variable_basis,reason
0,14,covid,2021-09-27,True,ASPIRIN,,,,,,,,,,,False,,,,,,not known,
1,14,covid,2021-09-27,False,Atorvastatin (Oral Pill),,atorvastatin 40 MG Oral Tablet,atorvastatin 40 MG Oral Tablet,atorvastatin Pill,617311.0,,1158285.0,Oral Pill,Oral Tablet,40 mg,True,False,1.0,1.0,day,,,
2,14,covid,2021-09-27,False,Dofetilide (Oral Pill),,dofetilide 0.125 MG Oral Capsule,dofetilide 0.125 MG Oral Capsule,dofetilide Pill,310003.0,,1160621.0,Oral Pill,Oral Capsule,0.125 mg,True,False,3.0,12.0,hour,,,


In [51]:
## Pre-Process data
print("Records before filter:", len(df_meds))
## Convert submitted_at to datetime objects
df_meds["submitted_at"] = pd.to_datetime(df_meds["submitted_at"])
# df_meds.iloc[:, 16].value_counts()

## Keep only latest set of submitted_at date
latest_date_per_user = df_meds.groupby('user_id')['submitted_at'].max().reset_index()
df_meds_last = df_meds.merge(df_meds.groupby('user_id')['submitted_at'].max().reset_index(),
                            on=['user_id', 'submitted_at'], how='inner')



# Remove 'currently_taking' = False
df_meds_last = df_meds_last[df_meds_last['currently_taking'] == True]

print("Records after filter:", len(df_meds_last))
# df_meds_last.head(20)
# result.loc[result['user_id'] == 14]

Records before filter: 2235683
Records after filter: 110972


In [52]:
## Show final post-processed summaries
print('Show final post-processed summaries')
print('')
print('total records: ', len(df_meds_last))
df_meds_last['medication_name'].value_counts().head(10)
out = df_meds_last['user_id'].value_counts().value_counts().reset_index(name="records")
out = out.rename(columns={'index':'# of meds'})
out.sort_values(by=['# of meds'])

fig = px.bar(out, x="# of meds", y="records")
fig.update_yaxes(title="Number of patients")
fig.update_xaxes(title="Number of meds")
fig.show()

Show final post-processed summaries

total records:  110972


In [53]:
freq = df_meds_last['medication_name'].value_counts(normalize=True).head(20)
fig = px.bar(freq, orientation='h')
fig.update_yaxes(autorange="reversed", title="Top Medications in Eureka CCS")
fig.update_xaxes(title="Probability")
fig.show()


In [54]:
## THIS CELL IS FOR QUERYING THE RXCUI DATABASE
print(list(eureka_util.df_rxnormIngr))
# df_rxnormIngr[df_rxnormIngr['INGREDIENT'].str.contains('hydrochlorothiazide', case=False)].value_counts(['INGREDIENT', 'ING_RXCUI'])

query_ing = 'Norgestimate'
rxcui = None
query_drugname = "chole"
if query_ing is not None: 
    print("query result:")
    print(eureka_util.searchIngredientByName(query_ing))
    print("")
if rxcui is not None: 
    print("rxcui result:", eureka_util.getIngredientNameByRxcui(10582))
    print("")
# if query_drugname is not None: eureka_util.
if query_drugname is not None: 
    eureka_util.findIngredientsByProductName(query_drugname, verbose=True)


AttributeError: 'EurekaUtil' object has no attribute 'df_rxnormIngr'

In [55]:
# DEAL WITH CUSTOM ENTRIES 
## Adds mappings from rxcui_map to df_meds  (adds INGREDIENT_LIST and ING_RXCUI_LIST)
df_meds_mapped = eureka_util.addIngredientColumns(df_meds_last)


mapping_dictionary = {
    'ASPIRIN': 1191,
    'asprin': 1191,
    'thyroxine': 10582, 
    'synthroid': 10582,
    'albuterol': 435, 
    'ventolin': 435,
    'vitamin d': 2418,
    'vitamin c': 1151,
    'Ethinyl estradiol/Inert ingredients/Norgestimate': [4124, 31994]
}
eureka_util.addIngredientsForCustomMedications(df_meds_mapped, mapping_dictionary, verbose=1)
    
# Test
df_meds_mapped[df_meds_mapped['medication_name'] == 'ventolin']


Unnamed: 0,user_id,site,submitted_at,custom_entry,medication_name,brand_name,full_generic_name,product_name,sxdg_name,rxcui,generic_rxcui,sxdg_rxcui,route,dosage_form,strength,currently_taking,as_needed,frequency_number,frequency_every,frequency_unit,frequency_times_per_unit,variable_basis,reason,INGREDIENT_LIST,ING_RXCUI_LIST
2107,132351,ucsfhealth,2021-04-04,True,ventolin,,,,,,,,,,,True,True,0.0,0.0,hour,,,,[albuterol],[435]
6299,371416,CCS10,2020-03-28,True,ventolin,,,,,,,,,,200micrograms,True,True,0.0,0.0,hour,,,,[albuterol],[435]
6651,371692,covid19,2020-04-04,True,ventolin,,,,,,,,,,,True,True,0.0,0.0,hour,,,Athletically induced asthma,[albuterol],[435]
17102,380655,covid19,2021-03-05,True,ventolin,,,,,,,,,,,True,True,0.0,0.0,hour,,,Asthma,[albuterol],[435]
27420,389759,ALA1,2021-11-24,True,ventolin,,,,,,,,,,0.21 mg/ml Sol,True,True,0.0,0.0,hour,,,Exercise Induced Asthma,[albuterol],[435]
48751,415895,bethematch,2021-11-30,True,ventolin,,,,,,,,,,90 mcg,True,True,0.0,0.0,hour,,,Asthma,[albuterol],[435]
58062,431010,cardiogram2,2021-04-08,True,ventolin,,,,,,,,,,,True,True,0.0,0.0,hour,,,,[albuterol],[435]


In [56]:
medlist = df_meds_mapped['INGREDIENT_LIST'].tolist()
medlist = pd.DataFrame(flatten(medlist), columns=['Drug Ingredient'])

In [57]:
freq = medlist.value_counts().reset_index(name="Counts").head(20)
fig = px.bar(freq, x="Counts", y="Drug Ingredient", orientation='h', title="Top Active Ingredients in Eureka CCS", text_auto=True)
fig.update_yaxes(autorange="reversed", title="Top Medication Ingredients")
fig.update_xaxes(title="Number")
fig.update_layout(showlegend=False)
fig.show()

In [58]:
# QUERY BY INGREDIENT (EXAMPLE)
# nan shows up as floats. 
mask = df_meds_mapped.INGREDIENT_LIST.apply(lambda x: not isinstance(x, float) and 'norgestimate' in x)
df_meds_mapped[mask]

Unnamed: 0,user_id,site,submitted_at,custom_entry,medication_name,brand_name,full_generic_name,product_name,sxdg_name,rxcui,generic_rxcui,sxdg_rxcui,route,dosage_form,strength,currently_taking,as_needed,frequency_number,frequency_every,frequency_unit,frequency_times_per_unit,variable_basis,reason,INGREDIENT_LIST,ING_RXCUI_LIST
1125,36848,19covid,2021-11-29,True,Ethinyl estradiol/Inert ingredients/Norgestima...,,,,,,,,,,mixed Pack,True,False,1.0,1.0,day,,,Pre menopause symptoms,"[ethinyl estradiol, norgestimate]","[4124, 31994]"
1615,64195,covid-19,2020-08-25,True,Ethinyl estradiol/Inert ingredients/Norgestima...,,,,,,,,,,mixed Pack,True,False,0.0,0.0,day,,,,"[ethinyl estradiol, norgestimate]","[4124, 31994]"
2011,118480,CCS10,2020-08-25,True,Ethinyl estradiol/Inert ingredients/Norgestima...,,,,,,,,,,mixed Pack,True,False,1.0,1.0,month,,,,"[ethinyl estradiol, norgestimate]","[4124, 31994]"
2589,169078,CCS10,2020-09-04,True,Ethinyl estradiol/Inert ingredients/Norgestima...,,,,,,,,,,mixed Pack,True,False,0.0,0.0,hour,,100mcg transdermal patch every 72 hours,Chronic severe back pain,"[ethinyl estradiol, norgestimate]","[4124, 31994]"
3257,234742,CCS11,2021-03-06,True,Ethinyl estradiol/Inert ingredients/Norgestima...,,,,,,,,,,mixed Pack,True,False,1.0,7.0,day,,,,"[ethinyl estradiol, norgestimate]","[4124, 31994]"
4003,306450,CCS10,2020-11-10,True,Ethinyl estradiol/Inert ingredients/Norgestima...,,,,,,,,,,mixed Pack,True,False,1.0,24.0,hour,,,Acne,"[ethinyl estradiol, norgestimate]","[4124, 31994]"
4211,309636,CCS10,2020-08-25,True,Ethinyl estradiol/Inert ingredients/Norgestima...,,,,,,,,,,mixed Pack,True,False,1.0,1.0,month,,,Birth control,"[ethinyl estradiol, norgestimate]","[4124, 31994]"
4316,319678,CCS10,2020-08-25,True,Ethinyl estradiol/Inert ingredients/Norgestima...,,,,,,,,,,mixed Pack,True,False,1.0,1.0,month,,,,"[ethinyl estradiol, norgestimate]","[4124, 31994]"
5006,355805,covid19,2020-08-25,True,Ethinyl estradiol/Inert ingredients/Norgestima...,,,,,,,,,,mixed Pack,True,False,1.0,1.0,month,,,,"[ethinyl estradiol, norgestimate]","[4124, 31994]"
5039,359362,covid19,2020-08-25,True,Ethinyl estradiol/Inert ingredients/Norgestima...,,,,,,,,,,mixed Pack,True,False,1.0,1.0,month,,,,"[ethinyl estradiol, norgestimate]","[4124, 31994]"


In [59]:
df_custom = df_meds_mapped[(df_meds_mapped['custom_entry'] == True) & (df_meds_mapped['INGREDIENT_LIST'].isnull())]
print('Total number of records: ', len(df_meds_mapped))
print('Total number of custom entries: ', len(df_custom))
df_custom['medication_name'].value_counts().head(10)

Total number of records:  110972
Total number of custom entries:  14425


multivitamin                          362
prenatal vitamin                      177
MONISTAT 7 COMBINATION PACK (Pack)    166
birth control                         158
probiotic                             122
Multivitamin                           94
fish oil                               90
D3                                     89
multi vitamin                          77
trazadone                              72
Name: medication_name, dtype: int64

In [62]:
## Query RxClass API
rxcui = 5487
search_str = 'codeine'
result = eureka_util.searchIngredientByName(search_str)
print(result)
if len(result) == 0:
    print('rxcui not found')
else:
    rxcui = result['ING_RXCUI'][0]
    print(f'Using {rxcui}')
    classTypes_ignore = ['DISEASE', 'PE', 'MOA', 'CHEM', 'STRUCT', 'DISPOS', 'EPC', 'PK']
    classType = ['ATC1-4']
    filterByClassType = True
    
    url_s = f'https://rxnav.nlm.nih.gov/REST/rxclass/class/byRxcui.json?rxcui={str(rxcui)}'
    # with urllib.request.urlopen(url_s) as url:
    #     json = json.loads(url.read().decode())
    json = pd.read_json(url_s)
    for item in json['rxclassDrugInfoList']['rxclassDrugInfo']:
        if (item['minConcept']['rxcui'] == str(rxcui) 
            and (not filterByClassType or item['rxclassMinConceptItem']['classType'] in classType)):
    #         print(item['minConcept'])
            print(item['rxclassMinConceptItem'])

   ING_RXCUI      INGREDIENT
0       2670         codeine
1      23088  dihydrocodeine
Using 2670
{'classId': 'R05DA', 'className': 'Opium alkaloids and derivatives', 'classType': 'ATC1-4'}
