In [1]:
import numpy as np
import pandas as pd
import plotly.express as px
from IPython.display import HTML, Markdown, display

from lib.CCSDataUtil import CCSDataUtil
from lib.DrugUtil import flatten, DrugUtil
from lib.FhirDataUtil import FhirDataUtil


def printmd(string):
    """Render ``string`` as Markdown in the notebook output area.

    Parameters
    ----------
    string : str or any displayable object
        Markdown source text. Non-string objects are passed to ``display``
        unchanged, so existing calls with other objects keep working.
    """
    if isinstance(string, str):
        # Without the Markdown wrapper a str is shown as its plain repr,
        # which defeats the purpose of a "print markdown" helper.
        display(Markdown(string))
    else:
        display(string)

%load_ext autoreload
%autoreload 2

# pandas display limits used for every table rendered in this notebook.
pd.set_option('display.max_columns', 50)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_colwidth', None)  # None = never truncate cell text

# Directory for generated results.
_output_dir = 'output/'

# Static drug reference files (RxTerms release 202203 and an RxClass export),
# passed to DrugUtil.load().
_rxnorm_file = "input_static/RxTerms202203/RxTerms202203.txt"
_rxnorm_ingredients_file = "input_static/RxTerms202203/RxTermsIngredients202203.txt"
_rxclass_file = "input_static/_rxclass_2022-04-10.csv"

# CCS input extracts (demographics, medications, conditions).
_ccs_demographics_file = "input/ccs/ccs_demographics.csv"
_ccs_medications_file = "input/ccs/ccs_medications.csv"
_ccs_conditions_file = "input/ccs/ccs_conditions.csv"

# FHIR demographics extract; its user_id column defines the cohort kept below.
_fhir_demographics_file = "input/fhir_20230423/fhir_demographics.csv"

# Directory for intermediate results. NOTE: the value already ends with '/',
# and the export cell prepends another '/' to the file name it appends.
_cache_dir = 'cache/'

In [2]:
# Build the project helpers and load the reference/demographic data.
fhirUtil = FhirDataUtil()
drugUtil = DrugUtil()
# Load the RxTerms / RxClass reference files configured in the first cell.
drugUtil.load(rxnorm_file=_rxnorm_file,
                        rxnorm_ingredients_file=_rxnorm_ingredients_file, 
                        rxclass_file=_rxclass_file)

# CCS demographics; ccs_demo is a shorthand for the loaded table.
ccsUtil = CCSDataUtil(drugUtil)
ccsUtil.load_demographics(_ccs_demographics_file)
ccs_demo = ccsUtil.demographics

# FHIR demographics; fhir_demo['user_id'] is used below to select the cohort.
fhirUtil.load_demographics(fhir_demographics_file=_fhir_demographics_file)
fhir_demo = fhirUtil.demographics

Reading rxclass file...
Reading rxnorm file...
Loaded demographics file with entries:  99064


  self.demographics = pd.read_csv(ccs_demographics)


In [3]:

# How many distinct users appear only in CCS (left_only), only in FHIR
# (right_only), or in both demographic tables?
ccs_users = ccs_demo.drop_duplicates('user_id')
fhir_users = fhir_demo.drop_duplicates('user_id')
user_overlap = ccs_users.merge(fhir_users, on='user_id', how="outer", indicator=True)
user_overlap['_merge'].value_counts()

left_only     97508
both           1555
right_only       48
Name: _merge, dtype: int64

In [4]:
chunksize = 500000

# The medications file is read in chunks of `chunksize` rows; only rows of
# users that also appear in the FHIR demographics are kept from each chunk.
list_of_dataframes = []
rows_read = 0
fhir_user_ids = fhir_demo['user_id']  # loop-invariant, looked up once

for chunk in pd.read_csv(_ccs_medications_file, chunksize=chunksize, low_memory=False):
    # Count the rows actually read so far (including the current, possibly
    # shorter, chunk) so the progress message is not one chunk behind.
    rows_read += len(chunk)
    print('loaded', rows_read, 'records')
    chunk = chunk[chunk['user_id'].isin(fhir_user_ids)]
    list_of_dataframes.append(chunk)

# Combine the filtered chunks into a single frame.
ccs_meds = pd.concat(list_of_dataframes)

print("Total ccs_meds entries loaded:", len(ccs_meds))

loaded 0 records
loaded 500000 records
loaded 1000000 records
loaded 1500000 records
loaded 2000000 records
loaded 2500000 records
loaded 3000000 records
loaded 3500000 records
loaded 4000000 records
loaded 4500000 records
Total ccs_meds entries loaded: 296322


In [5]:
# Restrict the medication entries to users present in the FHIR demographics
# and parse the submission timestamps into datetimes.
in_fhir_cohort = ccs_meds['user_id'].isin(fhir_demo['user_id'])
ccs_meds = ccs_meds.loc[in_fhir_cohort].assign(
    submitted_at=lambda meds: pd.to_datetime(meds["submitted_at"])
)
print('Selected users:', ccs_meds['user_id'].nunique())
print('Total Meds:', len(ccs_meds))

Selected users: 709
Total Meds: 296322


In [6]:
# De-duplicate per user in two passes:
#   1. keep the first row of every (user_id, rxcui) pair, plus every row
#      that has no rxcui at all;
#   2. keep the first row of every (user_id, medication_name) pair.
first_rxcui_per_user = ~ccs_meds.duplicated(subset=['user_id', 'rxcui'])
missing_rxcui = ccs_meds['rxcui'].isna()
ccs_meds = (
    ccs_meds
    .loc[first_rxcui_per_user | missing_rxcui]
    .assign(submitted_at=lambda meds: pd.to_datetime(meds["submitted_at"]))
    .drop_duplicates(subset=['user_id', 'medication_name'])
)
print('total meds after de-duplication:', len(ccs_meds))

total meds after de-duplication: 6186


In [7]:
# Keep only latest entry for each user
# latest_date_per_user = ccs_meds.groupby('user_id')['submitted_at'].max().reset_index()
# ccs_meds = ccs_meds.merge(latest_date_per_user, on=['user_id', 'submitted_at'], how='inner')
# ccs_meds = ccs_meds.drop_duplicates(['user_id', 'rxcui'])
# print("Keeping only the latest set of entries for each user", len(ccs_meds))

In [8]:
# Show the submission dates in ascending order to check the covered date range.
ccs_meds['submitted_at'].sort_values()

163156    2020-01-23
19501     2020-01-23
163153    2020-01-23
163154    2020-01-23
163155    2020-01-23
             ...    
4556823   2023-05-16
2044994   2023-05-16
4129453   2023-05-16
3355537   2023-05-16
1467033   2023-05-16
Name: submitted_at, Length: 6186, dtype: datetime64[ns]

In [9]:
# Attach ingredient code/name from the RxNorm ingredient table by rxcui.
# 'ingr_merge' records whether each medication row found a match.
ccs_meds = (
    ccs_meds
    .merge(
        drugUtil.rxnormIngr,
        left_on='rxcui',
        right_on='RXCUI',
        how='left',
        indicator='ingr_merge',
    )
    .rename(columns={'ING_RXCUI': 'ccs_ing_code', 'INGREDIENT': 'ccs_ing_name'})
)
ccs_meds['ingr_merge'].value_counts()

both          5322
left_only     1246
right_only       0
Name: ingr_merge, dtype: int64

In [10]:
# Rows without an ingredient name after the RxNorm merge.
missing_ing_name = ccs_meds['ccs_ing_name'].isna()
print('Number of custom entries:', missing_ing_name.sum())

Number of custom entries: 1246


In [11]:
# Peek at the second and third rows (positional slice).
ccs_meds.iloc[1:3]

Unnamed: 0,user_id,site,submitted_at,custom_entry,medication_name,brand_name,full_generic_name,product_name,sxdg_name,rxcui,generic_rxcui,sxdg_rxcui,route,dosage_form,strength,currently_taking,as_needed,frequency_number,frequency_every,frequency_unit,frequency_times_per_unit,variable_basis,reason,RXCUI,ccs_ing_name,ccs_ing_code,ingr_merge
1,707,covid19,2023-04-04,False,BENADRYL (Oral Pill),BENADRYL,diphenhydramine hydrochloride 25 MG Oral Capsule,diphenhydramine hydrochloride 25 MG Oral Capsule [Benadryl],Benadryl Pill,1049910.0,1049909.0,1170149.0,Oral Pill,Oral Capsule,25 mg,True,True,0.0,0.0,hour,,,,1049910.0,diphenhydramine,3498.0,both
2,707,covid19,2023-04-04,False,benadryl pill,BENADRYL,,,benadryl pill,,,1170149.0,Oral Pill,,,True,,,,,,not known,allergies,,,,left_only


In [12]:
# Manual mapping for custom (free-text) entries: well-known spellings and
# misspellings -> ingredient RXCUI(s).

# NOTE: this is an alias, not a copy. add_custom_ingredients writes into the
# frame it receives, so those edits are visible through ccs_meds as well.
ccs_meds_p = ccs_meds

# Keys are matched case-insensitively as substrings of the medication name by
# add_custom_ingredients. A value is either a single ingredient RXCUI or a list
# of RXCUIs for multi-ingredient products.
mapping_dictionary = {
         'ASPIRIN': 1191,
         'asprin': 1191,
         'thyroxine': 10582, 
         'synthroid': 10582,
         'albuterol': 435, 
         'ventolin': 435,
         'vitamin d': 2418,
         'vitamin c': 1151,
         'Ethinyl estradiol/Inert ingredients/Norgestimate': [4124, 31994]
}

def add_custom_ingredients(df_meds, mapping_dictionary,
                           search_column="medication_name",
                           ing_rxcui_column="rxcui_ing",
                           ing_name_column="ing_name", verbose=1):
    """Tag custom (free-text) medication rows with ingredient codes and names.

    For every ``substring -> ingredient RXCUI(s)`` entry of
    ``mapping_dictionary`` the ingredients are resolved through
    ``drugUtil.findIngredientByRxcui``. Rows whose ``search_column`` contains
    the substring (case-insensitive, literal match) and whose ``custom_entry``
    flag is True receive the comma-joined ingredient names and codes.
    Later dictionary entries overwrite earlier ones on rows matched by both.

    Parameters
    ----------
    df_meds : pandas.DataFrame
        Must contain ``search_column`` (strings) and a boolean
        ``custom_entry`` column. Modified in place.
    mapping_dictionary : dict
        Substring -> ingredient RXCUI, or list of RXCUIs.
    search_column : str
        Column searched for the substrings.
    ing_rxcui_column, ing_name_column : str
        Columns that receive the comma-joined codes / names.
    verbose : int
        Any value > 0 displays one summary line per mapping entry.

    Returns
    -------
    pandas.DataFrame
        The same ``df_meds`` object that was passed in.
    """
    # Explicit comparison in its own expression. Written inline as
    # `a & b == True`, Python evaluates `(a & b) == True` because `&` binds
    # tighter than `==`.
    is_custom = df_meds['custom_entry'] == True  # noqa: E712

    for substring, ing_rxcui in mapping_dictionary.items():
        if not isinstance(ing_rxcui, list):
            ing_rxcui = [ing_rxcui]

        # Resolve every requested ingredient; unknown codes are reported and skipped.
        ing_rxcui_list = []
        ing_name_list = []
        for code in ing_rxcui:
            rxcui, ing_name = drugUtil.findIngredientByRxcui(rxcui_ing=code)
            if ing_name is None:
                print(f'Unable to find ing_rxcui {code} in rxnormIngr')
            else:
                ing_rxcui_list.append(rxcui)
                ing_name_list.append(ing_name)

        if not ing_rxcui_list:
            print(f'NOT found {substring}, {ing_rxcui} records in ingredient database')
            continue

        # regex=False: the keys are plain substrings, so characters such as
        # '(' or '+' in a key must not be interpreted as a pattern.
        matches_substring = df_meds[search_column].str.contains(
            substring, na=False, case=False, regex=False)
        search_filter = matches_substring & is_custom
        count = int(search_filter.sum())

        joined_names = ','.join(ing_name_list)
        joined_codes = ','.join(map(str, ing_rxcui_list))
        df_meds.loc[search_filter, ing_name_column] = joined_names
        df_meds.loc[search_filter, ing_rxcui_column] = joined_codes

        if verbose > 0:
            # Report the names that were actually written, not whatever the
            # last lookup returned (which may be None for a failed lookup).
            display(HTML(f'Searched <b>{substring}</b> found <b>{joined_names}</b>, added ingredients to <b>{count}</b> records'))
    return df_meds

# Apply the manual mapping with the default column names, i.e. the results go
# into the helper columns 'rxcui_ing' and 'ing_name'.
ccs_meds_p = add_custom_ingredients(ccs_meds_p, mapping_dictionary)

# Turn comma-joined strings into lists so the columns can be exploded
def unpack(x):
    """Split a comma-separated string into a list; pass missing values through."""
    return x if pd.isna(x) else x.split(',')
# Comma-joined helper columns -> lists, then one row per ingredient.
for list_column in ('ing_name', 'rxcui_ing'):
    ccs_meds_p[list_column] = ccs_meds_p[list_column].apply(unpack)

ccs_meds_p = ccs_meds_p.explode(['rxcui_ing', 'ing_name'])

# Only fill ingredient fields that the RxNorm merge left empty.
# NOTE(review): the helper values are strings produced by str.split, while the
# merged ccs_ing_code values appear to be numeric; confirm the mixed types are
# acceptable downstream.
for target_column, helper_column in (('ccs_ing_code', 'rxcui_ing'),
                                     ('ccs_ing_name', 'ing_name')):
    ccs_meds_p[target_column] = ccs_meds_p[target_column].fillna(ccs_meds_p[helper_column])

# The helper columns are no longer needed once their values are copied over.
ccs_meds_p = ccs_meds_p.drop(columns=['rxcui_ing', 'ing_name'])
ccs_meds = ccs_meds_p

In [13]:
# Keep each row's current index label in an explicit 'idx' column; it is the
# key used to merge the ingredient guesses for the subset back into ccs_meds.
ccs_meds['idx'] = ccs_meds.index

# Rows that still have no ingredient code. .copy() makes this an independent
# frame: columns are added to it afterwards, and doing that on a plain slice
# of ccs_meds triggers pandas' SettingWithCopyWarning.
nancodes = ccs_meds[ccs_meds['ccs_ing_code'].isna()].copy()
print('number of nancodes:', len(nancodes))

number of nancodes: 1045


In [17]:

# Look up ingredients for the still-unmapped rows by medication name.
nancodes = drugUtil.add_ingredient_columns(
    nancodes,
    'medication_name',
    new_code_column='ing_code',
    new_name_column='ing_name',
    max_distance=1,
)

# Drop results of a previous run of this cell before merging the new ones.
if 'ing_code' in ccs_meds:
    ccs_meds = ccs_meds.drop(columns=['ing_code', 'ing_name'])

# Bring the lookups back via the 'idx' key, then one row per ingredient.
lookup_columns = nancodes[['idx', 'ing_code', 'ing_name']]
ccs_meds = ccs_meds.merge(lookup_columns, on='idx', how='left')
ccs_meds = ccs_meds.explode(['ing_code', 'ing_name'])

# Only fill the fields that are still empty.
for target_column, lookup_column in (('ccs_ing_code', 'ing_code'),
                                     ('ccs_ing_name', 'ing_name')):
    ccs_meds[target_column] = ccs_meds[target_column].fillna(ccs_meds[lookup_column])


100 / 1045
200 / 1045
300 / 1045
400 / 1045
500 / 1045
600 / 1045
700 / 1045
800 / 1045
900 / 1045
1000 / 1045


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[[new_name_column, new_code_column]] = df[med_name_column].apply(self.search_ingredient_by_substring, max_distance=max_distance)


In [20]:
# Number of rows that still have no ingredient code.
int(ccs_meds['ccs_ing_code'].isna().sum())

162

In [21]:
# Persist the mapped medications in the cache directory.
mapped_meds_path = f'{_cache_dir}/ccs_meds_mapped.csv'
ccs_meds.to_csv(mapped_meds_path)

# Old functions (likely to be deleted)

In [25]:
def convert_stringarray(row):
    """Parse a bracketed list string in ``row['rxcui_ing']`` into a list of strings.

    ``'[4124, 31994]'`` becomes ``['4124', '31994']``. Values whose text form
    is not enclosed in square brackets are returned unchanged.

    Parameters
    ----------
    row : mapping
        Anything indexable by ``'rxcui_ing'`` (e.g. a DataFrame row).

    Returns
    -------
    list of str, or the original value when it is not a bracketed list.
    """
    value = row['rxcui_ing']
    text = str(value).strip()
    # Require brackets at both ends; merely containing '[' is not enough to
    # justify chopping the first and last characters off.
    if not (text.startswith('[') and text.endswith(']')):
        return value
    # Strip the blank that follows each comma and drop empty tokens, so that
    # '[]' yields an empty list rather than [''].
    tokens = (token.strip() for token in text[1:-1].split(','))
    return [token for token in tokens if token]
# Parse the stringified lists in 'rxcui_ing' row by row.
parsed_rxcui_ing = ccsUtil.medications.apply(convert_stringarray, axis=1)
ccsUtil.medications['rxcui_ing'] = parsed_rxcui_ing

In [34]:
# One row per ingredient code; report the row count before and after.
rows_before = len(ccsUtil.medications)
print('ccs medications len: ', rows_before)
ccsUtil.medications = ccsUtil.medications.explode('rxcui_ing')
rows_after = len(ccsUtil.medications)
print('ccs medications len after: ', rows_after)

ccs medications len:  140229
ccs medications len after:  143872


In [35]:
# Save the preprocessed medications. The index is written as an extra
# (unnamed) first column because to_csv is called with its defaults.
ccsUtil.medications.to_csv('input/ccs_medications_preprocessed.csv')

In [6]:
# Reload the preprocessed file. NOTE: this rebinds ccs_meds, replacing the
# frame built in the cells above.
ccs_meds = pd.read_csv('input/ccs_medications_preprocessed.csv')

In [8]:
# Preview up to 300 custom (free-text) entries.
is_custom_entry = ccs_meds['custom_entry'].eq(True)
ccs_meds.loc[is_custom_entry].head(300)

Unnamed: 0.1,Unnamed: 0,index_orig,user_id,site,submitted_at,custom_entry,medication_name,brand_name,full_generic_name,product_name,sxdg_name,rxcui,generic_rxcui,sxdg_rxcui,route,dosage_form,strength,currently_taking,as_needed,frequency_number,frequency_every,frequency_unit,frequency_times_per_unit,variable_basis,reason,index,rxcui_ing,ing_name
0,0,0,14,covid,2021-09-27,True,ASPIRIN,,,,,,,,,,,False,,,,,,not known,,0,1191.0,['aspirin']
10,10,49,61,covid19,2021-11-30,True,statin,,,,,,,,,,20mg,True,False,0.0,0.0,day,,,,10,7597.0,
27,26,691,615,ucsfhealth,2021-12-07,True,Small Dose Aspirin,,,,,,,,,,81 mg,False,False,2.0,1.0,day,,,,26,1191.0,['aspirin']
29,28,776,629,covid19,2021-11-24,True,ASPIRIN,,,,,,,,,,,True,,,,,,not known,,28,1191.0,['aspirin']
37,35,846,782,covid19,2021-12-08,True,ASPIRIN,,,,,,,,,,81mg,False,False,1.0,1.0,day,,,,35,1191.0,['aspirin']
55,53,1634,797,CCS10,2021-08-08,True,Sertraline (Oral Pill),,,,,,,,,,50 mg Cap,True,False,0.0,0.0,day,,,panic disorder,53,,
56,54,1643,1162,bethematch,2021-03-27,True,ASPIRIN,,,,,,,,,,,False,,,,,,not known,,54,1191.0,['aspirin']
69,67,1763,1398,CCS10,2020-01-23,True,ASPIRIN,,,,,,,,,,,False,False,,,,,not known,,67,1191.0,['aspirin']
72,70,1766,1422,CCS10,2021-11-01,True,vitamin D,,,,,,,,,,25 mcg,True,False,1.0,24.0,hour,,,Covid,70,2418.0,['cholecalciferol']
79,77,2044,1454,covid19,2021-12-08,True,ASPIRIN,,,,,,,,,,,False,,,,,,not known,,77,1191.0,['aspirin']


In [4]:
# Row count of the reloaded, preprocessed medications table.
len(ccs_meds)

143872

In [5]:
# Display the table; how many rows are rendered is capped by the
# display.max_rows option set in the first cell.
ccs_meds

Unnamed: 0.1,Unnamed: 0,index_orig,user_id,site,submitted_at,custom_entry,medication_name,brand_name,full_generic_name,product_name,sxdg_name,rxcui,generic_rxcui,sxdg_rxcui,route,dosage_form,strength,currently_taking,as_needed,frequency_number,frequency_every,frequency_unit,frequency_times_per_unit,variable_basis,reason,index,rxcui_ing,ing_name
0,0,0,14,covid,2021-09-27,True,ASPIRIN,,,,,,,,,,,False,,,,,,not known,,0,1191.0,['aspirin']
1,1,1,14,covid,2021-09-27,False,Atorvastatin (Oral Pill),,atorvastatin 40 MG Oral Tablet,atorvastatin 40 MG Oral Tablet,atorvastatin Pill,617311.0,,1158285.0,Oral Pill,Oral Tablet,40 mg,True,False,1.0,1.0,day,,,,1,83367.0,
2,2,2,14,covid,2021-09-27,False,Dofetilide (Oral Pill),,dofetilide 0.125 MG Oral Capsule,dofetilide 0.125 MG Oral Capsule,dofetilide Pill,310003.0,,1160621.0,Oral Pill,Oral Capsule,0.125 mg,True,False,3.0,12.0,hour,,,,2,49247.0,
3,3,3,14,covid,2021-09-27,False,ELIQUIS (Oral Pill),ELIQUIS,apixaban 5 MG Oral Tablet,apixaban 5 MG Oral Tablet [Eliquis],Eliquis Pill,1364447.0,1364445.0,1364440.0,Oral Pill,Oral Tablet,5 mg,True,False,1.0,12.0,hour,,,Afib,3,1364430.0,
4,4,4,14,covid,2021-09-27,False,Ezetimibe (Oral Pill),,ezetimibe 10 MG Oral Tablet,ezetimibe 10 MG Oral Tablet,ezetimibe Pill,349556.0,,1163794.0,Oral Pill,Oral Tablet,10 mg,True,False,1.0,1.0,day,,,,4,341248.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
143867,132078,2235678,556213,BA1,2021-12-04,True,Cloidogrel,,,,,,,,,,75mg,True,False,1.0,,day,,,,132078,,
143868,132079,2235679,556213,BA1,2021-12-04,False,Lansoprazole (Oral Disintegrating),,lansoprazole 15 MG Disintegrating Oral Tablet,lansoprazole 15 MG Disintegrating Oral Tablet,lansoprazole Disintegrating Oral Product,351261.0,,1295364.0,Oral Disintegrating,Disintegrating Oral Tablet,15 mg,True,False,2.0,,day,,,,132079,17128.0,
143869,132080,2235680,556213,BA1,2021-12-04,False,Levothyroxine (Oral Pill),,levothyroxine sodium 0.075 MG Oral Capsule,levothyroxine sodium 0.075 MG Oral Capsule,levothyroxine Pill,905458.0,,1602745.0,Oral Pill,Oral Capsule,0.075 mg,True,False,1.0,,day,,,,132080,10582.0,
143870,132081,2235681,556213,BA1,2021-12-04,False,Losartan (Oral Pill),,losartan potassium 100 MG Oral Tablet,losartan potassium 100 MG Oral Tablet,losartan Pill,979480.0,,1165343.0,Oral Pill,Oral Tablet,100 mg,True,False,1.0,,day,,,,132081,52175.0,
