In [18]:
import urllib
import urllib.parse
import xml.etree.ElementTree as ET
from importlib import reload
from itertools import chain

import matplotlib as mpl
import numpy as np
import pandas as pd
import pyodbc
import requests
import seaborn as sns
import sqlalchemy as sqla
from matplotlib import pyplot as plt
from tqdm import tqdm_notebook

In [2]:
# Read the shared ODBC connection parameters from a local file and derive one
# URL-quoted connection string per database (gmw3, AetnaDataWarehouse, hy180).
with open('dbmi-aetna-cxn-str') as f:
    cxn_params = f.read().strip()

# 'Database=...' is appended directly below, so the parameters have to end with
# the ODBC attribute separator; add it when the file leaves it out.
if cxn_params and not cxn_params.endswith(';'):
    cxn_params += ';'

# urllib.parse is a submodule: it is only guaranteed to be available when it is
# imported explicitly (a bare `import urllib` does not load it).
gmw_cxn_str = urllib.parse.quote_plus(cxn_params + 'Database=gmw3')
aetna_raw_cxn_str = urllib.parse.quote_plus(cxn_params + 'Database=AetnaDataWarehouse')
hy_cxn_str = urllib.parse.quote_plus(cxn_params + 'Database=hy180')

# Engine, open connection and inspector for the gmw3 database. autocommit is on,
# so the DDL / "select ... into" statements sent through `cxn` are committed
# without an explicit transaction.
engine = sqla.create_engine("mssql+pyodbc:///?odbc_connect=%s" % gmw_cxn_str, connect_args = {'autocommit':True})
cxn = engine.connect()
ins = sqla.inspect(engine)

In [3]:
# Second engine and connection, built from hy_cxn_str (the hy180 database).
# NOTE(review): SQLAlchemy documents `fast_executemany` as a create_engine()
# argument for mssql+pyodbc; here it is forwarded to the DBAPI connect() call
# through connect_args — confirm it actually takes effect.
hy_connect_args = {'autocommit': True, 'fast_executemany': True}
hy_engine = sqla.create_engine(
    "mssql+pyodbc:///?odbc_connect=%s" % hy_cxn_str,
    connect_args=hy_connect_args,
)
hy_cxn = hy_engine.connect()

# Helper tables for T2D 

## Visit count

In [71]:
# Visit count per patient

# Rebuild hy180.dbo.VisitCount: one row per MemberNum with the number of
# VisitMedicalClaim rows recorded for that member.
visit_count_clauses = [
    "drop table if exists hy180.dbo.VisitCount;",
    "select MemberNum, count(*) as visits",
    "into hy180.dbo.VisitCount",
    "from VisitMedicalClaim with (nolock)",
    "group by MemberNum",
]
visit_count_query = " ".join(visit_count_clauses)
cxn.execute(visit_count_query)

<sqlalchemy.engine.result.ResultProxy at 0x7fba42537ac8>

## Labs

In [72]:
# Patients with glucose/hba1c lab values

# LOINC codes of interest, grouped by test family and then combined.
glucose_tests_loinc = ('1558-6', '2339-0', '2345-7')
a1c_tests_loinc = ('4548-4', '17856-6', '4549-2', '17855-8')
lab_tests_loinc = glucose_tests_loinc + a1c_tests_loinc

# The text form of a tuple of strings is used directly as the SQL IN list.
loinc_in_list = str(lab_tests_loinc)

# Rebuild hy180.dbo.GlucoseLabs with the DistinctLab rows for those codes.
normal_labs_query = " ".join([
    "drop table if exists hy180.dbo.GlucoseLabs;",
    "select MemberNum, LoincCode, NumberOfTests as numTests, MinResultValue as minValue, MaxResultValue as maxValue",
    "into hy180.dbo.GlucoseLabs",
    "from DistinctLab with (nolock)",
    "where LoincCode in " + loinc_in_list,
])
cxn.execute(normal_labs_query)

<sqlalchemy.engine.result.ResultProxy at 0x7fba2f7567b8>

## Diagnosis

In [73]:
# Patients with diagnosis of diabetes

# Codes matched exactly (rendered once as a SQL IN list) ...
dm_dx_icd = str(('790.21', '790.22', '790.2', '790.29', '791.5', '277.7', 'V18.0', 'V77.1'))
# ... and code patterns matched with LIKE, OR-ed together.
dm_dx_icd_patterns = ('250%', '648.8%', '648.0%')
dm_dx_icd_fuzzy = ' or '.join(
    "DiagnosisCode like '" + pattern + "'" for pattern in dm_dx_icd_patterns
)

# ICD for Family hx included here
# Rebuild hy180.dbo.DxDm from the DistinctDiagnosis rows matching either set.
diabetes_dx_query = " ".join([
    "drop table if exists hy180.dbo.DxDm;",
    "select MemberNum, DiagnosisCode as diagCode, NumberOfClaims as numClaims",
    "into hy180.dbo.DxDm",
    "from DistinctDiagnosis with (nolock)",
    "where DiagnosisCode in " + dm_dx_icd,
    "or " + dm_dx_icd_fuzzy,
])

cxn.execute(diabetes_dx_query)

<sqlalchemy.engine.result.ResultProxy at 0x7fba422d42e8>

In [74]:
# Split into t1d and t2d tables
# LIKE patterns over diagCode: the T1 patterns end in 1 or 3, the T2 patterns
# end in 0 or 2 and restrict the digit after "250." to the bracketed set.
t1dm_icd_fuzzy = "diagCode like '250._1' or diagCode like '250._3'"
t2dm_icd_fuzzy = "diagCode like '250.[0,2-9]0' or diagCode like '250.[0,2-9]2'"

# Both tables are carved out of hy180.dbo.DxDm with the same statement shape;
# only the target table and the filter differ.
dx_split_template = (
    "drop table if exists hy180.dbo.{table};"
    " select MemberNum, diagCode, numClaims"
    " into hy180.dbo.{table}"
    " from hy180.dbo.DxDm with (nolock)"
    " where {condition}"
)
t1dm_query = dx_split_template.format(table="DxT1dm", condition=t1dm_icd_fuzzy)
t2dm_query = dx_split_template.format(table="DxT2dm", condition=t2dm_icd_fuzzy) + ";"

# Send both rebuilds as a single batch.
cxn.execute(t1dm_query + ";" + t2dm_query)

<sqlalchemy.engine.result.ResultProxy at 0x7fba2f732cc0>

## Meds

In [4]:
# RxNorm concept ids used further down in this notebook:
#   t1dm_rxnorm / t2dm_rxnorm are compared against RxIngredients.ingredientRxcui,
#   dm_rxnorm seeds the RxNav related-concept / NDC lookup.
t1dm_rxnorm = (
    139825, 274783, 314684, 352385, 400008, 51428, 5856, 86009, 139953,
)
t2dm_rxnorm = (
    173, 10633, 2404, 4821, 217360, 4815, 25789, 73044, 274332,
    6809, 84108, 33738, 72610, 16681, 30009, 593411, 60548,
)
dm_rxnorm = (
    126958, 412956, 412959, 637321, 668291, 668370, 686655, 692383, 748611, 880998,
    881056, 751128, 847187, 847191, 847197, 847203, 847207, 847211, 847230, 847239,
    847252, 847256, 847259, 847263, 847278, 847416, 847417, 806905, 806903, 408119,
)

### Ingredient labeling

In [23]:
# Load every ConceptMedication row (description and claim count) indexed by NDC,
# then write a CSV copy of the frame.
# NOTE(review): the CSV is presumably the input of the external ingredient
# fetching script — confirm.
all_drugs_sql = 'select NationalDrugCode,NdcDescription,NumberOfClaims from ConceptMedication'
all_drugs = pd.read_sql(all_drugs_sql, cxn, index_col='NationalDrugCode')
all_drugs.to_csv('all_drugs.csv')

In [77]:
# (Re)create the empty RxIngredients table — NDC, its RxCUI and the ingredient
# RxCUI — plus an index on ndc, through the hy180 connection.
rx_ingredient_column_defs = [
    "ndc bigint not null",
    "rxcui bigint",
    "ingredientRxcui bigint",
]
drop_rx_ingredients = "drop table if exists RxIngredients;"
create_rx_ingredients = (
    " create table RxIngredients ("
    + ",".join("  " + column_def for column_def in rx_ingredient_column_defs)
    + " );"
)
index_rx_ingredients = " create index indexByNdc on RxIngredients (ndc);"

drug_ingredients_create_table = drop_rx_ingredients + create_rx_ingredients + index_rx_ingredients
hy_cxn.execute(drug_ingredients_create_table)

<sqlalchemy.engine.result.ResultProxy at 0x7fba42c708d0>

In [78]:
# Ingredient fetching happens in an external script
INGREDIENT_COLUMNS = ['ndc', 'rxcui', 'ingredient_rxcui']
# SQL Server accepts at most 1000 row value expressions in one INSERT ... VALUES.
INSERT_BATCH_SIZE = 1000

drug_ingredients = pd.read_csv('drug_ingredients.csv', usecols=INGREDIENT_COLUMNS, dtype=np.int_)

# Render each record as a SQL row constructor "(ndc, rxcui, ingredient_rxcui)".
# - read_csv keeps the file's column order regardless of `usecols`, so the
#   columns are selected explicitly to line up with the INSERT column list.
# - tolist() converts to plain Python ints, whose text form is a bare number on
#   every numpy version (numpy scalars inside a tuple may print as np.int64(...)).
rows = [
    '(%d, %d, %d)' % tuple(record)
    for record in drug_ingredients[INGREDIENT_COLUMNS].values.tolist()
]

# Step by batch start offset: this never produces an empty batch, which would
# otherwise be sent as an invalid "values ;" statement when the row count is 0
# or an exact multiple of the batch size.
for start in tqdm_notebook(range(0, len(rows), INSERT_BATCH_SIZE)):
    batch = rows[start:start + INSERT_BATCH_SIZE]
    insert_sql = ("insert into RxIngredients (ndc, rxcui, ingredientRxcui) values %s;" % ','.join(batch))
    hy_cxn.execute(insert_sql)





### Insulin supplies

The DM med supplies listed in the eMERGE algorithm are not ingredient-level concepts but SCDs (semantic clinical drugs), so we need to explicitly find the related NDCs.

In [None]:
# Get NDCs relating to DM med supplies list
BASE_URL = "https://rxnav.nlm.nih.gov/REST"
REQUEST_TIMEOUT = 30  # seconds; without a timeout a stalled request blocks the cell forever
RELATED_TTYS = ['SCD', 'GPCK', 'SBD', 'BPCK']

# Keep every RxCUI as str: the ids parsed from the XML responses are text, and
# mixing them with int seeds would make the visited-set check miss the seeds
# (they would be crawled a second time and their NDCs collected twice).
explore_queue = [str(rxcui) for rxcui in dm_rxnorm]
all_rxcuis = set(explore_queue)
insulin_ndcs = []
seen_ndcs = set()

# Breadth-first walk: explore_queue is extended while it is being iterated, so
# newly discovered concepts are visited later in this same loop.
for rxcui in explore_queue:
    print(rxcui)
    # Get related RxCUIs
    resp = requests.get(BASE_URL + '/rxcui/%s/allrelated' % rxcui, timeout=REQUEST_TIMEOUT)
    resp.raise_for_status()  # fail on an HTTP error instead of parsing an error page
    root = ET.fromstring(resp.text)
    related_rxcui = list(chain.from_iterable(
        [n.text for n in root.findall("./allRelatedGroup/conceptGroup/[tty='%s']/conceptProperties/rxcui" % tty)]
        for tty in RELATED_TTYS
    ))
    # Queue only concepts not seen before, de-duplicated in first-seen order.
    new_rxcuis = [r for r in dict.fromkeys(related_rxcui) if r not in all_rxcuis]
    explore_queue.extend(new_rxcuis)
    all_rxcuis.update(new_rxcuis)

    # Get NDCs for RxCUI
    resp = requests.get(BASE_URL + '/rxcui/%s/allhistoricalndcs' % rxcui, timeout=REQUEST_TIMEOUT)
    resp.raise_for_status()
    root = ET.fromstring(resp.text)
    for node in root.findall("./historicalNdcConcept/historicalNdcTime/ndcTime/ndc"):
        ndc = int(node.text)
        # The same NDC can be reported for several concepts; record it once.
        if ndc not in seen_ndcs:
            seen_ndcs.add(ndc)
            insulin_ndcs.append(ndc)

# Cache the result on disk, one NDC per line.
with open('insulin_ndcs', 'w') as f:
    for ndc in insulin_ndcs:
        f.write('%d\n' % ndc)

In [42]:
# Load the NDC list from the 'insulin_ndcs' file (one integer per line).
with open('insulin_ndcs') as ndc_file:
    insulin_ndcs = list(map(int, ndc_file))

## Validation

In [79]:
# Claims coverage
# Share of all claims that belong to NDCs present in drug_ingredients.
labeled_ndcs = drug_ingredients.ndc.unique()
labeled_claims = all_drugs.loc[labeled_ndcs].NumberOfClaims.sum()
total_claims = all_drugs.NumberOfClaims.sum()
labeled_claims / total_claims

0.9846076596542334

In [80]:
# Count DM-related drugs discovered
# A row counts when its ingredient RxCUI appears in any of the three code lists,
# so one membership test against their concatenation is enough.
dm_ingredient_rxcuis = t1dm_rxnorm + t2dm_rxnorm + dm_rxnorm
is_dm_related = drug_ingredients.ingredient_rxcui.isin(dm_ingredient_rxcuis)
len(drug_ingredients[is_dm_related])

2354

## Patients on DM meds

In [66]:
# Build RxInsulin: dispensing records (gmw3.dbo.DistinctMedication) for the NDCs
# held in insulin_ndcs, joined through a temp table of matching RxIngredients rows.
if not insulin_ndcs:
    # An empty list would render as "in ()", which the server rejects with an
    # unhelpful syntax error.
    raise ValueError("insulin_ndcs is empty; fetch the NDCs or load the 'insulin_ndcs' file first")

# Render the IN list by hand: str(tuple(...)) prints a one-element tuple as
# "(n,)", which is not valid SQL.
insulin_ndc_list = "(%s)" % ", ".join(str(int(ndc)) for ndc in insulin_ndcs)

insulin_med_temp_query = ("drop table if exists #InsulinMeds;"
                          " select ingredientRxcui, ndc"
                          " into #InsulinMeds"
                          " from RxIngredients"
                          " where ndc in %s" % insulin_ndc_list)
# The drop statement is terminated with ';' like every other statement in the batch.
insulin_rx_query = ("drop table if exists RxInsulin;"
                    " select t1.MemberNum, t2.ndc, t2.ingredientRxcui, t1.NumberOfClaims, t1.FirstDispenseDate, t1.LastDispenseDate"
                    " into RxInsulin"
                    " from gmw3.dbo.DistinctMedication as t1"
                    " inner join #InsulinMeds as t2"
                    " on t1.NationalDrugCode=t2.ndc")

# Both statements go out in one batch on the same connection, which is what
# keeps the #InsulinMeds temp table visible to the second query.
hy_cxn.execute(';'.join([insulin_med_temp_query, insulin_rx_query]))

<sqlalchemy.engine.result.ResultProxy at 0x7f4665885080>

In [83]:
# Build RxT2dm: dispensing records (gmw3.dbo.DistinctMedication) for NDCs whose
# ingredient RxCUI is listed in t2dm_rxnorm, staged through a temp table.
t2dm_med_temp_query = " ".join([
    "drop table if exists #T2DmMeds;",
    "select ingredientRxcui, ndc",
    "into #T2DmMeds",
    "from RxIngredients",
    "where ingredientRxcui in " + str(t2dm_rxnorm),
])
# NOTE(review): the drop statement below has no ';' before the select; the
# recorded run of this cell succeeded, so the text is kept as it is.
t2dm_rx_query = " ".join([
    "drop table if exists RxT2dm",
    "select t1.MemberNum, t2.ndc, t2.ingredientRxcui, t1.NumberOfClaims, t1.FirstDispenseDate, t1.LastDispenseDate",
    "into RxT2dm",
    "from gmw3.dbo.DistinctMedication as t1",
    "inner join #T2DmMeds as t2",
    "on t1.NationalDrugCode=t2.ndc",
])

# One batch, so the temp table and the join run on the same connection.
hy_cxn.execute(t2dm_med_temp_query + ";" + t2dm_rx_query)

<sqlalchemy.engine.result.ResultProxy at 0x7fba2f756c18>

In [82]:
# Build RxT1Dm: dispensing records (gmw3.dbo.DistinctMedication) for NDCs whose
# ingredient RxCUI is listed in t1dm_rxnorm, staged through a temp table.
t1dm_ingredient_list = str(t1dm_rxnorm)
t1dm_med_temp_query = (
    "drop table if exists #T1DmMeds;"
    " select ingredientRxcui, ndc"
    " into #T1DmMeds"
    " from RxIngredients"
    " where ingredientRxcui in "
) + t1dm_ingredient_list

# NOTE(review): the drop statement below has no ';' before the select; the
# recorded run of this cell succeeded, so the text is kept as it is.
t1dm_rx_select_columns = "t1.MemberNum, t2.ndc, t2.ingredientRxcui, t1.NumberOfClaims, t1.FirstDispenseDate, t1.LastDispenseDate"
t1dm_rx_query = (
    "drop table if exists RxT1Dm"
    " select " + t1dm_rx_select_columns +
    " into RxT1Dm"
    " from gmw3.dbo.DistinctMedication as t1"
    " inner join #T1DmMeds as t2"
    " on t1.NationalDrugCode=t2.ndc"
)

# One batch, so the temp table and the join run on the same connection.
t1dm_batch = [t1dm_med_temp_query, t1dm_rx_query]
hy_cxn.execute(';'.join(t1dm_batch))

<sqlalchemy.engine.result.ResultProxy at 0x7fba2f756d68>

In [67]:
# List the tables currently visible through the hy180 engine.
hy_inspector = sqla.inspect(hy_engine)
hy_inspector.get_table_names()

['DxDm',
 'DxT1dm',
 'DxT2dm',
 'GlucoseLabs',
 'RxDm',
 'RxIngredients',
 'RxInsulin',
 'RxT1Dm',
 'RxT2dm',
 'VisitCount']

In [68]:
# Build RxDm as the union of the three per-category dispensing tables.
# - `union` (not `union all`) drops rows that are identical across the inputs.
# - The T1 table is referenced with the exact spelling it is created with
#   (RxT1Dm), so the query also resolves on a case-sensitive collation.
# - `select *` relies on the three tables sharing one column layout; they are
#   all created from the same select list.
dm_rx_query = ("drop table if exists RxDm;"
              " select *"
              " into RxDm"
              " from (select * from RxT2dm union select * from RxInsulin union select * from RxT1Dm) as t1")
hy_cxn.execute(dm_rx_query)

<sqlalchemy.engine.result.ResultProxy at 0x7f46658fcef0>