In [1]:
import pandas as pd
import json

# Load the ePillID label table and the NDC product listing.
all_labels = pd.read_csv("./datasets/ePillID_data/all_labels.csv")
# json.load parses straight from the handle; the with-block closes the file
# on exit, so no explicit close() is needed afterwards.
with open('./drug/ndc/drug-ndc-0001-of-0001.json', 'r') as file:
    data = json.load(file)
df = pd.DataFrame(data['results'])

# Filter out anything that isn't a tablet or capsule.
# na=False makes rows with a missing dosage_form count as "no match".
pills = df[df['dosage_form'].str.contains('TABLET|CAPSULE', na=False)]

# Filter out anything that isn't for humans.
# na=False keeps the mask strictly boolean; a NaN in the mask would raise.
pills = pills[pills['product_type'].str.contains('HUMAN', na=False)]

#pillbox retired on date Jan. 29 2021 as per https://www.nlm.nih.gov/pubs/techbull/ja20/ja20_pillbox_discontinue.html
pillbox_retirement = 20210129

# Split "labeler-product" codes into two integer columns (int() drops any
# leading zeros) and carry the marketing start date along.
x = (
    pills["product_ndc"]
    .str.split("-", expand=True)
    .astype(int)
    .rename(columns={0: "label_code_id", 1: "prod_code_id"})
    .join(pills["marketing_start_date"])
)

# Outer merge with an indicator column: "left_only" rows exist only in the
# product listing, "right_only" only in all_labels, "both" in each.
merged = pd.merge(
    x,
    all_labels[["label_code_id", "prod_code_id"]],
    on=["label_code_id", "prod_code_id"],
    how="outer",
    indicator=True,
)
ndc_not_in_dataset = merged[merged['_merge'] == "left_only"].drop_duplicates()
print(ndc_not_in_dataset.shape)


(45313, 4)


In [None]:
from download import downloadZip

# Fetch the excluded-products archive; both arguments are passed positionally
# to downloadZip exactly as before.
excluded_zip_url = "https://www.accessdata.fda.gov/cder/ndc_excluded.zip"
excluded_target_dir = "./excluded_drugs/"
downloadZip(excluded_zip_url, excluded_target_dir)

In [None]:
# Read the excluded-products table from the directory the previous cell
# downloads into, instead of a machine-specific absolute path.
# NOTE(review): assumes the notebook runs from the project root, like the
# other relative paths in this notebook — confirm.
excluded = pd.read_table('./excluded_drugs/Products_excluded.xls', encoding='Windows-1252')
excluded_ndcs = excluded["PRODUCTNDC"]

# Keep only codes of the form "<digits>-<digits>". The pattern is a raw string
# (no invalid "\d" escape) without capture groups (str.contains warns when a
# pattern has groups); na=False keeps the mask boolean for missing codes.
excluded_ndcs = (
    excluded_ndcs[excluded_ndcs.str.contains(r'^\d+-\d+$', regex=True, na=False)]
    .str.split("-", expand=True)
    .astype(int)
    .rename(columns={0: "label_code_id", 1: "prod_code_id"})
)

# Code pairs found only in all_labels ("right_only" in `merged`), compared
# against the excluded list with another indicator merge.
ex = pd.merge(
    merged[merged['_merge'] == "right_only"].drop_duplicates().get(["label_code_id", "prod_code_id"]),
    excluded_ndcs,
    on=["label_code_id", "prod_code_id"],
    how="outer",
    indicator=True,
)
# Rows that are in neither the product listing nor the excluded list.
ex[ex["_merge"] == "left_only"]

In [None]:
# Distinct (label_code_id, prod_code_id) pairs in the label table.
# NOTE: a cell only renders its final expression, so this first result is
# computed and then discarded.
all_labels.get(["label_code_id", "prod_code_id"]).drop_duplicates(keep="first")
# Distinct values of the "label" column (this is what the cell displays).
all_labels.loc[:, "label"].drop_duplicates(keep="first")

In [None]:
# Unique rows present only on the left side of the merge (the product listing).
merged.loc[merged["_merge"].eq("left_only")].drop_duplicates()

In [None]:
# Unique rows present only on the right side of the merge (all_labels).
merged.loc[merged["_merge"].eq("right_only")].drop_duplicates()

In [None]:
# Unique rows matched on both sides of the merge.
merged.loc[merged["_merge"].eq("both")].drop_duplicates()

In [2]:
import pandas as pd
import pathlib
import re

# Project-relative path instead of a machine-specific absolute one, matching
# how every other input in this notebook is located.
# NOTE(review): assumes the notebook runs from the project root — confirm.
properties = pd.read_json('./parsedProperties.json')

all_labels = pd.read_csv("./datasets/ePillID_data/all_labels.csv")


def checkNDCMatch(ndc1,ndc2):
    """Return True when two dash-separated codes are numerically equal.

    Each code is split on '-' and every segment is converted with int(), so
    leading zeros are ignored. Codes with a different number of segments
    never match.
    """
    first = [int(segment) for segment in ndc1.split('-')]
    second = [int(segment) for segment in ndc2.split('-')]
    return first == second

# Text before the first '_' of every pilltype_id.
# NOTE: `ndcs` is assigned again further down in this same cell.
ndcs = all_labels["pilltype_id"].map(lambda pilltype_id: pilltype_id.partition('_')[0])


def zeroPadFront(str, desiredLength):
    """Left-pad `str` with '0' characters until it is `desiredLength` long.

    Strings that are already `desiredLength` or longer are returned unchanged.
    Uses the built-in right-justify instead of prepending one character per
    loop iteration.
    """
    return str.rjust(desiredLength, '0')

def removePadding(str, desiredLength):
    """Strip leading '0' characters so that `str` is `desiredLength` long.

    Returns `str` unchanged when it is already `desiredLength` or shorter.
    When reaching `desiredLength` would require removing anything other than a
    leading '0', prints an error message and returns None. That includes a
    `desiredLength` the string cannot reach at all (e.g. a negative one), which
    previously raised IndexError once the string had been emptied.
    """
    excess = len(str) - desiredLength
    if excess <= 0:
        return str
    # Every character that has to go must be a zero; a prefix shorter than
    # `excess` means the requested length is unreachable.
    if str[:excess] != '0' * excess:
        print('ERROR: desiredLength is too short')
        return None
    return str[excess:]
    
class NDC:
    """A dash-separated NDC code held as normalised string segments.

    The constructor splits the code on '-' and round-trips each segment
    through int(), so leading zeros are dropped ('0093' becomes '93').

    Attributes:
        labeler: first segment.
        productCode: second segment.
        packageCode: third segment, or None when the code has only two.
    """

    def __init__(self, ndc):
        """Parse `ndc`, a string with two or three dash-separated numeric parts."""
        ndc_parts = ndc.split('-')

        self.labeler = str(int(ndc_parts[0]))
        self.productCode = str(int(ndc_parts[1]))
        # The package segment is optional.
        if len(ndc_parts) < 3:
            self.packageCode = None
        else:
            self.packageCode = str(int(ndc_parts[2]))

    def getFormat(self, asString=False):
        """Return the character length of each stored segment.

        Returns a tuple of ints, or a dash-joined string such as '5-3-2' when
        `asString` is true.
        """
        lengths = tuple(len(segment) for segment in self.getNDCSegments())
        if asString:
            return '-'.join(map(str, lengths))
        return lengths

    def getNDCSegments(self):
        """Return the segments as a new list, omitting a missing package code."""
        if self.packageCode:
            return [self.labeler, self.productCode, self.packageCode]
        return [self.labeler, self.productCode]

    def getNDCString(self, format=None):
        """Return the code as a dash-joined string, optionally zero-padded.

        Args:
            format: optional segment widths, given as a dash-joined string
                ('5-4-2') or a sequence of ints. Segments shorter than their
                width are left-padded with '0'; segments already wide enough
                are left alone. Widths beyond the number of segments are
                ignored, and segments without a width are not padded.
        """
        ndc_segments = self.getNDCSegments()
        if format:
            if isinstance(format, str):
                widths = [int(width) for width in format.split('-')]
            else:
                # Work on a copy so a caller-supplied list is never mutated.
                widths = list(format)

            # Slicing to the segment count ignores any number of surplus widths.
            for idx, width in enumerate(widths[:len(ndc_segments)]):
                ndc_segments[idx] = ndc_segments[idx].rjust(width, '0')
        return '-'.join(ndc_segments)
    
    

# One NDC object per row of all_labels. Rows whose image_path starts with the
# 'fcn_mix_weight' directory take the code from pilltype_id; all other rows
# take it from the images value. In both cases the code is the text before the
# first '_'.
ndcs = [
    NDC((pilltype_id if image_path.split('/')[0] == 'fcn_mix_weight' else images).split('_')[0])
    for pilltype_id, image_path, images in zip(
        all_labels['pilltype_id'], all_labels['image_path'], all_labels['images']
    )
]


In [58]:
# Add an 'ndc' column holding the normalised code string for every row. The
# source field is pilltype_id for rows under 'fcn_mix_weight' and images
# otherwise; the code is the text before the first '_'.
all_labels['ndc'] = pd.Series([
    NDC(source_field.split('_')[0]).getNDCString()
    for source_field in (
        pilltype_id if image_path.split('/')[0] == 'fcn_mix_weight' else images
        for pilltype_id, image_path, images in zip(
            all_labels['pilltype_id'], all_labels['image_path'], all_labels['images']
        )
    )
])

In [3]:
# Normalised code string for every NDC object, in the same order as `ndcs`.
ndc_strings = pd.Series([ndc.getNDCString() for ndc in ndcs])

# Re-key the properties table with codes normalised the same way.
properties.index = properties.index.map(lambda raw_ndc: NDC(raw_ndc).getNDCString())

In [4]:
import requests as r
from bs4 import BeautifulSoup
import json

def makeCall(path, base='https://rxnav.nlm.nih.gov/REST/', query='', timeout=30):
    """GET `base + path + query` and return the response body parsed as JSON.

    Args:
        path: endpoint path appended to `base`.
        base: URL prefix for the request.
        query: raw query string (including the leading '?') appended verbatim.
        timeout: seconds to wait for the server before requests gives up.
            Without a timeout a stalled connection blocks indefinitely.

    Returns:
        The decoded JSON payload.
    """
    response = r.get(base + path + query, timeout=timeout)
    return json.loads(response.content)

def getNDCProps(codes):
    """Map each code in `codes` to the result of an 'ndcproperties.json' call.

    `codes` must provide a `.map` method (e.g. a pandas Series of strings);
    each code is sent as the query string '?id=<code>'.
    """
    def fetchProperties(code):
        return makeCall('ndcproperties.json', query='?id=' + code)

    return codes.map(fetchProperties)

# Normalised code string for every NDC object, in the same order as `ndcs`.
ndc_strings = pd.Series([ndc.getNDCString() for ndc in ndcs])

# Commented-out fetch: requests properties for every code and writes the
# result to epillid_props.json.
# props = getNDCProps(ndc_strings)
# props.index = ndc_strings
# props.to_json('epillid_props.json')

In [5]:
import json
import pandas as pd

# json.load parses straight from the handle; the with-block closes the file
# on exit, so no explicit close() is needed afterwards.
with open('epillid_props.json', 'r') as file:
    data = json.load(file)
props = pd.Series(data)

# For each stored response keep the property-concept list of its first
# ndcProperty entry; empty responses (len == 0) become None.
props = pd.Series([
    x['ndcPropertyList']['ndcProperty'][0]['propertyConceptList']['propertyConcept'] if len(x) != 0 else None
    for x in props
])


def makePropTuple(x):
    """Turn one property-concept dict into a (propName, propValue) pair."""
    return (x['propName'], x['propValue'])


# One row per response, one column per property name; missing (None) entries
# are skipped by na_action='ignore'.
props_parsed = pd.json_normalize(props.map(lambda x: dict(map(makePropTuple, x)), na_action='ignore'))
# NOTE(review): assumes props rows line up one-to-one with ndc_strings — confirm.
props_parsed['ndc'] = ndc_strings
props_parsed.to_json('epillid_props_parsed.json')


In [6]:
# Count how often each distinct combination of the selected columns occurs.
props_parsed.get(
    ['COLOR', 'COLORTEXT', 'SHAPE', 'SHAPETEXT', 'SIZE', 'IMPRINT_CODE', 'ndc']
).value_counts()

COLOR   COLORTEXT                             SHAPE   SHAPETEXT                          SIZE   IMPRINT_CODE  ndc         
C48332  BROWN                                 C48348  circular                           12 mm  S;712         62756-712-86    7
C48325  WHITE(off-white to slightly greyish)  C48348  biconvex                           6 mm   2858          59762-2858-1    7
C48333  BLUE                                  C48336  Modified Capsule Shaped, Biconvex  11 mm  F;12          65862-156-30    7
C48332  BROWN(beige)                          C48348  barrel shape                       9 mm   W;936         64679-936-3     7
C48325  white                                 C48345  elliptical                         15 mm  RX829         63304-829-90    7
                                                                                                                             ..
        white(White to off white)             C48348  round, biconvex                    13 mm  121          

In [87]:
# Lookup tables: each distinct COLOR / SHAPE / SIZE value maps to a
# one-column frame of the 'ndc' values in that group.
color_groups = {
    color: rows.get(['ndc']) for color, rows in props_parsed.groupby('COLOR')
}

shape_groups = {
    shape: rows.get(['ndc']) for shape, rows in props_parsed.groupby('SHAPE')
}

size_groups = {
    size: rows.get(['ndc']) for size, rows in props_parsed.groupby('SIZE')
}

# Each distinct 'ndc' value maps to a one-column frame of its image paths.
ndc_image_paths = {
    ndc: rows.get(['image_path']) for ndc, rows in all_labels.groupby('ndc')
}
