# The goal of this notebook is to prepare a lexicon of drug terms (and possibly phrases) for use as features in a model

In [68]:
import sys
import os

import pandas as pd

In [69]:
from IPython.display import display

In [70]:
import nltk
from nltk.corpus import stopwords

In [71]:
time nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\slick\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Wall time: 1.02 ms


True

In [72]:
# NLTK's English stop-word list, held as a set so later `term in stopword_set`
# membership checks are cheap.
english_stopwords = stopwords.words('english')
stopword_set = set(english_stopwords)

In [73]:
# Directory holding the MedEx resources, and the two drug-name files read from it.
MEDEX_DIR = r'resources/medex'
MEDEX_BRAND_FILE = 'brand_generic.cfg'
MEDEX_GENERIC_FILE = 'rxcui_generic.cfg'

brand_file_path = os.path.join(MEDEX_DIR, MEDEX_BRAND_FILE)
generic_file_path = os.path.join(MEDEX_DIR, MEDEX_GENERIC_FILE)

# Brand file first, then the generic file; the list is iterated in this order later.
drug_file_paths = [brand_file_path, generic_file_path]
for path_to_load in drug_file_paths:
    print('Preparing to load drug terms from : {}'.format(path_to_load))

Preparing to load drug terms from : resources/medex\brand_generic.cfg
Preparing to load drug terms from : resources/medex\rxcui_generic.cfg


In [74]:
# Curated set of "single term" words which are not directly applicable to medications.
# The file is read as one stop word per line.
drug_stopword_path = os.path.join(MEDEX_DIR, 'drug_stop_words.txt')
with open(drug_stopword_path, 'r') as f:
    # - blank / whitespace-only lines are skipped so an empty string never
    #   ends up in the set (it would inflate the total printed below)
    # - entries are lower-cased because the drug terms they are compared
    #   against are lower-cased when the drug files are parsed
    drug_stopword_set = {line.strip().lower() for line in f if line.strip()}

print(sorted(drug_stopword_set))
print('Total drug stop words : {}'.format(len(drug_stopword_set)))

['ache', 'acne', 'act', 'active', 'acute', 'air', 'alert', 'allergies', 'avocado', 'back', 'bag', 'bandage', 'beef', 'beer', 'black', 'blade', 'bleeding', 'blood', 'bone', 'bran', 'brand', 'breath', 'breathe', 'broccoli', 'brown', 'cabbage', 'calcium', 'cane', 'cantaloupe', 'care', 'catheter', 'celery', 'cell', 'cheese', 'chest', 'cholesterol', 'clamp', 'clams', 'clear', 'cold', 'colon', 'control', 'copd', 'cope', 'cough', 'count', 'daily', 'dates', 'deep', 'deliver', 'disease', 'drain', 'dressing', 'duration', 'dye', 'fat', 'fevers', 'fiber', 'filter', 'flu', 'food', 'form', 'free', 'gauze', 'glove', 'gloves', 'gown', 'gum', 'headache', 'heat', 'high-dose', 'honeydew', 'hose', 'impact', 'information', 'inhaler', 'insect', 'insects', 'leg', 'level', 'light', 'liner', 'little', 'liver', 'long', 'lung', 'mask', 'melon', 'milk', 'multiple', 'nature', 'nausea', 'needle', 'new', 'night', 'oil', 'one', 'onion', 'orange', 'pad', 'patient', 'patients', 'peanut', 'performance', 'pink', 'plan', 

In [75]:
%%time

phrase_dict = {}
term_dict = {}
for drug_file_path in drug_file_paths:
    with open(drug_file_path, 'r') as f:
        for line in f:
            line = line.lower()
            tab_parts = line.split('\t')
            for tab_part in tab_parts:
                phrase_terms = tab_part.split()
                
                # one of these files has only one integer (identifier) in it, so let's skip those
                if len(phrase_terms) == 1 and phrase_terms[0].isdigit():
                    continue
                
                for token_idx in range(len(phrase_terms)):
                    token = phrase_terms[token_idx]
                    # remove the () parens
                    if token.startswith('(') or token.endswith(')'):
                        phrase_terms[token_idx] = ''
                    elif token.startswith('[') or token.endswith(']'):
                        phrase_terms[token_idx] = ''

                # now keep only phrases with non-empty tokens
                phrase_terms = [x for x in phrase_terms if len(x) > 0]
                if len(phrase_terms) <= 0:
                    continue

                phrase_str = ' '.join(phrase_terms)
                phrase_dict.setdefault(phrase_str, 0)
                phrase_dict[phrase_str] += 1
                
                if len(phrase_terms) == 1:
                    term_dict.setdefault(phrase_terms[0], 0)
                    term_dict[phrase_terms[0]] += 1
                
print('DONE reading phrases and terms')

DONE reading phrases and terms
Wall time: 4.63 s


In [76]:
# One record per distinct phrase: its text, how many times it was seen,
# and its length in characters.
phrase_df_dicts = [
    {'Phrase' : phrase_text, 'Count' : phrase_count, 'Length' : len(phrase_text)}
    for phrase_text, phrase_count in phrase_dict.items()
]
phrase_df = pd.DataFrame(phrase_df_dicts)

print('Phrase DF length : {}'.format(len(phrase_df)))

Phrase DF length : 52617


In [77]:
# One record per single-token term, carrying the flags used by the filtering cell.
term_df_dicts = [
    {
        'Term' : term_text,
        'Count' : term_count,
        'Length' : len(term_text),
        # True when at least one character is a letter
        'AnyAlpha' : any(ch.isalpha() for ch in term_text),
        'Stopword' : term_text in stopword_set,
        'DrugStopword' : term_text in drug_stopword_set,
        'SingleTerm' : (' ' not in term_text),
    }
    for term_text, term_count in term_dict.items()
]
term_df = pd.DataFrame(term_df_dicts)

print('Term DF length : {}'.format(len(term_df)))

Term DF length : 17759


In [78]:
# Preview only the first rows; the full frame has one row per distinct phrase
# and is too large to be useful as cell output.
display(phrase_df.head(10))

Unnamed: 0,Count,Length,Phrase
0,2360,15,diphenhydramine
1,17,8,aldex ct
2,353,7,leucine
3,120,27,"aminosyn 3.5%, sulfite-free"
4,53,15,mechlorethamine
5,13,51,merck frosst brand of mechlorethamine hydrochl...
6,842,59,"2-pyridinepropanamine, gamma-(4-chlorophenyl)-..."
7,45,8,z-cof hc
8,87,18,interferon beta-1a
9,5,5,rebif


In [79]:
# Shortest term (in characters) that is kept in the lexicon.
MIN_DRUG_TERM_LENGTH = 3

print('Total terms before filtering : {}'.format(len(term_df)))

# The flag columns hold booleans, so they are combined directly rather than
# compared against True/False. A term is kept when it is not an English stop word,
# not a curated drug stop word, contains at least one letter, is a single token,
# and is long enough.
keep_mask = (
    ~term_df['Stopword']
    & ~term_df['DrugStopword']
    & term_df['AnyAlpha']
    & term_df['SingleTerm']
    & (term_df['Length'] >= MIN_DRUG_TERM_LENGTH)
)
term_filtered_df = term_df[keep_mask]

print('Total terms after filtering : {}'.format(len(term_filtered_df)))

# Most frequent terms first; preview only the top rows instead of the whole frame.
term_filtered_df = term_filtered_df.sort_values('Count', ascending = False)
display(term_filtered_df.head(10))

Total terms before filtering : 17759
Total terms after filtering : 17643


Unnamed: 0,AnyAlpha,Count,DrugStopword,Length,SingleTerm,Stopword,Term
13,True,9647,False,11,True,False,guaifenesin
65,True,8485,False,16,True,False,chlorpheniramine
39,True,7159,False,15,True,False,pseudoephedrine
36,True,6783,False,16,True,False,dextromethorphan
41,True,6754,False,11,True,False,paracetamol
107,True,6727,False,13,True,False,acetaminophen
32,True,6580,False,13,True,False,phenylephrine
201,True,3070,False,7,True,False,menthol
182,True,2752,False,14,True,False,hydrocortisone
229,True,2737,False,9,True,False,lidocaine


In [80]:
# collect the surviving terms, alphabetically ordered, ready to be written to a file
drug_filtered_terms = sorted(term_filtered_df['Term'].tolist())

In [81]:
# Write the filtered lexicon next to the other MedEx resources, one term per line.
drug_term_file = 'medex_filtered_terms.txt'
drug_term_path = os.path.join(MEDEX_DIR, drug_term_file)
print('Writing drug terms to : {}'.format(drug_term_path))
with open(drug_term_path, 'w') as f:
    f.writelines(drug_term + '\n' for drug_term in drug_filtered_terms)

Writing drug terms to : resources/medex\medex_filtered_terms.txt
