In [41]:
import nltk
import random
import numpy as np
import pandas as pd
from collections import Counter, defaultdict
from string import punctuation
from nltk.corpus import stopwords
from nltk.tokenize import WhitespaceTokenizer
from datetime import datetime
import re
import pickle

# Load Data

In [7]:
## Load the pickled DataFrame; later cells read its `target` and `tokens_no_stopwords` columns.
## NOTE(review): unpickling can execute arbitrary code, so this file must come from a trusted source.
df = pd.read_pickle('../data/interim/drugs.pkl')

# Classifier Prep - Target Variable

In [8]:
## The target column currently holds many distinct labels, but five of them
## account for more than 90% of the data. Any label outside the top 5 by count
## will therefore be reclassified as OTHER.

## Show every distinct label currently present in the target column.
list(df['target'].unique())

['ORAL',
 'OPHTHALMIC',
 'TOPICAL',
 'INTRAVENOUS',
 'RESPIRATORY (INHALATION)',
 'VAGINAL',
 'SUBLINGUAL',
 'INTRAMUSCULAR',
 'DENTAL',
 'IRRIGATION',
 'INTRATHECAL',
 'EPIDURAL',
 'SUBCUTANEOUS',
 'NASAL',
 'RECTAL',
 'CUTANEOUS',
 'INTRA-ARTICULAR',
 'TRANSDERMAL',
 'INTRAOCULAR',
 'PERCUTANEOUS',
 'INTRACARDIAC',
 'INTRAVITREAL',
 'AURICULAR (OTIC)',
 'SUBMUCOSAL',
 'BUCCAL',
 'PERINEURAL',
 'INFILTRATION',
 'INTRALESIONAL',
 'PERIODONTAL',
 'PARENTERAL',
 'INTRACAVITARY',
 'INTRAVASCULAR',
 'ENDOTRACHEAL',
 'INTRACAVERNOUS',
 'EXTRACORPOREAL',
 'INTRADERMAL',
 'INTRA-ARTERIAL',
 'SUBARACHNOID',
 'INTRAUTERINE',
 'OROPHARYNGEAL',
 'INTRATYMPANIC',
 'INTRACAMERAL',
 'HEMODIALYSIS',
 'URETHRAL',
 'INTRAPERITONEAL',
 'TRANSMUCOSAL',
 'INTRAVESICAL',
 'ENTERAL',
 'INTRABRONCHIAL',
 'INTRACANALICULAR',
 'URETERAL',
 'RETROBULBAR',
 'INTRAPLEURAL',
 'INTRASPINAL',
 'SUBGINGIVAL',
 'INTRASINAL',
 'INTRAVENTRICULAR']

In [21]:
## Row counts per target label, largest first. The five most frequent labels are
## ORAL, TOPICAL, INTRAVENOUS, DENTAL and INTRAMUSCULAR; every other label will
## be converted to OTHER.
(
    df.groupby('target')
      .count()
      .sort_values('text', ascending=False)
      .head(20)
)

Unnamed: 0_level_0,text,tokens,tokens_slash,tokens_final,tokens_no_eng_stopwords,tokens_no_stopwords
target,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ORAL,46773,46773,46773,46773,46773,46773
TOPICAL,27962,27962,27962,27962,27962,27962
INTRAVENOUS,2871,2871,2871,2871,2871,2871
DENTAL,1402,1402,1402,1402,1402,1402
INTRAMUSCULAR,1386,1386,1386,1386,1386,1386
OPHTHALMIC,1347,1347,1347,1347,1347,1347
SUBLINGUAL,798,798,798,798,798,798
NASAL,644,644,644,644,644,644
SUBCUTANEOUS,327,327,327,327,327,327
RESPIRATORY (INHALATION),326,326,326,326,326,326


In [28]:
## Collapse every label outside the five most frequent ones into a single OTHER class.
top_targets = ['ORAL', 'TOPICAL', 'INTRAVENOUS', 'DENTAL', 'INTRAMUSCULAR']
is_rare_target = ~df['target'].isin(top_targets)
df.loc[is_rare_target, 'target'] = 'OTHER'

## Confirm which labels remain after the reclassification.
list(df['target'].unique())

['ORAL', 'OTHER', 'TOPICAL', 'INTRAVENOUS', 'INTRAMUSCULAR', 'DENTAL']

# Classifier Prep - Feature Words

In [29]:
## Convert the final token lists (tokens_no_stopwords) into single space-joined
## strings, stored in a new tokens_str column.
## Series.map applies ' '.join to each cell of the one column that is needed,
## instead of the much slower row-wise DataFrame.apply(axis=1).
df['tokens_str'] = df['tokens_no_stopwords'].map(' '.join)
df.head()

Unnamed: 0,target,text,tokens,tokens_slash,tokens_final,tokens_no_eng_stopwords,tokens_no_stopwords,tokens_str
0,ORAL,"DOSAGE Adults- Take 4 or 6 Pellets by mouth, t...","[dosage, adults-, take, 4, or, 6, pellets, by,...","[dosage, adults-, take, 4, or, 6, pellets, by,...","[dosage, adults, take, or, pellets, by, mouth,...","[dosage, adults, take, pellets, mouth, three, ...","[adults, take, pellets, mouth, three, times, d...",adults take pellets mouth three times daily su...
1,ORAL,DIRECTIONS Adults: Dissolve 3 to 5 under the t...,"[directions, adults:, dissolve, 3, to, 5, unde...","[directions, adults:, dissolve, 3, to, 5, unde...","[directions, adults, dissolve, to, under, the,...","[directions, adults, dissolve, tongue, three, ...","[adults, dissolve, tongue, three, times, day, ...",adults dissolve tongue three times day directe...
2,OTHER,DOSAGE AND ADMINISTRATION The recommended dosa...,"[dosage, and, administration, the, recommended...","[dosage, and, administration, the, recommended...","[dosage, and, administration, the, recommended...","[dosage, administration, recommended, dosage, ...","[recommended, regimen, treatment, bacterial, c...",recommended regimen treatment bacterial conjun...
3,ORAL,2 DOSAGE AND ADMINISTRATION Use the lowest eff...,"[2, dosage, and, administration, use, the, low...","[2, dosage, and, administration, use, the, low...","[dosage, and, administration, use, the, lowest...","[dosage, administration, use, lowest, effectiv...","[use, lowest, effective, shortest, duration, c...",use lowest effective shortest duration consist...
4,TOPICAL,"Directions wet face, apply to hand, massage fa...","[directions, wet, face,, apply, to, hand,, mas...","[directions, wet, face,, apply, to, hand,, mas...","[directions, wet, face, apply, to, hand, massa...","[directions, wet, face, apply, hand, massage, ...","[wet, face, apply, hand, massage, face, gently...",wet face apply hand massage face gently rinse ...


In [30]:
## Pair each document's token string with its target label as [tokens, target].
## Zipping the two columns walks them by position, so this works for any index.
## A label lookup such as df['tokens_str'][i] raises KeyError (or silently picks
## a different row) whenever the index is not exactly 0..n-1.
drugs_data = [
    [tokens, target]
    for tokens, target in zip(df['tokens_str'], df['target'])
]

## Preview five randomly drawn records (random.choices draws with replacement).
random.choices(drugs_data,k=5)

  'TOPICAL'],
 ['initial prednisone tablets usp may vary mg mg prednisone per day depending specific disease entity treated situations less severity lower doses generally suffice selected patients higher initial doses may required initial maintained adjusted satisfactory response noted reasonable period time lack satisfactory clinical response prednisone discontinued patient transferred appropriate therapy emphasized requirements variable must individualized basis disease treatment response patient favorable response noted proper maintenance determined decreasing initial drug small decrements appropriate time intervals lowest maintain adequate clinical response reached kept mind constant monitoring needed regard drug included situations may make adjustments necessary changes clinical status secondary remissions exacerbations disease process patient’s individual drug responsiveness effect patient exposure stressful situations directly related disease entity treatment latter situation ma

In [31]:
## Keep as model features only the words that occur more than `word_cutoff`
## times across all documents in drugs_data.
word_cutoff = 5

## Flatten every document string into one long list of whitespace-split words.
tokens = []
for doc_text, _label in drugs_data:
    tokens.extend(doc_text.split())

word_dist = nltk.FreqDist(tokens)
feature_words = {word for word, count in word_dist.items() if count > word_cutoff}
print(f"With a word cutoff of {word_cutoff}, we have {len(feature_words)} as features in the model.")

With a word cutoff of 5, we have 15829 as features in the model.


In [32]:
def tokenize(text):
    """Split `text` on whitespace and return the resulting tokens."""
    return WhitespaceTokenizer().tokenize(text)



def drugs_features(text, fw):
    """Build the word-presence feature dictionary for one piece of text.

    Args:
        text: a continuous string of text. It is assumed to have been
            cleaned and case folded already.
        fw: the *feature words* under consideration. Only tokens of `text`
            that are members of `fw` are kept, which stops very rarely
            occurring words from becoming features.

    Returns:
        A dictionary mapping each token of `text` that appears in `fw` to
        True. A word is recorded once no matter how often it occurs.

    Example:
        With `text` = "quick quick brown fox" and
        `fw` = {'quick', 'fox', 'jumps'} the result is
        {'quick': True, 'fox': True}.
    """
    # Dict keys are unique, so repeated tokens collapse into a single entry.
    return {token: True for token in tokenize(text) if token in fw}

In [33]:
## Turn every [text, target] record into a (feature dictionary, target) pair.
featuresets = []
for text, target in drugs_data:
    featuresets.append((drugs_features(text, feature_words), target))

# Classifier - Modeling

In [34]:
## Number of shuffled examples held out for testing.
test_size = 500

## Fix the seed so the in-place shuffle gives the same order on every run.
random.seed(20220507)
random.shuffle(featuresets)

In [35]:
## Hold out the first `test_size` examples for testing and train on the rest.
test_set = featuresets[:test_size]
train_set = featuresets[test_size:]

## Fit a Naive Bayes classifier and report its accuracy on the held-out examples.
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, test_set))

0.726


In [36]:
## Print the classifier's 25 most informative features.
classifier.show_most_informative_features(25)

Most Informative Features
                  stable = True           INTRAV : TOPICA =   5770.8 : 1.0
                      iv = True           INTRAM : TOPICA =   5687.5 : 1.0
                 reapply = True           TOPICA : ORAL   =   5671.8 : 1.0
                swimming = True           TOPICA : ORAL   =   5490.1 : 1.0
                injected = True           INTRAM : TOPICA =   5391.4 : 1.0
                 diluted = True           INTRAV : TOPICA =   5356.4 : 1.0
                   aging = True           TOPICA : ORAL   =   4969.5 : 1.0
                spectrum = True           TOPICA : ORAL   =   4596.0 : 1.0
          reconstitution = True           INTRAV : TOPICA =   4484.8 : 1.0
                lactated = True           INTRAV : TOPICA =   4335.4 : 1.0
          individualized = True           INTRAM : TOPICA =   4193.3 : 1.0
                 divided = True           INTRAM : TOPICA =   4085.6 : 1.0
                     rub = True           TOPICA : ORAL   =   4044.6 : 1.0

In [37]:
## List the distinct labels currently in the target column.
list(df['target'].unique())

['ORAL', 'OTHER', 'TOPICAL', 'INTRAVENOUS', 'INTRAMUSCULAR', 'DENTAL']

In [38]:
## Confusion counts: results[actual][predicted] is the number of documents whose
## true label is `actual` and which the classifier assigned to `predicted`.
## NOTE(review): every record in drugs_data is scored here, including the
## documents the classifier was trained on, so this is not a held-out evaluation.
drug_types = list(df['target'].unique())
results = defaultdict(lambda: defaultdict(int))

## Pre-fill every (actual, predicted) pair so combinations that never occur show as 0.
for d in drug_types :
    for d1 in drug_types :
        results[d][d1] = 0

## The counts do not depend on record order, so drugs_data is read as-is. The
## earlier in-place random.shuffle only mutated shared state between runs.
for text, target in drugs_data :
    predicted = classifier.classify(drugs_features(text, feature_words))
    results[target][predicted] += 1

In [39]:
## Display the nested actual -> predicted count dictionary.
results

defaultdict(<function __main__.<lambda>()>,
            {'ORAL': defaultdict(int,
                         {'ORAL': 29469,
                          'OTHER': 372,
                          'TOPICAL': 11,
                          'INTRAVENOUS': 10593,
                          'INTRAMUSCULAR': 5763,
                          'DENTAL': 565}),
             'OTHER': defaultdict(int,
                         {'ORAL': 406,
                          'OTHER': 3339,
                          'TOPICAL': 104,
                          'INTRAVENOUS': 926,
                          'INTRAMUSCULAR': 472,
                          'DENTAL': 6}),
             'TOPICAL': defaultdict(int,
                         {'ORAL': 250,
                          'OTHER': 3092,
                          'TOPICAL': 22276,
                          'INTRAVENOUS': 1402,
                          'INTRAMUSCULAR': 802,
                          'DENTAL': 140}),
             'INTRAVENOUS': defaultdict(int,
            

# Output Model -> Pickle

In [44]:
## Write the trained model to a pickle file so it can be applied to new test cases in the Dash app.
## The `with` block closes the file handle, so the pickle is fully flushed to disk
## as soon as the cell finishes. A bare open() would leave that to garbage collection.
filename = 'classifier.pkl'
with open('../models/' + filename, 'wb') as model_file:
    pickle.dump(classifier, model_file)