In [1]:
import nltk
import random
import numpy as np
import pandas as pd
from collections import Counter, defaultdict
from string import punctuation
from nltk.corpus import stopwords
from nltk.tokenize import WhitespaceTokenizer
from datetime import datetime
import re
import pickle
from cleaning import prepare, remove_punctuation, remove_numbers, tokenize, remove_stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\aabel\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Load Data

In [2]:
# Load the interim drugs dataset; later cells read its 'text' and 'target' columns.
# NOTE(review): unpickling can execute arbitrary code - only load files produced by
# this project's own pipeline.
df = pd.read_pickle('../data/interim/drugs.pkl')

# Process Text

In [3]:
# Cleaning steps handed to `prepare` (imported from the project's `cleaning` module).
# Presumably they are applied to each document in the order listed - confirm in cleaning.py.
pipeline = [
    str.lower,
    remove_punctuation,
    remove_numbers,
    tokenize,
    remove_stopwords,
]

# Run every raw label text through the pipeline and keep the result per row.
df['tokens'] = df['text'].apply(lambda raw_text: prepare(raw_text, pipeline=pipeline))

# Classifier Prep - Target Variable

In [4]:
## The target column currently carries many distinct labels, but five of them
## account for more than 90% of the data. Every label outside the top five
## (by count) will therefore be reclassified as OTHER.

df['target'].unique().tolist()

['ORAL',
 'OPHTHALMIC',
 'TOPICAL',
 'INTRAVENOUS',
 'RESPIRATORY (INHALATION)',
 'VAGINAL',
 'SUBLINGUAL',
 'INTRAMUSCULAR',
 'DENTAL',
 'IRRIGATION',
 'INTRATHECAL',
 'EPIDURAL',
 'SUBCUTANEOUS',
 'NASAL',
 'RECTAL',
 'CUTANEOUS',
 'INTRA-ARTICULAR',
 'TRANSDERMAL',
 'INTRAOCULAR',
 'PERCUTANEOUS',
 'INTRACARDIAC',
 'INTRAVITREAL',
 'AURICULAR (OTIC)',
 'SUBMUCOSAL',
 'BUCCAL',
 'PERINEURAL',
 'INFILTRATION',
 'INTRALESIONAL',
 'PERIODONTAL',
 'PARENTERAL',
 'INTRACAVITARY',
 'INTRAVASCULAR',
 'ENDOTRACHEAL',
 'INTRACAVERNOUS',
 'EXTRACORPOREAL',
 'INTRADERMAL',
 'INTRA-ARTERIAL',
 'SUBARACHNOID',
 'INTRAUTERINE',
 'OROPHARYNGEAL',
 'INTRATYMPANIC',
 'INTRACAMERAL',
 'HEMODIALYSIS',
 'URETHRAL',
 'INTRAPERITONEAL',
 'TRANSMUCOSAL',
 'INTRAVESICAL',
 'ENTERAL',
 'INTRABRONCHIAL',
 'INTRACANALICULAR',
 'URETERAL',
 'RETROBULBAR',
 'INTRAPLEURAL',
 'INTRASPINAL',
 'SUBGINGIVAL',
 'INTRASINAL',
 'INTRAVENTRICULAR']

In [5]:
## Row counts per target label, largest first. The five most frequent labels are
## ORAL, TOPICAL, INTRAVENOUS, DENTAL and INTRAMUSCULAR; all others become OTHER.
(
    df.groupby('target')
      .count()
      .sort_values(by='text', ascending=False)
      .head(20)
)

Unnamed: 0_level_0,text,tokens
target,Unnamed: 1_level_1,Unnamed: 2_level_1
ORAL,46715,46715
TOPICAL,27972,27972
INTRAVENOUS,2868,2868
DENTAL,1403,1403
INTRAMUSCULAR,1384,1384
OPHTHALMIC,1344,1344
SUBLINGUAL,798,798
NASAL,644,644
SUBCUTANEOUS,326,326
RESPIRATORY (INHALATION),326,326


In [6]:
# Labels that keep their own class; every other label is folded into OTHER.
kept_targets = ['ORAL', 'TOPICAL', 'INTRAVENOUS', 'DENTAL', 'INTRAMUSCULAR']
outside_kept = ~df['target'].isin(kept_targets)
df.loc[outside_kept, 'target'] = 'OTHER'

# Confirm that only the five kept labels plus OTHER remain.
df['target'].unique().tolist()

['ORAL', 'OTHER', 'TOPICAL', 'INTRAVENOUS', 'INTRAMUSCULAR', 'DENTAL']

# Classifier Prep - Feature Words

In [7]:
## Convert each row's token list back into a single space-separated string.
## The next cell pairs this string with the row's target label.
## Series.map on the one column we need replaces a row-wise DataFrame.apply(axis=1),
## which materialises a full row object per record just to read 'tokens'.

df['tokens_str'] = df['tokens'].map(' '.join)
df.head()

Unnamed: 0,target,text,tokens,tokens_str
0,ORAL,"DOSAGE Adults- Take 4 or 6 Pellets by mouth, t...","[adults, take, pellets, mouth, three, times, d...",adults take pellets mouth three times daily su...
1,ORAL,DIRECTIONS Adults: Dissolve 3 to 5 under the t...,"[adults, dissolve, tongue, three, times, day, ...",adults dissolve tongue three times day directe...
2,OTHER,DOSAGE AND ADMINISTRATION The recommended dosa...,"[recommended, regimen, treatment, bacterial, c...",recommended regimen treatment bacterial conjun...
3,ORAL,2 DOSAGE AND ADMINISTRATION Use the lowest eff...,"[use, lowest, effective, shortest, duration, c...",use lowest effective shortest duration consist...
4,TOPICAL,"Directions wet face, apply to hand, massage fa...","[wet, face, apply, hand, massage, face, gently...",wet face apply hand massage face gently rinse ...


In [8]:
# Build one [tokens_str, target] pair per row.
# The two columns are walked positionally with zip: the previous df['col'][i]
# lookups were label-based, so they only worked while the index was exactly
# 0..n-1 and would raise KeyError (or misalign rows) after any filter/reindex.
drugs_data = [
    [token_text, label]
    for token_text, label in zip(df['tokens_str'], df['target'])
]

# Eyeball five randomly drawn examples (random.choices draws with replacement).
random.choices(drugs_data,k=5)

  'ORAL'],
 ['parenteral intramuscular im intravenous iv subcutaneous sc routes may used disease exceptionally severe gastrointestinal absorption may known impaired usual therapeutic — adults children regardless age mg daily resistant cases may require larger doses maintenance level clinical symptoms subsided blood picture become normal maintenance level used ie mg infants mg children four years age mg adults children four years age mg pregnant lactating women per day never less mg per day patient kept close supervision adjustment maintenance level made relapse appears imminent presence alcoholism hemolytic anemia anticonvulsant therapy chronic infection maintenance level may need increased parenteral drug products inspected visually particulate matter discoloration prior whenever solution container permit',
  'INTRAMUSCULAR'],
 ['exceed recommended age dose adults children years age older teaspoonful ml every hours exceed teaspoonfuls ml hour period directed doctor children years age 

In [9]:
# A word must occur strictly more than `word_cutoff` times across the corpus
# to be kept as a model feature.
word_cutoff = 5

# Flatten every document's token string into one corpus-wide token list.
tokens = []
for doc_text, _label in drugs_data:
    tokens.extend(doc_text.split())

word_dist = nltk.FreqDist(tokens)

feature_words = {word for word, count in word_dist.items() if count > word_cutoff}
print(f"With a word cutoff of {word_cutoff}, we have {len(feature_words)} as features in the model.")

With a word cutoff of 5, we have 15823 as features in the model.


In [10]:
def tokenize(text):
    """Return the whitespace-separated tokens of `text` as a list.

    Delegates to NLTK's WhitespaceTokenizer.
    """
    # NOTE(review): this definition shadows the `tokenize` imported from
    # `cleaning` in the first cell for every cell executed after this one.
    return WhitespaceTokenizer().tokenize(text)



def drugs_features(text, fw):
    """Build the word-presence feature dictionary for one document.

    Args:
        text: a document as one continuous string, assumed to be already
            cleaned and case folded. It is split into tokens by `tokenize`.
        fw: the *feature words* under consideration. A token of `text` is only
            kept when it is contained in `fw`, which keeps very rare words
            out of the model.

    Returns:
        A dict mapping every token of `text` that appears in `fw` to True.
        Repeated tokens collapse into a single key: for the text
        "quick quick brown fox" and fw = {'quick', 'fox', 'jumps'} the result is
        {'quick': True, 'fox': True}.
    """
    return {token: True for token in tokenize(text) if token in fw}

In [11]:
# One (feature dict, target label) tuple per document, in drugs_data order.
featuresets = []
for text, target in drugs_data:
    featuresets.append((drugs_features(text, feature_words), target))

# Classifier - Modeling

In [12]:
# Number of shuffled examples held out for the accuracy check.
test_size = 500

# Fix the seed, then shuffle in place.
# Caution: re-running this cell reshuffles the already-shuffled list, so the
# split only reproduces after a fresh Restart & Run All.
random.seed(20220507)
random.shuffle(featuresets)

In [13]:
# The first `test_size` shuffled examples are held out; the remainder trains the model.
test_set = featuresets[:test_size]
train_set = featuresets[test_size:]

classifier = nltk.NaiveBayesClassifier.train(train_set)

# Fraction of held-out examples whose label the classifier gets right.
holdout_accuracy = nltk.classify.accuracy(classifier, test_set)
print(holdout_accuracy)

0.69


In [14]:
# Print the 25 features NLTK ranks as most informative for the trained classifier.
classifier.show_most_informative_features(25)

Most Informative Features
                  stable = True           INTRAV : TOPICA =   5801.7 : 1.0
                      iv = True           INTRAM : TOPICA =   5709.5 : 1.0
                 reapply = True           TOPICA : ORAL   =   5660.6 : 1.0
                swimming = True           TOPICA : ORAL   =   5473.7 : 1.0
                injected = True           INTRAM : TOPICA =   5427.0 : 1.0
                 diluted = True           INTRAV : TOPICA =   5359.2 : 1.0
                   aging = True           TOPICA : ORAL   =   4955.3 : 1.0
                spectrum = True           TOPICA : ORAL   =   4588.2 : 1.0
          reconstitution = True           INTRAV : TOPICA =   4500.2 : 1.0
                lactated = True           INTRAV : TOPICA =   4363.5 : 1.0
          individualized = True           INTRAM : TOPICA =   4189.6 : 1.0
                 divided = True           INTRAM : TOPICA =   4068.6 : 1.0
                     rub = True           TOPICA : ORAL   =   4055.8 : 1.0

In [15]:
# Labels present in the target column after collapsing rare ones into OTHER.
df['target'].unique().tolist()

['ORAL', 'OTHER', 'TOPICAL', 'INTRAVENOUS', 'INTRAMUSCULAR', 'DENTAL']

In [16]:
# Confusion counts by actual drug category vs. the classifier's prediction:
# results[actual][predicted] -> number of documents.
# NOTE: drugs_data is the full corpus, so these counts include the documents the
# classifier was trained on; they are not a held-out estimate.
drug_types = list(df['target'].unique())
results = defaultdict(lambda: defaultdict(int))

# Pre-fill every (actual, predicted) pair with 0 so the displayed matrix is
# dense even for combinations that never occur.
for actual in drug_types :
    for predicted in drug_types :
        results[actual][predicted] = 0

# The counts do not depend on iteration order, so drugs_data is no longer
# shuffled here; the shuffle only mutated shared state between cells.
for text, target in drugs_data :
    predicted_target = classifier.classify(drugs_features(text, feature_words))
    results[target][predicted_target] += 1

In [17]:
# Display the nested counts: outer key is the actual label, inner key the predicted label.
results

defaultdict(<function __main__.<lambda>()>,
            {'ORAL': defaultdict(int,
                         {'ORAL': 29475,
                          'OTHER': 375,
                          'TOPICAL': 11,
                          'INTRAVENOUS': 10623,
                          'INTRAMUSCULAR': 5668,
                          'DENTAL': 563}),
             'OTHER': defaultdict(int,
                         {'ORAL': 406,
                          'OTHER': 3346,
                          'TOPICAL': 104,
                          'INTRAVENOUS': 920,
                          'INTRAMUSCULAR': 463,
                          'DENTAL': 6}),
             'TOPICAL': defaultdict(int,
                         {'ORAL': 249,
                          'OTHER': 3112,
                          'TOPICAL': 22266,
                          'INTRAVENOUS': 1417,
                          'INTRAMUSCULAR': 788,
                          'DENTAL': 140}),
             'INTRAVENOUS': defaultdict(int,
            

# Output Model -> Pickle

In [19]:
## Persist the trained classifier and its feature vocabulary as pickle files so
## the model can be applied to new test cases in the Dash app.
for artifact_path, artifact in (
    ('../models/classifier.pkl', classifier),
    ('../models/classifier_features.pkl', feature_words),
):
    with open(artifact_path, 'wb') as out_file:
        pickle.dump(artifact, out_file)