In [37]:
import nltk
import random
import numpy as np
import pandas as pd
from collections import Counter, defaultdict
from string import punctuation
from nltk.corpus import stopwords
from nltk.tokenize import WhitespaceTokenizer
from datetime import datetime
import re

# Load Data

In [26]:
# Load the interim DataFrame (presumably written by an earlier preprocessing
# step — confirm). NOTE(review): read_pickle unpickles the file, which can run
# arbitrary code; only load pickles that come from a trusted source.
df = pd.read_pickle('../data/interim/drugs.pkl')

# Classifier Prep - Feature Words

In [27]:
## Convert the final tokens column in df (a list of tokens per row) into a
## single space-separated string per row, stored in a new `tokens_str` column.
## Mapping over the one column we need avoids the much slower row-wise
## DataFrame.apply(axis=1), which builds a Series for every row.

df['tokens_str'] = df['tokens_no_stopwords'].map(' '.join)
df.head()

Unnamed: 0,target,text,tokens,tokens_slash,tokens_final,tokens_no_eng_stopwords,tokens_no_stopwords,tokens_str
0,ORAL,"DOSAGE Adults- Take 4 or 6 Pellets by mouth, t...","[dosage, adults-, take, 4, or, 6, pellets, by,...","[dosage, adults-, take, 4, or, 6, pellets, by,...","[dosage, adults, take, or, pellets, by, mouth,...","[dosage, adults, take, pellets, mouth, three, ...","[adults, take, pellets, mouth, three, times, d...",adults take pellets mouth three times daily su...
1,ORAL,DIRECTIONS Adults: Dissolve 3 to 5 under the t...,"[directions, adults:, dissolve, 3, to, 5, unde...","[directions, adults:, dissolve, 3, to, 5, unde...","[directions, adults, dissolve, to, under, the,...","[directions, adults, dissolve, tongue, three, ...","[adults, dissolve, tongue, three, times, day, ...",adults dissolve tongue three times day directe...
2,OPHTHALMIC,DOSAGE AND ADMINISTRATION The recommended dosa...,"[dosage, and, administration, the, recommended...","[dosage, and, administration, the, recommended...","[dosage, and, administration, the, recommended...","[dosage, administration, recommended, dosage, ...","[recommended, regimen, treatment, bacterial, c...",recommended regimen treatment bacterial conjun...
3,ORAL,2 DOSAGE AND ADMINISTRATION Use the lowest eff...,"[2, dosage, and, administration, use, the, low...","[2, dosage, and, administration, use, the, low...","[dosage, and, administration, use, the, lowest...","[dosage, administration, use, lowest, effectiv...","[use, lowest, effective, shortest, duration, c...",use lowest effective shortest duration consist...
4,TOPICAL,"Directions wet face, apply to hand, massage fa...","[directions, wet, face,, apply, to, hand,, mas...","[directions, wet, face,, apply, to, hand,, mas...","[directions, wet, face, apply, to, hand, massa...","[directions, wet, face, apply, hand, massage, ...","[wet, face, apply, hand, massage, face, gently...",wet face apply hand massage face gently rinse ...


In [28]:
# Build [token_string, target] pairs, one per row of df.
# The two columns are walked in lockstep by position; the previous
# df[col][i] lookups were label-based and would raise KeyError (or pair up the
# wrong rows) as soon as df no longer has a default 0..n-1 index, e.g. after
# filtering rows.
drugs_data = [
    [token_string, route]
    for token_string, route in zip(df['tokens_str'], df['target'])
]

# Peek at five examples. random.choices samples WITH replacement, so the same
# row can appear more than once.
random.choices(drugs_data,k=5)

[['squeeze significant amount palm rub hands fully dry rinse free', 'TOPICAL'],
  'ORAL'],
  'INTRAVENOUS'],
  'ORAL'],
 ['adults children years age take tablet every hours needed children years age take tablet every hours needed children years age consult doctor exceed doses hour period directed doctor',
  'ORAL']]

In [29]:
# Minimum corpus frequency (exclusive) a word needs to be used as a feature.
word_cutoff = 5

# Flatten every document's whitespace-separated words into one token stream
# and count how often each word occurs across the whole corpus.
tokens = [term for doc_text, _label in drugs_data for term in doc_text.split()]
word_dist = nltk.FreqDist(tokens)

# Keep only words seen strictly more than `word_cutoff` times.
feature_words = {term for term, freq in word_dist.items() if freq > word_cutoff}

print(f"With a word cutoff of {word_cutoff}, we have {len(feature_words)} as features in the model.")

With a word cutoff of 5, we have 15829 as features in the model.


In [33]:
def tokenize(text) :
    """Split `text` into tokens on runs of whitespace.

    Leading/trailing whitespace is ignored and empty tokens are never
    produced, so an empty or all-whitespace string yields an empty list.
    Tokens are otherwise returned untouched (no case folding, no punctuation
    stripping).

    This gives the same result as NLTK's WhitespaceTokenizer (which splits on
    the regex ``\\s+`` and discards empty strings) without constructing a
    tokenizer object on every call.

    Args:
    * text: the string to split.
    Returns:
    A list of the whitespace-delimited tokens, in order of appearance.
    """
    return text.split()



def drugs_features(text,fw) :
    """Build the presence-only feature dictionary for one document.

    Args:
    * text: the document as one continuous string. It is assumed to be
    cleaned and case folded already; it is split with `tokenize`.
    * fw: the collection of *feature words*. Only tokens that are members
    of `fw` are kept, which filters out very rare words.
    Returns:
    A dictionary mapping each token of `text` that is in `fw` to True.
    Repeated tokens collapse into a single key, so counts are not kept.
    Example: text "quick quick brown fox" with fw {'quick','fox','jumps'}
    gives
    {'quick' : True,
    'fox' : True}
    """
    return {token: True for token in tokenize(text) if token in fw}

In [34]:
# One (feature_dict, label) pair per document, in the shape the NLTK
# classifier is trained and evaluated on further down.
featuresets = [
    (drugs_features(doc_text, feature_words), label)
    for doc_text, label in drugs_data
]

# Classifier - Modeling

In [36]:
# Seed the RNG so the shuffle below is repeatable from a fresh kernel.
# NOTE(review): shuffle reorders featuresets in place, so re-running this cell
# without rebuilding featuresets permutes the already-shuffled list again and
# produces a different order than a fresh run.
random.seed(20220507)
random.shuffle(featuresets)
test_size = 500  # number of leading (shuffled) examples used as the test set

In [38]:
# The first `test_size` shuffled examples are held out for testing;
# everything after them is used for training.
test_set = featuresets[:test_size]
train_set = featuresets[test_size:]

# Fit Naive Bayes on the training portion and report held-out accuracy.
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, test_set))

0.162


In [39]:
# Print the 25 features with the largest likelihood ratios between labels.
classifier.show_most_informative_features(n=25)

Most Informative Features
                   depth = True           INFILT : ORAL   =  29960.6 : 1.0
         intraperitoneal = True           INTRAP : ORAL   =  28789.4 : 1.0
              cartridges = True           SUBMUC : ORAL   =  28420.3 : 1.0
                 cleanse = True           INTRAU : ORAL   =  28420.3 : 1.0
               expulsion = True           INTRAU : ORAL   =  28420.3 : 1.0
                   thumb = True           INTRAU : ORAL   =  28420.3 : 1.0
                   finds = True           INTRAC : ORAL   =  27128.5 : 1.0
               injecting = True           RETROB : ORAL   =  27128.5 : 1.0
                    slit = True           INTRAP : ORAL   =  24360.3 : 1.0
                   field = True           INTRAV : ORAL   =  23957.6 : 1.0
                   optic = True           INTRAV : ORAL   =  23957.6 : 1.0
              suggestive = True           INTRAV : ORAL   =  23957.6 : 1.0
               tonometry = True           INTRAV : ORAL   =  23957.6 : 1.0

In [42]:
# Distinct target labels, in order of first appearance in df.
df['target'].unique().tolist()

['ORAL',
 'OPHTHALMIC',
 'TOPICAL',
 'INTRAVENOUS',
 'RESPIRATORY (INHALATION)',
 'VAGINAL',
 'SUBLINGUAL',
 'INTRAMUSCULAR',
 'DENTAL',
 'IRRIGATION',
 'INTRATHECAL',
 'EPIDURAL',
 'SUBCUTANEOUS',
 'NASAL',
 'RECTAL',
 'CUTANEOUS',
 'INTRA-ARTICULAR',
 'TRANSDERMAL',
 'INTRAOCULAR',
 'PERCUTANEOUS',
 'INTRACARDIAC',
 'INTRAVITREAL',
 'AURICULAR (OTIC)',
 'SUBMUCOSAL',
 'BUCCAL',
 'PERINEURAL',
 'INFILTRATION',
 'INTRALESIONAL',
 'PERIODONTAL',
 'PARENTERAL',
 'INTRACAVITARY',
 'INTRAVASCULAR',
 'ENDOTRACHEAL',
 'INTRACAVERNOUS',
 'EXTRACORPOREAL',
 'INTRADERMAL',
 'INTRA-ARTERIAL',
 'SUBARACHNOID',
 'INTRAUTERINE',
 'OROPHARYNGEAL',
 'INTRATYMPANIC',
 'INTRACAMERAL',
 'HEMODIALYSIS',
 'URETHRAL',
 'INTRAPERITONEAL',
 'TRANSMUCOSAL',
 'INTRAVESICAL',
 'ENTERAL',
 'INTRABRONCHIAL',
 'INTRACANALICULAR',
 'URETERAL',
 'RETROBULBAR',
 'INTRAPLEURAL',
 'INTRASPINAL',
 'SUBGINGIVAL',
 'INTRASINAL',
 'INTRAVENTRICULAR']

In [52]:
# Confusion counts: results[actual_route][estimated_route] -> number of documents.
# The first key is the actual target label, the second is the classifier's estimate.
drug_types = list(df['target'].unique())
results = defaultdict(lambda: defaultdict(int))

# Pre-fill every (actual, estimated) pair with 0 so that pairs which never
# occur still appear explicitly when `results` is displayed.
for d in drug_types :
    for d1 in drug_types :
        results[d][d1] = 0

# NOTE(review): this shuffle reorders drugs_data in place but has no effect on
# the counts below, which are order-independent. Kept so later cells see the
# same drugs_data ordering and RNG state as before.
random.shuffle(drugs_data)

# NOTE(review): drugs_data is the full dataset, so these counts include the
# examples the classifier was trained on; they are not a held-out evaluation.
for text, target in drugs_data :
    estimated_route = classifier.classify(drugs_features(text, feature_words))
    results[target][estimated_route] += 1

In [53]:
# Display the nested confusion counts (actual label -> estimated label -> count).
results

defaultdict(<function __main__.<lambda>()>,
            {'ORAL': defaultdict(int,
                         {'ORAL': 6845,
                          'OPHTHALMIC': 0,
                          'TOPICAL': 6,
                          'INTRAVENOUS': 8,
                          'RESPIRATORY (INHALATION)': 1,
                          'VAGINAL': 0,
                          'SUBLINGUAL': 650,
                          'INTRAMUSCULAR': 22,
                          'DENTAL': 75,
                          'IRRIGATION': 0,
                          'INTRATHECAL': 0,
                          'EPIDURAL': 1,
                          'SUBCUTANEOUS': 0,
                          'NASAL': 29,
                          'RECTAL': 0,
                          'CUTANEOUS': 0,
                          'INTRA-ARTICULAR': 101,
                          'TRANSDERMAL': 1,
                          'INTRAOCULAR': 0,
                          'PERCUTANEOUS': 0,
                          'INTRACARDIAC': 0,
 