In [1]:
import pandas as pd
from collections import Counter
from string import punctuation
import re
import matplotlib.pyplot as plt 
import seaborn as sns
from cleaning import prepare

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/bikramgill/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Load Data

In [2]:
df = pd.read_pickle('../data/interim/drugs.pkl')

In [3]:
df.head()

Unnamed: 0,target,text
0,ORAL,"DOSAGE Adults- Take 4 or 6 Pellets by mouth, t..."
1,ORAL,DIRECTIONS Adults: Dissolve 3 to 5 under the t...
2,OPHTHALMIC,DOSAGE AND ADMINISTRATION The recommended dosa...
3,ORAL,2 DOSAGE AND ADMINISTRATION Use the lowest eff...
4,TOPICAL,"Directions wet face, apply to hand, massage fa..."


# Split on Whitespace

The simplest tokenization just splits on whitespace. Let's try this and explore the results. 

In [4]:
pipeline = [str.lower, str.split]

In [5]:
df['tokens'] = df['text'].apply(prepare, pipeline=pipeline)

In [6]:
df.head()

Unnamed: 0,target,text,tokens
0,ORAL,"DOSAGE Adults- Take 4 or 6 Pellets by mouth, t...","[dosage, adults-, take, 4, or, 6, pellets, by,..."
1,ORAL,DIRECTIONS Adults: Dissolve 3 to 5 under the t...,"[directions, adults:, dissolve, 3, to, 5, unde..."
2,OPHTHALMIC,DOSAGE AND ADMINISTRATION The recommended dosa...,"[dosage, and, administration, the, recommended..."
3,ORAL,2 DOSAGE AND ADMINISTRATION Use the lowest eff...,"[2, dosage, and, administration, use, the, low..."
4,TOPICAL,"Directions wet face, apply to hand, massage fa...","[directions, wet, face,, apply, to, hand,, mas..."


In [7]:
# create list of all tokens
all_tokens = []
df['tokens'].apply(lambda x: all_tokens.extend(x))
print(len(all_tokens))

24499663


In [8]:
token_counts = Counter(all_tokens)

In [9]:
types = token_counts.keys()

In [10]:
print("Splitting on whitespace yields {0} tokens and {1} types.".format(token_counts.total(), len(types)))

AttributeError: 'Counter' object has no attribute 'total'

In [11]:
token_counts.most_common(50)

[('the', 851313),
 ('of', 769929),
 ('to', 635572),
 ('and', 578225),
 ('in', 430990),
 ('for', 369814),
 ('a', 365992),
 ('be', 333439),
 ('mg', 325918),
 ('or', 317316),
 ('with', 314990),
 ('dose', 313916),
 ('is', 275027),
 ('patients', 242024),
 ('should', 200870),
 ('dosage', 197133),
 ('not', 181487),
 ('daily', 148418),
 ('2', 147927),
 ('may', 143902),
 ('as', 141944),
 ('tablets', 136898),
 ('use', 121003),
 ('at', 120090),
 ('recommended', 115620),
 ('(', 107651),
 ('by', 99063),
 ('on', 96482),
 ('than', 92303),
 ('every', 91043),
 ('treatment', 89280),
 ('if', 88449),
 ('once', 88127),
 ('1', 86685),
 ('years', 86413),
 ('hours', 85069),
 ('administration', 82629),
 ('12', 81223),
 ('10', 78944),
 ('doses', 76425),
 ('children', 75797),
 ('after', 75606),
 ('are', 75106),
 ('clinical', 74705),
 ('day', 73022),
 ('4', 70770),
 (')', 70421),
 ('•', 69119),
 ('[see', 67551),
 ('5', 63834)]

The top 30 typtes contain many stopwords. A few of them contain punctuation. 

Let's look for other types containing punctuation.

In [12]:
punct_set = set(punctuation)

In [13]:
def contains_punct(text):
    for char in text:
        if char in punct_set:
            return True
    return False

In [14]:
types_with_punct = {t: count for t, count in token_counts.items() if contains_punct(t)}

In [15]:
types_with_punct = sorted(types_with_punct.items(), key=lambda item: item[1], reverse=True)

In [16]:
types_with_punct[:50]

[('(', 107651),
 (')', 70421),
 ('[see', 67551),
 ('mg/day', 45655),
 ('.', 44273),
 ('2.1', 37163),
 ('daily.', 36225),
 ('2.2', 33110),
 ('mg/kg', 32266),
 (',', 31453),
 ('extended-release', 28925),
 ('(see', 27747),
 (').', 24318),
 ('2.5', 24258),
 ('2.3', 24246),
 ('day.', 22132),
 (']', 20109),
 ('].', 19507),
 ('dose.', 18658),
 ('2.4', 18591),
 ('days.', 17345),
 ('hours.', 15796),
 (')]', 15077),
 ('-', 15056),
 ('mg/kg/day', 13264),
 ('injection,', 13092),
 ('tablets,', 12751),
 ('(e.g.,', 12041),
 ('mg,', 11733),
 ('delayed-release', 11726),
 ('dose,', 11645),
 ('age:', 11606),
 ('daily,', 11521),
 ('patients,', 11107),
 ('mg.', 10756),
 ('however,', 10608),
 ('and/or', 10369),
 ('doses.', 10289),
 (')].', 10197),
 ('mg/m', 10142),
 ('day,', 10005),
 ('therapy.', 9622),
 ('mg/day.', 9464),
 ('hours,', 9330),
 ('weeks.', 9276),
 ('[', 9138),
 ('patients.', 9019),
 ('response.', 8984),
 ('recommended.', 8944),
 ('patient.', 8829)]

It seems fairly common for words to be combined with '/'. Let's take a look at these specifically. 

In [17]:
[(t, count) for t, count in dict(types_with_punct).items() if '/' in t][:50]

[('mg/day', 45655),
 ('mg/kg', 32266),
 ('mg/kg/day', 13264),
 ('and/or', 10369),
 ('mg/m', 10142),
 ('mg/day.', 9464),
 ('ml/min', 8175),
 ('mg/ml', 6105),
 ('mg/day,', 4380),
 ('ml/min/1.73', 3477),
 ('mg/kg/day,', 3102),
 ('mg/5', 3023),
 ('mcg/kg/day', 2786),
 ('mg/125', 2449),
 ('ml/min)', 2327),
 ('1/2', 2205),
 ('ml/min,', 2070),
 ('mcg/kg/min', 2007),
 ('mcg/ml', 1945),
 ('mg/kg/day.', 1935),
 ('lopinavir/ritonavir', 1880),
 ('mg/day)', 1838),
 ('(ml/min)', 1779),
 ('ml/min.', 1746),
 ('ml/minute/1.73', 1728),
 ('mg/day).', 1680),
 ('atazanavir/ritonavir', 1666),
 ('(mg/day)', 1637),
 ('/', 1584),
 ('mg/ml)', 1529),
 ('/l', 1505),
 ('mg/dl', 1446),
 ('pharyngitis/tonsillitis', 1301),
 ('mcg/kg', 1173),
 ('mg/25', 1158),
 ('ng/ml', 1064),
 ('ml/min/1.73m', 910),
 ('cells/mm', 892),
 ('mg/kg)', 888),
 ('ml/min),', 872),
 ('mcg/day', 869),
 ('mg/kg,', 859),
 ('ml/minute', 845),
 ('ml/h', 807),
 ('caregiver/family', 794),
 ('mg/kg.', 793),
 ('mg/ml.', 771),
 ('(olanzapine/fluoxetin

Most of these represent units of measurement (e.g. 'mg/day'). However, some of them represent combinations of distinct concepts (e.g. 'caregiver/family', 'pharyngitis/tonsillitis'). Splitting on whitespace would treat these as a single token, which would add unnecessary noise to the corpus. Let's try splitting on whitespace AND on '/'.  

# Split on Whitespace and '/'

In [18]:
def tokenize(text):
    pattern = re.compile(r'[\s/]')
    tokens = re.split(pattern, text)
    tokens = [t for t in tokens if t != '']
    return tokens

In [19]:
tokenize('mg/day foo bar')

['mg', 'day', 'foo', 'bar']

In [20]:
pipeline = [str.lower, tokenize]

In [21]:
df['tokens_slash'] = df['text'].apply(prepare, pipeline=pipeline)

In [22]:
df.head()

Unnamed: 0,target,text,tokens,tokens_slash
0,ORAL,"DOSAGE Adults- Take 4 or 6 Pellets by mouth, t...","[dosage, adults-, take, 4, or, 6, pellets, by,...","[dosage, adults-, take, 4, or, 6, pellets, by,..."
1,ORAL,DIRECTIONS Adults: Dissolve 3 to 5 under the t...,"[directions, adults:, dissolve, 3, to, 5, unde...","[directions, adults:, dissolve, 3, to, 5, unde..."
2,OPHTHALMIC,DOSAGE AND ADMINISTRATION The recommended dosa...,"[dosage, and, administration, the, recommended...","[dosage, and, administration, the, recommended..."
3,ORAL,2 DOSAGE AND ADMINISTRATION Use the lowest eff...,"[2, dosage, and, administration, use, the, low...","[2, dosage, and, administration, use, the, low..."
4,TOPICAL,"Directions wet face, apply to hand, massage fa...","[directions, wet, face,, apply, to, hand,, mas...","[directions, wet, face,, apply, to, hand,, mas..."


In [23]:
# create list of all tokens
all_tokens = []
df['tokens_slash'].apply(lambda x: all_tokens.extend(x))
print(len(all_tokens))

24805740


In [24]:
token_counts = Counter(all_tokens)

In [25]:
types = token_counts.keys()

In [31]:
#print("Splitting on whitespace and '/' yields {0} tokens and {1} types.".format(token_counts.total(), len(types)))

In [32]:
token_counts.most_common(50)

[('the', 851313),
 ('of', 769929),
 ('to', 635572),
 ('and', 588603),
 ('mg', 486074),
 ('in', 430993),
 ('for', 369828),
 ('a', 366507),
 ('be', 333439),
 ('or', 327740),
 ('with', 314991),
 ('dose', 314943),
 ('is', 275027),
 ('patients', 242226),
 ('should', 200870),
 ('dosage', 197158),
 ('not', 181489),
 ('2', 151150),
 ('daily', 148496),
 ('may', 143902),
 ('as', 141944),
 ('day', 138649),
 ('tablets', 137016),
 ('use', 121024),
 ('at', 120090),
 ('recommended', 115620),
 ('(', 107653),
 ('by', 99063),
 ('on', 96495),
 ('than', 92303),
 ('every', 91050),
 ('1', 89606),
 ('treatment', 89337),
 ('if', 88449),
 ('kg', 88369),
 ('once', 88130),
 ('ml', 87858),
 ('years', 86413),
 ('hours', 85072),
 ('administration', 82638),
 ('12', 81250),
 ('10', 80055),
 ('doses', 76432),
 ('after', 76291),
 ('children', 75823),
 ('are', 75106),
 ('clinical', 74705),
 ('4', 71419),
 (')', 70421),
 ('•', 69119)]

# Split on Whitespace and '-'

Some of the most common types also included '-'. Let's take a look at them. 

In [33]:
{t: count for t, count in token_counts.items() if '-' in t}

{'adults-': 112,
 'older-': 108,
 '(one-half': 184,
 'naproxen-containing': 202,
 'long-term': 5266,
 'anti-inflammatory': 815,
 'solid-oral': 128,
 'weight-based': 1216,
 'one-half': 2510,
 'non-interchangeability': 232,
 '3-4': 1266,
 '2-11:': 524,
 'fda-approved': 210,
 '(weight-adjusted': 2,
 '-': 15069,
 'non-cutaneous': 4,
 '0-1,': 26,
 'half-life': 2097,
 'radio-isotope': 25,
 'thyroid-pituitary': 50,
 '26-ml': 11,
 'back-and-forth': 58,
 '24-hour': 4559,
 'high-potency': 19,
 'hypothalamic-pituitary-adrenal': 379,
 '2-3': 1166,
 'angiotensin-converting': 1,
 'twice-daily.': 2,
 'film-coated': 197,
 'ora-plus': 491,
 'ora-sweet': 774,
 'press-in': 12,
 '(child-pugh': 3236,
 '5-10': 471,
 '1-5': 541,
 'well-controlled': 461,
 'end-stage': 921,
 'pre-': 132,
 'high-pressure': 50,
 '(mria-sp).': 5,
 'cyclosporine-therapeutic': 5,
 '50-200': 78,
 '50-150': 11,
 'obsessive-compulsive': 51,
 '6-12)': 35,
 '13-17).': 11,
 '25-200': 11,
 '(6-17': 11,
 'non-intravenous': 1291,
 '14-day':

In contrast to words combined with '/', these combined with '-' tend to represent a single concept. Splitting them into separate tokens would lose important information (e.g. 'non-psychotic'). 

# Final Tokenization

Based on the above analysis, we will split on whitespace and '/', and remove punctuation. 

In [34]:
punct_set = set(punctuation)
punct_set.remove('/') # don't remove '/' because we need it for tokenization 

In [35]:
def remove_punctuation(text):
    return "".join([char for char in text if char not in punct_set])  

In [36]:
def remove_numbers(text):
    pattern = re.compile(r'[0-9]+')
    return re.sub(pattern, '', text)

In [37]:
pipeline = [str.lower, remove_punctuation, remove_numbers, tokenize]

In [38]:
prepare('This is a fool-proof example/test sentence!', pipeline=pipeline)

['this', 'is', 'a', 'foolproof', 'example', 'test', 'sentence']

In [39]:
df['tokens_final'] = df['text'].apply(prepare, pipeline=pipeline)

In [40]:
df.head()

Unnamed: 0,target,text,tokens,tokens_slash,tokens_final
0,ORAL,"DOSAGE Adults- Take 4 or 6 Pellets by mouth, t...","[dosage, adults-, take, 4, or, 6, pellets, by,...","[dosage, adults-, take, 4, or, 6, pellets, by,...","[dosage, adults, take, or, pellets, by, mouth,..."
1,ORAL,DIRECTIONS Adults: Dissolve 3 to 5 under the t...,"[directions, adults:, dissolve, 3, to, 5, unde...","[directions, adults:, dissolve, 3, to, 5, unde...","[directions, adults, dissolve, to, under, the,..."
2,OPHTHALMIC,DOSAGE AND ADMINISTRATION The recommended dosa...,"[dosage, and, administration, the, recommended...","[dosage, and, administration, the, recommended...","[dosage, and, administration, the, recommended..."
3,ORAL,2 DOSAGE AND ADMINISTRATION Use the lowest eff...,"[2, dosage, and, administration, use, the, low...","[2, dosage, and, administration, use, the, low...","[dosage, and, administration, use, the, lowest..."
4,TOPICAL,"Directions wet face, apply to hand, massage fa...","[directions, wet, face,, apply, to, hand,, mas...","[directions, wet, face,, apply, to, hand,, mas...","[directions, wet, face, apply, to, hand, massa..."


In [41]:
# create list of all tokens
all_tokens = []
df['tokens_final'].apply(lambda x: all_tokens.extend(x))
print(len(all_tokens))

22392729


In [42]:
token_counts = Counter(all_tokens)

In [43]:
types = token_counts.keys()

In [44]:
print("Splitting on whitespace and '/' and removing punctuation yields {0} tokens and {1} types.".format(token_counts.total(), len(types)))

AttributeError: 'Counter' object has no attribute 'total'

In [45]:
token_counts.most_common(50)

[('the', 851763),
 ('of', 770927),
 ('to', 638219),
 ('and', 590314),
 ('mg', 531977),
 ('in', 432501),
 ('for', 372512),
 ('a', 370018),
 ('dose', 357183),
 ('be', 333468),
 ('or', 332076),
 ('with', 316467),
 ('is', 276235),
 ('patients', 276199),
 ('day', 209111),
 ('dosage', 207731),
 ('daily', 206446),
 ('should', 202409),
 ('not', 182768),
 ('tablets', 162831),
 ('may', 144454),
 ('as', 143043),
 ('use', 132511),
 ('recommended', 127900),
 ('at', 120583),
 ('hours', 114658),
 ('see', 113661),
 ('ml', 111466),
 ('treatment', 104352),
 ('kg', 101556),
 ('by', 99585),
 ('years', 98411),
 ('on', 96664),
 ('administration', 95627),
 ('doses', 95256),
 ('every', 93024),
 ('than', 92362),
 ('if', 90342),
 ('once', 89319),
 ('children', 84487),
 ('therapy', 77352),
 ('after', 76816),
 ('are', 75336),
 ('clinical', 74753),
 ('age', 74679),
 ('injection', 73196),
 ('days', 69646),
 ('•', 69140),
 ('patient', 69137),
 ('adults', 62372)]

In [50]:
df.to_pickle('../data/interim/drugs.pkl') # save the dataframe as a pickle file to the data/raw directory|

In [51]:
df

Unnamed: 0,target,text,tokens,tokens_slash,tokens_final
0,ORAL,"DOSAGE Adults- Take 4 or 6 Pellets by mouth, t...","[dosage, adults-, take, 4, or, 6, pellets, by,...","[dosage, adults-, take, 4, or, 6, pellets, by,...","[dosage, adults, take, or, pellets, by, mouth,..."
1,ORAL,DIRECTIONS Adults: Dissolve 3 to 5 under the t...,"[directions, adults:, dissolve, 3, to, 5, unde...","[directions, adults:, dissolve, 3, to, 5, unde...","[directions, adults, dissolve, to, under, the,..."
2,OPHTHALMIC,DOSAGE AND ADMINISTRATION The recommended dosa...,"[dosage, and, administration, the, recommended...","[dosage, and, administration, the, recommended...","[dosage, and, administration, the, recommended..."
3,ORAL,2 DOSAGE AND ADMINISTRATION Use the lowest eff...,"[2, dosage, and, administration, use, the, low...","[2, dosage, and, administration, use, the, low...","[dosage, and, administration, use, the, lowest..."
4,TOPICAL,"Directions wet face, apply to hand, massage fa...","[directions, wet, face,, apply, to, hand,, mas...","[directions, wet, face,, apply, to, hand,, mas...","[directions, wet, face, apply, to, hand, massa..."
...,...,...,...,...,...
85642,TOPICAL,DOSAGE AND ADMINISTRATION: Comb the hair to re...,"[dosage, and, administration:, comb, the, hair...","[dosage, and, administration:, comb, the, hair...","[dosage, and, administration, comb, the, hair,..."
85643,ORAL,DOSAGE AND ADMINISTRATION Hypertension Individ...,"[dosage, and, administration, hypertension, in...","[dosage, and, administration, hypertension, in...","[dosage, and, administration, hypertension, in..."
85644,ORAL,Take 3-4 times daily. Ages 12 and older: 10 dr...,"[take, 3-4, times, daily., ages, 12, and, olde...","[take, 3-4, times, daily., ages, 12, and, olde...","[take, times, daily, ages, and, older, drops, ..."
85645,TOPICAL,Directions apply to underarms only,"[directions, apply, to, underarms, only]","[directions, apply, to, underarms, only]","[directions, apply, to, underarms, only]"
