In [1]:
import pandas as pd
from collections import Counter
from string import punctuation
import re
import matplotlib.pyplot as plt 
import seaborn as sns
from cleaning import prepare

# Load Data

In [2]:
df = pd.read_pickle('../data/interim/drugs.pkl')

In [3]:
df.head()

Unnamed: 0,target,text
0,ORAL,"DOSAGE Adults- Take 4 or 6 Pellets by mouth, t..."
1,ORAL,DIRECTIONS Adults: Dissolve 3 to 5 under the t...
2,OPHTHALMIC,DOSAGE AND ADMINISTRATION The recommended dosa...
3,ORAL,2 DOSAGE AND ADMINISTRATION Use the lowest eff...
4,TOPICAL,"Directions wet face, apply to hand, massage fa..."


# Split on Whitespace

The simplest tokenization just splits on whitespace. Let's try this and explore the results. 

In [4]:
pipeline = [str.lower, str.split]

In [5]:
df['tokens'] = df['text'].apply(prepare, pipeline=pipeline)

In [6]:
df.head()

Unnamed: 0,target,text,tokens
0,ORAL,"DOSAGE Adults- Take 4 or 6 Pellets by mouth, t...","[dosage, adults-, take, 4, or, 6, pellets, by,..."
1,ORAL,DIRECTIONS Adults: Dissolve 3 to 5 under the t...,"[directions, adults:, dissolve, 3, to, 5, unde..."
2,OPHTHALMIC,DOSAGE AND ADMINISTRATION The recommended dosa...,"[dosage, and, administration, the, recommended..."
3,ORAL,2 DOSAGE AND ADMINISTRATION Use the lowest eff...,"[2, dosage, and, administration, use, the, low..."
4,TOPICAL,"Directions wet face, apply to hand, massage fa...","[directions, wet, face,, apply, to, hand,, mas..."


In [7]:
# create list of all tokens
all_tokens = []
df['tokens'].apply(lambda x: all_tokens.extend(x))
print(len(all_tokens))

24467953


In [8]:
token_counts = Counter(all_tokens)

In [9]:
types = token_counts.keys()

In [10]:
print("Splitting on whitespace yields {0} tokens and {1} types.".format(token_counts.total(), len(types)))

Splitting on whitespace yields 24467953 tokens and 64262 types.


In [11]:
token_counts.most_common(50)

[('the', 850184),
 ('of', 768935),
 ('to', 634724),
 ('and', 577277),
 ('in', 430271),
 ('for', 369376),
 ('a', 365484),
 ('be', 333125),
 ('mg', 325610),
 ('or', 316871),
 ('with', 314637),
 ('dose', 313408),
 ('is', 274686),
 ('patients', 241710),
 ('should', 200695),
 ('dosage', 196874),
 ('not', 181195),
 ('daily', 148222),
 ('2', 147710),
 ('may', 143670),
 ('as', 141740),
 ('tablets', 136657),
 ('use', 120833),
 ('at', 119880),
 ('recommended', 115470),
 ('(', 107477),
 ('by', 98976),
 ('on', 96400),
 ('than', 92146),
 ('every', 90880),
 ('treatment', 89194),
 ('if', 88382),
 ('once', 88039),
 ('1', 86618),
 ('years', 86345),
 ('hours', 84965),
 ('administration', 82490),
 ('12', 81122),
 ('10', 78851),
 ('doses', 76312),
 ('children', 75687),
 ('after', 75463),
 ('are', 74990),
 ('clinical', 74576),
 ('day', 73005),
 ('4', 70599),
 (')', 70331),
 ('•', 69060),
 ('[see', 67380),
 ('5', 63751)]

The top 30 typtes contain many stopwords. A few of them contain punctuation. 

Let's look for other types containing punctuation.

In [12]:
punct_set = set(punctuation)

In [13]:
def contains_punct(text):
    for char in text:
        if char in punct_set:
            return True
    return False

In [14]:
types_with_punct = {t: count for t, count in token_counts.items() if contains_punct(t)}

In [15]:
types_with_punct = sorted(types_with_punct.items(), key=lambda item: item[1], reverse=True)

In [16]:
types_with_punct[:50]

[('(', 107477),
 (')', 70331),
 ('[see', 67380),
 ('mg/day', 45589),
 ('.', 44128),
 ('2.1', 37106),
 ('daily.', 36201),
 ('2.2', 33066),
 ('mg/kg', 32223),
 (',', 31411),
 ('extended-release', 28895),
 ('(see', 27728),
 (').', 24304),
 ('2.5', 24229),
 ('2.3', 24212),
 ('day.', 22081),
 (']', 19999),
 ('].', 19507),
 ('dose.', 18630),
 ('2.4', 18557),
 ('days.', 17334),
 ('hours.', 15779),
 (')]', 15048),
 ('-', 14992),
 ('mg/kg/day', 13250),
 ('injection,', 13089),
 ('tablets,', 12776),
 ('(e.g.,', 12025),
 ('mg,', 11713),
 ('delayed-release', 11711),
 ('dose,', 11619),
 ('age:', 11587),
 ('daily,', 11518),
 ('patients,', 11099),
 ('mg.', 10741),
 ('however,', 10602),
 ('and/or', 10343),
 ('doses.', 10279),
 (')].', 10164),
 ('mg/m', 10139),
 ('day,', 10005),
 ('therapy.', 9625),
 ('mg/day.', 9451),
 ('hours,', 9311),
 ('weeks.', 9269),
 ('[', 9127),
 ('patients.', 8973),
 ('response.', 8964),
 ('recommended.', 8928),
 ('patient.', 8818)]

It seems fairly common for words to be combined with '/'. Let's take a look at these specifically. 

In [17]:
[(t, count) for t, count in dict(types_with_punct).items() if '/' in t][:50]

[('mg/day', 45589),
 ('mg/kg', 32223),
 ('mg/kg/day', 13250),
 ('and/or', 10343),
 ('mg/m', 10139),
 ('mg/day.', 9451),
 ('ml/min', 8165),
 ('mg/ml', 6103),
 ('mg/day,', 4371),
 ('ml/min/1.73', 3481),
 ('mg/kg/day,', 3106),
 ('mg/5', 3024),
 ('mcg/kg/day', 2730),
 ('mg/125', 2429),
 ('ml/min)', 2324),
 ('1/2', 2208),
 ('ml/min,', 2070),
 ('mcg/kg/min', 2007),
 ('mcg/ml', 1949),
 ('mg/kg/day.', 1942),
 ('lopinavir/ritonavir', 1880),
 ('mg/day)', 1838),
 ('(ml/min)', 1780),
 ('ml/min.', 1748),
 ('ml/minute/1.73', 1728),
 ('mg/day).', 1676),
 ('atazanavir/ritonavir', 1666),
 ('(mg/day)', 1634),
 ('/', 1584),
 ('mg/ml)', 1532),
 ('/l', 1505),
 ('mg/dl', 1447),
 ('pharyngitis/tonsillitis', 1301),
 ('mcg/kg', 1173),
 ('mg/25', 1154),
 ('ng/ml', 1064),
 ('ml/min/1.73m', 909),
 ('cells/mm', 892),
 ('mg/kg)', 888),
 ('ml/min),', 872),
 ('mcg/day', 861),
 ('mg/kg,', 859),
 ('ml/minute', 842),
 ('ml/h', 807),
 ('caregiver/family', 793),
 ('mg/kg.', 787),
 ('mg/ml.', 770),
 ('(mg/dl)', 763),
 ('(o

Most of these represent units of measurement (e.g. 'mg/day'). However, some of them represent combinations of distinct concepts (e.g. 'caregiver/family', 'pharyngitis/tonsillitis'). Splitting on whitespace would treat these as a single token, which would add unnecessary noise to the corpus. Let's try splitting on whitespace AND on '/'.  

# Split on Whitespace and '/'

In [18]:
def tokenize(text):
    pattern = re.compile(r'[\s/]')
    tokens = re.split(pattern, text)
    tokens = [t for t in tokens if t != '']
    return tokens

In [19]:
tokenize('mg/day foo bar')

['mg', 'day', 'foo', 'bar']

In [20]:
pipeline = [str.lower, tokenize]

In [21]:
df['tokens_slash'] = df['text'].apply(prepare, pipeline=pipeline)

In [22]:
df.head()

Unnamed: 0,target,text,tokens,tokens_slash
0,ORAL,"DOSAGE Adults- Take 4 or 6 Pellets by mouth, t...","[dosage, adults-, take, 4, or, 6, pellets, by,...","[dosage, adults-, take, 4, or, 6, pellets, by,..."
1,ORAL,DIRECTIONS Adults: Dissolve 3 to 5 under the t...,"[directions, adults:, dissolve, 3, to, 5, unde...","[directions, adults:, dissolve, 3, to, 5, unde..."
2,OPHTHALMIC,DOSAGE AND ADMINISTRATION The recommended dosa...,"[dosage, and, administration, the, recommended...","[dosage, and, administration, the, recommended..."
3,ORAL,2 DOSAGE AND ADMINISTRATION Use the lowest eff...,"[2, dosage, and, administration, use, the, low...","[2, dosage, and, administration, use, the, low..."
4,TOPICAL,"Directions wet face, apply to hand, massage fa...","[directions, wet, face,, apply, to, hand,, mas...","[directions, wet, face,, apply, to, hand,, mas..."


In [23]:
# create list of all tokens
all_tokens = []
df['tokens_slash'].apply(lambda x: all_tokens.extend(x))
print(len(all_tokens))

24773618


In [24]:
token_counts = Counter(all_tokens)

In [25]:
types = token_counts.keys()

In [26]:
print("Splitting on whitespace and '/' yields {0} tokens and {1} types.".format(token_counts.total(), len(types)))

Splitting on whitespace and '/' yields 24773618 tokens and 61806 types.


In [27]:
token_counts.most_common(50)

[('the', 850184),
 ('of', 768935),
 ('to', 634724),
 ('and', 587629),
 ('mg', 485575),
 ('in', 430274),
 ('for', 369390),
 ('a', 365999),
 ('be', 333125),
 ('or', 327269),
 ('with', 314638),
 ('dose', 314435),
 ('is', 274686),
 ('patients', 241910),
 ('should', 200695),
 ('dosage', 196899),
 ('not', 181197),
 ('2', 150935),
 ('daily', 148300),
 ('may', 143670),
 ('as', 141740),
 ('day', 138484),
 ('tablets', 136775),
 ('use', 120854),
 ('at', 119880),
 ('recommended', 115470),
 ('(', 107479),
 ('by', 98976),
 ('on', 96413),
 ('than', 92146),
 ('every', 90887),
 ('1', 89542),
 ('treatment', 89251),
 ('if', 88382),
 ('kg', 88155),
 ('once', 88042),
 ('ml', 87802),
 ('years', 86345),
 ('hours', 84968),
 ('administration', 82499),
 ('12', 81149),
 ('10', 79928),
 ('doses', 76319),
 ('after', 76148),
 ('children', 75713),
 ('are', 74990),
 ('clinical', 74576),
 ('4', 71248),
 (')', 70331),
 ('•', 69060)]

# Split on Whitespace and '-'

Some of the most common types also included '-'. Let's take a look at them. 

In [28]:
{t: count for t, count in token_counts.items() if '-' in t}

{'adults-': 112,
 'older-': 108,
 '(one-half': 184,
 'naproxen-containing': 202,
 'long-term': 5255,
 'anti-inflammatory': 815,
 'solid-oral': 128,
 'weight-based': 1215,
 'one-half': 2500,
 'non-interchangeability': 232,
 '3-4': 1262,
 '2-11:': 524,
 'fda-approved': 210,
 '(weight-adjusted': 2,
 '-': 15005,
 'non-cutaneous': 4,
 '0-1,': 26,
 'half-life': 2097,
 'radio-isotope': 25,
 'thyroid-pituitary': 50,
 '26-ml': 11,
 'back-and-forth': 58,
 '24-hour': 4567,
 'high-potency': 19,
 'hypothalamic-pituitary-adrenal': 379,
 '2-3': 1167,
 'angiotensin-converting': 1,
 'twice-daily.': 2,
 'film-coated': 197,
 'ora-plus': 491,
 'ora-sweet': 774,
 'press-in': 12,
 '(child-pugh': 3222,
 '5-10': 471,
 '1-5': 542,
 'well-controlled': 462,
 'end-stage': 921,
 'pre-': 132,
 'high-pressure': 50,
 '(mria-sp).': 5,
 'cyclosporine-therapeutic': 5,
 '50-200': 78,
 '50-150': 11,
 'obsessive-compulsive': 51,
 '6-12)': 35,
 '13-17).': 11,
 '25-200': 11,
 '(6-17': 11,
 'non-intravenous': 1288,
 '14-day':

In contrast to words combined with '/', these combined with '-' tend to represent a single concept. Splitting them into separate tokens would lose important information (e.g. 'non-psychotic'). 

# Final Tokenization

Based on the above analysis, we will split on whitespace and '/', and remove punctuation. 

In [29]:
punct_set = set(punctuation)
punct_set.remove('/') # don't remove '/' because we need it for tokenization 

In [30]:
def remove_punctuation(text):
    return "".join([char for char in text if char not in punct_set])  

In [31]:
pipeline = [str.lower, remove_punctuation, tokenize]

In [32]:
prepare('This is a fool-proof example/test sentence!', pipeline=pipeline)

['this', 'is', 'a', 'foolproof', 'example', 'test', 'sentence']

In [None]:
df['tokens_final'] = df['text'].apply(prepare, pipeline=pipeline)

In [None]:
df.head()

In [None]:
# create list of all tokens
all_tokens = []
df['tokens_final'].apply(lambda x: all_tokens.extend(x))
print(len(all_tokens))

In [None]:
token_counts = Counter(all_tokens)

In [None]:
types = token_counts.keys()

In [None]:
print("Splitting on whitespace and '/' and removing punctuation yields {0} tokens and {1} types.".format(token_counts.total(), len(types)))

In [None]:
token_counts.most_common(50)