# Tokenization

This code performs EDA on the interim data to determine a tokenization strategy. That strategy will be implemented in the cleaning.py module.

In [1]:
import pandas as pd
from collections import Counter
from string import punctuation
import re
from cleaning import prepare

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\aabel\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Load Data

In [2]:
df = pd.read_pickle('../data/interim/drugs.pkl')

In [3]:
df.head()

Unnamed: 0,target,text
0,ORAL,"DOSAGE Adults- Take 4 or 6 Pellets by mouth, t..."
1,ORAL,DIRECTIONS Adults: Dissolve 3 to 5 under the t...
2,OPHTHALMIC,DOSAGE AND ADMINISTRATION The recommended dosa...
3,ORAL,2 DOSAGE AND ADMINISTRATION Use the lowest eff...
4,TOPICAL,"Directions wet face, apply to hand, massage fa..."


## Split on Whitespace

The simplest tokenization just splits on whitespace. Let's try this and explore the results. 

In [4]:
pipeline = [str.lower, str.split]

In [5]:
df['tokens'] = df['text'].apply(prepare, pipeline=pipeline)

In [6]:
df.head()

Unnamed: 0,target,text,tokens
0,ORAL,"DOSAGE Adults- Take 4 or 6 Pellets by mouth, t...","[dosage, adults-, take, 4, or, 6, pellets, by,..."
1,ORAL,DIRECTIONS Adults: Dissolve 3 to 5 under the t...,"[directions, adults:, dissolve, 3, to, 5, unde..."
2,OPHTHALMIC,DOSAGE AND ADMINISTRATION The recommended dosa...,"[dosage, and, administration, the, recommended..."
3,ORAL,2 DOSAGE AND ADMINISTRATION Use the lowest eff...,"[2, dosage, and, administration, use, the, low..."
4,TOPICAL,"Directions wet face, apply to hand, massage fa...","[directions, wet, face,, apply, to, hand,, mas..."


In [7]:
# create list of all tokens
all_tokens = []
df['tokens'].apply(lambda x: all_tokens.extend(x))
print(len(all_tokens))

24401364


In [8]:
token_counts = Counter(all_tokens)

In [9]:
types = token_counts.keys()

In [10]:
print("Splitting on whitespace yields {0} tokens and {1} types.".format(token_counts.total(), len(types)))

Splitting on whitespace yields 24401364 tokens and 64181 types.


In [11]:
token_counts.most_common(50)

[('the', 847932),
 ('of', 766973),
 ('to', 632956),
 ('and', 576062),
 ('in', 429308),
 ('for', 368177),
 ('a', 364638),
 ('be', 332047),
 ('mg', 324408),
 ('or', 316099),
 ('with', 313602),
 ('dose', 312502),
 ('is', 273852),
 ('patients', 241008),
 ('should', 200041),
 ('dosage', 196182),
 ('not', 180740),
 ('daily', 147828),
 ('2', 147291),
 ('may', 143369),
 ('as', 141395),
 ('tablets', 136328),
 ('use', 120466),
 ('at', 119615),
 ('recommended', 115019),
 ('(', 107096),
 ('by', 98693),
 ('on', 96128),
 ('than', 91923),
 ('every', 90531),
 ('treatment', 88934),
 ('if', 88123),
 ('once', 87774),
 ('1', 86247),
 ('years', 86111),
 ('hours', 84542),
 ('administration', 82241),
 ('12', 80765),
 ('10', 78624),
 ('doses', 76136),
 ('children', 75528),
 ('after', 75312),
 ('are', 74809),
 ('clinical', 74445),
 ('day', 72680),
 ('4', 70445),
 (')', 70067),
 ('•', 68981),
 ('[see', 67321),
 ('5', 63609)]

The top 50 types contain many stopwords. A few of them contain punctuation.

Let's look for other types containing punctuation.

In [12]:
punct_set = set(punctuation)

In [13]:
def contains_punct(text):
    """Return True if at least one character of ``text`` is in ``punct_set``.

    ``punct_set`` is the notebook-level set of punctuation characters.
    An empty ``text`` yields False.
    """
    return any(ch in punct_set for ch in text)

In [14]:
types_with_punct = {t: count for t, count in token_counts.items() if contains_punct(t)}

In [15]:
types_with_punct = sorted(types_with_punct.items(), key=lambda item: item[1], reverse=True)

In [16]:
types_with_punct[:50]

[('(', 107096),
 (')', 70067),
 ('[see', 67321),
 ('mg/day', 45478),
 ('.', 44102),
 ('2.1', 36997),
 ('daily.', 36043),
 ('2.2', 32955),
 ('mg/kg', 32076),
 (',', 31332),
 ('extended-release', 28791),
 ('(see', 27654),
 (').', 24188),
 ('2.5', 24164),
 ('2.3', 24126),
 ('day.', 22041),
 (']', 20045),
 ('].', 19417),
 ('dose.', 18565),
 ('2.4', 18508),
 ('days.', 17240),
 ('hours.', 15746),
 (')]', 15001),
 ('-', 14985),
 ('mg/kg/day', 13208),
 ('injection,', 13024),
 ('tablets,', 12702),
 ('(e.g.,', 12002),
 ('mg,', 11676),
 ('delayed-release', 11673),
 ('dose,', 11585),
 ('age:', 11564),
 ('daily,', 11471),
 ('patients,', 11062),
 ('mg.', 10720),
 ('however,', 10551),
 ('and/or', 10336),
 ('doses.', 10237),
 (')].', 10141),
 ('mg/m', 10090),
 ('day,', 9974),
 ('therapy.', 9582),
 ('mg/day.', 9427),
 ('hours,', 9296),
 ('weeks.', 9241),
 ('[', 9064),
 ('patients.', 8966),
 ('response.', 8960),
 ('recommended.', 8908),
 ('patient.', 8801)]

It seems fairly common for words to be combined with '/'. Let's take a look at these specifically. 

In [17]:
# types_with_punct is already a list of (type, count) pairs sorted by count,
# so it can be filtered directly without a round trip through dict().
[(t, count) for t, count in types_with_punct if '/' in t][:50]

[('mg/day', 45478),
 ('mg/kg', 32076),
 ('mg/kg/day', 13208),
 ('and/or', 10336),
 ('mg/m', 10090),
 ('mg/day.', 9427),
 ('ml/min', 8135),
 ('mg/ml', 6076),
 ('mg/day,', 4360),
 ('ml/min/1.73', 3452),
 ('mg/kg/day,', 3080),
 ('mg/5', 3002),
 ('mcg/kg/day', 2786),
 ('mg/125', 2449),
 ('ml/min)', 2313),
 ('1/2', 2197),
 ('ml/min,', 2067),
 ('mcg/kg/min', 1996),
 ('mcg/ml', 1935),
 ('mg/kg/day.', 1918),
 ('lopinavir/ritonavir', 1880),
 ('mg/day)', 1835),
 ('(ml/min)', 1764),
 ('ml/min.', 1741),
 ('ml/minute/1.73', 1728),
 ('mg/day).', 1677),
 ('atazanavir/ritonavir', 1666),
 ('(mg/day)', 1633),
 ('/', 1570),
 ('mg/ml)', 1527),
 ('/l', 1501),
 ('mg/dl', 1443),
 ('pharyngitis/tonsillitis', 1301),
 ('mcg/kg', 1168),
 ('mg/25', 1158),
 ('ng/ml', 1063),
 ('ml/min/1.73m', 905),
 ('cells/mm', 892),
 ('mg/kg)', 887),
 ('mcg/day', 869),
 ('ml/min),', 866),
 ('mg/kg,', 855),
 ('ml/minute', 839),
 ('ml/h', 807),
 ('caregiver/family', 792),
 ('mg/kg.', 789),
 ('(olanzapine/fluoxetine)', 766),
 ('mg/m

Most of these represent units of measurement (e.g. 'mg/day'). However, some of them represent combinations of distinct concepts (e.g. 'caregiver/family', 'pharyngitis/tonsillitis'). Splitting on whitespace would treat these as a single token, which would add unnecessary noise to the corpus. Let's try splitting on whitespace AND on '/'.  

## Split on Whitespace and '/'

In [18]:
def tokenize(text):
    """Split ``text`` into tokens on whitespace and forward slashes.

    Every whitespace character and every '/' acts as a separator. The empty
    strings produced by adjacent separators, or by a separator at either end
    of the text, are discarded.

    Returns the non-empty tokens as a list, in their original order.
    """
    pieces = re.split(r'[\s/]', text)
    return [piece for piece in pieces if piece]

In [19]:
tokenize('mg/day foo bar')

['mg', 'day', 'foo', 'bar']

In [20]:
pipeline = [str.lower, tokenize]

In [21]:
df['tokens_slash'] = df['text'].apply(prepare, pipeline=pipeline)

In [22]:
df.head()

Unnamed: 0,target,text,tokens,tokens_slash
0,ORAL,"DOSAGE Adults- Take 4 or 6 Pellets by mouth, t...","[dosage, adults-, take, 4, or, 6, pellets, by,...","[dosage, adults-, take, 4, or, 6, pellets, by,..."
1,ORAL,DIRECTIONS Adults: Dissolve 3 to 5 under the t...,"[directions, adults:, dissolve, 3, to, 5, unde...","[directions, adults:, dissolve, 3, to, 5, unde..."
2,OPHTHALMIC,DOSAGE AND ADMINISTRATION The recommended dosa...,"[dosage, and, administration, the, recommended...","[dosage, and, administration, the, recommended..."
3,ORAL,2 DOSAGE AND ADMINISTRATION Use the lowest eff...,"[2, dosage, and, administration, use, the, low...","[2, dosage, and, administration, use, the, low..."
4,TOPICAL,"Directions wet face, apply to hand, massage fa...","[directions, wet, face,, apply, to, hand,, mas...","[directions, wet, face,, apply, to, hand,, mas..."


In [23]:
# create list of all tokens
all_tokens = []
df['tokens_slash'].apply(lambda x: all_tokens.extend(x))
print(len(all_tokens))

24706162


In [24]:
token_counts = Counter(all_tokens)

In [25]:
types = token_counts.keys()

In [26]:
print("Splitting on whitespace and '/' yields {0} tokens and {1} types.".format(token_counts.total(), len(types)))

Splitting on whitespace and '/' yields 24706162 tokens and 61730 types.


In [27]:
token_counts.most_common(50)

[('the', 847932),
 ('of', 766973),
 ('to', 632956),
 ('and', 586407),
 ('mg', 483864),
 ('in', 429311),
 ('for', 368191),
 ('a', 365153),
 ('be', 332047),
 ('or', 326489),
 ('with', 313603),
 ('dose', 313524),
 ('is', 273852),
 ('patients', 241210),
 ('should', 200041),
 ('dosage', 196207),
 ('not', 180742),
 ('2', 150502),
 ('daily', 147906),
 ('may', 143369),
 ('as', 141395),
 ('day', 138059),
 ('tablets', 136446),
 ('use', 120487),
 ('at', 119615),
 ('recommended', 115019),
 ('(', 107098),
 ('by', 98693),
 ('on', 96141),
 ('than', 91923),
 ('every', 90538),
 ('1', 89159),
 ('treatment', 88991),
 ('if', 88123),
 ('kg', 87936),
 ('once', 87777),
 ('ml', 87525),
 ('years', 86111),
 ('hours', 84545),
 ('administration', 82250),
 ('12', 80792),
 ('10', 79699),
 ('doses', 76143),
 ('after', 75997),
 ('children', 75554),
 ('are', 74809),
 ('clinical', 74445),
 ('4', 71093),
 (')', 70067),
 ('•', 68981)]

## Split on Whitespace and '-'

Some of the most common types also included '-'. Let's take a look at them. 

In [28]:
{t: count for t, count in token_counts.items() if '-' in t}

{'adults-': 112,
 'older-': 108,
 '(one-half': 184,
 'naproxen-containing': 202,
 'long-term': 5248,
 'anti-inflammatory': 813,
 'solid-oral': 128,
 'weight-based': 1212,
 'one-half': 2497,
 'non-interchangeability': 232,
 '3-4': 1266,
 '2-11:': 524,
 'fda-approved': 210,
 '(weight-adjusted': 2,
 '-': 14998,
 'non-cutaneous': 4,
 '0-1,': 26,
 'half-life': 2089,
 'radio-isotope': 25,
 'thyroid-pituitary': 50,
 '26-ml': 11,
 'back-and-forth': 58,
 '24-hour': 4547,
 'high-potency': 19,
 'hypothalamic-pituitary-adrenal': 378,
 '2-3': 1161,
 'angiotensin-converting': 1,
 'twice-daily.': 2,
 'film-coated': 196,
 'ora-plus': 489,
 'ora-sweet': 773,
 'press-in': 12,
 '(child-pugh': 3220,
 '5-10': 470,
 '1-5': 541,
 'well-controlled': 458,
 'end-stage': 917,
 'pre-': 130,
 'high-pressure': 49,
 '(mria-sp).': 5,
 'cyclosporine-therapeutic': 5,
 '50-200': 78,
 '50-150': 11,
 'obsessive-compulsive': 51,
 '6-12)': 35,
 '13-17).': 11,
 '25-200': 11,
 '(6-17': 11,
 'non-intravenous': 1288,
 '14-day':

In contrast to words combined with '/', words combined with '-' tend to represent a single concept. Splitting them into separate tokens would lose important information (e.g. 'non-psychotic').

## Final Tokenization

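Based on the above analysis, we will lowercase the text, remove punctuation (except '/') and digits, and then split on whitespace and '/'. Hyphens are removed rather than split on, so a hyphenated word stays a single token (e.g. 'fool-proof' becomes 'foolproof').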

In [29]:
# Characters to strip from the text. '/' is deliberately left out of the set
# because the tokenizer still needs it as a split point.
punct_set = set(punctuation) - {'/'}

In [30]:
def remove_punctuation(text):
    """Return ``text`` with every character found in ``punct_set`` deleted.

    Characters are dropped rather than replaced with a space, so the text on
    either side of a removed character is joined together. Characters that
    are not in ``punct_set`` (for example the '•' bullet that still shows up
    in the final token counts) pass through unchanged.
    """
    kept_chars = (ch for ch in text if ch not in punct_set)
    return "".join(kept_chars)

In [31]:
def remove_numbers(text):
    """Return ``text`` with every ASCII digit (0-9) deleted.

    Only the digits themselves are removed; the characters around them,
    including whitespace, are left in place.
    """
    digit_run = re.compile(r'[0-9]+')
    return digit_run.sub('', text)

In [32]:
pipeline = [str.lower, remove_punctuation, remove_numbers, tokenize]

In [33]:
prepare('This is a fool-proof example/test sentence!', pipeline=pipeline)

['this', 'is', 'a', 'foolproof', 'example', 'test', 'sentence']

In [34]:
df['tokens_final'] = df['text'].apply(prepare, pipeline=pipeline)

In [35]:
df.head()

Unnamed: 0,target,text,tokens,tokens_slash,tokens_final
0,ORAL,"DOSAGE Adults- Take 4 or 6 Pellets by mouth, t...","[dosage, adults-, take, 4, or, 6, pellets, by,...","[dosage, adults-, take, 4, or, 6, pellets, by,...","[dosage, adults, take, or, pellets, by, mouth,..."
1,ORAL,DIRECTIONS Adults: Dissolve 3 to 5 under the t...,"[directions, adults:, dissolve, 3, to, 5, unde...","[directions, adults:, dissolve, 3, to, 5, unde...","[directions, adults, dissolve, to, under, the,..."
2,OPHTHALMIC,DOSAGE AND ADMINISTRATION The recommended dosa...,"[dosage, and, administration, the, recommended...","[dosage, and, administration, the, recommended...","[dosage, and, administration, the, recommended..."
3,ORAL,2 DOSAGE AND ADMINISTRATION Use the lowest eff...,"[2, dosage, and, administration, use, the, low...","[2, dosage, and, administration, use, the, low...","[dosage, and, administration, use, the, lowest..."
4,TOPICAL,"Directions wet face, apply to hand, massage fa...","[directions, wet, face,, apply, to, hand,, mas...","[directions, wet, face,, apply, to, hand,, mas...","[directions, wet, face, apply, to, hand, massa..."


In [36]:
# create list of all tokens
all_tokens = []
df['tokens_final'].apply(lambda x: all_tokens.extend(x))
print(len(all_tokens))

22303955


In [37]:
token_counts = Counter(all_tokens)

In [38]:
types = token_counts.keys()

In [39]:
print("Splitting on whitespace and '/' and removing punctuation yields {0} tokens and {1} types.".format(token_counts.total(), len(types)))

Splitting on whitespace and '/' and removing punctuation yields 22303955 tokens and 28396 types.


In [40]:
token_counts.most_common(50)

[('the', 848381),
 ('of', 767971),
 ('to', 635599),
 ('and', 588116),
 ('mg', 529540),
 ('in', 430809),
 ('for', 370858),
 ('a', 368651),
 ('dose', 355568),
 ('be', 332076),
 ('or', 330811),
 ('with', 315072),
 ('is', 275053),
 ('patients', 275027),
 ('day', 208233),
 ('dosage', 206718),
 ('daily', 205567),
 ('should', 201577),
 ('not', 182016),
 ('tablets', 162167),
 ('may', 143908),
 ('as', 142490),
 ('use', 131942),
 ('recommended', 127247),
 ('at', 120103),
 ('hours', 114018),
 ('see', 113210),
 ('ml', 111037),
 ('treatment', 103960),
 ('kg', 101015),
 ('by', 99215),
 ('years', 98067),
 ('on', 96310),
 ('administration', 95180),
 ('doses', 94885),
 ('every', 92502),
 ('than', 91982),
 ('if', 90011),
 ('once', 88956),
 ('children', 84188),
 ('therapy', 77042),
 ('after', 76516),
 ('are', 75039),
 ('clinical', 74493),
 ('age', 74382),
 ('injection', 72893),
 ('days', 69300),
 ('•', 69002),
 ('patient', 68905),
 ('adults', 62119)]