In [140]:
import pandas as pd
from collections import Counter
from string import punctuation
import re
import matplotlib.pyplot as plt 
import seaborn as sns
from src.data.cleaning import prepare

# Load Data

In [11]:
df = pd.read_csv('../data/processed/drugs.csv', usecols=['target', 'text'])

In [12]:
df.head()

Unnamed: 0,target,text
0,ORAL,"DOSAGE Adults- Take 4 or 6 Pellets by mouth, t..."
1,ORAL,DIRECTIONS Adults: Dissolve 3 to 5 under the t...
2,OPHTHALMIC,DOSAGE AND ADMINISTRATION The recommended dosa...
3,ORAL,2 DOSAGE AND ADMINISTRATION Use the lowest eff...
4,TOPICAL,"Directions wet face, apply to hand, massage fa..."


# Split on Whitespace

The simplest tokenization just splits on whitespace. Let's try this and explore the results. 

In [67]:
pipeline = [str.lower, str.split]

In [73]:
df['tokens'] = df['text'].apply(prepare, pipeline=pipeline)

In [74]:
df.head()

Unnamed: 0,target,text,tokens
0,ORAL,"DOSAGE Adults- Take 4 or 6 Pellets by mouth, t...","[dosage, adults-, take, 4, or, 6, pellets, by,..."
1,ORAL,DIRECTIONS Adults: Dissolve 3 to 5 under the t...,"[directions, adults:, dissolve, 3, to, 5, unde..."
2,OPHTHALMIC,DOSAGE AND ADMINISTRATION The recommended dosa...,"[dosage, and, administration, the, recommended..."
3,ORAL,2 DOSAGE AND ADMINISTRATION Use the lowest eff...,"[2, dosage, and, administration, use, the, low..."
4,TOPICAL,"Directions wet face, apply to hand, massage fa...","[directions, wet, face,, apply, to, hand,, mas..."


In [75]:
# Flatten the per-document token lists into one list of all tokens.
# A comprehension is used instead of `Series.apply` with a side-effecting
# lambda, which also built a throwaway Series of None values.
all_tokens = [token for doc_tokens in df['tokens'] for token in doc_tokens]
print(len(all_tokens))

24417311


In [76]:
token_counts = Counter(all_tokens)

In [77]:
types = token_counts.keys()

In [78]:
print("Splitting on whitespace yields {0} tokens and {1} types.".format(token_counts.total(), len(types)))

Splitting on whitespace yields 24417311 tokens and 64217 types.


In [117]:
token_counts.most_common(50)

[('the', 848509),
 ('of', 767519),
 ('to', 633353),
 ('and', 575928),
 ('in', 429169),
 ('for', 368567),
 ('a', 364741),
 ('be', 332954),
 ('mg', 324956),
 ('or', 316372),
 ('with', 314054),
 ('dose', 312668),
 ('is', 274114),
 ('patients', 240936),
 ('should', 200650),
 ('dosage', 196129),
 ('not', 180926),
 ('daily', 147901),
 ('2', 147387),
 ('may', 143437),
 ('as', 141395),
 ('tablets', 136138),
 ('use', 120613),
 ('at', 119566),
 ('recommended', 115114),
 ('(', 107196),
 ('by', 98774),
 ('on', 96163),
 ('than', 91941),
 ('every', 90590),
 ('treatment', 89060),
 ('if', 88278),
 ('once', 87691),
 ('1', 86374),
 ('years', 86196),
 ('hours', 84761),
 ('administration', 82238),
 ('12', 80947),
 ('10', 78650),
 ('doses', 76299),
 ('children', 75698),
 ('after', 75312),
 ('are', 74851),
 ('clinical', 74396),
 ('day', 72844),
 ('4', 70366),
 (')', 70238),
 ('•', 68972),
 ('[see', 67113),
 ('5', 63631)]

The top 50 types contain many stopwords. A few of them contain punctuation. 

Let's look for other types containing punctuation.

In [152]:
punct_set = set(punctuation)

In [81]:
def contains_punct(text, punct=frozenset(punctuation)):
    """Return True if `text` contains at least one punctuation character.

    Args:
        text: The string (typically a single token) to inspect.
        punct: Characters that count as punctuation. Defaults to every
            character in `string.punctuation`. The default is bound once,
            when the function is defined, so the result does not depend on
            the notebook-level `punct_set` variable, which a later cell
            rebinds and mutates.

    Returns:
        True if any character of `text` is in `punct`, otherwise False
        (including for the empty string).
    """
    return any(char in punct for char in text)

In [104]:
types_with_punct = {t: count for t, count in token_counts.items() if contains_punct(t)}

In [109]:
types_with_punct = sorted(types_with_punct.items(), key=lambda item: item[1], reverse=True)

In [118]:
types_with_punct[:50]

[('(', 107196),
 (')', 70238),
 ('[see', 67113),
 ('mg/day', 45543),
 ('.', 44068),
 ('2.1', 37037),
 ('daily.', 36106),
 ('2.2', 32983),
 ('mg/kg', 32103),
 (',', 31304),
 ('extended-release', 28875),
 ('(see', 27702),
 (').', 24255),
 ('2.3', 24155),
 ('2.5', 24137),
 ('day.', 21954),
 (']', 19901),
 ('].', 19495),
 ('dose.', 18599),
 ('2.4', 18500),
 ('days.', 17312),
 ('hours.', 15777),
 (')]', 15025),
 ('-', 14980),
 ('mg/kg/day', 13248),
 ('injection,', 13092),
 ('tablets,', 12756),
 ('(e.g.,', 11997),
 ('mg,', 11711),
 ('delayed-release', 11687),
 ('dose,', 11585),
 ('age:', 11575),
 ('daily,', 11490),
 ('patients,', 11078),
 ('mg.', 10734),
 ('however,', 10581),
 ('and/or', 10315),
 ('doses.', 10265),
 ('mg/m', 10133),
 (')].', 10112),
 ('day,', 9980),
 ('therapy.', 9594),
 ('mg/day.', 9437),
 ('hours,', 9301),
 ('weeks.', 9253),
 ('[', 9148),
 ('response.', 8953),
 ('recommended.', 8929),
 ('patients.', 8914),
 ('patient.', 8805)]

It seems fairly common for words to be combined with '/'. Let's take a look at these specifically. 

In [116]:
# types_with_punct is a list of (type, count) pairs sorted by count, so it can be filtered directly.
[pair for pair in types_with_punct if '/' in pair[0]][:50]

[('mg/day', 45543),
 ('mg/kg', 32103),
 ('mg/kg/day', 13248),
 ('and/or', 10315),
 ('mg/m', 10133),
 ('mg/day.', 9437),
 ('ml/min', 8163),
 ('mg/ml', 6077),
 ('mg/day,', 4372),
 ('ml/min/1.73', 3477),
 ('mg/kg/day,', 3105),
 ('mg/5', 3033),
 ('mcg/kg/day', 2646),
 ('mg/125', 2465),
 ('ml/min)', 2317),
 ('1/2', 2201),
 ('ml/min,', 2060),
 ('mcg/kg/min', 2004),
 ('mcg/ml', 1951),
 ('mg/kg/day.', 1939),
 ('lopinavir/ritonavir', 1880),
 ('mg/day)', 1827),
 ('(ml/min)', 1774),
 ('ml/min.', 1750),
 ('ml/minute/1.73', 1724),
 ('mg/day).', 1667),
 ('atazanavir/ritonavir', 1666),
 ('(mg/day)', 1632),
 ('/', 1584),
 ('/l', 1505),
 ('mg/ml)', 1496),
 ('mg/dl', 1450),
 ('pharyngitis/tonsillitis', 1293),
 ('mcg/kg', 1165),
 ('mg/25', 1155),
 ('ng/ml', 1057),
 ('ml/min/1.73m', 907),
 ('cells/mm', 891),
 ('mg/kg)', 888),
 ('ml/min),', 874),
 ('mg/kg,', 858),
 ('mcg/day', 858),
 ('ml/minute', 836),
 ('caregiver/family', 791),
 ('mg/kg.', 786),
 ('mg/ml.', 773),
 ('(olanzapine/fluoxetine)', 762),
 ('(m

Most of these represent units of measurement (e.g. 'mg/day'). However, some of them represent combinations of distinct concepts (e.g. 'caregiver/family', 'pharyngitis/tonsillitis'). Splitting on whitespace would treat these as a single token, which would add unnecessary noise to the corpus. Let's try splitting on whitespace AND on '/'.  

# Split on Whitespace and '/'

In [186]:
def tokenize(text):
    """Split `text` into tokens on whitespace characters and '/'.

    Empty strings produced by leading, trailing, or consecutive separators
    are discarded, so the result contains only non-empty tokens.
    """
    separator = re.compile(r'[\s/]')
    return [piece for piece in separator.split(text) if piece]

In [154]:
tokenize('mg/day foo bar')

['mg', 'day', 'foo', 'bar']

In [155]:
pipeline = [str.lower, tokenize]

In [156]:
df['tokens_slash'] = df['text'].apply(prepare, pipeline=pipeline)

In [157]:
df.head()

Unnamed: 0,target,text,tokens,tokens_slash
0,ORAL,"DOSAGE Adults- Take 4 or 6 Pellets by mouth, t...","[dosage, adults-, take, 4, or, 6, pellets, by,...","[dosage, adults-, take, 4, or, 6, pellets, by,..."
1,ORAL,DIRECTIONS Adults: Dissolve 3 to 5 under the t...,"[directions, adults:, dissolve, 3, to, 5, unde...","[directions, adults:, dissolve, 3, to, 5, unde..."
2,OPHTHALMIC,DOSAGE AND ADMINISTRATION The recommended dosa...,"[dosage, and, administration, the, recommended...","[dosage, and, administration, the, recommended..."
3,ORAL,2 DOSAGE AND ADMINISTRATION Use the lowest eff...,"[2, dosage, and, administration, use, the, low...","[2, dosage, and, administration, use, the, low..."
4,TOPICAL,"Directions wet face, apply to hand, massage fa...","[directions, wet, face,, apply, to, hand,, mas...","[directions, wet, face,, apply, to, hand,, mas..."


In [158]:
# Flatten the per-document token lists into one list of all tokens.
# A comprehension is used instead of `Series.apply` with a side-effecting
# lambda, which also built a throwaway Series of None values.
all_tokens = [token for doc_tokens in df['tokens_slash'] for token in doc_tokens]
print(len(all_tokens))

24731152


In [159]:
token_counts = Counter(all_tokens)

In [160]:
types = token_counts.keys()

In [161]:
print("Splitting on whitespace and '/' yields {0} tokens and {1} types.".format(token_counts.total(), len(types)))

Splitting on whitespace and '/' yields 24731152 tokens and 61765 types.


In [162]:
token_counts.most_common(50)

[('the', 848509),
 ('of', 767519),
 ('to', 633353),
 ('and', 586252),
 ('mg', 484695),
 ('in', 429172),
 ('for', 368581),
 ('a', 365248),
 ('be', 332954),
 ('or', 326739),
 ('with', 314055),
 ('dose', 313695),
 ('is', 274114),
 ('patients', 241135),
 ('should', 200650),
 ('dosage', 196154),
 ('not', 180928),
 ('2', 150602),
 ('daily', 147979),
 ('may', 143437),
 ('as', 141395),
 ('day', 138186),
 ('tablets', 136256),
 ('use', 120634),
 ('at', 119566),
 ('recommended', 115114),
 ('(', 107198),
 ('by', 98774),
 ('on', 96176),
 ('than', 91941),
 ('every', 90597),
 ('1', 89290),
 ('treatment', 89117),
 ('if', 88278),
 ('kg', 87791),
 ('once', 87694),
 ('ml', 87559),
 ('years', 86196),
 ('hours', 84764),
 ('administration', 82247),
 ('12', 80974),
 ('10', 79724),
 ('doses', 76306),
 ('after', 75997),
 ('children', 75723),
 ('are', 74851),
 ('clinical', 74396),
 ('4', 71014),
 (')', 70238),
 ('•', 68972)]

# Split on Whitespace and '-'

Some of the most common types also included '-'. Let's take a look at them. 

In [164]:
{t: count for t, count in token_counts.items() if '-' in t}

{'adults-': 112,
 'older-': 108,
 '(one-half': 184,
 'naproxen-containing': 202,
 'long-term': 5254,
 'anti-inflammatory': 817,
 'solid-oral': 128,
 'weight-based': 1215,
 'one-half': 2485,
 'non-interchangeability': 233,
 '3-4': 1244,
 '2-11:': 523,
 'fda-approved': 209,
 '(weight-adjusted': 2,
 '-': 14993,
 'non-cutaneous': 4,
 '0-1,': 26,
 'half-life': 2096,
 'radio-isotope': 25,
 'thyroid-pituitary': 50,
 '26-ml': 11,
 'back-and-forth': 58,
 '24-hour': 4566,
 'high-potency': 19,
 'hypothalamic-pituitary-adrenal': 379,
 '2-3': 1166,
 'angiotensin-converting': 1,
 'twice-daily.': 2,
 'film-coated': 197,
 'ora-plus': 493,
 'ora-sweet': 773,
 'press-in': 12,
 '(child-pugh': 3215,
 '5-10': 473,
 '1-5': 542,
 'well-controlled': 462,
 'end-stage': 918,
 'pre-': 132,
 'high-pressure': 50,
 '(mria-sp).': 5,
 'cyclosporine-therapeutic': 5,
 '50-200': 78,
 '50-150': 11,
 'obsessive-compulsive': 51,
 '6-12)': 35,
 '13-17).': 11,
 '25-200': 11,
 '(6-17': 11,
 'non-intravenous': 1289,
 '14-day':

In contrast to words combined with '/', words combined with '-' tend to represent a single concept. Splitting them into separate tokens would lose important information (e.g. 'non-psychotic'). 

# Final Tokenization

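Based on the above analysis, we will split on whitespace and '/', and remove all other punctuation. Note that removing '-' joins the parts of a hyphenated word into a single token (e.g. 'long-term' becomes 'longterm') rather than splitting them, and that removing '.' also drops decimal points (e.g. '2.5' becomes '25'). 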

In [168]:
punct_set = set(punctuation)
punct_set.remove('/') # don't remove '/' because we need it for tokenization 

In [169]:
def remove_punctuation(text, punct=frozenset(punctuation) - {'/'}):
    """Return `text` with punctuation characters deleted.

    Args:
        text: The string to clean.
        punct: Characters to delete. Defaults to every character in
            `string.punctuation` except '/', which is kept because the
            tokenizer splits on it. The default is bound once, when the
            function is defined, so re-running an earlier cell that rebinds
            the notebook-level `punct_set` cannot make this function strip
            '/' by accident.

    Returns:
        A new string with every character found in `punct` removed. Removed
        characters are deleted, not replaced by a space, so 'long-term'
        becomes 'longterm' and '2.5' becomes '25'.
    """
    return "".join(char for char in text if char not in punct)

In [170]:
pipeline = [str.lower, remove_punctuation, tokenize]

In [171]:
prepare('This is an example/test sentence!', pipeline=pipeline)

['this', 'is', 'an', 'example', 'test', 'sentence']

In [172]:
df['tokens_final'] = df['text'].apply(prepare, pipeline=pipeline)

In [173]:
df.head()

Unnamed: 0,target,text,tokens,tokens_slash,tokens_final
0,ORAL,"DOSAGE Adults- Take 4 or 6 Pellets by mouth, t...","[dosage, adults-, take, 4, or, 6, pellets, by,...","[dosage, adults-, take, 4, or, 6, pellets, by,...","[dosage, adults, take, 4, or, 6, pellets, by, ..."
1,ORAL,DIRECTIONS Adults: Dissolve 3 to 5 under the t...,"[directions, adults:, dissolve, 3, to, 5, unde...","[directions, adults:, dissolve, 3, to, 5, unde...","[directions, adults, dissolve, 3, to, 5, under..."
2,OPHTHALMIC,DOSAGE AND ADMINISTRATION The recommended dosa...,"[dosage, and, administration, the, recommended...","[dosage, and, administration, the, recommended...","[dosage, and, administration, the, recommended..."
3,ORAL,2 DOSAGE AND ADMINISTRATION Use the lowest eff...,"[2, dosage, and, administration, use, the, low...","[2, dosage, and, administration, use, the, low...","[2, dosage, and, administration, use, the, low..."
4,TOPICAL,"Directions wet face, apply to hand, massage fa...","[directions, wet, face,, apply, to, hand,, mas...","[directions, wet, face,, apply, to, hand,, mas...","[directions, wet, face, apply, to, hand, massa..."


In [174]:
# Flatten the per-document token lists into one list of all tokens.
# A comprehension is used instead of `Series.apply` with a side-effecting
# lambda, which also built a throwaway Series of None values.
all_tokens = [token for doc_tokens in df['tokens_final'] for token in doc_tokens]
print(len(all_tokens))

24731152


In [175]:
token_counts = Counter(all_tokens)

In [176]:
types = token_counts.keys()

In [178]:
print("Splitting on whitespace and '/' and removing punctuation yields {0} tokens and {1} types.".format(token_counts.total(), len(types)))

Splitting on whitespace and '/' and removing punctuation yields 24731152 tokens and 34108 types.


In [179]:
token_counts.most_common(50)

[('the', 848938),
 ('of', 768502),
 ('to', 635549),
 ('and', 587906),
 ('mg', 523998),
 ('in', 430655),
 ('', 426939),
 ('for', 371221),
 ('a', 368349),
 ('dose', 355718),
 ('be', 332983),
 ('or', 330966),
 ('with', 315488),
 ('is', 275316),
 ('patients', 274922),
 ('dosage', 206659),
 ('daily', 205683),
 ('day', 203796),
 ('should', 202188),
 ('not', 182203),
 ('2', 171258),
 ('tablets', 162035),
 ('may', 143986),
 ('as', 142470),
 ('use', 132080),
 ('recommended', 127337),
 ('at', 120059),
 ('1', 115875),
 ('hours', 114164),
 ('see', 113131),
 ('ml', 109633),
 ('treatment', 104055),
 ('kg', 100482),
 ('by', 99287),
 ('years', 98070),
 ('on', 96324),
 ('administration', 95210),
 ('doses', 95075),
 ('every', 92464),
 ('than', 91943),
 ('if', 90163),
 ('once', 88880),
 ('10', 88818),
 ('12', 87342),
 ('children', 84313),
 ('5', 81334),
 ('4', 78610),
 ('therapy', 77186),
 ('after', 76517),
 ('are', 75080)]