In [1]:
import pandas as pd
from collections import Counter
from string import punctuation
import re
import matplotlib.pyplot as plt 
import seaborn as sns
from cleaning import remove_punctuation, remove_numbers, tokenize, remove_stopwords, prepare

import os
from sentence_transformers import SentenceTransformer

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\aabel\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
  from .autonotebook import tqdm as notebook_tqdm


# Load Data

In [2]:
# Load the interim `drugs.pkl` DataFrame (columns shown below: `target`, `text`).
# NOTE: unpickling can execute arbitrary code, so only read pickle files that
# were produced by a trusted step of this project.
df = pd.read_pickle('../data/interim/drugs.pkl')

In [3]:
df.head()

Unnamed: 0,target,text
0,ORAL,"DOSAGE Adults- Take 4 or 6 Pellets by mouth, t..."
1,ORAL,DIRECTIONS Adults: Dissolve 3 to 5 under the t...
2,OPHTHALMIC,DOSAGE AND ADMINISTRATION The recommended dosa...
3,ORAL,2 DOSAGE AND ADMINISTRATION Use the lowest eff...
4,TOPICAL,"Directions wet face, apply to hand, massage fa..."


# Tokenize Text

In [4]:
# Cleaning steps in pipeline order; everything except str.lower is imported
# from the project's `cleaning` module.
# NOTE(review): `prepare` presumably applies the steps left to right — confirm in cleaning.py.
pipeline = [str.lower, remove_punctuation, remove_numbers, tokenize, remove_stopwords]
# Store one token list per row of `text` in the new `tokens` column.
df['tokens'] = df['text'].apply(prepare, pipeline=pipeline)
df.head()

Unnamed: 0,target,text,tokens
0,ORAL,"DOSAGE Adults- Take 4 or 6 Pellets by mouth, t...","[adults, take, pellets, mouth, three, times, d..."
1,ORAL,DIRECTIONS Adults: Dissolve 3 to 5 under the t...,"[adults, dissolve, tongue, three, times, day, ..."
2,OPHTHALMIC,DOSAGE AND ADMINISTRATION The recommended dosa...,"[recommended, regimen, treatment, bacterial, c..."
3,ORAL,2 DOSAGE AND ADMINISTRATION Use the lowest eff...,"[use, lowest, effective, shortest, duration, c..."
4,TOPICAL,"Directions wet face, apply to hand, massage fa...","[wet, face, apply, hand, massage, face, gently..."


The simplest tokenization just splits on whitespace. Let's try this and explore the results. 

In [None]:
pipeline = [str.lower, str.split]

In [None]:
df['tokens'] = df['text'].apply(prepare, pipeline=pipeline)

In [None]:
df.head()

In [None]:
# create list of all tokens
all_tokens = []
df['tokens'].apply(lambda x: all_tokens.extend(x))
print(len(all_tokens))

In [None]:
token_counts = Counter(all_tokens)

In [None]:
types = token_counts.keys()

In [None]:
# Counter.total() is only available on Python 3.10+.
print("Splitting on whitespace yields {0} tokens and {1} types.".format(token_counts.total(), len(types)))

In [None]:
token_counts.most_common(50)

The top 30 types contain many stopwords. A few of them contain punctuation.

Let's look for other types containing punctuation.

In [None]:
punct_set = set(punctuation)

In [None]:
def contains_punct(text):
    """Return True if at least one character of `text` is in the global `punct_set`.

    `punct_set` is looked up when the function is called, so the result follows
    whatever that set currently contains.
    """
    return any(symbol in punct_set for symbol in text)

In [None]:
types_with_punct = {t: count for t, count in token_counts.items() if contains_punct(t)}

In [None]:
# Rebinds `types_with_punct` from a dict to a list of (type, count) pairs,
# ordered by count, highest first.
types_with_punct = sorted(types_with_punct.items(), key=lambda item: item[1], reverse=True)

In [None]:
types_with_punct[:50]

It seems fairly common for words to be combined with '/'. Let's take a look at these specifically. 

In [None]:
# `dict(...)` lets this cell iterate with .items() whether `types_with_punct` is
# still the dict or already the sorted list of pairs; insertion order is kept.
# Shows the first 50 types that contain '/'.
[(t, count) for t, count in dict(types_with_punct).items() if '/' in t][:50]

Most of these represent units of measurement (e.g. 'mg/day'). However, some of them represent combinations of distinct concepts (e.g. 'caregiver/family', 'pharyngitis/tonsillitis'). Splitting on whitespace would treat these as a single token, which would add unnecessary noise to the corpus. Let's try splitting on whitespace AND on '/'.  

In [None]:
def tokenize(text):
    """Split `text` on whitespace and '/' and return the non-empty pieces as a list.

    Note: this definition shadows the `tokenize` imported from `cleaning` at the
    top of the notebook.
    """
    return [piece for piece in re.split(r'[\s/]', text) if piece != '']

In [None]:
tokenize('mg/day foo bar')

In [None]:
pipeline = [str.lower, tokenize]

In [None]:
df['tokens_slash'] = df['text'].apply(prepare, pipeline=pipeline)

In [None]:
df.head()

In [None]:
# create list of all tokens
# (flattened with a comprehension; `apply` was only being used for its side
# effect and built a throwaway Series of None)
all_tokens = [token for tokens in df['tokens_slash'] for token in tokens]
print(len(all_tokens))

In [None]:
token_counts = Counter(all_tokens)

In [None]:
types = token_counts.keys()

In [None]:
# Counter.total() is only available on Python 3.10+.
print("Splitting on whitespace and '/' yields {0} tokens and {1} types.".format(token_counts.total(), len(types)))

In [None]:
token_counts.most_common(50)

Some of the most common types also include '-'. Let's take a look at them.

In [None]:
{t: count for t, count in token_counts.items() if '-' in t}

In contrast to words combined with '/', words combined with '-' tend to represent a single concept. Splitting them into separate tokens would lose important information (e.g. 'non-psychotic').

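Based on the above analysis, we will split on whitespace and '/', and remove all other punctuation. Because '-' is removed rather than used as a split point, hyphenated words stay together as a single token (e.g. 'non-psychotic' becomes 'nonpsychotic').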

In [None]:
punct_set = set(punctuation)
punct_set.remove('/') # don't remove '/' because we need it for tokenization 

In [None]:
def remove_punctuation(text):
    """Return `text` with every character that is in the global `punct_set` dropped.

    Note: this definition shadows the `remove_punctuation` imported from
    `cleaning` at the top of the notebook.
    """
    kept_chars = (ch for ch in text if ch not in punct_set)
    return "".join(kept_chars)

In [None]:
pipeline = [str.lower, remove_punctuation, tokenize]

In [None]:
prepare('This is an example/test sentence!', pipeline=pipeline)

In [None]:
df['tokens_final'] = df['text'].apply(prepare, pipeline=pipeline)

In [None]:
df.head()

In [None]:
# create list of all tokens
# (flattened with a comprehension; `apply` was only being used for its side
# effect and built a throwaway Series of None)
all_tokens = [token for tokens in df['tokens_final'] for token in tokens]
print(len(all_tokens))

In [None]:
token_counts = Counter(all_tokens)

In [None]:
types = token_counts.keys()

In [None]:
# Counter.total() is only available on Python 3.10+.
print("Splitting on whitespace and '/' and removing punctuation yields {0} tokens and {1} types.".format(token_counts.total(), len(types)))

In [None]:
token_counts.most_common(50)

# Vectorization - BERT (Google NLP Model)

In [5]:
## Create the final sentence for each row by joining its `tokens` list, to vectorize via the semantic model

def join_text(tokens):
    """Concatenate `tokens` into one string, separating items with a single space."""
    return ' '.join(tokens)

# Join each row's token list by applying `join_text` to the column itself.
# A row-wise `df.apply(..., axis=1)` builds a Series for every row and returns an
# empty DataFrame (not a Series) when `df` has no rows.
# NOTE: this reads `df['tokens']` and expects it to hold the cleaned,
# stopword-filtered tokens.
df['final_text'] = df['tokens'].apply(join_text)
df.head()

Unnamed: 0,target,text,tokens,final_text
0,ORAL,"DOSAGE Adults- Take 4 or 6 Pellets by mouth, t...","[adults, take, pellets, mouth, three, times, d...",adults take pellets mouth three times daily su...
1,ORAL,DIRECTIONS Adults: Dissolve 3 to 5 under the t...,"[adults, dissolve, tongue, three, times, day, ...",adults dissolve tongue three times day directe...
2,OPHTHALMIC,DOSAGE AND ADMINISTRATION The recommended dosa...,"[recommended, regimen, treatment, bacterial, c...",recommended regimen treatment bacterial conjun...
3,ORAL,2 DOSAGE AND ADMINISTRATION Use the lowest eff...,"[use, lowest, effective, shortest, duration, c...",use lowest effective shortest duration consist...
4,TOPICAL,"Directions wet face, apply to hand, massage fa...","[wet, face, apply, hand, massage, face, gently...",wet face apply hand massage face gently rinse ...


In [7]:
## Load a pretrained BERT-based sentence-embedding model through SentenceTransformer.
## NOTE(review): the checkpoint name suggests BERT-base fine-tuned on NLI + STS-B
## with mean pooling — confirm against the sentence-transformers model listing.
model = SentenceTransformer('bert-base-nli-stsb-mean-tokens')

In [8]:
## Create vector using BERT Model

# Hand `encode` a plain list of strings rather than a pandas Series: a Series is
# indexed by label, so a non-default index could fail or mis-align when the
# encoder looks sentences up by position.
final_texts = df['final_text'].astype(str).tolist()
bert_vector = model.encode(final_texts)
# Store one embedding per DataFrame row.
df['bert_vector'] = list(bert_vector)

df.head()

Unnamed: 0,target,text,tokens,final_text,bert_vector
0,ORAL,"DOSAGE Adults- Take 4 or 6 Pellets by mouth, t...","[adults, take, pellets, mouth, three, times, d...",adults take pellets mouth three times daily su...,"[-0.25533915, 0.98093414, 0.47458115, -0.46995..."
1,ORAL,DIRECTIONS Adults: Dissolve 3 to 5 under the t...,"[adults, dissolve, tongue, three, times, day, ...",adults dissolve tongue three times day directe...,"[-0.32527256, 0.80431247, 0.6453381, 0.1251983..."
2,OPHTHALMIC,DOSAGE AND ADMINISTRATION The recommended dosa...,"[recommended, regimen, treatment, bacterial, c...",recommended regimen treatment bacterial conjun...,"[0.18958697, 0.0632698, 0.662766, 0.13524653, ..."
3,ORAL,2 DOSAGE AND ADMINISTRATION Use the lowest eff...,"[use, lowest, effective, shortest, duration, c...",use lowest effective shortest duration consist...,"[-0.6896909, 0.2681557, 0.34398147, -0.2088281..."
4,TOPICAL,"Directions wet face, apply to hand, massage fa...","[wet, face, apply, hand, massage, face, gently...",wet face apply hand massage face gently rinse ...,"[0.16621587, 0.8684128, 1.0205474, 0.43868583,..."


In [None]:
# Make sure the output directory exists first; `to_pickle` does not create
# missing parent directories.
os.makedirs('../data/processed', exist_ok=True)
df.to_pickle('../data/processed/drugs.pkl')