In [1]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import re
import string
import random

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
#from nltk.collocations import *
#from nltk import FreqDist, word_tokenize

from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
from sklearn import preprocessing

from keras import optimizers
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Input, Activation, LSTM, GRU, Dense, GlobalMaxPool1D, Embedding, Dropout, Conv1D
from keras.models import Sequential, Model
from keras import backend

Using TensorFlow backend.


In [2]:
import os

# NOTE(review): presumably a workaround for the "duplicate OpenMP runtime"
# abort that some TensorFlow/MKL installs hit - confirm it is still needed
# in the environment this notebook runs in.
os.environ.update({'KMP_DUPLICATE_LIB_OK': 'True'})

In [3]:
# Load the drug-review CSV; the path is relative to the notebook's working directory.
df = pd.read_csv('data/drug_review_clean.csv')

In [4]:
# `drugs` is a second name for the same DataFrame object as `df` (no copy is made).
drugs = df

In [29]:
# Most frequently reviewed drug for the 'Birth Control' condition.
# Filters `drugs` directly so this cell does not depend on `top_10_conditions`,
# which is only created in a later cell (a fresh top-to-bottom run would
# otherwise stop here with a NameError).
drugs[drugs.condition == 'Birth Control'].drugName.value_counts().index[0]

'Etonogestrel'

In [115]:
# Keep only the reviews that belong to the ten most frequent conditions,
# re-indexed by condition so later cells can filter on the index.
top_10_conditions = (
    drugs
    .set_index('condition')
    .loc[drugs['condition'].value_counts().index[:10]]
)

In [116]:
#top_10_conditions.groupby(top_10_conditions.index)['drugName'].value_counts()

In [117]:
# For every condition in `top_10_conditions` (in the order the index lists
# them), pick the drug with the largest number of reviews.
top_drug_for_condition = [
    top_10_conditions.loc[top_10_conditions.index == condition, 'drugName']
    .value_counts()
    .index[0]
    for condition in top_10_conditions.index.unique()
]

In [118]:
# Show the selected drugs, one per condition, in the order they were collected.
top_drug_for_condition

['Etonogestrel',
 'Bupropion',
 'Tramadol',
 'Escitalopram',
 'Isotretinoin',
 'Lamotrigine',
 'Zolpidem',
 'Phentermine',
 'Bupropion / naltrexone',
 'Lisdexamfetamine']

In [119]:
# Gather every review (within the top-10 conditions) of each selected drug.
# The per-drug slices are collected first and concatenated once: calling
# pd.concat inside the loop re-copies all previously accumulated rows on
# every iteration.
drug_frames = [
    top_10_conditions[top_10_conditions.drugName == drug]
    for drug in top_drug_for_condition
]
# pd.concat raises ValueError on an empty list, so fall back to an empty
# frame, which is what the incremental version produced in that case.
top_drugs = pd.concat(drug_frames) if drug_frames else pd.DataFrame()

In [129]:
def hot_tokenization(text, num_words=3000):
    """Encode each document as a binary bag-of-words row.

    A fresh Keras ``Tokenizer`` limited to ``num_words`` is fitted on ``text``
    and the same ``text`` is then converted with
    ``texts_to_matrix(mode='binary')``.

    Parameters
    ----------
    text : collection of str
        Documents to fit on and to encode (call sites in this notebook pass
        a DataFrame column).
    num_words : int, default 3000
        Vocabulary limit handed to ``Tokenizer``.

    Returns
    -------
    The matrix produced by ``Tokenizer.texts_to_matrix``.
    """
    bow_tokenizer = Tokenizer(num_words=num_words)
    bow_tokenizer.fit_on_texts(text)
    binary_matrix = bow_tokenizer.texts_to_matrix(text, mode='binary')
    return binary_matrix

def hot_pad_sequences(text, num_words=3000, maxlen=100):
    """Encode documents as integer sequences of a fixed length.

    A fresh Keras ``Tokenizer`` limited to ``num_words`` is fitted on
    ``text``; the documents are converted to lists of word indices and then
    passed through ``pad_sequences`` with the given ``maxlen``.

    Parameters
    ----------
    text : collection of str
        Documents to fit on and to encode.
    num_words : int, default 3000
        Vocabulary limit handed to ``Tokenizer``.
    maxlen : int, default 100
        Length every returned sequence is padded/truncated to.

    Returns
    -------
    The array produced by ``sequence.pad_sequences``.
    """
    seq_tokenizer = Tokenizer(num_words=num_words)
    seq_tokenizer.fit_on_texts(list(text))
    token_id_sequences = seq_tokenizer.texts_to_sequences(text)
    return sequence.pad_sequences(token_id_sequences, maxlen=maxlen)

def hot_label_encoding(labels):
    """One-hot encode class labels.

    The labels are first mapped to integer codes with scikit-learn's
    ``LabelEncoder`` and the codes are then expanded with Keras'
    ``to_categorical``.

    Parameters
    ----------
    labels : array-like
        Class labels (call sites in this notebook pass drug names).

    Returns
    -------
    The one-hot matrix produced by ``to_categorical``.
    """
    encoder = preprocessing.LabelEncoder()
    integer_codes = encoder.fit_transform(labels)
    return to_categorical(integer_codes)

In [121]:
# Binary bag-of-words features (vocabulary limited to 3000) for the dense model.
X_t = hot_tokenization(top_drugs['lemm_review'], num_words=3000)
# One-hot targets built from the drug names.
y = hot_label_encoding(top_drugs['drugName'])
# Hold out 20% of the rows; random_state pins the shuffle.
train, test, label_train, label_test = train_test_split(
    X_t,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Intentionally assigns nothing.
# The padded-sequence features are built in the cell directly before the
# Embedding/Conv1D model further down. Rebuilding them here would overwrite
# the bag-of-words `train`/`test` split that the Dense model in the next cell
# is fitted on: that model declares input_shape=(3000,), while padded
# sequences are only 100 wide, so a top-to-bottom run would fail at model.fit.

In [124]:
# Feed-forward baseline on the 3000-wide bag-of-words input: two ReLU hidden
# layers followed by a 10-unit softmax output.
model = Sequential([
    Dense(50, activation='relu', input_shape=(3000,)),
    Dense(25, activation='relu'),
    Dense(10, activation='softmax'),
])

model.compile(
    loss='categorical_crossentropy',
    optimizer=optimizers.SGD(),
    metrics=['accuracy'],
)

In [127]:
# Fit the dense model; 10% of the training rows are set aside for validation.
history = model.fit(
    x=train,
    y=label_train,
    batch_size=256,
    epochs=20,
    validation_split=0.1,
)
# Per-epoch metrics recorded by Keras during training.
history_dict = history.history

Train on 9332 samples, validate on 1037 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [130]:
# Integer word-index sequences of length 100 for the Embedding/Conv1D model.
X_t = hot_pad_sequences(top_drugs['lemm_review'], num_words=3000, maxlen=100)
# One-hot targets built from the drug names.
y = hot_label_encoding(top_drugs['drugName'])
# Same 80/20 split settings as for the bag-of-words features.
train, test, label_train, label_test = train_test_split(
    X_t,
    y,
    test_size=0.2,
    random_state=42,
)

In [144]:
# Reset the Keras backend session before building a new model.
backend.clear_session()

# Embedding -> 1D convolution -> global max pooling -> small dense head,
# with light dropout (0.1) around the hidden dense layer.
model = Sequential([
    Embedding(input_dim=3000, output_dim=80, input_length=100),
    Conv1D(filters=60, kernel_size=2, activation='relu'),
    GlobalMaxPool1D(),
    Dropout(0.1),
    Dense(30, activation='relu'),
    Dropout(0.1),
    Dense(10, activation='softmax'),
])

model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 100, 80)           240000    
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 99, 60)            9660      
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 60)                0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 60)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 30)                1830      
_________________________________________________________________
dropout_2 (Dropout)          (None, 30)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 10)                310       
Total para

In [145]:
# Fit the convolutional model; 10% of the training rows are set aside for validation.
history = model.fit(
    x=train,
    y=label_train,
    batch_size=80,
    epochs=6,
    validation_split=0.1,
)

Train on 9332 samples, validate on 1037 samples
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


In [147]:
# Ten most-reviewed drugs among the reviews whose condition is 'Birth Control'.
drugs.loc[drugs['condition'] == 'Birth Control', 'drugName'].value_counts().head(10)

Etonogestrel                          4394
Ethinyl estradiol / norethindrone     3081
Levonorgestrel                        2884
Nexplanon                             2883
Ethinyl estradiol / levonorgestrel    2107
Ethinyl estradiol / norgestimate      2097
Implanon                              1496
Mirena                                1320
Skyla                                 1074
Lo Loestrin Fe                         896
Name: drugName, dtype: int64

In [153]:
# All reviews of the ten most-reviewed birth-control drugs, indexed by drug name.
# NOTE(review): rows are selected by drug name from the full `drugs` frame, so
# reviews of these drugs that were written for other conditions are included
# as well - confirm that is intended.
top_birthcontrol_drugs = (
    drugs
    .set_index('drugName')
    .loc[
        drugs.loc[drugs['condition'] == 'Birth Control', 'drugName']
        .value_counts()
        .index[:10]
    ]
)

In [164]:
# Binary bag-of-words features (vocabulary limited to 3000) for the dense model.
X_t = hot_tokenization(top_birthcontrol_drugs['lemm_review'], num_words=3000)
# The drug name is the frame's index here, so the targets come from the index.
y = hot_label_encoding(top_birthcontrol_drugs.index)
# Hold out 20% of the rows; random_state pins the shuffle.
train, test, label_train, label_test = train_test_split(
    X_t,
    y,
    test_size=0.2,
    random_state=42,
)

In [165]:
# Same feed-forward baseline as for the top-conditions task: 3000-wide input,
# two ReLU hidden layers, 10-unit softmax output.
model = Sequential([
    Dense(50, activation='relu', input_shape=(3000,)),
    Dense(25, activation='relu'),
    Dense(10, activation='softmax'),
])

model.compile(
    loss='categorical_crossentropy',
    optimizer=optimizers.SGD(),
    metrics=['accuracy'],
)

In [166]:
# Fit the dense model; 10% of the training rows are set aside for validation.
history = model.fit(
    x=train,
    y=label_train,
    batch_size=256,
    epochs=40,
    validation_split=0.1,
)
# Per-epoch metrics recorded by Keras during training.
history_dict = history.history

Train on 18763 samples, validate on 2085 samples
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


In [169]:
# Integer word-index sequences of length 100 for the Embedding/Conv1D model.
X_t = hot_pad_sequences(top_birthcontrol_drugs['lemm_review'], num_words=3000, maxlen=100)
# The drug name is the frame's index here, so the targets come from the index.
y = hot_label_encoding(top_birthcontrol_drugs.index)
# Same 80/20 split settings as for the bag-of-words features.
train, test, label_train, label_test = train_test_split(
    X_t,
    y,
    test_size=0.2,
    random_state=42,
)

In [170]:
# Reset the Keras backend session before building a new model.
backend.clear_session()

# Same Embedding/Conv1D architecture as before, but with heavier dropout
# (0.4 after pooling, 0.2 after the hidden dense layer).
model = Sequential([
    Embedding(input_dim=3000, output_dim=80, input_length=100),
    Conv1D(filters=60, kernel_size=2, activation='relu'),
    GlobalMaxPool1D(),
    Dropout(0.4),
    Dense(30, activation='relu'),
    Dropout(0.2),
    Dense(10, activation='softmax'),
])

model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 100, 80)           240000    
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 99, 60)            9660      
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 60)                0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 60)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 30)                1830      
_________________________________________________________________
dropout_2 (Dropout)          (None, 30)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 10)                310       
Total para

In [171]:
# Fit the convolutional model; 10% of the training rows are set aside for validation.
history = model.fit(
    x=train,
    y=label_train,
    batch_size=80,
    epochs=6,
    validation_split=0.1,
)

Train on 18763 samples, validate on 2085 samples
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


In [192]:
def hot_pad_sequences(text, num_words=2, maxlen=10):
    """Encode documents as fixed-length integer sequences (demo defaults).

    NOTE: this re-defines ``hot_pad_sequences`` from earlier in the notebook
    with much smaller defaults (``num_words=2``, ``maxlen=10``); after this
    cell runs, calls that omit those arguments use these values instead.

    Parameters
    ----------
    text : collection of str
        Documents to fit the tokenizer on and to encode.
    num_words : int, default 2
        Vocabulary limit handed to ``Tokenizer``.
    maxlen : int, default 10
        Length every returned sequence is padded/truncated to.

    Returns
    -------
    The array produced by ``sequence.pad_sequences``.
    """
    tok = Tokenizer(num_words=num_words)
    tok.fit_on_texts(list(text))
    return sequence.pad_sequences(tok.texts_to_sequences(text), maxlen=maxlen)

In [193]:
# Sanity check of the vocabulary limit with the demo defaults (num_words=2,
# maxlen=10): in the recorded output 'sean' is encoded as index 1 while 'bill'
# is dropped entirely (an all-zero row), i.e. only one word survives.
hot_pad_sequences(['sean','bill'])

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], dtype=int32)