In [1]:
%matplotlib inline
import sys, os, re, csv, math, codecs, numpy as np, pandas as pd

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import BatchNormalization
from keras.layers.advanced_activations import ELU
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, SpatialDropout1D
from keras.layers import MaxPool1D, Flatten, Conv1D, GRU, GlobalAveragePooling1D, GlobalMaxPooling1D
from keras.callbacks import ReduceLROnPlateau, ModelCheckpoint, EarlyStopping
from keras.layers import concatenate
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model
from tensorflow.keras.optimizers import Adam
from keras import initializers, regularizers, constraints, optimizers, layers

In [2]:
# Load the clinical-trial eligibility dataset. The frame is left as the
# cell's last expression so the notebook renders it.
df = pd.read_csv('cancer_clinical_trials.csv')
df

Unnamed: 0,study,condition,qualification
0,study interventions are recombinant CD40-ligand,melanoma skin diagnosis and no active cns met...,0
1,study interventions are Liposomal doxorubicin,colorectal cancer diagnosis and cardiovascular,0
2,study interventions are BI 836909,multiple myeloma diagnosis and indwelling cen...,0
3,study interventions are Immunoglobulins,recurrent fallopian tube carcinoma diagnosis ...,0
4,study interventions are Paclitaxel,stage ovarian cancer diagnosis and patients m...,0
...,...,...,...
999995,study interventions are Pazopanib,carcinoma renal cell diagnosis and pregnant o...,1
999996,study interventions are Dexamethasone 21-phosp...,uveal melanoma diagnosis and presence of any ...,1
999997,study interventions are Camptothecin,rectal cancer diagnosis and creatinine cleara...,1
999998,study interventions are Cyclophosphamide,stage iii non hodgkin lymphoma diagnosis and ...,1


In [3]:
# Hyperparameters shared by the tokenizer, padding and embedding cells.
embed_size = 50        # width of each word vector (the GloVe file used is 50-d)
max_features = 20_000  # vocabulary cap handed to Tokenizer(num_words=...)
maxlen = 100           # fixed sequence length handed to pad_sequences

In [4]:
# Raw training text; missing entries are replaced by the placeholder "_na_".
list_sentences_train = df["condition"].fillna("_na_").values

# Target column(s). Selecting with a list keeps `y` two-dimensional.
list_classes = ["qualification"]
y = df[list_classes].values

In [5]:
# Build the vocabulary from the training text (capped at max_features words),
# then encode every training sentence as a list of integer word ids.
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(list_sentences_train))
list_tokenized_train = tokenizer.texts_to_sequences(list_sentences_train)

In [6]:
# Sanity check: show the first encoded sentence as a single-row 2-D array.
print(np.asarray(list_tokenized_train[0]).reshape(1, -1))

[[ 97 157   2   1  36  62 251 146  54 301 331   3 391]]


In [7]:
# Bring every encoded training sentence to exactly `maxlen` ids.
X_t = pad_sequences(
    list_tokenized_train,
    maxlen=maxlen,
)

In [8]:
def get_coefs(word, *arr):
    """Pair a token with its coefficients converted to a float32 array.

    Parameters
    ----------
    word : the token itself (first field of an embedding line).
    *arr : the remaining fields; each is converted to float32.

    Returns
    -------
    tuple
        ``(word, vector)`` where ``vector`` is a 1-D ``numpy.ndarray`` of
        dtype float32 with one entry per value in ``arr``.
    """
    vector = np.asarray(arr, dtype='float32')
    return word, vector
# Build the {token: float32 vector} lookup from the GloVe text file, where each
# line is "<token> <v1> <v2> ...". The context manager closes the file as soon
# as the dict is built; the original bare open() left the handle open.
with open('glove.6B.50d.txt', encoding="utf8") as glove_file:
    embeddings_index = dict(get_coefs(*o.strip().split()) for o in glove_file)

In [9]:
# Stack all pretrained vectors to measure their overall mean and spread; these
# statistics seed the random initialisation of words missing from GloVe.
# np.stack requires a real sequence: passing the dict view directly has been
# deprecated since NumPy 1.16 (FutureWarning) and is an error in later
# releases, so materialise it as a list first.
all_embs = np.stack(list(embeddings_index.values()))
emb_mean, emb_std = all_embs.mean(), all_embs.std()
emb_mean, emb_std

  if (await self.run_code(code, result,  async_=asy)):


(0.020940226, 0.64410394)

In [10]:
word_index = tokenizer.word_index
# Tokenizer word ids start at 1 (0 is the padding id), so a vocabulary of
# N words needs N + 1 rows. Without the "+ 1" the highest id would fall outside
# the matrix whenever the vocabulary is smaller than max_features.
nb_words = min(max_features, len(word_index) + 1)
# Start every row from noise that matches the GloVe mean/std; rows for words
# that have a pretrained vector are overwritten afterwards.
embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))

In [11]:
# Overwrite the randomly initialised rows with pretrained GloVe vectors for
# every vocabulary word that has one.
n_rows = embedding_matrix.shape[0]
for word, i in word_index.items():
    # Bound the id by the matrix's real row count, which never exceeds
    # max_features. Checking against max_features alone can index past the end
    # when the matrix was allocated with fewer rows.
    if i >= n_rows:
        continue
    embedding_vector = embeddings_index.get(word)
    # Words absent from GloVe keep their random initialisation.
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

# Binary cross-entropy computes the cross-entropy loss between the labels and the predictions. I use it because it works well when there are only two label classes (0 and 1).

In [12]:
# Input: one padded sequence of `maxlen` word ids per sample.
inp = Input(shape=(maxlen,))
# Embedding layer initialised from the GloVe-based matrix.
# NOTE(review): weights=[embedding_matrix] requires the matrix to have exactly
# max_features rows; confirm the vocabulary is at least that large.
x = Embedding(max_features, embed_size, weights=[embedding_matrix])(inp)

x = SpatialDropout1D(0.2)(x)
# Bidirectional GRU that returns the full sequence so the convolution below
# can slide over the time steps.
x = Bidirectional(GRU(128, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(x)
x = Conv1D(64, kernel_size=3, padding="valid", kernel_initializer="glorot_uniform")(x)

# Summarise the convolved sequence with both average and max pooling over
# time, then concatenate the two summaries.
avg_pool = GlobalAveragePooling1D()(x)
max_pool = GlobalMaxPooling1D()(x)
x = concatenate([avg_pool, max_pool])

# Single sigmoid unit: one probability per sample for the binary target.
preds = Dense(1, activation="sigmoid")(x)

model = Model(inp, preds)
# `lr` is a deprecated alias in tf.keras optimizers and triggers a warning;
# `learning_rate` is the supported argument name.
model.compile(loss='binary_crossentropy',
              optimizer=Adam(learning_rate=1e-4),
              metrics=['accuracy'])

  super(Adam, self).__init__(name, **kwargs)


In [14]:
# Halve the learning rate when validation accuracy has not improved for two
# epochs. The model is compiled with metrics=['accuracy'], so tf.keras logs the
# validation metric under the key 'val_accuracy'. Monitoring 'val_acc' matches
# no logged key, so the schedule would never fire.
learning_rate_reduction = ReduceLROnPlateau(monitor='val_accuracy',
                                            patience=2,
                                            verbose=1,
                                            factor=0.5,
                                            min_lr=0.00000001)

# Stop when validation loss stalls for three epochs, and keep only the best
# checkpoint on disk.
callbacks = [learning_rate_reduction,
             EarlyStopping(monitor='val_loss', patience=3),
             ModelCheckpoint('ic_model.h5', save_best_only=True)]

# validation_split holds out the last 10% of the rows, taken before shuffling.
# NOTE(review): the displayed head/tail of `df` show label 0 at the top and
# label 1 at the bottom. If the file is ordered by label, the validation set
# will be single-class; confirm, and shuffle beforehand if so.
history = model.fit(X_t, y, batch_size=64, epochs=2, validation_split=0.1, callbacks=callbacks);

Epoch 1/2
Epoch 2/2


In [15]:
# Load the held-out set and encode it with the tokenizer that was fitted on
# the training text, so word ids agree between the two sets.
test = pd.read_csv('assignment_test.csv')
list_sentences_test = test["condition"].fillna("_na_").values
list_tokenized_test = tokenizer.texts_to_sequences(list_sentences_test)

In [16]:
# Bring every encoded test sentence to the same `maxlen` used for training.
X_te = pad_sequences(
    list_tokenized_test,
    maxlen=maxlen,
)

In [17]:
# Display the padded test matrix (one row of word ids per test sentence).
X_te

array([[   0,    0,    0, ...,   82,   79,   22],
       [   0,    0,    0, ...,   42, 1519, 5397],
       [   0,    0,    0, ..., 1431,   30,  202],
       ...,
       [   0,    0,    0, ...,   12,  142,  146],
       [   0,    0,    0, ...,    3,  521,  529],
       [   0,    0,    0, ...,    2,    1,  144]])

In [21]:
# Score the padded test sequences with the trained model, in batches of 1024.
y_pred = model.predict(
    [X_te],
    batch_size=1024,
    verbose=1,
)



In [23]:
# Display the model's sigmoid outputs for the test set (one value per row).
y_pred

array([[0.9954495 ],
       [0.3494191 ],
       [0.9680229 ],
       ...,
       [0.00700411],
       [0.25054577],
       [0.67711437]], dtype=float32)