# Setup

In [1]:
import sys
import os

import re
import collections
import itertools
import bcolz
import pickle
sys.path.append('../../lib')

import numpy as np
import pandas as pd
import gc
import random
import smart_open
import h5py
import csv
import json
import functools
import time
import string

import datetime as dt
from tqdm import tqdm_notebook as tqdm

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

import global_utils

# Single seed shared by every stochastic step in this notebook.
random_state_number = 967898

# Seed the global Python and NumPy generators as well, so that a fresh
# "Restart & Run All" draws the same random numbers every time.
random.seed(random_state_number)
np.random.seed(random_state_number)

In [2]:
import tensorflow as tf
from tensorflow.python.client import device_lib
def get_available_gpus():
    """Return the names of the local devices whose device_type is 'GPU'."""
    gpu_names = []
    for device in device_lib.list_local_devices():
        # Skip every non-GPU device reported by TensorFlow.
        if device.device_type == 'GPU':
            gpu_names.append(device.name)
    return gpu_names

# allow_growth=True asks TensorFlow to grow its GPU memory allocation on demand
# instead of reserving it all when the session starts.
config = tf.ConfigProto(gpu_options=tf.GPUOptions(allow_growth=True))
sess = tf.Session(config=config)

# Last expression of the cell: list the GPUs TensorFlow can see.
get_available_gpus()

['/gpu:0', '/gpu:1']

In [3]:
%pylab
%matplotlib inline
%load_ext line_profiler
%load_ext memory_profiler
%load_ext autoreload

Using matplotlib backend: TkAgg
Populating the interactive namespace from numpy and matplotlib


`%matplotlib` prevents importing * from pylab and numpy
  "\n`%matplotlib` prevents importing * from pylab and numpy"


In [4]:
# Turn off the chained-assignment warning and let wide frames show up to 999 columns.
pd.set_option("mode.chained_assignment", None)
pd.set_option("display.max_columns", 999)

# Default seaborn palette, kept in a global for later plotting.
color = sns.color_palette()

# Data

In [5]:
# Load the preprocessed train/test frames.
# Opened read-only so a wrong path raises instead of silently creating an empty
# store, and inside a context manager so the HDF5 handle is closed once both
# frames are in memory. `store` stays bound afterwards, but closed.
with pd.HDFStore('../../data_prep/processed/stage1/data_frames.h5', mode='r') as store:
    train_df = store['train_df']
    test_df = store['test_df']

In [6]:
# Rich preview of the first rows of both frames (train first, then test).
display(train_df.head(), test_df.head())

Unnamed: 0,ID,Gene,Variation,Class,Sentences
0,0,[fam58a],"[truncating, mutations]",1,"[[cyclin-dependent, kinases, , cdks, , regulat..."
1,1,[cbl],[w802*],2,"[[abstract, background, non-small, cell, lung,..."
2,2,[cbl],[q249e],2,"[[abstract, background, non-small, cell, lung,..."
3,3,[cbl],[n454d],3,"[[recent, evidence, has, demonstrated, that, a..."
4,4,[cbl],[l399v],4,"[[oncogenic, mutations, in, the, monomeric, ca..."


Unnamed: 0,ID,Gene,Variation,Sentences
0,0,[acsl4],[r570s],"[[2, this, mutation, resulted, in, a, myelopro..."
1,1,[naglu],[p521l],"[[abstract, the, large, tumor, suppressor, 1, ..."
2,2,[pah],[l333f],"[[vascular, endothelial, growth, factor, recep..."
3,3,[ing1],[a148d],"[[inflammatory, myofibroblastic, tumor, , imt,..."
4,4,[tmem216],[g77a],"[[abstract, retinoblastoma, is, a, pediatric, ..."


In [7]:
# Vocabulary list and word -> index mapping produced by the stage1 data prep.
# The placeholders use the same names the pickle is unpacked into; the previous
# `corpus_vocab_wordidx` placeholder was never assigned or read again.
corpus_vocab_list, corpus_wordidx = None, None
# NOTE: pickle.load executes arbitrary code from the file; only load this file
# when it comes from this project's own data_prep step.
with open('../../data_prep/processed/stage1/vocab_words_wordidx.pkl', 'rb') as f:
    (corpus_vocab_list, corpus_wordidx) = pickle.load(f)
print(len(corpus_vocab_list), len(corpus_wordidx))

352220 352220


# Data Prep

To control the vocabulary, pass an updated `corpus_wordidx` to `GenerateDataset`.

In [8]:
from sklearn.model_selection import train_test_split
# Hold out 10% of the labelled rows for validation, stratified on the Class
# column and seeded with the notebook-wide random state.
x_train_df, x_val_df = train_test_split(
    train_df,
    test_size=0.10,
    random_state=random_state_number,
    stratify=train_df["Class"],
)

print(x_train_df.shape, x_val_df.shape, sep="\n")

(2988, 5)
(333, 5)


In [9]:
from tensorflow.contrib.keras.python.keras.utils import np_utils
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical

Using TensorFlow backend.


In [10]:
# Number of entries in the corpus vocabulary list.
vocab_size = len(corpus_vocab_list)

## T:sent_words

### generate data

In [11]:
# Options handed to GenerateDataset.generate_data for the sentence-level dataset:
# gene, variation and document are all encoded as words, and the document is
# taken in its "sentences" form and divided into multiple units.
custom_unit_dict = dict(
    gene_unit="words",
    variation_unit="words",
    doc_unit="words",
    doc_form="sentences",
    divide_document="multiple_unit",
)

In [12]:
%autoreload
import global_utils
gen_data = global_utils.GenerateDataset(x_train_df, corpus_wordidx)
x_train_21_T, x_train_21_G, x_train_21_V, x_train_21_C = gen_data.generate_data(custom_unit_dict, 
                                                                             has_class=True,
                                                                             add_start_end_tag=True)
del gen_data

In [13]:
# Shape and first element of each encoded training sequence (text, gene, variation, class).
print("Train data")
print(np.shape(x_train_21_T), x_train_21_T[0])
print(np.shape(x_train_21_G), x_train_21_G[0])
print(np.shape(x_train_21_V), x_train_21_V[0])
print(np.shape(x_train_21_C), x_train_21_C[0])

Train data
(2622081,) [364606, 113692, 197002, 330024, 326252, 151042, 75648, 1818, 276247, 61043, 228115, 326252, 74974, 301275, 76659, 326252, 361104, 329709, 253643, 205596, 153283, 326252, 80594, 326252, 113692, 18820, 349251, 59442, 123801, 228752, 245229, 307200, 17105, 60555, 69032, 1818, 274163, 151942, 246684, 222367, 253643, 243777, 274163, 50915, 274163, 12413, 1818, 228752, 364603, 232434, 214275, 235155, 163151, 123801, 101614, 101366, 364607]
(2622081, 3) [364606, 97957, 364607]
(2622081,) [364606, 326252, 364607]
(2622081,) 6


In [14]:
# Encode the validation split with the same options as the training split.
gen_data = global_utils.GenerateDataset(x_val_df, corpus_wordidx)
(x_val_21_T,
 x_val_21_G,
 x_val_21_V,
 x_val_21_C) = gen_data.generate_data(custom_unit_dict,
                                      has_class=True,
                                      add_start_end_tag=True)
# The generator is only needed for this one call.
del gen_data

In [15]:
# Shapes of the encoded validation sequences, plus the first gene/variation/class entry.
print("Val data")
print("text", np.shape(x_val_21_T))
print("gene", np.shape(x_val_21_G), x_val_21_G[0])
print("variation", np.shape(x_val_21_V), x_val_21_V[0])
print("classes", np.shape(x_val_21_C), x_val_21_C[0])

Val data
text (293702,)
gene (293702, 3) [364606, 112978, 364607]
variation (293702,) [364606, 295010, 364607]
classes (293702,) 2


### format data

In [16]:
# Index of the "<UNK>" word in the corpus vocabulary; used as the padding value below.
word_unknown_tag_idx = corpus_wordidx["<UNK>"]
# Character-level unknown index, taken as-is from global_utils.
char_unknown_tag_idx = global_utils.char_unknown_tag_idx

In [17]:
# Every encoded sentence is padded or truncated to this many word indices
# by the pad_sequences calls that follow.
MAX_SENT_LEN = 60

In [18]:
# Bring every sentence to exactly MAX_SENT_LEN indices: shorter ones are filled
# at the end with the <UNK> index, longer ones are cut at the end.
x_train_21_T = pad_sequences(
    x_train_21_T,
    maxlen=MAX_SENT_LEN,
    value=word_unknown_tag_idx,
    padding="post",
    truncating="post",
)
x_val_21_T = pad_sequences(
    x_val_21_T,
    maxlen=MAX_SENT_LEN,
    value=word_unknown_tag_idx,
    padding="post",
    truncating="post",
)
print(x_train_21_T.shape, x_val_21_T.shape)

(2622081, 60) (293702, 60)


Keras `np_utils.to_categorical` expects zero-indexed class labels, so the 1-based `Class` values are shifted down by one before one-hot encoding.

https://github.com/fchollet/keras/issues/570

In [19]:
# Shift the class labels down by one so they start at 0.
# Run this cell once only: re-running it shifts the labels again.
x_train_21_C = np.asarray(x_train_21_C) - 1
x_val_21_C = np.asarray(x_val_21_C) - 1

In [20]:
# One-hot encode the zero-based labels into 9 columns.
x_train_21_C = np_utils.to_categorical(np.array(x_train_21_C), num_classes=9)
x_val_21_C = np_utils.to_categorical(np.array(x_val_21_C), num_classes=9)
print(x_train_21_C.shape, x_val_21_C.shape)

(2622081, 9) (293702, 9)


## T:text_words

### generate data

In [11]:
# Options handed to GenerateDataset.generate_data for the document-level dataset:
# gene, variation and document are all encoded as words, and the document is
# taken in its "text" form and kept as a single unit.
custom_unit_dict = dict(
    gene_unit="words",
    variation_unit="words",
    doc_unit="words",
    doc_form="text",
    divide_document="single_unit",
)

In [12]:
%autoreload
import global_utils
gen_data = global_utils.GenerateDataset(x_train_df, corpus_wordidx)
x_train_22_T, x_train_22_G, x_train_22_V, x_train_22_C = gen_data.generate_data(custom_unit_dict, 
                                                                             has_class=True,
                                                                             add_start_end_tag=True)
del gen_data

In [13]:
# Shapes of the encoded training sequences, plus the first gene/variation/class entry.
print("Train data")
print("text", np.shape(x_train_22_T))
print("gene", np.shape(x_train_22_G), x_train_22_G[0])
print("variation", np.shape(x_train_22_V), x_train_22_V[0])
print("classes", np.shape(x_train_22_C), x_train_22_C[0])

Train data
text (2988,)
gene (2988, 3) [352216, 164788, 352217]
variation (2988,) [352216, 86196, 352217]
classes (2988,) 4


In [14]:
%autoreload
gen_data = global_utils.GenerateDataset(x_val_df, corpus_wordidx)
x_val_22_T, x_val_22_G, x_val_22_V, x_val_22_C = gen_data.generate_data(custom_unit_dict, 
                                                                             has_class=True,
                                                                             add_start_end_tag=True)
del gen_data

In [15]:
# Shapes of the encoded validation sequences, plus the first gene/variation/class entry.
print("Val data")
print("text", np.shape(x_val_22_T))
print("gene", np.shape(x_val_22_G), x_val_22_G[0])
print("variation", np.shape(x_val_22_V), x_val_22_V[0])
print("classes", np.shape(x_val_22_C), x_val_22_C[0])

Val data
text (333,)
gene (333, 3) [352216, 217983, 352217]
variation (333,) [352216, 41934, 352217]
classes (333,) 4


### format data

In [16]:
# Index of the "<UNK>" word in the corpus vocabulary; used as the padding value below.
word_unknown_tag_idx = corpus_wordidx["<UNK>"]
# Character-level unknown index, taken as-is from global_utils.
char_unknown_tag_idx = global_utils.char_unknown_tag_idx

In [17]:
# Every encoded document is padded or truncated to this many word indices;
# the same value is passed to the model as input_sentence_len.
MAX_TEXT_LEN = 5000

In [18]:
# Bring every document to exactly MAX_TEXT_LEN indices: shorter ones are filled
# at the end with the <UNK> index, longer ones are cut at the end.
x_train_22_T = pad_sequences(
    x_train_22_T,
    maxlen=MAX_TEXT_LEN,
    value=word_unknown_tag_idx,
    padding="post",
    truncating="post",
)
x_val_22_T = pad_sequences(
    x_val_22_T,
    maxlen=MAX_TEXT_LEN,
    value=word_unknown_tag_idx,
    padding="post",
    truncating="post",
)
print(x_train_22_T.shape, x_val_22_T.shape)

(2988, 5000) (333, 5000)


In [19]:
# Pad/truncate the gene and variation index sequences of the train and
# validation splits to fixed widths, filling with the <UNK> index.
#
# NOTE(review): pad_sequences is called here without padding/truncating
# arguments, and its documented default is truncating="pre". The gene rows
# printed above hold three indices (start tag, gene, end tag), so with
# MAX_GENE_LEN = 1 only the last index (the end tag) would survive and the
# gene input would be the same for every row. Confirm whether MAX_GENE_LEN
# should be 3 (or the tags dropped) before these arrays are fed to a model,
# and keep the test-data cell in sync with whatever is chosen.
MAX_GENE_LEN = 1
MAX_VAR_LEN = 4
x_train_22_G = pad_sequences(x_train_22_G, maxlen=MAX_GENE_LEN, value=word_unknown_tag_idx)
x_train_22_V = pad_sequences(x_train_22_V, maxlen=MAX_VAR_LEN, value=word_unknown_tag_idx)

x_val_22_G = pad_sequences(x_val_22_G, maxlen=MAX_GENE_LEN, value=word_unknown_tag_idx)
x_val_22_V = pad_sequences(x_val_22_V, maxlen=MAX_VAR_LEN, value=word_unknown_tag_idx)

print(x_train_22_G.shape, x_train_22_V.shape)
print(x_val_22_G.shape, x_val_22_V.shape)

(2988, 1) (2988, 4)
(333, 1) (333, 4)


Keras `np_utils.to_categorical` expects zero-indexed class labels, so the 1-based `Class` values are shifted down by one before one-hot encoding.

https://github.com/fchollet/keras/issues/570

In [20]:
# Shift the class labels down by one so they start at 0.
# Run this cell once only: re-running it shifts the labels again.
x_train_22_C = np.asarray(x_train_22_C) - 1
x_val_22_C = np.asarray(x_val_22_C) - 1

In [21]:
# One-hot encode the zero-based labels into 9 columns.
x_train_22_C = np_utils.to_categorical(np.array(x_train_22_C), num_classes=9)
x_val_22_C = np_utils.to_categorical(np.array(x_val_22_C), num_classes=9)
print(x_train_22_C.shape, x_val_22_C.shape)

(2988, 9) (333, 9)


### Test data setup

In [22]:
# Encode the unlabelled test frame with the current custom_unit_dict.
# has_class=False, so the fourth returned value is discarded.
gen_data = global_utils.GenerateDataset(test_df, corpus_wordidx)
(x_test_22_T,
 x_test_22_G,
 x_test_22_V,
 _) = gen_data.generate_data(custom_unit_dict,
                             has_class=False,
                             add_start_end_tag=True)
# The generator is only needed for this one call.
del gen_data

In [23]:
# Shapes of the encoded test sequences, plus the first gene/variation entry.
print("Test data")
print("text", np.shape(x_test_22_T))
print("gene", np.shape(x_test_22_G), x_test_22_G[0])
print("variation", np.shape(x_test_22_V), x_test_22_V[0])

Test data
text (986,)
gene (986, 3) [364606, 188717, 364607]
variation (986,) [364606, 317947, 364607]


In [24]:
# Same fixed-width treatment as the train/validation text: fill or cut at the
# end so every test document holds exactly MAX_TEXT_LEN indices.
x_test_22_T = pad_sequences(
    x_test_22_T,
    maxlen=MAX_TEXT_LEN,
    value=word_unknown_tag_idx,
    padding="post",
    truncating="post",
)
print(x_test_22_T.shape)

(986, 5000)


In [25]:
# Pad/truncate the gene and variation index sequences of the test split to
# fixed widths, filling with the <UNK> index.
#
# NOTE(review): pad_sequences is called here without padding/truncating
# arguments, and its documented default is truncating="pre". The gene rows
# printed above hold three indices (start tag, gene, end tag), so with
# MAX_GENE_LEN = 1 only the last index (the end tag) would survive and the
# gene input would be the same for every row. Confirm the intended width and
# keep it identical to the train/validation padding cell.
MAX_GENE_LEN = 1
MAX_VAR_LEN = 4
x_test_22_G = pad_sequences(x_test_22_G, maxlen=MAX_GENE_LEN, value=word_unknown_tag_idx)
x_test_22_V = pad_sequences(x_test_22_V, maxlen=MAX_VAR_LEN, value=word_unknown_tag_idx)

print(x_test_22_G.shape, x_test_22_V.shape)

(986, 1) (986, 4)


## Embedding layer

### for words

In [22]:
# Width of each word vector; passed to get_embeddings_from_ft together with
# the 200-dimensional fastText file loaded in the next cell.
WORD_EMB_SIZE = 200

In [23]:
%autoreload
import global_utils
ft_file_path = "/home/bicepjai/Projects/Deep-Survey-Text-Classification/data_prep/processed/stage1/pretrained_word_vectors/ft_sg_200d_50e.vec"
trained_embeddings = global_utils.get_embeddings_from_ft(ft_file_path, WORD_EMB_SIZE, corpus_vocab_list)
trained_embeddings.shape

(352220, 200)

### for characters

In [33]:
# Number of columns in the randomly initialised character embedding matrix.
CHAR_EMB_SIZE = 100

In [34]:
# Random-normal character embeddings, one row per character in
# global_utils.CHAR_ALPHABETS_LEN. They are drawn from a generator seeded with
# the notebook-wide random state, so re-running this cell (or the whole
# notebook) always yields the same matrix.
char_embeddings = np.random.RandomState(random_state_number).randn(
    global_utils.CHAR_ALPHABETS_LEN, CHAR_EMB_SIZE)

# Last expression of the cell: show the shape of the embedding matrix.
char_embeddings.shape

(75, 100)

# Models

## prep

In [24]:
%autoreload
from utils import MedCNN

## model_1: paper

In [25]:
# Build the CNN over whole documents (MAX_TEXT_LEN word indices in, 9 labels
# out), initialised with the pretrained word vectors.
# `dropout_porb` is the keyword exactly as spelled at this call site; it
# presumably matches MedCNN's signature in utils, so verify there before renaming.
model = MedCNN(
    n_cnn2_pool_pair_layers=2,
    fc_layer_len=128,
    n_filters=256,
    kernel_size=5,
    dropout_porb=0.5,
    input_sentence_len=MAX_TEXT_LEN,
    output_label_size=9,
    word_vectors=trained_embeddings,
)

### training

In [26]:
# Train on the document-level text/class arrays, validating on the held-out
# split, with a checkpoint and an evaluation after every epoch.
#
# NOTE(review): in the recorded run below the training loss is ~4949 in epoch 0
# and then both train and validation loss stay at 2.197224617 for every later
# epoch, with accuracy around 0.17. 2.1972 is ln(9), the cross-entropy of a
# uniform guess over 9 classes, so the model is not learning. A lower
# learning_rate is the first thing to try (assumption, not verified here).
model.train((x_train_22_T, x_train_22_C),
            (x_val_22_T, x_val_22_C),
            num_epochs=16,
            checkpoint_every_n_epoch=1,
            evaluate_every_n_epoch=1,
            learning_rate=0.01, batch_size=32)

Writing logs to /home/bicepjai/Projects/Deep-Survey-Text-Classification/deep_models/paper_3_medical_cnn/runs/1509045519



A Jupyter Widget


Saved model checkpoint to /home/bicepjai/Projects/Deep-Survey-Text-Classification/deep_models/paper_3_medical_cnn/runs/1509045519/checkpoints/model-94

epoch 0 train_loss:4949.01986167 train_accuracy:0.169326241188 test_loss:2.197224617 test_accuracy:0.170236014507


A Jupyter Widget


Saved model checkpoint to /home/bicepjai/Projects/Deep-Survey-Text-Classification/deep_models/paper_3_medical_cnn/runs/1509045519/checkpoints/model-188

epoch 1 train_loss:2.197224617 train_accuracy:0.17098847523 test_loss:2.197224617 test_accuracy:0.161931818182


A Jupyter Widget


Saved model checkpoint to /home/bicepjai/Projects/Deep-Survey-Text-Classification/deep_models/paper_3_medical_cnn/runs/1509045519/checkpoints/model-282

epoch 2 train_loss:2.197224617 train_accuracy:0.17098847523 test_loss:2.197224617 test_accuracy:0.170236014507


A Jupyter Widget


Saved model checkpoint to /home/bicepjai/Projects/Deep-Survey-Text-Classification/deep_models/paper_3_medical_cnn/runs/1509045519/checkpoints/model-376

epoch 3 train_loss:2.197224617 train_accuracy:0.17043439719 test_loss:2.197224617 test_accuracy:0.186844408512


A Jupyter Widget


Saved model checkpoint to /home/bicepjai/Projects/Deep-Survey-Text-Classification/deep_models/paper_3_medical_cnn/runs/1509045519/checkpoints/model-470

epoch 4 train_loss:2.197224617 train_accuracy:0.171542553191 test_loss:2.197224617 test_accuracy:0.170236014507


A Jupyter Widget


Saved model checkpoint to /home/bicepjai/Projects/Deep-Survey-Text-Classification/deep_models/paper_3_medical_cnn/runs/1509045519/checkpoints/model-564

epoch 5 train_loss:2.197224617 train_accuracy:0.17043439719 test_loss:2.197224617 test_accuracy:0.166083916344


A Jupyter Widget


Saved model checkpoint to /home/bicepjai/Projects/Deep-Survey-Text-Classification/deep_models/paper_3_medical_cnn/runs/1509045519/checkpoints/model-658

epoch 6 train_loss:2.197224617 train_accuracy:0.17098847523 test_loss:2.197224617 test_accuracy:0.170236014507


A Jupyter Widget


Saved model checkpoint to /home/bicepjai/Projects/Deep-Survey-Text-Classification/deep_models/paper_3_medical_cnn/runs/1509045519/checkpoints/model-752

epoch 7 train_loss:2.197224617 train_accuracy:0.17098847523 test_loss:2.197224617 test_accuracy:0.166083916344


A Jupyter Widget


Saved model checkpoint to /home/bicepjai/Projects/Deep-Survey-Text-Classification/deep_models/paper_3_medical_cnn/runs/1509045519/checkpoints/model-846

epoch 8 train_loss:2.197224617 train_accuracy:0.172096631311 test_loss:2.197224617 test_accuracy:0.174388113347


A Jupyter Widget


Saved model checkpoint to /home/bicepjai/Projects/Deep-Survey-Text-Classification/deep_models/paper_3_medical_cnn/runs/1509045519/checkpoints/model-940

epoch 9 train_loss:2.197224617 train_accuracy:0.17098847523 test_loss:2.197224617 test_accuracy:0.178540210832


A Jupyter Widget


Saved model checkpoint to /home/bicepjai/Projects/Deep-Survey-Text-Classification/deep_models/paper_3_medical_cnn/runs/1509045519/checkpoints/model-1034

epoch 10 train_loss:2.197224617 train_accuracy:0.17098847523 test_loss:2.197224617 test_accuracy:0.170236014507


A Jupyter Widget


Saved model checkpoint to /home/bicepjai/Projects/Deep-Survey-Text-Classification/deep_models/paper_3_medical_cnn/runs/1509045519/checkpoints/model-1128

epoch 11 train_loss:2.197224617 train_accuracy:0.169880319149 test_loss:2.197224617 test_accuracy:0.170236014507


A Jupyter Widget


Saved model checkpoint to /home/bicepjai/Projects/Deep-Survey-Text-Classification/deep_models/paper_3_medical_cnn/runs/1509045519/checkpoints/model-1222

epoch 12 train_loss:2.197224617 train_accuracy:0.17098847523 test_loss:2.197224617 test_accuracy:0.182692311027


A Jupyter Widget


Saved model checkpoint to /home/bicepjai/Projects/Deep-Survey-Text-Classification/deep_models/paper_3_medical_cnn/runs/1509045519/checkpoints/model-1316

epoch 13 train_loss:2.197224617 train_accuracy:0.169880319149 test_loss:2.197224617 test_accuracy:0.174388113347


A Jupyter Widget


Saved model checkpoint to /home/bicepjai/Projects/Deep-Survey-Text-Classification/deep_models/paper_3_medical_cnn/runs/1509045519/checkpoints/model-1410

epoch 14 train_loss:2.197224617 train_accuracy:0.169880319149 test_loss:2.197224617 test_accuracy:0.170236014507


A Jupyter Widget


Saved model checkpoint to /home/bicepjai/Projects/Deep-Survey-Text-Classification/deep_models/paper_3_medical_cnn/runs/1509045519/checkpoints/model-1504

epoch 15 train_loss:2.197224617 train_accuracy:0.171542553191 test_loss:2.197224617 test_accuracy:0.166083916344
