# Setup

In [1]:
import sys
import os

import re
import collections
import itertools
import bcolz
import pickle
sys.path.append('../../lib')

import numpy as np
import pandas as pd
import gc
import random
import smart_open
import h5py
import csv
import json
import functools
import time
import string

import datetime as dt
from tqdm import tqdm_notebook as tqdm

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

import global_utils

# Fixed seed for the randomised steps of this notebook (passed as random_state
# to the train/validation split).
random_state_number = 967898

In [2]:
import tensorflow as tf
from tensorflow.python.client import device_lib
def get_available_gpus():
    """Return the names of all local devices whose device_type is 'GPU'."""
    gpu_names = []
    for device in device_lib.list_local_devices():
        if device.device_type == 'GPU':
            gpu_names.append(device.name)
    return gpu_names

# allow_growth is switched on in the GPU options before the session is created.
config = tf.ConfigProto(gpu_options=tf.GPUOptions(allow_growth=True))
sess = tf.Session(config=config)
# Last expression of the cell: the list of GPU device names is displayed.
get_available_gpus()

['/gpu:0', '/gpu:1']

In [3]:
# NOTE(review): %pylab star-imports numpy/matplotlib names into the interactive
# namespace (see the warning printed below) and can shadow names imported
# earlier; consider dropping it in favour of the explicit imports.
%pylab
%matplotlib inline
# Profiling extensions (line_profiler, memory_profiler) and module auto-reloading.
%load_ext line_profiler
%load_ext memory_profiler
%load_ext autoreload

Using matplotlib backend: TkAgg
Populating the interactive namespace from numpy and matplotlib


`%matplotlib` prevents importing * from pylab and numpy
  "\n`%matplotlib` prevents importing * from pylab and numpy"


In [4]:
# Silence the chained-assignment warning and show up to 999 columns of a frame.
pd.set_option("mode.chained_assignment", None)
pd.set_option("display.max_columns", 999)
# Default seaborn colour palette.
color = sns.color_palette()

# Data

In [5]:
# Read the prepared train/test frames. The store is opened read-only and closed
# as soon as both frames are in memory, so the HDF5 file handle is not leaked
# for the lifetime of the kernel.
with pd.HDFStore('../../data_prep/processed/stage1/data_frames.h5', mode='r') as store:
    train_df = store['train_df']
    test_df = store['test_df']

In [6]:
# Rich preview of the first rows: train frame first, then test frame.
for _frame in (train_df, test_df):
    display(_frame.head())

Unnamed: 0,ID,Gene,Variation,Class,Sentences
0,0,[fam58a],"[truncating, mutations]",1,"[[cyclin-dependent, kinases, , cdks, , regulat..."
1,1,[cbl],[w802*],2,"[[abstract, background, non-small, cell, lung,..."
2,2,[cbl],[q249e],2,"[[abstract, background, non-small, cell, lung,..."
3,3,[cbl],[n454d],3,"[[recent, evidence, has, demonstrated, that, a..."
4,4,[cbl],[l399v],4,"[[oncogenic, mutations, in, the, monomeric, ca..."


Unnamed: 0,ID,Gene,Variation,Sentences
0,0,[acsl4],[r570s],"[[2, this, mutation, resulted, in, a, myelopro..."
1,1,[naglu],[p521l],"[[abstract, the, large, tumor, suppressor, 1, ..."
2,2,[pah],[l333f],"[[vascular, endothelial, growth, factor, recep..."
3,3,[ing1],[a148d],"[[inflammatory, myofibroblastic, tumor, , imt,..."
4,4,[tmem216],[g77a],"[[abstract, retinoblastoma, is, a, pediatric, ..."


In [7]:
# Load the corpus vocabulary (word list) and its word -> index mapping.
# The former placeholder initialisation bound the unused name
# `corpus_vocab_wordidx`; the mapping used by the rest of the notebook is
# `corpus_wordidx`, which is assigned directly here.
# NOTE(review): pickle.load can execute arbitrary code; only load vocabulary
# files that were produced by this project's own data_prep step.
with open('../../data_prep/processed/stage1/vocab_words_wordidx.pkl', 'rb') as f:
    corpus_vocab_list, corpus_wordidx = pickle.load(f)
print(len(corpus_vocab_list), len(corpus_wordidx))

352220 352220


# Data Prep

To control the vocabulary, pass an updated `corpus_wordidx` to `GenerateDataset`.

In [8]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the labelled rows for validation, stratified on Class and
# seeded with random_state_number.
x_train_df, x_val_df = train_test_split(
    train_df,
    test_size=0.10,
    stratify=train_df.Class,
    random_state=random_state_number,
)

for _split in (x_train_df, x_val_df):
    print(_split.shape)

(2988, 5)
(333, 5)


In [9]:
from tensorflow.contrib.keras.python.keras.utils import np_utils
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical

Using TensorFlow backend.


In [10]:
# Number of words in the corpus vocabulary; used as the input dimension of the
# word Embedding layers.
vocab_size=len(corpus_vocab_list)

## T:sent_words

### generate data

In [11]:
# Generation options for the "sent_words" dataset: word units for gene,
# variation and document; documents taken as sentences and divided into
# multiple units (the shapes printed further down suggest one sample per
# sentence - confirm against GenerateDataset).
custom_unit_dict = dict(
    gene_unit="words",
    variation_unit="words",
    doc_unit="words",
    doc_form="sentences",
    divide_document="multiple_unit",
)

In [12]:
%autoreload
import global_utils
gen_data = global_utils.GenerateDataset(x_train_df, corpus_wordidx)
x_train_21_T, x_train_21_G, x_train_21_V, x_train_21_C = gen_data.generate_data(custom_unit_dict, 
                                                                             has_class=True,
                                                                             add_start_end_tag=True)
del gen_data

In [13]:
print("Train data")
# One line per generated list: array shape followed by the first entry.
for _part in (x_train_21_T, x_train_21_G, x_train_21_V, x_train_21_C):
    print(np.array(_part).shape, _part[0])

Train data
(2622081,) [364606, 113692, 197002, 330024, 326252, 151042, 75648, 1818, 276247, 61043, 228115, 326252, 74974, 301275, 76659, 326252, 361104, 329709, 253643, 205596, 153283, 326252, 80594, 326252, 113692, 18820, 349251, 59442, 123801, 228752, 245229, 307200, 17105, 60555, 69032, 1818, 274163, 151942, 246684, 222367, 253643, 243777, 274163, 50915, 274163, 12413, 1818, 228752, 364603, 232434, 214275, 235155, 163151, 123801, 101614, 101366, 364607]
(2622081, 3) [364606, 97957, 364607]
(2622081,) [364606, 326252, 364607]
(2622081,) 6


In [14]:
# Same generation options, applied to the validation split.
(x_val_21_T,
 x_val_21_G,
 x_val_21_V,
 x_val_21_C) = global_utils.GenerateDataset(x_val_df, corpus_wordidx).generate_data(
    custom_unit_dict,
    has_class=True,
    add_start_end_tag=True,
)

In [15]:
print("Val data")
# Text: shape only; the other lists: shape plus first entry.
print("text", np.array(x_val_21_T).shape)
for _label, _part in (("gene", x_val_21_G),
                      ("variation", x_val_21_V),
                      ("classes", x_val_21_C)):
    print(_label, np.array(_part).shape, _part[0])

Val data
text (293702,)
gene (293702, 3) [364606, 112978, 364607]
variation (293702,) [364606, 295010, 364607]
classes (293702,) 2


### format data

In [16]:
# Values used for padding: the vocabulary index of "<UNK>" for word sequences
# and global_utils' unknown-character index for character sequences.
word_unknown_tag_idx, char_unknown_tag_idx = (
    corpus_wordidx["<UNK>"],
    global_utils.char_unknown_tag_idx,
)

In [17]:
# Sentence samples are padded/truncated to this many word indices.
MAX_SENT_LEN = 60

In [18]:
# Pad with the <UNK> index at the end and cut over-long sentences at the end.
_sent_pad_opts = dict(maxlen=MAX_SENT_LEN, value=word_unknown_tag_idx,
                      padding="post", truncating="post")
x_train_21_T = pad_sequences(x_train_21_T, **_sent_pad_opts)
x_val_21_T = pad_sequences(x_val_21_T, **_sent_pad_opts)
print(x_train_21_T.shape, x_val_21_T.shape)

(2622081, 60) (293702, 60)


Keras `np_utils.to_categorical` expects zero-indexed class labels, so the class labels are shifted down by one before one-hot encoding.

https://github.com/fchollet/keras/issues/570

In [19]:
# Shift the class labels down by one so they start at zero.
# NOTE(review): not idempotent - running this cell a second time subtracts again.
x_train_21_C = np.asarray(x_train_21_C) - 1
x_val_21_C = np.asarray(x_val_21_C) - 1

In [20]:
# One-hot encode the zero-based labels into 9 columns.
x_train_21_C = np_utils.to_categorical(np.array(x_train_21_C), num_classes=9)
x_val_21_C = np_utils.to_categorical(np.array(x_val_21_C), num_classes=9)
print(x_train_21_C.shape, x_val_21_C.shape)

(2622081, 9) (293702, 9)


## T:text_words

### generate data

In [11]:
# Generation options for the "text_words" dataset: word units throughout, the
# document kept as text and treated as a single unit (the shapes printed
# further down suggest one sample per document - confirm against GenerateDataset).
custom_unit_dict = dict(
    gene_unit="words",
    variation_unit="words",
    doc_unit="words",
    doc_form="text",
    divide_document="single_unit",
)

In [12]:
%autoreload
import global_utils
gen_data = global_utils.GenerateDataset(x_train_df, corpus_wordidx)
x_train_22_T, x_train_22_G, x_train_22_V, x_train_22_C = gen_data.generate_data(custom_unit_dict, 
                                                                             has_class=True,
                                                                             add_start_end_tag=True)
del gen_data

In [13]:
print("Train data")
# Text: shape only; the other lists: shape plus first entry.
print("text", np.array(x_train_22_T).shape)
for _label, _part in (("gene", x_train_22_G),
                      ("variation", x_train_22_V),
                      ("classes", x_train_22_C)):
    print(_label, np.array(_part).shape, _part[0])

Train data
text (2988,)
gene (2988, 3) [352216, 164788, 352217]
variation (2988,) [352216, 86196, 352217]
classes (2988,) 4


In [14]:
# Same generation options, applied to the validation split.
(x_val_22_T,
 x_val_22_G,
 x_val_22_V,
 x_val_22_C) = global_utils.GenerateDataset(x_val_df, corpus_wordidx).generate_data(
    custom_unit_dict,
    has_class=True,
    add_start_end_tag=True,
)

In [None]:
print("Val data")
# Text: shape only; the other lists: shape plus first entry.
print("text", np.array(x_val_22_T).shape)
for _label, _part in (("gene", x_val_22_G),
                      ("variation", x_val_22_V),
                      ("classes", x_val_22_C)):
    print(_label, np.array(_part).shape, _part[0])

### format data

In [16]:
# Values used for padding: the vocabulary index of "<UNK>" for word sequences
# and global_utils' unknown-character index for character sequences.
word_unknown_tag_idx, char_unknown_tag_idx = (
    corpus_wordidx["<UNK>"],
    global_utils.char_unknown_tag_idx,
)

In [17]:
# Whole-document samples are padded/truncated to this many word indices.
MAX_TEXT_LEN = 5000

In [18]:
# Pad with the <UNK> index at the end and cut over-long documents at the end.
_text_pad_opts = dict(maxlen=MAX_TEXT_LEN, value=word_unknown_tag_idx,
                      padding="post", truncating="post")
x_train_22_T = pad_sequences(x_train_22_T, **_text_pad_opts)
x_val_22_T = pad_sequences(x_val_22_T, **_text_pad_opts)
print(x_train_22_T.shape, x_val_22_T.shape)

(2988, 5000) (333, 5000)


In [19]:
MAX_GENE_LEN = 1
MAX_VAR_LEN = 4

# pad_sequences truncates from the FRONT by default. The generated gene
# sequences carry start/end tags ([start, gene, end]), so cutting them to
# MAX_GENE_LEN would keep nothing but the end tag - the same value for every
# row. The tags are therefore stripped from the gene sequences before padding,
# which leaves the gene token itself in the (n, MAX_GENE_LEN) array.
# Rows that already fit in MAX_GENE_LEN (e.g. when this cell is re-run on its
# own output) are passed through unchanged.
x_train_22_G = pad_sequences(
    [seq if len(seq) <= MAX_GENE_LEN else seq[1:-1] for seq in x_train_22_G],
    maxlen=MAX_GENE_LEN, value=word_unknown_tag_idx)
# Variation sequences are padded exactly as before (tags kept).
x_train_22_V = pad_sequences(x_train_22_V, maxlen=MAX_VAR_LEN, value=word_unknown_tag_idx)

x_val_22_G = pad_sequences(
    [seq if len(seq) <= MAX_GENE_LEN else seq[1:-1] for seq in x_val_22_G],
    maxlen=MAX_GENE_LEN, value=word_unknown_tag_idx)
x_val_22_V = pad_sequences(x_val_22_V, maxlen=MAX_VAR_LEN, value=word_unknown_tag_idx)

print(x_train_22_G.shape, x_train_22_V.shape)
print(x_val_22_G.shape, x_val_22_V.shape)

(2988, 1) (2988, 4)
(333, 1) (333, 4)


Keras `np_utils.to_categorical` expects zero-indexed class labels, so the class labels are shifted down by one before one-hot encoding.

https://github.com/fchollet/keras/issues/570

In [20]:
# Shift the class labels down by one so they start at zero.
# NOTE(review): not idempotent - running this cell a second time subtracts again.
x_train_22_C = np.asarray(x_train_22_C) - 1
x_val_22_C = np.asarray(x_val_22_C) - 1

In [21]:
# One-hot encode the zero-based labels into 9 columns.
x_train_22_C = np_utils.to_categorical(np.array(x_train_22_C), num_classes=9)
x_val_22_C = np_utils.to_categorical(np.array(x_val_22_C), num_classes=9)
print(x_train_22_C.shape, x_val_22_C.shape)

(2988, 9) (333, 9)


### test Data setup

In [22]:
# Test split: no class column, so has_class is False and the fourth returned
# value is discarded.
(x_test_22_T,
 x_test_22_G,
 x_test_22_V,
 _) = global_utils.GenerateDataset(test_df, corpus_wordidx).generate_data(
    custom_unit_dict,
    has_class=False,
    add_start_end_tag=True,
)

In [23]:
print("Test data")
# Text: shape only; gene and variation: shape plus first entry.
print("text", np.array(x_test_22_T).shape)
for _label, _part in (("gene", x_test_22_G), ("variation", x_test_22_V)):
    print(_label, np.array(_part).shape, _part[0])

Test data
text (986,)
gene (986, 3) [364606, 188717, 364607]
variation (986,) [364606, 317947, 364607]


In [24]:
# Same padding scheme as the train/validation text: <UNK> at the end, cut at the end.
x_test_22_T = pad_sequences(
    x_test_22_T,
    maxlen=MAX_TEXT_LEN,
    padding="post",
    truncating="post",
    value=word_unknown_tag_idx,
)
print(x_test_22_T.shape)

(986, 5000)


In [25]:
MAX_GENE_LEN = 1
MAX_VAR_LEN = 4
# pad_sequences truncates from the FRONT by default. The generated gene
# sequences carry start/end tags ([start, gene, end]), so cutting them to
# MAX_GENE_LEN would keep nothing but the end tag - the same value for every
# row. The tags are therefore stripped from the gene sequences before padding,
# which leaves the gene token itself in the (n, MAX_GENE_LEN) array.
# Rows that already fit in MAX_GENE_LEN (e.g. when this cell is re-run on its
# own output) are passed through unchanged.
x_test_22_G = pad_sequences(
    [seq if len(seq) <= MAX_GENE_LEN else seq[1:-1] for seq in x_test_22_G],
    maxlen=MAX_GENE_LEN, value=word_unknown_tag_idx)
# Variation sequences are padded exactly as before (tags kept).
x_test_22_V = pad_sequences(x_test_22_V, maxlen=MAX_VAR_LEN, value=word_unknown_tag_idx)

print(x_test_22_G.shape, x_test_22_V.shape)

(986, 1) (986, 4)


## T:text_chars

### generate data

In [83]:
# Generation options for the "text_chars" dataset: raw character units for
# gene, variation and document; the document kept as text and divided into
# multiple units.
custom_unit_dict = dict(
    gene_unit="raw_chars",
    variation_unit="raw_chars",
    doc_unit="raw_chars",
    doc_form="text",
    divide_document="multiple_unit",
)

In [84]:
%autoreload
import global_utils
gen_data = global_utils.GenerateDataset(x_train_df, corpus_wordidx)
x_train_33_T, x_train_33_G, x_train_33_V, x_train_33_C = gen_data.generate_data(custom_unit_dict, 
                                                                             has_class=True,
                                                                             add_start_end_tag=True)
del gen_data

In [85]:
print("Train data")
# One labelled line per generated list: array shape followed by the first entry.
for _label, _part in (("text", x_train_33_T),
                      ("gene", x_train_33_G),
                      ("variation", x_train_33_V),
                      ("classes", x_train_33_C)):
    print(_label, np.array(_part).shape, _part[0])

Train data
text (1086419,) [74, 71, 19, 7, 4, 72, 71, 19, 20, 12, 14, 17, 72, 71, 18, 20, 15, 15, 17, 4, 18, 18, 14, 17, 72, 71, 6, 4, 13, 4, 72, 71, 15, 19, 4, 13, 72, 71, 8, 18, 72, 71, 5, 17, 4, 16, 20, 4, 13, 19, 11, 24, 72, 71, 12, 20, 19, 0, 19, 4, 3, 72, 71, 8, 13, 72, 71, 3, 8, 21, 4, 17, 18, 4, 72, 71, 7, 20, 12, 0, 13, 72, 71, 2, 0, 13, 2, 4, 17, 18, 72, 71, 0, 13, 3, 72, 71, 8, 13, 72, 71, 0, 20, 19, 14, 18, 14, 12, 0, 11, 72, 71, 3, 14, 12, 8, 13, 0, 13, 19, 72, 71, 2, 0, 13, 2, 4, 17, 72, 71, 15, 17, 4, 3, 8, 18, 15, 14, 18, 8, 19, 8, 14, 13, 72, 71, 3, 8, 18, 14, 17, 3, 4, 17, 18, 72, 71, 72, 75]
gene (1086419,) [74, 71, 15, 19, 4, 13, 72, 75]
variation (1086419,) [74, 71, 24, 27, 32, 2, 72, 75]
classes (1086419,) 4


In [86]:
%autoreload
import global_utils
gen_data = global_utils.GenerateDataset(x_val_df, corpus_wordidx)
x_val_33_T, x_val_33_G, x_val_33_V, x_val_33_C = gen_data.generate_data(custom_unit_dict, 
                                                                             has_class=True,
                                                                             add_start_end_tag=True)
del gen_data

In [87]:
print("Val data")
# The text line shows sample 98; the remaining lines show the first entry.
print("text", np.array(x_val_33_T).shape, x_val_33_T[98])
for _label, _part in (("gene", x_val_33_G),
                      ("variation", x_val_33_V),
                      ("classes", x_val_33_C)):
    print(_label, np.array(_part).shape, _part[0])


Val data
text (128341,) [74, 71, 0, 19, 72, 71, 19, 7, 8, 18, 72, 71, 19, 8, 12, 4, 72, 71, 15, 14, 8, 13, 19, 72, 71, 72, 71, 19, 7, 4, 72, 71, 4, 23, 15, 17, 4, 18, 18, 8, 14, 13, 72, 71, 14, 5, 72, 71, 22, 8, 11, 3, 36, 19, 24, 15, 4, 72, 71, 15, 27, 32, 8, 13, 10, 30, 0, 72, 71, 8, 13, 72, 71, 20, 28, 14, 18, 72, 71, 2, 4, 11, 11, 18, 72, 71, 8, 13, 3, 20, 2, 4, 3, 72, 71, 15, 14, 19, 4, 13, 19, 72, 71, 2, 4, 11, 11, 72, 71, 2, 24, 2, 11, 4, 72, 71, 0, 17, 17, 4, 18, 19, 72, 71, 0, 19, 72, 71, 1, 14, 19, 7, 72, 71, 19, 4, 12, 15, 4, 17, 0, 19, 20, 17, 4, 18, 72, 71, 72, 71, 15, 27, 32, 8, 13, 10, 30, 0, 72, 71, 8, 13, 3, 20, 2, 4, 3, 72, 71, 18, 36, 15, 7, 0, 18, 4, 72, 71, 8, 13, 7, 8, 1, 8, 19, 8, 14, 13, 72, 71, 14, 5, 72, 71, 30, 28, 33, 29, 72, 71, 72, 71, 0, 13, 3, 72, 71, 30, 35, 33, 29, 72, 71, 72, 71, 0, 19, 72, 71, 29, 33, 27, 2, 72, 71, 0, 13, 3, 72, 71, 30, 26, 27, 2, 72, 71, 72, 71, 17, 4, 18, 15, 4, 2, 19, 8, 21, 4, 11, 24, 72, 71, 72, 75]
gene (128341,) [74, 71, 2, 3

### format data

In [88]:
# Values used for padding: the vocabulary index of "<UNK>" for word sequences
# and global_utils' unknown-character index for character sequences.
word_unknown_tag_idx, char_unknown_tag_idx = (
    corpus_wordidx["<UNK>"],
    global_utils.char_unknown_tag_idx,
)

In [89]:
# Character sequences (text, gene and variation) are padded/truncated to this length.
MAX_CHAR_IN_SENT_LEN = 150

In [90]:
# Pad with the unknown-character index at the end and cut over-long samples at the end.
_char_text_pad_opts = dict(maxlen=MAX_CHAR_IN_SENT_LEN, value=char_unknown_tag_idx,
                           padding="post", truncating="post")
x_train_33_T = pad_sequences(x_train_33_T, **_char_text_pad_opts)
x_val_33_T = pad_sequences(x_val_33_T, **_char_text_pad_opts)
print(x_train_33_T.shape, x_val_33_T.shape)

(1086419, 150) (128341, 150)


In [91]:
# Gene and variation character sequences use pad_sequences' default
# padding/truncation side; only the length and the fill value are set.
_char_pad_opts = dict(maxlen=MAX_CHAR_IN_SENT_LEN, value=char_unknown_tag_idx)
x_train_33_G = pad_sequences(x_train_33_G, **_char_pad_opts)
x_train_33_V = pad_sequences(x_train_33_V, **_char_pad_opts)
x_val_33_G = pad_sequences(x_val_33_G, **_char_pad_opts)
x_val_33_V = pad_sequences(x_val_33_V, **_char_pad_opts)

for _gene, _variation in ((x_train_33_G, x_train_33_V), (x_val_33_G, x_val_33_V)):
    print(_gene.shape, _variation.shape)

(1086419, 150) (1086419, 150)
(128341, 150) (128341, 150)


Keras `np_utils.to_categorical` expects zero-indexed class labels, so the class labels are shifted down by one before one-hot encoding.

https://github.com/fchollet/keras/issues/570

In [92]:
# Shift the class labels down by one so they start at zero.
# NOTE(review): not idempotent - running this cell a second time subtracts again.
x_train_33_C = np.asarray(x_train_33_C) - 1
x_val_33_C = np.asarray(x_val_33_C) - 1

In [93]:
# One-hot encode the zero-based labels into 9 columns.
x_train_33_C = np_utils.to_categorical(np.array(x_train_33_C), num_classes=9)
x_val_33_C = np_utils.to_categorical(np.array(x_val_33_C), num_classes=9)
print(x_train_33_C.shape, x_val_33_C.shape)

(1086419, 9) (128341, 9)


## T:text_sent_words

### generate data

In [11]:
# Generation options for the "text_sent_words" dataset: word units for gene and
# variation, a word list as document unit, the document kept as text and
# treated as a single unit (the samples printed further down are lists of
# sentences - confirm against GenerateDataset).
custom_unit_dict = dict(
    gene_unit="words",
    variation_unit="words",
    doc_unit="word_list",
    doc_form="text",
    divide_document="single_unit",
)

In [12]:
%autoreload
import global_utils
gen_data = global_utils.GenerateDataset(x_train_df, corpus_wordidx)
x_train_34_T, x_train_34_G, x_train_34_V, x_train_34_C = gen_data.generate_data(custom_unit_dict, 
                                                                             has_class=True,
                                                                             add_start_end_tag=True)
del gen_data

In [13]:
print("Train data")
# The text line shows only the first sentence of the first document.
print("text", np.array(x_train_34_T).shape, x_train_34_T[0][:1])
for _label, _part in (("gene", x_train_34_G),
                      ("variation", x_train_34_V),
                      ("classes", x_train_34_C)):
    print(_label, np.array(_part).shape, _part[0])

Train data
text (2988,) [[352216, 252037, 202038, 70974, 86431, 164788, 109857, 338562, 123191, 209585, 221967, 49123, 331220, 140212, 209585, 229015, 140770, 182848, 111721, 8208, 0, 352217]]
gene (2988, 3) [352216, 164788, 352217]
variation (2988,) [352216, 86196, 352217]
classes (2988,) 4


In [14]:
%autoreload
import global_utils
gen_data = global_utils.GenerateDataset(x_val_df, corpus_wordidx)
x_val_34_T, x_val_34_G, x_val_34_V, x_val_34_C = gen_data.generate_data(custom_unit_dict, 
                                                                             has_class=True,
                                                                             add_start_end_tag=True)
del gen_data

In [15]:
print("Val data")
# The text line shows only the first sentence of document 98.
print("text", np.array(x_val_34_T).shape, x_val_34_T[98][:1])
for _label, _part in (("gene", x_val_34_G),
                      ("variation", x_val_34_V),
                      ("classes", x_val_34_C)):
    print(_label, np.array(_part).shape, _part[0])


Val data
text (333,) [[352216, 252037, 156537, 91785, 67201, 109857, 123191, 209585, 213751, 5638, 0, 126280, 49123, 331220, 0, 352217]]
gene (333, 3) [352216, 217983, 352217]
variation (333,) [352216, 41934, 352217]
classes (333,) 4


### format data

In [16]:
# Values used for padding: the vocabulary index of "<UNK>" for word sequences
# and global_utils' unknown-character index for character sequences.
word_unknown_tag_idx, char_unknown_tag_idx = (
    corpus_wordidx["<UNK>"],
    global_utils.char_unknown_tag_idx,
)

In [17]:
# (number of sentences kept per document, number of words kept per sentence)
MAX_DOC_LEN, MAX_SENT_LEN = 500, 80

In [18]:
def _pad_documents(docs):
    """Clip/pad every document to MAX_DOC_LEN sentences of MAX_SENT_LEN word indices.

    Each entry of `docs` is replaced in place by its padded 2-D array; the
    stacked array of all documents is returned.
    """
    for position in range(len(docs)):
        # keep at most MAX_DOC_LEN sentences
        sentences = docs[position][:MAX_DOC_LEN]
        # short documents get whole <UNK> sentences appended at the end
        shortfall = MAX_DOC_LEN - len(sentences)
        if shortfall > 0:
            sentences.extend([word_unknown_tag_idx] * MAX_SENT_LEN for _ in range(shortfall))
        # every sentence is padded/truncated to MAX_SENT_LEN with pad_sequences' defaults
        docs[position] = pad_sequences(sentences, maxlen=MAX_SENT_LEN, value=word_unknown_tag_idx)
    return np.array(docs)


x_train_34_T = _pad_documents(x_train_34_T)
x_val_34_T = _pad_documents(x_val_34_T)

In [19]:
# Shapes of the padded document tensors; note the validation shape is printed first.
print(x_val_34_T.shape, x_train_34_T.shape)

(333, 500, 80) (2988, 500, 80)


In [20]:
# Gene and variation word sequences are padded to sentence length, using
# pad_sequences' default padding/truncation side.
_word_pad_opts = dict(maxlen=MAX_SENT_LEN, value=word_unknown_tag_idx)
x_train_34_G = pad_sequences(x_train_34_G, **_word_pad_opts)
x_train_34_V = pad_sequences(x_train_34_V, **_word_pad_opts)
x_val_34_G = pad_sequences(x_val_34_G, **_word_pad_opts)
x_val_34_V = pad_sequences(x_val_34_V, **_word_pad_opts)

for _gene, _variation in ((x_train_34_G, x_train_34_V), (x_val_34_G, x_val_34_V)):
    print(_gene.shape, _variation.shape)

(2988, 80) (2988, 80)
(333, 80) (333, 80)


Keras `np_utils.to_categorical` expects zero-indexed class labels, so the class labels are shifted down by one before one-hot encoding.

https://github.com/fchollet/keras/issues/570

In [21]:
# Shift the class labels down by one so they start at zero.
# NOTE(review): not idempotent - running this cell a second time subtracts again.
x_train_34_C = np.asarray(x_train_34_C) - 1
x_val_34_C = np.asarray(x_val_34_C) - 1

In [22]:
# One-hot encode the zero-based labels into 9 columns.
x_train_34_C = np_utils.to_categorical(np.array(x_train_34_C), num_classes=9)
x_val_34_C = np_utils.to_categorical(np.array(x_val_34_C), num_classes=9)
print(x_train_34_C.shape, x_val_34_C.shape)

(2988, 9) (333, 9)


Need to form 3 dimensional target data for rationale model training

In [23]:
# Give every sentence slot of a document the document's one-hot label:
# (n_docs, 9) -> (n_docs, 1, 9) -> repeated MAX_DOC_LEN times along axis 1.
x_train_34_C_sent = np.repeat(x_train_34_C[:, np.newaxis, :], MAX_DOC_LEN, axis=1)

# same construction for the validation targets
x_val_34_C_sent = np.repeat(x_val_34_C[:, np.newaxis, :], MAX_DOC_LEN, axis=1)

print(x_train_34_C_sent.shape, x_val_34_C_sent.shape)

(2988, 500, 9) (333, 500, 9)


## Embedding layer

### for words

In [24]:
# Dimensionality of the word vectors (matches the 200-d pretrained vectors loaded below).
WORD_EMB_SIZE = 200

In [25]:
%autoreload
import global_utils
ft_file_path = "/home/bicepjai/Projects/Deep-Survey-Text-Classification/data_prep/processed/stage1/pretrained_word_vectors/ft_sg_200d_50e.vec"
trained_embeddings = global_utils.get_embeddings_from_ft(ft_file_path, WORD_EMB_SIZE, corpus_vocab_list)
trained_embeddings.shape

(352220, 200)

### for characters

In [94]:
# Dimensionality of the character embedding vectors.
CHAR_EMB_SIZE = 64

In [95]:
# Random-normal initial character embeddings, one row per character of the
# alphabet. Drawn from a generator seeded with random_state_number so the
# matrix is identical on every Restart & Run All (the unseeded global
# np.random state gave different values on each run).
char_embeddings = np.random.RandomState(random_state_number).randn(
    global_utils.CHAR_ALPHABETS_LEN, CHAR_EMB_SIZE)
char_embeddings.shape

(75, 64)

# Models

## prep

In [26]:
%autoreload
import tensorflow.contrib.keras as keras
import tensorflow as tf

from keras import backend as K

from keras.engine import Layer, InputSpec, InputLayer

from keras.models import Model, Sequential

from keras.layers import Dropout, Embedding, concatenate
from keras.layers import Conv1D, MaxPooling1D, Conv2D, MaxPooling2D, ZeroPadding1D
from keras.layers import Dense, Input, Flatten, BatchNormalization
from keras.layers import Concatenate, Dot, Merge, Multiply, RepeatVector
from keras.layers import Bidirectional, TimeDistributed
from keras.layers import SimpleRNN, LSTM, GRU, Lambda, Permute

from keras.layers.core import Reshape, Activation
from keras.optimizers import Adam
from keras.callbacks import ModelCheckpoint,EarlyStopping,TensorBoard
from keras.constraints import maxnorm
from keras.regularizers import l2

## model_1: paper

refer https://github.com/bwallace/rationale-CNN

### Doc-CNN

#### model

In [71]:
# Document input: MAX_DOC_LEN sentences of MAX_SENT_LEN word indices.
# int32 is required: word indices run up to vocab_size - 1 (several hundred
# thousand here), far beyond the int16 maximum of 32767, so an int16 input
# would corrupt most indices. The RA-CNN model below already uses int32.
doc_input = Input(shape=(MAX_DOC_LEN,MAX_SENT_LEN,), dtype="int32")
# flatten to one long word sequence so a plain Embedding layer can be applied
reshape_1d = Reshape([MAX_DOC_LEN * MAX_SENT_LEN])(doc_input)
doc_embedding_1d = Embedding(vocab_size, WORD_EMB_SIZE, weights=[trained_embeddings], trainable=True)(reshape_1d)
# data_format='channels_first' for conv2d: one channel, one row per sentence,
# each row holding the sentence's word vectors laid end to end
doc_embedding = Reshape([1, MAX_DOC_LEN, MAX_SENT_LEN * WORD_EMB_SIZE])(doc_embedding_1d)

sent_convs_in_doc = []
ngram_filters = [2,3]
n_filters = 32

# using Conv2D instead of Conv1D since we need to deal with sentences and not the whole document
# All Input shape: 4D tensor with shape: (samples, channels, rows, cols) if data_format='channels_first'
for n_gram in ngram_filters:
    # the kernel spans n_gram word vectors within one sentence row and moves one word at a time
    l_conv = Conv2D(filters = n_filters,
                    kernel_size = (1, n_gram * WORD_EMB_SIZE), # n_gram words
                    strides = (1, WORD_EMB_SIZE), # one word
                    data_format='channels_first',
                    activation="relu")(doc_embedding)
    # max over all n-gram positions of a sentence -> (n_filters x MAX_DOC_LEN x 1)
    l_pool = MaxPooling2D(pool_size=(1, (MAX_SENT_LEN - n_gram + 1)),
                       data_format='channels_first')(l_conv)

    # flip around, to get (MAX_DOC_LEN x n_filters x 1)
    permuted = Permute((2,1,3)) (l_pool)

    # drop extra dimension -> one n_filters-long vector per sentence
    reshaped = Reshape((MAX_DOC_LEN, n_filters))(permuted)
    sent_convs_in_doc.append(reshaped)

# Join the n-gram branches along the FEATURE axis so that each sentence gets a
# single vector of len(ngram_filters) * n_filters features, as the RA-CNN model
# below does. Concatenating along axis 1 instead stacked the branches as if
# they were additional sentences, giving (2 * MAX_DOC_LEN, n_filters).
# Note: this changes the width of the document vector fed to the final Dense
# layer, so weights saved with the old layout cannot be loaded into this model.
l_concat = Concatenate(axis=-1)(sent_convs_in_doc)
l_dropout = Dropout(0.5)(l_concat)
                       
def sum_sent_vecs(x):
    """Sum the input tensor over axis 1 (the sentence axis), leaving one vector per sample."""
    summed = K.sum(x, axis=1)
    return summed

def sum_sent_vec_output_shape(input_shape):
    """Output shape of `sum_sent_vecs`: the first and last dimensions of the input.

    For an input of (batch, n_sentences, n_features) this is (batch, n_features),
    i.e. the length of one induced sentence vector.
    """
    dims = tuple(input_shape)
    return (dims[0], dims[-1])

# Document vector = sum of the (dropped-out) sentence vectors.
_sum_over_sentences = Lambda(sum_sent_vecs, output_shape=sum_sent_vec_output_shape)
doc_vector = _sum_over_sentences(l_dropout)
# 9-way softmax over the document vector.
_class_probs = Dense(9, activation='softmax')
l_softmax = _class_probs(doc_vector)
doc_cnn_model = Model(inputs=[doc_input], outputs=[l_softmax])

#### training

In [72]:
# Categorical cross-entropy with a default Adam optimizer; categorical accuracy as the metric.
doc_cnn_model.compile(
    optimizer=Adam(),
    loss='categorical_crossentropy',
    metrics=['categorical_accuracy'],
)
doc_cnn_model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_2 (InputLayer)             (None, 500, 80)       0                                            
____________________________________________________________________________________________________
reshape_3 (Reshape)              (None, 40000)         0           input_2[0][0]                    
____________________________________________________________________________________________________
embedding_2 (Embedding)          (None, 40000, 200)    70444000    reshape_3[0][0]                  
____________________________________________________________________________________________________
reshape_4 (Reshape)              (None, 1, 500, 16000) 0           embedding_2[0][0]                
___________________________________________________________________________________________

In [73]:
# Remove the TensorBoard files of earlier runs from ./tb_graphs.
%rm -rf ./tb_graphs/*

In [74]:
# TensorBoard logging into ./tb_graphs: graph and images written, histograms off.
tb_callback = keras.callbacks.TensorBoard(
    log_dir='./tb_graphs',
    write_graph=True,
    write_images=True,
    histogram_freq=0,
)

In [75]:
# Save weights only when the validation categorical accuracy reaches a new maximum.
checkpointer = ModelCheckpoint(
    "doc_cnn_weights.hdf5",
    monitor="val_categorical_accuracy",
    mode="max",
    save_best_only=True,
    verbose=1,
)

In [78]:
# Train inside the session Keras already uses for this model.
# Opening a fresh tf.Session and running global_variables_initializer() in it
# re-initialised every variable - including the pretrained embedding weights
# that were assigned when the model was built - and the trained state was lost
# again when that session closed. Making the Keras session the default keeps
# the existing weights and leaves the trained model usable after this cell.
with K.get_session().as_default():
    try:
        # resume from the file written by `checkpointer` (doc_cnn_weights.hdf5);
        # the previous name "model_weights.hdf5" never matched any saved file
        doc_cnn_model.load_weights("doc_cnn_weights.hdf5")
    except IOError:
        print("no checkpoints available !")
    doc_cnn_model.fit(x_train_34_T, x_train_34_C,
          validation_data=(x_val_34_T, x_val_34_C),
          epochs=5, batch_size=32, shuffle=True,
          callbacks=[tb_callback,checkpointer])

no checkpoints available !
Train on 2988 samples, validate on 333 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


### RA-CNN

#### model

In [27]:
# Document input: MAX_DOC_LEN sentences of MAX_SENT_LEN word indices.
doc_input = Input(shape=(MAX_DOC_LEN, MAX_SENT_LEN), dtype="int32")
# Flatten to one long word sequence so a plain Embedding layer can be applied.
reshape_1d = Reshape((MAX_DOC_LEN * MAX_SENT_LEN,))(doc_input)
_word_embedding = Embedding(vocab_size, WORD_EMB_SIZE,
                            weights=[trained_embeddings], trainable=True)
doc_embedding_1d = _word_embedding(reshape_1d)
# channels_first layout for Conv2D: one channel, one row per sentence, each row
# holding that sentence's word vectors laid end to end.
doc_embedding = Reshape((1, MAX_DOC_LEN, MAX_SENT_LEN * WORD_EMB_SIZE))(doc_embedding_1d)

sent_convs_in_doc = []
ngram_filters = [2,3]
n_filters = 32  # feature maps per n-gram size
final_doc_dims = n_filters * len(ngram_filters)

# Conv2D (not Conv1D) so that filters slide inside each sentence row and never
# across sentence boundaries. Inputs are 4-D (samples, channels, rows, cols).
for n_gram in ngram_filters:
    _conv = Conv2D(filters=n_filters,
                   kernel_size=(1, n_gram * WORD_EMB_SIZE),  # window of n_gram word vectors
                   strides=(1, WORD_EMB_SIZE),               # advance one word at a time
                   data_format='channels_first',
                   activation="relu")
    _ngram_maps = _conv(doc_embedding)

    # max over every n-gram position of a sentence -> (n_filters, MAX_DOC_LEN, 1)
    _pool = MaxPooling2D(pool_size=(1, MAX_SENT_LEN - n_gram + 1),
                         data_format='channels_first')
    _pooled = _pool(_ngram_maps)

    # sentences first: (MAX_DOC_LEN, n_filters, 1), then drop the trailing axis
    _sentence_major = Permute((2, 1, 3))(_pooled)
    sent_convs_in_doc.append(Reshape((MAX_DOC_LEN, n_filters))(_sentence_major))

# One vector of final_doc_dims features per sentence (branches joined on the last axis).
# No dropout is applied to the sentence vectors here (left open by the author:
# dropout at this point might lose information).
sent_vectors = concatenate(sent_convs_in_doc, axis=-1)
# do we need dropout here, we might lose information
# l_dropout = Dropout(0.5)(l_concat)

# Per-sentence classifier: one L2-regularised 9-way softmax Dense layer, applied
# to every sentence vector through TimeDistributed. The wrapper is named
# "sentence_predictions" so it can be looked up by name later.
sentence_softmax = Dense(9, activation='softmax', kernel_regularizer=l2(0.01), name="sentence_prediction")
doc_sent_output_layer = TimeDistributed(sentence_softmax, name="sentence_predictions")(sent_vectors)

# weights are set to the estimated probabilities that 
# corresponding sentences are rationales in the most likely direction
# (i.e. the largest of the 9 class probabilities of a sentence)
sum_weighting_probs = Lambda(lambda x: K.max(x, axis=1))

# distributing over sentences in the document: one weight per sentence
sent_weights = TimeDistributed(sum_weighting_probs)(doc_sent_output_layer)

# reshaping the weights to perform matrix dot product
reshaped_sent_weights = Reshape((1, MAX_DOC_LEN))(sent_weights)

# contract the sentence axis: axis 1 of sent_vectors with axis 2 of the reshaped
# weights (batch axis not included), giving the weighted sum of sentence vectors
doc_vector = Dot((1,2))([sent_vectors, reshaped_sent_weights])
# flatten to a plain (final_doc_dims,) document vector
doc_vector = Reshape((final_doc_dims,))(doc_vector)

# document-level classifier on top of the weighted document vector
l_dropout = Dropout(0.5)(doc_vector)
doc_output_layer = Dense(9, activation="softmax")(l_dropout)
        

In [28]:
# Auxiliary model: same document input, per-sentence class distributions as output.
sentence_model = Model(inputs=[doc_input], outputs=[doc_sent_output_layer])
sentence_model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_1 (InputLayer)             (None, 500, 80)       0                                            
____________________________________________________________________________________________________
reshape_1 (Reshape)              (None, 40000)         0           input_1[0][0]                    
____________________________________________________________________________________________________
embedding_1 (Embedding)          (None, 40000, 200)    70444000    reshape_1[0][0]                  
____________________________________________________________________________________________________
reshape_2 (Reshape)              (None, 1, 500, 16000) 0           embedding_1[0][0]                
___________________________________________________________________________________________

In [29]:
# Document-level model: same document input, document class distribution as output.
ra_cnn_model = Model(inputs=[doc_input], outputs=[doc_output_layer])
ra_cnn_model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_1 (InputLayer)             (None, 500, 80)       0                                            
____________________________________________________________________________________________________
reshape_1 (Reshape)              (None, 40000)         0           input_1[0][0]                    
____________________________________________________________________________________________________
embedding_1 (Embedding)          (None, 40000, 200)    70444000    reshape_1[0][0]                  
____________________________________________________________________________________________________
reshape_2 (Reshape)              (None, 1, 500, 16000) 0           embedding_1[0][0]                
___________________________________________________________________________________________

#### training sentence model

In [30]:
# Categorical cross-entropy with a default Adam optimizer; categorical accuracy as the metric.
sentence_model.compile(
    optimizer=Adam(),
    loss='categorical_crossentropy',
    metrics=['categorical_accuracy'],
)

the accuracy is not improving anymore

In [31]:
# Remove the TensorBoard files of earlier runs from ./tb_graphs.
%rm -rf ./tb_graphs/*

In [32]:
# TensorBoard logging into ./tb_graphs: graph and images written, histograms off.
tb_callback = keras.callbacks.TensorBoard(
    log_dir='./tb_graphs',
    write_graph=True,
    write_images=True,
    histogram_freq=0,
)

In [33]:
# Save weights only when the validation categorical accuracy reaches a new maximum.
checkpointer = ModelCheckpoint(
    "sentence_model_weights.hdf5",
    monitor="val_categorical_accuracy",
    mode="max",
    save_best_only=True,
    verbose=1,
)

In [35]:
# Train inside the session Keras already uses for this model.
# Opening a fresh tf.Session and running global_variables_initializer() in it
# re-initialised every variable - including the pretrained embedding weights
# that were assigned when the model was built - and the trained state was lost
# again when that session closed. Making the Keras session the default keeps
# the existing weights and leaves the trained model usable after this cell.
with K.get_session().as_default():
    try:
        # resume from the best weights saved by `checkpointer`, if any
        sentence_model.load_weights("sentence_model_weights.hdf5")
    except IOError:
        print("no checkpoints available !")
    # targets are the document label repeated for every sentence slot
    sentence_model.fit(x_train_34_T, x_train_34_C_sent,
          validation_data=(x_val_34_T, x_val_34_C_sent),
          epochs=10, batch_size=32, shuffle=True,
          callbacks=[tb_callback,checkpointer])

no checkpoints available !
Train on 2988 samples, validate on 333 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [36]:
# Further 10 epochs for the sentence model, resuming from the best checkpoint.
# As in the previous training cell, the Keras session is used as the default
# session: a fresh tf.Session plus global_variables_initializer() would
# re-initialise all weights and discard the trained state when it closes.
with K.get_session().as_default():
    try:
        sentence_model.load_weights("sentence_model_weights.hdf5")
    except IOError:
        print("no checkpoints available !")
    # targets are the document label repeated for every sentence slot
    sentence_model.fit(x_train_34_T, x_train_34_C_sent,
          validation_data=(x_val_34_T, x_val_34_C_sent),
          epochs=10, batch_size=32, shuffle=True,
          callbacks=[tb_callback,checkpointer])

Train on 2988 samples, validate on 333 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


#### training ra_cnn model

We freeze the sentence-level prediction layer of the trained sentence model; its outputs are used as the sentence weights in the rationale model.

In [30]:
# Restore the best sentence-model weights. sentence_model and ra_cnn_model are
# built from the same layers, so this also sets the shared layers of ra_cnn_model.
sentence_model.load_weights("sentence_model_weights.hdf5")
sent_softmax_layer = ra_cnn_model.get_layer("sentence_predictions")
# Freeze the sentence classifier. Both the TimeDistributed wrapper and the Dense
# layer it wraps are marked non-trainable: in Keras releases where a wrapper
# simply forwards `trainable_weights` of the wrapped layer, setting the flag on
# the wrapper alone leaves the Dense weights trainable.
# NOTE(review): confirm against the installed Keras version; setting both is harmless.
sent_softmax_layer.trainable = False
sent_softmax_layer.layer.trainable = False

In [31]:
# Categorical cross-entropy with a default Adam optimizer; categorical accuracy as the metric.
ra_cnn_model.compile(
    optimizer=Adam(),
    loss='categorical_crossentropy',
    metrics=['categorical_accuracy'],
)

In [32]:
# Remove the TensorBoard files of earlier runs from ./tb_graphs.
%rm -rf ./tb_graphs/*

In [33]:
# TensorBoard logging into ./tb_graphs: graph and images written, histograms off.
tb_callback = keras.callbacks.TensorBoard(
    log_dir='./tb_graphs',
    write_graph=True,
    write_images=True,
    histogram_freq=0,
)

In [34]:
# Save weights only when the validation categorical accuracy reaches a new maximum.
checkpointer = ModelCheckpoint(
    "ra_cnn_model_weights.hdf5",
    monitor="val_categorical_accuracy",
    mode="max",
    save_best_only=True,
    verbose=1,
)

In [36]:
# Train inside the session Keras already uses for this model.
# A fresh tf.Session plus global_variables_initializer() re-initialised every
# variable, which threw away the sentence-model weights loaded just before
# (the frozen sentence classifier then held random weights) as well as the
# pretrained embeddings; the trained state was also lost when that session
# closed. Making the Keras session the default keeps all of it.
with K.get_session().as_default():
    try:
        # resume from the best weights saved by `checkpointer`, if any
        ra_cnn_model.load_weights("ra_cnn_model_weights.hdf5")
    except IOError:
        print("no checkpoints available !")
    ra_cnn_model.fit(x_train_34_T, x_train_34_C,
          validation_data=(x_val_34_T, x_val_34_C),
          epochs=10, batch_size=32, shuffle=True,
          callbacks=[tb_callback,checkpointer])

no checkpoints available !
Train on 2988 samples, validate on 333 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
