# Setup

In [1]:
import sys
import os

import re
import collections
import itertools
import bcolz
import pickle
sys.path.append('../../lib')

import numpy as np
import pandas as pd
import gc
import random
import smart_open
import h5py
import csv
import json
import functools
import time
import string

import datetime as dt
from tqdm import tqdm_notebook as tqdm

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

import global_utils

# Seed reused by the stochastic steps below (e.g. passed as random_state to train_test_split).
random_state_number = 967898

In [2]:
import tensorflow as tf
from tensorflow.python.client import device_lib
def get_available_gpus():
    """Return the names of the local devices whose device_type is 'GPU'."""
    gpu_names = []
    for device in device_lib.list_local_devices():
        if device.device_type == 'GPU':
            gpu_names.append(device.name)
    return gpu_names

# Session config with allow_growth switched on for the GPU options.
config = tf.ConfigProto()
config.gpu_options.allow_growth=True
# NOTE(review): the training/prediction cells further down open their own
# tf.Session() and rebind `sess`; this configured session is not passed to them.
sess = tf.Session(config=config)
# Last expression of the cell: shows the detected GPU device names.
get_available_gpus()

['/gpu:0', '/gpu:1']

In [3]:
# NOTE(review): %pylab star-imports numpy/matplotlib names into the notebook
# namespace; later cells rely on it for the bare `unique(...)` calls.
%pylab
%matplotlib inline
# Profiling extensions.
%load_ext line_profiler
%load_ext memory_profiler
# Only loads the extension; the cells below trigger reloads with a bare %autoreload.
%load_ext autoreload

Using matplotlib backend: Qt5Agg
Populating the interactive namespace from numpy and matplotlib


`%matplotlib` prevents importing * from pylab and numpy
  "\n`%matplotlib` prevents importing * from pylab and numpy"


In [4]:
# Silence the chained-assignment warning and show up to 999 columns per frame.
pd.set_option("mode.chained_assignment", None)
pd.set_option("display.max_columns", 999)
# Default seaborn palette, kept around for plotting.
color = sns.color_palette()

# Data

In [5]:
# Read both prepared frames and release the HDF5 file handle as soon as they
# are in memory; the store is opened read-only because nothing is written.
with pd.HDFStore('../../data_prep/processed/stage1/data_frames.h5', mode='r') as store:
    train_df = store['train_df']
    test_df = store['test_df']

In [6]:
# Rich preview of the first rows of the train and test frames, in that order.
for preview_df in (train_df, test_df):
    display(preview_df.head())

Unnamed: 0,ID,Gene,Variation,Class,Sentences
0,0,[fam58a],"[truncating, mutations]",1,"[[cyclin-dependent, kinases, , cdks, , regulat..."
1,1,[cbl],[w802*],2,"[[abstract, background, non-small, cell, lung,..."
2,2,[cbl],[q249e],2,"[[abstract, background, non-small, cell, lung,..."
3,3,[cbl],[n454d],3,"[[recent, evidence, has, demonstrated, that, a..."
4,4,[cbl],[l399v],4,"[[oncogenic, mutations, in, the, monomeric, ca..."


Unnamed: 0,ID,Gene,Variation,Sentences
0,0,[acsl4],[r570s],"[[2, this, mutation, resulted, in, a, myelopro..."
1,1,[naglu],[p521l],"[[abstract, the, large, tumor, suppressor, 1, ..."
2,2,[pah],[l333f],"[[vascular, endothelial, growth, factor, recep..."
3,3,[ing1],[a148d],"[[inflammatory, myofibroblastic, tumor, , imt,..."
4,4,[tmem216],[g77a],"[[abstract, retinoblastoma, is, a, pediatric, ..."


In [7]:
# Placeholders use the same names the rest of the notebook reads
# (`corpus_vocab_list`, `corpus_wordidx`).
corpus_vocab_list, corpus_wordidx = None, None
# NOTE: pickle.load can execute arbitrary code; only load vocabulary files
# produced by this project's own data_prep step.
with open('../../data_prep/processed/stage1/vocab_words_wordidx.pkl', 'rb') as f:
    corpus_vocab_list, corpus_wordidx = pickle.load(f)
# Both containers should report the same number of entries.
print(len(corpus_vocab_list), len(corpus_wordidx))

352220 352220


# Data Prep

To control the vocabulary, pass an updated `corpus_wordidx` to the dataset generator.

In [50]:
from sklearn.model_selection import train_test_split
# Hold out 10% of the labelled rows for validation, stratified on Class.
x_train_df, x_val_df = train_test_split(
    train_df,
    test_size=0.10,
    stratify=train_df.Class,
    random_state=random_state_number,
)

for split_df in (x_train_df, x_val_df):
    print(split_df.shape)

(2988, 5)
(333, 5)


In [51]:
# Per-class row counts for the training split, then the validation split.
for split_df in (x_train_df, x_val_df):
    print(split_df.Class.value_counts())

7    858
4    617
1    511
2    407
6    247
5    218
3     80
9     33
8     17
Name: Class, dtype: int64
7    95
4    69
1    57
2    45
6    28
5    24
3     9
9     4
8     2
Name: Class, dtype: int64


In [52]:
from tensorflow.contrib.keras.python.keras.utils import np_utils
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical

In [53]:
# Number of vocabulary entries; used as the Embedding input dimension in the models below.
vocab_size=len(corpus_vocab_list)

## T:sent_words

### generate data

In [11]:
# Generation options for the sentence-level dataset: word units everywhere,
# with the document taken from the sentences attribute and split into
# multiple units.
custom_unit_dict = dict(
    gene_unit="words",
    variation_unit="words",
    doc_unit="words",
    doc_form="sentences",
    divide_document="multiple_unit",
)

In [12]:
%autoreload
import global_utils
gen_data = global_utils.GenerateDataset(x_train_df, corpus_wordidx)
x_train_21_T, x_train_21_G, x_train_21_V, x_train_21_C = gen_data.generate_data(custom_unit_dict, 
                                                                             has_class=True,
                                                                             add_start_end_tag=True)
del gen_data

In [13]:
print("Train data")
# Shape and first element of each generated component (text, gene, variation, class).
for component in (x_train_21_T, x_train_21_G, x_train_21_V, x_train_21_C):
    print(np.array(component).shape, component[0])

Train data
(1086419,) [352216, 252037, 202038, 70974, 86431, 164788, 109857, 338562, 123191, 209585, 221967, 49123, 331220, 140212, 209585, 229015, 140770, 182848, 111721, 8208, 0, 352217]
(1086419, 3) [352216, 164788, 352217]
(1086419,) [352216, 86196, 352217]
(1086419,) 4


In [14]:
# Same generation settings as the training split, applied to the validation rows.
gen_data = global_utils.GenerateDataset(x_val_df, corpus_wordidx)
val_21_parts = gen_data.generate_data(custom_unit_dict, has_class=True, add_start_end_tag=True)
x_val_21_T, x_val_21_G, x_val_21_V, x_val_21_C = val_21_parts
del gen_data, val_21_parts

In [15]:
print("Val data")
# Text is reported by shape only; the other components also show their first element.
print("text",np.array(x_val_21_T).shape)
for label, component in (("gene", x_val_21_G), ("variation", x_val_21_V), ("classes", x_val_21_C)):
    print(label, np.array(component).shape, component[0])

Val data
text (128341,)
gene (128341, 3) [352216, 217983, 352217]
variation (128341,) [352216, 41934, 352217]
classes (128341,) 4


### format data

In [16]:
# Vocabulary index of the "<UNK>" token; used below as the padding value.
word_unknown_tag_idx   = corpus_wordidx["<UNK>"]
# Character-level unknown index as exposed by global_utils.
char_unknown_tag_idx   = global_utils.char_unknown_tag_idx

In [17]:
# Every sentence is padded or truncated to this many tokens.
MAX_SENT_LEN = 60

In [18]:
# Pad/truncate at the end of each sentence, filling with the <UNK> index.
sent_pad_options = dict(maxlen=MAX_SENT_LEN, value=word_unknown_tag_idx,
                        padding="post", truncating="post")
x_train_21_T = pad_sequences(x_train_21_T, **sent_pad_options)
x_val_21_T = pad_sequences(x_val_21_T, **sent_pad_options)
print(x_train_21_T.shape, x_val_21_T.shape)

(1086419, 60) (128341, 60)


Keras `np_utils.to_categorical` expects zero-indexed class labels, so the 1-9 classes are shifted to 0-8 first.

https://github.com/fchollet/keras/issues/570

In [19]:
# Shift class labels down by one so they start at zero.
x_train_21_C = np.subtract(np.array(x_train_21_C), 1)
x_val_21_C = np.subtract(np.array(x_val_21_C), 1)

In [20]:
# One-hot encode the zero-based labels over 9 classes.
x_train_21_C, x_val_21_C = [
    np_utils.to_categorical(np.array(labels), 9)
    for labels in (x_train_21_C, x_val_21_C)
]
print(x_train_21_C.shape, x_val_21_C.shape)

(1086419, 9) (128341, 9)


## T:text_words

### generate data

In [54]:
# Generation options for the document-level dataset: word units everywhere,
# with the document taken as whole text and kept as a single unit.
custom_unit_dict = dict(
    gene_unit="words",
    variation_unit="words",
    doc_unit="words",
    doc_form="text",
    divide_document="single_unit",
)

In [55]:
%autoreload
import global_utils
gen_data = global_utils.GenerateDataset(x_train_df, corpus_wordidx)
x_train_22_T, x_train_22_G, x_train_22_V, x_train_22_C = gen_data.generate_data(custom_unit_dict, 
                                                                             has_class=True,
                                                                             add_start_end_tag=True)
del gen_data

In [56]:
print("Train data")
# Text is reported by shape only; the other components also show their first element.
print("text",np.array(x_train_22_T).shape)
for label, component in (("gene", x_train_22_G), ("variation", x_train_22_V), ("classes", x_train_22_C)):
    print(label, np.array(component).shape, component[0])
# Distinct class labels present in the training split.
print(unique(x_train_22_C))

Train data
text (2988,)
gene (2988, 3) [352216, 164788, 352217]
variation (2988,) [352216, 86196, 352217]
classes (2988,) 4
[1 2 3 4 5 6 7 8 9]


In [57]:
# Same generation settings as the training split, applied to the validation rows.
gen_data = global_utils.GenerateDataset(x_val_df, corpus_wordidx)
val_22_parts = gen_data.generate_data(custom_unit_dict, has_class=True, add_start_end_tag=True)
x_val_22_T, x_val_22_G, x_val_22_V, x_val_22_C = val_22_parts
del gen_data, val_22_parts

In [58]:
print("Val data")
# Text is reported by shape only; the other components also show their first element.
print("text",np.array(x_val_22_T).shape)
for label, component in (("gene", x_val_22_G), ("variation", x_val_22_V), ("classes", x_val_22_C)):
    print(label, np.array(component).shape, component[0])
# Distinct class labels present in the validation split.
print(unique(x_val_22_C))

Val data
text (333,)
gene (333, 3) [352216, 217983, 352217]
variation (333,) [352216, 41934, 352217]
classes (333,) 4
[1 2 3 4 5 6 7 8 9]


### format data

In [59]:
# Vocabulary index of the "<UNK>" token; used below as the padding value.
word_unknown_tag_idx   = corpus_wordidx["<UNK>"]
# Character-level unknown index as exposed by global_utils.
char_unknown_tag_idx   = global_utils.char_unknown_tag_idx

In [60]:
# Every document is padded or truncated to this many tokens; also the model input length.
MAX_TEXT_LEN = 5000

In [61]:
# Pad/truncate at the end of each document, filling with the <UNK> index.
text_pad_options = dict(maxlen=MAX_TEXT_LEN, value=word_unknown_tag_idx,
                        padding="post", truncating="post")
x_train_22_T = pad_sequences(x_train_22_T, **text_pad_options)
x_val_22_T = pad_sequences(x_val_22_T, **text_pad_options)
print(x_train_22_T.shape, x_val_22_T.shape)

(2988, 5000) (333, 5000)


In [62]:
MAX_GENE_LEN = 1
MAX_VAR_LEN = 4


def _strip_start_end(seqs):
    """Return each sequence without its first and last (start/end tag) token.

    Gene sequences are generated with add_start_end_tag=True, i.e. as
    [start, gene, end]. pad_sequences truncates from the front by default, so
    padding them straight to MAX_GENE_LEN = 1 kept only the end tag and threw
    the gene token away. Sequences shorter than three tokens are returned
    unchanged, so re-running this cell on already padded arrays is harmless.
    """
    return [list(seq[1:-1]) if len(seq) > 2 else list(seq) for seq in seqs]


# Genes: keep the single gene token (shape stays (N, MAX_GENE_LEN)).
x_train_22_G = pad_sequences(_strip_start_end(x_train_22_G), maxlen=MAX_GENE_LEN, value=word_unknown_tag_idx)
x_train_22_V = pad_sequences(x_train_22_V, maxlen=MAX_VAR_LEN, value=word_unknown_tag_idx)

x_val_22_G = pad_sequences(_strip_start_end(x_val_22_G), maxlen=MAX_GENE_LEN, value=word_unknown_tag_idx)
x_val_22_V = pad_sequences(x_val_22_V, maxlen=MAX_VAR_LEN, value=word_unknown_tag_idx)

print(x_train_22_G.shape, x_train_22_V.shape)
print(x_val_22_G.shape, x_val_22_V.shape)

(2988, 1) (2988, 4)
(333, 1) (333, 4)


Keras `np_utils.to_categorical` expects zero-indexed class labels, so the 1-9 classes are shifted to 0-8 first.

https://github.com/fchollet/keras/issues/570

In [63]:
# Zero-based integer labels; the *_Cp names keep them after *_C is one-hot encoded.
x_train_22_Cp = np.array(x_train_22_C) - 1
x_val_22_Cp = np.array(x_val_22_C) - 1
x_train_22_C = x_train_22_Cp
x_val_22_C = x_val_22_Cp

In [64]:
# Distinct zero-based labels: validation split first, then training split.
for labels in (x_val_22_Cp, x_train_22_Cp):
    print(unique(labels))

[0 1 2 3 4 5 6 7 8]
[0 1 2 3 4 5 6 7 8]


In [65]:
# One-hot encode the zero-based labels over 9 classes.
x_train_22_C, x_val_22_C = [
    np_utils.to_categorical(np.array(labels), 9)
    for labels in (x_train_22_C, x_val_22_C)
]
print(x_train_22_C.shape, x_val_22_C.shape)

(2988, 9) (333, 9)


In [66]:
# Distinct values in the one-hot matrices: training split first, then validation.
for one_hot in (x_train_22_C, x_val_22_C):
    print(unique(one_hot))

[ 0.  1.]
[ 0.  1.]


### CV setup

In [9]:
# Sequences for the full labelled frame (no train/val split), used by the CV loops.
# NOTE(review): relies on whichever custom_unit_dict was defined last.
gen_data = global_utils.GenerateDataset(train_df, corpus_wordidx)
cv_22_parts = gen_data.generate_data(custom_unit_dict, has_class=True, add_start_end_tag=True)
x_22_T, x_22_G, x_22_V, x_22_C = cv_22_parts
del gen_data, cv_22_parts

In [22]:
# Pad/truncate at the end of each document, filling with the <UNK> index.
x_22_T = pad_sequences(
    x_22_T,
    maxlen=MAX_TEXT_LEN,
    padding="post",
    truncating="post",
    value=word_unknown_tag_idx,
)
print(x_22_T.shape)

(3321, 5000)


In [23]:
# Zero-based integer labels; x_22_Cp keeps them after x_22_C is one-hot encoded.
x_22_Cp = np.array(x_22_C) - 1
x_22_C = x_22_Cp

In [24]:
# One-hot encode the zero-based labels over 9 classes.
x_22_C = np_utils.to_categorical(np.array(x_22_C), 9)
print(x_22_C.shape)

(3321, 9)


### test Data setup

In [25]:
# Test rows carry no Class column, hence has_class=False; the fourth item is discarded.
gen_data = global_utils.GenerateDataset(test_df, corpus_wordidx)
test_22_parts = gen_data.generate_data(custom_unit_dict, has_class=False, add_start_end_tag=True)
x_test_22_T, x_test_22_G, x_test_22_V, _ = test_22_parts
del gen_data, test_22_parts

In [26]:
print("Test data")
# Text is reported by shape only; gene and variation also show their first element.
print("text",np.array(x_test_22_T).shape)
for label, component in (("gene", x_test_22_G), ("variation", x_test_22_V)):
    print(label, np.array(component).shape, component[0])

Test data
text (5668,)
gene (5668, 3) [352216, 136191, 352217]
variation (5668,) [352216, 327792, 352217]


In [27]:
# Pad/truncate at the end of each document, filling with the <UNK> index.
x_test_22_T = pad_sequences(
    x_test_22_T,
    maxlen=MAX_TEXT_LEN,
    padding="post",
    truncating="post",
    value=word_unknown_tag_idx,
)
print(x_test_22_T.shape)

(5668, 5000)


In [28]:
MAX_GENE_LEN = 1
MAX_VAR_LEN = 4
# Gene sequences are generated as [start, gene, end]. pad_sequences truncates
# from the front by default, so padding them straight to MAX_GENE_LEN = 1 kept
# only the end tag. Strip the two tags first so the surviving token is the gene
# itself; sequences shorter than three tokens are left alone, which keeps the
# cell safe to re-run on already padded arrays.
x_test_22_G = pad_sequences(
    [list(seq[1:-1]) if len(seq) > 2 else list(seq) for seq in x_test_22_G],
    maxlen=MAX_GENE_LEN, value=word_unknown_tag_idx)
x_test_22_V = pad_sequences(x_test_22_V, maxlen=MAX_VAR_LEN, value=word_unknown_tag_idx)

print(x_test_22_G.shape, x_test_22_V.shape)

(5668, 1) (5668, 4)


In [29]:
# Labels for the test rows; the preview below shows one indicator column per class (class1..class9).
s1_solution_df = pd.read_csv("../../data_prep/dataset/stage2/stage1_solution.csv")
s1_solution_df.head()

Unnamed: 0,ID,class1,class2,class3,class4,class5,class6,class7,class8,class9
0,0,0,0,1,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,1
2,2,0,0,0,0,0,0,1,0,0
3,3,0,1,0,0,0,0,0,0,0
4,4,0,0,0,0,0,0,0,0,1


In [30]:
# Select the nine indicator columns class1..class9 in order.
class_columns = ['class%d' % class_no for class_no in range(1, 10)]
test_matrix = s1_solution_df[class_columns]
# Zero-based class index per row: position of the first entry equal to 1.
x_test_22_Cp = []
for indicator_row in test_matrix.values:
    x_test_22_Cp.append(np.where(indicator_row==1)[0][0])

In [31]:
# x_test_22_Cp already holds zero-based class indices (the column position of
# the 1 in each indicator row), so it must not be shifted by -1 like the
# 1-based train labels were: doing so turned class index 0 into -1, which
# to_categorical writes into the last column.
x_test_22_C = np.array(x_test_22_Cp)
x_test_22_C = np_utils.to_categorical(x_test_22_C, 9)
print(x_test_22_C.shape)

(5668, 9)


## Embedding layer

### for words

In [32]:
# Dimensionality of the word embedding vectors (Embedding output width below).
WORD_EMB_SIZE = 200

In [33]:
%autoreload
import global_utils
ft_file_path = "/home/bicepjai/Projects/Deep-Survey-Text-Classification/data_prep/processed/stage1/pretrained_word_vectors/ft_sg_200d_50e.vec"
trained_embeddings = global_utils.get_embeddings_from_ft(ft_file_path, WORD_EMB_SIZE, corpus_vocab_list)
trained_embeddings.shape

(352220, 200)

### for characters

In [34]:
# Dimensionality of the character embedding vectors.
CHAR_EMB_SIZE = 100

In [35]:
# Random character embeddings drawn from a seeded generator, so the matrix is
# identical on every run instead of depending on global numpy RNG state.
char_rng = np.random.RandomState(random_state_number)
char_embeddings = char_rng.randn(global_utils.CHAR_ALPHABETS_LEN, CHAR_EMB_SIZE)
char_embeddings.shape

(75, 100)

# Models

## prep

In [36]:
import tensorflow.contrib.keras as keras
import tensorflow as tf

from keras import backend as K

from keras.engine import Layer, InputSpec, InputLayer

from keras.models import Model, Sequential

from keras.layers import Dropout, Embedding, concatenate
from keras.layers import Conv1D, MaxPool1D, Conv2D, MaxPool2D, ZeroPadding1D
from keras.layers import Dense, Input, Flatten, BatchNormalization
from keras.layers import Concatenate, Dot, Merge, Multiply, RepeatVector
from keras.layers import Bidirectional, TimeDistributed
from keras.layers import SimpleRNN, LSTM, GRU, Lambda, Permute

from keras.layers.core import Reshape, Activation
from keras.optimizers import Adam
from keras.callbacks import ModelCheckpoint,EarlyStopping,TensorBoard
from keras.constraints import maxnorm
from keras.regularizers import l2

%autoreload

## model_1: paper

In [39]:
# Input: one row of MAX_TEXT_LEN word indices per document.
text_seq_input = Input(shape=(MAX_TEXT_LEN,), dtype='int32')
# Embedding initialised from the pretrained vectors and left trainable.
text_embedding = Embedding(vocab_size, WORD_EMB_SIZE, input_length=MAX_TEXT_LEN,
                            weights=[trained_embeddings], trainable=True)(text_seq_input)

# One conv + max-pool branch per kernel width, all reading the same embedding;
# each branch pools with a window equal to its kernel width.
filter_sizes = [3,4,5]
convs = []
for filter_size in filter_sizes:
    l_conv = Conv1D(filters=128, kernel_size=filter_size, padding='same', activation='relu')(text_embedding)
    l_pool = MaxPool1D(filter_size)(l_conv)
    convs.append(l_pool)

# Branch outputs are joined along axis 1, then passed through a second conv stage.
l_merge = Concatenate(axis=1)(convs)
l_cov1= Conv1D(128, 5, activation='relu')(l_merge)
# since the text is too long we are maxpooling over 100
# and not GlobalMaxPool1D
l_pool1 = MaxPool1D(100)(l_cov1)
l_flat = Flatten()(l_pool1)
l_dense = Dense(128, activation='relu')(l_flat)
# 9-way softmax, one unit per class.
l_out = Dense(9, activation='softmax')(l_dense)
model_1 = Model(inputs=[text_seq_input], outputs=l_out)


In [40]:
# Categorical cross-entropy with Adam at its default settings; accuracy is tracked per epoch.
model_1.compile(
    optimizer=Adam(),
    loss='categorical_crossentropy',
    metrics=['categorical_accuracy'],
)
model_1.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_1 (InputLayer)             (None, 5000)          0                                            
____________________________________________________________________________________________________
embedding_1 (Embedding)          (None, 5000, 200)     70444000    input_1[0][0]                    
____________________________________________________________________________________________________
conv1d_1 (Conv1D)                (None, 5000, 128)     76928       embedding_1[0][0]                
____________________________________________________________________________________________________
conv1d_2 (Conv1D)                (None, 5000, 128)     102528      embedding_1[0][0]                
___________________________________________________________________________________________

### training

In [44]:
# Use the TensorBoard class imported from the same `keras` package as the
# model, ModelCheckpoint and EarlyStopping. The name `keras` in this notebook
# is bound to tensorflow.contrib.keras, so `keras.callbacks.TensorBoard` mixed
# a callback from a different Keras distribution into model.fit.
tb_callback = TensorBoard(log_dir='./tb_graphs', histogram_freq=0, write_graph=True, write_images=True)

In [45]:
# Save the best-so-far weights (by validation accuracy) to the file that the
# no-CV training and prediction cells load ("model_11_weights.hdf5"). The
# previous target, "model_1_weights.hdf5", was never read back, so prediction
# ran without the trained weights.
checkpointer = ModelCheckpoint(filepath="model_11_weights.hdf5",
                               verbose=1,
                               monitor="val_categorical_accuracy",
                               save_best_only=True,
                               mode="max")

In [46]:
# Early-stopping callback on validation accuracy with a patience of 5 epochs.
# NOTE(review): it is not included in the `callbacks` lists of the fit calls below.
earlystopping = EarlyStopping(
    monitor='val_categorical_accuracy',
    min_delta=0,
    patience=5,
    verbose=0,
    mode='auto',
)

### no CV

In [46]:
# Train model_1 on the fixed train/validation split inside a fresh session.
with tf.Session() as sess:
    # Initialise all variables first, then overwrite them from a checkpoint if one exists.
    sess.run(tf.global_variables_initializer())
    try:
        model_1.load_weights("model_11_weights.hdf5")
    except IOError as ioe:
        print("no checkpoints available !")
    
    # TensorBoard logging and best-weights checkpointing are the only callbacks in use.
    model_1.fit(x_train_22_T, x_train_22_C, 
          validation_data=(x_val_22_T, x_val_22_C),
          epochs=10, batch_size=64,shuffle=True,
          callbacks=[tb_callback,checkpointer])

no checkpoints available !
Train on 2988 samples, validate on 333 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


#### prediction

In [47]:
from sklearn.metrics import classification_report,confusion_matrix
# Score model_1 on the test documents in a fresh session, after restoring
# the saved weights when the file is present.
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    try:
        model_1.load_weights("model_11_weights.hdf5")
    except IOError:
        print("no checkpoints available !")

    y_pred = model_1.predict(x_test_22_T)
    # Predicted class = index of the largest softmax output per row.
    y_classes = np.argmax(y_pred, axis=-1)

for report in (confusion_matrix, classification_report):
    print(report(x_test_22_Cp, y_classes))

[[ 67  28   2  54  16   9 245   0   1]
 [ 85  71   2  87   8  13 473   0   2]
 [ 76  48   3  64   8   6 441   0   4]
 [ 79  57   1 101  16   9 447   0   1]
 [ 77  52   1  70  27  10 451   0   0]
 [ 90  56   2  65  14  21 454   0   6]
 [ 77  75   2  56  19   7 503   0   0]
 [ 72  47   3  72  11   6 456   0   2]
 [ 24  32   0  32  10   4 232   0   6]]
             precision    recall  f1-score   support

          0       0.10      0.16      0.13       422
          1       0.15      0.10      0.12       741
          2       0.19      0.00      0.01       650
          3       0.17      0.14      0.15       711
          4       0.21      0.04      0.07       688
          5       0.25      0.03      0.05       708
          6       0.14      0.68      0.23       739
          7       0.00      0.00      0.00       669
          8       0.27      0.02      0.03       340

avg / total       0.16      0.14      0.09      5668



  'precision', 'predicted', average, warn_for)


### CV

In [47]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score

In [51]:
# Checkpoint callback for the CV run: keeps only the weights with the highest
# validation accuracy seen so far.
checkpointer = ModelCheckpoint(
    filepath="model_12_chk.hdf5",
    monitor="val_categorical_accuracy",
    mode="max",
    save_best_only=True,
    verbose=1,
)

In [52]:
# 5-fold stratified CV over the full labelled set. The shuffle is seeded so
# the fold assignment is the same on every run.
cv_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=random_state_number)

# NOTE(review): model_12 ends up as another reference to model_1 (not a copy);
# the per-fold best weights live in "model_12_weights.hdf5".
# NOTE(review): each fold starts from the weights saved by earlier folds, whose
# training data includes the current validation rows, so fold accuracies are
# not independent estimates.
model_12 = None
model_acc = 0
for index, (train_indices, val_indices) in enumerate(cv_kfold.split(x_22_T, x_22_Cp)):

    print("fold",index,"=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*")
    xtrain, xval = x_22_T[train_indices], x_22_T[val_indices]
    ytrain, yval = x_22_Cp[train_indices], x_22_Cp[val_indices]
    # The splitter works on integer labels; the model needs one-hot targets.
    ytrain = np_utils.to_categorical(np.array(ytrain), 9)
    yval = np_utils.to_categorical(np.array(yval), 9)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        try:
            model_1.load_weights("model_12_weights.hdf5")
        except IOError as ioe:
            print("no checkpoints available !")

        model_1.fit(xtrain, ytrain,
              validation_data=(xval, yval),
              epochs=10, batch_size=64,shuffle=True,
              callbacks=[tb_callback,checkpointer])

        # Keep the weights of the fold with the best validation accuracy so far.
        loss, acc = model_1.evaluate(xval, yval, verbose=0)
        if model_acc < acc:
            model_12 = model_1
            model_12.save_weights("model_12_weights.hdf5")
            model_acc = acc
        
    

fold 0 =*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*
no checkpoints available !
Train on 2653 samples, validate on 668 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
fold 1 =*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*
Train on 2654 samples, validate on 667 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
fold 2 =*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*
Train on 2657 samples, validate on 664 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
fold 3 =*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*
Train on 2659 samples, validate on 662 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
fold 4 =*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*
Train on 2661 samples, validate on 660 samples
Epoch 1/10
Epo



#### predictions

In [53]:
from sklearn.metrics import classification_report,confusion_matrix
# Score model_1 on the test documents using the weights saved by the CV loop,
# when that file is present.
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    try:
        model_1.load_weights("model_12_weights.hdf5")
    except IOError:
        print("no checkpoints available !")

    y_pred = model_1.predict(x_test_22_T)
    # Predicted class = index of the largest softmax output per row.
    y_classes = np.argmax(y_pred, axis=-1)

for report in (confusion_matrix, classification_report):
    print(report(x_test_22_Cp, y_classes))

[[155  63   2  45   8  10 135   1   3]
 [222 134   4  71   7  19 278   2   4]
 [194 122   4  54   4  12 253   3   4]
 [202 132   1  91   9  17 254   1   4]
 [186 116   2  69  20  15 276   2   2]
 [220 123   3  55   6  28 261   2  10]
 [193 134   8  55  18   7 315   5   4]
 [194 110   5  56  11   6 275   4   8]
 [ 77  59   1  27   4   8 155   1   8]]
             precision    recall  f1-score   support

          0       0.09      0.37      0.15       422
          1       0.13      0.18      0.15       741
          2       0.13      0.01      0.01       650
          3       0.17      0.13      0.15       711
          4       0.23      0.03      0.05       688
          5       0.23      0.04      0.07       708
          6       0.14      0.43      0.21       739
          7       0.19      0.01      0.01       669
          8       0.17      0.02      0.04       340

avg / total       0.17      0.13      0.10      5668



## model_2: refined

In [37]:
# Input: one row of MAX_TEXT_LEN word indices per document.
text_seq_input = Input(shape=(MAX_TEXT_LEN,), dtype='int32')
# Embedding initialised from the pretrained vectors and left trainable.
text_embedding = Embedding(vocab_size, WORD_EMB_SIZE, input_length=MAX_TEXT_LEN,
                            weights=[trained_embeddings], trainable=True)(text_seq_input)

# Same branch layout as model_1, with three extra, much wider kernels (10, 30, 50).
filter_sizes = [3, 4, 5, 10, 30, 50]
convs = []
for filter_size in filter_sizes:
    l_conv = Conv1D(filters=128, kernel_size=filter_size, padding='same', activation='relu')(text_embedding)
    l_pool = MaxPool1D(filter_size)(l_conv)
    convs.append(l_pool)

# Branch outputs are joined along axis 1; the second conv adds L2 weight regularisation.
l_merge = Concatenate(axis=1)(convs)
l_cov1= Conv1D(128, 5, activation='relu', kernel_regularizer= l2(0.01))(l_merge)
l_pool1 = MaxPool1D(128)(l_cov1)

l_flat = Flatten()(l_pool1)
l_dense = Dense(128, activation='relu')(l_flat)
# 9-way softmax, one unit per class.
l_out = Dense(9, activation='softmax')(l_dense)
model_2 = Model(inputs=[text_seq_input], outputs=l_out)


In [38]:
# Categorical cross-entropy with Adam at its default settings; accuracy is tracked per epoch.
model_2.compile(
    optimizer=Adam(),
    loss='categorical_crossentropy',
    metrics=['categorical_accuracy'],
)
model_2.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_2 (InputLayer)             (None, 5000)          0                                            
____________________________________________________________________________________________________
embedding_1 (Embedding)          (None, 5000, 200)     70444000    input_2[0][0]                    
____________________________________________________________________________________________________
conv1d_1 (Conv1D)                (None, 5000, 128)     76928       embedding_1[0][0]                
____________________________________________________________________________________________________
conv1d_2 (Conv1D)                (None, 5000, 128)     102528      embedding_1[0][0]                
___________________________________________________________________________________________

### training

In [39]:
# Use the TensorBoard class imported from the same `keras` package as the
# model, ModelCheckpoint and EarlyStopping. The name `keras` in this notebook
# is bound to tensorflow.contrib.keras, so `keras.callbacks.TensorBoard` mixed
# a callback from a different Keras distribution into model.fit.
tb_callback = TensorBoard(log_dir='./tb_graphs', histogram_freq=0, write_graph=True, write_images=True)

In [44]:
# Checkpoint callback for the no-CV run of model_2: keeps only the weights
# with the highest validation accuracy seen so far.
checkpointer = ModelCheckpoint(
    filepath="model_21_weights.hdf5",
    monitor="val_categorical_accuracy",
    mode="max",
    save_best_only=True,
    verbose=1,
)

In [41]:
# Early-stopping callback on validation accuracy with a patience of 5 epochs.
# NOTE(review): it is not included in the `callbacks` lists of the fit calls below.
earlystopping = EarlyStopping(
    monitor='val_categorical_accuracy',
    min_delta=0,
    patience=5,
    verbose=1,
    mode='auto',
)

### no CV

In [67]:
# Train model_2 on the fixed train/validation split inside a fresh session.
with tf.Session() as sess:
    # Initialise all variables first, then overwrite them from a checkpoint if one exists.
    sess.run(tf.global_variables_initializer())
    try:
        model_2.load_weights("model_21_weights.hdf5")
    except IOError as ioe:
        print("no checkpoints available !")
    
    # TensorBoard logging and best-weights checkpointing are the only callbacks in use.
    model_2.fit(x_train_22_T, x_train_22_C, 
          validation_data=(x_val_22_T, x_val_22_C),
          epochs=10, batch_size=64,shuffle=True,
          callbacks=[tb_callback,checkpointer])

no checkpoints available !
Train on 2988 samples, validate on 333 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


#### prediction

In [69]:
from sklearn.metrics import classification_report,confusion_matrix
# Score model_2 on the test documents in a fresh session, after restoring
# the checkpointed weights when the file is present.
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    try:
        model_2.load_weights("model_21_weights.hdf5")
    except IOError:
        print("no checkpoints available !")

    y_pred = model_2.predict(x_test_22_T)
    # Predicted class = index of the largest softmax output per row.
    y_classes = np.argmax(y_pred, axis=-1)

for report in (confusion_matrix, classification_report):
    print(report(x_test_22_Cp, y_classes))

[[173  35   1  55  30   6 122   0   0]
 [226  91   1  76  41  11 295   0   0]
 [214  69   4  61  38   8 256   0   0]
 [218  81   0  97  49   6 260   0   0]
 [216  67   1  67  62   8 267   0   0]
 [246  76   1  62  51  21 251   0   0]
 [222 105   4  59  39   3 307   0   0]
 [208  77   3  61  41   9 269   0   1]
 [ 87  37   0  37  25   5 147   0   2]]
             precision    recall  f1-score   support

          0       0.10      0.41      0.16       422
          1       0.14      0.12      0.13       741
          2       0.27      0.01      0.01       650
          3       0.17      0.14      0.15       711
          4       0.16      0.09      0.12       688
          5       0.27      0.03      0.05       708
          6       0.14      0.42      0.21       739
          7       0.00      0.00      0.00       669
          8       0.67      0.01      0.01       340

avg / total       0.19      0.13      0.10      5668



  'precision', 'predicted', average, warn_for)


### CV

In [70]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score

In [71]:
# Checkpoint callback for the CV run of model_2: keeps only the weights with
# the highest validation accuracy seen so far.
checkpointer = ModelCheckpoint(
    filepath="model_22_chk.hdf5",
    monitor="val_categorical_accuracy",
    mode="max",
    save_best_only=True,
    verbose=1,
)

In [73]:
# 5-fold stratified CV over the full labelled set. The shuffle is seeded so
# the fold assignment is the same on every run.
cv_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=random_state_number)

# NOTE(review): model_22 ends up as another reference to model_2 (not a copy);
# the per-fold best weights live in "model_22_weights.hdf5".
# NOTE(review): each fold starts from the weights saved by earlier folds, whose
# training data includes the current validation rows, so fold accuracies are
# not independent estimates.
model_22 = None
model_acc = 0
for index, (train_indices, val_indices) in enumerate(cv_kfold.split(x_22_T, x_22_Cp)):

    print("fold",index,"=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*")
    xtrain, xval = x_22_T[train_indices], x_22_T[val_indices]
    ytrain, yval = x_22_Cp[train_indices], x_22_Cp[val_indices]
    # The splitter works on integer labels; the model needs one-hot targets.
    ytrain = np_utils.to_categorical(np.array(ytrain), 9)
    yval = np_utils.to_categorical(np.array(yval), 9)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        try:
            model_2.load_weights("model_22_weights.hdf5")
        except IOError as ioe:
            print("no checkpoints available !")

        model_2.fit(xtrain, ytrain,
              validation_data=(xval, yval),
              epochs=10, batch_size=64,shuffle=True,
              callbacks=[tb_callback,checkpointer])

        # Keep the weights of the fold with the best validation accuracy so far.
        loss, acc = model_2.evaluate(xval, yval, verbose=0)
        if model_acc < acc:
            model_22 = model_2
            model_22.save_weights("model_22_weights.hdf5")
            model_acc = acc
        
    

fold 0 =*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*
no checkpoints available !
Train on 2653 samples, validate on 668 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
fold 1 =*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*
Train on 2654 samples, validate on 667 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
fold 2 =*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*
Train on 2657 samples, validate on 664 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10


Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
fold 3 =*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*
Train on 2659 samples, validate on 662 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
fold 4 =*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*=*
Train on 2661 samples, validate on 660 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10




#### predictions

In [74]:
from sklearn.metrics import classification_report,confusion_matrix
# Score model_2 on the test documents using the weights saved by the CV loop,
# when that file is present.
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    try:
        model_2.load_weights("model_22_weights.hdf5")
    except IOError:
        print("no checkpoints available !")

    y_pred = model_2.predict(x_test_22_T)
    # Predicted class = index of the largest softmax output per row.
    y_classes = np.argmax(y_pred, axis=-1)

for report in (confusion_matrix, classification_report):
    print(report(x_test_22_Cp, y_classes))

[[148  46   2  86   5   8 123   1   3]
 [174 110   3 134   6  12 296   2   4]
 [153  86   5 117   5   6 269   2   7]
 [165  85   1 154   6  11 283   3   3]
 [174  93   3 125  21  10 258   3   1]
 [188  88   5 125   7  23 261   2   9]
 [170  99   7 116  16   4 321   3   3]
 [164 102   3 107  10   8 263   5   7]
 [ 70  41   1  59   6   7 146   2   8]]
             precision    recall  f1-score   support

          0       0.11      0.35      0.16       422
          1       0.15      0.15      0.15       741
          2       0.17      0.01      0.01       650
          3       0.15      0.22      0.18       711
          4       0.26      0.03      0.05       688
          5       0.26      0.03      0.06       708
          6       0.14      0.43      0.22       739
          7       0.22      0.01      0.01       669
          8       0.18      0.02      0.04       340

avg / total       0.18      0.14      0.10      5668

