In [1]:
import numpy as np
import os,re,datetime
import pandas as pd
import pickle

from keras.preprocessing.text import Tokenizer
from keras.metrics import categorical_accuracy, binary_accuracy, top_k_categorical_accuracy
from keras.callbacks import CSVLogger
import tensorflow as tf
import keras.backend as K

from sklearn.preprocessing import MultiLabelBinarizer

import scipy.sparse
from tools.helper import MetricsAtTopK
from tools.MyClock import MyClock
from models import get_model
clk = MyClock()

# argparse
import argparse
parser = argparse.ArgumentParser(description = 'run baseline models')
parser.add_argument('-i','--input', required = True, type = str, help = 'input directory e.g. ./data/dl_amazon_1/')
parser.add_argument('-o','--output', required = True, type = str, help = 'output directory')
parser.add_argument('-m','--model', required = True, type = str, help = 'model, one in: xmlcnn, attentionxml, attention,')
parser.add_argument('--epoch', default = 5, type = int, help = 'epochs')
parser.add_argument('--batch_size', default = 0, type = int, help = 'batch size')
parser.add_argument('--early_stopping', default = False, action = 'store_true', help = 'early stopping using validation set (not implemented yet)')
parser.add_argument('--save_weights', default = True, action = 'store_true', help = 'save trained model weights')
parser.add_argument('--save_prediction', default = 10, type = int, help = 'save top k prediction and corresponding probabilities (not implemented yet)')
# args = parser.parse_args('-i data/sic_hiararchy -o woop -m xmlcnn --epoch 5'.split(' '))

args = parser.parse_args('-i data/dl_sic -o woop -m xmlcnn --epoch 5'.split(' '))


Using TensorFlow backend.


In [2]:
def binary_cross_entropy_with_logits(y_true, y_pred):
    return K.mean(K.binary_crossentropy(y_true,y_pred,from_logits=True),axis=-1)
def categorical_cross_entropy_with_logits(y_true, y_pred):
    return K.mean(K.categorical_crossentropy(y_true,y_pred,from_logits=True),axis=-1)
# metrics
def binary_accuracy_with_logits(y_true, y_pred):
    return K.mean(K.equal(y_true, K.tf.cast(K.less(0.0,y_pred), y_true.dtype)))
pat1 = MetricsAtTopK(k=1)
pat5 = MetricsAtTopK(k=5)
def p1(x,y):
    return pat1.precision_at_k(x,y)
def p5(x,y):
    return pat5.precision_at_k(x,y)

if not args.batch_size:
    if args.model == 'attention':
        args.batch_size = 25
    elif args.model == 'xmlcnn':
        args.batch_size = 128
    elif args.model == 'attentionxml':
        args.batch_size = 20

IN_DIR = args.input
OUT_DIR = args.output
in_dirs = {
    'embedding_matrix':'embedding_matrix.npy',
    'x_train':'x_train.npy',
    'x_test':'x_test.npy',}
for d in os.listdir(IN_DIR):
    if d.startswith('y_'):
        in_dirs[d.split('.')[0]]=d
for key,val in in_dirs.items():
    d = os.path.join(IN_DIR,val)
    if not os.path.exists(d):
        raise Exception('path does not exist: {}'.format(d))
    else:
        in_dirs[key] = d
if not os.path.exists(OUT_DIR):
    os.mkdir(OUT_DIR)
out_dir = os.path.join(
    args.output,
    datetime.datetime.now().strftime('%y%m%d_%H%M%S_{}'.format(args.model)),
)

# things
if not os.path.exists(IN_DIR):
    raise Exception('input path does not exist: {}'.format(IN_DIR))
print('READ DATA...')
embedding_matrix = np.load(in_dirs['embedding_matrix'])
x_train = np.load(in_dirs['x_train'])
x_test = np.load(in_dirs['x_test'])
y_trains = [scipy.sparse.load_npz(d).todense() for key,d in sorted(in_dirs.items()) if key.startswith('y_train')]
y_tests = [scipy.sparse.load_npz(d).todense() for key,d in sorted(in_dirs.items()) if key.startswith('y_test')]
labels_dims = [y_train.shape[-1] for y_train in y_trains]
num_words,embedding_dim = embedding_matrix.shape
max_sequence_length = x_train.shape[1]
print('Train: {}, Test: {}, Labels: {}, Vocab size: {}, Embedding: {}'.format(
    x_train.shape[0],x_test.shape[0],labels_dims,num_words-1,embedding_dim))

READ DATA...
Train: 588992, Test: 147247, Labels: [18, 77, 453, 538], Vocab size: 50000, Embedding: 300


## get multiple outputs

In [3]:
from keras.preprocessing.text import Tokenizer
from keras.layers import Dense, Input, GlobalMaxPooling1D, Flatten, Concatenate
from keras.layers import Conv1D, MaxPooling1D
from keras.models import Model
from keras.layers import CuDNNLSTM, Bidirectional, LSTM, Dropout
from keras.layers import TimeDistributed, Lambda, Softmax, merge
from keras.initializers import Constant
from keras.layers import Input, Embedding
import tensorflow as tf
import keras.backend as K

In [4]:
def pAt1(y_true,y_pred):
    return categorical_accuracy(y_true, y_pred)
def pAt5(y_true,y_pred):
    return top_k_categorical_accuracy(y_true, y_pred, k=5)

In [5]:
model_name = 'xmlcnn'
bottle_neck = 256
num_words,embedding_dim = embedding_matrix.shape
embedding_layer = Embedding(num_words,
                            embedding_dim,
                            embeddings_initializer=Constant(embedding_matrix),
                            input_length=max_sequence_length,
                            trainable=False)
sequence_input = Input(shape=(max_sequence_length,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)
if model_name == 'xmlcnn':
    filter_sizes = [2,4,8]
    pooling_units = 32
    convs = []
    for fsz in filter_sizes:
        l = Conv1D(filters = 128, kernel_size = fsz, strides = 2, activation = 'relu')(embedded_sequences)
        s = int(l.shape[-2])
        pool_size = s//pooling_units
        l = MaxPooling1D(pool_size,padding = 'same')(l)
        l = Flatten()(l)
        convs.append(l)
    x = Concatenate(axis=-1)(convs)
    x = Dense(bottle_neck, activation = 'relu')(x)
    x = Dropout(0.5)(x)
    outs = []
    for i,labels_dim in enumerate(labels_dims):
        outs.append(Dense(labels_dim, activation = None, name = 'out{}'.format(i))(x))
elif model_name == 'xmlcnn_2':
    filter_sizes = [2,4,8]
    pooling_units = 32
    convs = []
    for fsz in filter_sizes:
        l = Conv1D(filters = 128, kernel_size = fsz, strides = 2, activation = 'relu')(embedded_sequences)
        s = int(l.shape[-2])
        pool_size = s//pooling_units
        l = MaxPooling1D(pool_size,padding = 'same')(l)
        l = Flatten()(l)
        convs.append(l)
    x = Concatenate(axis=-1)(convs)
    outs = []
    for i,labels_dim in enumerate(labels_dims):
        if labels_dim<bottle_neck:
            x2 = Dense(bottle_neck, activation = 'relu')(x)
            x2 = Dropout(0.5)(x2)
        else:
            x2 = x
        outs.append(Dense(labels_dim, activation = None, name = 'out{}'.format(i))(x2))

W0722 22:43:55.495620 139731949930304 deprecation_wrapper.py:119] From /home/angela/env/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W0722 22:43:55.511729 139731949930304 deprecation_wrapper.py:119] From /home/angela/env/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0722 22:43:55.800088 139731949930304 deprecation_wrapper.py:119] From /home/angela/env/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W0722 22:43:55.815241 139731949930304 deprecation_wrapper.py:119] From /home/angela/env/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:3976: The name tf.nn.max_pool is deprecated. Please use tf.nn.max_pool2d instead.

W0722 22:43:55.871634 139731949

In [6]:
model = Model(sequence_input, outs)
model.compile(loss=categorical_cross_entropy_with_logits,
              optimizer='adam',
              metrics=[pAt1,pAt5])
print(model.summary())

W0722 22:43:55.926267 139731949930304 deprecation_wrapper.py:119] From /home/angela/env/lib/python3.6/site-packages/keras/optimizers.py:790: The name tf.train.Optimizer is deprecated. Please use tf.compat.v1.train.Optimizer instead.

W0722 22:43:55.942838 139731949930304 deprecation.py:323] From /home/angela/env/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:3298: softmax_cross_entropy_with_logits (from tensorflow.python.ops.nn_ops) is deprecated and will be removed in a future version.
Instructions for updating:

Future major versions of TensorFlow will allow gradients to flow
into the labels input on backprop by default.

See `tf.nn.softmax_cross_entropy_with_logits_v2`.



__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 200)          0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 200, 300)     15000300    input_1[0][0]                    
__________________________________________________________________________________________________
conv1d_1 (Conv1D)               (None, 100, 128)     76928       embedding_1[0][0]                
__________________________________________________________________________________________________
conv1d_2 (Conv1D)               (None, 99, 128)      153728      embedding_1[0][0]                
__________________________________________________________________________________________________
conv1d_3 (

In [7]:
model.fit(x_train, y_trains,
          batch_size = args.batch_size,
          epochs = args.epoch,
          validation_data = (x_test, y_tests),
#               callbacks = [csv_logger],
          shuffle=True,
         )

W0722 22:43:56.436262 139731949930304 deprecation_wrapper.py:119] From /home/angela/env/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:986: The name tf.assign_add is deprecated. Please use tf.compat.v1.assign_add instead.



Train on 588992 samples, validate on 147247 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f15d4820780>

In [8]:
model2 = Model(sequence_input, outs)
model2.compile(loss=binary_cross_entropy_with_logits,
               optimizer='adam',
               metrics=[pAt1,pAt5])

W0722 22:51:14.644289 139731949930304 deprecation.py:323] From /home/angela/env/lib/python3.6/site-packages/tensorflow/python/ops/nn_impl.py:180: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


In [9]:
model2.fit(x_train, y_trains,
          batch_size = args.batch_size,
          epochs = args.epoch,
          validation_data = (x_test, y_tests),
#               callbacks = [csv_logger],
          shuffle=True,
         )

Train on 588992 samples, validate on 147247 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f13a1b43320>

In [None]:
val_out0_pAt1: 0.4486 - val_out0_pAt5: 0.7201 - val_out1_pAt1: 0.6179 - val_out1_pAt5: 0.8793 - val_out2_pAt1: 0.8841 - val_out2_pAt5: 0.9808
            
val_out0_pAt1: 0.3923 - val_out0_pAt5: 0.6645 - val_out1_pAt1: 0.6012 - val_out1_pAt5: 0.8614 - val_out2_pAt1: 0.8972 - val_out2_pAt5: 0.9825
                        
                        
                        

In [None]:
# huge increase in the finer layer accuracy, mild decrease in the root accuracy

In [None]:
if not os.path.exists(out_dir):
    os.mkdir(out_dir)
csv_logger = CSVLogger(os.path.join(out_dir,'train.log'),append=False)
if args.early_stopping:
    pass
else:
    model.fit(x_train, y_train,
              batch_size = args.batch_size,
              epochs = args.epoch,
              validation_data = (x_test, y_test),
              callbacks = [csv_logger],
              shuffle=True,
             )
if args.save_weights:
    model.save_weights(os.path.join(out_dir,'weights.h5'))
if args.save_prediction:
    print('SAVE PREDICTIONS')
    k = args.save_prediction
    batch_size = x_test.shape[0]//100
    IND_DIR = os.path.join(out_dir,'prediction_{}_ind.txt'.format(k))
    LOGITS_DIR = os.path.join(out_dir,'prediction_{}_logits.txt'.format(k))
    f_ind = open(IND_DIR,'ab')
    f_logits = open(IND_DIR,'ab')
    s = x_test.shape[0]
    clk.tic()
    for i,start in enumerate(range(0,s,batch_size)):
        end = min(start+batch_size,s)
        x_batch = x_test[start:end,:]
        out_probs = model.predict(x_batch)
        ind = np.argsort(out_probs,axis=1)[:,-k:]
        ind = ind[:,::-1]
        logits = np.take_along_axis(out_probs, ind, axis=1)
        np.savetxt(f_ind,ind,fmt='%d')
        np.savetxt(f_logits,logits,fmt='%1.3f')
        print('{:0.0f}% {}'.format(end/s*100,clk.toc(False)),end='\r')
    f_ind.close()
    f_logits.close()
csv_path = os.path.join(out_dir,'args.csv')
pd.DataFrame.from_dict([vars(args)]).to_csv(csv_path)