In [1]:
'''
  Reference : https://github.com/roomylee/self-attentive-emb-tf
'''

import tensorflow as tf
import numpy as np
import pandas as pd
from tqdm import tqdm

## Parameters

In [2]:
class Config:
    # Data loading params
    labels_count = 4
    max_sentence_length = 50
    dev_sample_percentage = 0.1
    
    # Model hyperparameters
    embedding_dim = 300 # Dimensionality of word embedding
    hidden_size= 256 # Size of LSTM hidden layer
    d_a_size = 350 # Size of W_s1 embedding
    r_size = 30 # Size of W_s2 embedding
    fc_size = 2000 # Size of fully connected layer
    p_coef = 1.0 # Coefficient for penalty
    
    # Training parameters
    batch_size = 64
    num_epochs = 100
    display_every = 500
    evaluate_every = 1000
    num_checkpoints = 4
    learning_rate = 1e-3


## Dataset 

Load AG's news topic classification dataset

In [3]:
import nltk
import re
import os
import csv

class Dataset:
    def clean_str(self, string):
        """
        Tokenization/string cleaning for all datasets except for SST.
        Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
        """
        string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
        string = re.sub(r"\'s", " \'s", string)
        string = re.sub(r"\'ve", " \'ve", string)
        string = re.sub(r"n\'t", " n\'t", string)
        string = re.sub(r"\'re", " \'re", string)
        string = re.sub(r"\'d", " \'d", string)
        string = re.sub(r"\'ll", " \'ll", string)
        string = re.sub(r",", " , ", string)
        string = re.sub(r"!", " ! ", string)
        string = re.sub(r"\(", " \( ", string)
        string = re.sub(r"\)", " \) ", string)
        string = re.sub(r"\?", " \? ", string)
        string = re.sub(r"\s{2,}", " ", string)
        return string.strip().lower()

    def load_data_and_labels(self, path):
        data = []
        labels = []
        with open(path, 'r') as f:
            rdr = csv.reader(f, delimiter=',', quotechar='"')
            for row in rdr:
                txt = ""
                for s in row[1:]:
                    txt = txt + re.sub("^\s*(.-)\s*$", "%1", s).replace("\\n", "\n") + " "
                txt = self.clean_str(txt)
                data.append(txt)
                labels.append(int(row[0]))

        data = np.asarray(data)
        labels = np.asarray(labels)
        
        df = pd.DataFrame({'data': data, 'labels': labels})
        return df
    
    def download_and_load_datasets(self):
        dataset = tf.keras.utils.get_file(
          fname="AG_news_data.zip", 
          origin="https://s3.ap-northeast-2.amazonaws.com/bowbowbow-storage/dataset/AG_news_data.zip", 
          extract=True)
        
        train_file = 'AG_news_data/train.csv'
        test_file = 'AG_news_data/test.csv'
        
        train_df = self.load_data_and_labels(os.path.join(os.path.dirname(dataset), train_file))
        test_df = self.load_data_and_labels(os.path.join(os.path.dirname(dataset), test_file))
        return train_df, test_df

dataset = Dataset()
train_df, test_df = dataset.download_and_load_datasets()
train_df.head()

Unnamed: 0,data,labels
0,wall st bears claw back into the black \( reut...,3
1,carlyle looks toward commercial aerospace \( r...,3
2,oil and economy cloud stocks' outlook \( reute...,3
3,iraq halts oil exports from main southern pipe...,3
4,"oil prices soar to all time record , posing ne...",3


## Model

In [4]:
class SelfAttention:
    def __init__(self, 
               sequence_length, 
               num_classes, 
               vocab_size, 
               embedding_size, 
               hidden_size,
               d_a_size,
               r_size,
               fc_size,
               p_coef,
            ):
        
        self.input_text = tf.placeholder(tf.int32, shape=[None, sequence_length], name='input_text')
        self.input_y = tf.placeholder(tf.float32, shape=[None, num_classes], name='input_y')
        
        text_length = self._length(self.input_text)
        initializer = tf.contrib.layers.xavier_initializer()
        
        # Embedding layer
        with tf.name_scope('Embeddings'):
            self.W_text = tf.Variable(tf.random_uniform([vocab_size, embedding_size], -1.0, 1.0), name='W_text')
            self.embedded_chars = tf.nn.embedding_lookup(self.W_text, self.input_text)
        
        # Bidirectional RNN
        with tf.name_scope('bi-lstm'):
            fw_cell = tf.nn.rnn_cell.BasicLSTMCell(hidden_size)
            bw_cell = tf.nn.rnn_cell.BasicLSTMCell(hidden_size)
            (self.output_fw, self.output_bw), states = tf.nn.bidirectional_dynamic_rnn(
                cell_fw=fw_cell,
                cell_bw=bw_cell,
                inputs=self.embedded_chars,
                sequence_length=text_length,
                dtype=tf.float32)
            
            self.H = tf.concat([self.output_fw, self.output_bw], axis=2) # [batch_size, sequence_length, 2*hidden_size]
            H_reshape = tf.reshape(self.H, [-1, 2*hidden_size]) # [batch_size * sequence_length, 2*hidden_size]
        
        with tf.name_scope('self-attention'):
            self.W_s1 = tf.get_variable('W_s1', shape=[2*hidden_size, d_a_size], initializer=initializer)
            _H_s1 = tf.nn.tanh(tf.matmul(H_reshape, self.W_s1)) # [batch_size * sequence_length, d_a_size]
            self.W_s2 = tf.get_variable('W_s2', shape=[d_a_size, r_size], initializer=initializer)
            _H_s2 = tf.matmul(_H_s1, self.W_s2) # [batch_size * sequence_length, r_size]
            _H_s2_reshape = tf.transpose(tf.reshape(_H_s2, [-1, sequence_length, r_size]), [0, 2, 1]) # [batch_size, r_size, sequence_length] ]
            self.A = tf.nn.softmax(_H_s2_reshape, name='attention') # [batch_size, r_size, sequence_length]
            
        with tf.name_scope('sentence-embedding'):
            self.M = tf.matmul(self.A, self.H) # [batch_size, r_size, 2*hidden_size]
        
        with tf.name_scope('fully-connected'):
            self.M_flat = tf.reshape(self.M, shape=[-1, 2*hidden_size * r_size]) # [batch_size, 2*hidden_size * r_size]
            W_fc = tf.get_variable('W_fc', shape=[2*hidden_size * r_size, fc_size], initializer=initializer)
            b_fc = tf.Variable(tf.constant(0.1, shape=[fc_size]), name='b_fc')
            self.fc = tf.nn.relu(tf.nn.xw_plus_b(self.M_flat, W_fc, b_fc), name='fc') # [batch_size, fc_size]
            
        with tf.name_scope('output'):
            W_output = tf.get_variable('W_output', shape=[fc_size, num_classes], initializer=initializer)
            b_output = tf.Variable(tf.constant(0.1, shape=[num_classes]), name='b_output')
            
            self.logits = tf.nn.xw_plus_b(self.fc, W_output, b_output, name='logits') # [batch_size, num_classes]
            self.predictions = tf.argmax(self.logits, 1, name='predictions')
        
        with tf.name_scope("penalization"):
            A_T = tf.transpose(self.A, [0, 2, 1]) # [batch_size, sequence_length, r_size]
            self.AA_T = tf.matmul(self.A, A_T) # [batch_size, r_size, r_size]
            # copy identity matrix by batch_size. [r_size, r_size] -> [batch_size*r_size, r_size]
            I_ = tf.tile(tf.eye(r_size), [tf.shape(self.A)[0], 1]) # [batch_size*r_size, r_size]
            self.I = tf.reshape(I_, [-1, r_size, r_size]) # [batch_size, r_size, r_size]
            self.P = tf.square(tf.norm(self.AA_T - self.I, axis=[-2, -1], ord="fro"))
        
        # Calculate mean corss-entropy loss
        with tf.name_scope('loss'):
            losses = tf.nn.softmax_cross_entropy_with_logits_v2(logits=self.logits, labels=self.input_y)
            self.loss_P = tf.reduce_mean(self.P * p_coef)
            self.loss = tf.reduce_mean(losses) + self.loss_P
        
        # Accuracy    
        with tf.name_scope('accuracy'):
            correct_predictions = tf.equal(self.predictions, tf.argmax(self.input_y, 1))
            self.accuracy = tf.reduce_mean(tf.cast(correct_predictions, tf.float32), name='accuracy')
            
    # Length of the sequence data
    @staticmethod
    def _length(seq):
        relevant = tf.sign(tf.abs(seq))
        length = tf.reduce_sum(relevant, reduction_indices=1)
        length = tf.cast(length, tf.int32)
        return length

## Preprocessing

In [5]:
# Build vocabulary
vocab_processor = tf.contrib.learn.preprocessing.VocabularyProcessor(Config.max_sentence_length)
x = np.array(list(vocab_processor.fit_transform(train_df['data'])))
y = np.array([np.eye(Config.labels_count)[label - 1] for label in train_df['labels']])

print('Text Vocabulary Size {}'.format(len(vocab_processor.vocabulary_)))
print('X = {}'.format(x.shape))
print('Y = {}'.format(y.shape))

# Randomly shuffle data to split into train and dev
np.random.seed(10)
shuffle_indices = np.random.permutation(np.arange(len(y)))
x_shuffled, y_shuffled = x[shuffle_indices], y[shuffle_indices]

# Split train/dev set
dev_sample_index = -1 * int(Config.dev_sample_percentage*float(len(y)))
x_train, x_dev = x_shuffled[:dev_sample_index], x_shuffled[dev_sample_index:] 
y_train, y_dev = y_shuffled[:dev_sample_index], y_shuffled[dev_sample_index:] 
print("Train/Dev split: {:d}/{:d}\n".format(len(y_train), len(y_dev)))


For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
If you depend on functionality not listed there, please file an issue.

Instructions for updating:
Please use tensorflow/transform or tf.data.
Instructions for updating:
Please use tensorflow/transform or tf.data.
Instructions for updating:
Please use tensorflow/transform or tf.data.
Text Vocabulary Size 67789
X = (120000, 50)
Y = (120000, 4)
Train/Dev split: 108000/12000



## Function for train

In [6]:
# Pre-trained word2vec
def load_word2vec(embedding_dim, vocab):
    download_path = tf.keras.utils.get_file(
      fname="GoogleNews-vectors-negative300.bin.gz", 
      origin="https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz", 
      extract=True)
    
    embedding_path = os.path.join(os.path.dirname(download_path), 'GoogleNews-vectors-negative300.bin')
    if not os.path.exists(embedding_path):
        print('unzip :', embedding_path)
        import gzip
        import shutil
        with gzip.open('{}.gz'.format(embedding_path), 'rb') as f_in:
            with open(embedding_path, 'wb') as f_out:
                shutil.copyfileobj(f_in, f_out)

    print('embedding_path :', embedding_path)

    # initial matrix with random uniform
    initW = np.random.randn(len(vocab.vocabulary_), embedding_dim).astype(np.float32) / np.sqrt(len(vocab.vocabulary_))
    # load any vectors from the word2vec
    print("Load word2vec file {0}".format(embedding_path))
    with open(embedding_path, "rb") as f:
        header = f.readline()
        vocab_size, layer_size = map(int, header.split())
        binary_len = np.dtype('float32').itemsize * layer_size
        for line in range(vocab_size):
            word = []
            while True:
                ch = f.read(1).decode('latin-1')
                if ch == ' ':
                    word = ''.join(word)
                    break
                if ch != '\n':
                    word.append(ch)
            idx = vocab.vocabulary_.get(word)
            if idx != 0:
                initW[idx] = np.fromstring(f.read(binary_len), dtype='float32')
            else:
                f.read(binary_len)
    return initW

def batch_iter(data, batch_size, num_epochs, shuffle=True):
    """
    Generates a batch iterator for a dataset.
    """
    data = np.array(data)
    data_size = len(data)
    num_batches_per_epoch = int((len(data) - 1) / batch_size) + 1
    for epoch in range(num_epochs):
        # Shuffle the data at each epoch
        if shuffle:
            shuffle_indices = np.random.permutation(np.arange(data_size))
            shuffled_data = data[shuffle_indices]
        else:
            shuffled_data = data
        for batch_num in range(num_batches_per_epoch):
            start_index = batch_num * batch_size
            end_index = min((batch_num + 1) * batch_size, data_size)
            yield shuffled_data[start_index:end_index]

## Train

In [7]:
import datetime
import time

from sklearn.metrics import f1_score
import warnings
import sklearn.exceptions
warnings.filterwarnings("ignore", category=sklearn.exceptions.UndefinedMetricWarning)

tf.reset_default_graph()
sess = tf.Session()
with sess.as_default():
    model = SelfAttention(
        sequence_length=x_train.shape[1],
        num_classes=y_train.shape[1],
        vocab_size=len(vocab_processor.vocabulary_),
        embedding_size=Config.embedding_dim,
        hidden_size=Config.hidden_size,
        d_a_size=Config.d_a_size,
        r_size=Config.r_size,
        fc_size=Config.fc_size,
        p_coef=Config.p_coef
    )
    
    global_step = tf.Variable(0, name='global_step', trainable=False)
    train_op = tf.train.AdadeltaOptimizer(Config.learning_rate).minimize(model.loss, global_step=global_step)
    
    # Output directory for models and summary
    timestamp = str(int(time.time()))
    out_dir = os.path.abspath(os.path.join(os.path.curdir, "28.runs", timestamp))
    print("Writing to {}\n".format(out_dir))

    # Summaries for loss and accuracy
    loss_summary = tf.summary.scalar("loss", model.loss)
    acc_summary = tf.summary.scalar("accuracy", model.accuracy)
    
    # Train Summaries
    train_summary_op = tf.summary.merge([loss_summary, acc_summary])
    train_summary_dir = os.path.join(out_dir, "summaries", "train")
    train_summary_writer = tf.summary.FileWriter(train_summary_dir, sess.graph)

    # Dev summaries
    dev_summary_op = tf.summary.merge([loss_summary, acc_summary])
    dev_summary_dir = os.path.join(out_dir, "summaries", "dev")
    dev_summary_writer = tf.summary.FileWriter(dev_summary_dir, sess.graph)

    # Checkpoint directory. Tensorflow assumes this directory already exists so we need to create it
    checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints"))
    checkpoint_prefix = os.path.join(checkpoint_dir, "model")
    if not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir)
    saver = tf.train.Saver(tf.global_variables(), max_to_keep=Config.num_checkpoints)
    
    sess.run(tf.global_variables_initializer())

    pretrain_W = load_word2vec(Config.embedding_dim, vocab_processor)
    sess.run(model.W_text.assign(pretrain_W))
    print("Success to load pre-trained word2vec model!\n")
    
    # Generate batches
    batches = batch_iter(list(zip(x_train, y_train)), Config.batch_size,Config.num_epochs)
    
    # Training loop. For each batch...
    for batch in batches:
        x_batch, y_batch = zip(*batch)
        # Train
        feed_dict = {
            model.input_text: x_batch,
            model.input_y: y_batch,
        }
        _, step, summaries, loss, accuracy = sess.run(
            [train_op, global_step, train_summary_op, model.loss, model.accuracy], feed_dict)
        train_summary_writer.add_summary(summaries, step)

        # Training log display
        if step % Config.display_every == 0:
            time_str = datetime.datetime.now().isoformat()
            print("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, loss, accuracy))

        # Evaluation
        if step % Config.evaluate_every == 0:
            print("\nEvaluation:")
            feed_dict = {
                model.input_text: x_dev,
                model.input_y: y_dev,
            }
            summaries, loss, accuracy, predictions = sess.run(
                [dev_summary_op, model.loss, model.accuracy, model.predictions], feed_dict)
            dev_summary_writer.add_summary(summaries, step)

            time_str = datetime.datetime.now().isoformat()
            print("{}: step {}, loss {:g}, acc {:.4f}\n".format(time_str, step, loss, accuracy))



Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
This class is equivalent as tf.keras.layers.LSTMCell, and will be replaced by that in Tensorflow 2.0.
Instructions for updating:
Please use `keras.layers.Bidirectional(keras.layers.RNN(cell))`, which is equivalent to this API
Instructions for updating:
Please use `keras.layers.RNN(cell)`, which is equivalent to this API
Instructions for updating:
Use tf.cast instead.
I_ : Tensor("penalization/Tile:0", shape=(?, 30), dtype=float32)
I : Tensor("penalization/Reshape:0", shape=(?, 30, 30), dtype=float32)
Writing to /home/seungwon/project/tf-notes/28.runs/1557248376

embedding_path : /home/seungwon/.keras/datasets/GoogleNews-vectors-negative300.bin
Load word2vec file /home/seungwon/.keras/datasets/GoogleNews-vectors-negative300.bin




Success to load pre-trained word2vec model!

2019-05-08T02:00:25.094419: step 500, loss 30.4609, acc 0.6875
2019-05-08T02:00:54.153484: step 1000, loss 30.4122, acc 0.6875

Evaluation:
2019-05-08T02:00:55.903486: step 1000, loss 30.382, acc 0.8015

2019-05-08T02:01:24.984404: step 1500, loss 30.2903, acc 0.734375
2019-05-08T02:01:54.062674: step 2000, loss 30.1578, acc 0.828125

Evaluation:
2019-05-08T02:01:55.573597: step 2000, loss 30.2121, acc 0.8184

2019-05-08T02:02:24.681914: step 2500, loss 30.0708, acc 0.84375
2019-05-08T02:02:53.758449: step 3000, loss 30.1179, acc 0.78125

Evaluation:
2019-05-08T02:02:55.256589: step 3000, loss 30.0493, acc 0.8261

2019-05-08T02:03:24.368628: step 3500, loss 30.0016, acc 0.8125
2019-05-08T02:03:53.474670: step 4000, loss 29.8781, acc 0.859375

Evaluation:
2019-05-08T02:03:54.981363: step 4000, loss 29.9179, acc 0.8314

2019-05-08T02:04:24.093103: step 4500, loss 29.8669, acc 0.84375
2019-05-08T02:04:53.207673: step 5000, loss 29.7117, acc 0.9

2019-05-08T02:41:14.045445: step 41500, loss 29.6666, acc 0.84375
2019-05-08T02:41:43.145286: step 42000, loss 29.4119, acc 0.90625

Evaluation:
2019-05-08T02:41:44.645379: step 42000, loss 29.5219, acc 0.8793

2019-05-08T02:42:13.772007: step 42500, loss 29.6114, acc 0.859375
2019-05-08T02:42:42.868586: step 43000, loss 29.6038, acc 0.859375

Evaluation:
2019-05-08T02:42:44.373597: step 43000, loss 29.5206, acc 0.8793

2019-05-08T02:43:13.495066: step 43500, loss 29.3235, acc 0.953125
2019-05-08T02:43:42.634246: step 44000, loss 29.5575, acc 0.859375

Evaluation:
2019-05-08T02:43:44.122400: step 44000, loss 29.5192, acc 0.8804

2019-05-08T02:44:13.228692: step 44500, loss 29.5247, acc 0.84375
2019-05-08T02:44:42.350854: step 45000, loss 29.4009, acc 0.90625

Evaluation:
2019-05-08T02:44:43.846834: step 45000, loss 29.5179, acc 0.8807

2019-05-08T02:45:12.977389: step 45500, loss 29.4751, acc 0.859375
2019-05-08T02:45:42.091099: step 46000, loss 29.4298, acc 0.953125

Evaluation:
2019-

2019-05-08T03:21:33.805999: step 82000, loss 29.4768, acc 0.8888

2019-05-08T03:22:02.938439: step 82500, loss 29.4013, acc 0.90625
2019-05-08T03:22:32.053402: step 83000, loss 29.3702, acc 0.90625

Evaluation:
2019-05-08T03:22:33.533296: step 83000, loss 29.4745, acc 0.8892

2019-05-08T03:23:02.643502: step 83500, loss 29.3344, acc 0.9375
2019-05-08T03:23:31.760291: step 84000, loss 29.435, acc 0.859375

Evaluation:
2019-05-08T03:23:33.262918: step 84000, loss 29.4722, acc 0.8888

2019-05-08T03:24:02.385783: step 84500, loss 29.584, acc 0.859375
2019-05-08T03:24:31.490528: step 85000, loss 29.4631, acc 0.921875

Evaluation:
2019-05-08T03:24:32.998551: step 85000, loss 29.4693, acc 0.8890

2019-05-08T03:25:02.100001: step 85500, loss 29.3227, acc 0.953125
2019-05-08T03:25:31.206775: step 86000, loss 29.4638, acc 0.890625

Evaluation:
2019-05-08T03:25:32.695911: step 86000, loss 29.4659, acc 0.8889

2019-05-08T03:26:01.817551: step 86500, loss 29.6391, acc 0.765625
2019-05-08T03:26:30.9

2019-05-08T04:01:23.238976: step 122000, loss 27.709, acc 0.8843

2019-05-08T04:01:52.361120: step 122500, loss 27.7287, acc 0.90625
2019-05-08T04:02:21.455945: step 123000, loss 27.8605, acc 0.828125

Evaluation:
2019-05-08T04:02:22.983602: step 123000, loss 27.6808, acc 0.8838

2019-05-08T04:02:52.117607: step 123500, loss 27.4897, acc 0.90625
2019-05-08T04:03:21.243906: step 124000, loss 27.6135, acc 0.890625

Evaluation:
2019-05-08T04:03:22.750336: step 124000, loss 27.6532, acc 0.8839

2019-05-08T04:03:51.867588: step 124500, loss 27.4982, acc 0.921875
2019-05-08T04:04:20.999228: step 125000, loss 27.619, acc 0.890625

Evaluation:
2019-05-08T04:04:22.498024: step 125000, loss 27.6276, acc 0.8848

2019-05-08T04:04:51.618354: step 125500, loss 27.7777, acc 0.765625
2019-05-08T04:05:20.727951: step 126000, loss 27.3948, acc 0.96875

Evaluation:
2019-05-08T04:05:22.225073: step 126000, loss 27.5994, acc 0.8849

2019-05-08T04:05:51.338707: step 126500, loss 27.5812, acc 0.90625
2019-05

2019-05-08T04:39:42.407937: step 160500, loss 24.5367, acc 0.875
2019-05-08T04:40:11.533353: step 161000, loss 24.5078, acc 0.890625

Evaluation:
2019-05-08T04:40:13.048555: step 161000, loss 24.4375, acc 0.8860

2019-05-08T04:40:42.164108: step 161500, loss 24.2309, acc 0.9375
2019-05-08T04:41:11.302693: step 162000, loss 24.1572, acc 0.90625

Evaluation:
2019-05-08T04:41:12.794676: step 162000, loss 24.1974, acc 0.8840

2019-05-08T04:41:41.912345: step 162500, loss 24.0368, acc 0.875
2019-05-08T04:42:11.028818: step 163000, loss 23.9437, acc 0.90625

Evaluation:
2019-05-08T04:42:12.543906: step 163000, loss 23.946, acc 0.8840

2019-05-08T04:42:41.668303: step 163500, loss 23.8686, acc 0.90625
2019-05-08T04:43:10.791583: step 164000, loss 23.6363, acc 0.84375

Evaluation:
2019-05-08T04:43:12.283879: step 164000, loss 23.6849, acc 0.8839

2019-05-08T04:43:41.398538: step 164500, loss 23.7559, acc 0.859375
2019-05-08T04:44:10.537587: step 165000, loss 23.1917, acc 0.921875

Evaluation:


## Tensorboard

```
tensorboard --logdir=./28.runs --host 0.0.0.0
```