<a href="https://colab.research.google.com/github/aishitdharwal/Neural-Structured-Learning/blob/master/IMDB_Graph_regularization_for_sentiment_classification_using_synthesized_graphs.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## **Classify reviews as positive or negative**

Import libraries

In [0]:
# Install the third-party packages imported in the next cell
# (neural_structured_learning and tensorflow_hub).
!pip install --quiet neural-structured-learning
!pip install --quiet tensorflow-hub

In [0]:
import matplotlib.pyplot as plt
import numpy as np

import neural_structured_learning as nsl

import tensorflow as tf
import tensorflow_hub as hub

# Reset the global Keras state before anything is built.
tf.keras.backend.clear_session()

# Work out the GPU status once, then report the runtime environment.
gpu_devices = tf.config.list_physical_devices("GPU")
gpu_status = "available" if gpu_devices else "NOT AVAILABLE"

print("Version: ", tf.__version__)
print("Eager mode: ", tf.executing_eagerly())
print("Hub version: ", hub.__version__)
print("GPU is", gpu_status)

* Get data

* Data: 25k training, 25k test reviews 
(equal number of positive and negative reviews in both)

* num_words=10000 keeps the 10000 most frequently occurring words


In [0]:
# Load the reviews with the vocabulary capped via num_words=10000, then
# unpack the train and test splits into features and labels.
imdb = tf.keras.datasets.imdb
train_split, test_split = imdb.load_data(num_words=10000)
x_train, y_train = train_split
x_test, y_test = test_split

In [0]:
# Number of training reviews; reused later to size the validation split.
training_samples_count = len(x_train)
print('Training entries: {}, labels: {}'.format(
    training_samples_count, len(y_train)))

In [0]:
# Show the first training review in its raw, integer-encoded form.
print(x_train[0])

In [0]:
# Compare the lengths of the first two reviews (sequences are padded to a
# common length later, in pad_sequence).
len(x_train[0]), len(x_train[1])

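*   Get the words and their pre-assigned indices.
*   Indices 0, 1, 2 and 3 are reserved in the encoded data for the special tokens `<PAD>`, `<START>`, `<UNK>` and `<UNUSED>`, so every word's pre-assigned index is shifted up by 3.
*   Reverse the `word_index` key->value pairs to obtain an index->word lookup.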

In [0]:
def build_reverse_word_index():
  """Returns a dict mapping each integer word ID back to its word.

  The IDs from `imdb.get_word_index()` are shifted up by 3 so that IDs 0-3
  can be assigned to the special tokens listed below.
  """
  # Special tokens take the first four IDs.
  reserved_tokens = {'<PAD>': 0, '<START>': 1, '<UNK>': 2, '<UNUSED>': 3}

  shifted_index = {
      word: index + 3 for word, index in imdb.get_word_index().items()
  }
  shifted_index.update(reserved_tokens)

  # Flip word -> ID into ID -> word.
  return {index: word for word, index in shifted_index.items()}


reverse_word_index = build_reverse_word_index()

def decode_review(text):
  """Converts a sequence of word IDs to a space-joined string.

  IDs that are missing from `reverse_word_index` are rendered as '?'.
  """
  words = (reverse_word_index.get(word_id, '?') for word_id in text)
  return ' '.join(words)

In [0]:
# Sanity check: turn the first training review back into readable text.
decode_review(x_train[0])

In [0]:
# Scratch directory for the TFRecord and graph files written by later cells.
!mkdir -p /tmp/imdb

* Using pretrained Swivel embeddings

* tf.reshape(x, shape=[-1]) flattens the array x

* swivel embedding converts text to a 20 element vector

In [0]:
# TF-Hub handle of the pretrained Swivel embedding module.
pretrained_embedding = 'https://tfhub.dev/google/tf2-preview/gnews-swivel-20dim/1'

# Keras layer wrapping that module; configured for scalar string inputs.
hub_layer = hub.KerasLayer(
    pretrained_embedding,
    input_shape=[],
    dtype=tf.string,
    trainable=True,
)

* convert vectors into tf compatible format

* tf.train.Feature wraps a list of data of a specific type so Tensorflow can understand it. It has a single attribute, which is a union of bytes_list/float_list/int64_list. Being a union, the stored list can be of type tf.train.BytesList (attribute name bytes_list), tf.train.FloatList (attribute name float_list), or tf.train.Int64List (attribute name int64_list).

* create_embeddings() converts each training review into its Swivel embedding and saves it as a feature in TFRecord (.tfr) format (easier and lighter to read by TensorFlow)

* the output 25000 is the value returned by create_embeddings(): the next unused record ID, which here equals the number of records written

In [0]:
def _int64_feature(value):
  """Wraps the contents of array `value` in an int64 tf.train.Feature."""
  int64_list = tf.train.Int64List(value=value.tolist())
  return tf.train.Feature(int64_list=int64_list)


def _bytes_feature(value):
  """Wraps string `value`, UTF-8 encoded, in a bytes tf.train.Feature."""
  encoded = value.encode('utf-8')
  return tf.train.Feature(bytes_list=tf.train.BytesList(value=[encoded]))


def _float_feature(value):
  """Wraps the contents of array `value` in a float tf.train.Feature."""
  float_list = tf.train.FloatList(value=value.tolist())
  return tf.train.Feature(float_list=float_list)


def create_embedding_example(word_vector, record_id):
  """Builds a tf.Example holding one review's sentence embedding and its ID.

  Args:
    word_vector: Sequence of word IDs for a single review.
    record_id: Identifier stored, as a string, under the 'id' feature.

  Returns:
    A `tf.train.Example` with the features 'id' and 'embedding'.
  """
  # The decoded review is reshaped to 1-D before being passed to the hub
  # layer, i.e. it is presented as a batch containing a single string.
  review_batch = tf.reshape(decode_review(word_vector), shape=[-1,])

  # Collapse the layer output back into a flat 1-D vector.
  flat_embedding = tf.reshape(hub_layer(review_batch), shape=[-1])

  return tf.train.Example(
      features=tf.train.Features(
          feature={
              'id': _bytes_feature(str(record_id)),
              'embedding': _float_feature(flat_embedding.numpy()),
          }))


def create_embeddings(word_vectors, output_path, starting_record_id):
  """Writes one embedding tf.Example per review to a TFRecord file.

  Args:
    word_vectors: Iterable of reviews, each a sequence of word IDs.
    output_path: Path of the TFRecord file to write.
    starting_record_id: ID given to the first review; later reviews get
      consecutive IDs.

  Returns:
    The next unused record ID (`starting_record_id` plus the number of
    records written).
  """
  first_id = int(starting_record_id)
  written = 0
  with tf.io.TFRecordWriter(output_path) as writer:
    for offset, word_vector in enumerate(word_vectors):
      example = create_embedding_example(word_vector, first_id + offset)
      writer.write(example.SerializeToString())
      written = offset + 1
  return first_id + written


# Store the training-set embeddings as TF.Example records in a TFRecord
# file, numbering the records from 0.
create_embeddings(x_train, '/tmp/imdb/embeddings.tfr', 0)

* create a graph from the features using nsl and save it in .tsv format

* this uses cosine similarity as the metric to compare the embeddings

In [0]:
# Build the similarity graph from the embeddings file, using a similarity
# threshold of 0.99, and write it to a TSV file.
nsl.tools.build_graph(['/tmp/imdb/embeddings.tfr'],
                      '/tmp/imdb/graph_99.tsv',
                      similarity_threshold=0.99)

* the graph can be read in Pandas

* For a similarity threshold=99%, there are 863786 edges created for the data.

In [0]:
import pandas as pd

# The graph file has no header row: each line is a single edge.
# NOTE(review): columns are presumably source ID, target ID and edge weight,
# per the NSL graph TSV format -- confirm against the file.
# header=None stops pandas from consuming the first edge as column names
# (which would drop it from the data).
df = pd.read_csv(
    '/tmp/imdb/graph_99.tsv',
    sep='\t',
    header=None,
    names=['source', 'target', 'weight'])
df

* This creates a .tfr file of train, test data which has id, indices array of data, label of data

* the output 50000 is just the number of data points covered (25k + 25k)

In [0]:
def create_example(word_vector, label, record_id):
  """Builds a tf.Example with a review's word IDs, its label and its ID.

  Args:
    word_vector: Sequence of word IDs for a single review.
    label: The review's label.
    record_id: Identifier stored, as a string, under the 'id' feature.

  Returns:
    A `tf.train.Example` with the features 'id', 'words' and 'label'.
  """
  word_ids = np.asarray(word_vector)
  # The label is wrapped in a one-element array so it can be stored as a list.
  label_array = np.asarray([label])

  feature_map = {
      'id': _bytes_feature(str(record_id)),
      'words': _int64_feature(word_ids),
      'label': _int64_feature(label_array),
  }
  return tf.train.Example(features=tf.train.Features(feature=feature_map))

def create_records(word_vectors, labels, record_path, starting_record_id):
  """Writes one (words, label, id) tf.Example per review to a TFRecord file.

  Args:
    word_vectors: Iterable of reviews, each a sequence of word IDs.
    labels: Iterable of labels, paired with `word_vectors` by position.
    record_path: Path of the TFRecord file to write.
    starting_record_id: ID given to the first review; later reviews get
      consecutive IDs.

  Returns:
    The next unused record ID (`starting_record_id` plus the number of
    records written).
  """
  first_id = int(starting_record_id)
  written = 0
  with tf.io.TFRecordWriter(record_path) as writer:
    for offset, (word_vector, label) in enumerate(zip(word_vectors, labels)):
      example = create_example(word_vector, label, first_id + offset)
      writer.write(example.SerializeToString())
      written = offset + 1
  return first_id + written

# Write the training and test sets (word vectors and labels) as TFRecord
# files. Training IDs start at 0; test IDs continue where training stopped.
next_record_id = create_records(x_train, y_train,
                                '/tmp/imdb/train_data.tfr', 0)
create_records(x_test, y_test, '/tmp/imdb/test_data.tfr',
               next_record_id)

* Pack the neighbor data (graph) together with the training data to create the augmented training data, with at most 3 neighbors per example (max_nbrs=3)

* pack_nbrs can also take unlabeled data alongside the labeled data (semi-supervised learning), but that input is left blank (an empty string) for this problem

In [0]:
# Combine the training records with the graph to produce the augmented
# training file, keeping at most 3 neighbors per example. The empty string is
# the (unused) unlabeled-data input.
nsl.tools.pack_nbrs(
    '/tmp/imdb/train_data.tfr',
    '',
    '/tmp/imdb/graph_99.tsv',
    '/tmp/imdb/nsl_train_data.tfr',
    add_undirected_edges=True,
    max_nbrs=3)

A class of hyperparameters for the model


-   **num_classes**: There are 2 classes -- *positive* and *negative*.

-   **max_seq_length**: This is the maximum number of words considered from each
    movie review in this example.

-   **vocab_size**: This is the size of the vocabulary considered for this
    example.

-   **distance_type**: This is the distance metric used to regularize the sample
    with its neighbors.

-   **graph_regularization_multiplier**: This controls the relative weight of
    the graph regularization term in the overall loss function.

-   **num_neighbors**: The number of neighbors used for graph regularization.
    This value has to be less than or equal to the `max_nbrs` argument used
    above when invoking `nsl.tools.pack_nbrs`.

-   **num_fc_units**: The number of units in the fully connected layer of the
    neural network.

-   **train_epochs**: The number of training epochs.

-   **batch_size**: Batch size used for training and evaluation.

-   **eval_steps**: The number of batches to process before deeming evaluation
    is complete. If set to `None`, all instances in the test set are evaluated.

In [0]:
# Building blocks of the neighbor feature names looked up in parse_example:
# '<prefix><i>_words' for the i-th neighbor's word IDs and
# '<prefix><i><suffix>' for the i-th neighbor's weight.
NBR_FEATURE_PREFIX = 'NL_nbr_'
NBR_WEIGHT_SUFFIX = '_weight'

In [0]:
class HParams:
  """Bundle of hyperparameters shared by the data pipeline and the models."""

  def __init__(self):
    # --- Dataset ---
    self.num_classes = 2
    self.max_seq_length = 256
    self.vocab_size = 10000

    # --- Graph regularization ---
    self.distance_type = nsl.configs.DistanceType.L2
    self.graph_regularization_multiplier = 0.1
    self.num_neighbors = 2

    # --- Model architecture ---
    self.num_embedding_dims = 16
    self.num_lstm_dims = 64
    self.num_fc_units = 64

    # --- Training ---
    self.train_epochs = 10
    self.batch_size = 128

    # --- Evaluation ---
    # None means every instance in the test set is evaluated.
    self.eval_steps = None


HPARAMS = HParams()

* pad the input to make them of the same length

* The graph regularization term uses a separate weight for each neighbor; a neighbor's weight defaults to 0.0 when that neighbor does not exist, so missing neighbors contribute nothing

* map(map_func, num_parallel_calls=None, deterministic=None)

Maps map_func across the elements of this dataset.

This transformation applies map_func to each element of this dataset, and returns a new dataset containing the transformed elements, in the same order as they appeared in the input. map_func can be used to change both the values and the structure of a dataset's elements. For example, adding 1 to each element, or projecting a subset of element components.


* create batches

```python
>>> dataset = tf.data.Dataset.range(8)
>>> dataset = dataset.batch(3)
>>> list(dataset.as_numpy_iterator())
[array([0, 1, 2]), array([3, 4, 5]), array([6, 7])]
```

In [0]:
def pad_sequence(sequence, max_seq_length):
  """Returns the values of `sequence` as a vector of length `max_seq_length`.

  Args:
    sequence: A `tf.SparseTensor` holding a variable-length sequence.
    max_seq_length: Length of the returned vector.

  Returns:
    The sequence values, zero-padded at the end when too short and cut off
    when too long.
  """
  current_length = tf.shape(sequence)[0]
  # Number of zeros to append, clamped at zero for over-long sequences.
  missing = tf.maximum([0], max_seq_length - current_length)
  zeros = tf.fill(missing, tf.cast(0, sequence.dtype))
  padded = tf.concat([sequence.values, zeros], axis=0)
  # `padded` now has at least max_seq_length entries; keep exactly that many.
  return tf.slice(padded, [0], [max_seq_length])

def parse_example(example_proto):
  """Parses one serialized example into model features and its label.

  Args:
    example_proto: An instance of `tf.train.Example`.

  Returns:
    A `(features, labels)` pair. `features` is a dict with the padded 'words'
    vector plus, for each of the `HPARAMS.num_neighbors` neighbors, its padded
    word vector and its weight. `labels` is the value of the 'label' feature.
  """
  neighbor_indices = range(HPARAMS.num_neighbors)
  nbr_word_keys = [
      '{}{}_{}'.format(NBR_FEATURE_PREFIX, i, 'words')
      for i in neighbor_indices
  ]
  nbr_weight_keys = [
      '{}{}{}'.format(NBR_FEATURE_PREFIX, i, NBR_WEIGHT_SUFFIX)
      for i in neighbor_indices
  ]

  # 'words' is a variable-length vector of word IDs; the label is a scalar.
  feature_spec = {
      'words': tf.io.VarLenFeature(tf.int64),
      'label': tf.io.FixedLenFeature((), tf.int64, default_value=-1),
  }
  for word_key, weight_key in zip(nbr_word_keys, nbr_weight_keys):
    feature_spec[word_key] = tf.io.VarLenFeature(tf.int64)
    # A missing neighbor weight defaults to 0.0, so neighbors that do not
    # exist are discounted by graph regularization.
    feature_spec[weight_key] = tf.io.FixedLenFeature(
        [1], tf.float32, default_value=tf.constant([0.0]))

  features = tf.io.parse_single_example(example_proto, feature_spec)

  # Bring the sample's and every neighbor's word vector to the same fixed
  # length, HPARAMS.max_seq_length.
  for key in ['words'] + nbr_word_keys:
    features[key] = pad_sequence(features[key], HPARAMS.max_seq_length)

  labels = features.pop('label')
  return features, labels

def make_dataset(file_path, training=False):
  """Builds a batched dataset of parsed examples from a TFRecord file.

  Args:
    file_path: Name of the file in the `.tfrecord` format containing
      `tf.train.Example` objects.
    training: Boolean; when true the records are shuffled (buffer of 10000)
      before being parsed.

  Returns:
    A dataset of `(features, labels)` batches of size `HPARAMS.batch_size`.
  """
  records = tf.data.TFRecordDataset([file_path])
  ordered = records.shuffle(10000) if training else records
  return ordered.map(parse_example).batch(HPARAMS.batch_size)

train_dataset = make_dataset('/tmp/imdb/nsl_train_data.tfr', training=True)
test_dataset = make_dataset('/tmp/imdb/test_data.tfr')

* A bidirectional LSTM model is used because the reviews are sequence data, for which an RNN is well suited. Words carry context far through a sentence, and an LSTM lets the model carry that context farther. Without an LSTM, the effect of far-away cells diminishes as we traverse through the neurons.


# Layers
* inputs: takes the input of size max_seq_length (the integer-encoded data), as specified in the hyperparameter class

* Embedding: takes the inputs layer and looks up the embedding for each word index, these embeddings are learned during training

```python
tf.keras.layers.Embedding(
    input_dim,
    output_dim,
    embeddings_initializer="uniform",
    embeddings_regularizer=None,
    activity_regularizer=None,
    embeddings_constraint=None,
    mask_zero=False,
    input_length=None,
    **kwargs
)
```

* LSTM: num_lstm_dims is the size of the output from the layer

* Output: taking output as the probability of being a positive review, so size=1


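* The Bidirectional layer runs the LSTM over the sequence in both directions, which doubles the number of LSTM parameters; the two outputs are concatenated by default, so num_lstm_dims=64 gives an output of size 128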

In [0]:
# # This function exists as an alternative to the bi-LSTM model used in this
# # notebook.
# def make_feed_forward_model():
#   """Builds a simple 2 layer feed forward neural network."""
#   inputs = tf.keras.Input(
#       shape=(HPARAMS.max_seq_length,), dtype='int64', name='words')
#   embedding_layer = tf.keras.layers.Embedding(HPARAMS.vocab_size, 16)(inputs)
#   pooling_layer = tf.keras.layers.GlobalAveragePooling1D()(embedding_layer)
#   dense_layer = tf.keras.layers.Dense(16, activation='relu')(pooling_layer)
#   outputs = tf.keras.layers.Dense(1, activation='sigmoid')(dense_layer)
#   return tf.keras.Model(inputs=inputs, outputs=outputs)


def make_bilstm_model():
  """Builds the bi-directional LSTM classifier.

  Returns:
    A `tf.keras.Model` that takes the 'words' input (int64 IDs, length
    `HPARAMS.max_seq_length`) and produces a single sigmoid output.
  """
  word_ids = tf.keras.Input(
      shape=(HPARAMS.max_seq_length,), dtype='int64', name='words')

  # Create the layers first, then wire them together in the same order.
  embed = tf.keras.layers.Embedding(HPARAMS.vocab_size,
                                    HPARAMS.num_embedding_dims)
  bilstm = tf.keras.layers.Bidirectional(
      tf.keras.layers.LSTM(HPARAMS.num_lstm_dims))
  hidden = tf.keras.layers.Dense(HPARAMS.num_fc_units, activation='relu')
  classifier = tf.keras.layers.Dense(1, activation='sigmoid')

  probabilities = classifier(hidden(bilstm(embed(word_ids))))
  return tf.keras.Model(inputs=word_ids, outputs=probabilities)


# Baseline model; any other architecture could be substituted here.
model = make_bilstm_model()
model.summary()

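Since it is a binary classification problem, the loss is taken as binary cross-entropy (`binary_crossentropy`), where $y$ is the true label and $p$ is the predicted probability of the positive class:

$$L = -\left(y \log(p) + (1 - y)\log(1 - p)\right)$$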

In [0]:
# Configure the baseline model: Adam optimizer, binary cross-entropy loss,
# accuracy reported as the metric.
model.compile(
    optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

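Create a validation set from the training data: 90% of the training batches are held out for validation, and only the remaining batches are used for training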

In [0]:
# Hold out 90% of the training batches for validation, leaving the
# remaining batches as the labeled training data.
validation_fraction = 0.9
validation_size = int(validation_fraction *
                      int(training_samples_count / HPARAMS.batch_size))
print(validation_size)

# Split an UNSHUFFLED pass over the training file. Calling take()/skip() on
# the shuffled `train_dataset` is unsafe: shuffle() reshuffles every time the
# dataset is iterated, so each epoch (and each validation pass) would cut a
# different permutation and examples would leak between the two sets.
# Rebuilding from the file also makes this cell safe to re-run.
# NOTE(review): assumes the record order in the file is not sorted by label,
# so the first `validation_size` batches are representative -- confirm.
unshuffled_train_dataset = make_dataset('/tmp/imdb/nsl_train_data.tfr')
validation_dataset = unshuffled_train_dataset.take(validation_size)

# Shuffle only the training portion, at example level, then re-batch.
train_dataset = (
    unshuffled_train_dataset.skip(validation_size)
    .unbatch()
    .shuffle(10000)
    .batch(HPARAMS.batch_size))

In [0]:
# Train the baseline model; `history` keeps the per-epoch metrics that are
# plotted in the cells below.
history = model.fit(
    train_dataset,
    validation_data=validation_dataset,
    epochs=HPARAMS.train_epochs,
    verbose=1)

In [0]:
# Evaluate the baseline model on the test set. The model was compiled with
# one loss and metrics=['accuracy'], so `results` holds those two values.
results = model.evaluate(test_dataset, steps=HPARAMS.eval_steps)
print(results)

In [0]:
# Per-epoch metrics recorded by fit(); list the available series.
history_dict = history.history
history_dict.keys()

In [0]:
# Pull the per-epoch series out of the baseline training history. `acc`,
# `val_acc` and `epochs` are reused by the accuracy plot in the next cell.
acc = history_dict['accuracy']
val_acc = history_dict['val_accuracy']
loss = history_dict['loss']
val_loss = history_dict['val_loss']

# Epochs are numbered from 1 on the x axis.
epochs = range(1, len(acc) + 1)

# "-r^" is for solid red line with triangle markers.
plt.plot(epochs, loss, '-r^', label='Training loss')
# "-bo" is for solid blue line with circle markers.
plt.plot(epochs, val_loss, '-bo', label='Validation loss')
plt.title('Training and validation loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend(loc='best')

plt.show()

In [0]:
# Baseline accuracy curves; uses `epochs`, `acc` and `val_acc` defined in the
# previous cell.
plt.clf()   # clear figure

plt.plot(epochs, acc, '-r^', label='Training acc')
plt.plot(epochs, val_acc, '-bo', label='Validation acc')
plt.title('Training and validation accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend(loc='best')

plt.show()

# Graph Regularization on the above created model

In [0]:
# Build a fresh bi-LSTM model (a separate instance from the baseline `model`)
# to be wrapped with graph regularization.
base_reg_model = make_bilstm_model()

In [0]:
# Wrap the base model with graph regularization. The neighbor count,
# regularization multiplier and distance type all come from HPARAMS.
graph_reg_config = nsl.configs.make_graph_reg_config(
    max_neighbors=HPARAMS.num_neighbors,
    multiplier=HPARAMS.graph_regularization_multiplier,
    distance_type=HPARAMS.distance_type,
    sum_over_axis=-1)
graph_reg_model = nsl.keras.GraphRegularization(base_reg_model,
                                                graph_reg_config)
# Same optimizer, loss and metric as the baseline model.
graph_reg_model.compile(
    optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [0]:
# Train the graph-regularized model on the same training and validation
# datasets, for the same number of epochs, as the baseline.
graph_reg_history = graph_reg_model.fit(
    train_dataset,
    validation_data=validation_dataset,
    epochs=HPARAMS.train_epochs,
    verbose=1)

In [0]:
# Evaluate the graph-regularized model on the same test set as the baseline.
graph_reg_results = graph_reg_model.evaluate(test_dataset, steps=HPARAMS.eval_steps)
print(graph_reg_results)

In [0]:
# Per-epoch metrics recorded while training the graph-regularized model;
# list the available series.
graph_reg_history_dict = graph_reg_history.history
graph_reg_history_dict.keys()

In [0]:
def _first_present(history, *keys):
  """Returns the series stored under the first of `keys` found in `history`.

  Returns None when none of the keys is present.
  """
  for key in keys:
    if key in history:
      return history[key]
  return None


# Pull the per-epoch series out of the graph-regularized training history.
# `acc`, `val_acc` and `epochs` are reused by the accuracy plot in the next
# cell.
acc = graph_reg_history_dict['accuracy']
val_acc = graph_reg_history_dict['val_accuracy']
loss = graph_reg_history_dict['loss']
val_loss = graph_reg_history_dict['val_loss']

# NOTE(review): the graph-loss series appears to be reported under a
# different name ('scaled_graph_loss') in later neural-structured-learning
# releases, and may be absent for validation -- confirm against the installed
# version. Accept either name and skip a curve that is missing, rather than
# failing with a KeyError.
graph_loss = _first_present(
    graph_reg_history_dict, 'graph_loss', 'scaled_graph_loss')
val_graph_loss = _first_present(
    graph_reg_history_dict, 'val_graph_loss', 'val_scaled_graph_loss')

# Epochs are numbered from 1 on the x axis.
epochs = range(1, len(acc) + 1)

plt.clf()   # clear figure

# "-r^" is for solid red line with triangle markers.
plt.plot(epochs, loss, '-r^', label='Training loss')
if graph_loss is not None:
  # "-gD" is for solid green line with diamond markers.
  plt.plot(epochs, graph_loss, '-gD', label='Training graph loss')
# "-bo" is for solid blue line with circle markers.
plt.plot(epochs, val_loss, '-bo', label='Validation loss')
if val_graph_loss is not None:
  # "-ms" is for solid magenta line with square markers.
  plt.plot(epochs, val_graph_loss, '-ms', label='Validation graph loss')
plt.title('Training and validation loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend(loc='best')

plt.show()

In [0]:
# Accuracy curves of the graph-regularized model; uses `epochs`, `acc` and
# `val_acc` defined in the previous cell.
plt.clf()   # clear figure

plt.plot(epochs, acc, '-r^', label='Training acc')
plt.plot(epochs, val_acc, '-bo', label='Validation acc')
plt.title('Training and validation accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend(loc='best')

plt.show()