<a href="https://colab.research.google.com/github/blainerothrock/nlp-group-2/blob/master/nlp_assignment_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# NLP Assignment 2 (Bengio and other Neural Language Models)

In [0]:
from __future__ import absolute_import, division, print_function

%tensorflow_version 2.x
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()
print(tf.__version__)

from google.colab import drive, files 
drive.mount('/content/drive')

import os, pickle
import numpy as np
import math
import typing

In [0]:
# List the current working directory (os.curdir is '.', the default argument).
os.listdir(os.curdir)

In [0]:
# Per-user locations of the shared data directory. `data_path` is the one the
# following cells read from.
blaine_data_path = '/Users/blaine/Google Drive File Stream/My Drive/Winter20/nlp/nlp_group2/data'
data_path = 'drive/My Drive/Winter20/nlp/nlp_group2/data'
grant_data_path = 'drive/My Drive/nlp_group2/data'
sundar_data_path = 'drive/My Drive/nlp_group2/data'
z_data_path = 'drive/My Drive/nlp_group2/data'
# open() and os.listdir() do not expand '~', so resolve the home directory here.
sundar_local_path = os.path.expanduser('~/Workspaces/Q2/NLP/data')

In [0]:
# Choose the data directory used by the rest of the notebook; to run from another
# location, assign one of the per-user paths defined in the previous cell instead.
data_path = data_path

In [0]:
# Show what is in the selected data directory.
data_dir_listing = os.listdir(data_path)
print(data_dir_listing)

## Task 1
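Split the train corpus into 30 parallel streams (`batch_size=30`, passed to `gen_batches` as `num_batches`) and read it with a context window of 5 tokens (`window=5`).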

In [0]:
# load data
def _load_pickle(file_name):
  """Unpickle `file_name` from `data_path` and close the file afterwards.

  NOTE: unpickling can execute arbitrary code, so only load files from a
  trusted source.
  """
  with open(os.path.join(data_path, file_name), 'rb') as f:
    return pickle.load(f)

int_train = _load_pickle('group2.int_train.p')
int_validation = _load_pickle('group2.int_valid.p')
int_test = _load_pickle('group2.int_test.p')
train = _load_pickle('group2.train.p')
vocab_dict = _load_pickle('group2.vocab_dict.p')
# int_train = [vocab_dict[w] for w in train]

In [0]:
# Peek at the first ten tokens in both representations.
head = slice(0, 10)
print(int_train[head], '\n', train[head])

# Report the size of the vocabulary and of both corpus representations.
for template, collection in (
    ("vocab len: %i", vocab_dict),
    ("int rep: %s", int_train),
    ("train token: %s", train),
):
  print(template % len(collection))

In [0]:
# batch the train integer representations
def gen_batches(context_size, num_batches, data):
  """Split a token sequence into `num_batches` equal-length contiguous pieces.

  Args:
    context_size: Unused; kept so existing call sites keep working.
    num_batches: Number of equal-sized pieces to cut `data` into.
    data: Sequence of tokens (words or integer ids).

  Returns:
    A list of `num_batches` numpy arrays, each holding
    `len(data) // num_batches` consecutive tokens. Trailing tokens that do not
    fill a complete piece are dropped.

  Raises:
    ValueError: If `num_batches` is not a positive integer.
  """
  if num_batches <= 0:
    raise ValueError("num_batches must be a positive integer")

  num_data = len(data)

  # np.split needs the length to be an exact multiple of num_batches, so the
  # leftover is taken modulo num_batches (not modulo the per-batch length).
  remainder = num_data % num_batches
  print(remainder)
  data = data[:num_data - remainder]

  batches = np.split(np.array(data), num_batches, axis=0)
  return batches

# Cut the word corpus and its integer encoding into the same 30 pieces.
batches_words, batches_int = [gen_batches(5, 30, corpus) for corpus in (train, int_train)]
# batches_embeddings = gen_batches(5, 30, embeddings)

In [0]:
def print_seq(batch, window, seq_idx):
  """Print one example: `window` context tokens and the token that follows them.

  Args:
    batch: Indexable, sliceable token sequence.
    window: Number of context tokens to show.
    seq_idx: Position in `batch` where the context starts.
  """
  target_pos = seq_idx + window
  # Fetch both pieces before printing so a bad index fails before any output.
  context, following = batch[seq_idx:target_pos], batch[target_pos]

  print("input : ", context)
  print("target: [", following, "]\n")

In [0]:
# Show the first three windows of the first two batches.
for title, batch_idx in (('--- batch 01 ---', 0), ('-- batch 02 --', 1)):
  print(title)
  for seq_idx in range(3):
    print_seq(batches_words[batch_idx], 5, seq_idx)

## Task 2: Bengio Style Feedforward network language model
- TensorFlow version: `2.1.0` (used through the `tf.compat.v1` API with v2 behavior disabled)

In [0]:
class BengioParams():
  """Hyperparameter bundle for the Bengio-style feedforward language model.

  Args:
    vocab_dict: Vocabulary mapping; only its size is stored (as `vocab_len`).
  """

  def __init__(self, vocab_dict):
    # --- data layout ---
    self.num_batches = 30       # rows fed to the model per step
    self.context_window = 5     # context tokens per example
    self.vocab_len = len(vocab_dict)

    # --- network size ---
    self.embeddings_dim = 60
    self.hidden_units = 50

    # --- optimisation ---
    self.learning_rate = 0.5
    self.num_epochs = 20
    self.init_scale = 0.5       # presumably a weight-initialisation range -- TODO confirm
    self.max_grad = 10.0        # presumably a gradient-clipping bound -- TODO confirm

    # --- numeric precision ---
    self.np_precision = np.float32
    self.tf_precision = tf.float32

    # --- runtime ---
    self.gpu_mem = 0.25         # presumably the per-process GPU memory fraction -- TODO confirm


In [0]:
class BengioModel():
  """
  Bengio-style feedforward neural language model, built as a TF1 graph.

  The unnormalised next-word scores follow
    y = b + Wx + U tanh(d + Hx)
  where x is the concatenation of the context-word embeddings looked up in C.

  The cost is the mean softmax cross-entropy between y and the one-hot targets
  fed through `Y`; `perplexity` is exp(cost).

  Attributes:
    X: int32 placeholder of word ids, shape (num_batches, context_window).
    Y: placeholder of one-hot targets, shape (num_batches, vocab_len).
    C, W, H, d, U, b: trainable variables of the formula above.
    cost, perplexity: scalar tensors.
    optimizer: gradient-descent op that minimises `cost`.
  """

  def __init__(self, params):
    input_dim = params.context_window * params.embeddings_dim

    self.Y = tf.placeholder(
        dtype=params.tf_precision,
        shape=(params.num_batches, params.vocab_len),
        name="Y"
    )

    self.X = tf.placeholder(
        tf.int32,
        shape=(params.num_batches, params.context_window),
        name="X"
    )

    # embeddings
    # A standard deviation must be positive. The former stddev=-1 was only a
    # sign flip of a symmetric sample, so stddev=1.0 draws from the same
    # distribution.
    # NOTE(review): embeddings centred on -1 are unusual -- confirm the mean.
    self.C = tf.Variable(
        tf.truncated_normal(
            shape=(params.vocab_len, params.embeddings_dim),
            mean=-1.0,
            stddev=1.0
        ),
        dtype=params.tf_precision,
        name="C"
    )

    # direct input-to-output weights
    self.W = tf.Variable(
        tf.random_normal(shape=(params.vocab_len, input_dim)),
        name="W",
        dtype=params.tf_precision
    )

    # input-to-hidden weights and hidden bias
    self.H = tf.Variable(
        tf.random_normal(shape=(params.hidden_units, input_dim)),
        name="H",
        dtype=params.tf_precision
    )

    self.d = tf.Variable(
        tf.random_normal(shape=(params.hidden_units,)),
        name="d",
        dtype=params.tf_precision
    )

    # hidden-to-output weights and output bias
    self.U = tf.Variable(
        tf.random_normal((params.vocab_len, params.hidden_units)),
        name="U",
        dtype=params.tf_precision
    )

    self.b = tf.Variable(
        tf.random_normal(shape=(params.vocab_len, )),
        name="b",
        dtype=params.tf_precision
    )

    with tf.name_scope("Projection_Layer"):
      # Look up each context word and concatenate the embeddings per row.
      x = tf.nn.embedding_lookup(self.C, self.X)
      x = tf.reshape(x, shape=(params.num_batches, input_dim))

    with tf.name_scope("Hidden_Layer"):
      Hx = tf.matmul(x, tf.transpose(self.H))
      a = tf.nn.tanh(tf.add(Hx, self.d))

    with tf.name_scope("Output_Layer"):
      Ua = tf.matmul(a, tf.transpose(self.U))
      Wx = tf.matmul(x, tf.transpose(self.W))
      Y_hat = tf.add(self.b, tf.add(Wx, Ua))

    with tf.name_scope("Cost"):
      self.cost = tf.reduce_mean(
          tf.nn.softmax_cross_entropy_with_logits_v2(
              labels=self.Y,
              logits=Y_hat
          )
      )
      self.perplexity = tf.exp(self.cost)

    self.optimizer = tf.train.GradientDescentOptimizer(params.learning_rate).minimize(self.cost)

In [0]:
def spiltInputTarget(batch, win_idx, params):
  """Cut one (context, target) training pair out of a token-id stream.

  Args:
    batch: Sequence of integer token ids.
    win_idx: Index of the first context token.
    params: Object providing `context_window` and `vocab_len`.

  Returns:
    Tuple `(context, one_hot)`: the `context_window` ids starting at `win_idx`,
    and a length-`vocab_len` vector with a 1 at the id that follows them.
  """
  target_pos = win_idx + params.context_window
  context = batch[win_idx:target_pos]
  one_hot = np.zeros(params.vocab_len)
  one_hot[batch[target_pos]] = 1
  return context, one_hot


def _build_minibatch(batches, win_idx, params):
  """Collect the window starting at `win_idx` from every stream into one minibatch."""
  inputs, targets = [], []
  for stream in batches:
    _x, _y = spiltInputTarget(stream, win_idx, params)
    inputs.append(_x)
    targets.append(_y)
  return inputs, targets


def _count_windows(batches, params):
  """Number of complete (context, target) windows in the first stream of `batches`."""
  # A window needs context_window inputs plus one target token.
  return max(len(batches[0]) - params.context_window, 0)


def _evaluate(sess, model, params, batches):
  """Return (mean cost, perplexity) over every window of `batches`, without training."""
  num_windows = _count_windows(batches, params)
  if num_windows == 0:
    # Nothing to evaluate; keep the zero placeholders used previously.
    return 0.0, 0.0

  total_cost = 0.0
  for win_idx in range(num_windows):
    inputs, targets = _build_minibatch(batches, win_idx, params)
    total_cost += sess.run(model.cost, feed_dict={ model.X:inputs, model.Y:targets })

  # Every minibatch has the same number of rows, so the mean of the minibatch
  # costs is the mean cross-entropy of the whole set.
  mean_cost = total_cost / num_windows
  return mean_cost, float(np.exp(mean_cost))


def run(model, params, batches_train_int, batches_validation_int, batches_test_int):
  """Train `model` and evaluate it on validation and test data after each epoch.

  Each step feeds one window from every stream of `batches_train_int` (one row
  per stream) and applies a gradient-descent update.

  Args:
    model: Object exposing `X`, `Y`, `cost`, `perplexity` and `optimizer`.
    params: Hyperparameters; reads `num_epochs`, `context_window`, `vocab_len`
      and, if present, `gpu_mem`.
    batches_train_int: List of equal-length integer-id streams for training.
    batches_validation_int: Same layout, validation data.
    batches_test_int: Same layout, test data.

  Returns:
    Six lists: train perplexity and train cost (sampled every 1000 steps), then
    validation perplexity, validation cost, test perplexity and test cost (one
    entry per epoch, averaged over all windows of the set).
  """
  perplexity_history = []
  cost_history = []

  val_perplexity_history = []
  val_cost_history = []

  test_perplexity_history = []
  test_cost_history = []

  gpu_options = tf.GPUOptions(
      per_process_gpu_memory_fraction=getattr(params, 'gpu_mem', 0.25)
  )
  with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options, log_device_placement=True)) as sess:
    sess.run(tf.global_variables_initializer())
    step = 0
    for epoch in range(params.num_epochs):
      # training: one update per window position
      for win_idx in range(_count_windows(batches_train_int, params)):
        batch_x, batch_y = _build_minibatch(batches_train_int, win_idx, params)

        cost, perplexity, _ = sess.run(
            [model.cost, model.perplexity, model.optimizer],
            feed_dict={ model.X:batch_x, model.Y:batch_y }
        )

        if step % 1000 == 0:
          print("train: step {}, cost: {}, perplexity: {}".format(step, cost, perplexity))
          perplexity_history.append(perplexity)
          cost_history.append(cost)

        step += 1

      # validation
      val_cost, val_perplexity = _evaluate(sess, model, params, batches_validation_int)
      val_perplexity_history.append(val_perplexity)
      val_cost_history.append(val_cost)
      print("validation: epoch {}, cost: {}, perplexity: {}".format(epoch, val_cost, val_perplexity))

      # test
      test_cost, test_perplexity = _evaluate(sess, model, params, batches_test_int)
      test_perplexity_history.append(test_perplexity)
      test_cost_history.append(test_cost)
      print("test: epoch {}, cost: {}, perplexity: {}".format(epoch, test_cost, test_perplexity))

  return perplexity_history, cost_history, val_perplexity_history, val_cost_history, test_perplexity_history, test_cost_history


In [0]:
# brown

# read the brown corpus text file
# NOTE(review): only the first line is read -- this assumes the whole corpus is
# stored on a single line; confirm against the data file.
with open(os.path.join(data_path, 'brown_tokenized.txt'), 'r') as f:
  brown_tokens_all = f.readline().split(" ")

# count how often each token occurs
brown_tok_freq = {}
for tok in brown_tokens_all:
  brown_tok_freq[tok] = brown_tok_freq.get(tok, 0) + 1

# remove tokens with less than 3 freq
brown_tokens_all = [tok for tok in brown_tokens_all if brown_tok_freq[tok] >= 3]

# create vocab
brown_vocab = set(brown_tokens_all)

# create train, validation, test
brown_train = brown_tokens_all[:800000]
brown_validation = brown_tokens_all[800000:1000000]
brown_test = brown_tokens_all[1000000:]

print("size of brown vocab: %i" % len(brown_vocab))

# integer representation
# Ids are assigned in sorted token order. Iterating the set directly gives an
# order that can change between Python processes (string hash randomisation),
# which would make the integer corpora irreproducible from run to run.
brown_vocab_dict = {tok: i for i, tok in enumerate(sorted(brown_vocab))}

brown_train_int = [brown_vocab_dict[tok] for tok in brown_train]
brown_validation_int = [brown_vocab_dict[tok] for tok in brown_validation]
brown_test_int = [brown_vocab_dict[tok] for tok in brown_test]

In [0]:
# train on brown
# brown_train_int

brown_params = BengioParams(brown_vocab_dict)
brown_model = BengioModel(params=brown_params)

# Cut each split into 30 streams, in train / validation / test order.
brown_train_batches_int, brown_val_batches_int, brown_test_batches_int = [
    gen_batches(5, 30, split_ids)
    for split_ids in (brown_train_int, brown_validation_int, brown_test_int)
]

(
    brown_perplexity_history,
    brown_cost_history,
    brown_val_perplexity_history,
    brown_val_cost_history,
    brown_test_perplexity_history,
    brown_test_cost_history,
) = run(
    brown_model,
    brown_params,
    brown_train_batches_int,
    brown_val_batches_int,
    brown_test_batches_int,
)

In [0]:
import pickle

# Persist every history of the brown run in the data directory. Each file is
# opened in a `with` block so it is flushed and closed as soon as it is written.
brown_histories = {
    'brown_perplexity_history': brown_perplexity_history,
    'brown_cost_history': brown_cost_history,
    'brown_val_perplexity_history': brown_val_perplexity_history,
    'brown_val_cost_history': brown_val_cost_history,
    'brown_test_perplexity_history': brown_test_perplexity_history,
    'brown_test_cost_history': brown_test_cost_history,
}
for history_name, history in brown_histories.items():
  with open(os.path.join(data_path, history_name + '.p'), 'wb') as history_file:
    pickle.dump(history, history_file)

In [0]:
# train our model
class BengioModel2():
  """
  Bengio-style feedforward language model without direct input-to-output
  connections, built as a TF1 graph.

  The unnormalised next-word scores follow
    y = b + U tanh(d + Hx)
  where x is the concatenation of the context-word embeddings looked up in C.

  The cost is the mean softmax cross-entropy between y and the one-hot targets
  fed through `Y`; `perplexity` is exp(cost).

  Attributes:
    X: int32 placeholder of word ids, shape (num_batches, context_window).
    Y: placeholder of one-hot targets, shape (num_batches, vocab_len).
    C, H, d, U, b: trainable variables of the formula above.
    cost, perplexity: scalar tensors.
    optimizer: gradient-descent op that minimises `cost`.
  """

  def __init__(self, params):
    input_dim = params.context_window * params.embeddings_dim

    self.Y = tf.placeholder(
        dtype=params.tf_precision,
        shape=(params.num_batches, params.vocab_len),
        name="Y"
    )

    self.X = tf.placeholder(
        tf.int32,
        shape=(params.num_batches, params.context_window),
        name="X"
    )

    # embeddings
    # A standard deviation must be positive. The former stddev=-1 was only a
    # sign flip of a symmetric sample, so stddev=1.0 draws from the same
    # distribution.
    # NOTE(review): embeddings centred on -1 are unusual -- confirm the mean.
    self.C = tf.Variable(
        tf.truncated_normal(
            shape=(params.vocab_len, params.embeddings_dim),
            mean=-1.0,
            stddev=1.0
        ),
        dtype=params.tf_precision,
        name="C"
    )

    # No direct input-to-output weight matrix W in this variant.

    # input-to-hidden weights and hidden bias
    self.H = tf.Variable(
        tf.random_normal(shape=(params.hidden_units, input_dim)),
        name="H",
        dtype=params.tf_precision
    )

    self.d = tf.Variable(
        tf.random_normal(shape=(params.hidden_units,)),
        name="d",
        dtype=params.tf_precision
    )

    # hidden-to-output weights and output bias
    self.U = tf.Variable(
        tf.random_normal((params.vocab_len, params.hidden_units)),
        name="U",
        dtype=params.tf_precision
    )

    self.b = tf.Variable(
        tf.random_normal(shape=(params.vocab_len, )),
        name="b",
        dtype=params.tf_precision
    )

    with tf.name_scope("Projection_Layer"):
      # Look up each context word and concatenate the embeddings per row.
      x = tf.nn.embedding_lookup(self.C, self.X)
      x = tf.reshape(x, shape=(params.num_batches, input_dim))

    with tf.name_scope("Hidden_Layer"):
      Hx = tf.matmul(x, tf.transpose(self.H))
      a = tf.nn.tanh(tf.add(Hx, self.d))

    with tf.name_scope("Output_Layer"):
      Ua = tf.matmul(a, tf.transpose(self.U))
      Y_hat = tf.add(self.b, Ua)

    with tf.name_scope("Cost"):
      self.cost = tf.reduce_mean(
          tf.nn.softmax_cross_entropy_with_logits_v2(
              labels=self.Y,
              logits=Y_hat
          )
      )
      self.perplexity = tf.exp(self.cost)

    self.optimizer = tf.train.GradientDescentOptimizer(params.learning_rate).minimize(self.cost)

# Reload the group corpus. Each pickle is opened in a `with` block so the file
# handle is closed as soon as it has been read.
with open(os.path.join(data_path, 'group2.int_train.p'), 'rb') as f:
  int_train = pickle.load(f)
with open(os.path.join(data_path, 'group2.int_valid.p'), 'rb') as f:
  int_validation = pickle.load(f)
with open(os.path.join(data_path, 'group2.int_test.p'), 'rb') as f:
  int_test = pickle.load(f)
with open(os.path.join(data_path, 'group2.vocab_dict.p'), 'rb') as f:
  vocab_dict = pickle.load(f)

train_batches_int = gen_batches(5, 30, int_train)
val_batches_int = gen_batches(5, 30, int_validation)
test_batches_int = gen_batches(5, 30, int_test)

run01_params = BengioParams(vocab_dict=vocab_dict)
# The model reads `embeddings_dim`; assigning any other attribute name leaves
# the default of 60 in place.
run01_params.embeddings_dim = 100
run01_params.hidden_units = 100
run01_params.context_window = 5

run01_model = BengioModel2(params=run01_params)

(
    run01_perplexity_history,
    run01_cost_history,
    run01_val_perplexity_history,
    run01_val_cost_history,
    run01_test_perplexity_history,
    run01_test_cost_history,
) = run(run01_model, run01_params, train_batches_int, val_batches_int, test_batches_int)