<a href="https://colab.research.google.com/github/blainerothrock/nlp-group-2/blob/master/nlp_assignment_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# NLP Assignment 2 (Bengio and other Neural Language Models)

In [0]:
from __future__ import absolute_import, division, print_function

%tensorflow_version 2.x
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()
print(tf.__version__)

from google.colab import drive, files 
drive.mount('/content/drive')

import os, pickle
import numpy as np
import math
import typing

In [0]:
# List the entries of the current working directory.
os.listdir()

In [0]:
# Candidate data directories, one per collaborator / environment.
# NOTE(review): the first entry is an absolute path on one machine; the
# 'drive/...' entries are relative to the current working directory.
blaine_data_path = '/Users/blaine/Google Drive File Stream/My Drive/Winter20/nlp/nlp_group2/data'
data_path = 'drive/My Drive/Winter20/nlp/nlp_group2/data'
grant_data_path = 'drive/My Drive/nlp_group2/data'
sundar_data_path = 'drive/My Drive/nlp_group2/data'
z_data_path = 'drive/My Drive/nlp_group2/data'
# os.listdir() and open() do not expand '~', so resolve the home directory here.
sundar_local_path = os.path.expanduser('~/Workspaces/Q2/NLP/data')

In [0]:
# Active data directory. As written this self-assignment is a no-op that keeps
# the current `data_path`; presumably collaborators replace the right-hand side
# with one of the *_data_path variables for their own setup -- TODO confirm.
data_path = data_path

In [0]:
# Print the entries of the selected data directory.
print(os.listdir(data_path))

## Task 1
Split train corpus with `batch_size=30` and `window=5` 

In [0]:
# load data
def _load_pickle(file_name):
  """Unpickle `file_name` from `data_path`, closing the file handle afterwards."""
  # NOTE: unpickling can execute arbitrary code; only load files the group produced.
  with open(os.path.join(data_path, file_name), 'rb') as pickle_file:
    return pickle.load(pickle_file)

train = _load_pickle('group2.train.p')
vocab_dict = _load_pickle('group2.vocab_dict.p')
# The integer representation is rebuilt from `train` via `vocab_dict`.
# 'group2.int_train.p' is not read because its contents were immediately
# replaced by this list.
int_train = [vocab_dict[w] for w in train]

In [0]:
# Peek at the first ten token ids and the tokens they were built from.
# (The loaded corpus lives in `int_train` / `train`; no `*tagged*` variables
# are defined in this notebook.)
print(int_train[:10], '\n', train[:10])

print("vocab len: %i" % len(vocab_dict))
print("int rep: %s" % len(int_train))
print("train token: %s" % len(train))

In [0]:
# Build one 60-dimensional embedding row per training token.
word_ids = tf.constant(int_train)
# Embedding table: one row per vocabulary entry, initialised with
# tf.random_uniform (default range [0, 1) for floats).
embeddings = tf.Variable(tf.random_uniform([len(vocab_dict), 60]))
embed = tf.nn.embedding_lookup(embeddings, word_ids)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    # NOTE: this rebinds `embeddings` from the tf.Variable above to the
    # evaluated lookup result (a numpy array); later cells batch this array.
    embeddings = sess.run(embed)

# One row per training token, 60 columns.
embeddings.shape

In [0]:
# batch the train integer representations
def gen_batches(context_size, num_batches, data):
  """Split `data` into `num_batches` equally sized, contiguous batches.

  Trailing items that do not divide evenly are dropped, because `np.split`
  only accepts an exact division. The number of dropped items is printed.

  Args:
    context_size: Not used by the split; kept so existing calls keep working.
    num_batches: Number of batches to produce.
    data: Sequence or array supporting `len()` and slicing; it is split along
      its first axis.

  Returns:
    A list of `num_batches` numpy arrays, each holding
    `len(data) // num_batches` consecutive items. If `data` has fewer than
    `num_batches` items, every batch is empty.
  """
  num_data = len(data)

  # removing remainder tokens: what is left over after dividing the data into
  # `num_batches` equal parts. (Using the batch length as the modulus instead
  # can leave a size np.split rejects, e.g. 100 items into 30 batches.)
  remainder = num_data % num_batches
  print(remainder)
  data = data[:num_data - remainder]

  return np.split(np.array(data), num_batches, axis=0)

# Split the raw tokens and their embedding rows into 30 batches each
# (positional arguments are context_size=5, num_batches=30).
batches_words = gen_batches(5, 30, train)
batches_embeddings = gen_batches(5, 30, embeddings)

In [0]:
def print_seq(batch, window, seq_idx):
  """Print one (context, target) training example taken from `batch`.

  Args:
    batch: Indexable, sliceable sequence of tokens.
    window: Number of context tokens in the example.
    seq_idx: Index of the first context token.

  The context is `batch[seq_idx:seq_idx + window]` and the target is the item
  right after it; an IndexError is raised if that item does not exist.
  """
  target_pos = seq_idx + window
  context, target = batch[seq_idx:target_pos], batch[target_pos]

  print("input : ", context)
  print("target: [", target, "]\n")

In [0]:
# Show the first three (context, target) examples of the first two batches.
for header, batch_idx in (('--- batch 01 ---', 0), ('-- batch 02 --', 1)):
  print(header)
  for seq_idx in range(3):
    print_seq(batches_words[batch_idx], 5, seq_idx)

## Task 2: Bengio Style Feedforward network language model
- TensorFlow version: `2.1.0`

In [0]:
class BengioParams:
  """Hyper-parameters and precision settings for the Bengio language model."""

  # --- data layout ---
  context_window = 5            # tokens of context per prediction
  num_batches = 30              # also the row count of the model's X/Y placeholders
  vocab_len = len(vocab_dict)   # evaluated once, when this class body runs

  # --- network size ---
  embeddings_dim = 60           # width of one token embedding
  hidden_units = 50             # width of the tanh hidden layer

  # --- optimisation ---
  learning_rate = 0.5           # step size handed to the gradient-descent optimizer
  num_epochs = 20               # intended number of training epochs
  init_scale = 0.5              # variables are initialised uniformly in [-init_scale, init_scale]
  max_grad = 10.0               # presumably a gradient-clipping bound -- TODO confirm

  # --- runtime ---
  gpu_mem = 0.25                # presumably a GPU memory fraction -- TODO confirm
  tf_precision = tf.float32
  np_precision = np.float32


In [0]:
class BengioModel():
  """
  Bengio-style feed-forward language model built with the TensorFlow v1 graph
  API, according to the function (row-vector convention, one example per row):
    y = b + xW + tanh(d + xH)U

  and
    cost = softmax cross-entropy between the targets Y and the logits y

  Shapes, with rows = params.num_batches, |v| = params.vocab_len,
  in = params.context_window * params.embeddings_dim and
  h = params.hidden_units:
        Y = (rows, |v|)   target distribution per row (e.g. one-hot)
        X = (rows, in)    flattened context embeddings
        W = (in, |v|)     direct input-to-output weights
        b = (1, |v|)      output bias
        U = (h, |v|)      hidden-to-output weights
        d = (1, h)        hidden bias
        H = (in, h)       input-to-hidden weights

  Attributes:
    X, Y: input and target placeholders.
    W, H, U, b, d: trainable variables (created with tf.get_variable, so the
      enclosing variable scope supplies the initializer).
    a1: hidden activations, shape (rows, h).
    y_hat: unnormalised logits, shape (rows, |v|).
    cost: per-row cross-entropy, shape (rows,).
    optimizer: op performing one gradient-descent step on `cost`.
  """

  def __init__(self, params=BengioParams()):
    input_dim = params.context_window * params.embeddings_dim

    self.Y = tf.placeholder(
        params.tf_precision,
        shape=[params.num_batches, params.vocab_len],
        name="Y"
    )

    self.X = tf.placeholder(
        params.tf_precision,
        shape=[params.num_batches, input_dim],
        name="X"
    )

    self.W = tf.get_variable(
        name="W",
        shape=[input_dim, params.vocab_len],
        dtype=params.tf_precision
    )

    self.H = tf.get_variable(
        name="H",
        shape=[input_dim, params.hidden_units],
        dtype=params.tf_precision
    )

    self.U = tf.get_variable(
        name="U",
        shape=[params.hidden_units, params.vocab_len],
        dtype=params.tf_precision
    )

    self.b = tf.get_variable(
        name="b",
        shape=[1, params.vocab_len],
        dtype=params.tf_precision
    )

    self.d = tf.get_variable(
        name="d",
        shape=[1, params.hidden_units],
        dtype=params.tf_precision
    )

    # Hidden layer: (rows, in) x (in, h) -> (rows, h); d broadcasts over rows.
    self.a1 = tf.tanh(
        self.d + tf.matmul(self.X, self.H)
    )

    # Logits: both products are (rows, |v|), so they add without any
    # transposes; b broadcasts over rows.
    self.y_hat = self.b + tf.matmul(self.X, self.W) + tf.matmul(self.a1, self.U)

    # The v1 API rejects positional arguments here, so labels/logits must be
    # passed by keyword.
    self.cost = tf.nn.softmax_cross_entropy_with_logits(
        labels=self.Y,
        logits=self.y_hat,
        name='cross_entropy_coss_fn'
    )

    # `cost` is a vector; minimize() differentiates the sum of its entries.
    self.optimizer = tf.train.GradientDescentOptimizer(params.learning_rate).minimize(self.cost)

In [0]:
def run():
  """Train a `BengioModel` on the batched corpus with gradient descent.

  Relies on notebook globals built by earlier cells: `batches_embeddings`
  (per-batch embedding rows), `int_train` (word ids aligned with those rows),
  `gen_batches`, `BengioParams` and `BengioModel`.

  Each corpus batch is cut into sliding windows of `context_window`
  embeddings, flattened into one input row. The target of a window is the id
  of the word that follows it, one-hot encoded over the vocabulary. Windows
  are fed `num_batches` rows at a time because the model's placeholders are
  declared with that many rows; a trailing group with fewer rows is skipped.
  The mean cost of every epoch is printed.
  """
  params = BengioParams()
  window = params.context_window
  rows = params.num_batches

  # Targets must be word ids (not embeddings) so they can be one-hot encoded
  # to match the model's (rows, vocab_len) Y placeholder. Batching the ids the
  # same way as the embeddings keeps both lists aligned.
  batches_ids = gen_batches(window, params.num_batches, int_train)

  with tf.Session() as sess:

    initializer = tf.random_uniform_initializer(-params.init_scale,params.init_scale)
    with tf.variable_scope("BengioModel", reuse=tf.AUTO_REUSE, initializer=initializer):
      model = BengioModel(params=params)

    # Variables hold no value until their initializers have been run.
    sess.run(tf.global_variables_initializer())

    for epoch in range(params.num_epochs):
      epoch_costs = []
      for batch, batch_ids in zip(batches_embeddings, batches_ids):
        # A window starting at i needs a target at i + window, so the last
        # valid start is len(batch) - window - 1.
        num_windows = len(batch) - window
        for start in range(0, num_windows - rows + 1, rows):
          # Flatten each (window, embeddings_dim) slice into one input row.
          batch_x = np.stack([
              np.reshape(batch[idx:idx + window], -1)
              for idx in range(start, start + rows)
          ])

          # One-hot encode the id of the word following each window.
          target_ids = np.asarray(batch_ids[start + window:start + window + rows])
          batch_y = np.zeros((rows, params.vocab_len), dtype=params.np_precision)
          batch_y[np.arange(rows), target_ids] = 1.0

          _, cost = sess.run(
              [model.optimizer, model.cost],
              feed_dict={model.X: batch_x, model.Y: batch_y}
          )
          epoch_costs.append(np.mean(cost))

      if epoch_costs:
        print("epoch %i mean cost: %f" % (epoch + 1, np.mean(epoch_costs)))

In [0]:
# Build the model and run the training routine defined in the previous cell.
run()