<a href="https://colab.research.google.com/github/blainerothrock/nlp-group-2/blob/master/nlp_assignment_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# NLP Assignment 2 (Bengio and other Neural Language Models)

In [0]:
from __future__ import absolute_import, division, print_function

# %tensorflow_version 2.x
import tensorflow as tf
print(tf.__version__)

# from google.colab import drive, files 
#drive.mount('/content/drive')

import os, pickle
import numpy as np
import math
import typing

In [0]:
# Show the contents of the current working directory (helps locate the data folder).
os.listdir('.')

In [0]:
# Candidate locations of the shared data folder, one per collaborator/environment.
# NOTE(review): these are hard-coded, machine-specific paths; consider a single
# configurable DATA_DIR so the notebook runs unchanged on other machines.
blaine_data_path = '/Users/blaine/Google Drive File Stream/My Drive/Winter20/nlp/nlp_group2/data'
data_path = 'drive/My Drive/Winter20/nlp/nlp_group2/data'

# These three collaborators use the same relative path.
grant_data_path = sundar_data_path = z_data_path = 'drive/My Drive/nlp_group2/data'

# NOTE(review): '~' is not expanded by os.path.join/os.listdir/open; this path
# would need os.path.expanduser() before use.
sundar_local_path = '~/Workspaces/Q2/NLP/data'

In [0]:
# Select which candidate path is used for the rest of the notebook.
# This replaces the `data_path` value assigned in the previous cell.
data_path = blaine_data_path

In [0]:
# Sanity check: print the entries of the selected data directory.
print(os.listdir(data_path))

## Task 1
Split the training corpus into batches using `batch_size=30` and a context window of `window=5`.

In [0]:
# load data
def _load_pickle(filename):
  """Return the object unpickled from `filename` inside `data_path`.

  Args:
    filename: name of a pickle file located directly in `data_path`.

  Returns:
    Whatever object the pickle file contains.
  """
  # NOTE: unpickling can execute arbitrary code -- only load files from a trusted source.
  # The context manager guarantees the file handle is closed after loading.
  with open(os.path.join(data_path, filename), 'rb') as pickle_file:
    return pickle.load(pickle_file)

int_tagged_train = _load_pickle('group2.int_tagged_train.p')
tagged_train = _load_pickle('group2.tagged_train.p')
vocab_dict = _load_pickle('group2.vocab_dict.p')

In [0]:
# Preview the first ten entries of both training representations, one per line.
print(int_tagged_train[:10], tagged_train[:10], sep=' \n ')

In [0]:
# batch the train integer representations
def gen_batches(context_size, batch_size, data):
  """Split `data` into consecutive, equally sized batches.

  Trailing items that do not fill a complete batch are dropped.

  Args:
    context_size: currently unused; kept so existing callers keep working.
    batch_size: number of items per batch; must be a positive integer.
    data: sequence to split; must support len() and slicing.

  Returns:
    A list of numpy arrays, each holding `batch_size` consecutive items of
    `data`. The list is empty when `data` has fewer than `batch_size` items.

  Raises:
    ValueError: if `batch_size` is not positive.
  """
  if batch_size <= 0:
    raise ValueError("batch_size must be a positive integer")

  # Integer division keeps the section count an int (true division yields a float).
  num_batches = len(data) // batch_size
  if num_batches == 0:
    # np.array_split rejects zero sections, so report "no full batch" explicitly.
    return []

  # Drop the remainder relative to the requested batch size, not a fixed 30.
  usable = data[:num_batches * batch_size]
  return np.array_split(usable, num_batches)

# Batch the training data with the Task 1 settings.
# NOTE(review): `tagged_train` is passed here rather than `int_tagged_train` -- confirm this is intended.
batches = gen_batches(context_size=5, batch_size=30, data=tagged_train)

In [0]:
def print_seq(batch, window, seq_idx):
  """Print one (input, target) example taken from `batch`.

  The input is the `window` items starting at `seq_idx`; the target is the
  single item immediately after them.

  Args:
    batch: indexable, sliceable sequence of items.
    window: number of items shown as the input.
    seq_idx: position in `batch` where the input starts.
  """
  target_pos = seq_idx + window
  context = batch[seq_idx:target_pos]
  target = batch[target_pos]

  print("input : ", context)
  print("target: [", target, "]\n")

In [0]:
# Context size from the Task 1 statement (`window=5`); defined here so the
# calls below do not rely on a variable that no earlier cell creates.
window = 5

# Show the first three (input, target) examples of the first two batches.
for demo_label, demo_batch in (('--- batch 01 ---', batches[0]),
                               ('-- batch 02 --', batches[1])):
  print(demo_label)
  for seq_idx in range(3):
    print_seq(demo_batch, window, seq_idx)

## Task 2: Bengio-style feedforward neural network language model
- TensorFlow version: `2.1.0`

In [0]:
class BengioParams:
  """Hyperparameters and settings for the Bengio-style language model.

  All values are plain class attributes, so they can be read from the class
  itself or from an instance.
  """

  # --- data layout (same values as the Task 1 statement) ---
  batch_size = 30
  context_window = 5

  # --- numeric precision for TensorFlow and NumPy ---
  np_precision = np.float32
  tf_precision = tf.float32

  # --- model size ---
  embeddings_space = 60
  hidden_units = 50

  # --- optimisation settings (presumably consumed by a training loop -- TODO confirm) ---
  learning_rate = 0.5
  num_epochs = 20
  max_grad = 10.0
  init_scale = 0.5

  # --- resources ---
  gpu_mem = 0.25  # presumably a fraction of GPU memory -- TODO confirm


In [0]:

class BengioModel():
  """
  Work-in-progress TensorFlow (compat.v1 graph API) implementation of a
  Bengio-style neural network language model.

  The graph built in `__init__` currently computes:
    y_hat = b + matmul(W, x) + tanh(d + matmul(x, H))

  and
    cost = softmax cross-entropy between `y` and `y_hat`

  NOTE: dimensions are not correct yet! need clarification
  (may need to transpose)

  NOTE(review): `U` is created but never used in `y_hat`. Bengio et al. (2003)
  define y = b + Wx + U tanh(d + Hx) -- confirm whether `U` should multiply
  the tanh term.
  """

  # NOTE(review): the default `BengioParams()` instance is created once, when
  # this method is defined, and shared by every call that omits `params`.
  def __init__(self, params=BengioParams()):
    # Input placeholder of shape [context_window, 1].
    # NOTE(review): compat.v1 placeholders need graph mode; TF 2.x runs eagerly
    # by default -- confirm eager execution is disabled before instantiating.
    self.x = tf.compat.v1.placeholder(
        params.tf_precision, 
        shape=[params.context_window, 1],
        name="x"
    )

    # Target placeholder of shape [1].
    self.y = tf.compat.v1.placeholder(
        params.tf_precision,
        shape=[1],
        name="y"
    )

    # Weights W, H and U all share the shape [context_window, 1].
    self.W = tf.compat.v1.get_variable(
        name="W",
        shape=[params.context_window, 1],
        dtype=params.tf_precision
    )
    
    self.H = tf.compat.v1.get_variable(
        name="H",
        shape=[params.context_window, 1],
        dtype=params.tf_precision
    )

    # Created but not referenced by any op below.
    self.U = tf.compat.v1.get_variable(
        name="U",
        shape=[params.context_window, 1],
        dtype=params.tf_precision
    )

    # Bias terms b and d, each of shape [1].
    self.b = tf.compat.v1.get_variable(
        name="b",
        shape=[1],
        dtype=params.tf_precision
    )

    self.d = tf.compat.v1.get_variable(
        name="d",
        shape=[1],
        dtype=params.tf_precision
    )

    # Hidden activation: tanh(d + x @ H).
    # NOTE(review): x and H are both [context_window, 1], so the inner
    # dimensions of this matmul only agree when context_window == 1.
    self.a1 = tf.compat.v1.tanh(
        self.d + tf.matmul(self.x, self.H)
    )

    # Output: b + W @ x + a1. The same inner-dimension concern applies to W @ x.
    self.y_hat =  self.b + tf.matmul(self.W, self.x) + self.a1

    # Arguments are positional: `y` first, `y_hat` second.
    # NOTE(review): under the TF 2.x signature that is (labels, logits) -- confirm.
    self.cost = tf.nn.softmax_cross_entropy_with_logits(
        self.y,
        self.y_hat,
        name='cross_entropy_coss_fn'
    )

    