<a href="https://colab.research.google.com/github/arnisafazla/Neural_Networks_Implementations/blob/main/Simple_Embedding.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import h5py
import numpy as np
import matplotlib.pyplot as plt
import math
import random
from numpy.random import seed
from numpy.random import randn
from numpy import linspace

In [None]:
# Mount Google Drive (Colab only) so that files under /content/drive can be read.
from google.colab import drive 
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# 250 x 250 identity matrix used as a lookup table: row i is the one-hot
# vector of word index i.
vocab = np.eye(250)
def one_hot(x):
  """Return the row(s) of `vocab` selected by word index (or indices) `x`."""
  encoded = vocab[x]
  return encoded
def inverse_one_hot(x):
  """Return the position of the largest entry of `x`.

  For a one-hot vector this is the encoded word index. np.argmax is called
  without an axis, so a multi-dimensional `x` is searched as a flat array.
  """
  position = np.argmax(x)
  return position
def sigmoid(v):
  """Logistic function 1 / (1 + exp(-v)), applied element-wise.

  The argument is first clipped to [-500, 500] so that np.exp cannot overflow.
  """
  clipped = np.clip(v, -500, 500)
  denominator = 1 + np.exp(-clipped)
  return 1 / denominator
def sigmoid_derivative(v):
  """Derivative of the logistic function at `v`: s * (1 - s) with s = sigmoid(v)."""
  s = sigmoid(v)
  return s * (1 - s)

In [None]:
# Load the train / validation / test splits from the HDF5 file.
# The handle is left open on purpose: `words` is read from it in a later cell.
file = h5py.File("/content/drive/MyDrive/EEE443/hw2/assign2_data2.h5", "r")
trainx = np.array(file['trainx'])
traind = np.array(file['traind'])
valx = np.array(file['valx'])
vald = np.array(file['vald'])
testx = np.array(file['testx'])
testd = np.array(file['testd'])

# One shape per split: train, validation, test.
print(trainx.shape, valx.shape, testx.shape)

# Shift every index down by one so it can be used directly as a row number of
# `vocab` / `words` (presumably the file stores 1-based indices -- TODO confirm).
# Safe to re-run: the arrays are freshly loaded above in this same cell.
trainx -= 1
valx -= 1
testx -= 1
traind -= 1
vald -= 1
testd -= 1

(372500, 3) (46500, 3) (46500, 3)


In [None]:
# Vocabulary entry of the still-open HDF5 file; words[i] is the byte string of word index i.
words = file['words']

In [None]:
words[0]

b'all'

In [None]:
# One-hot encode every split with a single array lookup per split instead of a
# Python loop over each row and word (same values, same shapes).
# NOTE: these dense arrays are large (X_train alone is 372500 x 3 x 250 floats).
# The Model cells below are called with the raw index arrays (trainx, traind, ...)
# and encode batches themselves.
X_train = one_hot(trainx)
X_val = one_hot(valx)
X_test = one_hot(testx)
Y_train = one_hot(traind)
Y_val = one_hot(vald)
Y_test = one_hot(testd)

In [None]:
# Scratch check: draw 5 entries from a small index list and look them up again.
index = list(range(20))
chosen = np.random.choice(index, 5)
np.take(index, chosen)

array([ 9,  3, 16, 11, 11])

In [None]:
chosen

array([ 9,  3, 16, 11, 11])

In [None]:
class Model(object):
  """Next-word predictor: embedding -> sigmoid hidden layer -> softmax.

  One forward pass maps a batch of 250-wide word vectors to a batch of
  250-wide probability vectors. Training teaches the network to map each word
  of a trigram to the word that follows it (word1 -> word2, word2 -> word3,
  word3 -> label). `predict` and `test` chain the network so that each of the
  three words contributes a guess for the fourth word, and average the three.

  Weights are updated with an RMSprop-style rule (running average of squared
  gradients with decay `alpha`). A constant -1 is appended to the inputs of
  the hidden and output layers and acts as the bias input.

  Relies on the notebook-level helpers `one_hot`, `sigmoid`,
  `sigmoid_derivative`, `softmax` and `cross_entropy`; the cells defining them
  must have been run before any method other than `__init__` is called.
  """

  VOCAB_SIZE = 250  # width of the one-hot word vectors (matches `vocab`)
  EPSILON = 1e-16   # keeps the RMSprop denominator away from zero

  def __init__(self, D, P, learning_rate, alpha=0.85):
    """Create a model with small random weights.

    Args:
      D: embedding size.
      P: number of hidden units.
      learning_rate: step size of the RMSprop-style update.
      alpha: decay of the running average of squared gradients.
    """
    self.D = D
    self.P = P
    self.learning_rate = learning_rate
    self.alpha = alpha
    # Weights ~ N(0, 0.01); the "+ 1" columns hold the bias weights.
    self.R = np.random.normal(0, 0.01, (self.VOCAB_SIZE, D))
    self.W_hidden = np.random.normal(0, 0.01, (P, D + 1))
    self.W_output = np.random.normal(0, 0.01, (self.VOCAB_SIZE, P + 1))
    # Running averages of squared gradients, one per weight matrix.
    self.r_R = np.zeros(self.R.shape)
    self.r_hidden = np.zeros(self.W_hidden.shape)
    self.r_output = np.zeros(self.W_output.shape)

  @staticmethod
  def _with_bias(activations):
    """Append a constant -1 column (the bias input) to a 2-D batch."""
    bias = np.full((activations.shape[0], 1), -1.0)
    return np.concatenate([activations, bias], axis=1)

  def _rmsprop_update(self, gradient, cache, batch_size):
    """Return (weight update, new squared-gradient average) for one matrix."""
    cache = self.alpha * cache + (1 - self.alpha) * gradient ** 2
    update = - (self.learning_rate * gradient / (self.EPSILON + np.sqrt(cache))) / batch_size
    return update, cache

  def _encode_trigrams(self, X):
    """One-hot encode the three word columns of an (N, 3) index array."""
    X = np.asarray(X)
    return one_hot(X[:, 0]), one_hot(X[:, 1]), one_hot(X[:, 2])

  def _chained_average(self, X1, X2, X3):
    """Average the fourth-word guesses obtained from each word of the trigram.

    The first word is pushed through the network three times, the second
    twice and the third once; every chain ends in a probability vector, so
    the three results can be averaged.
    """
    from_first = self.forward_pass(self.forward_pass(self.forward_pass(X1)))
    from_second = self.forward_pass(self.forward_pass(X2))
    from_third = self.forward_pass(X3)
    return (from_first + from_second + from_third) / 3

  def forward_pass(self, X):
    """Run a batch through the network and return the softmax probabilities.

    Args:
      X: (batch, 250) array -- one-hot rows, or probability rows when chaining.

    The input and the intermediate activations are cached on `self` because
    `back_propogate` reads them.
    """
    self.X = X
    self.embedded = self._with_bias(np.matmul(self.X, self.R))  # input to the hidden layer
    self.v = np.matmul(self.embedded, self.W_hidden.T)
    self.hidden_output = self._with_bias(sigmoid(self.v))       # input to the output layer
    self.z = np.matmul(self.hidden_output, self.W_output.T)     # input to the softmax function
    return softmax(self.z)

  def back_propogate(self, derivative, batch_size):
    """Compute the weight updates for the most recent `forward_pass`.

    Args:
      derivative: (batch, 250) gradient of the loss w.r.t. the softmax inputs,
        i.e. softmax output minus one-hot target.
      batch_size: number of rows in the batch; the updates are divided by it.

    Returns:
      (update_W_output, update_W_hidden, update_R). The weights themselves are
      not modified here; only the running squared-gradient averages are.
    """
    gradient_W_output = np.matmul(derivative.T, self.hidden_output)
    self.update_W_output, self.r_output = self._rmsprop_update(
        gradient_W_output, self.r_output, batch_size)

    # now hidden layer (the bias column of W_output does not propagate back)
    W_output_raw = self.W_output[:, :-1]
    delta = np.multiply(np.matmul(derivative, W_output_raw), sigmoid_derivative(self.v)).T
    gradient_W_hidden = np.matmul(delta, self.embedded)
    self.update_W_hidden, self.r_hidden = self._rmsprop_update(
        gradient_W_hidden, self.r_hidden, batch_size)

    # now embedding layer
    W_hidden_raw = self.W_hidden[:, :-1]
    gradient_R = np.matmul(self.X.T, np.matmul(delta.T, W_hidden_raw))
    self.update_R, self.r_R = self._rmsprop_update(gradient_R, self.r_R, batch_size)

    return self.update_W_output, self.update_W_hidden, self.update_R

  def train(self, X_train, Y_train, X_val, Y_val, batch_size, verbose=False, epochs=5):
    """Train with mini-batches and record the error after every epoch.

    Args:
      X_train: (N, 3) array of word indices (the trigrams).
      Y_train: (N,) array of word indices (the fourth words).
      X_val, Y_val: validation data in the same format.
      batch_size: rows per mini-batch; the last batch also takes the remainder.
      verbose: 1 prints the errors once per epoch, 2 prints them per batch.
      epochs: number of passes over the training data.

    Returns:
      (train_error_list, val_error_list), one entry per epoch. Each entry is
      the sum over the epoch's batches of the cross-entropy measured in that
      batch (word3 -> label for training, chained prediction for validation).
    """
    print('Training => This will take some time')
    X_train = np.asarray(X_train)
    Y_train = np.asarray(Y_train)
    n = X_train.shape[0]
    # A training set smaller than batch_size is handled as one short batch.
    no_of_batches = max(n // batch_size, 1) if n else 0

    # The validation loss is a mean over rows, so row order is irrelevant and
    # the data only has to be encoded once.
    X1_val, X2_val, X3_val = self._encode_trigrams(X_val)
    Y1_val = one_hot(np.asarray(Y_val))

    self.train_error_list = []
    self.val_error_list = []
    for epoch in range(epochs):
      train_error = 0
      val_error = 0
      # shuffle X_train, Y_train with one shared permutation
      order = np.random.permutation(n)
      X_train_shuffled = X_train[order]
      Y_train_shuffled = Y_train[order]

      for i in range(no_of_batches):
        start = i * batch_size
        # the last batch absorbs the rows left over by the integer division
        stop = n if i == no_of_batches - 1 else start + batch_size
        net_batch_size = stop - start

        X1, X2, X3 = self._encode_trigrams(X_train_shuffled[start:stop])
        Y = one_hot(Y_train_shuffled[start:stop])

        # Three training steps per batch, each predicting the following word.
        # The updates are accumulated and applied together afterwards.
        update_W_output = 0
        update_W_hidden = 0
        update_R = 0
        for inputs, targets in ((X1, X2), (X2, X3), (X3, Y)):
          softmax_output = self.forward_pass(inputs)
          # gradient w.r.t. the softmax inputs is (prediction - one-hot target)
          step_W_output, step_W_hidden, step_R = self.back_propogate(
              softmax_output - targets, net_batch_size)
          update_W_output = update_W_output + step_W_output
          update_W_hidden = update_W_hidden + step_W_hidden
          update_R = update_R + step_R

        # softmax_output now belongs to the last step (word3 -> label)
        train_error += cross_entropy(softmax_output, Y)

        self.W_output += update_W_output / 3
        self.W_hidden += update_W_hidden / 3
        self.R += update_R / 3

        # NOTE(review): the whole validation set is scored after every batch,
        # which dominates the running time; val_error is the sum over batches.
        val_output = self._chained_average(X1_val, X2_val, X3_val)
        val_error += cross_entropy(val_output, Y1_val)

        if verbose == 2:
          print('   => Epoch ', epoch, ' batch ', i, ' out of ', no_of_batches, ' => train_error: ', train_error, ", val_error: ", val_error)
      if verbose == 1:
        print("Epoch ", epoch, " => train_error: ", train_error, ", val_error: ", val_error)
      self.train_error_list.append(train_error)
      self.val_error_list.append(val_error)
    return self.train_error_list, self.val_error_list

  def test(self, X_test, Y_test):
    """Return the cross-entropy of the chained prediction on a labelled set.

    Args:
      X_test: (N, 3) array of word indices.
      Y_test: (N,) array of word indices, or an already one-hot (N, 250) array.
    """
    targets = np.asarray(Y_test)
    if targets.ndim == 1:
      targets = one_hot(targets)
    output = self._chained_average(*self._encode_trigrams(X_test))
    return cross_entropy(output, targets)

  def predict(self, X):
    """Return the (N, 250) fourth-word probabilities for (N, 3) index trigrams."""
    return self._chained_average(*self._encode_trigrams(X))

In [None]:
# Hyper-parameters -- try different D and P values.
D = 32
P = 256
batch_size = 200
learning_rate = 0.15

# The raw index arrays are passed in; Model.train shuffles the training rows itself.
# NOTE: draw_error_curves, cross_entropy and softmax are defined in cells further
# down, so those cells have to be run before this one.
model = Model(D, P, learning_rate)
train_error_list, val_error_list = model.train(trainx, traind, valx, vald, batch_size, verbose=1)
draw_error_curves((train_error_list, val_error_list))
test_error = model.test(testx, testd)
print('Cross-entropy error on the test data is : ', test_error)

In [None]:
draw_error_curves((model.train_error_list, model.val_error_list))

In [None]:
# Keep handles on the trained weight matrices (plain assignment: no copy is made).
R = model.R
W_hidden = model.W_hidden
W_output = model.W_output

In [None]:
# Build a second model and replace its random weights with the matrices saved above.
# The arrays are assigned by reference, so they are shared rather than copied.
model2 = Model(32, 256, 0.15)
model2.R = R
model2.W_hidden = W_hidden
model2.W_output = W_output

In [None]:
print('Part B) try different trigrams')
# Draw 5 row numbers of the test set at random (np.random.choice samples with
# replacement by default) and predict the fourth word for those trigrams.
index = list(range(testd.shape[0]))
index_chosen = np.random.choice(index, 5)

X = testx[index_chosen]
Y = testd[index_chosen]
outputs = model2.predict(X)

Part B) try different trigrams


In [None]:
outputs

array([[0.00386851, 0.00501055, 0.00452654, ..., 0.00358649, 0.00460833,
        0.00556633],
       [0.00386851, 0.00501055, 0.00452654, ..., 0.00358649, 0.00460833,
        0.00556633],
       [0.00386851, 0.00501055, 0.00452654, ..., 0.00358649, 0.00460833,
        0.00556633],
       [0.00386851, 0.00501055, 0.00452654, ..., 0.00358649, 0.00460833,
        0.00556633],
       [0.00386851, 0.00501055, 0.00452654, ..., 0.00358649, 0.00460833,
        0.00556633]])

In [None]:
# For every row of `outputs`: pick the 10 largest probabilities (unordered),
# then order those 10 candidates from most to least probable.
top_k = 10
idx = np.array([np.argpartition(row, -top_k)[-top_k:] for row in outputs])
indices = np.array([candidates[np.argsort(-row[candidates])]
                    for row, candidates in zip(outputs, idx)])

In [None]:
# Show every sampled trigram with the model's ten most probable next words.
# The trigram is read from X (the rows that were actually passed to predict),
# so the words printed here belong to the same sample as indices[i].
for i in range(len(indices)):
  print('trigram: ', words[X[i][0]], words[X[i][1]], words[X[i][2]])
  print('10 best outputs: ')
  for word_index in indices[i]:
    print(words[word_index])

In [None]:
# NOTE(review): the cause named in this message is a guess ("probably"); nothing
# in this notebook verifies it.
print('The model gave all the same words each time. This is probably because of training less and that these are all common words.')

In [None]:
testd[0]

143

In [None]:
# plot all the error curves
def draw_error_curves(error, pdf=None):
  """Plot the training and validation error of every epoch.

  Args:
    error: pair (train_errors, val_errors) of equally long sequences holding
      one value per epoch.
    pdf: optional destination handed to `savefig`. When given, the figure is
      saved there instead of being shown.
  """
  train = error[0]
  val = error[1]
  # The x-axis is labelled "epoch", so plot against the epoch number 1..n.
  epochs = np.arange(1, len(train) + 1)

  fig, ax = plt.subplots()
  ax.plot(epochs, train, label="train_error")
  ax.plot(epochs, val, label="validation_error")

  ax.set_xlabel("epoch")
  ax.set_ylabel("Error")
  ax.set_title("Error Curve")
  ax.legend()

  if pdf is not None:
    fig.savefig(pdf)
    plt.close(fig)  # saved figures are not displayed; release them
  else:
    plt.show()

In [None]:
def cross_entropy(Y, D, epsilon=1e-12):
    """
    Mean cross-entropy between predicted probabilities and targets.

    Y: predictions, (N, k) ndarray; clipped to [epsilon, 1 - epsilon] before
       the logarithm is taken
    D: targets, (N, k) ndarray (e.g. one-hot rows)
    Returns: scalar, the summed cross-entropy divided by N
    """
    clipped = np.clip(Y, epsilon, 1. - epsilon)
    sample_count = clipped.shape[0]
    # the extra 1e-9 is a second guard against log(0)
    weighted_log = D * np.log(clipped + 1e-9)
    return -np.sum(weighted_log) / sample_count

# SOFTMAX ACTIVATION FUNCTION
def softmax(X):
  """Softmax over the last axis.

  Args:
    X: array of scores; for a 2-D (N, k) batch every row is normalised on its
      own. A 1-D vector is treated as a single row.

  Returns:
    Array of the same shape whose entries along the last axis are positive
    and sum to 1.
  """
  X = np.asarray(X, dtype=float)
  # Subtracting the row maximum leaves the result unchanged but keeps
  # np.exp from overflowing on large scores.
  shifted = X - np.max(X, axis=-1, keepdims=True)
  exponentials = np.exp(shifted)
  return exponentials / np.sum(exponentials, axis=-1, keepdims=True)
def softmax_crossentropy_derivative(s, y):
  """Return s - y.

  s: output of the softmax
  y: value subtracted from it. For the result to be the gradient of the
     cross-entropy with respect to the softmax inputs, y has to be the target
     distribution (e.g. one-hot rows), not the scalar loss value.
  """
  gradient = s - y
  return gradient