In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn.preprocessing import StandardScaler
import tqdm

# %matplotlib widget

In [2]:
class Dense(object):
    def __init__(self, out_dim, inp_dim=None, reg=0.001):
        self.W = None
        self.b = np.random.normal(0, 0.2, out_dim)
        self.reg = reg
        self.out_dim = out_dim
        self.inp_dim = inp_dim
        self.inp = None
        self.next = None
        self.momment1 = None
        self.momment2 = None
        self.momment_b1 = np.zeros_like(self.b)
        self.momment_b2 = np.zeros_like(self.b)
        if inp_dim:
            # self.W = np.random.normal(0, 1, (self.out_dim, self.inp_dim))
            self.W = np.random.normal(0, np.sqrt(2.0/(self.out_dim + self.inp_dim)), (self.out_dim, self.inp_dim))
            self.momment1 = np.zeros_like(self.W)
            self.momment2 = np.zeros_like(self.W)

    def __call__(self, inp=None):
        self.inp = inp
        inp.next = self
        self.inp_dim = inp.out_dim
        # self.W = np.random.normal(0, 1, (self.out_dim, self.inp_dim))
        self.W = np.random.normal(0, np.sqrt(2.0/(self.out_dim + self.inp_dim)), (self.out_dim, self.inp_dim))
        self.momment1 = np.zeros_like(self.W)
        self.momment2 = np.zeros_like(self.W)
        return self
    
    def __repr__(self):
        return (self.__class__.__name__ + ' output: ' + str(self.out_dim) + ' input: ' + str(self.inp_dim))

    def forward(self, X, y=None, W=None, b=None):
        self.X = X
        if len(self.X.shape) > 2:
            X = X.reshape(-1, X.shape[-1])
        if not W:
            W = self.W
        if not b:
            b = self.b
        out = np.dot(W, X.T).T + b

        if len(self.X.shape) > 2:
            out = out.reshape(*self.X.shape[:-1], self.out_dim)
        return out # logits
    
    def backward(self, dO):
        dO_shape = dO.shape
        X = self.X
        if len(dO_shape) > 2 or len(self.X.shape) > 2:
            dO = dO.reshape(-1, dO.shape[-1])
            X = X.reshape(-1, self.X.shape[-1])
        
        dW = np.sum(X[:, np.newaxis, :] * dO[:, :, np.newaxis], axis=0) + self.reg*self.W
        dX = np.sum(self.W[:, :, np.newaxis] * dO.T[:, np.newaxis, :], axis=0).T
        db = np.sum(dO, axis=0)
        
        if len(self.X.shape) > 2:
            dX = dX.reshape(self.X.shape)
        return (dX, dW, db)
    
class LSTM(object):
    # inputs will include a mask
    # if return_seq is false, the last output will be determined using the mask
    def __init__(self, hidden_units, inp_dim=None, return_seq=True, return_mask=False, reg=0.001):
        self.W = None
        self.b = None
        self.reg = reg
        self.out_dim = hidden_units
        self.inp_dim = inp_dim
        self.next = None
        self.return_seq = return_seq
        self.return_mask = return_mask
        self.b = np.random.normal(0, 0.2, (4, self.out_dim))
        self.bf = self.b[0]
        self.bi = self.b[1]
        self.bo = self.b[2]
        self.bc = self.b[3]
        self.momment1 = None
        self.momment2 = None
        self.momment_b1 = np.zeros_like(self.b)
        self.momment_b2 = np.zeros_like(self.b)
        if inp_dim:
            self.W = np.random.normal(0, np.sqrt(2.0/(2 * self.out_dim + self.inp_dim)), (self.out_dim * 4, self.inp_dim + self.out_dim))
            self.Wf = self.W[:self.out_dim, :]
            self.Wi = self.W[self.out_dim: 2 * self.out_dim, :]
            self.Wo = self.W[2 * self.out_dim: 3 * self.out_dim, :]
            self.Wc = self.W[3 * self.out_dim:, :]
            self.b = np.random.normal(0, 0.2, 4 * self.out_dim)
            self.momment1 = np.zeros((self.out_dim * 4, self.inp_dim + self.out_dim))
            self.momment2 = np.zeros((self.out_dim * 4, self.inp_dim + self.out_dim))

    
    def __call__(self, inp):
        self.inp = inp
        inp.next = self
        self.inp_dim = inp.out_dim
        self.W = np.random.normal(0, np.sqrt(2.0/(2 * self.out_dim + self.inp_dim)), (self.out_dim * 4, self.inp_dim + self.out_dim))
        self.Wf = self.W[:self.out_dim, :]
        self.Wi = self.W[self.out_dim: 2 * self.out_dim, :]
        self.Wo = self.W[2 * self.out_dim: 3 * self.out_dim, :]
        self.Wc = self.W[3 * self.out_dim:, :]
        self.momment1 = np.zeros((self.out_dim * 4, self.inp_dim + self.out_dim))
        self.momment2 = np.zeros((self.out_dim * 4, self.inp_dim + self.out_dim))
        return self
    
    def __repr__(self):
        return (self.__class__.__name__ + ' output: ' + str(self.out_dim) + ' input: ' + str(self.inp_dim))
    
    @staticmethod
    def sigmoid(X):
        return 1.0/(1.0 + np.exp(-1 * X))
    
    def forward(self, X, y=None, W=None, b=None):
        #W = out x in
        self.Wf = self.W[:self.out_dim, :]
        self.Wi = self.W[self.out_dim: 2 * self.out_dim, :]
        self.Wo = self.W[2 * self.out_dim: 3 * self.out_dim, :]
        self.Wc = self.W[3 * self.out_dim:, :]

        self.bf = self.b[0]
        self.bi = self.b[1]
        self.bo = self.b[2]
        self.bc = self.b[3]

        if isinstance(X, dict):
            X_ = X
            X = X_['input_ids']
            mask = X_.get('seq_lens', np.array(X.shape[1]).repeat(X.shape[0]))
        else:
            mask = np.array(X.shape[1]).repeat(X.shape[0])
        fg = np.zeros((X.shape[0], X.shape[1] + 1, self.out_dim))
        ig = np.zeros((X.shape[0], X.shape[1] + 1, self.out_dim))
        og = np.zeros((X.shape[0], X.shape[1] + 1, self.out_dim))
        pstate = np.zeros((X.shape[0], X.shape[1] + 1, self.out_dim))
        state = np.zeros((X.shape[0], X.shape[1] + 1, self.out_dim))
        out = np.zeros((X.shape[0], X.shape[1] + 1, self.out_dim))
        X = np.concatenate([np.zeros((X.shape[0], 1, X.shape[2])), X], axis=1) # shape of X is (N, T, d)
        self.X = X
        self.mask = mask
        
        for t in range(1, X.shape[1]):
            fg[:, t, :] = self.sigmoid(np.dot(np.hstack([out[:, t - 1, :], X[:, t, :]]), self.Wf.T) + self.bf)
            ig[:, t, :] = self.sigmoid(np.dot(np.hstack([out[:, t - 1, :], X[:, t, :]]), self.Wi.T) + self.bi)
            pstate[:, t, :] = np.tanh(np.dot(np.hstack([out[:, t - 1, :], X[:, t, :]]), self.Wc.T) + self.bc)
            state[:, t, :] = fg[:, t, :] * state[:, t - 1, :] + ig[:, t, :] * pstate[:, t, :]
            og[:, t, :] = self.sigmoid(np.dot(np.hstack([out[:, t - 1, :], X[:, t, :]]), self.Wo.T) + self.bo)
            out[:, t, :] = og[:, t, :] * np.tanh(state[:, t, :])
        self.fg = fg
        self.ig = ig
        self.og = og
        self.pstate = pstate
        self.state = state
        self.out = out
        out = out[:, 1:, :] if self.return_seq else out[np.arange(X.shape[0]), mask, :]
        # return out[:, 1:, :] if self.return_seq else out[np.arange(X.shape[0]), mask, :]
        return {'input_ids': out, 'seq_lens': mask} if self.return_mask else out

    def backward(self, dO):
        if not self.return_seq:
            dO_ = np.zeros_like(self.out)
            dO_[np.arange(self.X.shape[0]), self.mask, :] = dO
            dO = dO_
        else:
            dO = np.concatenate([np.zeros((dO.shape[0], 1, dO.shape[2])), dO], axis=1)
        # print (dO)
        dstate = np.zeros_like(self.state)
        dWo = np.zeros_like(self.Wo)
        dWi = np.zeros_like(self.Wi)
        dWf = np.zeros_like(self.Wf)
        dWc = np.zeros_like(self.Wc)
        dbf = np.zeros_like(self.bf)
        dbi = np.zeros_like(self.bi)
        dbo = np.zeros_like(self.bo)
        dbc = np.zeros_like(self.bc)
        dX = np.zeros_like(self.X)
        mask = np.arange(self.X.shape[1]) > self.mask[:, np.newaxis]
        for t in range(self.X.shape[1] - 1, 0, -1):
            dstate[:, t, :] += dO[:, t, :] * self.og[:, t, :] * (1 - np.square(np.tanh(self.state[:, t, :])))
            dstate[:, t - 1, :] = dstate[:, t, :] * self.fg[:, t, :]
            
            dWo += np.dot((dO[:, t, :] * np.tanh(self.state[:, t, :]) * self.og[:, t, :] * (1 - self.og[:, t, :])).T, 
                          np.hstack([self.out[:, t, :], self.X[:, t, :]]))
            dWi += np.dot((dstate[:, t, :] * self.pstate[:, t, :] * self.ig[:, t, :] * (1 - self.ig[:, t, :])).T, 
                          np.hstack([self.out[:, t, :], self.X[:, t, :]]))
            dWf += np.dot((dstate[:, t, :] * self.state[:, t - 1, :] * self.fg[:, t, :] * (1 - self.fg[:, t, :])).T, 
                          np.hstack([self.out[:, t, :], self.X[:, t, :]]))
            dWc += np.dot((dstate[:, t, :] * self.ig[:, t, :] * (1 - np.square(self.pstate[:, t, :]))).T,
                          np.hstack([self.out[:, t, :], self.X[:, t, :]]))
            
            dO[:, t - 1, :] += np.dot(dstate[:, t, :] * self.ig[:, t, :] * (1 - np.square(self.pstate[:, t, :])), self.Wc[:, :self.out_dim])\
                            + np.dot(dstate[:, t, :] * self.state[:, t - 1, :] * self.fg[:, t, :] * (1 - self.fg[:, t, :]), self.Wf[:, :self.out_dim])\
                            + np.dot(dstate[:, t, :] * self.pstate[:, t, :] * self.ig[:, t, :] * (1 - self.ig[:, t, :]), self.Wi[:, :self.out_dim])\
                            + np.dot(dO[:, t, :] * np.tanh(self.state[:, t, :]) * self.og[:, t, :] * (1 - self.og[:, t, :]), self.Wo[:, :self.out_dim])
            
            dX[:, t, :] = np.dot(dstate[:, t, :] * self.ig[:, t, :] * (1 - np.square(self.pstate[:, t, :])), self.Wc[:, self.out_dim:])\
                        + np.dot(dstate[:, t, :] * self.state[:, t - 1, :] * self.fg[:, t, :] * (1 - self.fg[:, t, :]), self.Wf[:, self.out_dim:])\
                        + np.dot(dstate[:, t, :] * self.pstate[:, t, :] * self.ig[:, t, :] * (1 - self.ig[:, t, :]), self.Wi[:, self.out_dim:])\
                        + np.dot(dO[:, t, :] * np.tanh(self.state[:, t, :]) * self.og[:, t, :] * (1 - self.og[:, t, :]), self.Wo[:, self.out_dim:])

            dbo += (dO[:, t, :] * np.tanh(self.state[:, t, :]) * self.og[:, t, :] * (1 - self.og[:, t, :])).sum(axis=0)
            dbi += (dstate[:, t, :] * self.pstate[:, t, :] * self.ig[:, t, :] * (1 - self.ig[:, t, :])).sum(axis=0)
            dbf += (dstate[:, t, :] * self.state[:, t - 1, :] * self.fg[:, t, :] * (1 - self.fg[:, t, :])).sum(axis=0)
            dbc += (dstate[:, t, :] * self.ig[:, t, :] * (1 - np.square(self.pstate[:, t, :]))).sum(axis=0)

        dW = np.vstack([dWf, dWi, dWo, dWc]) + self.reg * self.W
        db = np.vstack([dbf, dbi, dbo, dbc])
        # print ('dX')
        # print (dX)
        # print ('dstate')
        # print (dstate)
        dX[mask, :] = 0
        return (dX[:, 1:, :], dW, db)
    
class Embeddings(object):
    def __init__(self, num_embeddings, embedding_dim, pad_idx=None, trainable=True, reg=0.0001):
        self.W = None
        self.b = 0.
        self.reg = reg
        self.out_dim = embedding_dim
        self.inp_dim = num_embeddings
        self.pad_idx = pad_idx
        self.trainable = trainable
        self.inp = None
        self.next = None
        self.momment1 = None
        self.momment2 = None
        if num_embeddings:
            self.W = np.random.normal(0, 1, (self.out_dim, self.inp_dim))
            self.momment1 = np.zeros_like(self.W)
            self.momment2 = np.zeros_like(self.W)

    def __call__(self, inp):
        self.inp = inp
        inp.next = self
        if self.inp_dim != inp.out_dim:
            raise ValueError('num_embeddings do not match out_dim of inputs')
        return self
    
    def __repr__(self):
        return (self.__class__.__name__ + ' embedding_dim: ' + str(self.out_dim) + ' num_embeddings: ' + str(self.inp_dim))
    
    def forward(self, X, y=None):
        self.X = X
        # inputs are encoded tokens with token_ids
        # shape of X: (N, T)
        
        out = np.take(self.W.T, X, axis=0)
        # output shape: N, T, emb_dim
        if self.pad_idx is not None:
            mask = (X != self.pad_idx).sum(axis=1)
        return out if self.pad_idx is None else {'input_ids': out, 'seq_lens': mask}
    
    def backward(self, dO):
        dW = np.zeros_like(self.W)
        # dW[:, self.X] += dO
        if self.trainable:
            np.add.at(dW.T, np.s_[self.X, :], dO)
        # dW += self.reg * self.W
        return 0, dW, None

    
class Activation(object):
    def __init__(self, func='relu'): # options: relu, softmax_with_cat_cross_entropy (softmax)
        self.act_function = func
        self.next = None

    def __call__(self, inp=None):
        self.inp = inp
        inp.next = self
        self.inp_dim = inp.out_dim
        self.out_dim = self.inp_dim
        return self
    
    def __repr__(self):
        return (self.__class__.__name__ + ' ' + self.act_function + ' output: ' + str(self.out_dim) + ' input: ' + str(self.inp_dim))
    
    def forward(self, X, y=None):
        self.X = X
        self.y = y
        if self.act_function == 'relu':
            out = np.maximum(0, X)
            activations = out
        elif self.act_function == 'softmax':
            exps = np.exp(X - np.max(X, axis=1, keepdims=True))
            activations = exps / np.sum(exps, axis=1, keepdims=True)
            activations = np.where(activations > 1.0e-10, activations, 1.0e-10)
            out = np.mean(-1*np.sum(y * np.log(activations), axis=1))
        elif self.act_function == 'sigmoid_with_bin_cross_entropy':
            sig = 1/(1 + np.exp(-X))
            activations = sig
            activations = np.where(activations > 1.0e-10, activations, 1.0e-10)
            out = np.mean(-1*((y * np.log(activations)) + ((1 - y) * np.log(1 - activations))))
        elif self.act_function == 'sigmoid':
            activations = 1/(1 + np.exp(-X))
            activations = np.where(activations > 1.0e-7, activations, 1.0e-7)
            activations = np.where(activations < 1 - 1.0e-7, activations, 1 - 1.0e-7)
            out = activations
        self.activations = activations
        return out
    
    def predict(self, X, y=None):
        _ = self.forward(X, y)
        return self.activations
    
    def backward(self, dO=None):
        if self.act_function == 'relu':
            dX = np.where(self.X < 0, 0, 1) * dO
        elif self.act_function == 'softmax' or self.act_function == 'sigmoid_with_bin_cross_entropy':
            dX = self.activations - self.y
        elif self.act_function == 'sigmoid':
            dX = self.activations * (1 - self.activations) * dO
        return (dX, None, None)
    
class Loss(object):
    def __init__(self, loss_fn): # options: mse
        self.loss_function = loss_fn
        self.next = None

    def __call__(self, inp):
        self.inp = inp
        inp.next = self
        self.inp_dim = inp.out_dim
        self.out_dim = self.inp_dim
        return self
    
    def __repr__(self):
        return (self.__class__.__name__ + ' ' + self.loss_function + ' output: ' + str(self.out_dim) + ' input: ' + str(self.inp_dim))
    
    def forward(self, X, y):
        self.X = X
        self.y = y
        if self.loss_function == 'mse':
            loss = np.mean((X - y)**2)
        return loss
    
    def predict(self, X, y=None):
        return X
    
    def backward(self, dO=None):
        if self.loss_function == 'mse':
            dX = 2*(self.X - self.y)
        return (dX, None, None)

class Optimizer(object): # SGD implementation
    def __init__(self, lr=0.001, b1=0.9, b2=0.999):
        self.b1, self.b2 = b1, b2
        self.eps = 1e-8
        self.t = 1
        self.lr = lr
        self.loss = []

    def run_forward(self, input_layer, X, y):
        layer = input_layer
        out = X
        while (layer):
            # print (layer)
            # print (out.shape)
            out = layer.forward(out, y)
            layer = layer.next
        loss = out
        return loss

    def optimize_step(self, out_layer, verbose=False):
        layer = out_layer 
        t = self.t
        dO = 1  
        lr = self.lr 
        while (layer):
            # print (layer)
            dO, dW, db = layer.backward(dO)
            if dW is not None:
                moment1 = (self.b1 * layer.momment1) + ((1 - self.b1) * dW)
                moment2 = (self.b2 * layer.momment2) + ((1 - self.b2) * dW**2)
                mt1_hat = moment1/(1 - self.b1**t)
                mt2_hat = moment2/(1 - self.b2**t)
                # W = layer.W - (lr * dW)
                W = layer.W - (lr * mt1_hat/(np.sqrt(mt2_hat) + self.eps))
                layer.W = W
                layer.momment1 = moment1
                layer.momment2 = moment2
                if verbose >= 10:
                    print (layer)
                    print ('dW:', dW)
                    print ("updated W:")
                    print (W)

            if db is not None:
                moment_b1 = (self.b1 * layer.momment_b1) + ((1 - self.b1) * db)
                moment_b2 = (self.b2 * layer.momment_b2) + ((1 - self.b2) * db**2)
                mt1_hat = moment_b1/(1 - self.b1**t)
                mt2_hat = moment_b2/(1 - self.b2**t)
                # b = layer.b - (lr * db)
                b = layer.b - (lr * mt1_hat/(np.sqrt(mt2_hat) + self.eps))
                layer.b = b
                layer.momment_b1 = moment_b1
                layer.momment_b2 = moment_b2
                if verbose >= 10:
                    print ('db:', db)
                    print ("updated b:", b)

            self.t = t + 1
            layer = layer.inp
        return self.t

    def train(self, input_layer, out_layer, 
              X, y, batch_size=None, 
              patience=20, epochs=None, 
              verbose=False, loss_tr_ep=1.0e-10,
              inputs_batched=False):
        patience = patience
        loss_tracker = []
        epoch = 0
        if not epochs:
            epochs = 1e10
        if not patience:
            patience = epochs + 1
        patience_remaining = patience
        
        if not isinstance(batch_size, int) and not inputs_batched:
            batch_size = X.shape[0]
        elif inputs_batched:
            batch_size = X[0].shape[0]
        
        num_batches = len(X) if inputs_batched else int(np.ceil(X.shape[0]/batch_size))
        
        print ("Using batch_size of", batch_size)
        while (patience_remaining > 0 and epochs > epoch):
            loss_tracker_epoch = []
            for i in tqdm.tqdm(range(num_batches), disable=not verbose):
                if inputs_batched: # X is list of precreated batched
                    X_batch = X[i]
                    y_batch = y[i]
                else: # X is the entire data and batches are created here
                    up_ind = min(X.shape[0], (i + 1) * batch_size)
                    X_batch = X[i * batch_size: up_ind]
                    y_batch = y[i * batch_size: up_ind]
                loss = self.run_forward(input_layer, X_batch, y_batch)
                timestep = self.optimize_step(out_layer, verbose=verbose)
                loss_tracker_epoch.append(loss)
            epoch_loss = np.mean(loss_tracker_epoch)
            epoch += 1
            
            if len(loss_tracker) > 0 and epoch_loss + loss_tr_ep > min(loss_tracker):
                patience_remaining -= 1
            else:
                patience_remaining = patience
            loss_tracker.append(epoch_loss)

            print ('epoch:', epoch, 'loss:', epoch_loss)
        self.loss = self.loss + loss_tracker
    
    def predict(self, input_layer, X, y, batch_size=None, verbose=False):
        out_list = []
        if not isinstance(batch_size, int):
            batch_size = X.shape[0]
        print ("Using batch_size of", batch_size)
        for i in range(int(np.ceil(X.shape[0]/batch_size))):
            up_ind = min(X.shape[0], (i + 1) * batch_size)
            X_batch = X[i * batch_size: up_ind]
            y_batch = y[i * batch_size: up_ind]
            layer = input_layer
            out = X_batch
            while (layer):
                # print (layer)
                # print (out.shape)
                if isinstance(layer, (Activation, Loss)):
                # if isinstance(layer, relu1.__class__):
                    out = layer.predict(out, y_batch)
                else:
                    out = layer.forward(out, y_batch)
                if verbose:
                    print (layer)
                    print (out)
                layer = layer.next
            out_list.append(out)
        return np.vstack(out_list)


In [3]:
from irony import load_datasets
from sklearn.model_selection import train_test_split

train_sentences, train_labels, test_sentences, test_labels, label2i = load_datasets()

# TODO: Split train into train/dev
train_sentences, dev_sentences, train_labels, dev_labels = train_test_split(train_sentences, train_labels, train_size=0.95)

In [4]:
from irony import run_nb_baseline

run_nb_baseline()

Vectorizing Text: 100%|██████████| 3834/3834 [00:00<00:00, 22817.51it/s]
Vectorizing Text: 100%|██████████| 3834/3834 [00:00<00:00, 24369.04it/s]
Vectorizing Text: 100%|██████████| 784/784 [00:00<00:00, 29609.43it/s]

Baseline: Naive Bayes Classifier
F1-score Ironic: 0.6402966625463535
Avg F1-score: 0.6284487265300938





In [5]:
from typing import Dict, List, Optional, Tuple
from collections import Counter

import numpy as np
import spacy


class Tokenizer:
    """Tokenizes and pads a batch of input sentences."""

    def __init__(self, pad_symbol: Optional[str] = "<PAD>"):
        """Initializes the tokenizer

        Args:
            pad_symbol (Optional[str], optional): The symbol for a pad. Defaults to "<PAD>".
        """
        self.pad_symbol = pad_symbol
        self.nlp = spacy.load("en_core_web_lg")
    
    def __call__(self, batch: List[str]) -> List[List[str]]:
        """Tokenizes each sentence in the batch, and pads them if necessary so
        that we have equal length sentences in the batch.

        Args:
            batch (List[str]): A List of sentence strings

        Returns:
            List[List[str]]: A List of equal-length token Lists.
        """
        batch = self.tokenize(batch)
        batch = self.pad(batch)

        return batch

    def tokenize(self, sentences: List[str]) -> List[List[str]]:
        """Tokenizes the List of string sentences into a Lists of tokens using spacy tokenizer.

        Args:
            sentences (List[str]): The input sentence.

        Returns:
            List[str]: The tokenized version of the sentence.
        """
        tokenized_sents = []
        # TODO: Tokenize the input with spacy.
        for sent in sentences:
            sent_tokens = [token.text.lower() for token in self.nlp(sent)]
        # TODO: Make sure the start token is the special <SOS> token and the end token
        #       is the special <EOS> token
            sent_tokens = ['<SOS>'] + sent_tokens + ['<EOS>']
            tokenized_sents.append(sent_tokens)

        return tokenized_sents

    def pad(self, batch: List[List[str]]) -> List[List[str]]:
        """Appends pad symbols to each tokenized sentence in the batch such that
        every List of tokens is the same length. This means that the max length sentence
        will not be padded.

        Args:
            batch (List[List[str]]): Batch of tokenized sentences.

        Returns:
            List[List[str]]: Batch of padded tokenized sentences. 
        """
        # TODO: For each sentence in the batch, append the special <P>
        #       symbol to it n times to make all sentences equal length
        out_batch = []
        max_len = max([len(sent) for sent in batch])
        for sent in batch:
            out_batch.append(sent + [self.pad_symbol]*(max_len - len(sent)))
        return out_batch
        

In [6]:
# create the vocabulary of the dataset: use both training and test sets here

SPECIAL_TOKENS = ['<UNK>', '<PAD>', '<SOS>', '<EOS>']

all_data = train_sentences + dev_sentences + test_sentences
my_tokenizer = Tokenizer()

tokenized_data = my_tokenizer.tokenize(all_data)
vocab = sorted(set([w for ws in tokenized_data + [SPECIAL_TOKENS] for w in ws]))

with open('vocab.txt', 'w') as vf:
    vf.write('\n'.join(vocab))

In [7]:
embedding_path = 'glove.twitter.27B.100d.txt'
vocab_path = "./vocab.txt"

In [8]:
from typing import Dict, Tuple

# import torch


def read_pretrained_embeddings(
    embeddings_path: str,
    vocab_path: str
) -> Tuple[Dict[str, int], np.ndarray]:
    """Read the embeddings matrix and make a dict hashing each word.

    Note that we have provided the entire vocab for train and test, so that for practical purposes
    we can simply load those words in the vocab, rather than all 27B embeddings

    Args:
        embeddings_path (str): _description_
        vocab_path (str): _description_

    Returns:
        Tuple[Dict[str, int], torch.FloatTensor]: _description_
    """
    word2i = {}
    vectors = []
    
    with open(vocab_path, encoding='utf8') as vf:
        vocab = set([w.strip() for w in vf.readlines()]) 
    
    print(f"Reading embeddings from {embeddings_path}...")
    with open(embeddings_path, "r") as f:
        i = 0
        for line in f:
            word, *weights = line.rstrip().split(" ")
            # TODO: Build word2i and vectors such that
            #       each word points to the index of its vector,
            #       and only words that exist in `vocab` are in our embeddings
            if word in vocab:
                word2i.update({word: i})
                vectors.append(weights)
                i += 1
            # raise NotImplementedError

    return (word2i, np.array(vectors, dtype=np.float32))

In [9]:
def get_oovs(vocab_path: str, word2i: Dict[str, int]) -> List[str]:
    """Find the vocab items that do not exist in the glove embeddings (in word2i).
    Return the List of such (unique) words.

    Args:
        vocab_path: List of batches of sentences.
        word2i (Dict[str, int]): _description_

    Returns:
        List[str]: _description_
    """
    with open(vocab_path, encoding='utf8') as vf:
        vocab = set([w.strip() for w in vf.readlines()])
    
    glove_and_vocab = set(word2i.keys())
    vocab_and_not_glove = vocab - glove_and_vocab
    return list(vocab_and_not_glove)

def intialize_new_embedding_weights(num_embeddings: int, dim: int) -> np.ndarray:
    """xavier initialization for the embeddings of words in train, but not in gLove.

    Args:
        num_embeddings (int): _description_
        dim (int): _description_

    Returns:
        torch.FloatTensor: _description_
    """
    # TODO: Initialize a num_embeddings x dim matrix with xiavier initiialization
    #      That is, a normal distribution with mean 0 and standard deviation of dim^-0.5
    oov_emb = np.random.normal(0, 1/np.sqrt(dim), (num_embeddings, dim))
    return oov_emb


def update_embeddings(
    glove_word2i: Dict[str, int],
    glove_embeddings: np.ndarray,
    oovs: List[str]
) -> Tuple[Dict[str, int], np.ndarray]:
    # TODO: Add the oov words to the dict, assigning a new index to each
    max_ind = max(glove_word2i.values())
    glove_word2i.update({oov: new_ind for oov, new_ind in zip(oovs, range(max_ind + 1, max_ind + 1 + len(oovs)))})
    # TODO: Concatenate a new row to embeddings for each oov
    #       initialize those new rows with `intialize_new_embedding_weights`
    new_glove_embeddings = np.vstack((glove_embeddings, intialize_new_embedding_weights(len(oovs), glove_embeddings.shape[1])))
    # TODO: Return the tuple of the dictionary and the new embeddings matrix
    return glove_word2i, new_glove_embeddings


In [10]:
def make_batches(sequences: List[str], batch_size: int) -> List[List[str]]:
    """Yield batch_size chunks from sequences."""
    # TODO
    for i in range(0, len(sequences), batch_size):
        yield sequences[i: i + batch_size]


# TODO: Set your preferred batch size
batch_size = 16
tokenizer = Tokenizer()

# We make batches now and use those.
batch_tokenized = []
# Note: Labels need to be batched in the same way to ensure
# We have train sentence and label batches lining up.
for batch in make_batches(train_sentences, batch_size):
    batch_tokenized.append(tokenizer(batch))


glove_word2i, glove_embeddings = read_pretrained_embeddings(
    embedding_path,
    vocab_path
)

# Find the out-of-vocabularies
oovs = get_oovs(vocab_path, glove_word2i)

# Add the oovs from training data to the word2i encoding, and as new rows
# to the embeddings matrix
word2i, embeddings = update_embeddings(glove_word2i, glove_embeddings, oovs)

Reading embeddings from /Users/anupbhutada/Documents/Courses/Natural Langauge Processing/Assignment4/assignment_4/glove.twitter.27B.100d.txt...


In [11]:
len(oovs), len(word2i)

(6002, 15299)

In [12]:
# Use these functions to encode your batches before you call the train loop.

def encode_sentences(batch: List[List[str]], word2i: Dict[str, int]) -> np.ndarray:
    """Encode the tokens in each sentence in the batch with a dictionary

    Args:
        batch (List[List[str]]): The padded and tokenized batch of sentences.
        word2i (Dict[str, int]): The encoding dictionary.

    Returns:
        torch.LongTensor: The tensor of encoded sentences.
    """
    UNK_IDX = word2i["<UNK>"]
    tensors = []
    for sent in batch:
        tensors.append(np.array([word2i.get(w, UNK_IDX) for w in sent], dtype='int'))
        
    return np.stack(tensors)


def encode_labels(labels: List[int]) -> np.ndarray:
    """Turns the batch of labels into a tensor

    Args:
        labels (List[int]): List of all labels in the batch

    Returns:
        torch.FloatTensor: Tensor of all labels in the batch
    """
    arr = np.array([int(l) for l in labels], dtype='int')
    out = np.zeros((arr.shape[0], 2))
    out[np.arange(arr.shape[0]), arr] = 1
    return out

In [13]:
# TODO: Load the model and run the training loop 
#       on your train/dev splits. Set and tweak hyperparameters.
batch_size = 32

print ("Creating batches...")
batch_train_tokenized = []
batch_tokenized_train = [tokenizer(batch) for batch in make_batches(train_sentences, batch_size)]
batch_labels_train = list(make_batches(train_labels, batch_size))
batch_tokenized_dev = [tokenizer(batch) for batch in make_batches(dev_sentences, batch_size)]
batch_labels_dev = list(make_batches(dev_labels, batch_size))

# print ("Encoding batches...")
# batch_sentences_train = [encode_sentences(batch, word2i) for batch in batch_tokenized_train]
# batch_labels_train = [encode_labels(batch_labels).reshape(-1, 1) for batch_labels in batch_labels_train]
# batch_sentences_dev = [encode_sentences(batch, word2i) for batch in batch_tokenized_dev]
# batch_labels_dev = [encode_labels(batch_labels).reshape(-1, 1) for batch_labels in batch_labels_dev]

print ("Encoding batches...")
batch_sentences_train = [encode_sentences(batch, word2i) for batch in batch_tokenized_train]
batch_labels_train = [encode_labels(batch_labels) for batch_labels in batch_labels_train]
batch_sentences_dev = [encode_sentences(batch, word2i) for batch in batch_tokenized_dev]
batch_labels_dev = [encode_labels(batch_labels) for batch_labels in batch_labels_dev]

print ("DONE")


Creating batches...
Encoding batches...
DONE


In [14]:
## create model
embedding_layer = Embeddings(num_embeddings=embeddings.shape[0], 
                             embedding_dim=embeddings.shape[1], 
                             pad_idx=word2i.get('<PAD>'),
                             trainable=False)
embedding_layer.W = embeddings.T.copy()
lstm1 = LSTM(hidden_units=32, return_seq=True, return_mask=True, reg=0.0)(embedding_layer)
lstm = LSTM(hidden_units=32, return_seq=False, return_mask=False, reg=0.0)(lstm1)
dense = Dense(out_dim=2, reg=1e-5)(lstm)
out = Activation(func='softmax')(dense)


In [15]:
opt = Optimizer(lr=0.0005)
# opt.run_forward(embedding_layer, batch_sentences_train[0][:2], np.array([[1], [0]]))

In [16]:
opt.train(embedding_layer, out, batch_sentences_train, batch_labels_train, epochs=10, verbose=True, inputs_batched=True)

Using batch_size of 32


100%|██████████| 114/114 [00:04<00:00, 25.15it/s]


epoch: 1 loss: 0.6898707811905954


100%|██████████| 114/114 [00:04<00:00, 25.53it/s]


epoch: 2 loss: 0.6625442806437766


100%|██████████| 114/114 [00:04<00:00, 25.41it/s]


epoch: 3 loss: 0.623647019672039


100%|██████████| 114/114 [00:04<00:00, 24.78it/s]


epoch: 4 loss: 0.5989175510142293


100%|██████████| 114/114 [00:04<00:00, 24.79it/s]


epoch: 5 loss: 0.5775970506589221


100%|██████████| 114/114 [00:04<00:00, 25.42it/s]


epoch: 6 loss: 0.5554375102782249


100%|██████████| 114/114 [00:04<00:00, 25.51it/s]


epoch: 7 loss: 0.533283039629901


100%|██████████| 114/114 [00:04<00:00, 25.73it/s]


epoch: 8 loss: 0.5120307482097481


100%|██████████| 114/114 [00:04<00:00, 25.72it/s]


epoch: 9 loss: 0.4904869924634359


100%|██████████| 114/114 [00:04<00:00, 24.94it/s]

epoch: 10 loss: 0.46965137724356054





In [17]:
opt.train(embedding_layer, out, batch_sentences_train, batch_labels_train, epochs=10, verbose=True, inputs_batched=True)

Using batch_size of 32


100%|██████████| 114/114 [00:04<00:00, 24.81it/s]


epoch: 1 loss: 0.4490552356553369


100%|██████████| 114/114 [00:04<00:00, 25.36it/s]


epoch: 2 loss: 0.42840454854736804


100%|██████████| 114/114 [00:04<00:00, 25.40it/s]


epoch: 3 loss: 0.40757502460038525


100%|██████████| 114/114 [00:04<00:00, 25.37it/s]


epoch: 4 loss: 0.3855895878466547


100%|██████████| 114/114 [00:04<00:00, 25.43it/s]


epoch: 5 loss: 0.36759962774402394


100%|██████████| 114/114 [00:04<00:00, 25.32it/s]


epoch: 6 loss: 0.3516726393946023


100%|██████████| 114/114 [00:04<00:00, 25.36it/s]


epoch: 7 loss: 0.3490585615402517


100%|██████████| 114/114 [00:04<00:00, 25.16it/s]


epoch: 8 loss: 0.32920309867864705


100%|██████████| 114/114 [00:04<00:00, 25.35it/s]


epoch: 9 loss: 0.30185517379402316


100%|██████████| 114/114 [00:04<00:00, 25.30it/s]

epoch: 10 loss: 0.27749569631309684





In [18]:
opt.train(embedding_layer, out, batch_sentences_train, batch_labels_train, epochs=5, verbose=True, inputs_batched=True)

Using batch_size of 32


100%|██████████| 114/114 [00:04<00:00, 25.49it/s]


epoch: 1 loss: 0.25759218029766573


100%|██████████| 114/114 [00:04<00:00, 25.53it/s]


epoch: 2 loss: 0.23643022500643596


100%|██████████| 114/114 [00:04<00:00, 25.64it/s]


epoch: 3 loss: 0.2320291561273965


100%|██████████| 114/114 [00:04<00:00, 25.52it/s]


epoch: 4 loss: 0.21417932766611947


100%|██████████| 114/114 [00:04<00:00, 25.59it/s]

epoch: 5 loss: 0.20630273688194267





In [19]:
opt.train(embedding_layer, out, batch_sentences_train, batch_labels_train, epochs=5, verbose=True, inputs_batched=True)

Using batch_size of 32


100%|██████████| 114/114 [00:04<00:00, 25.27it/s]


epoch: 1 loss: 0.22514189563024053


100%|██████████| 114/114 [00:04<00:00, 25.41it/s]


epoch: 2 loss: 0.23502655660711175


100%|██████████| 114/114 [00:04<00:00, 25.43it/s]


epoch: 3 loss: 0.18471396731409073


100%|██████████| 114/114 [00:04<00:00, 25.38it/s]


epoch: 4 loss: 0.17175057873317187


100%|██████████| 114/114 [00:04<00:00, 25.39it/s]

epoch: 5 loss: 0.17331602316559108





In [20]:
opt.run_forward(embedding_layer, batch_sentences_dev[0], batch_labels_dev[0])

1.244748763374625

In [258]:
np.set_printoptions(suppress=True, precision=4)
np.hstack([out.activations, batch_labels_dev[0]])

array([[1.    , 0.    , 1.    , 0.    ],
       [0.9995, 0.0005, 1.    , 0.    ],
       [0.0003, 0.9997, 0.    , 1.    ],
       [0.0031, 0.9969, 1.    , 0.    ],
       [0.9986, 0.0014, 1.    , 0.    ],
       [0.9934, 0.0066, 1.    , 0.    ],
       [0.0006, 0.9994, 0.    , 1.    ],
       [0.9774, 0.0226, 0.    , 1.    ],
       [0.0029, 0.9971, 1.    , 0.    ],
       [0.0001, 0.9999, 0.    , 1.    ],
       [0.0068, 0.9932, 0.    , 1.    ],
       [0.0377, 0.9623, 0.    , 1.    ],
       [0.998 , 0.002 , 1.    , 0.    ],
       [0.9007, 0.0993, 0.    , 1.    ],
       [0.9976, 0.0024, 1.    , 0.    ],
       [0.0006, 0.9994, 0.    , 1.    ],
       [0.0031, 0.9969, 1.    , 0.    ],
       [0.0017, 0.9983, 0.    , 1.    ],
       [0.0051, 0.9949, 0.    , 1.    ],
       [1.    , 0.    , 0.    , 1.    ],
       [0.9068, 0.0932, 0.    , 1.    ],
       [0.9511, 0.0489, 1.    , 0.    ],
       [0.0001, 0.9999, 0.    , 1.    ],
       [0.6782, 0.3218, 1.    , 0.    ],
       [0.9401, 

In [21]:
((opt.predict(embedding_layer, batch_sentences_dev[1], batch_labels_dev[1]) > 0.5).astype('int') == batch_labels_dev[1]).mean()

Using batch_size of 32


0.625

In [22]:
from util import f1_score

def predict_batches(input_layer, batched_X, batched_y):
    all_preds = []
    for X, y in zip(batched_X, batched_y):
        preds = opt.predict(input_layer, X, y)
        all_preds.append(preds)
    return np.vstack(all_preds)

def get_pred_and_true_labels(true_y=None, pred_y=None):
    if true_y is not None:
        true_y = np.argmax(true_y, axis=1)
    if pred_y is not None:
        pred_y = np.argmax(pred_y, axis=1)
    return true_y, pred_y

In [23]:
preds = predict_batches(embedding_layer, batch_sentences_train, batch_labels_train)
y_true = np.vstack(batch_labels_train)
((preds > 0.5).astype('float') == y_true).mean()

Using batch_size of 32
Using batch_size of 32
Using batch_size of 32
Using batch_size of 32
Using batch_size of 32
Using batch_size of 32
Using batch_size of 32
Using batch_size of 32
Using batch_size of 32
Using batch_size of 32
Using batch_size of 32
Using batch_size of 32
Using batch_size of 32
Using batch_size of 32
Using batch_size of 32
Using batch_size of 32
Using batch_size of 32
Using batch_size of 32
Using batch_size of 32
Using batch_size of 32
Using batch_size of 32
Using batch_size of 32
Using batch_size of 32
Using batch_size of 32
Using batch_size of 32
Using batch_size of 32
Using batch_size of 32
Using batch_size of 32
Using batch_size of 32
Using batch_size of 32
Using batch_size of 32
Using batch_size of 32
Using batch_size of 32
Using batch_size of 32
Using batch_size of 32
Using batch_size of 32
Using batch_size of 32
Using batch_size of 32
Using batch_size of 32
Using batch_size of 32
Using batch_size of 32
Using batch_size of 32
Using batch_size of 32
Using batch

0.9349258649093904

In [24]:
preds = predict_batches(embedding_layer, batch_sentences_dev, batch_labels_dev)
y_true = np.vstack(batch_labels_dev)
((preds > 0.5).astype('float') == y_true).mean()

Using batch_size of 32
Using batch_size of 32
Using batch_size of 32
Using batch_size of 32
Using batch_size of 32
Using batch_size of 32


0.6458333333333334

In [25]:
# Validation F1-score
f1_score(*get_pred_and_true_labels(predict_batches(embedding_layer, batch_sentences_dev, batch_labels_dev), np.vstack(batch_labels_dev)))

Using batch_size of 32
Using batch_size of 32
Using batch_size of 32
Using batch_size of 32
Using batch_size of 32
Using batch_size of 32


0.6179775280898876

In [26]:
test_encoded = encode_sentences(tokenizer(test_sentences), word2i)
test_labels_encoded = encode_labels(test_labels)
test_labels_encoded = test_labels_encoded

In [27]:
((opt.predict(embedding_layer, test_encoded, test_labels_encoded) > 0.5).astype('int') == test_labels_encoded).mean()

Using batch_size of 784


0.6492346938775511

In [28]:
# Test F1-score
true_y, pred_y = get_pred_and_true_labels(test_labels_encoded, opt.predict(embedding_layer, test_encoded, test_labels_encoded))
f1_score(pred_y, true_y)

Using batch_size of 784


0.5424292845257903

## Experiment 2

In [4]:
from tensorflow import keras
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix
from keras.utils import to_categorical

In [5]:
reuters_data = keras.datasets.reuters
(X_train, y_train),(X_test, y_test) = reuters_data.load_data(num_words=25000)

In [6]:
seq_lengths = [len(x) for x in X_train]
np.quantile(np.array(seq_lengths), 0.9)

313.0

In [7]:
X_train = pad_sequences(X_train, maxlen=350, padding='post')
X_test = pad_sequences(X_test, maxlen=350, padding='post')

In [8]:
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=500)

In [9]:
X_train.shape, y_train.shape, X_val.shape, y_val.shape, X_test.shape, y_test.shape

((8482, 350), (8482,), (500, 350), (500,), (2246, 350), (2246,))

In [10]:
# model = Sequential()
# model.add(InputLayer(input_shape=(350,)))
# model.add(Embedding(input_dim=25000, output_dim=64, mask_zero=True))
# model.add(LSTM(128, return_sequences=False, dropout=0.2, recurrent_dropout=0.2))
# model.add(Dense(64, activation='relu'))
# model.add(Dropout(0.2))
# model.add(Dense(46, activation='softmax'))

In [11]:
## create model
embedding_layer = Embeddings(num_embeddings=25000, 
                             embedding_dim=64, 
                             pad_idx=0,
                             trainable=True)
# embedding_layer.W = embeddings.T.copy()
# lstm1 = LSTM(hidden_units=128, return_seq=True, return_mask=True, reg=0.0)(embedding_layer)
lstm = LSTM(hidden_units=128, return_seq=False, return_mask=False, reg=1e-4)(embedding_layer)
dense1 = Dense(out_dim=64, reg=5e-4)(lstm)
act1 = Activation(func='relu')(dense1)
dense = Dense(out_dim=46, reg=1e-4)(act1)
out = Activation(func='softmax')(dense)


In [12]:
opt = Optimizer(lr=5e-3)
# opt.run_forward(embedding_layer, batch_sentences_train[0][:2], np.array([[1], [0]]))

In [13]:
opt.train(embedding_layer, out, X_train, to_categorical(y_train), epochs=10, batch_size=128, verbose=True)

Using batch_size of 128


  0%|          | 0/67 [00:00<?, ?it/s]

100%|██████████| 67/67 [02:36<00:00,  2.33s/it]


epoch: 1 loss: 2.255361786979062


100%|██████████| 67/67 [02:24<00:00,  2.16s/it]


epoch: 2 loss: 2.1051410344001305


100%|██████████| 67/67 [02:34<00:00,  2.30s/it]


epoch: 3 loss: 1.904604715212081


100%|██████████| 67/67 [02:35<00:00,  2.32s/it]


epoch: 4 loss: 1.7064607626719517


100%|██████████| 67/67 [02:27<00:00,  2.20s/it]


epoch: 5 loss: 1.6231017069796987


100%|██████████| 67/67 [02:21<00:00,  2.11s/it]


epoch: 6 loss: 1.458767529718493


100%|██████████| 67/67 [02:28<00:00,  2.22s/it]


epoch: 7 loss: 1.2633129531069447


100%|██████████| 67/67 [02:29<00:00,  2.23s/it]


epoch: 8 loss: 1.089423604429264


100%|██████████| 67/67 [02:44<00:00,  2.45s/it]


epoch: 9 loss: 0.9511709621117089


100%|██████████| 67/67 [02:26<00:00,  2.18s/it]

epoch: 10 loss: 0.9011357115154357





In [21]:
opt.train(embedding_layer, out, X_train, to_categorical(y_train), epochs=10, batch_size=64, verbose=True)

Using batch_size of 64


100%|██████████| 133/133 [02:31<00:00,  1.14s/it]


epoch: 1 loss: 0.7094624225582709


100%|██████████| 133/133 [02:32<00:00,  1.15s/it]


epoch: 2 loss: 0.5980168502808894


100%|██████████| 133/133 [02:33<00:00,  1.16s/it]


epoch: 3 loss: 0.5225998740373119


100%|██████████| 133/133 [02:33<00:00,  1.15s/it]


epoch: 4 loss: 0.4628947940258892


100%|██████████| 133/133 [02:33<00:00,  1.15s/it]


epoch: 5 loss: 0.42024021920055826


100%|██████████| 133/133 [02:32<00:00,  1.15s/it]


epoch: 6 loss: 0.3684495619195603


100%|██████████| 133/133 [02:33<00:00,  1.16s/it]


epoch: 7 loss: 0.33526827036573603


100%|██████████| 133/133 [02:33<00:00,  1.15s/it]


epoch: 8 loss: 0.30713217249119


100%|██████████| 133/133 [02:31<00:00,  1.14s/it]


epoch: 9 loss: 0.27593803259504734


100%|██████████| 133/133 [02:31<00:00,  1.14s/it]

epoch: 10 loss: 0.2622258582089536





In [28]:
opt.train(embedding_layer, out, X_train, to_categorical(y_train), epochs=5, batch_size=128, verbose=True)

Using batch_size of 128


100%|██████████| 67/67 [02:24<00:00,  2.16s/it]


epoch: 1 loss: 0.21784990162273726


100%|██████████| 67/67 [02:14<00:00,  2.00s/it]


epoch: 2 loss: 0.17604003727031522


100%|██████████| 67/67 [02:16<00:00,  2.04s/it]


epoch: 3 loss: 0.16121125842849707


100%|██████████| 67/67 [02:14<00:00,  2.01s/it]


epoch: 4 loss: 0.1520056982995575


100%|██████████| 67/67 [02:13<00:00,  2.00s/it]

epoch: 5 loss: 0.1446977032185888





In [44]:
opt.lr = 0.0002

In [53]:
opt.train(embedding_layer, out, X_train, to_categorical(y_train), epochs=3, batch_size=128, verbose=True)

Using batch_size of 128


100%|██████████| 67/67 [02:21<00:00,  2.12s/it]


epoch: 1 loss: 0.1174973161716144


100%|██████████| 67/67 [02:18<00:00,  2.07s/it]


epoch: 2 loss: 0.11639521939871779


100%|██████████| 67/67 [02:18<00:00,  2.06s/it]

epoch: 3 loss: 0.11538785745599028





In [30]:
opt.lr = 0.0005

In [48]:
embedding_layer.momment1 *= 0
embedding_layer.momment2 *= 0
lstm.momment1 *= 0
lstm.momment2 *= 0
dense1.momment1 *= 0
dense1.momment2 *= 0
dense.momment1 *= 0
dense.momment2 *= 0
opt.t = 1

In [152]:
opt.train(embedding_layer, out, X_train, to_categorical(y_train), epochs=1, batch_size=128, verbose=True)

Using batch_size of 128


  mt1_hat = moment1/(1 - self.b1**t)
  mt2_hat = moment2/(1 - self.b2**t)
  W = layer.W - (lr * mt1_hat/(np.sqrt(mt2_hat) + self.eps))
  mt1_hat = moment_b1/(1 - self.b1**t)
  mt2_hat = moment_b2/(1 - self.b2**t)
  b = layer.b - (lr * mt1_hat/(np.sqrt(mt2_hat) + self.eps))
  mt1_hat = moment_b1/(1 - self.b1**t)
  mt2_hat = moment_b2/(1 - self.b2**t)
  mt1_hat = moment1/(1 - self.b1**t)
  mt2_hat = moment2/(1 - self.b2**t)
100%|██████████| 67/67 [02:25<00:00,  2.17s/it]

epoch: 1 loss: 22.695382931101676





In [51]:
opt.run_forward(embedding_layer, X_val, to_categorical(y_val, 46))

1.6807436519317873

In [40]:
# preds = predict_batches(embedding_layer, batch_sentences_train, batch_labels_train)
preds = opt.predict(embedding_layer, X_val, to_categorical(y_val, 46))
# y_true = np.vstack(batch_labels_train)
pred_y = np.argmax(preds, axis=1)
(pred_y == y_val).mean()

Using batch_size of 500


0.726

In [41]:
# preds = predict_batches(embedding_layer, batch_sentences_train, batch_labels_train)
preds = opt.predict(embedding_layer, X_test, to_categorical(y_test))
# y_true = np.vstack(batch_labels_train)
pred_y = np.argmax(preds, axis=1)
(pred_y == y_test).mean()

Using batch_size of 2246


0.711487088156723

In [42]:
f1_score(y_test, pred_y, average=None)

array([0.27272727, 0.61904762, 0.47368421, 0.91183575, 0.80887012,
       0.        , 0.57142857, 0.33333333, 0.48837209, 0.72      ,
       0.625     , 0.58441558, 0.13333333, 0.28947368, 0.        ,
       0.        , 0.60550459, 0.08695652, 0.58536585, 0.61481481,
       0.40944882, 0.48888889, 0.        , 0.16666667, 0.28571429,
       0.5483871 , 0.15384615, 0.4       , 0.1       , 0.25      ,
       0.31578947, 0.15384615, 0.38095238, 0.6       , 0.53333333,
       0.4       , 0.28571429, 0.        , 0.        , 0.16666667,
       0.28571429, 0.10526316, 0.        , 0.5       , 0.88888889,
       0.5       ])

In [43]:
f1_score(y_test, pred_y, average='macro')

0.36181051917055607

In [52]:
# pred_y = np.argmax(preds, axis=1)
# (pred_y == y_test).mean()

In [54]:
# preds = predict_batches(embedding_layer, batch_sentences_train, batch_labels_train)
preds = opt.predict(embedding_layer, X_val, to_categorical(y_val, 46))
# y_true = np.vstack(batch_labels_train)
pred_y = np.argmax(preds, axis=1)
(pred_y == y_val).mean()

Using batch_size of 500


0.73

In [55]:
# preds = predict_batches(embedding_layer, batch_sentences_train, batch_labels_train)
preds = opt.predict(embedding_layer, X_test, to_categorical(y_test))
# y_true = np.vstack(batch_labels_train)
pred_y = np.argmax(preds, axis=1)
(pred_y == y_test).mean()

Using batch_size of 2246


0.711487088156723

In [56]:
f1_score(y_test, pred_y, average=None)

array([0.26086957, 0.61111111, 0.47368421, 0.91206792, 0.80541103,
       0.        , 0.57142857, 0.33333333, 0.49411765, 0.70833333,
       0.59701493, 0.57324841, 0.13333333, 0.27848101, 0.        ,
       0.        , 0.60550459, 0.1       , 0.6       , 0.62222222,
       0.40625   , 0.46511628, 0.        , 0.18181818, 0.26666667,
       0.56666667, 0.16666667, 0.4       , 0.11764706, 0.25      ,
       0.31578947, 0.16666667, 0.4       , 0.44444444, 0.53333333,
       0.4       , 0.25      , 0.        , 0.        , 0.18181818,
       0.28571429, 0.11764706, 0.        , 0.5       , 0.88888889,
       0.5       ])

In [57]:
f1_score(y_test, pred_y, average='macro')

0.3583759796436626

In [None]:
preds = predict_batches(embedding_layer, batch_sentences_dev, batch_labels_dev)
y_true = np.vstack(batch_labels_dev)
((preds > 0.5).astype('float') == y_true).mean()

Using batch_size of 32
Using batch_size of 32
Using batch_size of 32
Using batch_size of 32
Using batch_size of 32
Using batch_size of 32


0.71875

In [None]:
# Validation F1-score
f1_score(*get_pred_and_true_labels(predict_batches(embedding_layer, batch_sentences_dev, batch_labels_dev), np.vstack(batch_labels_dev)))

Using batch_size of 32
Using batch_size of 32
Using batch_size of 32
Using batch_size of 32
Using batch_size of 32
Using batch_size of 32


0.7522935779816514

In [None]:
test_encoded = encode_sentences(tokenizer(test_sentences), word2i)
test_labels_encoded = encode_labels(test_labels)
test_labels_encoded = test_labels_encoded

In [None]:
((opt.predict(embedding_layer, test_encoded, test_labels_encoded) > 0.5).astype('int') == test_labels_encoded).mean()

Using batch_size of 784


0.6594387755102041

In [None]:
# Test F1-score
true_y, pred_y = get_pred_and_true_labels(test_labels_encoded, opt.predict(embedding_layer, test_encoded, test_labels_encoded))
f1_score(pred_y, true_y)

Using batch_size of 784


0.6377204884667571

## Testing and Debugging

In [788]:
[t.text for t in tokenizer.nlp("Sweet United Nations video. Just in time for Christmas. #imagine #NoReligion  http://t.co/fej2v3OUBR")]

['Sweet',
 'United',
 'Nations',
 'video',
 '.',
 'Just',
 'in',
 'time',
 'for',
 'Christmas',
 '.',
 '#',
 'imagine',
 '#',
 'NoReligion',
 ' ',
 'http://t.co/fej2v3OUBR']

In [790]:
not_training_data = dev_sentences + test_sentences
my_tokenizer = Tokenizer()

tokenized_data = my_tokenizer.tokenize(not_training_data)
not_training_vocab = set([w for ws in tokenized_data + [SPECIAL_TOKENS] for w in ws])

not_training_oovs = set(oovs).intersection(not_training_vocab)
not_training_oovs

{'@restlessduncan',
 'women|#misogyny|',
 'grinning_face_with_big_eyes::thumbs_up',
 '@grimmers',
 '@mtnza',
 'loveeee',
 'web|please',
 '@twitwhizz',
 'guardiansofpeace',
 'http://t.co/3io7qlk0lr',
 'face_with_tears_of_joy::face_with_tears_of_joy::face_with_tears_of_joy::face_with_tears_of_joy',
 'http://t.co/9wjfiq6wmk',
 '@siddharth_0703',
 'http://t.co/gmfru7gmir',
 '90ssarcasm',
 'http://t.co/yygdzirwfz',
 '@quiksilverindia',
 'nigher',
 'morefollowersplease',
 '@lariatofhestia',
 'http://t.co/3ov57zhqch',
 'idc#istillloveyou',
 'http://t.co/qulijvfndk',
 '||#no',
 'http://t.co/delfxn0bpi',
 '@virginmedia',
 'http://t.co/hvxmxrdktm',
 '@oneplanetmikey',
 'nmucomputersarethebest',
 '2012',
 '@progressmich',
 '@thelexzane',
 '@worlddiamark',
 'qhycb5zkev',
 'orevsosu',
 '|keeping',
 'lakhvi',
 'loudly_crying_face::crying_face',
 '11',
 'cancup',
 'film2014',
 '@garnetngold22',
 '@gazthegooner84',
 'not@allpeaceful',
 'nic_kill',
 'weary_face',
 '@erin_also',
 'triedandtrue',
 'there

In [792]:
len(not_training_vocab), len(not_training_oovs)

(6784, 2120)

In [794]:
len(vocab), len(set(vocab) - not_training_vocab)

(15302, 8518)

In [698]:
batch_labels_train[0]

array([[1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [0],
       [0],
       [0],
       [1],
       [1],
       [1],
       [1],
       [0],
       [0],
       [0],
       [1],
       [0],
       [0],
       [0],
       [1],
       [1],
       [0],
       [0],
       [0],
       [0],
       [0]])

In [79]:
lstm.W

array([[-0.05115519,  2.79060194,  1.74904423, ...,  0.90218995,
        -0.07823642,  0.46452929],
       [ 0.03957311, -0.08359327,  0.07810808, ..., -1.1645801 ,
        -1.47297465,  1.90070732],
       [ 0.62807933,  0.95442775, -0.89309501, ..., -1.15517324,
        -0.74455474, -0.79810859],
       ...,
       [ 1.49882801,  0.70325473, -1.13812968, ...,  0.54420823,
        -0.29446765, -0.33132347],
       [-1.28244904, -0.60616415,  0.90391816, ..., -0.01481464,
        -0.34305424, -2.27542031],
       [-0.27966446, -0.14053061, -0.39564932, ...,  1.14049864,
        -0.76508639,  1.06653365]])

In [81]:
lstm.W

array([[-0.05115519,  2.79060194,  1.74904423, ...,  0.90218995,
        -0.07823642,  0.46452929],
       [ 0.03957311, -0.08359327,  0.07810808, ..., -1.1645801 ,
        -1.47297465,  1.90070732],
       [ 0.62807933,  0.95442775, -0.89309501, ..., -1.15517324,
        -0.74455474, -0.79810859],
       ...,
       [ 1.49882801,  0.70325473, -1.13812968, ...,  0.54420823,
        -0.29446765, -0.33132347],
       [-1.28244904, -0.60616415,  0.90391816, ..., -0.01481464,
        -0.34305424, -2.27542031],
       [-0.27966446, -0.14053061, -0.39564932, ...,  1.14049864,
        -0.76508639,  1.06653365]])

In [80]:
lstm.Wf

array([[-0.05115519,  2.79060194,  1.74904423, ...,  0.90218995,
        -0.07823642,  0.46452929],
       [ 0.03957311, -0.08359327,  0.07810808, ..., -1.1645801 ,
        -1.47297465,  1.90070732],
       [ 0.62807933,  0.95442775, -0.89309501, ..., -1.15517324,
        -0.74455474, -0.79810859],
       ...,
       [ 0.72198955, -2.17146252,  0.0466316 , ...,  0.17704579,
        -1.78284933, -0.21232404],
       [-2.08491991, -1.31290281,  1.06203141, ..., -0.36402803,
        -0.35470426, -0.40404115],
       [-0.39347399,  0.65912157,  0.28453421, ...,  0.17566116,
        -0.99604728, -0.5599636 ]])

In [859]:
lstm.Wf

array([[ 0.614 ,  1.0055, -0.1676, ..., -1.4183,  0.728 , -2.0562],
       [-0.48  , -0.7523, -0.9078, ..., -0.0931,  0.1954, -0.9103],
       [-0.893 ,  0.1351, -1.06  , ..., -1.2487,  1.2845, -1.5094],
       ...,
       [-0.4075,  0.6146,  0.6026, ..., -0.1131, -0.2829, -1.1643],
       [-0.7772, -0.1522,  2.1256, ...,  0.3602, -0.4155, -0.0238],
       [-1.212 , -1.4438,  0.4278, ..., -0.9146,  0.6358, -0.5758]])

In [658]:
lstm.forward(embedding_layer.forward(batch_sentences_train[0][:1]))

array([[ 0.0009,  0.272 ,  0.4744,  0.075 ,  0.0352, -0.107 , -0.552 ,
         0.1838, -0.119 ,  0.0983,  0.2458,  0.2809,  0.2147, -0.0157,
         0.3402, -0.1941]])

In [82]:
lstm.mask

array([24, 19,  7, 13, 31,  8, 33, 12, 21, 26, 11, 11, 22, 16, 20, 13, 20,
       38, 31, 18, 10, 35, 17, 31, 12, 12, 17, 17, 23, 12, 14, 19, 10, 13,
       21, 30, 18, 16, 24, 21, 18, 37, 17, 25, 22, 10, 24, 29, 17, 21, 19,
       24, 12, 14, 31, 26, 14, 27, 34, 12, 25, 11, 25, 30, 12, 30, 10, 17,
       34, 24, 26, 14, 25, 29, 23, 19, 22, 22, 24, 22, 15, 31, 33, 13, 24,
       24, 26, 22, 13, 23, 17, 23, 18, 28, 29, 26, 23, 17, 16, 28, 28, 17,
       13, 30, 20, 30, 22, 32, 18, 30, 14, 22, 24, 12, 25, 25, 19, 13,  9,
       22, 19,  9, 16, 28, 40, 13, 25, 18, 31, 12, 31, 33, 11, 25, 15, 32,
       21,  8, 30, 25, 17, 16, 23, 29, 17, 28, 32, 30, 12, 24, 17, 22, 38,
       13, 13, 25, 20, 19, 21, 17, 32, 17, 27, 11, 16, 22, 24, 29, 22, 29,
       15, 10, 22, 20, 13, 12, 13, 14,  9, 23, 24, 29, 35, 21, 18,  8, 11,
       15, 27, 26, 19, 14, 28, 21, 11, 10, 28, 17, 35, 20, 30, 21, 41, 16,
       19, 21, 23, 18, 32, 15,  9, 10, 25, 23, 16, 19, 20, 31, 24, 20,  9,
       22, 26,  8, 36, 30

In [883]:
lstm.out

array([[[ 0.    ,  0.    ,  0.    , ...,  0.    ,  0.    ,  0.    ],
        [ 0.5149, -0.0346, -0.3286, ..., -0.1882, -0.0276,  0.1175],
        [ 0.7545,  0.0282, -0.1592, ..., -0.4149, -0.0049, -0.1407],
        ...,
        [-0.0466,  0.3701, -0.4454, ...,  0.0014, -0.0075, -0.0044],
        [ 0.5875,  0.1914, -0.0786, ...,  0.0058, -0.081 , -0.1206],
        [ 0.6102,  0.3939, -0.0117, ...,  0.0298, -0.1872, -0.0031]],

       [[ 0.    ,  0.    ,  0.    , ...,  0.    ,  0.    ,  0.    ],
        [ 0.5149, -0.0346, -0.3286, ..., -0.1882, -0.0276,  0.1175],
        [ 0.4924, -0.2652, -0.3919, ..., -0.338 , -0.1627,  0.2448],
        ...,
        [ 0.2651,  0.1682, -0.4598, ..., -0.0756, -0.8167, -0.0436],
        [ 0.0166, -0.6327, -0.148 , ..., -0.0053, -0.7913, -0.3015],
        [ 0.0409, -0.648 , -0.0991, ..., -0.024 , -0.016 ,  0.3617]],

       [[ 0.    ,  0.    ,  0.    , ...,  0.    ,  0.    ,  0.    ],
        [ 0.5149, -0.0346, -0.3286, ..., -0.1882, -0.0276,  0.1175],
    

In [180]:
embedding_layer = Embeddings(5, 10, 0)
embedding_layer

Embeddings embedding_dim: 10 num_embeddings: 5

In [181]:
X = np.array([[2,3,0,0], [1,4,1,0]])
X

array([[2, 3, 0, 0],
       [1, 4, 1, 0]])

In [182]:
out1 = embedding_layer.forward(X)
out1

{'input_ids': array([[[ 0.11844567,  0.49995652, -0.65802771, -1.20548665,
           0.21329025,  0.58381855, -0.16526934, -0.25223838,
           0.76283889,  0.94854758],
         [-1.51672031,  0.03719266, -0.44633685,  2.44429478,
           1.71287478, -0.76141086,  1.07287826, -0.21405207,
          -1.29394351,  0.29765764],
         [ 0.61638133, -0.96546929, -0.10739356,  0.74669643,
           0.27681166,  1.67266701,  0.72222543,  0.93011466,
           1.28923875, -2.84106437],
         [ 0.61638133, -0.96546929, -0.10739356,  0.74669643,
           0.27681166,  1.67266701,  0.72222543,  0.93011466,
           1.28923875, -2.84106437]],
 
        [[-1.52225956, -0.99993144,  0.31207765, -1.16411845,
           0.95962902, -1.11016667,  0.27763399,  0.25024548,
           0.51261377, -1.12282121],
         [-1.51425629, -0.2689891 ,  0.7137898 ,  0.89214974,
          -0.48302   ,  0.94458768, -0.37389572, -0.14598221,
          -0.59578975, -0.73285059],
         [-1.52225

In [183]:
lstm1 = LSTM(4, return_seq=True, return_mask=True)(embedding_layer)
lstm = LSTM(4, return_seq=False)(lstm1)
dense = Dense(2)(lstm)


In [184]:
lstm.inp_dim

4

In [944]:
# X = np.ones((5, 4, 3))
# X

In [608]:
lstm.fg

array([[[0.    , 0.    , 0.    , 0.    ],
        [0.6511, 0.4577, 0.9708, 0.2667],
        [0.0445, 0.133 , 0.5706, 0.6013],
        [0.018 , 0.9978, 0.1421, 0.0926],
        [0.0074, 0.9998, 0.4178, 0.1584]],

       [[0.    , 0.    , 0.    , 0.    ],
        [0.0766, 0.0793, 0.9569, 0.7063],
        [0.0442, 0.9916, 0.0006, 0.0399],
        [0.0294, 0.403 , 0.9871, 0.7899],
        [0.0937, 0.9995, 0.2483, 0.0019]]])

In [185]:
out = lstm1.forward(out1)
out

{'input_ids': array([[[ 3.74326341e-01,  1.09327787e-01, -2.01756594e-02,
           7.11266094e-02],
         [ 2.76879383e-02, -1.26430568e-01,  1.39025959e-02,
          -6.55696046e-02],
         [-6.52705451e-01, -8.33636621e-01, -4.68831358e-01,
           6.32572510e-03],
         [-9.03486253e-01, -9.51213402e-01, -8.68561298e-01,
           8.78239453e-03]],
 
        [[ 3.92885282e-02, -6.53056042e-02, -5.39128568e-01,
           1.42195093e-04],
         [-5.36765554e-01, -7.64438079e-01, -1.20788970e-01,
           2.18462473e-02],
         [ 3.88728567e-02, -1.23081991e-01, -8.31529499e-01,
           2.04708422e-02],
         [-7.39809751e-01, -8.95744397e-01, -9.66949000e-01,
           1.07063025e-02]]]),
 'seq_lens': array([2, 3])}

In [186]:
lstm1.out

array([[[ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
          0.00000000e+00],
        [ 3.74326341e-01,  1.09327787e-01, -2.01756594e-02,
          7.11266094e-02],
        [ 2.76879383e-02, -1.26430568e-01,  1.39025959e-02,
         -6.55696046e-02],
        [-6.52705451e-01, -8.33636621e-01, -4.68831358e-01,
          6.32572510e-03],
        [-9.03486253e-01, -9.51213402e-01, -8.68561298e-01,
          8.78239453e-03]],

       [[ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
          0.00000000e+00],
        [ 3.92885282e-02, -6.53056042e-02, -5.39128568e-01,
          1.42195093e-04],
        [-5.36765554e-01, -7.64438079e-01, -1.20788970e-01,
          2.18462473e-02],
        [ 3.88728567e-02, -1.23081991e-01, -8.31529499e-01,
          2.04708422e-02],
        [-7.39809751e-01, -8.95744397e-01, -9.66949000e-01,
          1.07063025e-02]]])

In [187]:
X

array([[2, 3, 0, 0],
       [1, 4, 1, 0]])

In [188]:
out3 = lstm.forward(out)
out3

array([[-0.09760457, -0.13338779,  0.02814205, -0.14806707],
       [ 0.34470959,  0.23761088, -0.17409077,  0.23676048]])

In [189]:
lstm.out

array([[[ 0.        ,  0.        ,  0.        ,  0.        ],
        [-0.07855371, -0.14074963,  0.16732286, -0.13013733],
        [-0.09760457, -0.13338779,  0.02814205, -0.14806707],
        [ 0.03975932,  0.1266031 , -0.13598183,  0.20835219],
        [ 0.0561025 ,  0.10093059, -0.1341875 ,  0.27669279]],

       [[ 0.        ,  0.        ,  0.        ,  0.        ],
        [ 0.13433303,  0.2249573 , -0.09394001, -0.01706918],
        [ 0.13240294,  0.11801888, -0.1209671 ,  0.23776542],
        [ 0.34470959,  0.23761088, -0.17409077,  0.23676048],
        [ 0.12173959,  0.10047364, -0.10940592,  0.23454546]]])

In [190]:
out2 = dense.forward(out3)
out2

array([[-0.11223215, -0.03931857],
       [-0.16044303,  0.77155521]])

In [193]:
dX, dW, db = lstm.backward(dX)

dX
[[[ 0.          0.          0.          0.        ]
  [ 0.05332589  0.00516852  0.06965946 -0.01055025]
  [ 0.03592446 -0.0117603   0.03784639 -0.01498404]
  [ 0.          0.          0.          0.        ]
  [ 0.          0.          0.          0.        ]]

 [[ 0.          0.          0.          0.        ]
  [-0.01275348 -0.05692982  0.03757692  0.01166092]
  [ 0.17349736 -0.42322456 -0.32163424  0.0360653 ]
  [-0.9429311  -0.30329168  0.04347839  0.44282734]
  [ 0.          0.          0.          0.        ]]]
dstate
[[[-0.00261688 -0.018689   -0.01478616 -0.01906517]
  [-0.00419815 -0.06514418 -0.01982833 -0.02856457]
  [ 0.04522555 -0.08438495 -0.03783448 -0.02488617]
  [ 0.          0.          0.          0.        ]
  [ 0.          0.          0.          0.        ]]

 [[-0.04378892 -0.01372805  0.01033982  0.01623359]
  [-0.09185422 -0.07548797  0.01281539  0.03317353]
  [-0.27016119 -0.02206315 -0.59054194  0.58491162]
  [-0.3024206  -0.53490295 -0.7187889   0.698510

In [195]:
dX, dW, db = lstm1.backward(dX)

dX
[[[ 0.          0.          0.          0.          0.
    0.          0.          0.          0.          0.        ]
  [ 0.00218676  0.00632558 -0.0044597   0.00114102 -0.00377907
    0.02417695  0.00120844 -0.0032456   0.00779047 -0.01200078]
  [ 0.00457815 -0.00165317  0.00565791  0.00260445  0.00158525
    0.00437805 -0.00157866  0.00077636  0.00073112 -0.00178632]
  [ 0.          0.          0.          0.          0.
    0.          0.          0.          0.          0.        ]
  [ 0.          0.          0.          0.          0.
    0.          0.          0.          0.          0.        ]]

 [[ 0.          0.          0.          0.          0.
    0.          0.          0.          0.          0.        ]
  [ 0.00177688  0.02925502  0.01909082  0.00404067  0.01567801
    0.05020792 -0.0291943   0.01623197 -0.01736629 -0.04664208]
  [ 0.07661261 -0.04912754  0.02399619  0.09333513 -0.05932359
   -0.0087092   0.05002928 -0.03507111  0.08198772 -0.00519837]
  [-0.06099

In [175]:
lstm.X

array([[[ 0.        ,  0.        ,  0.        ,  0.        ],
        [-0.13776511,  0.02097367,  0.40883244, -0.26052464],
        [-0.28063548,  0.00266169,  0.86347146, -0.29904221],
        [-0.69821838, -0.50917675,  0.84450935, -0.37113322],
        [-0.78662037, -0.35453346,  0.89423365, -0.28881855]],

       [[ 0.        ,  0.        ,  0.        ,  0.        ],
        [-0.69887257, -0.27756377,  0.06375953,  0.18895972],
        [-0.85692807, -0.03247021,  0.70473237, -0.56481148],
        [-0.94235021, -0.34661484,  0.64816491, -0.47360723],
        [-0.82331877, -0.76438959,  0.92088445, -0.31302486]]])

In [612]:
dO_ = np.zeros_like(lstm.out)
dO_[np.arange(out1['input_ids'].shape[0]), lstm.mask, :] = out
dO_
# dO = dO_

array([[[ 0.    ,  0.    ,  0.    ,  0.    ],
        [ 0.    ,  0.    ,  0.    ,  0.    ],
        [-0.0036, -0.582 , -0.8003,  0.1991],
        [ 0.    ,  0.    ,  0.    ,  0.    ],
        [ 0.    ,  0.    ,  0.    ,  0.    ]],

       [[ 0.    ,  0.    ,  0.    ,  0.    ],
        [ 0.    ,  0.    ,  0.    ,  0.    ],
        [ 0.    ,  0.    ,  0.    ,  0.    ],
        [-0.0128, -0.0181,  0.1125, -0.7442],
        [ 0.    ,  0.    ,  0.    ,  0.    ]]])

In [197]:
embedding_layer.backward(dX)

(0,
 array([[ 0.        , -0.05922114,  0.00218676,  0.00457815,  0.07661261],
        [ 0.        ,  0.02145665,  0.00632558, -0.00165317, -0.04912754],
        [ 0.        ,  0.0447334 , -0.0044597 ,  0.00565791,  0.02399619],
        [ 0.        ,  0.04976271,  0.00114102,  0.00260445,  0.09333513],
        [ 0.        , -0.04457249, -0.00377907,  0.00158525, -0.05932359],
        [ 0.        ,  0.08656122,  0.02417695,  0.00437805, -0.0087092 ],
        [ 0.        , -0.0050131 ,  0.00120844, -0.00157866,  0.05002928],
        [ 0.        ,  0.08998536, -0.0032456 ,  0.00077636, -0.03507111],
        [ 0.        ,  0.00597879,  0.00779047,  0.00073112,  0.08198772],
        [ 0.        , -0.07629663, -0.01200078, -0.00178632, -0.00519837]]),
 None)

In [641]:
X

array([[2, 3, 0, 0],
       [1, 4, 1, 0]])

In [196]:
dX

array([[[ 0.00218676,  0.00632558, -0.0044597 ,  0.00114102,
         -0.00377907,  0.02417695,  0.00120844, -0.0032456 ,
          0.00779047, -0.01200078],
        [ 0.00457815, -0.00165317,  0.00565791,  0.00260445,
          0.00158525,  0.00437805, -0.00157866,  0.00077636,
          0.00073112, -0.00178632],
        [ 0.        ,  0.        ,  0.        ,  0.        ,
          0.        ,  0.        ,  0.        ,  0.        ,
          0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        ,  0.        ,
          0.        ,  0.        ,  0.        ,  0.        ,
          0.        ,  0.        ]],

       [[ 0.00177688,  0.02925502,  0.01909082,  0.00404067,
          0.01567801,  0.05020792, -0.0291943 ,  0.01623197,
         -0.01736629, -0.04664208],
        [ 0.07661261, -0.04912754,  0.02399619,  0.09333513,
         -0.05932359, -0.0087092 ,  0.05002928, -0.03507111,
          0.08198772, -0.00519837],
        [-0.06099801, -0.00779837,  0.02564258,  0

In [173]:
dW

array([[ 4.34236954e-05,  3.48001222e-03, -3.01183044e-04,
        -1.39723269e-04, -3.23882547e-03,  6.49948497e-04,
         6.11086862e-03, -5.69352317e-03],
       [ 6.82570249e-03,  1.81512230e-02,  5.16372383e-04,
         1.47822349e-02, -3.37038306e-02, -9.55732222e-03,
         4.37065888e-02, -2.04024695e-02],
       [ 6.80646142e-03,  3.39200561e-02,  2.71320253e-02,
         3.31304287e-02, -1.53301216e-01, -5.22977361e-02,
         7.82465163e-02, -4.94589470e-02],
       [-5.97894992e-03, -1.61054161e-02, -7.73983436e-03,
        -1.42489793e-02,  4.97698560e-02,  1.17907714e-02,
        -4.00771246e-02,  2.71915634e-02],
       [ 1.17544666e-03,  6.84595295e-04, -4.62125066e-04,
         8.17768813e-04, -5.57717133e-03,  1.90028548e-04,
         7.13945130e-03, -6.02578380e-03],
       [ 7.30720765e-03,  1.56494740e-02,  2.49934283e-03,
         1.17733442e-02, -3.30604011e-02, -1.22425306e-02,
         4.10656120e-02, -2.10979068e-02],
       [ 1.90958911e-03,  2.030879

In [174]:
db

array([[ 0.01220342,  0.0572976 ,  0.1649863 , -0.06155899],
       [ 0.01121735,  0.05274276,  0.12200713, -0.05893465],
       [ 0.04408651,  0.19130427,  0.04313542, -0.04690746],
       [ 0.08806456,  0.12116166,  0.39583877, -0.12531922]])

In [845]:
# Wf = dW[:4, :]
Wf

array([[10.0001, 10.0024,  9.9972, 10.    , 10.0043,  9.999 , 10.0012,
         9.9973,  9.9998,  9.9999, 10.0034, 10.0084,  9.9988,  9.9993],
       [ 9.9997,  9.9998, 10.0011,  9.9998, 10.0001,  9.9994,  9.9997,
         9.9991,  9.9992, 10.0031,  9.9988,  9.9995,  9.9995, 10.0017],
       [10.0035, 10.0035, 10.0214, 10.0288, 10.0168, 10.0257,  9.9654,
        10.0119,  9.8888, 10.0588, 10.0023,  9.9728,  9.9593,  9.9891],
       [ 9.9995,  9.9996,  9.9998, 10.001 , 10.0009, 10.0004,  9.9992,
         9.9994, 10.    ,  9.9999,  9.9999,  9.9989, 10.0006,  9.999 ]])

In [171]:
np.set_printoptions(suppress=True, precision=4)

In [841]:
Wf += 10

In [843]:
dW = dW + 100

In [844]:
dW

array([[110.0001, 110.0024, 109.9972, 110.    , 110.0043, 109.999 ,
        110.0012, 109.9973, 109.9998, 109.9999, 110.0034, 110.0084,
        109.9988, 109.9993],
       [109.9997, 109.9998, 110.0011, 109.9998, 110.0001, 109.9994,
        109.9997, 109.9991, 109.9992, 110.0031, 109.9988, 109.9995,
        109.9995, 110.0017],
       [110.0035, 110.0035, 110.0214, 110.0288, 110.0168, 110.0257,
        109.9654, 110.0119, 109.8888, 110.0588, 110.0023, 109.9728,
        109.9593, 109.9891],
       [109.9995, 109.9996, 109.9998, 110.001 , 110.0009, 110.0004,
        109.9992, 109.9994, 110.    , 109.9999, 109.9999, 109.9989,
        110.0006, 109.999 ],
       [100.0001,  99.9998,  99.9997,  99.9995,  99.998 , 100.0013,
         99.9999, 100.0027,  99.9964, 100.0023,  99.9978,  99.9966,
         99.9999,  99.9987],
       [100.0002, 100.0041, 100.0113,  99.9986,  99.9839, 100.0199,
         99.991 , 100.0184, 100.0339,  99.9832,  99.9831,  99.9619,
         99.9924,  99.9719],
       [10

In [4]:
x=1
(np.exp(x) - np.exp(-x))/(np.exp(-x) + np.exp(x))

0.7615941559557649

In [65]:
a = np.random.randint(0, 10, (3,4))
a

array([[6, 6, 1, 0],
       [3, 1, 3, 6],
       [7, 4, 6, 3]])

In [66]:
b = np.array([1,2,3, 5])
b

array([1, 2, 3, 5])

In [67]:
a + b

array([[ 7,  8,  4,  5],
       [ 4,  3,  6, 11],
       [ 8,  6,  9,  8]])

In [27]:
a *= 2

In [28]:
b

array([ 8, 18, 12, 20])

In [32]:
dO = np.random.randint(0,10, (3, 4, 9))
dO

array([[[0, 3, 5, 1, 1, 1, 7, 5, 2],
        [8, 1, 3, 6, 0, 2, 1, 0, 0],
        [9, 2, 2, 2, 3, 0, 6, 5, 5],
        [4, 9, 5, 4, 6, 1, 4, 9, 3]],

       [[6, 9, 1, 8, 0, 0, 2, 2, 8],
        [7, 2, 3, 0, 7, 8, 8, 3, 6],
        [2, 5, 2, 0, 4, 1, 9, 3, 7],
        [6, 1, 3, 1, 8, 2, 3, 1, 2]],

       [[3, 2, 8, 3, 3, 1, 7, 0, 0],
        [8, 2, 1, 1, 7, 8, 9, 4, 7],
        [6, 5, 8, 6, 0, 8, 5, 3, 3],
        [0, 0, 6, 0, 8, 6, 2, 2, 1]]])

In [49]:
dO = np.concatenate([np.zeros((dO.shape[0], 1, dO.shape[2])), dO], axis=1)

In [40]:
np.zeros((dO.shape[0], 1, dO.shape[2])).shape

(3, 1, 9)

In [47]:
dO

array([[[0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 3., 5., 1., 1., 1., 7., 5., 2.],
        [8., 1., 3., 6., 0., 2., 1., 0., 0.],
        [9., 2., 2., 2., 3., 0., 6., 5., 5.],
        [4., 9., 5., 4., 6., 1., 4., 9., 3.]],

       [[0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [6., 9., 1., 8., 0., 0., 2., 2., 8.],
        [7., 2., 3., 0., 7., 8., 8., 3., 6.],
        [2., 5., 2., 0., 4., 1., 9., 3., 7.],
        [6., 1., 3., 1., 8., 2., 3., 1., 2.]],

       [[0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [3., 2., 8., 3., 3., 1., 7., 0., 0.],
        [8., 2., 1., 1., 7., 8., 9., 4., 7.],
        [6., 5., 8., 6., 0., 8., 5., 3., 3.],
        [0., 0., 6., 0., 8., 6., 2., 2., 1.]]])

In [535]:
emb = np.random.randint(0,20, (10, 8))
emb

array([[ 0, 12, 15,  4,  1, 19, 14, 16],
       [15,  0, 18,  6, 10, 13,  4,  5],
       [ 9,  3, 18,  7,  1, 16,  0,  5],
       [11,  7,  4, 13,  6,  1, 10,  3],
       [ 6,  2,  1, 16,  9, 17,  1,  2],
       [10,  0, 10, 14, 12,  2, 16, 10],
       [ 9,  0,  5, 13, 18, 10,  5,  7],
       [ 9, 19,  0,  0,  3, 15, 15, 12],
       [12,  5, 10,  7,  0, 16, 13,  8],
       [15, 16, 11,  2, 14, 15, 13,  2]])

In [546]:
X = np.array([[1,2,3, 7], [4,3,6,6], [4,1,7,5]])
X

array([[1, 2, 3, 7],
       [4, 3, 6, 6],
       [4, 1, 7, 5]])

In [548]:
(X == 6).sum(axis=1)

array([0, 2, 0])

In [203]:
emb[np.ix_(np.arange(10))]

ValueError: Cross index must be 1 dimensional

In [537]:
np.take(emb.T, X, axis=0)

array([[[12,  0,  3,  7,  2,  0,  0, 19,  5, 16],
        [15, 18, 18,  4,  1, 10,  5,  0, 10, 11],
        [ 4,  6,  7, 13, 16, 14, 13,  0,  7,  2],
        [16,  5,  5,  3,  2, 10,  7, 12,  8,  2]],

       [[ 1, 10,  1,  6,  9, 12, 18,  3,  0, 14],
        [ 4,  6,  7, 13, 16, 14, 13,  0,  7,  2],
        [14,  4,  0, 10,  1, 16,  5, 15, 13, 13],
        [14,  4,  0, 10,  1, 16,  5, 15, 13, 13]],

       [[ 1, 10,  1,  6,  9, 12, 18,  3,  0, 14],
        [12,  0,  3,  7,  2,  0,  0, 19,  5, 16],
        [16,  5,  5,  3,  2, 10,  7, 12,  8,  2],
        [19, 13, 16,  1, 17,  2, 10, 15, 16, 15]]])

In [538]:
# N, T, emb_size 
dW = np.zeros_like(emb)
dW


array([[0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0]])

In [539]:
dO = np.tile(np.arange(10), (3,4,1))
dO

array([[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
        [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
        [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
        [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]],

       [[0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
        [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
        [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
        [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]],

       [[0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
        [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
        [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
        [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]]])

In [472]:
dW.T[X, :] += dO

In [473]:
dW

array([[0, 0, 0, 0, 0, 0, 0, 0],
       [0, 1, 1, 1, 1, 1, 1, 1],
       [0, 2, 2, 2, 2, 2, 2, 2],
       [0, 3, 3, 3, 3, 3, 3, 3],
       [0, 4, 4, 4, 4, 4, 4, 4],
       [0, 5, 5, 5, 5, 5, 5, 5],
       [0, 6, 6, 6, 6, 6, 6, 6],
       [0, 7, 7, 7, 7, 7, 7, 7],
       [0, 8, 8, 8, 8, 8, 8, 8],
       [0, 9, 9, 9, 9, 9, 9, 9]])

In [477]:
np.add.at(dW.T, np.s_[X, :], dO)

In [478]:
dW

array([[ 0,  0,  0,  0,  0,  0,  0,  0],
       [ 0,  2,  1,  2,  2,  1,  2,  2],
       [ 0,  4,  2,  4,  4,  2,  4,  4],
       [ 0,  6,  3,  6,  6,  3,  6,  6],
       [ 0,  8,  4,  8,  8,  4,  8,  8],
       [ 0, 10,  5, 10, 10,  5, 10, 10],
       [ 0, 12,  6, 12, 12,  6, 12, 12],
       [ 0, 14,  7, 14, 14,  7, 14, 14],
       [ 0, 16,  8, 16, 16,  8, 16, 16],
       [ 0, 18,  9, 18, 18,  9, 18, 18]])

In [540]:
X = np.take(emb.T, X, axis=0)

In [541]:
X

array([[[12,  0,  3,  7,  2,  0,  0, 19,  5, 16],
        [15, 18, 18,  4,  1, 10,  5,  0, 10, 11],
        [ 4,  6,  7, 13, 16, 14, 13,  0,  7,  2],
        [16,  5,  5,  3,  2, 10,  7, 12,  8,  2]],

       [[ 1, 10,  1,  6,  9, 12, 18,  3,  0, 14],
        [ 4,  6,  7, 13, 16, 14, 13,  0,  7,  2],
        [14,  4,  0, 10,  1, 16,  5, 15, 13, 13],
        [14,  4,  0, 10,  1, 16,  5, 15, 13, 13]],

       [[ 1, 10,  1,  6,  9, 12, 18,  3,  0, 14],
        [12,  0,  3,  7,  2,  0,  0, 19,  5, 16],
        [16,  5,  5,  3,  2, 10,  7, 12,  8,  2],
        [19, 13, 16,  1, 17,  2, 10, 15, 16, 15]]])

In [492]:
mask = np.arange(X.shape[1]) > np.array([1,2,0])[:, np.newaxis]
mask

array([[False, False,  True,  True],
       [False, False, False,  True],
       [False,  True,  True,  True]])

In [493]:
X[mask, :] = 0

In [542]:
X

array([[[12,  0,  3,  7,  2,  0,  0, 19,  5, 16],
        [15, 18, 18,  4,  1, 10,  5,  0, 10, 11],
        [ 4,  6,  7, 13, 16, 14, 13,  0,  7,  2],
        [16,  5,  5,  3,  2, 10,  7, 12,  8,  2]],

       [[ 1, 10,  1,  6,  9, 12, 18,  3,  0, 14],
        [ 4,  6,  7, 13, 16, 14, 13,  0,  7,  2],
        [14,  4,  0, 10,  1, 16,  5, 15, 13, 13],
        [14,  4,  0, 10,  1, 16,  5, 15, 13, 13]],

       [[ 1, 10,  1,  6,  9, 12, 18,  3,  0, 14],
        [12,  0,  3,  7,  2,  0,  0, 19,  5, 16],
        [16,  5,  5,  3,  2, 10,  7, 12,  8,  2],
        [19, 13, 16,  1, 17,  2, 10, 15, 16, 15]]])

In [487]:
mask = np.array([1,2,0])
dO_ = np.zeros_like(X)
dO_

array([[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],

       [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],

       [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]])

In [490]:
dO = X[np.arange(3), 1, :]
dO

array([[ 4,  7, 14, 19, 16, 15,  1,  4,  4, 19],
       [ 5, 10, 13,  2, 15, 10,  9,  6, 16,  1],
       [16,  4, 13, 19,  1,  1,  7,  1,  7, 14]])

In [491]:
dO_[np.arange(X.shape[0]), mask, :] = dO
dO_

array([[[ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
        [ 4,  7, 14, 19, 16, 15,  1,  4,  4, 19],
        [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
        [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0]],

       [[ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
        [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
        [ 5, 10, 13,  2, 15, 10,  9,  6, 16,  1],
        [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0]],

       [[16,  4, 13, 19,  1,  1,  7,  1,  7, 14],
        [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
        [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
        [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0]]])

In [263]:
X_2d = X.reshape(-1, X.shape[2])
X_2d

array([[ 6,  0,  9, 11,  2,  9, 17, 16,  6,  4],
       [13, 19, 11,  0, 14, 16,  1, 15,  4,  9],
       [13, 19, 11,  0, 14, 16,  1, 15,  4,  9],
       [11,  1,  7, 17, 14,  2, 12, 11,  8, 15],
       [18, 12, 11, 16, 16,  2, 13,  0, 15, 12],
       [ 6,  0,  9, 11,  2,  9, 17, 16,  6,  4]])

In [293]:
X_2d.reshape(*a.shape)

array([[[ 6,  0,  9, 11,  2,  9, 17, 16,  6,  4],
        [13, 19, 11,  0, 14, 16,  1, 15,  4,  9]],

       [[13, 19, 11,  0, 14, 16,  1, 15,  4,  9],
        [11,  1,  7, 17, 14,  2, 12, 11,  8, 15]],

       [[18, 12, 11, 16, 16,  2, 13,  0, 15, 12],
        [ 6,  0,  9, 11,  2,  9, 17, 16,  6,  4]]])

In [394]:
a = np.zeros_like(X_2d.reshape(-1, 2, 10))
a

array([[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],

       [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],

       [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]])

In [296]:
b = a

In [298]:
b = a.reshape(-1, 10)

In [299]:
b

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

In [395]:
a

array([[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],

       [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],

       [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]])

In [391]:
def f1(con):
    return 1 if con else 2

In [393]:
f1(False)

2

In [508]:
np.array(1).repeat(10)

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [885]:
ou = np.arange(9)
ou

array([0, 1, 2, 3, 4, 5, 6, 7, 8])

In [886]:
bu = ou
ou = ou[1:]

In [887]:
bu

array([0, 1, 2, 3, 4, 5, 6, 7, 8])

In [888]:
ou

array([1, 2, 3, 4, 5, 6, 7, 8])