# Word-level entailment with neural networks

In [1]:
# Notebook metadata: author and course/version string.
__author__ = "Christopher Potts"
__version__ = "CS224u, Stanford, Spring 2016"

In [2]:
import os
import sys
import copy
import cPickle as pickle
import random
from collections import defaultdict
import numpy as np
from numpy import dot, outer
from sklearn.metrics import classification_report
import tensorflow as tf
import utils

## Set-up

In [3]:
# File name of the pickled word-entailment dataset loaded in the Data section.
wordentail_data__filename = 'wordentail_data.pickle'
# Directory that holds the GloVe vector files; it is joined with
# 'glove.6B.50d.txt' where the GloVe vectors are loaded.
glove_home = "glove.6B"

## Overview

__Problem__: For two words $w_{1}$ and $w_{2}$, predict $w_{1} \subset w_{2}$ or $w_{1} \supset w_{2}$

__Approach__: Feed-forward neural networks

![fig/wordentail.png](fig/wordentail.png)

## Data

In [4]:
# Load the pickled dataset. The file is opened in binary mode and closed
# deterministically by the `with` block.
# NOTE(review): unpickling can execute arbitrary code, so only load a
# dataset file that comes from a trusted source.
with open(wordentail_data__filename, 'rb') as f:
    wordentail_data = pickle.load(f)
# The pickle holds a pair: the vocabulary and the train/test splits.
vocab, splits = wordentail_data

In [5]:
# Display the names of the available data splits.
splits.keys()

['test', 'disjoint_vocab_test', 'train']

* All three sets are disjoint. 

* The `test` vocab is a subset of the `train` vocab. So every word seen at test time was seen in training. 

* The `disjoint_vocab_test` split has a vocabulary that is totally disjoint from `train`. So none of its words are seen in training. 

* All the words are in the GloVe vocabulary.

In [6]:
# Class labels. The evaluation code in experiment() maps a network output
# <= 0.0 to SUPERSET and any larger output to SUBSET.
SUBSET = 1.0    # Left word entails right, as in (hippo, mammal)
SUPERSET = -1.0 # Right word entails left, as in (mammal, hippo)

## Shallow neural network from scratch

Our simple shallow neural network is parameterized as follows:

* A weight matrix $W^{1}$ of dimension $m \times n$, where $m$ is the dimensionality of the input vector representations and $n$ is the dimensionality of the hidden layer.
* A bias term $b^{1}$ of dimension $1 \times n$.
* A weight matrix $W^{2}$ of dimension $n \times p$, where $p$ is the dimensionality of the output vector.
* A bias term $b^{2}$ of dimension $1 \times p$.
* A non-linear activation function $f$. In our initial experiments, this is $\tanh$.

The network is then defined as follows, with $x$ the input layer, $h$ the hidden layer of dimension $n$, and $y$ the output of dimension $1 \times p$:

$$h = f\left(xW^{1} + b^{1}\right)$$

$$y = f\left(hW^{2} + b^{2}\right)$$

In [51]:
def d_tanh(z):
    """Derivative of tanh, expressed in terms of the tanh output.

    The network calls this on activation values (the result of tanh),
    so the derivative is simply 1 - z^2. Works on scalars and on
    numpy arrays (elementwise).
    """
    squared = z ** 2
    return 1.0 - squared

def progress_bar(iteration, error):
    """Write a one-line training status message to stderr.

    The message starts with a carriage return so that successive calls
    overwrite each other on the same terminal line. `iteration` is
    zero-based and is reported one-based.
    """
    message = 'completed iteration %s; error is %s' % ((iteration+1), error)
    sys.stderr.write('\r' + message)
    sys.stderr.flush()

class ShallowNeuralNetwork:
    """Feed-forward network with one hidden layer, trained by per-example SGD.

    Bias terms are folded into the weight matrices: the input buffer and
    the hidden buffer each carry one extra trailing unit that stays at 1.0.

    Parameters
    ----------
    input_dim, hidden_dim, output_dim : int
        Layer sizes, not counting the bias units. They are read when
        `fit` is called, so they may be assigned after construction.
    afunc : callable
        Activation function applied at the hidden and output layers.
    d_afunc : callable
        Derivative of `afunc`. It is called on activation values (the
        output of `afunc`), not on pre-activation values.
    maxiter : int
        Maximum number of passes over the training data.
    eta : float
        Learning rate.
    epsilon : float
        Training stops early once the summed error of a pass is no
        larger than this value.
    display_progress : bool
        If true, report the pass number and its error via `progress_bar`.
    """
    def __init__(self, 
            input_dim=0, 
            hidden_dim=0, 
            output_dim=0, 
            afunc=np.tanh, 
            d_afunc=d_tanh,
            maxiter=100,
            eta=0.05,
            epsilon=1.5e-8,
            display_progress=True):
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.output_dim = output_dim
        self.afunc = afunc 
        self.d_afunc = d_afunc 
        self.maxiter = maxiter
        self.eta = eta        
        self.epsilon = epsilon
        self.display_progress = display_progress
                
    def forward_propagation(self, ex):
        """Run the example `ex` through the network.

        Updates the internal x, h and y buffers and returns a copy of
        the output layer.
        """
        self.x[ : -1] = ex # ignore the bias
        self.h[ : -1] = self.afunc(dot(self.x, self.W1)) # ignore the bias
        self.y = self.afunc(dot(self.h, self.W2))
        return copy.deepcopy(self.y)
        
    def backward_propagation(self, y_):
        """Update W1 and W2 from the target `y_` for the most recent forward pass.

        Returns the squared-error loss for this example, computed from
        the output of that forward pass (i.e. before the weight update).
        """
        y_ = np.array(y_)       
        self.y_err = (y_ - self.y) * self.d_afunc(self.y)
        # The hidden error must be computed with W2 as it was during the
        # forward pass, so it is calculated before W2 is updated.
        h_err = dot(self.y_err, self.W2.T) * self.d_afunc(self.h)
        self.W2 += self.eta * outer(self.h, self.y_err)
        self.W1 += self.eta * outer(self.x, h_err[:-1]) # ignore the bias
        return np.sum(0.5 * (y_ - self.y)**2)

    def fit(self, training_data):
        """Initialize the parameters and train with SGD.

        `training_data` is an iterable of (input vector, target vector)
        pairs. It is copied before shuffling, so the caller's sequence
        is left in its original order.
        """
        # Shuffle a private copy rather than the caller's list:
        training_data = list(training_data)
        # Parameter initialization:
        self.x = np.ones(self.input_dim+1)  # +1 for the bias                                         
        self.h = np.ones(self.hidden_dim+1) # +1 for the bias        
        self.y = np.ones(self.output_dim)        
        self.W1 = utils.randmatrix(self.input_dim+1, self.hidden_dim)
        self.W2 = utils.randmatrix(self.hidden_dim+1, self.output_dim)        
        self.y_err = np.zeros(self.output_dim)
        self.x_err = np.zeros(self.input_dim+1)
        # SGD:
        iteration = 0
        error = sys.float_info.max
        while error > self.epsilon and iteration < self.maxiter:            
            error = 0.0
            random.shuffle(training_data)
            for ex, labels in training_data:
                self.forward_propagation(ex)
                error += self.backward_propagation(labels)           
            if self.display_progress:
                progress_bar(iteration, error)
            iteration += 1
                    
    def predict(self, ex):
        """Return the network output for `ex` (a fresh array on every call)."""
        # forward_propagation already returns a copy of the output.
        return self.forward_propagation(ex)
        
    def hidden_representation(self, ex):
        """Return the hidden-layer activations for `ex`.

        The result includes the trailing bias unit and is a copy, so it
        is not overwritten by later calls that run a forward pass.
        """
        self.forward_propagation(ex)
        return copy.deepcopy(self.h)

## Input feature representation

Where $x_{l}$ is a vector representation of the left word and $x_{r}$ is a representation of the right word, we define a combination function $\textbf{combine}$ such that $\textbf{combine}(x_{l}, x_{r})$ returns a new input vector $x$ of dimension $1 \times m$. $\textbf{combine}$ could be concatenation, vector average, vector difference, etc. (even combinations of those) &mdash; there's lots of space for experimentation here.

### Representing the inputs

In [8]:
def randvec(w, n=40, lower=-0.5, upper=0.5):
    """Return a numpy array of n values drawn uniformly between lower and upper.

    The word argument w is not used; it is accepted so that this
    function can be called like any other word -> vector function.
    """
    values = []
    for _ in range(n):
        values.append(random.uniform(lower, upper))
    return np.array(values)

In [9]:
# Path to the 'glove.6B.50d.txt' vectors file inside glove_home.
glove_src = os.path.join(glove_home, 'glove.6B.50d.txt')
# utils.build_glove returns three values. The first two are used by glvvec:
# GLOVE_VOCAB is searched with .index(word), and the resulting position
# indexes into GLOVE_MAT. The third value is discarded.
GLOVE_MAT, GLOVE_VOCAB, _ = utils.build_glove(glove_src)

def glvvec(w):
    """Look up w in GLOVE_VOCAB and return the entry of GLOVE_MAT at that position.

    Whatever GLOVE_VOCAB.index raises for a missing word propagates to
    the caller.
    """
    return GLOVE_MAT[GLOVE_VOCAB.index(w)]

### Combining the inputs

In [10]:
def vec_concatenate(u, v):
    """Combine two vectors by appending v to the end of u."""
    joined = np.concatenate([u, v])
    return joined

## Building datasets for experiments

In [48]:
def build_dataset(wordentail_data, vector_func=randvec, vector_combo_func=vec_concatenate):
    """Turn the word-pair data into vector/label examples for a network.

    wordentail_data is a (vocab, splits) pair, where splits maps a split
    name to a mapping from class label to a collection of (w1, w2) word
    pairs. vector_func maps a word to its vector; vector_combo_func
    merges the two word vectors of a pair into one input vector.

    Returns a defaultdict(list) from split name to a list of
    [input_vector, np.array([label])] items.
    """
    vocab, splits = wordentail_data
    # One vector per vocabulary word, computed once and reused for
    # every pair the word occurs in:
    word_vectors = dict((word, vector_func(word)) for word in vocab)
    dataset = defaultdict(list)
    for split_name, pairs_by_label in splits.items():
        for label, pairs in pairs_by_label.items():
            for left, right in pairs:
                features = vector_combo_func(word_vectors[left], word_vectors[right])
                # The label is wrapped in a singleton array to match
                # the network's output layer:
                target = np.array([label])
                dataset[split_name].append([features, target])
    return dataset

## Running experiments

In [52]:
def experiment(dataset, network):
    """Train `network` on the 'train' split and print a report for every split.

    dataset maps 'train', 'test' and 'disjoint_vocab_test' to lists of
    (input_vector, label_array) examples. network must provide
    `input_dim` and `output_dim` attributes plus `fit(examples)` and
    `predict(vector)` methods. Nothing is returned; the classification
    reports are printed.
    """
    # Get the train and test sets from the dataset:
    train = dataset['train']
    test = dataset['test']
    disjoint_vocab_test = dataset['disjoint_vocab_test']
    # Set these dimensions based on the first training example:
    network.input_dim = len(train[0][0])
    network.output_dim = len(train[0][1])
    # Train the network. The number of iterations and other training
    # settings are whatever the network was constructed with.
    network.fit(train)
    # Evaluation. This assumes the output variables were not
    # transformed before training.
    for typ, data in (('train', train), ('test', test), ('disjoint_vocab_test', disjoint_vocab_test)):
        predictions = []
        cats = []
        for ex, cat in data:
            # The raw prediction is a singleton containing a float in (-1,1).
            # We want only its contents:
            prediction = network.predict(ex)[0]
            # Categorize the prediction for accuracy comparison:
            prediction = SUPERSET if prediction <= 0.0 else SUBSET
            predictions.append(prediction)
            # Store the gold label for the classification report:
            cats.append(cat[0])
        # Report. The labels are passed explicitly so that target_names
        # stays aligned with them even if a class is absent from a split.
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print("======================================================================")
        print(typ)
        print(classification_report(
            cats, predictions,
            labels=[SUPERSET, SUBSET],
            target_names=['SUPERSET', 'SUBSET']))

In [53]:
# Random-vector inputs: every vocabulary word gets its own random vector
# (randvec), and a word pair is represented by concatenating the two vectors.
dataset = build_dataset(wordentail_data, vector_func=randvec, vector_combo_func=vec_concatenate)

# input_dim and output_dim are left at their defaults here; experiment()
# sets them from the data before training.
network = ShallowNeuralNetwork(hidden_dim=40, maxiter=100, eta=0.05, display_progress=True)

# NOTE(review): no random seed is set in this cell, so the random vectors
# and the shuffling order differ from run to run.
experiment(dataset, network)

completed iteration 100; error is 60.0332157301

train
             precision    recall  f1-score   support

   SUPERSET       0.99      0.99      0.99      2000
     SUBSET       0.99      0.99      0.99      2000

avg / total       0.99      0.99      0.99      4000

test
             precision    recall  f1-score   support

   SUPERSET       0.87      0.89      0.88       200
     SUBSET       0.88      0.87      0.88       200

avg / total       0.88      0.88      0.88       400

disjoint_vocab_test
             precision    recall  f1-score   support

   SUPERSET       0.40      0.35      0.37        49
     SUBSET       0.43      0.49      0.46        49

avg / total       0.42      0.42      0.42        98



## Shallow neural network in TensorFlow

In [61]:
class TensorFlowShallowNeuralNetwork:
    """TensorFlow version of the shallow tanh network.

    The graph is h = tanh(x W1 + b1), y = tanh(h W2 + b2), trained with
    plain gradient descent on the summed squared error over the whole
    training set (one full-batch step per iteration).

    Parameters
    ----------
    input_dim, hidden_dim, output_dim : int
        Layer sizes. They are read when `fit` is called, so they may be
        assigned after construction.
    maxiter : int
        Number of gradient-descent steps.
    eta : float
        Learning rate.
    """
    def __init__(self,
            input_dim=0,
            hidden_dim=0,
            output_dim=0,
            maxiter=100,
            eta=0.05):
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.output_dim = output_dim
        self.maxiter = maxiter
        self.eta = eta

    def fit(self, training_data):
        """Build the graph, initialize the variables and train.

        `training_data` is a sequence of (input vector, target vector)
        pairs; all of it is fed on every step.
        """
        # NOTE(review): every call opens a new InteractiveSession and the
        # previous one is never closed -- confirm whether repeated fits
        # need explicit cleanup.
        self.sess = tf.InteractiveSession()
        # Network initialization:
        self.x = tf.placeholder(tf.float32, [None, self.input_dim])
        self.W1 = tf.Variable(tf.random_normal([self.input_dim, self.hidden_dim]))
        self.b1 = tf.Variable(tf.random_normal([self.hidden_dim]))
        self.W2 = tf.Variable(tf.random_normal([self.hidden_dim, self.output_dim]))
        self.b2 = tf.Variable(tf.random_normal([self.output_dim]))
        # Network structure:
        self.h = tf.nn.tanh(tf.matmul(self.x, self.W1) + self.b1)
        self.y = tf.nn.tanh(tf.matmul(self.h, self.W2) + self.b2)
        self.y_ = tf.placeholder(tf.float32, [None, self.output_dim])
        # Optimization (the loss is a sum over examples, not a mean):
        summed_squared_error = tf.reduce_sum(0.5 * (self.y_-self.y)**2)
        self.optimizer = tf.train.GradientDescentOptimizer(self.eta).minimize(summed_squared_error)
        # Train:
        init_op = tf.initialize_all_variables()
        self.sess.run(init_op)
        inputs, targets = zip(*training_data)
        batch = {self.x: inputs, self.y_: targets}
        for _ in range(self.maxiter):
            self.optimizer.run(feed_dict=batch)

    def predict(self, ex):
        """Return the network output for the single example `ex`.

        The example is fed as a batch of one, so the result keeps a
        leading batch axis.
        """
        single_batch = {self.x: [ex]}
        return self.sess.run(self.y, feed_dict=single_batch)

In [63]:
# Rebuild the random-vector dataset (new random vectors are drawn) and run
# the same experiment with the TensorFlow network. input_dim and output_dim
# are set from the data inside experiment().
dataset = build_dataset(wordentail_data, vector_func=randvec, vector_combo_func=vec_concatenate)
tfnet = TensorFlowShallowNeuralNetwork(hidden_dim=20, maxiter=1000)
experiment(dataset, tfnet)

train
             precision    recall  f1-score   support

   SUPERSET       0.95      0.93      0.94      2000
     SUBSET       0.93      0.95      0.94      2000

avg / total       0.94      0.94      0.94      4000

test
             precision    recall  f1-score   support

   SUPERSET       0.82      0.84      0.83       200
     SUBSET       0.84      0.81      0.83       200

avg / total       0.83      0.83      0.83       400

disjoint_vocab_test
             precision    recall  f1-score   support

   SUPERSET       0.54      0.51      0.53        49
     SUBSET       0.54      0.57      0.55        49

avg / total       0.54      0.54      0.54        98



## Bake-off

### Deep neural network in TensorFlow