In [98]:
import random
from corpora_utils import *

# Seed Python's `random` module so the random.choice draws made by the
# training loop are repeatable from run to run.
random.seed(31)

In [121]:
# Load the training split (the original note points at data/stanford_movie_reviews_train).
# The loader is expected to return a list of (label, word_counts) tuples, where
# label is 1 or -1 and word_counts is a dictionary of word counts
# -- TODO confirm against corpora_utils.
train_data = load_stanford_imdb_train_data()

# One weight per distinct word seen in the training data. Every weight starts
# at exactly 0.0 (there is no random initialisation), and so does the bias.
weights = dict.fromkeys((word for _, wc in train_data for word in wc), 0.0)
bias = 0.0

In [122]:
# define learning parameters
n_iterations = 2500  # number of training steps; the training loop draws one random sample per step
eta = 0.01  # learning rate (passed to update_weights as the step size `lr`)

# useful functions
def scalar_product(c, x):
    """
    Scale every value of a sparse vector by a constant.

    c - a scalar
    x - dictionary -- word counts

    Returns a new dictionary with the same keys as x, each value multiplied
    by c. x itself is left untouched.
    """
    scaled = {}
    for word, count in x.items():
        scaled[word] = c * count
    return scaled

def dot_product(dict1, dict2):
    """
    Dot product of two sparse vectors stored as dictionaries.

    Only the shorter dictionary is walked; a key missing from the other
    dictionary contributes 0.0. The result is always a float.
    """
    # Iterate over whichever dictionary has fewer entries (dict1 on a tie).
    if len(dict1) > len(dict2):
        shorter, longer = dict2, dict1
    else:
        shorter, longer = dict1, dict2

    total = 0.0
    for key, value in shorter.items():
        total += value * longer.get(key, 0.0)
    return total

# score function
def score(y, x, w, b):
    """
    Score of one sample under the affine model: y*(x.w + b).

    y - label of the sample
    x - dictionary of features (word counts)
    w - dictionary of weights
    b - bias
    """
    affine_output = dot_product(x, w) + b
    return y * affine_output

# hinge loss
def hinge_loss(y, x, w, b):
    """
    Hinge loss of one sample: max{0.0, 1 - score}.

    The loss is 0.0 once the score reaches 1 and grows linearly as the
    score falls below 1.
    """
    shortfall = 1 - score(y, x, w, b)
    return max(0.0, shortfall)

# gradients of w & b
def calc_gradients(y, x, w, b):
    """
    Calculates the (sub)gradients of the hinge loss with respect to weights and bias
    Returns a tuple (weights gradient, bias gradient, boolean indicating if weights need to be updated)

    The weights gradient is always a dictionary, so the result can be handed
    to update_weights whether or not an update is required:
      - score < 1  : ({word: -y*count, ...}, -y, True)
      - otherwise  : ({}, 0.0, False) -- the loss is flat there, nothing to update

    y: float - training sample's label
    x: dict - training sample's features
    w: dict - weights
    b: float - bias
    """
    if score(y, x, w, b) < 1:
        # The hinge is active: loss = 1 - y*(x.w + b), so d/dw = -y*x and d/db = -y.
        return scalar_product(-y, x), -y, True
    # An empty dict (rather than the float 0.0) keeps the first element's type
    # consistent; iterating over it in update_weights is a harmless no-op.
    return {}, 0.0, False

# update weights
def update_weights(w, b, w_grad, b_grad, lr):
    """
    Take one gradient-descent step on the weights and the bias.

    w - weights, to be updated. NOTE: this dictionary is mutated in place
    b - bias, to be updated (the new value is only available via the return)
    w_grad - weights gradient
    b_grad - bias gradient
    lr - learning rate

    Returns (w, new bias). Gradient entries whose key is not already present
    in w are skipped, so w never gains new keys.
    """
    for word in w_grad:
        if word not in w:
            continue  # no weight exists for this word; leave w unchanged
        step = lr*w_grad[word]
        w[word] -= step
    new_bias = b - lr*b_grad
    return w, new_bias  # w is the same object that was passed in

In [123]:
# Stochastic training on the hinge loss: every iteration draws one training
# sample at random (with replacement) and takes at most one update step.
total_hl = 0.0  # running sum of the per-sample hinge losses seen so far
for i in range(1, n_iterations+1):
    # select a random training sample
    label, features = random.choice(train_data)
    
    # compute loss with current weights and bias (i.e. before this iteration's update)
    hl = hinge_loss(label, features, weights, bias)
    
    # calculate gradients with respect to weights and bias
    weight_grad, bias_grad, requires_weight_update = calc_gradients(label, features, weights, bias)
    
    # update weights and bias if required
    # (weights is mutated in place; bias is a plain float, so it has to be
    # rebound from the value returned by update_weights)
    if requires_weight_update:
        weights, bias = update_weights(weights, bias, weight_grad, bias_grad, eta)
    
    total_hl += hl
    # print('Iteration# {} avg. training loss = {}'.format(i, total_hl/i))

# NOTE: the figure printed here is the mean of the losses measured during
# training (each on the sampled example, before its update). It is not the
# loss of the final weights evaluated over the whole training set.
print('Train loss after training = {}'.format(total_hl/n_iterations))

Train loss after training = 1.5812760000000006


In [124]:
# load test data
# (presumably the same (label, word_counts) tuple format as the training data -- TODO confirm)
test_data = load_stanford_imdb_test_data()

n_test = len(test_data)
n_correct = 0
for y_test, x_test in test_data:
    # score = y*(x.w + b). With labels of 1 / -1 it is positive exactly when the
    # sign of (x.w + b) agrees with the label, so a positive score counts as a
    # correct prediction. A score of exactly 0 is counted as wrong.
    # Words that have no learned weight contribute nothing to the dot product.
    if score(y_test, x_test, weights, bias) > 0:
        n_correct += 1

print('#correct = {}'.format(n_correct))
print('#wrong = {}'.format(n_test - n_correct))
# accuracy as a percentage, rounded to one decimal place
print('% correct = {}'.format(round(n_correct*100.0/n_test, 1)))


#correct = 3658
#wrong = 1342
% correct = 73.2
