# Applied Machine Learning - Project 2
### Boaz Shvartzman, Ofir Ziv

In [1]:
%matplotlib notebook

In [10]:
import re
import numpy as np
import matplotlib.pyplot as plt

## Part 1

    1.

In [22]:
class DatasetSplitParser(object):
    """Parse a dataset-split file into a ``{sentence index: split label}`` map.

    The file is expected to hold one header line followed by rows of the
    form ``<index>,<label>`` where both fields are integers.
    """

    def __init__(self, split_path):
        """Read *split_path* and build the index -> label mapping.

        Raises:
            ValueError: if a non-blank data row is not two comma-separated
                integers.
        """
        with open(split_path, 'r') as f:
            file_content = f.read()

        self._ds_by_index = {}

        # Skip only the header. Blank rows (typically the empty string left
        # after a trailing newline) are ignored explicitly, so the last data
        # row is kept whether or not the file ends with a newline.
        for row in file_content.split('\n')[1:]:
            if not row.strip():
                continue
            index, ds = row.split(',')
            self._ds_by_index[int(index)] = int(ds)

    def get_dataset_by_index(self):
        """Return the ``{sentence index: split label}`` dictionary."""
        return self._ds_by_index

    2.

In [60]:
class DatasetsHolder(object):
    """Load a tab-separated sentences file and split it into train/test sets.

    Each data row is ``<index>\\t<sentence>``. Sentences are stripped of
    non-ASCII characters and punctuation, lower-cased, stripped of words
    of one or two characters, and tokenised on whitespace.
    """

    def __init__(self, sentences_path, datasplit_parser):
        """Read *sentences_path* and route every sentence by its split label.

        Args:
            sentences_path: path of the sentences file (first line is a header).
            datasplit_parser: object whose ``get_dataset_by_index()`` returns a
                ``{sentence index: label}`` dict; label 1 goes to the train
                set, label 2 to the test set, any other label is ignored.

        Raises:
            KeyError: if a sentence index has no entry in the split mapping.
            ValueError: if a non-blank row has no tab or a non-integer index.
        """
        # Read bytes and decode explicitly: calling ``decode`` on text-mode
        # content only works on Python 2, while bytes.decode works on both.
        with open(sentences_path, 'rb') as f:
            file_content = f.read()

        file_content = file_content.decode("ascii", "ignore")  # Remove non-ASCII
        file_content = re.sub(r'([^\s\w]|_)+', '', file_content)  # Remove non-alphanumeric
        file_content = file_content.lower()  # Lowercase

        self._trainset = {}
        self._testset = {}

        ds_by_index = datasplit_parser.get_dataset_by_index()
        short_word = re.compile(r'\b\w{1,2}\b')  # words of one or two characters

        # Skip only the header; blank rows are ignored so the final sentence is
        # kept even when the file does not end with a newline.
        for row in file_content.split('\n')[1:]:
            if not row.strip():
                continue

            # Split on the first tab only, so a tab inside the sentence text
            # cannot break the two-field unpacking.
            index, sentence = row.split('\t', 1)
            index, sentence = int(index), short_word.sub('', sentence).split()

            label = ds_by_index[index]
            if label == 1:
                self._trainset[index] = sentence
            elif label == 2:
                self._testset[index] = sentence

    def get_trainset(self):
        """Return ``{sentence index: list of tokens}`` for the train split."""
        return self._trainset

    def get_testset(self):
        """Return ``{sentence index: list of tokens}`` for the test split."""
        return self._testset

## Part 2

    1.

In [63]:
class Hyperparameters(object):
    """Container for the model hyperparameters.

    Constructing an instance also seeds NumPy's global random generator
    with ``random_seed``.
    """

    def __init__(self, window_size=1, vector_size=30, negative_words=50, iterations=2000,
                 noise_distribution='unigram', noise_dist_params=None, random_seed=1234):
        """Store the hyperparameters and seed ``np.random``.

        Args:
            window_size: context window size.
            vector_size: dimension of the word vectors.
            negative_words: number of negative words.
            iterations: number of training iterations.
            noise_distribution: name of the noise distribution.
            noise_dist_params: parameters of the noise distribution; defaults
                to ``{'alpha': 0.75}`` when omitted.
            random_seed: seed passed to ``np.random.seed``.
        """
        # Build the default per instance: a dict literal in the signature is
        # created once and would be shared (and mutated) by every instance.
        if noise_dist_params is None:
            noise_dist_params = {'alpha': 3/4.}

        self.window_size = window_size
        self.vector_size = vector_size
        self.negative_words = negative_words
        self.iterations = iterations
        self.noise_distribution = noise_distribution
        self.noise_dist_params = noise_dist_params
        self.random_seed = random_seed
        np.random.seed(random_seed)

    2. + 3.

In [142]:
class ModelParameters(object):
    """Vocabulary plus the two embedding matrices ``U`` and ``V``."""

    def __init__(self, hyperparameters):
        """Keep a reference to *hyperparameters* (``vector_size`` is read in ``init``)."""
        self._hyperparameters = hyperparameters

    def init(self, trainset):
        """Build the vocabulary from *trainset* and randomly initialise ``U`` and ``V``.

        Args:
            trainset: dict whose values are lists of words.

        After the call ``self._words`` is the sorted array of unique words and
        ``self.U`` / ``self.V`` each hold one unit-length row per word.
        """
        # Flatten with a comprehension; sum(lists, []) re-copies the growing
        # list for every sentence, which is quadratic in the corpus size.
        tokens = [word for sentence in trainset.values() for word in sentence]
        self._words = np.unique(tokens)
        vector_size = self._hyperparameters.vector_size

        def sample():
            # One Gaussian row per word (covariance 0.01 * I), scaled to unit L2 norm.
            vectors = np.random.multivariate_normal(np.zeros(vector_size), np.identity(vector_size) * 1e-2, len(self._words))
            return vectors / np.sqrt(np.sum(np.power(vectors, 2), axis=1)).reshape(-1, 1)

        self.U, self.V = sample(), sample()

    def word_to_index(self, words):
        """Return the vocabulary index of each word in *words*.

        Args:
            words: a single word or a sequence/array of words.

        Returns:
            1-D integer array with one entry per known input word, in input
            order and with repeats preserved. Words that are not in the
            vocabulary are dropped.
        """
        words = np.atleast_1d(words)
        if self._words.size == 0 or words.size == 0:
            return np.zeros(0, dtype=np.intp)

        # self._words is sorted (np.unique), so a binary search locates each
        # word; the equality check filters out words that are not present.
        positions = np.searchsorted(self._words, words)
        positions = np.minimum(positions, len(self._words) - 1)
        found = self._words[positions] == words
        return positions[found]

    def index_to_words(self, indices):
        """Return the vocabulary words stored at *indices*."""
        return self._words[indices]

    4. + 5.

In [182]:
class unigram_sampler(object):
    """Sample words with probability proportional to ``frequency ** alpha``."""

    def __init__(self, trainset, alpha):
        """Count word frequencies in *trainset* and build the cumulative distribution.

        Args:
            trainset: dict whose values are lists of words.
            alpha: exponent applied to the raw word counts.
        """
        # Flatten with a comprehension; sum(lists, []) is quadratic in corpus size.
        tokens = [word for sentence in trainset.values() for word in sentence]
        self.words, self.frequencies = np.unique(tokens, return_counts=True)
        factored = np.power(self.frequencies, alpha).astype(np.float64)
        probabilities = factored / factored.sum()
        self._cumsum = np.cumsum(probabilities)

    def __call__(self, k):
        """Return an array of *k* words drawn with replacement."""
        draws = np.random.rand(k)
        # side='right' yields the first position whose cumulative value is
        # strictly greater than the draw (inverse-CDF sampling).
        indices = np.searchsorted(self._cumsum, draws, side='right')
        # Rounding can leave the final cumulative value slightly below 1, so a
        # draw above it would index past the end; map it to the last word.
        indices = np.minimum(indices, len(self._cumsum) - 1)
        return self.words[indices]

def get_words_sampler(dataset, hyperparams):
    """Build the noise-word sampler selected by ``hyperparams.noise_distribution``.

    Args:
        dataset: dict whose values are lists of words.
        hyperparams: object exposing ``noise_distribution`` and
            ``noise_dist_params``.

    Returns:
        A ``unigram_sampler`` when the distribution is ``'unigram'``.

    Raises:
        NotImplementedError: for any other distribution name.
    """
    if hyperparams.noise_distribution == 'unigram':
        return unigram_sampler(dataset, hyperparams.noise_dist_params['alpha'])

    # ``NotImplemented`` is a comparison sentinel, not an exception; raising it
    # produces a confusing TypeError instead of the intended error.
    raise NotImplementedError(
        "Unsupported noise distribution: %r" % (hyperparams.noise_distribution,))

    6.

$$ \log \left ( \frac{1}{1 + \exp(-v_c^Tu_t)} \right ) + \sum_{j=1}^K \log \left (1 - \frac{1}{1 + \exp(-v_j^Tu_t)} \right ) $$

In [153]:
def sigmoid(z):
    """Return the logistic function ``1 / (1 + exp(-z))`` of *z*."""
    denominator = 1 + np.exp(-z)
    return 1.0 / denominator

def log_prob_context_with_negatives(U, V, target_word, context_word, negative_words, model_params):
    """Return the negative-sampling log-probability of one (target, context) pair.

    Computes ``log s(v_c . u_t) + sum_j log(1 - s(v_j . u_t))`` where ``s`` is
    the logistic function.

    Args:
        U: target-vector matrix, indexed by word index.
        V: context-vector matrix, indexed by word index.
        target_word: row index of the target word in ``U``.
        context_word: row index of the context word in ``V``.
        negative_words: sequence of row indices of the negative words in ``V``.
        model_params: unused; kept so existing callers keep working.

    Returns:
        The log-probability as a scalar.
    """
    u_t = U[target_word]
    context_score = V[context_word].dot(u_t)
    # One score per negative word. Each selected row of V is dotted with u_t;
    # transposing the selected rows first, as before, mismatches the shapes.
    negative_scores = V[negative_words].dot(u_t)

    # log(s(x)) == -logaddexp(0, -x) and log(1 - s(x)) == -logaddexp(0, x);
    # this form avoids log(0) for scores of large magnitude.
    return -np.logaddexp(0, -context_score) - np.sum(np.logaddexp(0, negative_scores))

## Part 3

    1.

$$ \frac{\partial f}{\partial u_t} = \left (1 - \sigma(v_c^Tu_t) \right )v_c - \sum_{j=1}^K \sigma(v_j^Tu_t) v_j $$

$$ \frac{\partial f}{\partial v_c} = \left (1 - \sigma(v_c^Tu_t) \right )u_t $$

$$ \frac{\partial f}{\partial v_j} = - \sigma(v_j^Tu_t) u_t $$

    2.

In [176]:
def log_prob_context_with_negatives_gradient(U, V, target_word, context_word, negative_words):
    """Return the gradients of the negative-sampling log-probability.

    The objective is ``f = log s(v_c . u_t) + sum_j log(1 - s(v_j . u_t))``
    with ``s`` the logistic function, whose derivatives are

        df/du_t = (1 - s(v_c . u_t)) v_c - sum_j s(v_j . u_t) v_j
        df/dv_c = (1 - s(v_c . u_t)) u_t
        df/dv_j = -s(v_j . u_t) u_t

    Args:
        U: target-vector matrix, indexed by word index.
        V: context-vector matrix, indexed by word index.
        target_word: row index of the target word in ``U``.
        context_word: row index of the context word in ``V``.
        negative_words: sequence of row indices of the negative words in ``V``.

    Returns:
        ``(dFdu, dFdv_c, dFdv_j)``: two vectors and a matrix holding one row
        per entry of ``negative_words``, in the same order.
    """
    u_t = U[target_word]
    v_c = V[context_word]
    v_neg = V[negative_words]  # one row per negative word

    # Weight of the positive pair: 1 - s(v_c . u_t).
    context_weight = 1.0 - 1.0 / (1.0 + np.exp(-v_c.dot(u_t)))
    # Weights of the negatives: d/dx log(1 - s(x)) = -s(x), so the factor is
    # s(v_j . u_t). Kept as a column so it scales each row of v_neg.
    negative_weights = (1.0 / (1.0 + np.exp(-v_neg.dot(u_t)))).reshape(-1, 1)

    # Sum over the negatives only (axis 0) so the result stays a vector.
    dFdu = context_weight * v_c - np.sum(negative_weights * v_neg, axis=0)
    dFdv_c = context_weight * u_t
    dFdv_j = -negative_weights * u_t.reshape(1, -1)

    return dFdu, dFdv_c, dFdv_j

    3.

In [186]:
def sample_target_context(dataset, window_size):
    """Sample one target word and pair it with every context word in its window.

    A sentence is picked uniformly among those with at least two words, then
    a target position is picked uniformly inside it.

    Args:
        dataset: dict whose values are sentences (lists of words).
        window_size: maximum distance between the target and a context word.

    Returns:
        List of ``(target, context)`` tuples, all sharing the same target.
        The list is empty when ``window_size`` is smaller than 1.

    Raises:
        ValueError: if no sentence has at least two words.
    """
    # A one-word sentence has no context at all, so it can never yield a pair.
    candidates = [sentence for sentence in dataset.values() if len(sentence) > 1]
    if not candidates:
        raise ValueError("dataset has no sentence with at least two words")

    sentence = candidates[np.random.randint(len(candidates))]
    t = np.random.randint(len(sentence))

    pairs = []

    # Offsets start at 1: offset 0 would pair the target with itself.
    for offset in range(1, window_size + 1):
        if t + offset < len(sentence):
            pairs.append((sentence[t], sentence[t + offset]))
        if t - offset >= 0:
            pairs.append((sentence[t], sentence[t - offset]))

    return pairs

    4.

In [187]:
def get_target_context_minibatch(dataset, minibatch_size, window_size):
    """Return a list of ``minibatch_size`` independent ``sample_target_context`` draws."""
    minibatch = []
    for _ in range(minibatch_size):
        minibatch.append(sample_target_context(dataset, window_size))
    return minibatch

    5.

In [185]:
def get_parameters_update(minibatch_samples, model_params, U, V,
                          sample_negatives=None, num_negatives=None):
    """Accumulate the log-probability gradients over a minibatch.

    Args:
        minibatch_samples: list of samples, each a list of
            ``(target, context)`` word pairs sharing one target.
        model_params: object whose ``word_to_index`` maps words to row indices.
        U: target-vector matrix.
        V: context-vector matrix.
        sample_negatives: callable ``k -> words`` used to draw negative words.
            When omitted, the module-level ``sample_k_words`` is used, as the
            previous implementation did.
        num_negatives: number of negative words per pair. When omitted, the
            module-level ``k`` is used, as the previous implementation did.

    Returns:
        ``(u_gradients, c_gradients)``: dicts mapping a row index of ``U``
        (respectively ``V``) to the summed gradient for that row.
    """
    u_gradients = {}
    c_gradients = {}

    # A sample without pairs contributes nothing, and indexing samples[0]
    # on it would raise IndexError.
    non_empty = [samples for samples in minibatch_samples if len(samples) > 0]
    if not non_empty:
        return u_gradients, c_gradients

    # NOTE(review): these fallbacks rely on names defined outside this
    # function; pass both arguments explicitly where possible.
    if sample_negatives is None:
        sample_negatives = sample_k_words
    if num_negatives is None:
        num_negatives = k

    for samples in non_empty:
        # Every pair of a sample shares the same target word.
        target_index = model_params.word_to_index(samples[0][0])[0]
        for _, context in samples:

            context_index = model_params.word_to_index(context)[0]
            negatives = sample_negatives(num_negatives)
            negative_indices = model_params.word_to_index(negatives)

            # The gradient helper takes the matrices first: (U, V, target, context, negatives).
            g_t, g_c, g_j = log_prob_context_with_negatives_gradient(U, V, target_index, context_index,
                                                                     negative_indices)

            u_gradients[target_index] = u_gradients.get(target_index, 0) + g_t
            c_gradients[context_index] = c_gradients.get(context_index, 0) + g_c

            for i, index in enumerate(negative_indices):
                c_gradients[index] = c_gradients.get(index, 0) + g_j[i]

    return u_gradients, c_gradients

    6.

In [184]:
class SGDParameters(object):
    """Plain container for the SGD settings."""

    def __init__(self, learning_rate, minibatch_size, anealing_factor):
        """Store the three settings unchanged as same-named attributes."""
        self.learning_rate, self.minibatch_size, self.anealing_factor = (
            learning_rate, minibatch_size, anealing_factor)

    7.

In [None]:
def LearnParamsUsingSGD(trainset, hyperparameters, sgd_parameters, model_parameters):
    """Fit ``model_parameters.U`` and ``model_parameters.V`` in place with minibatch SGD.

    Each iteration samples a minibatch, accumulates the gradients and takes a
    gradient-ascent step on the rows that received a gradient.

    Args:
        trainset: dict whose values are sentences (lists of words).
        hyperparameters: object exposing ``iterations`` and ``window_size``.
        sgd_parameters: object exposing ``learning_rate``, ``minibatch_size``
            and ``anealing_factor`` (the step is halved every
            ``anealing_factor`` iterations; a falsy value disables annealing).
        model_parameters: object exposing the ``U`` and ``V`` matrices, which
            are modified in place.

    Returns:
        None.
    """
    U, V = model_parameters.U, model_parameters.V
    learning_rate = sgd_parameters.learning_rate
    anneal_every = sgd_parameters.anealing_factor

    for i in range(hyperparameters.iterations):
        # Halve the step only after each completed block of iterations.
        # Without the ``i > 0`` guard the rate is already halved at the first
        # iteration, so the configured learning rate is never used. The
        # truthiness check avoids a modulo-by-zero when annealing is off.
        if anneal_every and i > 0 and i % anneal_every == 0:
            learning_rate /= 2.0

        minibatch = get_target_context_minibatch(trainset, sgd_parameters.minibatch_size, hyperparameters.window_size)
        u_gradients, c_gradients = get_parameters_update(minibatch, model_parameters, U, V)

        # The objective is a log-probability, so the step is added (ascent).
        for index, gradient in u_gradients.items():
            U[index] += learning_rate * gradient

        for index, gradient in c_gradients.items():
            V[index] += learning_rate * gradient