# Applied Machine Learning - Project 2
### Boaz Shvartzman, Ofir Ziv

In [1]:
%matplotlib notebook

In [3]:
import re
import numpy as np
import matplotlib.pyplot as plt

In [3]:
# Seed NumPy's global random generator so the np.random draws are repeatable.
np.random.seed(1234)

## Part 1

    1.

In [4]:
class DatasetSplitParser(object):
    """Parse a dataset-split file into an ``{index: dataset}`` mapping.

    The file is read as a header row followed by data rows of the form
    ``<index>,<dataset>``, where both fields are integers.
    """

    def __init__(self, split_path):
        """Read ``split_path`` and build the index -> dataset mapping.

        :param split_path: path of the split file to parse.
        :raises ValueError: if a data row is not two comma-separated integers.
        """
        with open(split_path, 'r') as f:
            rows = f.read().splitlines()

        self._ds_by_index = {}

        # rows[0] is the header. Blank rows are skipped explicitly so that the
        # last data row is kept whether or not the file ends with a newline.
        for row in rows[1:]:
            if not row.strip():
                continue
            index, ds = row.split(',')
            self._ds_by_index[int(index)] = int(ds)

    def get_dataset_by_index(self):
        """Return the parsed ``{int index: int dataset}`` dictionary."""
        return self._ds_by_index

    2.

In [1]:
class DatasetsHolder(object):
    """Load a tab-separated sentences file and split it into train/test sets.

    Each data row has the form ``<index>\\t<sentence>``. Sentences are cleaned
    (non-ASCII and non-alphanumeric characters removed, words shorter than
    three characters removed, lowercased) and tokenized on whitespace.
    """

    def __init__(self, sentences_path, datasplit_parser):
        """Read, clean and partition the sentences.

        :param sentences_path: path of the sentences file (header row first).
        :param datasplit_parser: object whose ``get_dataset_by_index()``
            returns a mapping from integer sentence index to dataset id.
            Sentences mapped to ``1`` go to the train set, every other id
            goes to the test set.
        :raises KeyError: if a sentence index is missing from the mapping.
        :raises ValueError: if a row has no tab or a non-integer index.
        """
        # Local import: io.open accepts encoding/errors on Python 2 and 3.
        import io

        # Decoding as ASCII with errors='ignore' drops non-ASCII characters.
        with io.open(sentences_path, 'r', encoding='ascii', errors='ignore') as f:
            rows = f.read().splitlines()

        self._trainset = {}
        self._testset = {}

        ds_by_index = datasplit_parser.get_dataset_by_index()

        # rows[0] is the header row.
        for row in rows[1:]:
            if not row.strip():
                continue

            # Split off the index BEFORE cleaning: the short-word regex below
            # would otherwise delete one- and two-digit indices.
            raw_index, sentence = row.split('\t', 1)
            # The split mapping is keyed by int, so the index must be an int.
            index = int(raw_index)

            sentence = re.sub(r'([^\s\w]|_)+', '', sentence)  # Remove non-alphanumeric
            sentence = re.sub(r'\b\w{1,2}\b', '', sentence)  # Remove words with less than 3 characters
            # split() without arguments discards the empty tokens left behind
            # where words were removed.
            tokens = sentence.lower().split()

            if ds_by_index[index] == 1:
                self._trainset[index] = tokens
            else:
                self._testset[index] = tokens

    def get_trainset(self):
        """Return the ``{index: [token, ...]}`` dictionary of train sentences."""
        return self._trainset

    def get_testset(self):
        """Return the ``{index: [token, ...]}`` dictionary of test sentences."""
        return self._testset

## Part 2

    1.

In [2]:
class Hyperparameters(object):
    """Plain container that keeps the values handed to its constructor."""

    def __init__(self, window_size, vector_size, negative_words, iterations, noise_distribution, random_seed):
        """Store every constructor argument on the instance under the same name."""
        # Size-related settings.
        self.window_size, self.vector_size = window_size, vector_size
        # Settings named after negative words and the noise distribution.
        self.negative_words, self.noise_distribution = negative_words, noise_distribution
        # Iteration count and random seed.
        self.iterations, self.random_seed = iterations, random_seed

    2. + 3.

In [None]:
class ModelParameters(object):
    """Vocabulary statistics and the two embedding matrices ``U`` and ``V``."""

    def __init__(self, hyperparameters):
        """Keep a reference to the hyperparameters; nothing is allocated yet.

        :param hyperparameters: object exposing a ``vector_size`` attribute.
        """
        self._hyperparameters = hyperparameters

    def init(self, trainset):
        """Build the vocabulary and randomly initialize ``U`` and ``V``.

        Sets ``self.words`` (sorted unique tokens), ``self.frequencies``
        (their counts) and ``self.U`` / ``self.V``, each of shape
        ``(len(self.words), vector_size)`` with unit-norm rows.

        :param trainset: mapping whose values are lists of tokens.
        """
        # Flatten in linear time; sum(lists, []) copies the accumulator on
        # every step and is quadratic in the number of tokens.
        tokens = [word for sentence in trainset.values() for word in sentence]
        self.words, self.frequencies = np.unique(tokens, return_counts=True)
        vector_size = self._hyperparameters.vector_size

        def sample():
            # One vector per word drawn from N(0, 0.01 * I) ...
            vectors = np.random.multivariate_normal(
                np.zeros(vector_size), np.identity(vector_size) * 1e-2, len(self.words))
            # ... then scaled so that every row has Euclidean norm 1.
            return vectors / np.linalg.norm(vectors, axis=1, keepdims=True)

        self.U, self.V = sample(), sample()

    4.

In [5]:
class words_sampler(object):
    """Callable that samples words with probability proportional to ``count ** alpha``."""

    def __init__(self, trainset, alpha):
        """Compute the cumulative sampling distribution over the vocabulary.

        :param trainset: mapping whose values are lists of tokens.
        :param alpha: exponent applied to each word count before normalizing.
        """
        # Flatten in linear time; sum(lists, []) is quadratic.
        tokens = [word for sentence in trainset.values() for word in sentence]
        self.words, self.frequencies = np.unique(tokens, return_counts=True)
        # Cast to float first: NumPy refuses integer arrays raised to
        # negative powers.
        factored = np.power(self.frequencies.astype(np.float64), alpha)
        probabilities = factored / factored.sum()
        self._cumsum = np.cumsum(probabilities)

    def __call__(self, k):
        """Draw ``k`` words (with replacement) from the noise distribution.

        :param k: number of words to draw.
        :return: NumPy array holding the ``k`` sampled words.
        """
        draws = np.random.rand(k)
        # Inverse-CDF sampling: side='right' yields the first position whose
        # cumulative probability is strictly greater than the draw.
        indices = np.searchsorted(self._cumsum, draws, side='right')
        # Rounding can leave the last cumulative value slightly below 1.0;
        # clamp so that such a draw maps to the last word.
        indices = np.minimum(indices, len(self.words) - 1)
        return self.words[indices]

    5.

In [None]:
def sigmoid(z):
    """Logistic function ``1 / (1 + exp(-z))``, evaluated with NumPy."""
    denominator = 1 + np.exp(-z)
    return 1.0 / denominator

def cbow(target_word, context_word, negative_words, model_params):
    """Negative-sampling log-likelihood of one (target, context) pair.

    Computes ``log sigmoid(v_c . u_t) + sum_n log(1 - sigmoid(v_n . u_t))``
    where ``u_t = U[target_word]``, ``v_c = V[context_word]`` and ``v_n``
    ranges over ``V[negative_words]``.

    :param target_word: row index into ``model_params.U``.
    :param context_word: row index into ``model_params.V``.
    :param negative_words: row index, or sequence of row indices, into
        ``model_params.V``.
    :param model_params: object exposing the arrays ``U`` and ``V``.
    :return: the log-likelihood as a scalar.
    """
    target_vector = model_params.U[target_word]
    positive_score = model_params.V[context_word].dot(target_vector)
    # V[negative_words] has one ROW per negative word, so it is multiplied
    # as-is; transposing it would misalign the shapes.
    negative_scores = model_params.V[negative_words].dot(target_vector)

    # log(sigmoid(x)) = -logaddexp(0, -x) and
    # log(1 - sigmoid(x)) = -logaddexp(0, x); both stay finite where
    # log(sigmoid(x)) would underflow to -inf.
    positive_term = -np.logaddexp(0, -positive_score)
    negative_term = -np.sum(np.logaddexp(0, negative_scores))

    # Both terms are log-probabilities, so they are added.
    return positive_term + negative_term