# Applied Machine Learning - Project 2
### Boaz Shvartzman, Ofir Ziv

In [3]:
%matplotlib notebook

In [4]:
import re
import collections
import numpy as np
import matplotlib.pyplot as plt

## Part 1

    1.

In [5]:
class DatasetSplitParser(object):
    
    def __init__(self, split_path):
        with open(split_path, 'r')as f:
            file_content = f.read()

        self._ds_by_index = {}
        
        for row in file_content.split('\n')[1:-1]:
            index, ds = row.split(',')
            self._ds_by_index[int(index)] = int(ds)
            
    def get_dataset_by_index(self):
        return self._ds_by_index

    2.

In [40]:
class DatasetsHolder(object):
    
    def __init__(self, sentences_path, datasplit_parser):
        with open(sentences_path, 'r')as f:
            file_content = f.read()
        
        file_content = file_content.decode("ascii", "ignore") # Remove non-ASCII
        file_content = re.sub(r'([^\s\w]|_)+', '', file_content) # Remove non-alphanumeric
        file_content = file_content.lower() # Lowercase
        
        self._trainset = {}
        self._testset = {}
        
        ds_by_index = datasplit_parser.get_dataset_by_index()

        for row in file_content.split('\n')[1:-1]:
            
            index, sentence = row.split('\t')
            index, sentence = int(index), re.sub(r'\b\w{1,2}\b', '', sentence).split()
            
            if len(sentence) < 2:
                continue
                
            if ds_by_index[index] == 1:
                self._trainset[index] = sentence
                
            elif ds_by_index[index] == 2:
                self._testset[index] = sentence

    def get_trainset(self):
        return self._trainset
    
    def get_testset(self):
        return self._testset

## Part 2

    1.

In [7]:
class Hyperparameters(object):
    
    def __init__(self, window_size=1, vector_size=30, negative_words=50, iterations=2000,
                 noise_distribution='unigram', noise_dist_params={'alpha': 3/4.}, random_seed=1234):

        self.window_size = window_size
        self.vector_size = vector_size
        self.negative_words = negative_words
        self.iterations = iterations
        self.noise_distribution = noise_distribution
        self.noise_dist_params = noise_dist_params
        self.random_seed = random_seed
        np.random.seed(random_seed)

    2. + 3.

In [8]:
class ModelParameters(object):
    
    def __init__(self, hyperparameters):
        self._hyperparameters = hyperparameters
        
    def init(self, trainset):
        self._words = np.unique(sum(trainset.values(), []))
        vector_size = self._hyperparameters.vector_size
        
        def sample():
            vectors = np.random.multivariate_normal(np.zeros(vector_size), np.identity(vector_size) * 1e-2, len(self._words))
            return vectors / np.sqrt(np.sum(np.power(vectors, 2), axis=1)).reshape(-1, 1)

        self.U, self.V = sample(), sample()
        
        return self
        
    def word2index(self, words):
        indices = np.where(np.isin(self._words, words))[0]
        if isinstance(words, collections.Iterable) and not type(words) in [str, unicode]:
            return indices
        else:
            return indices[0]

    def index2word(self, indices):
        return self._words[indices]

    4. + 5.

In [9]:
class unigram_sampler(object):

    def __init__(self, trainset, alpha):
        self.words, self.frequencies = np.unique(sum(trainset.values(), []), return_counts=True)
        factored = np.power(self.frequencies, alpha).astype(np.float64)
        probabilities = factored / factored.sum()
        self._cumsum = np.cumsum(probabilities)
        
    def __call__(self, k):
        indices = [np.where(self._cumsum > np.random.rand())[0][0] for i in range(k)]
        return self.words[indices]

def get_words_sampler(dataset, hyperparams):
    
    if hyperparams.noise_distribution == 'unigram':
        return unigram_sampler(dataset, hyperparams.noise_dist_params['alpha'])

    else:
        raise NotImplemented

    6.

$$ log \left ( \frac{1}{1 + exp(v_c^Tu_t)} \right ) + \sum_{j=1}^K log \left (1 - \frac{1}{1 + exp(v_j^Tu_t)} \right ) $$

In [10]:
def sigmoid(z):
    return 1.0 / (1 + np.exp(-z))

def log_prob_context_with_negatives(U, V, target_word, context_word, negative_words, model_params):
    sig = np.log(sigmoid(V[context_word].T.dot(U[target_word])))
    neg = np.sum(np.log(1 - sigmoid(V[negative_words].T.dot(U[target_word]))))
    
    return sig + neg

## Part 3

    1.

$$ \frac{\partial f}{du_t} = \left (1 - \sigma(v_c^Tu_t \right )v_c - \sum_{j=1}^K \left (1-\sigma(v_j^Tu_t) \right )v_j $$

$$ \frac{\partial f}{dv_c} = \left (1 - \sigma(v_c^Tu_t \right )u_t $$

$$ \frac{\partial f}{dv_j} = - \left (1 - \sigma(v_j^Tu_t \right )u_t $$

    2.

In [25]:
def log_prob_context_with_negatives_gradient(U, V, target_word, context_word, negative_words):
    u_t, v_c = V[context_word].reshape(-1, 1), U[target_word].reshape(-1, 1)
    neg = (1 - sigmoid(v_c.T.dot(u_t)))
    
    dFdu = neg * v_c - np.sum((1 - sigmoid(v_c.T.dot(u_t))) * V[negative_words], axis=0).reshape(-1, 1)
    dFdv_c = neg * u_t
    
    neg = (1 - sigmoid(V[negative_words].dot(u_t))).reshape(-1, 1)
    dFdv_j = (-neg).dot(u_t.T)
    
    return dFdu, dFdv_c, dFdv_j

    3.

In [12]:
def sample_target_context(dataset, window_size):
    w = np.random.choice(dataset.values(), 1)[0]
    target = np.random.choice(range(len(w)))
    
    pairs = []
    
    for i in range(window_size):
        if target + i < len(w):
            pairs.append((w[target], w[target + i]))
        if target - i >= 0:
            pairs.append((w[target], w[target - i]))
            
    return pairs

    4.

In [13]:
def get_target_context_minibatch(dataset, minibatch_size, window_size):
    return [
        sample_target_context(dataset, window_size) for i in range(minibatch_size)
    ]

    5.

In [35]:
def get_parameters_update(minibatch_samples, hyperparameters, model_params, U, V):

    u_gradients = {}
    c_gradients = {}
    
    for samples in minibatch_samples:
        target_index = model_params.word2index(samples[0][0])
        for target, context in samples:
            context_index = model_params.word2index(context)
            negative_indeices = model_params.word2index(sample_k_words(hyperparameters.negative_words))
            
            g_t, g_c, g_j = log_prob_context_with_negatives_gradient(U, V, target_index, context_index, 
                                                                     negative_indeices)

            u_gradients[target_index] = u_gradients.get(target_index, 0) + g_t
            c_gradients[context_index] = c_gradients.get(context_index, 0) + g_c
            
            for i, index in enumerate(negative_indeices):
                c_gradients[index] = c_gradients.get(index, 0) + g_j[i].reshape(-1, 1)
                
    return u_gradients, c_gradients

    6.

In [15]:
class SGDParameters(object):
    
    def __init__(self, learning_rate=1e-3, minibatch_size=50, anealing_factor=300):
        self.learning_rate = learning_rate
        self.minibatch_size = minibatch_size
        self.anealing_factor = anealing_factor

    7.

In [36]:
def LearnParamsUsingSGD(trainset, hyperparameters, sgd_parameters, model_parameters):
    
    U, V = model_parameters.U, model_parameters.V
    learning_rate = sgd_parameters.learning_rate
    
    for i in range(hyperparameters.iterations):
        minibatch = get_target_context_minibatch(trainset, sgd_parameters.minibatch_size, hyperparameters.window_size)
        u_gradients, c_gradients = get_parameters_update(minibatch, hyperparameters, model_parameters, U, V)
        print i
        if i % sgd_parameters.anealing_factor == 0:
            learning_rate /= 2.0
        
        for index, gradient in u_gradients.items():
            U[index] += (learning_rate * gradient).reshape(-1)

        for index, gradient in c_gradients.items():
            V[index] += (learning_rate * gradient).reshape(-1)
            
    return U, V

In [42]:
split_parser = DatasetSplitParser('datasetSplit.txt')
dataset_holder = DatasetsHolder('datasetSentences.txt', split_parser)

In [43]:
hyperparameters = Hyperparameters()
model_parameters = ModelParameters(hyperparameters).init(dataset_holder.get_trainset())
sample_k_words = get_words_sampler(dataset_holder.get_trainset(), hyperparameters)
sgd_parameters = SGDParameters()

In [44]:
U, V = LearnParamsUsingSGD(dataset_holder.get_trainset(), hyperparameters, sgd_parameters, model_parameters)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183


KeyboardInterrupt: 

> [0;32m/Users/boazsh/.virtualenvs/ml_course/lib/python2.7/site-packages/numpy/lib/arraysetops.py[0m(274)[0;36m_unique1d[0;34m()[0m
[0;32m    272 [0;31m[0;34m[0m[0m
[0m[0;32m    273 [0;31m    [0;32mif[0m [0moptional_indices[0m[0;34m:[0m[0;34m[0m[0m
[0m[0;32m--> 274 [0;31m        [0mperm[0m [0;34m=[0m [0mar[0m[0;34m.[0m[0margsort[0m[0;34m([0m[0mkind[0m[0;34m=[0m[0;34m'mergesort'[0m [0;32mif[0m [0mreturn_index[0m [0;32melse[0m [0;34m'quicksort'[0m[0;34m)[0m[0;34m[0m[0m
[0m[0;32m    275 [0;31m        [0maux[0m [0;34m=[0m [0mar[0m[0;34m[[0m[0mperm[0m[0;34m][0m[0;34m[0m[0m
[0m[0;32m    276 [0;31m    [0;32melse[0m[0;34m:[0m[0;34m[0m[0m
[0m
ipdb> q
