# Word embedding

In [2]:
import numpy as np
import re
import pandas as pd
from typing import List

## Read data

In [3]:
def read_file(file_name: str) -> list:
    with open(file_name) as file:
         return [word for line in file for word in re.findall(r'\w+', line)]

In [4]:
data = read_file(file_name= 'data.txt')

## Build word-word co-occurrence matrix

In [5]:
def build_co_occurrence_matrix(data: List[str], window_size = (10, 10)) -> np.ndarray:
    words = list(set(data))
    vocab = {word:index for index, word in enumerate(words)}
    number_data = np.array([vocab[word] for word in data])
    
    left_window =  np.arange(start= 1, stop= window_size[0] + 1, step= 1)
    right_window =  np.arange(start= 1, stop= window_size[1] + 1, step= 1)
    indices = np.arange(start= 0, stop= len(number_data), step= 1).reshape(-1, 1)
    
    left_windows = np.take(number_data, indices[window_size[0]:] - left_window)
    right_windows = np.take(number_data, indices[:-window_size[1]] + right_window)
    
    
    number_of_words = len(words)
    co_occurrence_matrix = np.zeros(shape= (number_of_words, number_of_words), dtype= np.uint64)
    co_occurrence_matrix[np.repeat(number_data[window_size[0]:], window_size[0]), left_windows.ravel()] += 1
    co_occurrence_matrix[np.repeat(number_data[:-window_size[1]], window_size[1]), right_windows.ravel()] += 1
  

    #1. adaugare distanta fata de cuvantul curent
    #2. adaugare +1 la fiecare element al matricii pentru log
    co_occurrence_matrix += 1

    return pd.DataFrame(data = co_occurrence_matrix,
                       index = words,
                       columns= words), co_occurrence_matrix

In [134]:
dataframe, co_occurrence = build_co_occurrence_matrix(data, window_size=(2, 2))

### Check if co-occurence matrix is symmetric

In [135]:
print(f'Symmetric: {np.all(co_occurence[1] == co_occurence[1].T)}')

Symmetric: True


In [136]:
print(f'Max value: {np.max(co_occurence)}')
print(f'Min value: {np.min(co_occurence)}')    

Max value: 3
Min value: 1


In [137]:
dataframe

Unnamed: 0,gravida,senectus,purus,Cras,erat,Lorem,fermentum,elit,libero,luctus,...,conubia,lacinia,morbi,malesuada,vitae,Nulla,molestie,facilisi,rhoncus,sociosqu
gravida,1,1,3,2,3,1,2,3,3,2,...,1,1,1,2,2,1,2,1,1,1
senectus,1,1,1,1,1,1,1,1,1,1,...,1,1,2,1,1,1,1,1,1,1
purus,3,1,1,2,3,1,3,2,2,3,...,1,3,1,3,3,2,3,1,3,1
Cras,2,1,2,1,3,1,1,2,2,2,...,1,3,1,3,2,1,2,1,3,1
erat,3,1,3,3,3,1,3,1,2,3,...,1,2,1,3,3,2,2,1,3,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Nulla,1,1,2,1,2,1,3,2,2,2,...,1,1,1,1,3,1,2,2,2,1
molestie,2,1,3,2,2,1,2,1,2,1,...,1,1,1,2,1,2,3,1,1,1
facilisi,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,2,1,1,1,1
rhoncus,1,1,3,3,3,1,1,2,2,2,...,1,1,1,2,2,2,1,1,1,1


## Build context probability matrix

In [138]:
def build_probability_matrix(co_occurrence_matrix: np.ndarray) -> np.ndarray:
    return co_occurrence_matrix / np.sum(co_occurrence_matrix, axis= 1, keepdims= True)

In [139]:
probability = build_probability_matrix(co_occurence)

In [140]:
print(f'Probability matrix correctness: {np.all(np.isclose(np.sum(probability, axis = 1), 1))}')

Probability matrix correctness: True


In [141]:
print(probability)

[[0.00283286 0.00283286 0.00849858 ... 0.00283286 0.00283286 0.00283286]
 [0.00492611 0.00492611 0.00492611 ... 0.00492611 0.00492611 0.00492611]
 [0.0075188  0.00250627 0.00250627 ... 0.00250627 0.0075188  0.00250627]
 ...
 [0.00471698 0.00471698 0.00471698 ... 0.00471698 0.00471698 0.00471698]
 [0.0027248  0.0027248  0.00817439 ... 0.0027248  0.0027248  0.0027248 ]
 [0.00492611 0.00492611 0.00492611 ... 0.00492611 0.00492611 0.00492611]]


## Model

### Notations

$ w \to \verb|word| $

$ V \to \verb|number of unique words - size of vocabulary| $

$ X \to \verb|word-word co-occurrence matrix, symmetrically built - | X_{ij} = X_{ji} $

$ X_{ij} \to \verb|the number of times word j occurs in the context of word i| $

$ X_{i} = \sum_{k}{X_{k}} \to \verb|the number of times any word appears in the context of word i| $

$ P_{ij} = P\left( j\mid i \right) = \frac{X_{ij}}{X_{i}} \to \verb|the probability that word j appear in the context of word i| $

### Word representation

$ w_{i}^{T}w_{j} = \log{P\left( j\mid i \right)} \to w_{i} \verb| and | w_{j} \verb|unknown vectors that represents i-th, j-th words| $

$ w_{i}^{T}w_{j} = \log{X_{ij}} - \log{X_{i}} $

$ w_{j}^{T}w_{i} = \log{X_{ij}} - \log{X_{j}} \quad (X_{ij} = X_{ji})$

$ 2 w_{i}^{T}w_{j} = 2\log{X_{ij}} - 2\log{X_{i}} - 2\log{X_{j}} $

$ w_{i}^{T}w_{j} = \log{X_{ij}} - \frac{1}{2}\log{X_{i}} - \frac{1}{2}\log{X_{j}} $

$ w_{i}^{T}w_{j} = \log{X_{ij}} - b_{i} -  b_{j} $

$ w_{i}^{T}w_{j} + b_{i} + b_{j} = \log{X_{ij}}  $

### Cost function

$ J = \sum_{i = 1}^{V}\sum_{j = 1}^{V} f\left( X_{i,j} \right){\left(w_{i}^{T}\tilde{w_{j}}+b_{i}+\tilde{b_{j}}-\log{X_{ij}}\right)}^2 $

### Learning

In [142]:
def cost_function(co_occurrence_matrix: np.ndarray, W: np.ndarray, b: np.ndarray) -> float:
    return np.sum((co_occurrence_matrix - W@b)**2)

In [198]:
def gradient_descent(co_occurrence_matrix: np.ndarray, encoded_dimensions: int= 50, steps= 10000, alpha= 0.1, beta= 0.01):
    vocab_size = co_occurrence_matrix.shape[0]
    W = np.random.rand(vocab_size, encoded_dimensions)
    b = np.random.rand(encoded_dimensions, vocab_size)

    errors = []
    for step in range(steps):
        old_W = W.copy()
        old_b = b.copy()
        
        diff = co_occurrence_matrix - W@b
        
        W = old_W + alpha * (2 * diff @ b.T - beta * old_W)
        b = b + alpha * (2 * (diff @ old_W).T - beta * b)
        errors.append(cost_function(co_occurrence_matrix, W, b))
    return errors, W, b

In [199]:
co_occurence

array([[1, 1, 3, ..., 1, 1, 1],
       [1, 1, 1, ..., 1, 1, 1],
       [3, 1, 1, ..., 1, 3, 1],
       ...,
       [1, 1, 1, ..., 1, 1, 1],
       [1, 1, 3, ..., 1, 1, 1],
       [1, 1, 1, ..., 1, 1, 1]], dtype=uint64)

In [200]:
errors, W, b =gradient_descent(co_occurence, 3, 5, alpha= 0.1, beta= 0.01)

  


In [201]:
errors

[70028668494.55399,
 5.496771601846837e+29,
 2.6573151166971605e+86,
 3.0022659238988606e+256,
 inf]

In [150]:
W

array([[-3.83590042e+79, -3.45162361e+79, -3.54181353e+79, ...,
        -3.68222633e+79, -3.67609026e+79, -3.60963194e+79],
       [-2.21842757e+80, -1.99618763e+80, -2.04834743e+80, ...,
        -2.12955278e+80, -2.12600409e+80, -2.08756905e+80],
       [ 2.42914434e+79,  2.18579500e+79,  2.24290919e+79, ...,
         2.33182780e+79,  2.32794204e+79,  2.28585626e+79],
       ...,
       [-1.60583477e+80, -1.44496379e+80, -1.48272027e+80, ...,
        -1.54150172e+80, -1.53893295e+80, -1.51111130e+80],
       [-4.82901588e+79, -4.34524972e+79, -4.45878983e+79, ...,
        -4.63555553e+79, -4.62783083e+79, -4.54416643e+79],
       [-9.21414770e+79, -8.29108325e+79, -8.50772684e+79, ...,
        -8.84500992e+79, -8.83027056e+79, -8.67063222e+79]])

In [202]:
class MatrixFactorization(object):

    def __init__(self, matrix: np.ndarray):
        self.__matrix = matrix
        
    def train(self, desired_dimensions: int, alpha: float= 0.1, beta: float= 0.01, iterations: int= 1000):
        P = np.random.rand(())
        Q = np.random.rand(())

    
    def __gradient_descent(self, ):