# Word embedding

In [1]:
import numpy as np
import re
import pandas as pd
from typing import List

## Read data

In [3]:
def read_file(file_name: str) -> list:
    with open(file_name) as file:
         return [word for line in file for word in re.findall(r'\w+', line)]

In [4]:
data = read_file(file_name= 'data.txt')

## Build word-word co-occurrence matrix

In [5]:
def build_co_occurrence_matrix(data: List[str], window_size = (10, 10)) -> np.ndarray:
    words = list(set(data))
    vocab = {word:index for index, word in enumerate(words)}
    number_data = np.array([vocab[word] for word in data])
    
    left_window =  np.arange(start= 1, stop= window_size[0] + 1, step= 1)
    right_window =  np.arange(start= 1, stop= window_size[1] + 1, step= 1)
    indices = np.arange(len(number_data)).reshape(-1, 1)
    
    left_windows = np.take(number_data, indices[window_size[0]:] - left_window)
    right_windows = np.take(number_data, indices[:-window_size[1]] + right_window)
    
    
    number_of_words = len(words)
    co_occurrence_matrix = np.zeros(shape= (number_of_words, number_of_words), dtype= np.uint64)
    co_occurrence_matrix[np.repeat(number_data[window_size[0]:], window_size[0]), left_windows.ravel()] += 1
    co_occurrence_matrix[np.repeat(number_data[:-window_size[1]], window_size[1]), right_windows.ravel()] += 1
  

    #1. adaugare distanta fata de cuvantul curent
    co_occurrence_matrix += 1

    return pd.DataFrame(data = co_occurrence_matrix,
                       index = words,
                       columns= words), co_occurrence_matrix

In [6]:
build_co_occurrence_matrix(data, window_size=(3, 3))

(           mus  elit  dis  Lorem  senectus  odio  tincidunt  turpis  \
 mus          1     1    1      1         1     1          1       1   
 elit         1     3    1      2         1     2          3       2   
 dis          1     1    1      1         1     1          1       1   
 Lorem        1     2    1      1         1     1          2       1   
 senectus     1     1    1      1         1     1          1       1   
 ...        ...   ...  ...    ...       ...   ...        ...     ...   
 facilisis    1     3    1      1         1     2          3       3   
 Fusce        1     2    1      1         1     3          2       3   
 ad           1     1    1      1         1     1          1       1   
 in           2     3    1      1         1     3          3       3   
 metus        1     2    1      1         1     3          3       2   
 
            Vestibulum  tempor  ...  a  pharetra  Morbi  justo  scelerisque  \
 mus                 2       1  ...  1         1      2

In [7]:
dataframe, co_occurrence = build_co_occurrence_matrix(data, window_size=(3, 3))

### Check if co-occurence matrix is symmetric

In [8]:
print(f'Symmetric: {np.all(co_occurrence[1] == co_occurrence[1].T)}')

Symmetric: True


In [9]:
print(f'Max value: {np.max(co_occurrence)}')
print(f'Min value: {np.min(co_occurrence)}')    

Max value: 3
Min value: 1


In [10]:
dataframe

Unnamed: 0,mus,elit,dis,Lorem,senectus,odio,tincidunt,turpis,Vestibulum,tempor,...,a,pharetra,Morbi,justo,scelerisque,facilisis,Fusce,ad,in,metus
mus,1,1,1,1,1,1,1,1,2,1,...,1,1,2,2,1,1,1,1,2,1
elit,1,3,1,2,1,2,3,2,3,1,...,3,3,2,3,2,3,2,1,3,2
dis,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
Lorem,1,2,1,1,1,1,2,1,1,1,...,2,1,1,1,1,1,1,1,1,1
senectus,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
facilisis,1,3,1,1,1,2,3,3,2,2,...,3,1,2,3,2,1,3,1,3,3
Fusce,1,2,1,1,1,3,2,3,1,3,...,3,2,1,2,2,3,1,1,3,3
ad,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
in,2,3,1,1,1,3,3,3,2,3,...,3,3,3,3,3,3,3,1,3,3


## Build context probability matrix

In [11]:
def build_probability_matrix(co_occurrence_matrix: np.ndarray) -> np.ndarray:
    return co_occurrence_matrix / np.sum(co_occurrence_matrix, axis= 1, keepdims= True)

In [12]:
probability = build_probability_matrix(co_occurrence)

In [13]:
print(f'Probability matrix correctness: {np.all(np.isclose(np.sum(probability, axis = 1), 1))}')

Probability matrix correctness: True


In [14]:
print(probability)

[[0.00440529 0.00440529 0.00440529 ... 0.00440529 0.00881057 0.00440529]
 [0.00222717 0.00668151 0.00222717 ... 0.00222717 0.00668151 0.00445434]
 [0.00487805 0.00487805 0.00487805 ... 0.00487805 0.00487805 0.00487805]
 ...
 [0.00487805 0.00487805 0.00487805 ... 0.00487805 0.00487805 0.00487805]
 [0.0038835  0.00582524 0.00194175 ... 0.00194175 0.00582524 0.00582524]
 [0.00232019 0.00464037 0.00232019 ... 0.00232019 0.00696056 0.00696056]]


## Model

### Notations

$ w \to \verb|word| $

$ V \to \verb|number of unique words - size of vocabulary| $

$ X \to \verb|word-word co-occurrence matrix, symmetrically built - | X_{ij} = X_{ji} $

$ X_{ij} \to \verb|the number of times word j occurs in the context of word i| $

$ X_{i} = \sum_{k}{X_{k}} \to \verb|the number of times any word appears in the context of word i| $

$ P_{ij} = P\left( j\mid i \right) = \frac{X_{ij}}{X_{i}} \to \verb|the probability that word j appear in the context of word i| $

### Word representation

$ w_{i}^{T}w_{j} = \log{P\left( j\mid i \right)} \to w_{i} \verb| and | w_{j} \verb|unknown vectors that represents i-th, j-th words| $

$ w_{i}^{T}w_{j} = \log{X_{ij}} - \log{X_{i}} $

$ w_{j}^{T}w_{i} = \log{X_{ij}} - \log{X_{j}} \quad (X_{ij} = X_{ji})$

$ 2 w_{i}^{T}w_{j} = 2\log{X_{ij}} - 2\log{X_{i}} - 2\log{X_{j}} $

$ w_{i}^{T}w_{j} = \log{X_{ij}} - \frac{1}{2}\log{X_{i}} - \frac{1}{2}\log{X_{j}} $

$ w_{i}^{T}w_{j} = \log{X_{ij}} - b_{i} -  b_{j} $

$ w_{i}^{T}w_{j} + b_{i} + b_{j} = \log{X_{ij}}  $

### Cost function

$ J = \sum_{i = 1}^{V}\sum_{j = 1}^{V} f\left( X_{i,j} \right){\left(w_{i}^{T}\tilde{w_{j}}+b_{i}+\tilde{b_{j}}-\log{X_{ij}}\right)}^2 $

### Learning

In [25]:
class MatrixFactorization(object):

    def __init__(self, matrix: np.ndarray):
        self.__matrix = matrix
        
    def train(self,
              desired_dimensions: int,
              alpha: float= 1e-2,
              beta: float= 1e-2,
              epsilon= 1e-5,
              iterations: int= 1e3,
              auto_adjust_initial_values= True):
        self.U = np.random.rand(self.__matrix.shape[0], desired_dimensions)
        self.V = np.random.rand(self.__matrix.shape[1], desired_dimensions)
        self.b = np.zeros(self.__matrix.shape)
        
        if auto_adjust_initial_values:
            mean = np.mean(self.__matrix)
            self.U *= mean
            self.V *= mean
            self.b *= mean
        
        return self.__gradient_descent(alpha, beta, epsilon, iterations)
       
    def get_predicted_matrix(self):
        return self.U@self.V.T + self.b
    
    def get_vector(self, line: int, column: int) -> np.ndarray:
        return self.U[line] * self.V[column] + self.b[line, column]

    def __gradient_descent(self, alpha, beta, epsilon, iterations) -> np.ndarray:
        current_step = 0
        errors = []
        while True:
            error = self.__matrix - self.get_predicted_matrix()
            old_U = self.U.copy()
            
            self.b += alpha * (error - beta * self.b)
            
            self.U += alpha * (error @ self.V - beta * self.U)
            self.V += alpha * (error.T @ old_U - beta * self.V)
            errors.append(self.__compute_mean_squared_error())
            current_step += 1
            
            if np.linalg.norm(self.U - old_U) < epsilon or current_step > iterations:
                return {
                    'errors' : errors,
                    'iterations' : current_step
                }
    
    def __compute_mean_squared_error(self) -> np.ndarray:
        return np.mean((self.__matrix - self.get_predicted_matrix()) ** 2)

In [26]:
mf2 = MatrixFactorization(matrix= probability)

In [27]:
mf2.train(desired_dimensions= 100)

{'errors': [2.2133630529785586e-05,
  2.159643878519427e-05,
  2.1070740897994096e-05,
  2.055632953323904e-05,
  2.005300071476888e-05,
  1.9560553746278404e-05,
  1.9078791135305663e-05,
  1.8607518520076028e-05,
  1.81465445991373e-05,
  1.769568106371981e-05,
  1.7254742532753923e-05,
  1.682354649047651e-05,
  1.6401913226556396e-05,
  1.598966577866821e-05,
  1.5586629877442816e-05,
  1.519263389372193e-05,
  1.480750878804379e-05,
  1.4431088062286178e-05,
  1.4063207713392722e-05,
  1.3703706189108024e-05,
  1.3352424345647119e-05,
  1.3009205407224594e-05,
  1.2673894927368868e-05,
  1.2346340751947338e-05,
  1.202639298382837e-05,
  1.1713903949106738e-05,
  1.1408728164819465e-05,
  1.1110722308080013e-05,
  1.081974518655936e-05,
  1.0535657710243576e-05,
  1.025832286439857e-05,
  9.987605683673725e-06,
  9.723373227277604e-06,
  9.465494555159994e-06,
  9.213840705136332e-06,
  8.968284670891787e-06,
  8.72870138080409e-06,
  8.49496767752581e-06,
  8.266962298268573e-06,

In [28]:
mf2.get_predicted_matrix()

array([[0.00436789, 0.00437429, 0.00436724, ..., 0.00436752, 0.0087379 ,
        0.00437381],
       [0.00221145, 0.00662836, 0.00221096, ..., 0.00221114, 0.00663031,
        0.00442259],
       [0.00483577, 0.00484238, 0.00483544, ..., 0.00483531, 0.00484432,
        0.00484177],
       ...,
       [0.00483546, 0.00484219, 0.00483512, ..., 0.0048349 , 0.00484395,
        0.0048419 ],
       [0.00385157, 0.00578074, 0.00192858, ..., 0.00192856, 0.00578288,
        0.00577992],
       [0.00230354, 0.00460737, 0.00230311, ..., 0.0023034 , 0.00690671,
        0.00690423]])

In [29]:
probability

array([[0.00440529, 0.00440529, 0.00440529, ..., 0.00440529, 0.00881057,
        0.00440529],
       [0.00222717, 0.00668151, 0.00222717, ..., 0.00222717, 0.00668151,
        0.00445434],
       [0.00487805, 0.00487805, 0.00487805, ..., 0.00487805, 0.00487805,
        0.00487805],
       ...,
       [0.00487805, 0.00487805, 0.00487805, ..., 0.00487805, 0.00487805,
        0.00487805],
       [0.0038835 , 0.00582524, 0.00194175, ..., 0.00194175, 0.00582524,
        0.00582524],
       [0.00232019, 0.00464037, 0.00232019, ..., 0.00232019, 0.00696056,
        0.00696056]])

In [30]:
np.isclose(mf2.get_predicted_matrix(), probability, rtol= 0.0001, atol= 0.0001)

array([[ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       ...,
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True]])

In [31]:
np.sum(np.isclose(mf2.get_predicted_matrix(), probability, rtol= 0.0001, atol= 0.0001) == False)/ \
probability.size

0.0003787783136789475

In [32]:
np.sum(np.isclose(mf2.get_predicted_matrix(), probability, rtol= 0.0001, atol= 0.0001) == False)

15

In [33]:
mf2.get_vector(0,0)

array([0.00285255, 0.00284588, 0.00285997, 0.00284194, 0.00285534,
       0.00285054, 0.00284797, 0.00283339, 0.00283723, 0.00285401,
       0.00284084, 0.00285232, 0.00285958, 0.00284535, 0.00283506,
       0.00285233, 0.0028422 , 0.00284377, 0.00285921, 0.00283754,
       0.00285684, 0.00285312, 0.00284283, 0.00284948, 0.00284186,
       0.00283875, 0.00284709, 0.00284633, 0.00284338, 0.00285914,
       0.00284352, 0.00285014, 0.00283798, 0.00283748, 0.00286518,
       0.00283741, 0.00284088, 0.00285524, 0.00285506, 0.0028551 ,
       0.00285051, 0.00284431, 0.00284726, 0.00285302, 0.00285161,
       0.00284471, 0.00283854, 0.00284183, 0.00283567, 0.00285055,
       0.00284977, 0.00283496, 0.00284387, 0.00283767, 0.00283907,
       0.00283599, 0.00284081, 0.00285354, 0.00284288, 0.00283856,
       0.00284518, 0.00284807, 0.00284481, 0.00285893, 0.00284716,
       0.00286306, 0.00284434, 0.00283731, 0.00284494, 0.00283725,
       0.00285674, 0.00284321, 0.00285371, 0.00284356, 0.00284