In [1]:
import numpy as np
import re
import pandas as pd
from typing import List

## Read data

In [3]:
def read_file(file_name: str) -> list:
    with open(file_name) as file:
         return [word for line in file for word in re.findall(r'\w+', line)]

In [4]:
data = read_file(file_name= 'data.txt')

## Matrix factorization

In [5]:
class MatrixFactorization(object):

    def __init__(self, matrix: np.ndarray):
        self.__matrix = matrix
        
    def train(self,
              desired_dimensions: int,
              alpha: float= 1e-2,
              beta: float= 1e-2,
              epsilon: float= 1e-5,
              iterations: int= 1e3,
              auto_adjust_initial_values: bool= True):
        self.U = np.random.rand(self.__matrix.shape[0], desired_dimensions)
        self.V = np.random.rand(self.__matrix.shape[1], desired_dimensions)
        self.b = np.zeros(self.__matrix.shape)
        
        if auto_adjust_initial_values:
            mean = np.mean(self.__matrix)
            self.U *= mean
            self.V *= mean
            self.b *= mean
        
        return self.__gradient_descent(alpha, beta, epsilon, iterations)
       
    def get_predicted_matrix(self):
        return self.U@self.V.T + self.b
    
    def get_vector(self, line: int, column: int) -> np.ndarray:
        return self.U[line] * self.V[column] + self.b[line, column]

    def __gradient_descent(self, alpha, beta, epsilon, iterations) -> np.ndarray:
        current_step = 0
        errors = []
        while True:
            error = self.__matrix - self.get_predicted_matrix()
            old_U = self.U.copy()
            
            self.b += alpha * (error - beta * self.b)
            
            self.U += alpha * (error @ self.V - beta * self.U)
            self.V += alpha * (error.T @ old_U - beta * self.V)
            errors.append(self.__compute_mean_squared_error())
            current_step += 1
            
            if np.linalg.norm(self.U - old_U) < epsilon or current_step > iterations:
                return {
                    'errors' : errors,
                    'iterations' : current_step
                }
    
    def __compute_mean_squared_error(self) -> np.ndarray:
        return np.mean((self.__matrix - self.get_predicted_matrix()) ** 2)

## Word embedding

In [6]:
class WordEmbedding(object):
    
    def __init__(self, data: List[str]):
        self.__data = data
        
    def train(self,
              window_size,
              desired_dimensions: int,
              alpha: float= 1e-2 ,
              beta: float= 1e-2,
              epsilon: float= 1e-5,
              iterations: int= 1e3,
              auto_adjust_initial_values: bool= True):
        
        self.co_occurrence_matrix = self.__build_co_occurrence_matrix(window_size)
        self.probability_matrix = self.__build_probability_matrix()
        matrix_factorization = MatrixFactorization(matrix= self.probability_matrix)
        training_result = matrix_factorization.train(desired_dimensions, alpha, beta, epsilon, iterations)
        self.__embeddings = matrix_factorization.U * matrix_factorization.V + matrix_factorization.b
        return training_result
        
    def get_embeddings(self) -> np.ndarray:
        return self.__embeddings
        
    def __build_co_occurrence_matrix(self, window_size = (10, 10)) -> np.ndarray:
        words = list(set(self.__data))
        number_of_words = len(words)
        vocab = {word:index for index, word in enumerate(words)}
        number_data = np.array([vocab[word] for word in self.__data])
        
        indices = np.arange(len(number_data))[window_size[0]:-window_size[1]].reshape(-1, 1)
        window = np.concatenate((np.arange(start= -window_size[0], stop= 0, step= 1),
                            np.arange(start= 1, stop= window_size[1] + 1, step= 1)))
        co_occurrence_matrix = np.zeros(shape= (number_of_words, number_of_words), dtype= np.float)
        co_occurrence_matrix[number_data[indices], number_data[indices + window]] += 1/np.abs(window)
        
        return co_occurrence_matrix
        
    def __build_probability_matrix(self) -> np.ndarray:
        return self.co_occurrence_matrix / np.sum(self.co_occurrence_matrix, axis= 1, keepdims= True)

In [7]:
embeddings = WordEmbedding(data)

In [8]:
embeddings.train(window_size= (2, 2), desired_dimensions= 100)

{'errors': [0.00020934334082292302,
  0.00020507267851614067,
  0.00020088816778097423,
  0.0001967881070815985,
  0.00019277082829901508,
  0.00018883469603951653,
  0.0001849781069586457,
  0.00018119948910033323,
  0.00017749730125090322,
  0.00017387003230763962,
  0.0001703162006616087,
  0.00016683435359443827,
  0.00016342306668875608,
  0.00016008094325199422,
  0.00015680661375327002,
  0.00015359873527305627,
  0.0001504559909653579,
  0.0001473770895321152,
  0.00014436076470955778,
  0.00014140577476623537,
  0.00013851090201245686,
  0.00013567495232087116,
  0.00013289675465792764,
  0.00013017516062595745,
  0.00012750904401561978,
  0.00012489730036846283,
  0.0001223388465493506,
  0.00011983262032851212,
  0.00011737757997297297,
  0.00011497270384713292,
  0.00011261699002225769,
  0.00011030945589465669,
  0.00010804913781232265,
  0.00010583509070981374,
  0.0001036663877511621,
  0.00010154211998059811,
  9.94613959808828e-05,
  9.742334153904643e-05,
  9.54270993

In [9]:
res = embeddings.get_embeddings()
print(res[0].shape, res[1].shape)
print(res)

AttributeError: 'WordEmbedding' object has no attribute '_WordEmbedding__embeddings'