In [None]:
# NOTE: Training could not be completed because the kernel repeatedly ran out of RAM and crashed.

In [12]:
import zipfile
import os
import re
import nltk
import numpy as np
from sklearn.manifold import TSNE
from nltk.tokenize import word_tokenize
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, Dot, Flatten
from tensorflow.keras.models import Model

In [None]:
# Download the NLTK 'punkt' resource (presumably the tokenizer data that
# word_tokenize, used in preprocess_text, relies on -- confirm for the installed NLTK version).
nltk.download('punkt')

In [3]:
def preprocess_text(text: str) -> list[str]:
    """Strip URLs and non-Armenian characters from ``text`` and tokenize it.

    Args:
        text: Raw corpus text.

    Returns:
        The list of tokens produced by NLTK's ``word_tokenize`` after every
        run of characters that is not an Armenian letter has been replaced by
        a single space.
    """
    # Remove URLs first so their fragments are not left behind as noise.
    text = re.sub(r"http\S+", "", text)
    # Keep Armenian capitals (U+0531-U+0556) and small letters including the
    # ligature U+0587 (ech-yiwn). The previous range stopped at U+0586, which
    # turned that ligature into a space and split every word containing it.
    text = re.sub("[^\u0531-\u0556\u0561-\u0587]+", " ", text)
    tokens = word_tokenize(text)
    return tokens

In [4]:
# Location of the zipped corpus and the directory it is unpacked into.
zip_file_path = "corpus_100k.zip"
extracted_dir = "./corpus_data"
# exist_ok=True makes the cell safe to re-run and avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(extracted_dir, exist_ok=True)

In [5]:
# Unpack every member of the corpus archive into the extraction directory.
with zipfile.ZipFile(zip_file_path, mode="r") as archive:
    archive.extractall(path=extracted_dir)

In [6]:
# Read the extracted corpus file into memory as one UTF-8 string.
corpus_file_path = os.path.join(extracted_dir, 'corpus_100k')
with open(corpus_file_path, mode="r", encoding="utf-8") as corpus_file:
    corpus_text = corpus_file.read()

In [7]:
# Tokenize the whole corpus as a single document; texts[0] is the flat
# token list that the later cells read.
texts = [preprocess_text(corpus_text)]

In [8]:
# Hyper-parameters for the skip-gram model and its training data.
EMBEDDING_DIM = 50  # output_dim of the Embedding layer (length of each word vector)
WINDOW_SIZE = 5  # positions scanned on each side of a target word for context words
VOCAB_SIZE = len(set(texts[0]))  # number of distinct tokens; also used as the out-of-vocabulary index
NUM_NEGATIVE_SAMPLES = 5  # random negative pairs generated per positive pair
BATCH_SIZE = 32  # batch_size passed to model.fit

In [9]:
# Map every distinct token to an integer id. The vocabulary is sorted so the
# ids are identical on every run: plain set iteration order for strings
# changes between interpreter sessions (hash randomisation), which would make
# previously saved embedding rows impossible to map back to their words.
word_to_index = {word: idx for idx, word in enumerate(sorted(set(texts[0])))}

In [10]:
# Encode the token stream as integer ids; a token missing from the
# vocabulary falls back to VOCAB_SIZE (the out-of-vocabulary id).
text_indices = [
    word_to_index.get(token, VOCAB_SIZE)
    for token in texts[0]
]

In [13]:
def build_skipgram_pairs(indices, vocab_size, window_size, num_negative_samples):
    """Build skip-gram training pairs with uniform negative sampling.

    For every pair of positions at most ``window_size`` apart, a positive
    ``(target, context)`` pair (label 1) is emitted in both directions, each
    followed by ``num_negative_samples`` pairs ``(target, random_id)`` with
    label 0, where ``random_id`` is drawn uniformly from ``[0, vocab_size)``.

    Args:
        indices: Sequence of integer token ids; the value ``vocab_size``
            marks an out-of-vocabulary token and is skipped.
        vocab_size: Number of in-vocabulary ids.
        window_size: Maximum distance between target and context.
        num_negative_samples: Negative pairs generated per positive pair.

    Returns:
        ``(pairs, labels)``: an int32 array of shape ``(P, 2)`` and an int32
        array of shape ``(P,)``.
    """
    token_ids = np.asarray(indices, dtype=np.int32)
    target_parts, context_parts = [], []
    for offset in range(1, window_size + 1):
        if offset >= token_ids.shape[0]:
            break
        # Tokens `offset` positions apart are each other's context.
        left, right = token_ids[:-offset], token_ids[offset:]
        in_vocab = (left != vocab_size) & (right != vocab_size)
        left, right = left[in_vocab], right[in_vocab]
        target_parts += [left, right]
        context_parts += [right, left]

    if not target_parts:
        return np.empty((0, 2), dtype=np.int32), np.empty((0,), dtype=np.int32)

    targets = np.concatenate(target_parts)
    contexts = np.concatenate(context_parts)
    num_positive = targets.shape[0]
    group = 1 + num_negative_samples  # one positive followed by its negatives

    pairs = np.empty((num_positive, group, 2), dtype=np.int32)
    pairs[:, :, 0] = targets[:, np.newaxis]
    pairs[:, 0, 1] = contexts
    pairs[:, 1:, 1] = np.random.randint(
        0, vocab_size, size=(num_positive, num_negative_samples), dtype=np.int32
    )

    pair_labels = np.zeros((num_positive, group), dtype=np.int32)
    pair_labels[:, 0] = 1
    return pairs.reshape(-1, 2), pair_labels.reshape(-1)


# text_indices already holds integer ids, so they are used directly as context
# words. The previous code looked each id up in the string-keyed word_to_index,
# which always missed and turned every context word into VOCAB_SIZE.
# Building NumPy arrays instead of one Python list per pair also keeps memory
# use far lower. Pairs are grouped by offset rather than by target position;
# this should not affect training because Keras fit() shuffles by default.
word_pairs, labels = build_skipgram_pairs(
    text_indices, VOCAB_SIZE, WINDOW_SIZE, NUM_NEGATIVE_SAMPLES
)

In [None]:
# Materialise labels and (target, context) pairs as int32 NumPy arrays.
labels = np.array(labels, dtype=np.int32)
word_pairs = np.array(word_pairs, dtype=np.int32)

In [15]:
# Skip-gram scorer: the target id and the context id go through one shared
# Embedding layer, and the dot product of the two vectors is the logit.
input_target = Input(shape=(1,))
input_context = Input(shape=(1,))
# VOCAB_SIZE + 1 rows: one extra row for the out-of-vocabulary id VOCAB_SIZE.
# The layer is named explicitly so model.get_layer('embedding') keeps working
# when this cell is re-run (auto-generated names become 'embedding_1', ...).
word_embedding = Embedding(
    input_dim=VOCAB_SIZE + 1,
    output_dim=EMBEDDING_DIM,
    name="embedding",
)
target_embedding = word_embedding(input_target)
context_embedding = word_embedding(input_context)
# Both embeddings are (batch, 1, EMBEDDING_DIM); contracting axis 2 gives one
# score per pair, which Flatten reshapes to (batch, 1).
dot_product = Dot(axes=2)([target_embedding, context_embedding])
flatten = Flatten()(dot_product)
model = Model(inputs=[input_target, input_context], outputs=flatten)

In [16]:
def custom_loss(y_true, y_pred):
    """Sigmoid cross-entropy between the labels and the raw logits.

    Args:
        y_true: Ground-truth labels; cast to float32 before use.
        y_pred: Model outputs, passed as logits (no sigmoid applied).

    Returns:
        The tensor returned by ``tf.nn.sigmoid_cross_entropy_with_logits``.
    """
    float_labels = tf.cast(y_true, dtype=tf.float32)
    return tf.nn.sigmoid_cross_entropy_with_logits(labels=float_labels, logits=y_pred)

In [17]:
# Train with Adam; custom_loss treats the model output as logits.
model.compile(optimizer = 'adam', loss = custom_loss)

In [None]:
# Column 0 of word_pairs holds the target ids, column 1 the context ids.
target_ids = word_pairs[:, 0]
context_ids = word_pairs[:, 1]
model.fit([target_ids, context_ids], labels, epochs=10, batch_size=BATCH_SIZE)

In [None]:
# Read the weights from the layer object itself instead of
# model.get_layer('embedding'): that lookup depends on Keras' auto-generated
# layer name, which becomes 'embedding_1', ... once the model cell is re-run.
# Row i of the matrix belongs to the token whose id in word_to_index is i.
word_embeddings = word_embedding.get_weights()[0]
np.save("word_embeddings.npy", word_embeddings)

In [None]:
def reduce_dimensions_with_tsne(word_embeddings):
    """Project the embedding vectors onto two dimensions with t-SNE.

    Args:
        word_embeddings: Array of embedding vectors, one row per word.

    Returns:
        The result of ``TSNE.fit_transform`` with two output components and a
        fixed ``random_state`` of 0.
    """
    projector = TSNE(random_state=0, n_components=2)
    return projector.fit_transform(word_embeddings)

In [None]:
# Compute the 2-D t-SNE projection of the trained embedding matrix.
vectors_tsne = reduce_dimensions_with_tsne(word_embeddings)