In [1]:
# Sample corpus data
import json
# Read the raw corpus. The context manager closes the file handle as soon as
# parsing finishes, and the explicit encoding avoids depending on the
# platform's default text encoding.
with open("sux_corpus.json", encoding="utf-8") as corpus_file:
    sux_data = json.load(corpus_file)

# Convert the corpus into a suitable format: every token becomes a string and
# the placeholder 'X' is replaced by '_'.
sux_data = [[str(word) if word != 'X' else '_' for word in sentence] for sentence in sux_data]


In [4]:
import pandas as pd
sux_dict = pd.read_csv("../UD-ETCSUX/sux_dictionary.csv")

# Map each value of the `form` column to the `cf` value on the same row.
# When a form appears on several rows, the last row wins.
cf_dict = {form: cf for form, cf in zip(sux_dict["form"], sux_dict["cf"])}

In [10]:
### Run this for lemma embedding
# Swap every token for its cf_dict entry when one exists; tokens without an
# entry are kept unchanged. Sentence boundaries are preserved.
new_sux_data = [
    [cf_dict.get(token, token) for token in sentence]
    for sentence in sux_data
]

In [12]:
import numpy as np
from collections import defaultdict, Counter
from scipy.sparse import lil_matrix
from sklearn.decomposition import TruncatedSVD
from gensim.models import FastText
import io

# Sample data
data = new_sux_data

# Create vocabulary. Sorting makes the word -> index assignment identical on
# every run (iterating a set of strings directly can change order between
# interpreter sessions); key=str keeps the sort from failing should a token
# not be a string.
vocab = sorted({word for sentence in data for word in sentence}, key=str)
word2idx = {w: idx for (idx, w) in enumerate(vocab)}
idx2word = {idx: w for (idx, w) in enumerate(vocab)}

window_size = 2  # You can change the window size as needed

# Count (center, context) index pairs first, so that each matrix cell is
# written once instead of being incremented for every single occurrence.
pair_counts = Counter()
for sentence in data:
    indices = [word2idx[word] for word in sentence]
    for center_i, center_word in enumerate(indices):
        # Symmetric window around the center, clipped to the sentence bounds.
        window_start = max(0, center_i - window_size)
        window_stop = min(len(indices), center_i + window_size + 1)
        for context_i in range(window_start, window_stop):
            if context_i != center_i:
                pair_counts[(center_word, indices[context_i])] += 1

# Create sparse co-occurrence matrix
cooc_matrix = lil_matrix((len(vocab), len(vocab)), dtype=np.float64)
for (center_word, context_word), count in pair_counts.items():
    cooc_matrix[center_word, context_word] = count

def calculate_pmi(cooc_matrix, positive=True):
    """Compute (positive) pointwise mutual information for a co-occurrence matrix.

    For every non-zero cell ``(i, j)`` the score is
    ``log2(p_ij / (p_i * p_j))`` where ``p_ij`` is the cell value divided by
    the grand total, ``p_i`` is the total of row ``i`` divided by the grand
    total, and ``p_j`` is the total of column ``j`` divided by the grand total.

    Parameters
    ----------
    cooc_matrix : scipy sparse matrix or 2-D array
        Co-occurrence counts.
    positive : bool, default True
        When true, negative scores are replaced by 0 (PPMI).

    Returns
    -------
    scipy.sparse.lil_matrix
        float64 matrix with the same shape as ``cooc_matrix``. Cells that are
        zero in the input are zero in the output.
    """
    # Local import: only lil_matrix is imported at file level.
    from scipy.sparse import coo_matrix

    total_sum = cooc_matrix.sum()
    row_sum = np.asarray(cooc_matrix.sum(axis=1)).ravel()
    # The context marginal must come from the column totals; using the row
    # totals is only correct when the matrix happens to be symmetric.
    col_sum = np.asarray(cooc_matrix.sum(axis=0)).ravel()

    coo = coo_matrix(cooc_matrix)
    nonzero = coo.data != 0
    rows, cols, counts = coo.row[nonzero], coo.col[nonzero], coo.data[nonzero]

    # Vectorised over all non-zero cells at once.
    p_ij = counts / total_sum
    p_i = row_sum[rows] / total_sum
    p_j = col_sum[cols] / total_sum
    pmi = np.log2(p_ij / (p_i * p_j))
    if positive:
        pmi = np.where(pmi > 0, pmi, 0.0)

    # Drop exact zeros so they are not stored explicitly in the sparse result.
    stored = pmi != 0
    pmi_matrix = coo_matrix(
        (pmi[stored], (rows[stored], cols[stored])),
        shape=cooc_matrix.shape,
        dtype=np.float64,
    )
    return pmi_matrix.tolil()

pmi_matrix = calculate_pmi(cooc_matrix)

# Set n_components to be less than the number of unique words: the target of
# 512 dimensions is capped so that a small vocabulary does not request more
# components than the PMI matrix has columns.
n_components = min(512, len(vocab) - 1)

# Apply SVD to the PMI matrix. A fixed random_state makes the decomposition
# (and therefore the embeddings) repeatable from one run to the next.
svd = TruncatedSVD(n_components=n_components, random_state=42)
word_embeddings = svd.fit_transform(pmi_matrix)

# Create a dictionary to hold the word embeddings
word_embeddings_dict = {word: word_embeddings[word2idx[word]] for word in vocab}

# Save embeddings in FastText format
with io.open('svd_fasttext.vec', 'w', encoding='utf-8', newline='\n', errors='ignore') as vec_file:
    # Header line: "<number of words> <number of dimensions>".
    vec_file.write(f"{len(vocab)} {n_components}\n")
    # One line per word: the word followed by its space-separated components.
    for token in word_embeddings_dict:
        components = ' '.join(str(value) for value in word_embeddings_dict[token])
        vec_file.write(f"{token} {components}\n")

# Load the embeddings into a FastText model
fasttext_model = FastText(vector_size=n_components, window=3, min_count=1)
fasttext_model.build_vocab([list(word_embeddings_dict.keys())])
# Stack the vectors in the model's own key order rather than the dict's
# iteration order, so that row i always belongs to wv.index_to_key[i] even if
# build_vocab arranged the vocabulary differently from the dict.
fasttext_model.wv.vectors = np.array(
    [word_embeddings_dict[key] for key in fasttext_model.wv.index_to_key]
)
# NOTE(review): FastText also keeps its own word/n-gram weights; confirm that
# vectors assigned directly to wv.vectors are what a later FastText.load()
# returns in the installed gensim version — TODO verify.

# Save the FastText model
fasttext_model.save("svd_fasttext_lemma_model.bin")


In [None]:
# from gensim.models import FastText
# import numpy as np

# # Load the trained FastText model
# model = FastText.load("svd_fasttext_lemma_model.bin")

In [None]:
# # Function to retrieve word embeddings
# def get_word_embedding(word):
#     if word in model.wv:
#         return model.wv[word]
#     else:
#         print(f"Word '{word}' not in vocabulary.")
#         return None

# # Function to compute cosine similarity between two word embeddings
# def cosine_similarity(vec1, vec2):
#     if vec1 is not None and vec2 is not None:
#         return np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))
#     else:
#         return None

# # Test words
# word1 = 'sa₂'
# word2 = 'in-da-sa₂'

# # Retrieve embeddings
# embedding1 = get_word_embedding(word1)
# embedding2 = get_word_embedding(word2)

# # Compute similarity
# similarity = cosine_similarity(embedding1, embedding2)

# # Print results
# print(f"Embedding for '{word1}': {embedding1}")
# print(f"Embedding for '{word2}': {embedding2}")
# print(f"Cosine similarity between '{word1}' and '{word2}': {similarity}")

# # Test with words not in the vocabulary
# word3 = 'non_existent_word'
# embedding3 = get_word_embedding(word3)
