In [4]:
!pip install tqdm



In [5]:
import numpy as np
from scipy.linalg import svd
import pandas as pd
from tqdm import tqdm

In [8]:
corpus_1 = open('/content/pre_processed_corpus/Corpus.txt', 'r', encoding='utf-8').readlines()

# Concatenating the corpora
corpus = corpus_1
# we can define multiple corpora and concatenate them

# Creating the vocabulary
vocab = list(set(" ".join(corpus).split()))
vocab_size = len(vocab)
word_to_index = {word: i for i, word in enumerate(vocab)}

# Creating the co-occurrence matrix
co_occurrence = np.zeros((vocab_size, vocab_size))

# Processing corpus
for sentence in tqdm(corpus, desc="Processing corpus", unit=" sentences"):
    words = sentence.split()
    for i in range(len(words)):
        for j in range(max(0, i-1), min(len(words), i+2)):
            if i!= j:
                co_occurrence[word_to_index[words[i]], word_to_index[words[j]]] += 1

# Performing SVD
with tqdm(total=3, desc="Performing SVD") as pbar:
    U, Sigma, Vt = svd(co_occurrence)
    pbar.update(1)

#rank approximation (100)
k = 100

with tqdm(total=2, desc="Performing matrix operations") as pbar:
    U_k = U[:, :k]
    pbar.update(1)
    Sigma_k = np.diag(Sigma[:k])
    pbar.update(1)
    Vt_k = Vt[:k, :]

# Getting the word representations
with tqdm(total=1, desc="Computing word representations") as pbar:
    word_representations = np.dot(U_k, Sigma_k)
    pbar.update(1)

df = pd.DataFrame(word_representations, index=vocab)

df.to_csv('/content/word_representations_SVD.csv', header=False, index=True, encoding='utf-8')

Processing corpus: 100%|██████████| 150/150 [00:00<00:00, 18080.97 sentences/s]
Performing SVD:  33%|███▎      | 1/3 [00:02<00:04,  2.35s/it]
Performing matrix operations: 100%|██████████| 2/2 [00:00<00:00, 6533.18it/s]
Computing word representations: 100%|██████████| 1/1 [00:00<00:00, 102.16it/s]


In [None]:
import pandas as pd
df.to_excel('/content/word_representations_SVD.xlsx', encoding='utf-8', index=True)