In [4]:
!pip install tqdm



In [5]:
import numpy as np
from scipy.linalg import svd
import pandas as pd
from tqdm import tqdm

In [6]:
# Load the corpora
corpus_1 = open('/content/pre_processed_corpus/Corpus.txt', 'r', encoding='utf-8').readlines()

# Concatenate the corpora
corpus = corpus_1

# Create the vocabulary
vocab = list(set(" ".join(corpus).split()))
vocab_size = len(vocab)
word_to_index = {word: i for i, word in enumerate(vocab)}

# Create the co-occurrence matrix
co_occurrence = np.zeros((vocab_size, vocab_size))

# Add a progress bar to show the percentage of the corpus processed
for sentence in tqdm(corpus, desc="Processing corpus", unit=" sentences"):
    words = sentence.split()
    for i in range(len(words)):
        for j in range(max(0, i-1), min(len(words), i+2)):
            if i!= j:
                co_occurrence[word_to_index[words[i]], word_to_index[words[j]]] += 1

# Perform SVD with a progress bar
with tqdm(total=3, desc="Performing SVD") as pbar:
    U, Sigma, Vt = svd(co_occurrence)
    pbar.update(1)

# Choose the best rank approximation (e.g., 100)
k = 100

# Perform matrix operations with progress bars
with tqdm(total=2, desc="Performing matrix operations") as pbar:
    U_k = U[:, :k]
    pbar.update(1)
    Sigma_k = np.diag(Sigma[:k])
    pbar.update(1)
    Vt_k = Vt[:k, :]

# Get the word representations with a progress bar
with tqdm(total=1, desc="Computing word representations") as pbar:
    word_representations = np.dot(U_k, Sigma_k)
    pbar.update(1)

# Create a DataFrame to store the word representations
df = pd.DataFrame(word_representations, index=vocab)

# Save the word representations to a CSV file
df.to_csv('/content/word_representations_SVD.csv', header=False, index=True, encoding='utf-8')

Processing corpus: 100%|██████████| 150/150 [00:00<00:00, 17310.38 sentences/s]
Performing SVD:  33%|███▎      | 1/3 [00:00<00:01,  1.26it/s]
Performing matrix operations: 100%|██████████| 2/2 [00:00<00:00, 5890.88it/s]
Computing word representations: 100%|██████████| 1/1 [00:00<00:00, 437.32it/s]
