In [1]:
import pandas as pd
import torchtext
from torchtext.data import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator, Vocab
import numpy as np
import html

In [2]:
filename = 'reviews_Movies_and_TV.json'
lim = 100000

In [2]:
with open(filename, "r") as f:
    with open("newfile_svd.json", "w") as fp:
        for i in range(lim):
            fp.write(f.readline())

NameError: name 'filename' is not defined

In [3]:
df = pd.read_json("newfile_svd.json", lines=True)

In [4]:
corpus = df["reviewText"].to_numpy()

In [9]:
corpus = corpus[:50000]

In [10]:
tokenizer = get_tokenizer("basic_english")

In [11]:
def clean_text(text):
    import re

    text = re.sub(r"([a-zA-Z]+)n[\'’]t", r"\1 not", text)
    text = re.sub(r"([iI])[\'’]m", r"\1 am", text)
    text = re.sub(r"([iI])[\'’]ll", r"\1 will", text)
    text = re.sub(r"[^a-zA-Z0-9\:\$\-\,\%\.\?\!]+", " ", text)
    text = html.unescape(text)
    # text = re.sub(r"([a-zA-Z]+)[\'’]s", r"\1 is", text)

    text = re.sub(r"_(.*?)_", r"\1", text)
    return text


In [12]:
tokens = []

for sent in corpus:
    tokens.append(tokenizer(clean_text(sent)))


In [13]:
# tokens = np.array(tokens)
len(tokens)

50000

In [14]:
MIN_WORD_FREQUENCY = 10

vocab = build_vocab_from_iterator(tokens, min_freq=MIN_WORD_FREQUENCY, specials=["<unk>"])
vocab.set_default_index(vocab["<unk>"])


In [15]:
vocab.get_stoi().keys()



In [16]:
len(vocab)

18486

In [17]:
vocab["titanic"]

3704

In [18]:
print(f"Total words in text: {len(tokens)}")
print(f"Unique words: {len(vocab)}")


Total words in text: 50000
Unique words: 18486


In [19]:
# from nltk.tokenize import sent_tokenize, word_tokenize
WINDOW_LENGTH = 4

In [20]:
n = len(vocab)
co_occ_matrix = np.zeros((n, n))


In [21]:
co_occ_matrix.shape

(18486, 18486)

In [22]:
def build_co_occ(sentence):
    sent = tokenizer(sentence)
    for idx, word in enumerate(sent):
        for context_id in range((max(0, idx - WINDOW_LENGTH)), (min(len(sent), idx + WINDOW_LENGTH + 1))):
            row = vocab[word]
            col = vocab[sent[context_id]]
            co_occ_matrix[row][col] += 1


In [23]:
for sentence in corpus:
    build_co_occ(sentence)


In [24]:
print("Co-occurrence Matrix: ")
print(co_occ_matrix)


Co-occurrence Matrix: 
[[6.48002e+05 1.46355e+05 1.34357e+05 ... 1.00000e+01 3.00000e+00
  5.00000e+00]
 [1.46355e+05 5.36099e+05 1.36321e+05 ... 9.00000e+00 8.00000e+00
  1.00000e+00]
 [1.34357e+05 1.36321e+05 4.52751e+05 ... 3.00000e+00 2.00000e+00
  8.00000e+00]
 ...
 [1.00000e+01 9.00000e+00 3.00000e+00 ... 1.00000e+01 0.00000e+00
  0.00000e+00]
 [3.00000e+00 8.00000e+00 2.00000e+00 ... 0.00000e+00 1.00000e+01
  0.00000e+00]
 [5.00000e+00 1.00000e+00 8.00000e+00 ... 0.00000e+00 0.00000e+00
  1.00000e+01]]


In [25]:
def normalize_co_occurrence_matrix(co_occ_matrix):
    """
    Normalize co-occurrence matrix using PPMI (Positive Pointwise Mutual Information).

    Args:
        co_occ_matrix (np.array): Co-occurrence matrix.

    Returns:
        np.array: Normalized co-occurrence matrix.
    """
    # calculate sum of all elements in the matrix
    total_sum = np.sum(co_occ_matrix)
    
    # calculate row and column sums
    row_sums = np.sum(co_occ_matrix, axis=1)
    col_sums = np.sum(co_occ_matrix, axis=0)
    
    # calculate PPMI values
    pmi_matrix = np.log((co_occ_matrix * total_sum) / (np.outer(row_sums, col_sums) + 1e-6))
    ppmi_matrix = np.maximum(pmi_matrix, 0)
    
    # normalize using row-wise L2 norm
    normalized_matrix = ppmi_matrix / np.linalg.norm(ppmi_matrix, ord=2, axis=1, keepdims=True)
    
    return normalized_matrix

In [26]:
# co_occ_matrix = normalize_co_occurrence_matrix(co_occ_matrix)

In [27]:
co_occ_matrix

array([[6.48002e+05, 1.46355e+05, 1.34357e+05, ..., 1.00000e+01,
        3.00000e+00, 5.00000e+00],
       [1.46355e+05, 5.36099e+05, 1.36321e+05, ..., 9.00000e+00,
        8.00000e+00, 1.00000e+00],
       [1.34357e+05, 1.36321e+05, 4.52751e+05, ..., 3.00000e+00,
        2.00000e+00, 8.00000e+00],
       ...,
       [1.00000e+01, 9.00000e+00, 3.00000e+00, ..., 1.00000e+01,
        0.00000e+00, 0.00000e+00],
       [3.00000e+00, 8.00000e+00, 2.00000e+00, ..., 0.00000e+00,
        1.00000e+01, 0.00000e+00],
       [5.00000e+00, 1.00000e+00, 8.00000e+00, ..., 0.00000e+00,
        0.00000e+00, 1.00000e+01]])

In [28]:
df = pd.DataFrame(co_occ_matrix, index=vocab.get_stoi().keys(), columns=vocab.get_stoi().keys())


In [29]:
# Co-occurence matrix
df.head(10)


Unnamed: 0,zooms,zelda,zegart,yearbook,working-class,wires,wily,willed,wide-eyed,whirlwind,...,shani,actual,video,modern,ambitious,loneliness,whistle,out,cows,every
zooms,648002.0,146355.0,134357.0,133756.0,77774.0,62912.0,61217.0,47290.0,37526.0,45632.0,...,5.0,0.0,9.0,11.0,4.0,12.0,3.0,10.0,3.0,5.0
zelda,146355.0,536099.0,136321.0,60376.0,51543.0,61583.0,50426.0,43707.0,61056.0,60573.0,...,1.0,6.0,4.0,4.0,13.0,4.0,4.0,9.0,8.0,1.0
zegart,134357.0,136321.0,452751.0,101620.0,71634.0,29929.0,116499.0,51924.0,54195.0,23589.0,...,0.0,3.0,2.0,5.0,0.0,3.0,2.0,3.0,2.0,8.0
yearbook,133756.0,60376.0,101620.0,372585.0,74702.0,49278.0,40761.0,30746.0,39824.0,33442.0,...,7.0,7.0,3.0,4.0,6.0,2.0,3.0,6.0,1.0,7.0
working-class,77774.0,51543.0,71634.0,74702.0,198301.0,29473.0,29222.0,24857.0,22470.0,22083.0,...,5.0,3.0,4.0,2.0,3.0,7.0,5.0,3.0,0.0,3.0
wires,62912.0,61583.0,29929.0,49278.0,29473.0,177109.0,35560.0,23621.0,31330.0,20139.0,...,2.0,1.0,1.0,6.0,1.0,2.0,5.0,2.0,0.0,2.0
wily,61217.0,50426.0,116499.0,40761.0,29222.0,35560.0,169304.0,14273.0,19438.0,10372.0,...,3.0,1.0,1.0,3.0,0.0,0.0,0.0,2.0,1.0,1.0
willed,47290.0,43707.0,51924.0,30746.0,24857.0,23621.0,14273.0,153879.0,13875.0,20589.0,...,0.0,0.0,3.0,2.0,1.0,1.0,2.0,1.0,3.0,0.0
wide-eyed,37526.0,61056.0,54195.0,39824.0,22470.0,31330.0,19438.0,13875.0,127154.0,21082.0,...,0.0,0.0,1.0,1.0,2.0,0.0,2.0,1.0,2.0,0.0
whirlwind,45632.0,60573.0,23589.0,33442.0,22083.0,20139.0,10372.0,20589.0,21082.0,122787.0,...,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0


In [30]:
from numpy import array
from scipy.sparse.linalg import svds
from numpy import diag
from numpy import dot

co_occ_np = df.to_numpy()

U, s, VT = svds(co_occ_np, k=100)


In [27]:
# from sklearn.decomposition import IncrementalPCA
# from tqdm import tqdm

# incr_pca = IncrementalPCA(n_components=300, batch_size=1000)
# reviews_ = np.array_split(np.nan_to_num(co_occ_matrix), int(len(co_occ_matrix) / 1000))  # integer division

# for batch in tqdm(reviews_):
#     incr_pca.partial_fit(batch)

# U = incr_pca.components_.T


100%|██████████| 27/27 [02:48<00:00,  6.23s/it]


In [31]:
U_df = pd.DataFrame(U, index=vocab.get_stoi().keys())
U_df


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
zooms,-0.001276,0.003798,6.839778e-04,-0.000619,-0.003669,0.001280,0.003136,0.000571,-9.540583e-04,0.000577,...,-1.876944e-02,-0.031067,2.027219e-02,0.012022,-2.017350e-02,-0.057752,-2.241063e-01,2.700415e-01,0.705137,0.591592
zelda,-0.003155,0.002881,-9.690841e-04,-0.000480,0.003986,-0.003346,0.002020,0.003776,-1.507482e-03,0.000098,...,-1.693811e-02,-0.055463,-1.232426e-01,0.098986,2.592532e-02,-0.214762,1.003472e-01,5.884717e-01,-0.596280,0.453844
zegart,-0.000974,0.002625,1.305612e-02,0.005689,-0.001247,-0.003149,-0.007369,0.003759,4.900423e-03,-0.001420,...,1.989022e-01,-0.159493,7.731798e-02,-0.081831,4.692135e-04,-0.168677,-5.113932e-01,-5.795508e-01,-0.302446,0.412401
yearbook,-0.003694,0.000397,-1.519490e-03,0.001294,0.001039,-0.004094,0.002541,0.001677,6.486077e-04,-0.000733,...,-2.059019e-02,-0.013887,-9.318470e-02,-0.009026,-1.261578e-01,-0.355662,7.540110e-01,-4.072164e-01,0.107179,0.303967
working-class,-0.004462,0.003005,-2.467590e-04,0.003062,0.001968,0.000289,-0.001578,0.006057,-2.747387e-03,0.002155,...,-7.465117e-03,-0.007389,4.012310e-02,0.367661,8.149588e-01,0.332097,1.495876e-01,-1.441045e-01,-0.002896,0.175434
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
loneliness,0.000018,-0.000008,-6.439142e-05,-0.000072,-0.000017,-0.000013,-0.000038,-0.000006,2.901590e-05,-0.000005,...,7.859892e-06,-0.000014,-4.878341e-06,0.000016,2.844170e-05,0.000008,-2.617436e-06,4.996465e-06,0.000011,0.000012
whistle,0.000036,-0.000020,5.419515e-06,0.000045,-0.000016,0.000018,-0.000054,0.000021,6.696416e-07,0.000019,...,1.112112e-05,-0.000017,-5.135802e-06,0.000008,1.018663e-05,0.000020,9.585597e-06,-2.311262e-07,-0.000002,0.000007
out,-0.000013,0.000013,-9.141110e-07,-0.000005,0.000002,-0.000019,0.000013,0.000020,-6.697515e-06,-0.000003,...,4.668393e-07,0.000003,-7.859957e-06,0.000009,3.093290e-06,-0.000007,8.505260e-06,7.629463e-06,0.000002,0.000014
cows,0.000010,0.000015,1.454918e-05,0.000010,-0.000010,-0.000009,0.000032,0.000002,7.637732e-05,0.000041,...,-1.539550e-05,-0.000004,-7.981229e-06,-0.000011,-6.380811e-07,-0.000003,5.495851e-07,9.194426e-06,-0.000008,0.000008


In [32]:
from sklearn.metrics.pairwise import cosine_similarity


In [49]:
word = "camera"
word_index = vocab[word]


In [50]:
word_vector = U[word_index]


In [51]:
U.shape

(18486, 100)

In [52]:
# Calculate the cosine similarities between the chosen word and all other words
similarities = []
for i in range(U.shape[0]):
    simi = cosine_similarity(U[i].reshape(1, -1), [word_vector])
    similarities.extend(simi.flatten())


In [53]:
# Sort the similarities in descending order and return the top k words
k = 10
top_k_similar_words = np.argsort(similarities)[-k:]


In [54]:
top_k_similar_words


array([1611, 2736,  797, 4522, 4791, 3894, 2230, 6541,  746,  750])

In [55]:
index_to_words = vocab.get_itos()

In [56]:
list(reversed([index_to_words[idx] for idx in top_k_similar_words]))

['camera',
 'battle',
 'border',
 'chase',
 'piano',
 'bathroom',
 'tricks',
 'storyline',
 'wall',
 'trip']

In [95]:
np.save("svd_matrix2.npy", U)

In [40]:
V = np.load("svd_matrix2.npy", allow_pickle=True)

In [41]:
V.shape

(27035, 300)

In [42]:
U.shape

(27035, 300)

In [96]:
import pickle
with open('vocab2.pkl', 'wb') as f:
    pickle.dump(vocab, f)

In [44]:
with open('vocab2.pkl', 'rb') as f:
    vocab2 = pickle.load(f)

In [45]:
vocab2["titanic"]

1657