# Project 1 Task 1

In [None]:
from nltk.stem import PorterStemmer
import nltk
import pickle
from src.utils import *
from scipy.sparse import lil_matrix, csr_matrix
from scipy.sparse.linalg import norm as snorm

In [None]:
# Load the task 1 data (F_reduced_dataset=False presumably selects the full
# dataset rather than a reduced one - confirm in src.utils).
corpus, queries, train_set, test_set = load_task1_data(F_reduced_dataset=False)

# Keep only the queries whose index value appears in test_set.
unique_query_ids = set(test_set.to_list())
in_test_set = queries.index.isin(unique_query_ids)
queries = queries[in_test_set]

In [None]:
# Tokenization setup: fetch the NLTK 'stopwords' resource, create the stemmer
# and point at the directory used for the tokenized data.
nltk.download('stopwords')
stemmer = PorterStemmer()
token_dir = "Data/tokens/"

# Set to True to tokenize corpus and queries again and save the result;
# otherwise the previously saved tokens are loaded from token_dir.
retokenize = False
if not retokenize:
    corpus, queries = load_tokenized_corpus_queries(token_dir, corpus, queries)
else:
    tokenize_corpus_queries(corpus, queries, stemmer)
    save_tokenized_corpus_queries(token_dir, corpus, queries)

In [None]:
# Vocabulary: every distinct token found in the corpus, in sorted order.
vocab = sorted({token for tokens in corpus['tokens'].to_list() for token in tokens})

# Set to False to reuse the counts saved by an earlier run instead of
# recounting. NOTE: the files carry an .npz extension but are written and read
# with pickle, not with numpy's archive format.
recount = True
if recount:
    # Count presence of each term in each document / query, then cache both.
    count_corpus, count_queries = count_terms(corpus, queries, vocab)

    to_save = (
        ("Data/tfidf/count_corpus.npz", count_corpus),
        ("Data/tfidf/count_queries.npz", count_queries),
    )
    for path, counts in to_save:
        with open(path, "wb") as f:
            pickle.dump(counts, f)

else:
    with open("Data/tfidf/count_corpus.npz", "rb") as f:
        count_corpus = pickle.load(f)
    with open("Data/tfidf/count_queries.npz", "rb") as f:
        count_queries = pickle.load(f)

In [None]:
# TF-IDF weights of the corpus, with pivoted unique-term normalisation:
#   tf(d, t)    = (count(d, t) / max_count(d))
#                 / ((1 - s) * avg_unique_terms + s * unique_terms(d))
#   idf(t)      = log(D / n_doc_with_term(t))
#   tfidf(d, t) = tf(d, t) * idf(t)
# Everything is computed on the flat (row, col, value) arrays of the count
# matrix so that no Python-level loop over the non-zero entries is needed.

# Dimensions
T = len(vocab)
D = len(corpus)
Q = len(queries)

# Coordinates and values of the non-zero counts (csr_matrix() accepts a dense
# array or any scipy sparse format; tocoo() returns a copy, so count_corpus
# itself is left untouched).
count_coo = csr_matrix(count_corpus).tocoo()
stored = count_coo.data != 0  # ignore explicitly stored zeros, like nonzero()
doc_indices = count_coo.row[stored]
term_indices = count_coo.col[stored]
count_values = count_coo.data[stored].astype(np.float64)
indices = np.column_stack((doc_indices, term_indices))

# N doc with term t
n_doc_with_term = np.bincount(term_indices, minlength=T).astype(np.float64)  # T

# N of different term for each doc, + mean
n_different_term = np.bincount(doc_indices, minlength=D).astype(np.float64)  # D
n_unique_avg = np.mean(n_different_term)  # 1

# Highest count of any single term in each doc (0 for docs without terms)
highest_count = np.zeros(D)  # D
np.maximum.at(highest_count, doc_indices, count_values)

# IDF = log(D / df) = log(D) - log(df). The log of the document frequency is
# essential: subtracting the raw frequency makes the weight of every term
# present in more than ~log(D) documents strongly negative.
# Terms that occur in no document get an IDF of 0 instead of +inf.
log_D = np.full(shape=(T), fill_value=np.log(D))
idf = np.where(
    n_doc_with_term > 0,
    log_D - np.log(np.maximum(n_doc_with_term, 1.0)),
    0.0,
)  # T

# TF
s = 0.2
pivot = (1 - s) * n_unique_avg + s * n_different_term  # D
# Only documents with at least one stored count are indexed here, so
# highest_count and pivot are both > 0 for every entry.
tf_values = count_values / highest_count[doc_indices] / pivot[doc_indices]
# Kept as float32 LIL so the variable keeps the type it always had.
tf_corpus = csr_matrix(
    (tf_values, (doc_indices, term_indices)), shape=(D, T), dtype=np.float32
).tolil()

# TFIDF
tfidf_values = tf_values * idf[term_indices]
tfidf_corpus = csr_matrix(
    (tfidf_values, (doc_indices, term_indices)), shape=(D, T), dtype=np.float32
).tolil()

# NOTE: the file has an .npz extension but is a pickle, not a numpy archive.
with open("Data/tfidf/tfidf_corpus.npz", "wb") as f:
    pickle.dump(tfidf_corpus, f)

In [None]:
# TF-IDF weights of the queries, using the same formula as for the corpus.
# `T`, `Q`, `idf` and `n_unique_avg` are the values computed from the corpus
# in the previous cell. All per-entry work is done on flat NumPy arrays rather
# than with one sparse lookup / LIL assignment per non-zero entry.

# Indices and values of the non-zero counts (tocoo() returns a copy, so
# count_queries itself is left untouched)
query_coo = csr_matrix(count_queries).tocoo()
stored = query_coo.data != 0  # ignore explicitly stored zeros, like nonzero()
query_indices = query_coo.row[stored]
term_indices = query_coo.col[stored]
count_values = query_coo.data[stored].astype(np.float64)
indices = np.column_stack((query_indices, term_indices))

# Highest count of any single term in each query (0 for empty queries)
highest_count = np.zeros(Q)
np.maximum.at(highest_count, query_indices, count_values)

# N of different term for each query
n_different_term = np.bincount(query_indices, minlength=Q).astype(np.float64)

# TF (the pivot uses the corpus-wide average number of unique terms)
s = 0.2
pivot = (1 - s) * n_unique_avg + s * n_different_term  # Q
# Only queries with at least one stored count are indexed here, so
# highest_count is > 0 for every entry.
tf_values = count_values / highest_count[query_indices] / pivot[query_indices]
# Kept as float32 LIL so the variable keeps the type it always had.
tf_queries = csr_matrix(
    (tf_values, (query_indices, term_indices)), shape=(Q, T), dtype=np.float32
).tolil()

# TFIDF
tfidf_values = tf_values * idf[term_indices]
tfidf_queries = csr_matrix(
    (tfidf_values, (query_indices, term_indices)), shape=(Q, T), dtype=np.float32
).tolil()

# NOTE: the file has an .npz extension but is a pickle, not a numpy archive.
with open("Data/tfidf/tfidf_queries.npz", "wb") as f:
    pickle.dump(tfidf_queries, f)

In [None]:
# Reload the TF-IDF matrices written by the two cells above.
# NOTE: these are pickle files despite the .npz extension; only unpickle files
# that were produced locally.
with open("Data/tfidf/tfidf_corpus.npz", "rb") as corpus_file:
    tfidf_corpus = pickle.load(corpus_file)

with open("Data/tfidf/tfidf_queries.npz", "rb") as queries_file:
    tfidf_queries = pickle.load(queries_file)

In [None]:
# Euclidean norm of every document / query TF-IDF vector, needed to turn dot
# products into cosine similarities.
D = tfidf_corpus.shape[0]
Q = tfidf_queries.shape[0]

# CSR is the efficient format for the row-wise reductions below.
tfidf_corpus = csr_matrix(tfidf_corpus)
tfidf_queries = csr_matrix(tfidf_queries)

# One call per matrix: with axis=1 the sparse norm returns the 2-norm of each
# row, replacing a Python loop that sliced and normed the rows one by one.
corpus_norms = np.asarray(snorm(tfidf_corpus, axis=1), dtype=np.float64).ravel()  # D
queries_norms = np.asarray(snorm(tfidf_queries, axis=1), dtype=np.float64).ravel()  # Q

In [None]:
# Cosine similarity between every document and every query:
#   similarities[d, q] = <tfidf_corpus[d], tfidf_queries[q]>
#                        / (corpus_norms[d] * queries_norms[q])
# The dot products of all (document, query) pairs are one sparse matrix
# product; only pairs sharing at least one term produce a stored entry.
# NOTE: the (D, Q) result can still be large; if it does not fit in memory,
# compute it for slices of the query rows instead.
k_top = 10

corpus_csr = csr_matrix(tfidf_corpus)
queries_csr = csr_matrix(tfidf_queries)
dot_products = (corpus_csr @ queries_csr.T).tocoo()  # D x Q

# Reciprocal norms, with 0 for all-zero vectors so they never divide by zero
# (such rows/columns have no dot products anyway).
inv_corpus_norms = np.zeros(corpus_norms.shape, dtype=np.float64)
np.divide(1.0, corpus_norms, out=inv_corpus_norms, where=corpus_norms > 0)
inv_queries_norms = np.zeros(queries_norms.shape, dtype=np.float64)
np.divide(1.0, queries_norms, out=inv_queries_norms, where=queries_norms > 0)

cosine_values = (
    dot_products.data
    * inv_corpus_norms[dot_products.row]
    * inv_queries_norms[dot_products.col]
)

# Kept as float32 LIL so the variable keeps the type it always had.
similarities = csr_matrix(
    (cosine_values, (dot_products.row, dot_products.col)),
    shape=dot_products.shape,
    dtype=np.float32,
).tolil()

#### TEST PCA

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import csr_matrix
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import TruncatedSVD, IncrementalPCA
import pickle

In [None]:
# Read the raw corpus, the queries and the task 1 test split.
corpus = pd.read_json("Data/raw/corpus.jsonl", lines=True)
queries = pd.read_json("Data/raw/queries.jsonl", lines=True)
test_1 = pd.read_csv("Data/raw/task1_test.tsv", sep='\t')

# Restrict the queries to those whose '_id' occurs in the test split.
unique_query_ids = set(test_1['query-id'])
is_test_query = queries['_id'].isin(unique_query_ids)
filtered_queries = queries[is_test_query]

In [None]:
# Vectorize the corpus and the test queries as sparse TF-IDF matrices.
# The vectorizer is fitted on the corpus only and then applied to the queries,
# so both share the same vocabulary and IDF weights.
tfidf_vectorizer = TfidfVectorizer()
corpus_tfidf = csr_matrix(tfidf_vectorizer.fit_transform(corpus['text']))
query_tfidf = csr_matrix(tfidf_vectorizer.transform(filtered_queries['text']))

# Reduce both matrices to n_dim dimensions with an incremental PCA fitted on
# the corpus.
n_dim = 500
clf = IncrementalPCA(n_components=n_dim, batch_size=500)
corpus_pca = clf.fit_transform(corpus_tfidf)
query_pca = clf.transform(query_tfidf)
# Total share of variance kept by the n_dim components, with two decimals
# (".2f"; "2f" would only set a minimum width and print six decimals).
print(f"Variance explained : {clf.explained_variance_ratio_.cumsum()[-1]*100:.2f}%")

In [None]:

# Vectorize the corpus and the test queries as sparse TF-IDF matrices.
# The vectorizer is fitted on the corpus only and then applied to the queries,
# so both share the same vocabulary and IDF weights.
tfidf_vectorizer = TfidfVectorizer()
corpus_tfidf = csr_matrix(tfidf_vectorizer.fit_transform(corpus['text']))
query_tfidf = csr_matrix(tfidf_vectorizer.transform(filtered_queries['text']))

# Reduce both matrices to n_dim dimensions with a truncated SVD fitted on the
# corpus (the result keeps the *_pca names used by the following cells).
n_dim = 600
clf = TruncatedSVD(n_dim, algorithm='arpack')
corpus_pca = clf.fit_transform(corpus_tfidf)
query_pca = clf.transform(query_tfidf)
# Total share of variance kept by the n_dim components, with two decimals
# (".2f"; "2f" would only set a minimum width and print six decimals).
print(f"Variance explained : {clf.explained_variance_ratio_.cumsum()[-1]*100:.2f}%")  # approx. 24% for n_dim = 600

In [None]:
# Write the reduced corpus and query matrices to .npy files ...
np.save('corpus_pca_600.npy', corpus_pca)
np.save('query_pca_600.npy', query_pca)

# ... and read them back, so the cells below can also be run from the saved
# files alone.
corpus_pca = np.load('corpus_pca_600.npy')
query_pca = np.load('query_pca_600.npy')


In [None]:
# Retrieve, for every query, the ids of the top_k corpus documents with the
# highest cosine similarity in the reduced space. The corpus is processed in
# batches so that only a (num_queries x batch_size) similarity block is held
# in memory at a time: each batch contributes its own top_k candidates, and
# the overall top_k is then selected among all candidates.
top_k = 10
batch_size = 50000
num_queries = query_pca.shape[0]
num_corpus_docs = corpus_pca.shape[0]
corpus_ids = corpus['_id'].to_list()
corpus_id_array = np.asarray(corpus_ids)  # converted once, sliced per batch

# Per-batch candidates: document ids and their similarity scores
candidate_ids = []
candidate_similarities = []

for start in range(0, num_corpus_docs, batch_size):
    end = min(start + batch_size, num_corpus_docs)
    batch_similarity = cosine_similarity(query_pca, corpus_pca[start:end])

    # A batch may hold fewer than top_k documents (e.g. a short last batch)
    k_batch = min(top_k, end - start)
    top_10_batch_indices = np.argpartition(batch_similarity, -k_batch, axis=1)[:, -k_batch:]

    # Read the scores through the same index array as the ids, so that score j
    # always belongs to id j (a separate np.partition call does not guarantee
    # the same order inside the selected slice).
    candidate_ids.append(corpus_id_array[start:end][top_10_batch_indices])
    candidate_similarities.append(
        np.take_along_axis(batch_similarity, top_10_batch_indices, axis=1)
    )
    if start % 100000 == 0:
        print(f"{start/num_corpus_docs*100:.2f}%")

# Candidate ids and scores of all batches, one row per query
if candidate_ids:
    top_10_indices = np.hstack(candidate_ids)
    top_10_similarities = np.hstack(candidate_similarities)
else:
    # Empty corpus: no candidates at all
    top_10_indices = np.zeros((num_queries, 0), dtype=int)
    top_10_similarities = np.zeros((num_queries, 0))

# Overall top_k per query, ordered from most to least similar
k_final = min(top_k, top_10_similarities.shape[1])
indices = np.argsort(-top_10_similarities, axis=1, kind="stable")[:, :k_final]
overall_top_10_indices = list(np.take_along_axis(top_10_indices, indices, axis=1))
print("100%")


In [None]:
# Show the corpus document ids selected for the first query.
overall_top_10_indices[0]