# Project 1 Task 1

In [None]:
from nltk.stem import PorterStemmer
import nltk
from src.tokenization import *
from src.vectorization import *

# Run-mode switches read by the cells below.
F_reduced_dataset = False  # If True, load only 1% of the corpus and a small portion of the queries and train_set
F_do_tokenization = False  # If True, tokenize corpus + queries; otherwise load the already tokenized documents

In [None]:
# Load data
# load_task1_data is not defined in this notebook; presumably it is provided by one of the
# wildcard `src` imports — confirm there how F_reduced_dataset changes what gets loaded.
corpus, queries, train_set, test_set = load_task1_data(F_reduced_dataset)

In [None]:
# Tokenization
# Directory handed to the save/load helpers for the tokenized corpus and queries.
token_dir = "Data/tokens/"
nltk.download('stopwords')
stemmer = PorterStemmer()

# Either reuse previously saved tokens, or tokenize from scratch and save the result.
# NOTE(review): none of these helpers return a value here, so they presumably update
# `corpus` and `queries` in place — confirm in src.tokenization.
if not F_do_tokenization:
    load_tokenized_corpus_queries(token_dir, corpus, queries)
else:
    tokenize_corpus_queries(corpus, queries, stemmer)
    save_tokenized_corpus_queries(token_dir, corpus, queries)

vectorized_corpus, vectorized_queries = vectorize_corpus_queries(corpus, queries)

In [None]:
# TODO: find the top 10 relevant documents for each test query.
# For now this cell only displays the vectorized queries for inspection.
vectorized_queries

#### Test: PCA-style retrieval (TruncatedSVD on TF-IDF)

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import csr_matrix
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import TruncatedSVD

# Read the raw corpus, the queries and the Task 1 test split from Data/raw
corpus = pd.read_json("Data/raw/corpus.jsonl", lines=True)
queries = pd.read_json("Data/raw/queries.jsonl", lines=True)
test_1 = pd.read_csv("Data/raw/task1_test.tsv", sep='\t')

# Keep only the queries whose id is referenced by the test split
unique_query_ids = set(test_1['query-id'])
is_test_query = queries['_id'].isin(unique_query_ids)
filtered_queries = queries[is_test_query]

In [3]:

# Data preprocessing and feature engineering can be done here.
# Vectorize the corpus and queries as sparse TF-IDF matrices. The vectorizer is fitted
# on the corpus only and then reused to transform the queries, so both share one vocabulary.
tfidf_vectorizer = TfidfVectorizer()
corpus_tfidf = csr_matrix(tfidf_vectorizer.fit_transform(corpus['text']))
query_tfidf = csr_matrix(tfidf_vectorizer.transform(filtered_queries['text']))

# Project both matrices onto the same 300 SVD components (fitted on the corpus).
# TruncatedSVD's default solver is randomized, so the seed is fixed to make
# Restart & Run All reproduce the same projection.
clf = TruncatedSVD(300, random_state=0)
corpus_pca = clf.fit_transform(corpus_tfidf)
query_pca = clf.transform(query_tfidf)
# `.2f` (two decimals) — the previous `2f` meant "minimum width 2" and printed six decimals.
print(f"Variance explained : {clf.explained_variance_ratio_.sum()*100:.2f}%")

: 

In [None]:
# Batched cosine-similarity search: for every query keep the `top_k` best corpus
# documents of each corpus batch, then pick the overall `top_k` among those candidates.
top_k = 10
batch_size = 50000
num_queries = query_pca.shape[0]
num_corpus_docs = corpus_pca.shape[0]
corpus_ids = corpus['_id'].to_list()
# Converted once so every batch can be fancy-indexed without rebuilding an array.
corpus_id_array = np.array(corpus_ids)

# Per-batch candidates are collected in lists and stacked once after the loop
# (stacking inside the loop re-copies everything accumulated so far on each batch).
index_chunks = []
similarity_chunks = []

for i in range(0, num_corpus_docs, batch_size):
    start = i
    end = min(i + batch_size, num_corpus_docs)
    batch_corpus_pca = corpus_pca[start:end]
    batch_similarity = cosine_similarity(query_pca, batch_corpus_pca)

    # The last batch may hold fewer than top_k documents; argpartition raises if
    # asked for more columns than exist, so clamp k to the batch width.
    batch_k = min(top_k, end - start)

    # Positions (within the batch) of the batch_k most similar documents per query.
    top_10_batch_indices = np.argpartition(batch_similarity, -batch_k, axis=1)[:, -batch_k:]
    # Read the scores through the SAME positions. A separate np.partition call gives
    # no guarantee that its column order matches argpartition's, which could pair a
    # document id with another document's score.
    top_10_batch_similarities = np.take_along_axis(batch_similarity, top_10_batch_indices, axis=1)
    # Translate batch-local positions into corpus ids.
    top_10_batch_corpus_indices = corpus_id_array[start:end][top_10_batch_indices]

    index_chunks.append(top_10_batch_corpus_indices)
    similarity_chunks.append(top_10_batch_similarities)
    if i%100000==0:
        print(f"{i/num_corpus_docs*100:.2f}%")

# Candidate corpus ids and their similarity scores, one row per query.
if index_chunks:
    top_10_indices = np.hstack(index_chunks)
    top_10_similarities = np.hstack(similarity_chunks)
else:
    # Empty corpus: keep the (num_queries, 0) shape the rest of the notebook expects.
    top_10_indices = np.zeros((num_queries, 0), dtype=int)
    top_10_similarities = np.zeros((num_queries, 0))

# Overall selection: column positions of the best candidates per query, ordered from
# most to least similar. The candidate pool is small (at most top_k per batch), so a
# full stable argsort is cheap and also copes with fewer than top_k candidates.
final_k = min(top_k, top_10_similarities.shape[1])
indices = np.argsort(-top_10_similarities, axis=1, kind="stable")[:, :final_k]
overall_top_10_indices = [
    candidate_ids[ranking] for candidate_ids, ranking in zip(top_10_indices, indices)
]
print("100%")


In [None]:
# For every query, pick the selected candidates out of its row of candidate corpus ids
# and stack the per-query results into a single array.
overall_top_10_indices = np.array(
    [candidate_ids[selected] for candidate_ids, selected in zip(top_10_indices, indices)]
)

In [None]:
# Inspect the corpus ids selected for the first query
overall_top_10_indices[0]