In [18]:
import pandas as pd
import numpy as np
from pathlib import Path
import re
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm

from src.evaluation import evaluate_queries, precision_at_k, recall_at_k
from src.vector_database import VectorDatabase, ClusterDatabase
from src.helpers import process_query_results
from sklearn.cluster import KMeans
from pprint import pprint

In [7]:
# Sentence-embedding model; it is handed to the VectorDatabase and
# ClusterDatabase constructors in later cells.
model = SentenceTransformer("all-MiniLM-L6-v2")

In [8]:
def _index_docs_by_number(doc_dir):
    """Map the first integer found in each ``*.txt`` filename under ``doc_dir`` to that file's path."""
    return {
        int(re.search(r'\d+', txt_path.name).group()): txt_path
        for txt_path in doc_dir.glob("*.txt")
    }


# NOTE(review): absolute, machine-specific location — the notebook only runs
# as-is on a machine that has this directory.
base_path = Path("/Users/stighellemans/Desktop/Information_Retrieval/assignments/data")
base_small_doc_path = base_path / "full_docs_small"
base_large_doc_path = base_path / "full_docs"

# Document number -> path, for the small and the full collection.
small_docs = _index_docs_by_number(base_small_doc_path)
large_docs = _index_docs_by_number(base_large_doc_path)

# Query number -> query text for the small dev set.
small_queries = pd.read_csv(
    base_path / "dev_small_queries - dev_small_queries.csv",
    index_col="Query number",
).to_dict()["Query"]

# Relevance judgements for the small dev set, post-processed by the project helper.
small_query_results = process_query_results(
    pd.read_csv(base_path / "dev_query_results_small.csv", index_col="Query_number")
)

# Loaders for the full dev set and the test queries, currently disabled.
# NOTE(review): the disabled process_query_results call below passes two
# arguments, unlike the single-argument call above — confirm before re-enabling.
# large_queries = pd.read_csv(base_path / "dev_queries.tsv", delimiter="\t", index_col="Query number").to_dict()["Query"]
# large_query_results = pd.read_csv(base_path / "dev_query_results.csv", index_col="Query_number")
# large_query_results = process_query_results(large_queries, large_query_results)

# test_queries = pd.read_csv(base_path / "queries.csv", delimiter="\t", index_col="Query number").to_dict()["Query"]

In [9]:
# Flat vector database built around the shared encoder; its contents are
# loaded from "./results/small_db" (relative to the working directory).
db = VectorDatabase(model)
db.load_database("./results/small_db")

In [25]:
%%timeit
# Timed by the %%timeit magic at the top of this cell: evaluate the small dev
# queries against the flat database `db` with k_values 1, 3, 5 and 10.
evaluate_queries(small_queries, small_query_results, k_values=[1, 3, 5, 10], database=db)

359 ms ± 40.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [42]:
# Cluster-based database built around the same encoder as `db`.
# NOTE(review): this loads "./results/large_db", while the timing cell further
# down evaluates cluster_db with the *small* queries and relevance judgements —
# confirm that this is the intended database directory.
cluster_db = ClusterDatabase(model)
cluster_db.load_database("./results/large_db")

In [None]:
# Sanity check on two stored document vectors: dot product, Euclidean
# distance, and the two individual norms. If both norms are ~1, the dot
# product can be read directly as a cosine similarity.
vector1, vector2 = db.doc_vectors[1], db.doc_vectors[2]

(
    np.dot(vector1, vector2),
    np.linalg.norm(vector1 - vector2),
    np.linalg.norm(vector1),
    np.linalg.norm(vector2),
)

(0.060675777, 1.370638, 1.0, 0.99999994)

In [26]:
%%timeit
# Timed by the %%timeit magic at the top of this cell: same evaluation as for
# `db`, but against `cluster_db` with the extra keyword top_c=5.
evaluate_queries(small_queries, small_query_results, k_values=[1, 3, 5, 10], database=cluster_db, top_c=5)

416 ms ± 26.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [55]:
# Embed every small dev query and score it against every stored document:
# sims has one row per document and one column per query.
query_vectors = np.array(db.encoder.encode(list(small_queries.values())))
sims = np.dot(db.doc_vectors, query_vectors.T)

# The cut-off is defined in this cell so that it does not depend on a loop
# variable leaked by another cell (which is undefined on a fresh kernel).
top_k = 10

# Row positions of the top_k most similar documents, one column per query.
top_k_indices = np.argsort(-sims, axis=0)[:top_k]

sims.shape

(1557, 248)

In [24]:
# Partition the document vectors into n_clusters groups with k-means
# (fixed random_state so the labelling is repeatable).
n_clusters = 10
kmeans = KMeans(n_clusters=n_clusters, random_state=0)
kmeans.fit(db.doc_vectors)
cluster_labels = kmeans.labels_

# One boolean membership mask per cluster, reused for both splits below.
cluster_masks = [cluster_labels == cluster_id for cluster_id in range(n_clusters)]

# Per cluster: the member vectors, and the row positions of those vectors
# inside db.doc_vectors.
doc_vectors_split = [db.doc_vectors[mask] for mask in cluster_masks]
doc_ids_split = [np.flatnonzero(mask) for mask in cluster_masks]

In [41]:
# Query texts of the small dev set, in dict order.
queries = list(small_queries.values())
top_c = 4   # number of best-matching clusters kept per query
top_k = 10  # number of best-matching documents kept per ranking

# Exhaustive ranking over all documents: sims is (documents x queries) and
# top_k_indices holds the top_k document row positions per query column.
query_vectors = np.array(db.encoder.encode(queries))
sims = np.dot(db.doc_vectors, query_vectors.T)
top_k_indices = np.argsort(-sims, axis=0)[:top_k]


# Diagnostics for the first query: its top_k document positions, the set of
# cluster labels that exist, and the cluster label of each of those documents.
print(top_k_indices[:, 0])
print(np.unique(kmeans.labels_))
print(kmeans.labels_[top_k_indices[:, 0]])


# Compute similarity of cluster centers to queries
# (note: `sims` is rebound here and is now clusters x queries).
sims = np.dot(kmeans.cluster_centers_, query_vectors.T)
query_top_k_clusters = np.argsort(-sims, axis=0)[:top_c]

# Diagnostics for the first query: its similarity to every cluster centre and
# the labels of its top_c clusters, best first.
print(sims[:, 0])
print(query_top_k_clusters[: , 0])

# Candidate buffers, one column per query. Row block i (rows i*top_k up to
# (i+1)*top_k) holds the best documents found in the query's i-th ranked
# cluster. Slots start as sentinels (-1 / -inf) so that a slot no cluster
# fills can never outrank a real candidate, and the id buffer is integer-typed
# because it stores row positions into db.doc_vectors.
n_queries = len(small_queries)
retrieved_doc_ids = np.full((top_c * top_k, n_queries), -1, dtype=np.intp)
retrieved_sims = np.full((top_c * top_k, n_queries), -np.inf)

for c, (doc_vector_cluster, doc_ids_cluster) in enumerate(zip(doc_vectors_split, doc_ids_split)):
    for i in range(top_c):
        # Columns of the queries whose i-th ranked cluster is cluster c.
        queries_per_cluster = np.where(query_top_k_clusters[i, :] == c)[0]

        if queries_per_cluster.size == 0:
            continue

        # Similarity of every document in this cluster to those queries.
        cluster_sims = np.dot(doc_vector_cluster, query_vectors[queries_per_cluster].T)

        # A cluster may hold fewer than top_k documents; only as many rows as
        # were actually found are written, the rest keep their sentinels.
        top_k_cluster_indices = np.argsort(-cluster_sims, axis=0)[:top_k]
        n_found = top_k_cluster_indices.shape[0]
        rows = slice(i * top_k, i * top_k + n_found)

        # Map within-cluster positions back to positions in db.doc_vectors.
        retrieved_doc_ids[rows, queries_per_cluster] = doc_ids_cluster[top_k_cluster_indices]
        retrieved_sims[rows, queries_per_cluster] = cluster_sims[
            top_k_cluster_indices, np.arange(queries_per_cluster.size)
        ]

# Merge the per-cluster candidates: keep the top_k most similar per query.
top_k_indices = np.argsort(-retrieved_sims, axis=0)[:top_k]
retrieved_doc_ids = retrieved_doc_ids[top_k_indices, np.arange(n_queries)]

# A remaining sentinel means the searched clusters held fewer than top_k
# documents for some query; fail loudly instead of translating a bogus id.
if (retrieved_doc_ids < 0).any():
    raise ValueError(
        "Fewer than top_k candidate documents were found for at least one query; "
        "increase top_c or lower top_k."
    )

# Translate row positions into document identifiers via the database.
vectorized_translate = np.vectorize(db.translate_id)
top_k_indices = vectorized_translate(retrieved_doc_ids)

# Cut-offs at which precision and recall are reported, and the relevance
# judgements they are measured against.
k_values = [1, 3, 5, 10]
query_results = small_query_results


def _mean_precision_recall(k):
    """Return (mean precision@k, mean recall@k) over all small dev queries.

    Column ``n`` of ``top_k_indices`` is taken as the ranking for the n-th
    key of ``small_queries``.
    """
    precision_scores, recall_scores = [], []
    for column, query_id in enumerate(small_queries):
        relevant = query_results[query_id]
        retrieved = top_k_indices[:k, column]
        precision_scores.append(precision_at_k(list(relevant), list(retrieved), k))
        recall_scores.append(recall_at_k(list(relevant), list(retrieved), k))
    return np.mean(precision_scores), np.mean(recall_scores)


# Cut-off -> averaged metrics.
output = {}
for k in k_values:
    precisions, recalls = _mean_precision_recall(k)
    output[k] = {"Precision": precisions, "Recall": recalls}

output

[  52   45  271  326 1112   56 1358  894 1082 1226]
[0 1 2 3 4 5 6 7 8 9]
[9 9 2 9 2 9 2 9 1 9]
[ 0.0225224   0.01120792  0.02444224  0.00127908  0.00766633 -0.00567381
  0.01702095  0.01442289  0.00767733  0.06765988]
[9 2 0 6]


{1: {'Precision': 0.4435483870967742, 'Recall': 0.4375},
 3: {'Precision': 0.1948924731182796, 'Recall': 0.5725806451612904},
 5: {'Precision': 0.12580645161290321, 'Recall': 0.6129032258064516},
 10: {'Precision': 0.06774193548387096, 'Recall': 0.6612903225806451}}

{1: {'Precision': 0.0, 'Recall': 0.0},
 3: {'Precision': 0.0, 'Recall': 0.0},
 5: {'Precision': 0.0, 'Recall': 0.0},
 10: {'Precision': 0.0, 'Recall': 0.0}}