Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We'll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Deep memory fixes #2662

Merged
merged 11 commits into from
Oct 24, 2023
Merged
99 changes: 47 additions & 52 deletions deeplake/core/vectorstore/deep_memory.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import uuid
from collections import defaultdict
from typing import Any, Dict, Optional, List, Union, Callable, Tuple
from time import time

Expand Down Expand Up @@ -404,26 +405,20 @@
]:
eval_type = "with" if use_model else "without"
print(f"---- Evaluating {eval_type} model ---- ")
callect_data = False
for k in top_k:
callect_data = k == 10

recall, queries_dict = recall_at_k(
self.dataset,
indra_dataset,
relevance,
top_k=k,
query_embs=query_embs,
metric=metric,
collect_data=callect_data,
use_model=use_model,
)
avg_recalls, queries_dict = recall_at_k(
indra_dataset,
relevance,
top_k=top_k,
query_embs=query_embs,
metric=metric,
use_model=use_model,
)

if callect_data:
queries_data.update(queries_dict)
queries_data.update(queries_dict)

print(f"Recall@{k}:\t {100*recall: .1f}%")
recalls[f"{eval_type} model"][f"recall@{k}"] = recall
for recall, recall_value in avg_recalls.items():
print(f"Recall@{recall}:\t {100*recall_value: .1f}%")
recalls[f"{eval_type} model"][f"recall@{recall}"] = recall_value

log_queries = parsed_qvs_params.get("log_queries")
branch = parsed_qvs_params.get("branch")
Expand Down Expand Up @@ -454,16 +449,14 @@


def recall_at_k(
dataset: Dataset,
indra_dataset: Any,
relevance: List[List[Tuple[str, int]]],
query_embs: Union[List[np.ndarray], List[List[float]]],
metric: str,
top_k: int = 10,
collect_data: bool = False,
top_k: List[int] = [1, 3, 5, 10, 50, 100],
use_model: bool = False,
):
recalls = []
recalls = defaultdict(list)
top_k_list = []

for query_idx, _ in enumerate(query_embs):
Expand All @@ -473,54 +466,56 @@
correct_labels = [rel[0] for rel in query_relevance]

# Compute the cosine similarity between the query and all data points
view_top_k = get_view_top_k(
view = get_view(
metric=metric,
query_emb=query_emb,
top_k=top_k,
indra_dataset=indra_dataset,
)

top_k_retrieved = [
sample.id.numpy() for sample in view_top_k
] # TODO: optimize this

# Compute the recall: the fraction of relevant items found in the top k
num_relevant_in_top_k = len(
set(correct_labels).intersection(set(top_k_retrieved))
)
if len(correct_labels) == 0:
continue
recall = num_relevant_in_top_k / len(correct_labels)

if collect_data:
top_k_list.append(top_k_retrieved)
recalls.append(recall)
for k in top_k:
collect_data = k == 10
view_top_k = view[:k]

# Average the recalls for each query
avg_recall = np.mean(np.array(recalls))
queries_data = {}
if collect_data:
model_type = "deep_memory" if use_model else "vector_search"
top_k_retrieved = [
sample.id.numpy() for sample in view_top_k
] # TODO: optimize this

queries_data = {
f"{model_type}_top_10": top_k_list,
f"{model_type}_recall": recalls,
}
return avg_recall, queries_data
# Compute the recall: the fraction of relevant items found in the top k
num_relevant_in_top_k = len(
set(correct_labels).intersection(set(top_k_retrieved))
)
if len(correct_labels) == 0:
continue

Check warning on line 488 in deeplake/core/vectorstore/deep_memory.py

View check run for this annotation

Codecov / codecov/patch

deeplake/core/vectorstore/deep_memory.py#L488

Added line #L488 was not covered by tests
recall = num_relevant_in_top_k / len(correct_labels)

if collect_data:
top_k_list.append(top_k_retrieved)
recalls[k].append(recall)

def get_view_top_k(
# Average the recalls for each query
avg_recalls = {
f"{recall}": np.mean(np.array(recall_list))
for recall, recall_list in recalls.items()
}
model_type = "deep_memory" if use_model else "vector_search"
queries_data = {
f"{model_type}_top_10": top_k_list,
f"{model_type}_recall": recalls[10],
}
return avg_recalls, queries_data


def get_view(
    metric: str,
    query_emb: Union[List[float], np.ndarray],
    top_k: int,
    indra_dataset: Any,
    return_tensors: Optional[List[str]] = None,
    tql_filter: str = "",
):
    """Run a TQL similarity query against ``indra_dataset`` and return the view.

    Args:
        metric: Name of the TQL similarity function to rank by
            (e.g. ``"cosine_similarity"``).
        query_emb: Query embedding; its values are interpolated into the TQL
            ``ARRAY[...]`` literal.
        top_k: Unused; kept for backward compatibility. The query always
            fetches the top 100 rows — presumably the largest k evaluated —
            and callers slice the returned view down to k themselves.
        indra_dataset: Object exposing ``query(tql)`` that executes the TQL
            string and returns a view.
        return_tensors: Tensor names to SELECT. Defaults to
            ``["text", "metadata", "id"]``.
        tql_filter: Optional TQL predicate; when non-empty it is injected as a
            ``where`` clause.

    Returns:
        The view object produced by ``indra_dataset.query(tql)``.
    """
    if return_tensors is None:
        # Build the default here rather than in the signature to avoid a
        # shared mutable default argument.
        return_tensors = ["text", "metadata", "id"]
    tql_filter_str = tql_filter if tql_filter == "" else " where " + tql_filter
    query_emb_str = ",".join(f"{q}" for q in query_emb)
    return_tensors_str = ", ".join(return_tensors)
    # NOTE: the limit is fixed at 100; ``top_k`` is intentionally ignored.
    tql = f"SELECT * FROM (SELECT {return_tensors_str}, ROW_NUMBER() as indices, {metric}(embedding, ARRAY[{query_emb_str}]) as score {tql_filter_str} order by {metric}(embedding, ARRAY[{query_emb_str}]) desc limit 100)"
    indra_view = indra_dataset.query(tql)
    return indra_view

Expand Down