In [7]:
# Evaluation fixture for the query "python data".
# `retrieved` is the system's output in rank order (best first).
retrieved = [
    "doc1.txt",
    "doc2.txt",
    "doc3.txt",
]
# `relevant` is the ground truth: documents judged relevant for the query.
relevant = [
    "doc1.txt",
    "doc2.txt",
]

In [9]:
import math

# Confusion counts of the retrieved list against the ground truth.
# Membership is tested against sets so each lookup is O(1) instead of a list scan.
relevant_set = set(relevant)
retrieved_set = set(retrieved)
tp = sum(1 for d in retrieved if d in relevant_set)      # retrieved and relevant
fp = sum(1 for d in retrieved if d not in relevant_set)  # retrieved but not relevant
fn = sum(1 for d in relevant if d not in retrieved_set)  # relevant but not retrieved

# Precision = TP / (TP + FP); Recall = TP / (TP + FN).
# Each falls back to 0.0 when its denominator is zero (nothing retrieved, or
# no relevant documents) instead of raising ZeroDivisionError.
precision = tp / (tp + fp) if (tp + fp) else 0.0
recall = tp / (tp + fn) if (tp + fn) else 0.0

# F1 is the harmonic mean of precision and recall; it is 0.0 when both are
# zero (no overlap between retrieved and relevant).
f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0
e_measure = 1 - f1  # complement of F1 (E-measure with equal precision/recall weight)

In [10]:
# NDCG input: binary gain per rank position (1 = relevant hit, 0 = miss),
# in the same order as the retrieved list.
relevance_scores = [int(d in relevant) for d in retrieved]

def dcg(scores):
    """Return the discounted cumulative gain of a ranked list of gains.

    The gain at 1-based rank r is divided by log2(r + 1), so the first
    position is undiscounted. An empty list yields 0.
    """
    discounted_gains = (
        gain / math.log2(rank + 1)
        for rank, gain in enumerate(scores, start=1)
    )
    return sum(discounted_gains)

# Ideal ranking: every relevant document ahead of any non-relevant one, cut off
# at the length of the retrieved list. It is built from the ground truth rather
# than by sorting relevance_scores, so relevant documents the system failed to
# retrieve lower NDCG instead of being ignored.
# Assumes `retrieved` contains no duplicate documents.
n_ideal_hits = min(len(set(relevant)), len(retrieved))
ideal = [1] * n_ideal_hits + [0] * (len(retrieved) - n_ideal_hits)

# Compute the ideal DCG once; it is zero when there is nothing relevant or
# nothing retrieved, in which case NDCG is reported as 0.
ideal_dcg = dcg(ideal)
ndcg = dcg(relevance_scores) / ideal_dcg if ideal_dcg != 0 else 0.0

print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1-Measure: {f1:.2f}")
print(f"E-Measure: {e_measure:.2f}")
print(f"NDCG: {ndcg:.2f}")

Precision: 0.67
Recall: 1.00
F1-Measure: 0.80
E-Measure: 0.20
NDCG: 1.00
