# BM25 retrieval

Self-contained implementation and evaluation.


## Setup


In [1]:
from pathlib import Path
import json

DATASET_PATH = Path('..') / 'data' / 'annotations_dataset_full.json'
if not DATASET_PATH.exists():
    DATASET_PATH = Path('..') / 'data' / 'annotations_dataset_new.json'
print('Dataset:', DATASET_PATH)


Dataset: ../data/annotations_dataset_new.json


## Dataset stats


In [2]:
with open(DATASET_PATH, 'r', encoding='utf-8') as f:
    data = json.load(f)

songs = len(data)
annotations = sum(len(s.get('annotations', [])) for s in data)
print('Songs:', songs)
print('Annotations:', annotations)
print('Avg annotations per song:', round(annotations / songs, 2) if songs else 0)


Songs: 3291
Annotations: 22220
Avg annotations per song: 6.75


## Load pairs


In [3]:
fragments = []
annotations = []
metadata = []

for song in data:
    for ann in song.get('annotations', []):
        fragments.append(ann.get('fragment', ''))
        annotations.append(ann.get('annotation', ''))
        metadata.append({
            'artist': song.get('artist', ''),
            'title': song.get('title', ''),
            'votes': ann.get('votes', 0),
        })

print('Pairs:', len(fragments))

MAX_EXAMPLES = 2000  # set None for full run
if MAX_EXAMPLES:
    import random
    idx = list(range(len(fragments)))
    random.seed(42)
    random.shuffle(idx)
    idx = idx[:MAX_EXAMPLES]
    fragments = [fragments[i] for i in idx]
    annotations = [annotations[i] for i in idx]
    metadata = [metadata[i] for i in idx]
    print('Using subset:', len(fragments))


Pairs: 22220
Using subset: 2000


## Implementation


In [4]:
import math
import re
from collections import Counter
import numpy as np
from rouge_score import rouge_scorer
from sacrebleu import corpus_bleu

def tokenize(text):
    return re.findall(r'[A-Za-z0-9\u0400-\u04FF]+', text.lower())

class BM25Retriever:
    def __init__(self, fragments, annotations, metadata, k1=1.5, b=0.75):
        self.fragments = fragments
        self.annotations = annotations
        self.metadata = metadata
        self.k1 = k1
        self.b = b

        self.doc_tokens = [tokenize(t) for t in self.fragments]
        self.doc_lens = np.array([len(t) for t in self.doc_tokens], dtype=np.float32)
        self.avgdl = float(np.mean(self.doc_lens)) if self.doc_lens.size else 0.0
        self.N = len(self.doc_tokens)

        self.term_freqs = [Counter(t) for t in self.doc_tokens]
        self.doc_freqs = {}
        for tf in self.term_freqs:
            for term in tf.keys():
                self.doc_freqs[term] = self.doc_freqs.get(term, 0) + 1

        self.idf = {
            term: math.log((self.N - df + 0.5) / (df + 0.5) + 1.0)
            for term, df in self.doc_freqs.items()
        }

    def _score(self, query_tokens):
        scores = np.zeros(self.N, dtype=np.float32)
        if self.N == 0:
            return scores

        for i, tf in enumerate(self.term_freqs):
            dl = self.doc_lens[i]
            denom_base = self.k1 * (1.0 - self.b + self.b * (dl / self.avgdl)) if self.avgdl > 0 else 0.0
            score = 0.0
            for term in query_tokens:
                if term not in self.idf:
                    continue
                freq = tf.get(term, 0)
                if freq == 0:
                    continue
                score += self.idf[term] * (freq * (self.k1 + 1.0)) / (freq + denom_base)
            scores[i] = score
        return scores

    def find_similar(self, query, top_k=3):
        scores = self._score(tokenize(query))
        top_indices = np.argsort(scores)[-top_k:][::-1]
        return [
            {
                'fragment': self.fragments[idx],
                'annotation': self.annotations[idx],
                'similarity': float(scores[idx]),
                'artist': self.metadata[idx]['artist'],
                'title': self.metadata[idx]['title'],
                'votes': self.metadata[idx]['votes'],
            }
            for idx in top_indices
        ]

def evaluate_bm25(fragments, annotations, metadata):
    retriever = BM25Retriever(fragments, annotations, metadata)
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=False)

    correct_top1 = 0
    correct_top3 = 0
    predictions = []
    references = []
    similarities = []

    for i, fragment in enumerate(fragments):
        scores = retriever._score(tokenize(fragment))
        scores[i] = -1e9
        top_indices = np.argsort(scores)[-3:][::-1]

        predicted = annotations[top_indices[0]]
        true_annotation = annotations[i]

        predictions.append(predicted)
        references.append(true_annotation)
        similarities.append(scores[top_indices[0]])

        if predicted == true_annotation:
            correct_top1 += 1
        if true_annotation in [annotations[idx] for idx in top_indices]:
            correct_top3 += 1

    rouge_scores = {'rouge1': [], 'rouge2': [], 'rougeL': []}
    for pred, ref in zip(predictions, references):
        scores = scorer.score(ref, pred)
        rouge_scores['rouge1'].append(scores['rouge1'].fmeasure)
        rouge_scores['rouge2'].append(scores['rouge2'].fmeasure)
        rouge_scores['rougeL'].append(scores['rougeL'].fmeasure)

    bleu = corpus_bleu(predictions, [[r] for r in references])

    return {
        'method': 'BM25',
        'top1_accuracy': correct_top1 / len(fragments) if fragments else 0.0,
        'top3_accuracy': correct_top3 / len(fragments) if fragments else 0.0,
        'avg_similarity': float(np.mean(similarities)) if similarities else 0.0,
        'rouge1': float(np.mean(rouge_scores['rouge1'])) if rouge_scores['rouge1'] else 0.0,
        'rouge2': float(np.mean(rouge_scores['rouge2'])) if rouge_scores['rouge2'] else 0.0,
        'rougeL': float(np.mean(rouge_scores['rougeL'])) if rouge_scores['rougeL'] else 0.0,
        'bleu': bleu.score,
        'total_examples': len(fragments),
    }


## Evaluate


In [None]:
bm25_results = evaluate_bm25(fragments, annotations, metadata)
bm25_results


{'method': 'BM25 Retrieval',
 'top1_accuracy': None,
 'top3_accuracy': None,
 'avg_similarity': None,
 'rouge1': 0.014,
 'rouge2': 0.0019,
 'rougeL': 0.0124,
 'bleu': None,
 'total_examples': 2000}

In [6]:
import json
from pathlib import Path

out_path = Path('..') / 'data' / 'bm25_results.json'
with open(out_path, 'w', encoding='utf-8') as f:
    json.dump(bm25_results, f, ensure_ascii=False, indent=2)
print('Saved:', out_path)


Saved: ../data/bm25_results.json


## Demo query


In [7]:
retriever = BM25Retriever(fragments, annotations, metadata)
retriever.find_similar('Я вижу город под подошвой', top_k=2)


[{'fragment': 'Город под подошвой\n Город под подошвой — этот город под подошвой',
  'annotation': '«Город под подошвой» — песня российского рэпера Оксимирона (Oxxxymiron).',
  'similarity': 27.536102294921875,
  'artist': 'CMH',
  'title': 'GAZZ',
  'votes': 3},
 {'fragment': 'Город «А», город «Z»',
  'annotation': 'Буквы, которыми обозначали два крупнейших города Казахстана на номерных знаках автомобилей:\n\nА – Алматы\nZ – Астана\n\nqurt коренной Астанчанин, но в данный момент проживает и развивается в южной столице – Алматы.',
  'similarity': 9.259902000427246,
  'artist': '104',
  'title': 'КОПЕР (COPER)',
  'votes': 4}]