# Hybrid retrieval

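Self-contained implementation and evaluation of hybrid retrieval, which ranks lyric fragments by a weighted blend of TF-IDF and sentence-embedding cosine similarity. GPU recommended.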


## Setup


In [8]:
from pathlib import Path
import json

# Prefer the full dataset; when it is not on disk, use the "new" export that
# sits in the same directory.
DATASET_PATH = Path('..', 'data', 'annotations_dataset_full.json')
if not DATASET_PATH.exists():
    DATASET_PATH = DATASET_PATH.with_name('annotations_dataset_new.json')
print(f'Dataset: {DATASET_PATH}')


Dataset: ../data/annotations_dataset_new.json


## Dataset stats


In [9]:
# Read the whole dataset into memory; later cells iterate over `data`.
with open(DATASET_PATH, 'r', encoding='utf-8') as f:
    data = json.load(f)

# The counters get their own names on purpose: `annotations` is the name of the
# list of annotation texts built in the "Load pairs" cell, so storing an int
# under that name here would clobber the list whenever this cell is re-run.
n_songs = len(data)
n_annotations = sum(len(s.get('annotations', [])) for s in data)
print('Songs:', n_songs)
print('Annotations:', n_annotations)
# Guard the division so an empty dataset prints 0 instead of raising.
print('Avg annotations per song:', round(n_annotations / n_songs, 2) if n_songs else 0)


Songs: 3291
Annotations: 22220
Avg annotations per song: 6.75


## Load pairs


In [10]:
# Flatten the per-song structure into three parallel lists: the lyric fragment,
# its annotation text, and a small metadata record used when showing results.
fragments, annotations, metadata = [], [], []

for song in data:
    for ann in song.get('annotations', []):
        fragments.append(ann.get('fragment', ''))
        annotations.append(ann.get('annotation', ''))
        metadata.append(dict(
            artist=song.get('artist', ''),
            title=song.get('title', ''),
            votes=ann.get('votes', 0),
        ))

print(f'Pairs: {len(fragments)}')

MAX_EXAMPLES = 2000  # set None for full run
if MAX_EXAMPLES:
    import random
    # Seed before shuffling so the same subset is drawn on every run.
    random.seed(42)
    idx = list(range(len(fragments)))
    random.shuffle(idx)
    del idx[MAX_EXAMPLES:]
    # Apply the same index selection to all three lists to keep them aligned.
    fragments, annotations, metadata = (
        [column[i] for i in idx] for column in (fragments, annotations, metadata)
    )
    print(f'Using subset: {len(fragments)}')


Pairs: 22220
Using subset: 2000


## Implementation


In [11]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
from rouge_score import rouge_scorer
from sacrebleu import corpus_bleu

class HybridRetriever:
    """Rank stored fragments against a query by blending TF-IDF and embedding similarity.

    Both representations are computed once, at construction time, over
    ``fragments``. A query is then scored against every stored fragment as

        (1 - alpha) * tfidf_cosine + alpha * embedding_cosine

    so ``alpha=0`` uses only the TF-IDF score and ``alpha=1`` only the
    sentence-embedding score.
    """

    def __init__(self, fragments, annotations, metadata, alpha=0.5, model_name='paraphrase-multilingual-MiniLM-L12-v2'):
        """Fit the TF-IDF vocabulary and encode every fragment.

        Args:
            fragments: Texts to index; results refer back to them by position.
            annotations: Annotation text for each fragment, in the same order.
            metadata: Per-fragment dicts; ``find_similar`` reads their
                ``'artist'``, ``'title'`` and ``'votes'`` keys.
            alpha: Weight of the embedding similarity in the blended score.
            model_name: Model name handed to ``SentenceTransformer``.
        """
        self.fragments = fragments
        self.annotations = annotations
        self.metadata = metadata
        self.alpha = alpha
        # Lexical side: unigrams and bigrams, vocabulary capped at 1000 features.
        self.tfidf = TfidfVectorizer(max_features=1000, ngram_range=(1, 2), lowercase=True)
        self.tfidf_vectors = self.tfidf.fit_transform(self.fragments)
        # Semantic side: one embedding per fragment, kept as a NumPy array.
        self.model = SentenceTransformer(model_name)
        self.embeddings = self.model.encode(self.fragments, show_progress_bar=True, convert_to_numpy=True)

    def find_similar(self, query, top_k=3):
        """Return the ``top_k`` stored fragments most similar to ``query``.

        Args:
            query: Text to search for.
            top_k: Maximum number of results. Values <= 0 yield an empty list.

        Returns:
            A list of dicts, best match first, each with the keys
            ``fragment``, ``annotation``, ``similarity`` (blended score),
            ``tfidf_sim``, ``sbert_sim``, ``artist``, ``title`` and ``votes``.
        """
        # ``[-top_k:]`` with top_k == 0 is ``[-0:]``, i.e. the whole array, and a
        # negative top_k slices from the wrong end; answer those cases directly.
        if top_k <= 0:
            return []

        query_tfidf = self.tfidf.transform([query])
        tfidf_sims = cosine_similarity(query_tfidf, self.tfidf_vectors)[0]
        query_emb = self.model.encode([query], convert_to_numpy=True)
        sbert_sims = cosine_similarity(query_emb, self.embeddings)[0]
        hybrid_sims = (1 - self.alpha) * tfidf_sims + self.alpha * sbert_sims
        # argsort is ascending: take the last top_k entries and reverse them so
        # the highest blended score comes first.
        top_indices = np.argsort(hybrid_sims)[-top_k:][::-1]
        return [
            {
                'fragment': self.fragments[idx],
                'annotation': self.annotations[idx],
                'similarity': float(hybrid_sims[idx]),
                'tfidf_sim': float(tfidf_sims[idx]),
                'sbert_sim': float(sbert_sims[idx]),
                'artist': self.metadata[idx]['artist'],
                'title': self.metadata[idx]['title'],
                'votes': self.metadata[idx]['votes'],
            }
            for idx in top_indices
        ]

class _UnicodeWordTokenizer:
    """Tokenizer for ``RougeScorer`` that keeps letters and digits of any script."""

    def tokenize(self, text):
        """Lowercase ``text`` and split it into runs of alphanumeric characters."""
        lowered = text.lower()
        return ''.join(ch if ch.isalnum() else ' ' for ch in lowered).split()


def evaluate_hybrid(fragments, annotations, metadata, alpha=0.5, model_name='paraphrase-multilingual-MiniLM-L12-v2'):
    """Leave-one-out evaluation of hybrid retrieval over the given pairs.

    Each fragment is used as a query against all the *other* fragments; the
    annotation of the best-scoring neighbour is taken as the prediction and
    compared with the query's own annotation.

    Args:
        fragments: Fragment texts (queries and index at the same time).
        annotations: Annotation text for each fragment, in the same order.
        metadata: Per-fragment metadata, forwarded to ``HybridRetriever``.
        alpha: Weight of the embedding similarity in the blended score.
        model_name: Model name forwarded to ``HybridRetriever``.

    Returns:
        A dict with ``method``, ``top1_accuracy`` and ``top3_accuracy``
        (a hit means a retrieved annotation string equals the query's
        annotation string), ``avg_similarity`` (mean blended score of the
        top-1 neighbour), mean ``rouge1``/``rouge2``/``rougeL`` F-measures,
        corpus-level ``bleu`` and ``total_examples``. With fewer than two
        pairs there is no other item to retrieve, so every metric is 0.0.
    """
    total = len(fragments)
    method = f'Hybrid (alpha={alpha})'

    # Leave-one-out needs at least one candidate besides the query itself.
    # Returning early also avoids fitting TF-IDF on an empty corpus.
    if total < 2:
        return {
            'method': method,
            'top1_accuracy': 0.0,
            'top3_accuracy': 0.0,
            'avg_similarity': 0.0,
            'rouge1': 0.0,
            'rouge2': 0.0,
            'rougeL': 0.0,
            'bleu': 0.0,
            'total_examples': total,
        }

    retriever = HybridRetriever(fragments, annotations, metadata, alpha=alpha, model_name=model_name)
    # The default rouge_score tokenizer keeps only [a-z0-9], which discards
    # non-Latin text (e.g. Cyrillic annotations); use a Unicode-aware one.
    scorer = rouge_scorer.RougeScorer(
        ['rouge1', 'rouge2', 'rougeL'],
        use_stemmer=False,
        tokenizer=_UnicodeWordTokenizer(),
    )

    correct_top1 = 0
    correct_top3 = 0
    predictions = []
    references = []
    similarities = []

    for i in range(total):
        # Reuse the vectors computed by the retriever instead of re-encoding.
        tfidf_sims = cosine_similarity(retriever.tfidf_vectors[i], retriever.tfidf_vectors)[0]
        sbert_sims = cosine_similarity(retriever.embeddings[i:i+1], retriever.embeddings)[0]
        hybrid_sims = (1 - alpha) * tfidf_sims + alpha * sbert_sims
        # Push the query itself to the bottom of the ranking ...
        hybrid_sims[i] = -1e9
        ranked = np.argsort(hybrid_sims)[::-1]
        # ... and drop it explicitly, so that with three or fewer items it
        # cannot slip into the top 3 and count as a trivial hit.
        top_indices = ranked[ranked != i][:3]

        predicted = annotations[top_indices[0]]
        true_annotation = annotations[i]

        predictions.append(predicted)
        references.append(true_annotation)
        similarities.append(float(hybrid_sims[top_indices[0]]))

        if predicted == true_annotation:
            correct_top1 += 1
        if any(annotations[idx] == true_annotation for idx in top_indices):
            correct_top3 += 1

    rouge_scores = {'rouge1': [], 'rouge2': [], 'rougeL': []}
    for pred, ref in zip(predictions, references):
        # RougeScorer.score takes (target, prediction) in that order.
        scores = scorer.score(ref, pred)
        for key in rouge_scores:
            rouge_scores[key].append(scores[key].fmeasure)

    # sacreBLEU expects a list of reference *streams*, each aligned one-to-one
    # with the hypotheses. There is a single reference per prediction here, so
    # that is one stream holding all references (not one list per reference).
    bleu = corpus_bleu(predictions, [references])

    return {
        'method': method,
        'top1_accuracy': correct_top1 / total,
        'top3_accuracy': correct_top3 / total,
        'avg_similarity': float(np.mean(similarities)),
        'rouge1': float(np.mean(rouge_scores['rouge1'])),
        'rouge2': float(np.mean(rouge_scores['rouge2'])),
        'rougeL': float(np.mean(rouge_scores['rougeL'])),
        'bleu': bleu.score,
        'total_examples': total,
    }


In [12]:
import json
from pathlib import Path

# NOTE(review): this cell runs the same alpha sweep as the cell under
# "Evaluate (GPU recommended)" below; executing both repeats the evaluation
# and rewrites the same result files.
hybrid_results = []
for alpha in (0.3, 0.5, 0.7):
    print(f'alpha {alpha}')
    res = evaluate_hybrid(fragments, annotations, metadata, alpha=alpha)
    hybrid_results.append(res)
    # One JSON file per alpha, named after the first decimal digit (0.3 -> "alpha3").
    out_path = Path('..', 'data', f'hybrid_results_alpha{int(alpha*10)}.json')
    with out_path.open('w', encoding='utf-8') as f:
        json.dump(res, f, ensure_ascii=False, indent=2)
    print(f'Saved: {out_path}')


alpha 0.3


Batches: 100%|██████████| 63/63 [00:02<00:00, 27.31it/s]


Saved: ../data/hybrid_results_alpha3.json
alpha 0.5


Batches: 100%|██████████| 63/63 [00:02<00:00, 23.67it/s]


Saved: ../data/hybrid_results_alpha5.json
alpha 0.7


Batches: 100%|██████████| 63/63 [00:01<00:00, 34.12it/s]


Saved: ../data/hybrid_results_alpha7.json


## Evaluate (GPU recommended)


In [13]:
import json
from pathlib import Path

# Evaluate the retriever at three blend weights and persist each result dict
# next to the dataset so the runs can be compared later.
hybrid_results = []
for alpha in (0.3, 0.5, 0.7):
    print(f'alpha {alpha}')
    res = evaluate_hybrid(fragments, annotations, metadata, alpha=alpha)
    hybrid_results.append(res)
    # File name encodes alpha by its first decimal digit (0.5 -> "alpha5").
    out_path = Path('..', 'data', f'hybrid_results_alpha{int(alpha*10)}.json')
    with out_path.open('w', encoding='utf-8') as f:
        json.dump(res, f, ensure_ascii=False, indent=2)
    print(f'Saved: {out_path}')


alpha 0.3


Batches: 100%|██████████| 63/63 [00:02<00:00, 29.83it/s]


Saved: ../data/hybrid_results_alpha3.json
alpha 0.5


Batches: 100%|██████████| 63/63 [00:02<00:00, 24.88it/s]


Saved: ../data/hybrid_results_alpha5.json
alpha 0.7


Batches: 100%|██████████| 63/63 [00:02<00:00, 29.30it/s]


Saved: ../data/hybrid_results_alpha7.json


## Demo query


In [14]:
# Index the loaded pairs with equal weight on both similarity scores, then
# display the two best matches for a sample lyric line.
retriever = HybridRetriever(
    fragments=fragments,
    annotations=annotations,
    metadata=metadata,
    alpha=0.5,
)
retriever.find_similar(query='Я вижу город под подошвой', top_k=2)


Batches: 100%|██████████| 63/63 [00:02<00:00, 29.32it/s]


[{'fragment': 'Город под подошвой\n Город под подошвой — этот город под подошвой',
  'annotation': '«Город под подошвой» — песня российского рэпера Оксимирона (Oxxxymiron).',
  'similarity': 0.7939328427070708,
  'tfidf_sim': 0.7662352075088681,
  'sbert_sim': 0.8216304779052734,
  'artist': 'CMH',
  'title': 'GAZZ',
  'votes': 3},
 {'fragment': 'Еду в центр, это город дорог',
  'annotation': 'Игра слов отсылает нас к треку группы CENTR – Город дорог',
  'similarity': 0.6143498246787367,
  'tfidf_sim': 0.5424553404089566,
  'sbert_sim': 0.6862443089485168,
  'artist': 'OG Buda',
  'title': 'Дзагоев (Dzagoev)',
  'votes': 3}]