# TF-IDF baseline

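Self-contained implementation and evaluation of a TF-IDF retrieval baseline: for a given lyric fragment, retrieve the annotation attached to the most similar fragment in the dataset.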


## Setup


In [31]:
from pathlib import Path
import json

# Prefer the full dataset; otherwise fall back to the "new" file that lives
# in the same directory (the fallback is not checked for existence here).
DATASET_PATH = Path('..', 'data', 'annotations_dataset_full.json')
DATASET_PATH = (
    DATASET_PATH
    if DATASET_PATH.exists()
    else DATASET_PATH.with_name('annotations_dataset_new.json')
)
print('Dataset:', DATASET_PATH)


Dataset: ../data/annotations_dataset_new.json


## Dataset stats


In [32]:
# Read the whole dataset into memory; the code below treats it as a list of
# song dicts, each optionally carrying an 'annotations' list.
with DATASET_PATH.open('r', encoding='utf-8') as f:
    data = json.load(f)

songs = len(data)
annotations = sum(map(lambda entry: len(entry.get('annotations', [])), data))

print('Songs:', songs)
print('Annotations:', annotations)
# Guard against an empty dataset so the average never divides by zero.
print('Avg annotations per song:', 0 if not songs else round(annotations / songs, 2))


Songs: 3291
Annotations: 22220
Avg annotations per song: 6.75


## Load pairs


In [None]:
# Flatten the nested song -> annotations structure into (song, annotation)
# pairs, then split the pairs into three parallel lists.
pairs = [
    (song, ann)
    for song in data
    for ann in song.get('annotations', [])
]

fragments = [ann.get('fragment', '') for _, ann in pairs]
annotations = [ann.get('annotation', '') for _, ann in pairs]
metadata = [
    {
        'artist': song.get('artist', ''),
        'title': song.get('title', ''),
        'votes': ann.get('votes', 0),
    }
    for song, ann in pairs
]

print('Pairs:', len(fragments))

# Cap on the number of pairs used downstream; a falsy value keeps everything.
MAX_EXAMPLES = 2000
if MAX_EXAMPLES:
    import random

    # Fixed seed so the same random subset is drawn on every run.
    random.seed(42)
    idx = list(range(len(fragments)))
    random.shuffle(idx)
    idx = idx[:MAX_EXAMPLES]

    # Apply the same index selection to all three lists to keep them aligned.
    fragments, annotations, metadata = (
        [column[i] for i in idx]
        for column in (fragments, annotations, metadata)
    )
    print('Using subset:', len(fragments))


Pairs: 22220
Using subset: 2000


## Implementation


In [34]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from rouge_score import rouge_scorer
from sacrebleu import corpus_bleu

class TfidfRetriever:
    """Nearest-neighbour lookup over text fragments using TF-IDF vectors.

    ``fragments``, ``annotations`` and ``metadata`` are parallel sequences:
    position ``i`` of ``annotations`` and ``metadata`` describes
    ``fragments[i]``.
    """

    def __init__(self, fragments, annotations, metadata):
        """Fit a TF-IDF vectorizer on ``fragments`` and cache their vectors.

        Args:
            fragments: Sequence of fragment strings to index.
            annotations: Sequence of annotation strings, aligned with
                ``fragments``.
            metadata: Sequence of mappings aligned with ``fragments``;
                ``find_similar`` reads the ``'artist'``, ``'title'`` and
                ``'votes'`` keys of each entry.
        """
        self.fragments = fragments
        self.annotations = annotations
        self.metadata = metadata
        # Unigrams + bigrams, vocabulary capped at 1000 features.
        self.vectorizer = TfidfVectorizer(max_features=1000, ngram_range=(1, 2), lowercase=True)
        self.fragment_vectors = self.vectorizer.fit_transform(self.fragments)

    def find_similar(self, query, top_k=3):
        """Return the ``top_k`` indexed fragments most similar to ``query``.

        Args:
            query: Text to search for.
            top_k: Maximum number of results. Values ``<= 0`` yield an empty
                list; values larger than the index size return every entry.

        Returns:
            A list of dicts ordered by descending cosine similarity, each with
            the keys ``'fragment'``, ``'annotation'``, ``'similarity'``,
            ``'artist'``, ``'title'`` and ``'votes'``.
        """
        if top_k <= 0:
            # Without this guard, ``[-0:]`` below would slice the *entire*
            # ranking and return every fragment instead of none.
            return []

        query_vec = self.vectorizer.transform([query])
        sims = cosine_similarity(query_vec, self.fragment_vectors)[0]
        # argsort is ascending: take the last ``top_k`` and reverse them so the
        # best match comes first.
        top_indices = np.argsort(sims)[-top_k:][::-1]
        return [
            {
                'fragment': self.fragments[idx],
                'annotation': self.annotations[idx],
                'similarity': float(sims[idx]),
                'artist': self.metadata[idx]['artist'],
                'title': self.metadata[idx]['title'],
                'votes': self.metadata[idx]['votes'],
            }
            for idx in top_indices
        ]

def evaluate_tfidf(fragments, annotations, metadata):
    """Leave-one-out evaluation of TF-IDF retrieval over the given pairs.

    Every fragment is used as a query against all *other* fragments. The
    annotation of the nearest neighbour is taken as the prediction and is
    compared with the query's own annotation by exact string equality
    (top-1 / top-3 accuracy), ROUGE F-measure and corpus-level BLEU.
    Because the query itself is excluded, accuracy is non-zero only when
    different fragments carry identical annotation text.

    Args:
        fragments: Sequence of fragment strings.
        annotations: Sequence of annotation strings aligned with ``fragments``.
        metadata: Sequence of per-pair metadata mappings, passed through to
            ``TfidfRetriever``.

    Returns:
        A dict with the keys ``'method'``, ``'top1_accuracy'``,
        ``'top3_accuracy'``, ``'avg_similarity'``, ``'rouge1'``, ``'rouge2'``,
        ``'rougeL'``, ``'bleu'`` and ``'total_examples'``. All metrics are
        ``0.0`` for empty input.
    """
    total = len(fragments)
    if total == 0:
        # Nothing to index: fitting the vectorizer on an empty corpus would
        # fail, so report an all-zero result instead.
        return {
            'method': 'TF-IDF',
            'top1_accuracy': 0.0,
            'top3_accuracy': 0.0,
            'avg_similarity': 0.0,
            'rouge1': 0.0,
            'rouge2': 0.0,
            'rougeL': 0.0,
            'bleu': 0.0,
            'total_examples': 0,
        }

    retriever = TfidfRetriever(fragments, annotations, metadata)
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=False)

    correct_top1 = 0
    correct_top3 = 0
    predictions = []
    references = []
    similarities = []

    for i in range(total):
        # The fitted matrix already holds this fragment's vector; reuse the
        # row instead of re-vectorizing the text on every iteration.
        query_vec = retriever.fragment_vectors[i]
        sims = cosine_similarity(query_vec, retriever.fragment_vectors)[0]
        sims[i] = -1e9  # push the query itself to the bottom of the ranking
        # With three or fewer examples the masked self index would still land
        # in the top-3, so drop it explicitly.
        top_indices = [idx for idx in np.argsort(sims)[-3:][::-1] if idx != i]

        true_annotation = annotations[i]
        if top_indices:
            best = top_indices[0]
            predicted = retriever.annotations[best]
            similarities.append(float(sims[best]))
            if predicted == true_annotation:
                correct_top1 += 1
            if any(retriever.annotations[idx] == true_annotation for idx in top_indices):
                correct_top3 += 1
        else:
            # Single-example corpus: there is no other fragment to retrieve.
            predicted = ''
            similarities.append(0.0)

        predictions.append(predicted)
        references.append(true_annotation)

    rouge_scores = {'rouge1': [], 'rouge2': [], 'rougeL': []}
    for pred, ref in zip(predictions, references):
        scores = scorer.score(ref, pred)  # signature is score(target, prediction)
        for name in rouge_scores:
            rouge_scores[name].append(scores[name].fmeasure)

    # sacrebleu expects a list of reference *streams*, each aligned with the
    # hypotheses: one reference per prediction means a single stream.
    bleu = corpus_bleu(predictions, [references])

    return {
        'method': 'TF-IDF',
        'top1_accuracy': correct_top1 / total,
        'top3_accuracy': correct_top3 / total,
        'avg_similarity': float(np.mean(similarities)),
        'rouge1': float(np.mean(rouge_scores['rouge1'])),
        'rouge2': float(np.mean(rouge_scores['rouge2'])),
        'rougeL': float(np.mean(rouge_scores['rougeL'])),
        'bleu': bleu.score,
        'total_examples': total,
    }


In [35]:
import json
from pathlib import Path

# This cell sits above the "Evaluate" section, so on a fresh kernel
# (Restart & Run All) `tfidf_results` does not exist yet. Compute it here when
# missing instead of failing with a NameError.
try:
    tfidf_results
except NameError:
    tfidf_results = evaluate_tfidf(fragments, annotations, metadata)

out_path = Path('..') / 'data' / 'evaluation_results.json'
# ensure_ascii=False keeps non-ASCII text readable in the saved file.
with open(out_path, 'w', encoding='utf-8') as f:
    json.dump(tfidf_results, f, ensure_ascii=False, indent=2)
print('Saved:', out_path)


Saved: ../data/evaluation_results.json


## Evaluate


In [None]:
# Run the leave-one-out evaluation on the loaded pairs and display the metrics.
tfidf_results = evaluate_tfidf(
    fragments=fragments,
    annotations=annotations,
    metadata=metadata,
)
tfidf_results

{'method': 'TF-IDF Retrieval',
 'top1_accuracy': 0.004,
 'top3_accuracy': 0.01,
 'avg_similarity': 0.55,
 'rouge1': 0.007,
 'rouge2': 0.0027,
 'rougeL': 0.0062,
 'bleu': 0.01,
 'total_examples': 2000}

## Demo query


In [37]:
# Index the loaded pairs and show the two closest fragments for a sample query.
retriever = TfidfRetriever(
    fragments=fragments,
    annotations=annotations,
    metadata=metadata,
)
retriever.find_similar(query='Я вижу город под подошвой', top_k=2)


[{'fragment': 'Город под подошвой\n Город под подошвой — этот город под подошвой',
  'annotation': '«Город под подошвой» — песня российского рэпера Оксимирона (Oxxxymiron).',
  'similarity': 0.7662352075088681,
  'artist': 'CMH',
  'title': 'GAZZ',
  'votes': 3},
 {'fragment': 'Город «А», город «Z»',
  'annotation': 'Буквы, которыми обозначали два крупнейших города Казахстана на номерных знаках автомобилей:\n\nА – Алматы\nZ – Астана\n\nqurt коренной Астанчанин, но в данный момент проживает и развивается в южной столице – Алматы.',
  'similarity': 0.6184510634570308,
  'artist': '104',
  'title': 'КОПЕР (COPER)',
  'votes': 4}]