# Kneser–Ney + GridSearch
Este notebook implementa una mejora del baseline model KN sobre **progresiones funcionales** a partir de `songdb_funcional_v4.csv`:

- **Modelo n-grama con Kneser–Ney (n-gramas)** sobre tokens funcionales (e.g., `I`, `ii`, `V7`, `bVII7`), con `<unk>` para raros y límites `<s>`, `</s>`.
- **Búsqueda de mejoras**: se exploran diferentes configuraciones de hiperparámetros (orden, D) para optimizar el rendimiento del modelo.

Se evalúa **predicción del siguiente acorde** con **Top-k** y **MRR**, con partición **train/val/test por canción**.


## 1) Carga de los datos
Leemos el CSV y preparamos las secuencias de tokens por canción. Usamos `title` + `composedby` como ID de canción.

In [4]:
import pandas as pd
import numpy as np
import re
from collections import Counter, defaultdict
from pathlib import Path
from typing import List, Tuple, Dict

In [5]:
# Load the song dataset that holds the functional progressions
csv_path = Path("../../data/songdb_funcional_v4.csv")
df = pd.read_csv(csv_path)

def tokenize_progression(prog: str):
    """
    Split a progression into its whitespace-separated chord tokens.

    A missing value (anything ``pd.isna`` reports as NA) yields an empty list.
    Any other value is converted with ``str`` before splitting.
    """
    # str.split() with no separator already discards leading/trailing
    # whitespace and never produces empty strings.
    return [] if pd.isna(prog) else str(prog).split()

def build_sequences_by_song(df: pd.DataFrame):
    """
    Map every song ID to the list of chord tokens of its progression.

    The ID is "title — composedby" when both columns exist, the title alone
    when only ``title`` exists, and a positional ``song_<i>`` label otherwise.
    Progressions are read from the ``funcional_prog`` column. Rows sharing an
    ID overwrite one another, so only the last such row is kept.
    """
    columns = df.columns
    if "title" not in columns:
        ids = [f"song_{idx}" for idx in range(len(df))]
    elif "composedby" in columns:
        ids = (df["title"].astype(str) + " — " + df["composedby"].astype(str)).tolist()
    else:
        ids = df["title"].astype(str).tolist()
    progressions = df["funcional_prog"].tolist()
    return {sid: tokenize_progression(prog) for sid, prog in zip(ids, progressions)}

seqs = build_sequences_by_song(df)
# Keep only songs with at least 3 tokens
seqs = {k:v for k,v in seqs.items() if len(v) >= 3}
len(seqs), list(seqs)[:3]  # number of songs kept and the first three IDs


(2591,
 ['Lullaby of Birdland — George Shearing',
  "It's A Most Unusual Day — Jimmy McHugh and HYarold Adamson",
  'Jump Monk — Charles Mingus'])

## 2) Partición train/val/test por canción
Usamos 80/10/10 con barajado determinista.

In [6]:

# Fixed seed so the shuffle, and therefore the split, is the same on every run
rng = np.random.default_rng(42)
song_list = list(seqs.keys())
rng.shuffle(song_list)

n = len(song_list)
n_train = int(n * 0.8)
n_val   = int(n * 0.1)
# Slice the shuffled IDs: the first n_train go to train, the next n_val to
# validation, and whatever remains to test
train_ids = song_list[:n_train]
val_ids   = song_list[n_train:n_train+n_val]
test_ids  = song_list[n_train+n_val:]

# Splitting by song ID keeps each song's whole sequence inside a single partition
train_seqs = [seqs[sid] for sid in train_ids]
val_seqs   = [seqs[sid] for sid in val_ids]
test_seqs  = [seqs[sid] for sid in test_ids]

len(train_seqs), len(val_seqs), len(test_seqs)


(2072, 259, 260)

## 3) Kneser–Ney genérico (orden N) + grid search

In [7]:
# === KN interpolado genérico (orden N) ===
from collections import Counter, defaultdict
from functools import lru_cache

class KNInterpolatedNGram:
    """
    Interpolated n-gram model with absolute discounting and a Kneser–Ney
    continuation distribution at the unigram level.

    For a context ``ctx`` of length m >= 1::

        P(w | ctx) = max(c(ctx, w) - D, 0) / c(ctx)
                     + lambda(ctx) * P(w | ctx[1:])
        lambda(ctx) = D * N1+(ctx •) / c(ctx)

    The recursion ends in ``P_cont(w) = N1+(• w) / #distinct bigram types``.
    Intermediate orders use raw n-gram counts, not continuation counts.

    Training tokens whose frequency is <= ``unk_threshold`` are replaced by
    "<unk>". Every sequence is padded with ``order - 1`` "<s>" tokens and
    closed with "</s>".
    """

    def __init__(self, order=3, discount=0.75, unk_threshold=1):
        """
        Args:
            order: n-gram order (1 = unigram model).
            discount: absolute discount D subtracted from every seen count.
            unk_threshold: tokens seen at most this many times become "<unk>".

        Raises:
            ValueError: if ``order`` is smaller than 1.
        """
        # An explicit raise survives ``python -O``, unlike an assert.
        if order < 1:
            raise ValueError(f"order must be >= 1, got {order!r}")
        self.N = order
        self.D = discount
        self.unk_threshold = unk_threshold
        self._reset_state()

    def _reset_state(self):
        """Drop every learned statistic and every cached result."""
        self.vocab = set()
        self.counts = {n: Counter() for n in range(1, self.N + 1)}            # n-gram counts
        self.context_totals = {n: Counter() for n in range(1, self.N)}        # c(context)
        self.unique_continuations = {n: Counter() for n in range(1, self.N)}  # N1+(context •)
        self.continuation_counts_unigram = Counter()                          # N1+(• w)
        self.total_unique_bigrams = 0
        self._rank_cache = {}
        self._prob_cache = {}
        self.fitted = False

    def _add_bounds(self, seq):
        """Return ``seq`` padded with ``order - 1`` "<s>" tokens and a final "</s>"."""
        return ["<s>"] * (self.N - 1) + list(seq) + ["</s>"]

    def fit(self, sequences):
        """
        Estimate all counts from ``sequences`` (an iterable of token lists).

        Any previous fit is discarded first, so calling ``fit`` again never
        accumulates counts or serves probabilities cached from older data.
        """
        self._reset_state()
        # Materialise once: the data is traversed twice (vocabulary, counts).
        sequences = [list(seq) for seq in sequences]

        token_counts = Counter(t for seq in sequences for t in seq)
        vocab = {t for t, c in token_counts.items() if c > self.unk_threshold}
        vocab.update({"<s>", "</s>", "<unk>"})
        self.vocab = vocab

        mapped = [[t if t in vocab else "<unk>" for t in seq] for seq in sequences]

        for seq in mapped:
            s = self._add_bounds(seq)
            for n in range(1, self.N + 1):
                counter = self.counts[n]
                for i in range(n - 1, len(s)):
                    counter[tuple(s[i - n + 1:i + 1])] += 1

        # Context totals c(ctx) and number of distinct continuations N1+(ctx •)
        for n in range(2, self.N + 1):
            totals = self.context_totals[n - 1]
            followers = defaultdict(set)
            for ngram, c in self.counts[n].items():
                ctx, w = ngram[:-1], ngram[-1]
                totals[ctx] += c
                followers[ctx].add(w)
            uniq = self.unique_continuations[n - 1]
            for ctx, ws in followers.items():
                uniq[ctx] = len(ws)

        # Distinct bigram types feed the unigram continuation distribution.
        # A unigram model keeps no bigram counter, so the types are collected
        # from the data directly (with a single "<s>" as left boundary).
        if self.N >= 2:
            bigram_types = self.counts[2].keys()
        else:
            bigram_types = set()
            for seq in mapped:
                padded = ["<s>"] + seq + ["</s>"]
                bigram_types.update(zip(padded, padded[1:]))

        left_contexts = defaultdict(set)
        for w1, w2 in bigram_types:
            left_contexts[w2].add(w1)
        self.continuation_counts_unigram = Counter(
            {w: len(ctxs) for w, ctxs in left_contexts.items()}
        )
        self.total_unique_bigrams = len(bigram_types)
        self.fitted = True

    def _p_cont_unigram(self, w):
        """Continuation probability N1+(• w) / #bigram types (uniform before any fit)."""
        # Deliberately not wrapped in lru_cache: a cache on a method is shared
        # by all instances, keeps them alive and would outlive a re-fit.
        if self.total_unique_bigrams == 0:
            return 1.0 / max(1, len(self.vocab))
        return self.continuation_counts_unigram.get(w, 0) / self.total_unique_bigrams

    def _lambda(self, ctx):
        """Interpolation weight given to the lower-order model after ``ctx``."""
        m = len(ctx)
        if m == 0:
            return 1.0
        total = self.context_totals[m].get(ctx, 0)
        if total == 0:
            # Unseen context: all the mass goes to the lower order.
            return 1.0
        cont_types = self.unique_continuations[m].get(ctx, 0)
        return (self.D * cont_types) / total

    def _base(self, ctx, w):
        """Discounted relative frequency of ``w`` after ``ctx`` (0.0 for an unseen context)."""
        m = len(ctx)
        if m == 0:
            return self._p_cont_unigram(w)
        total = self.context_totals[m].get(ctx, 0)
        if total == 0:
            return 0.0
        c = self.counts[m + 1].get(tuple(ctx) + (w,), 0)
        return max(c - self.D, 0) / total

    def prob(self, ctx, w):
        """
        Return the interpolated probability of ``w`` after ``ctx``.

        ``ctx`` may be any sequence of tokens; only its last ``order - 1``
        items are used. Results are memoised per (context, token).
        """
        ctx = tuple(ctx)
        max_ctx = self.N - 1
        if len(ctx) > max_ctx:
            ctx = ctx[len(ctx) - max_ctx:]
        key = (ctx, w)
        cached = self._prob_cache.get(key)
        if cached is not None:
            return cached
        if not ctx:
            p = self._p_cont_unigram(w)
        else:
            p = self._base(ctx, w) + self._lambda(ctx) * self.prob(ctx[1:], w)
        self._prob_cache[key] = p
        return p

    def predict_ranking(self, history):
        """
        Rank every vocabulary token except "<s>" as a continuation of ``history``.

        Out-of-vocabulary history tokens are mapped to "<unk>" here, so raw
        sequences can be passed in. Returns a list of ``(token, probability)``
        sorted by decreasing probability. The list is cached per context and
        the same object is returned on later calls, so do not mutate it.
        """
        hist = ["<s>"] * (self.N - 1) + [t if t in self.vocab else "<unk>" for t in history]
        ctx = tuple(hist[-(self.N - 1):]) if self.N > 1 else tuple()
        if ctx in self._rank_cache:
            return self._rank_cache[ctx]
        cands = [w for w in self.vocab if w != "<s>"]
        scores = [(w, self.prob(ctx, w)) for w in cands]
        scores.sort(key=lambda x: x[1], reverse=True)
        self._rank_cache[ctx] = scores
        return scores

## 4) KN + Grid search
Parámetros a probar: orden {3, 4}, D {0.5, 0.7, 0.75, 0.9, 1.0}

In [8]:
def evaluate_next_token_ranking(model, test_sequences, topk_list=(1,3,5)):
    """
    Score next-token prediction over every position of every sequence.

    For each prefix ``seq[:pos]`` (``pos >= 1``) the model's ranking is
    queried and the 1-based rank of the true next token is looked up. A
    target that is absent from the ranking still counts as an evaluated
    position but contributes nothing to the hits or to the MRR.

    Returns a dict with "positions", "MRR" and one "Top@k" entry per value
    in ``topk_list``; all rates are 0.0 when nothing was evaluated.
    """
    hits = dict.fromkeys(topk_list, 0)
    reciprocal_total = 0.0
    n_positions = 0

    for tokens in test_sequences:
        for pos in range(1, len(tokens)):
            target = tokens[pos]
            ranked = model.predict_ranking(tokens[:pos])
            rank_of = {tok: idx for idx, (tok, _) in enumerate(ranked, start=1)}
            n_positions += 1
            if target not in rank_of:
                continue
            rank = rank_of[target]
            reciprocal_total += 1.0 / rank
            for k in topk_list:
                if rank <= k:
                    hits[k] += 1

    evaluated = n_positions > 0
    results = {
        "positions": n_positions,
        "MRR": reciprocal_total / n_positions if evaluated else 0.0,
    }
    for k in topk_list:
        results[f"Top@{k}"] = hits[k] / n_positions if evaluated else 0.0
    return results

class KNWrapperForEval:
    """Adapter that maps out-of-vocabulary history tokens to "<unk>" before ranking."""

    def __init__(self, kn_model):
        self.kn = kn_model

    def predict_ranking(self, history):
        """Return the wrapped model's ranking for ``history`` with unknown tokens replaced."""
        mapped = []
        for tok in history:
            mapped.append(tok if tok in self.kn.vocab else "<unk>")
        return self.kn.predict_ranking(mapped)

In [9]:
# === Grid search en validación y evaluación en test ===
import pandas as pd

def grid_search_kn(train_seqs, val_seqs, orders=(3,4), Ds=(0.5,0.7,0.75,0.9,1.0), unk_threshold=1, subsample_val=None):
    """
    Fit one KN model per (order, discount) pair and pick the best by validation MRR.

    Args:
        train_seqs: training sequences passed to ``fit``.
        val_seqs: validation sequences used for scoring.
        orders: n-gram orders to try.
        Ds: discount values to try.
        unk_threshold: forwarded to every model.
        subsample_val: when given, only the first ``subsample_val`` validation
            sequences are scored.

    Returns:
        ``(best_model, best_row, val_table)`` where ``best_row`` is the metrics
        dict of the winner and ``val_table`` holds every configuration sorted
        by decreasing MRR. Ties keep the configuration evaluated first.
    """
    val_subset = val_seqs if subsample_val is None else val_seqs[:subsample_val]

    best_row, best_model = None, None
    table_rows = []
    for order, discount in [(n, d) for n in orders for d in Ds]:
        candidate = KNInterpolatedNGram(order=order, discount=discount, unk_threshold=unk_threshold)
        candidate.fit(train_seqs)
        scores = evaluate_next_token_ranking(candidate, val_subset, topk_list=(1,3,5))

        entry = {"order": order, "D": discount}
        entry.update(scores)
        table_rows.append(entry)

        # Strict comparison: a later configuration must beat, not match, the current best
        if best_row is None or scores["MRR"] > best_row["MRR"]:
            best_row, best_model = entry, candidate

    val_table = pd.DataFrame(table_rows).sort_values("MRR", ascending=False).reset_index(drop=True)
    display(val_table)
    print("Mejor configuración (val):", best_row)
    return best_model, best_row, val_table

# Run the search (set subsample_val to score fewer validation songs and speed up a first pass)
best_kn, best_cfg, kn_val_table = grid_search_kn(train_seqs, val_seqs, orders=(3,4), Ds=(0.5,0.7,0.75,0.9,1.0), subsample_val=None)

Unnamed: 0,order,D,positions,MRR,Top@1,Top@3,Top@5
0,4,0.9,13270,0.597218,0.465712,0.680181,0.762396
1,4,1.0,13270,0.596795,0.464883,0.679653,0.762321
2,4,0.75,13270,0.595597,0.463225,0.678598,0.759834
3,4,0.7,13270,0.594594,0.462472,0.676488,0.758855
4,4,0.5,13270,0.590907,0.45795,0.67272,0.755991
5,3,0.9,13270,0.582862,0.444838,0.663904,0.76006
6,3,0.75,13270,0.582846,0.444537,0.663301,0.75893
7,3,1.0,13270,0.582747,0.445139,0.663376,0.759156
8,3,0.7,13270,0.582426,0.444461,0.662547,0.758176
9,3,0.5,13270,0.581306,0.443406,0.661115,0.757046


Mejor configuración (val): {'order': 4, 'D': 0.9, 'positions': 13270, 'MRR': 0.5972178067596974, 'Top@1': 0.46571213262999245, 'Top@3': 0.680180859080633, 'Top@5': 0.7623963828183873}


## 5) Evaluación (Top-k, MRR)
Medimos la calidad de **predicción del siguiente token** en test. Reportamos **Top@1/3/5** y **MRR**.

In [10]:
# Evaluate the best KN model (selected on validation) on the held-out test songs
kn_best_test = evaluate_next_token_ranking(best_kn, test_seqs, topk_list=(1,3,5))
# One-row table labelled with the winning order and discount
pd.DataFrame([{"Model": f"KN (ord={best_cfg['order']}, D={best_cfg['D']})", **kn_best_test}])

Unnamed: 0,Model,positions,MRR,Top@1,Top@3,Top@5
0,"KN (ord=4, D=0.9)",12779,0.572782,0.432898,0.661006,0.740825


Kneser–Ney (ord=4, D=0.9) v2
(utilizando las progresiones funcionales con mejoras en la transcripción)

Comparación con modelo Kneser-Ney (ord=3, D=0.75):
- Top@1: +0,017 ptos (0.4152 → 0.4329) 
- Top@3: +0,009 ptos (0.6516 → 0.6610)
- Top@5: +0,003 ptos (0.7375 → 0.7408)
- MRR: +0,0123 (0.5605 → 0.5728)

Aunque la mejora es modesta, es consistente en todas las métricas, indicando que el modelo se beneficia de un mayor contexto (orden 4) y un ajuste del parámetro de descuento (D=0.9).

*Nota sobre las mejoras en el preprocesado*

*Comparación con modelo Kneser-Ney (ord=4, D=0.9) v1 (sin las últimas mejoras en la transcripción):*
- *Top@1: +0,011 ptos (0.4219 → 0.4329)*
- *Top@3: +0,008 ptos (0.6526 → 0.6610)*
- *Top@5: -0,004 ptos (0.7445 → 0.7408)*
- *MRR: +0,0061 (0.5667 → 0.5728)*

*El modelo explota mejor la “nueva gramática” (V/·, Vsub/·, #iv°, natIII, natVI, etc.). El aumento en Top@1 y MRR indica que ahora el modelo “acierta antes” con más frecuencia: señal de secuencias más consistentes y menos ambigüedad en el vocabulario.*

*¿Por qué mejoran las métricas?*
- *Menos tokens “residuales” tipo (4), (6), (9) ⇒ menos ruido.*
- *Señales funcionales explícitas (V/ii, Vsub/V) ⇒ mejor predicción local (n-gramas lo agradecen)*

## 7) Exportamos el modelo como 'best_kn_model.pkl'

In [12]:
import joblib
# Persist the selected model; the cell output below is the list of file names returned by joblib.dump
joblib.dump(best_kn, '../../models/kn_gs_v2/best_kn_model.pkl')

['../../models/kn_gs_v2/best_kn_model.pkl']

## 8) Demo: sugerencias y autocompletado

In [13]:
# === DEMO: suggestions and autocompletion with the best KN model ===
import random
import pandas as pd

# Fail early with a readable message when the earlier cells have not been run
assert 'best_kn' in globals(), "No encuentro 'best_kn'. Ejecuta antes la celda de grid search."
assert 'test_seqs' in globals(), "No encuentro 'test_seqs'. Ejecuta antes la partición de datos."

def topk_next(model, context, k=5, exclude_special=True):
    """Return the ``k`` highest-ranked (token, prob) suggestions for ``context``.

    With ``exclude_special`` set, "<s>", "</s>" and "<unk>" are removed
    before the ranking is truncated.
    """
    candidates = model.predict_ranking(context)
    if not exclude_special:
        return candidates[:k]
    specials = {"<s>", "</s>", "<unk>"}
    kept = []
    for token, p in candidates:
        if token not in specials:
            kept.append((token, p))
    return kept[:k]

### 8.1) Probamos algunas secuencias clásicas

Modo mayor

In [14]:
#@title Suggestions starting from the tonic chord.
pred = topk_next(best_kn, ["I"], k=5)
pred

[('I', 0.3734150220734017),
 ('ii', 0.10753082407664535),
 ('vi', 0.105420704083276),
 ('V7', 0.052316504322354795),
 ('IV', 0.04821778901796282)]

In [15]:
#@title ["II", "V7"] > "I" @Top1
# The stored output below ranks "I" first for this context
pred = topk_next(best_kn, ["II", "V7"], k=5)
pred

[('I', 0.5147374520306233),
 ('V7', 0.20810002151938234),
 ('I7', 0.06348083569710526),
 ('II', 0.05433766157067823),
 ('i', 0.029277076949103812)]

In [None]:
#@title pop-punk ["I", "V", "vi", "IV"]
pred = topk_next(best_kn, ["I", "V"], k=5) # @Top4: the expected "vi" is ranked 4th in the stored output
pred

[('V', 0.3619163310072302),
 ('IV', 0.3145394182731477),
 ('I', 0.12915102984178167),
 ('vi', 0.09827329856710684),
 ('v', 0.04347285267236621)]

In [None]:
#@title pop-punk ["I", "V", "vi", "IV"]
pred = topk_next(best_kn, ["I", "V", 'vi'], k=5) # The expected chord ("IV") is not among the suggestions
pred

[('II7', 0.21186953407468012),
 ('V', 0.16701901045989845),
 ('V/V', 0.1431200511922982),
 ('Vsub/II', 0.1044223682400282),
 ('vi', 0.09645031890317396)]

In [None]:
# Major mode: top-5 suggestions after the context I, IV
pred = topk_next(best_kn, ["I", "IV"], k=5)
pred

[('I', 0.5523479680300832),
 ('iii', 0.17868779919129707),
 ('V7', 0.09357061851630752),
 ('V', 0.07075552203473516),
 ('IV', 0.06705918922323578)]

In [None]:
#@title ["IV", "iv"] > "I" (@Top1)
# The stored output below ranks "I" first for this context
pred = topk_next(best_kn, ["IV", "iv"], k=5)
pred

[('I', 0.6852342845268492),
 ('bVII7', 0.0911883723274025),
 ('iii', 0.07167197964826987),
 ('iv', 0.05531088804241253),
 ('V', 0.0168095248648733)]

In [None]:
# Major mode: top-5 suggestions after the context I, ii, iii
pred = topk_next(best_kn, ["I", "ii", "iii"], k=5)
pred

[('ii', 0.2536295068957499),
 ('IV', 0.19301160791927907),
 ('vi', 0.12225064852545692),
 ('V/II', 0.11544736712344839),
 ('biiio', 0.0866727151744685)]

In [None]:
# Major mode: top-5 suggestions after the context vi, ii
pred = topk_next(best_kn, ["vi", "ii"], k=5)
pred

[('V7', 0.8007619646811053),
 ('ii', 0.0912653006604839),
 ('vi', 0.030674614403312553),
 ('Vsub/III', 0.020604351350864265),
 ('I', 0.010689323093255666)]

In [None]:
# Major mode: top-5 suggestions after the context I, bVII
pred = topk_next(best_kn, ["I", "bVII"], k=5)
pred

[('I', 0.5923288712978145),
 ('IV', 0.2997835833604025),
 ('bVII', 0.038251036525888314),
 ('V/IV', 0.01421183877548663),
 ('V7', 0.013465340638542427)]

Modo menor

In [None]:
# Minor mode: top-5 suggestions starting from the minor tonic chord
pred = topk_next(best_kn, ["i"], k=5)
pred

[('i', 0.4567366640398732),
 ('iiø', 0.11593073164806672),
 ('V7', 0.09815538622614726),
 ('VI', 0.049291912898090054),
 ('iv', 0.043694122784600234)]

In [None]:
# Minor mode: top-5 suggestions after the context iiø, V7
pred = topk_next(best_kn, ["iiø", "V7"], k=5)
pred

[('i', 0.3797665999357742),
 ('I', 0.3726490852969921),
 ('V7', 0.10379978030365061),
 ('iiø', 0.092943737965714),
 ('VI', 0.005684761530808764)]

In [None]:
# Minor mode: top-5 suggestions after the context VI, iiø
pred = topk_next(best_kn, ["VI", "iiø"], k=5)
pred

[('V7', 0.9499133790613948),
 ('iiø', 0.0404212015707477),
 ('i', 0.0018882724017321784),
 ('I', 0.0014848214171321683),
 ('bVI7', 0.0013082564348126887)]

In [None]:
# Minor mode: top-5 suggestions after the context i, iv
pred = topk_next(best_kn, ["i", "iv"], k=5)
pred

[('V7', 0.3352135194999175),
 ('i', 0.2714959128569826),
 ('iv', 0.15174976164566029),
 ('V/III', 0.07723979242357881),
 ('bVII7', 0.04196969293049771)]

### 8.2) Experimento: autocompletado "greedy" (+3 pasos)

In [None]:
def greedy_complete(model, seed, steps=3):
    """Extend ``seed`` by up to ``steps`` tokens, always taking the most probable one.

    "<s>" and "<unk>" are never emitted. "</s>" is a valid choice: when it is
    the most probable continuation it is appended and generation stops.

    Args:
        model: object exposing ``predict_ranking(history)`` that returns
            ``(token, prob)`` pairs sorted by decreasing probability.
        seed: initial tokens; the input is copied, not modified.
        steps: maximum number of tokens to append.

    Returns:
        A new list holding the seed followed by the generated tokens.
    """
    skipped = {"<s>", "<unk>"}
    seq = list(seed)
    for _ in range(steps):
        # The ranking is queried directly: topk_next's default filter also
        # removes "</s>", which made the end-of-sequence stop below unreachable.
        ranking = model.predict_ranking(seq)
        next_tok = next((tok for tok, _ in ranking if tok not in skipped), None)
        if next_tok is None:
            break
        seq.append(next_tok)
        if next_tok == "</s>":
            break
    return seq

In [None]:
# Seeds are the first CTX_LEN chords of randomly sampled test songs.
# NOTE(review): CTX_LEN and pool are not defined in any visible cell of this
# notebook, so a fresh run stopped here with a NameError. The values below are
# inferred from the stored output (4-chord seeds) and from the demo setup cell
# asserting that test_seqs exists — confirm against the original definitions.
CTX_LEN = 4
pool = [s for s in test_seqs if len(s) >= CTX_LEN]

rows = []
for seed in [s[:CTX_LEN] for s in random.sample(pool, min(3, len(pool)))]:
    completed = greedy_complete(best_kn, seed, steps=3)
    rows.append({"seed": " ".join(seed), "greedy_+3": " ".join(completed)})

display(pd.DataFrame(rows))

Unnamed: 0,seed,greedy_+3
0,I I V7 V7,I I V7 V7 I I I
1,I VII7 I vi,I VII7 I vi IV V7 I
2,#iv ii ii bII7,#iv ii ii bII7 I I I


¡Ojo! Problema con las repeticiones

## Roadmap (siguientes mejoras)
- Implementar un modelo de **re-ranking** para mejorar la precisión de las predicciones: evitar las repeticiones, ajuste a una tonalidad dada.

- **Tabla de errores** para análisis de fallos en las predicciones.
- Exportar pickle y crear API REST para usar el modelo en producción.