# Aligned Beethoven Sonata Data for chord-eval

In [None]:
from glob import glob
import logging
from pathlib import Path
from tqdm import tqdm

import harmonic_inference.data.piece as piece
from harmonic_inference.utils.eval_utils import get_labels_df

# Directory that receives the generated label TSV files.
OUTPUT_DIR = Path("Beethoven-labels")
# mkdir(exist_ok=True) is already a no-op for an existing directory, so no
# separate exists() check is needed. It also fails immediately if the path is
# occupied by a regular file, instead of failing later on the first write.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

In [None]:
from harmonic_inference.data.corpus_reading import load_clean_corpus_dfs

# Load the corpus tables from 'corpus_data' (presumably a directory of corpus
# files relative to the working directory -- confirm). The helper's result is
# unpacked into four objects: files, measures, chords and notes.
files_df, measures_df, chords_df, notes_df = load_clean_corpus_dfs('corpus_data')

In [None]:
# Exploration cell: collect the ids of all files in the "Beethoven-Sonatas"
# corpus and select their chord annotations.
is_beethoven = files_df["corpus_name"] == "Beethoven-Sonatas"
file_ids = files_df.loc[is_beethoven].index.tolist()
print(files_df.loc[92])  # spot-check one files_df row; the index 92 is hard-coded
chords_beethoven = chords_df.loc[file_ids]

# Display a few descriptive columns for every chord whose "changes" entry is
# exactly "#6". This expression is last so the notebook renders its value.
inspect_columns = ["label", "numeral", "chord_tones", "globalkey", "localkey"]
has_sharp_six = chords_beethoven["changes"] == "#6"
chords_beethoven.loc[has_sharp_six, inspect_columns]
# chords_beethoven["changes"].value_counts().index

In [None]:
# For every functional-harmony (BPS) MusicXML score, find the DCML annotation of
# the same sonata movement, build a score piece from each source, and write both
# label tables to OUTPUT_DIR as "fh-<number>-<movement>.tsv" and
# "dcml-<number>-<movement>.tsv". Any score that cannot be paired is logged and
# skipped so that one bad file does not abort the whole batch.
dcml_corpus = "Beethoven-Sonatas"
# Loop-invariant mask selecting the files_df rows of the DCML Beethoven corpus.
in_dcml_corpus = files_df["corpus_name"] == dcml_corpus

# glob() returns matches in arbitrary order; sort them so that processing and
# log output are reproducible between runs.
for fh_filename in tqdm(sorted(glob(str(Path("../functional-harmony/data/BPS/scores/*.mxl"))))):
    music_xml_path = Path(fh_filename)
    # The chord labels live in the sibling "chords" directory under the same stem.
    label_csv_path = music_xml_path.parent.parent / "chords" / f"{music_xml_path.stem}.csv"

    if not label_csv_path.exists():
        logging.error(f"Label file {label_csv_path} does not exist. Skipping.")
        continue

    # The stem must split into exactly three "_"-separated fields; the second is
    # the sonata number and the last character of the third is used as the
    # movement id. Anything else is skipped like every other per-file problem,
    # rather than raising and terminating the loop.
    try:
        _, number, movement = music_xml_path.stem.split("_")
        piece_id = f"{number}-{movement[-1]}"
    except (ValueError, IndexError):
        logging.error(f"Unexpected score file name {music_xml_path}. Skipping.")
        continue

    dcml_file_name = f"{piece_id}.tsv"
    df = files_df.loc[in_dcml_corpus & (files_df["file_name"] == dcml_file_name)]

    if df.empty:
        logging.error(f"No matching df file_id found for score {music_xml_path}. Skipping.")
        continue

    if len(df) > 1:
        logging.error(f"Multiple matching df file_ids found for score {music_xml_path}: {df}.\nUsing the first.")

    file_id = df.index[0]

    # A KeyError here means at least one of the three tables has no rows for
    # this file_id (or the piece constructor hit a missing key); skip the score.
    try:
        dcml_score = piece.get_score_piece_from_data_frames(
            notes_df.loc[file_id],
            chords_df.loc[file_id],
            measures_df.loc[file_id],
            use_suspensions=True,
        )
    except KeyError:
        logging.error(f"No matching chord_df data found for score {music_xml_path} (file_id {file_id}). Skipping.")
        continue

    # Parse the MusicXML only after a DCML counterpart has been loaded, so no
    # parsing work is spent on scores that end up being skipped.
    fh_score = piece.get_score_piece_from_music_xml(music_xml_path, label_csv_path)

    # NOTE(review): tpc_c=0 is passed through unchanged; its meaning is defined
    # by get_labels_df -- confirm before changing.
    fh_label_df = get_labels_df(fh_score, tpc_c=0)
    fh_label_df.to_csv(OUTPUT_DIR / f"fh-{piece_id}.tsv", index=False, sep="\t")

    dcml_label_df = get_labels_df(dcml_score, tpc_c=0)
    dcml_label_df.to_csv(OUTPUT_DIR / f"dcml-{piece_id}.tsv", index=False, sep="\t")