In [None]:
import logging
from pathlib import Path
import shutil

from tqdm import tqdm

from harmonic_inference.data.data_types import PitchType, KeyMode, TRIAD_REDUCTION
from harmonic_inference.data.piece import ScorePiece
from harmonic_inference.data.corpus_reading import load_clean_corpus_dfs
import harmonic_inference.utils.harmonic_utils as hu

# Configure the root logger so that messages of level DEBUG and above are emitted.
logging.basicConfig(level=logging.DEBUG)

In [None]:
# Load the four corpus tables (files, measures, chords, notes) from the 'corpus_data' path.
files_df, measures_df, chords_df, notes_df = load_clean_corpus_dfs('corpus_data')

In [None]:
# Inspect the loaded files table.
files_df

In [None]:
# Inspect the loaded measures table.
measures_df

In [None]:
# Inspect the loaded chords table.
chords_df

In [None]:
# Inspect the loaded notes table.
notes_df

In [None]:
# Composer names: the text before the first '-' of every distinct corpus name,
# stripped of surrounding whitespace, de-duplicated and sorted alphabetically.
composers = sorted(
    {
        corpus_name.split('-')[0].strip()
        for corpus_name in files_df.corpus_name.unique()
    }
)
composers

In [None]:
# Make data directories
base_dir = Path("ML_course_data")
chord_dir = base_dir / "chord"
key_dir = base_dir / "key"

# Wipe any previous output and recreate each folder empty, so that files
# written in append mode later start from scratch.
for output_dir in (chord_dir, key_dir):
    shutil.rmtree(output_dir, ignore_errors=True)
    output_dir.mkdir(parents=True, exist_ok=True)

In [None]:
# Create the per-composer key and chord label files, and collect the label
# vocabularies as we go.
#
# Output (one file per composer, in key_dir and chord_dir respectively):
#   key file:   one row per piece       -> "<degree>:<MODE>,<degree>:<MODE>,..."
#   chord file: one row per key section -> "<MODE>;<degree>:<TYPE>,<degree>:<TYPE>,..."
#
# NOTE: files are opened in append mode, so re-run the cell that resets the
# data directories before re-running this one, otherwise rows are duplicated.

# Options forwarded unchanged to every ScorePiece.
chord_reduction = TRIAD_REDUCTION
use_inversions = False
use_relative = False

# Every label symbol seen so far, bucketed by key mode.
chord_labels = {KeyMode.MAJOR: set(), KeyMode.MINOR: set()}
key_labels = {KeyMode.MAJOR: set(), KeyMode.MINOR: set()}

for composer in composers:
    # startswith instead of contains because of WFBach/Bach
    composer_df = files_df.loc[files_df["corpus_name"].str.startswith(composer)]

    for file_id, file_row in tqdm(
        composer_df.iterrows(),
        desc=f"Creating {composer} data",
        total=len(composer_df),
    ):
        try:
            piece = ScorePiece(
                notes_df.loc[file_id],
                chords_df.loc[file_id],
                measures_df.loc[file_id],
                chord_reduction=chord_reduction,
                use_inversions=use_inversions,
                use_relative=use_relative,
                name=f"{file_id}: {file_row['corpus_name']}",
            )
        except Exception:
            # Best effort: skip pieces that cannot be parsed, but keep the
            # traceback (logging.exception logs at ERROR level and appends it).
            logging.exception("No data created for file_id %s", file_id)
            continue

        # NOTE(review): get_keys / get_chords / get_key_change_indices are
        # presumably plain accessors, so each is called once and reused.
        keys = piece.get_keys()
        chords = piece.get_chords()
        key_change_indices = list(piece.get_key_change_indices())

        # One symbol per key: scale degree of the local tonic relative to the
        # global tonic, followed by the local mode name.
        key_symbols = [
            hu.get_scale_degree_from_interval(
                key.local_tonic - key.global_tonic, key.global_mode, PitchType.TPC
            )
            + ":" + str(key.local_mode).split('.')[1]
            for key in keys
        ]
        # update(), not add(): the set must hold the individual (hashable)
        # symbol strings; adding the list itself raises TypeError.
        key_labels[keys[0].global_mode].update(key_symbols)

        with open(key_dir / f"{composer}.csv", "a+") as key_file:
            key_file.write(",".join(key_symbols) + "\n")

        with open(chord_dir / f"{composer}.csv", "a+") as chord_file:
            # Each key section spans from one key-change index up to (but not
            # including) the next one; the last section runs to the final chord.
            for start, end in zip(
                key_change_indices,
                key_change_indices[1:] + [len(chords)],
            ):
                mode = chords[start].key_mode
                # One symbol per chord: scale degree of the root relative to the
                # chord's key tonic, followed by the first three characters of
                # the chord type name.
                chord_symbols = [
                    hu.get_scale_degree_from_interval(chord.root - chord.key_tonic, mode, PitchType.TPC) +
                    ":" + str(chord.chord_type).split('.')[1][:3]
                    for chord in chords[start:end]
                ]
                chord_labels[mode].update(chord_symbols)
                chord_file.write(str(mode).split(".")[1] + ";" + ",".join(chord_symbols) + "\n")

In [None]:
# Write the label vocabularies to base_dir, one symbol per line in sorted order:
# a major-mode file, a minor-mode file, and a "full" file holding their union,
# first for chord labels and then for key labels.
(base_dir / "chord_vocab_major.txt").write_text("\n".join(sorted(chord_labels[KeyMode.MAJOR])))
(base_dir / "chord_vocab_minor.txt").write_text("\n".join(sorted(chord_labels[KeyMode.MINOR])))
(base_dir / "chord_vocab_full.txt").write_text(
    "\n".join(sorted(chord_labels[KeyMode.MINOR] | chord_labels[KeyMode.MAJOR]))
)

(base_dir / "key_vocab_major.txt").write_text("\n".join(sorted(key_labels[KeyMode.MAJOR])))
(base_dir / "key_vocab_minor.txt").write_text("\n".join(sorted(key_labels[KeyMode.MINOR])))
(base_dir / "key_vocab_full.txt").write_text(
    "\n".join(sorted(key_labels[KeyMode.MINOR] | key_labels[KeyMode.MAJOR]))
)

In [None]:
# Recreate labels from csv, if necessary
from glob import glob

chord_labels = {KeyMode.MAJOR: set(), KeyMode.MINOR: set()}
key_labels = {KeyMode.MAJOR: set(), KeyMode.MINOR: set()}

# Chord rows have the form "<MODE>;<label>,<label>,...": the text before the
# ';' selects the bucket ("MAJOR" -> major, anything else -> minor).
for path in glob('ML_course_data/chord/*.csv'):
    with open(path, 'r') as fp:
        for line in fp:
            key_label, chords = line.strip().split(";")
            mode = KeyMode.MAJOR if key_label == "MAJOR" else KeyMode.MINOR
            chord_labels[mode].update(chords.split(","))

# Key rows are comma-separated "<degree>:<MODE>" symbols: the mode of the
# first symbol on the row selects the bucket for the whole row.
for path in glob('ML_course_data/key/*.csv'):
    with open(path, 'r') as fp:
        for line in fp:
            keys = line.strip().split(",")
            key_label = keys[0].split(":")[1]
            mode = KeyMode.MAJOR if key_label == "MAJOR" else KeyMode.MINOR
            key_labels[mode].update(keys)

In [None]:
# Making labels version 2: numeral/chord_type/relativeroots

# Make data directories
base_dir = Path("ML_course_data")

# NOTE: this rebinds chord_dir, which from here on refers to the "chord_v2"
# folder. Any previous contents of that folder are removed.
chord_dir = base_dir.joinpath("chord_v2")
shutil.rmtree(chord_dir, ignore_errors=True)
chord_dir.mkdir(parents=True, exist_ok=True)

In [None]:
# Create the version-2 chord label files directly from chords_df.
#
# For every composer, chord_dir / "<composer>.csv" receives one row per key
# section of each piece, formatted "<MAJOR|MINOR>;<label>,<label>,...", where a
# label is the concatenation of the columns listed in label_columns and
# consecutive repeated labels within a section are collapsed into one.
#
# NOTE: files are opened in append mode, so re-run the cell that resets the v2
# directory before re-running this one, otherwise rows are duplicated.

# Maps each reduced chord-type string to the original chord_type values it replaces.
chord_type_reduction = {
    '': ['M', 'm', 'Mm7', 'mm7', 'MM7', 'mM7'],
    'o': ['o', 'o7', '%7'],
    '+': ['+', '+7'],
}
use_inversions = False

# Only save important columns
if use_inversions:
    label_columns = ["numeral", "chord_type", "figbass", "relativeroot"]
else:
    label_columns = ["numeral", "chord_type", "relativeroot"]

key_columns = ["globalkey", "globalkey_is_minor", "localkey", "localkey_is_minor"]
all_columns = label_columns + key_columns

# Keep only rows that have a numeral; copy so chords_df itself is never modified.
chords_df_v2 = chords_df.loc[~chords_df.numeral.isnull(), all_columns].copy()

# Add / to front of non-empty relative roots
has_relative_root = chords_df_v2["relativeroot"].notnull()
chords_df_v2.loc[has_relative_root, "relativeroot"] = (
    "/" + chords_df_v2.loc[has_relative_root, "relativeroot"]
)

# Reduce chord types
for new_type, orig_types in chord_type_reduction.items():
    chords_df_v2.loc[chords_df_v2["chord_type"].isin(orig_types), "chord_type"] = new_type

# Fill null cells with empty string
chords_df_v2 = chords_df_v2.fillna("")

# Concatenate the label columns row-wise into a single label string
# (summing an object array of strings along axis 1 joins them).
chords_df_v2["label"] = chords_df_v2[label_columns].values.sum(axis=1)

for composer in composers:
    # startswith instead of contains because of WFBach/Bach
    composer_df = files_df.loc[files_df["corpus_name"].str.startswith(composer)]

    for file_id, file_row in tqdm(
        composer_df.iterrows(),
        desc=f"Creating {composer} data",
        total=len(composer_df),
    ):
        try:
            this_chords_df = chords_df_v2.loc[file_id]
        except KeyError:
            # Only a missing file_id is expected here; anything else should surface.
            logging.warning("No chord data for file_id %s", file_id)
            continue

        # A row starts a new key section when any key column differs from the
        # previous row (the first row always differs from the shifted-in NaN).
        key_changes = (
            this_chords_df[key_columns].shift() != this_chords_df[key_columns]
        ).any(axis=1)
        key_indexes = list(this_chords_df.index[key_changes])

        # Write out to file (opened once per piece, one row per key section)
        with open(chord_dir / f"{composer}.csv", "a+") as chord_file:
            for key_start, key_end in zip(key_indexes, key_indexes[1:] + [None]):
                # .loc slices are label-based and inclusive, so stop one label
                # before the next section's first row; None means "to the end".
                # NOTE(review): assumes integer index labels - confirm.
                if key_end is not None:
                    key_end -= 1

                this_key_chords_df = this_chords_df.loc[key_start:key_end]
                key_string = "MINOR" if this_key_chords_df.iloc[0]["localkey_is_minor"] else "MAJOR"

                # Keep only the rows where the label changes from the previous row.
                chord_changes = this_key_chords_df["label"].shift() != this_key_chords_df["label"]
                labels_list = list(this_key_chords_df.loc[chord_changes, "label"])

                chord_file.write(key_string + ";" + ",".join(labels_list) + "\n")
