# MIDI Feature Exploration Template

This notebook demonstrates how to stream classical MIDI files from Hugging Face, extract musical features, and run exploratory analyses.


In [None]:
# Uncomment the following line the first time you run this notebook
# %pip install datasets pretty_midi numpy scipy pandas scikit-learn seaborn matplotlib tqdm requests


In [None]:
from pathlib import Path

import pandas as pd
from datasets import Dataset
from tqdm.auto import tqdm

from src.data import extract_features_from_dataset, load_hf_dataset
from src.features import extract_midi_features


In [None]:
# Identifier of the dataset passed to load_hf_dataset (presumably a Hugging Face Hub repo id).
DATASET_NAME = "drengskapur/midi-classical-music"
# Name of the split requested from the dataset.
DATASET_SPLIT = "train"
# Destination of the extracted feature table; relative, so it resolves against the
# kernel's current working directory.
OUTPUT_PATH = Path("data/processed/midi_composer_features.csv")
# Cap forwarded to the extraction step as max_items and used as the progress-bar total.
MAX_ITEMS = 200  # set to None to process full split


# Helper to extract composer from URL (dataset specific)
def composer_from_url(url: str) -> str:
    """Return the second-to-last ``/``-separated segment of ``url``.

    For a URL shaped like ``.../<composer>/<file>`` this is the composer
    directory name.

    Raises:
        IndexError: if ``url`` contains no ``/`` at all.
    """
    # Only the last two separators matter, so split from the right.
    tail_segments = url.rsplit("/", 2)
    return tail_segments[-2]



In [None]:
# Open the configured split with streaming enabled.
# NOTE(review): with streaming=True the Hugging Face loader normally returns an
# IterableDataset rather than a Dataset — confirm what load_hf_dataset returns and
# adjust the annotation if needed.
dataset: Dataset = load_hf_dataset(DATASET_NAME, split=DATASET_SPLIT, streaming=True)
dataset


In [None]:
def stream_features():
    """Yield one flat dict per processed MIDI file.

    Each yielded dict merges ``result.metadata`` with ``result.features``; when
    both contain the same key, the value from ``features`` wins. The function
    reads the notebook-level ``dataset``, ``MAX_ITEMS`` and ``composer_from_url``.

    The progress bar is managed by a ``with`` block so it is closed even when
    extraction raises or the consumer stops iterating before the generator is
    exhausted.
    """
    # The bar is disabled entirely when MAX_ITEMS is None (no known total).
    with tqdm(total=MAX_ITEMS, disable=MAX_ITEMS is None, desc="MIDI files") as progress:

        def update(count):
            # The callback's count is treated as the absolute position of the bar,
            # so it is assigned rather than added.
            progress.n = count
            progress.refresh()

        for result in extract_features_from_dataset(
            dataset,
            extract_midi_features,
            max_items=MAX_ITEMS,
            composer_from_url=composer_from_url,
            progress_callback=update,
        ):
            yield {**result.metadata, **result.features}


# Drain the generator eagerly so all extracted rows are held in a list.
records = [record for record in stream_features()]
len(records)


In [None]:
# Assemble the extracted records into one table and preview the first five rows.
df = pd.DataFrame.from_records(data=records)
df.head(n=5)


In [None]:
# Create the destination directory (and any missing parents) before writing.
OUTPUT_PATH.parent.mkdir(exist_ok=True, parents=True)
# Write the table without the DataFrame index column.
df.to_csv(path_or_buf=OUTPUT_PATH, index=False)
OUTPUT_PATH


In [None]:
# Mean of each feature per composer, ordered from highest to lowest mean
# pitch-class entropy.
(
    df.groupby("composer")
    .agg(
        {
            feature: "mean"
            for feature in (
                "pitch_class_entropy",
                "rhythmic_entropy",
                "polyphony",
                "note_density",
                "pitch_mean",
                "pitch_variance",
                "note_duration_mean",
                "note_duration_variance",
            )
        }
    )
    .sort_values(by="pitch_class_entropy", ascending=False)
)


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Pairwise relationships between the extracted features, coloured by composer.
# corner=True is passed so that only one triangle of the grid is drawn.
sns.pairplot(
    data=df,
    hue="composer",
    corner=True,
    vars=[
        "pitch_class_entropy",
        "rhythmic_entropy",
        "polyphony",
        "note_density",
        "pitch_mean",
        "pitch_variance",
        "note_duration_mean",
        "note_duration_variance",
    ],
)
plt.show()


In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

# Predictors and target for a baseline linear regression.
FEATURE_COLUMNS = [
    "pitch_class_entropy",
    "rhythmic_entropy",
    "polyphony",
    "note_density",
    "pitch_mean",
    "pitch_variance",
    "note_duration_mean",
    "note_duration_variance",
]
TARGET_COLUMN = "tempo_mean"

# Rows with no recorded target are excluded instead of being assigned a target of 0:
# a made-up tempo of 0 would be treated as a real label and distort both the fitted
# coefficients and the reported R^2.
regression_df = df.dropna(subset=[TARGET_COLUMN])

# Missing predictor values are still replaced by 0 so the estimator receives no NaNs.
# NOTE(review): 0 is not a neutral value for every feature (e.g. pitch_mean) —
# consider a proper imputation strategy or dropping incomplete rows.
X = regression_df[FEATURE_COLUMNS].fillna(0)
y = regression_df[TARGET_COLUMN]

# Fixed random_state keeps the 80/20 split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

model = LinearRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)

# Score on the held-out rows, then show one fitted coefficient per feature.
print("R^2:", r2_score(y_test, y_pred))
pd.DataFrame(
    {
        "feature": FEATURE_COLUMNS,
        "coefficient": model.coef_,
    }
)


## Next Steps

- Tune `MAX_ITEMS` and experiment with different composers.
- Expand `extract_midi_features` to include additional metrics relevant to your analysis.
- Replace the linear model with an estimator better suited to your question if needed (e.g., a classifier that predicts the composer).
