In [None]:
from abc_utils import *
import pandas as pd
from hmmlearn import hmm
import numpy as np
from sklearn.metrics import accuracy_score


In [15]:
train_df, train_lengths, train_indicies, val_df, val_lengths, val_indicies = load_datasets()

Demonstration of using OG_Dataset()

In [None]:
# Instantiate the object
og_dataset = OG_Dataset()

# Filter the training set to only contain songs in 'major' keys
major_train_df, major_train_lengths = og_dataset.filter_df(train_df, train_lengths, col_to_filter='keys', filter_str='major', split_type='train')

# Filter the validation set to only contain songs in 'major' keys
major_val_df, major_val_lengths  = og_dataset.filter_df(train_df, train_lengths, col_to_filter='keys', filter_str='major', split_type = 'val')

In [17]:
# Filter the training set to only contain songs in 'minor' keys
minor_train_df, minor_train_lengths = og_dataset.filter_df(train_df, train_lengths, col_to_filter='keys', filter_str='minor', split_type='train')

# Filter the validation set to only contain songs in 'minor' keys
minor_val_df, minor_val_lengths  = og_dataset.filter_df(train_df, train_lengths, col_to_filter='keys', filter_str='minor', split_type = 'val')

### Using these Data subsets (This is basically identical to pomegrante_test_Adam.ipynb)

In [18]:
def load_song_subset(train_df, train_lengths, indices):
    """
    Create a subset of the train"""
    end_positions = np.cumsum(train_lengths)
    positions = np.insert(end_positions, 0, np.array([0]))
    songs = []
    for i in indices:
        song = train_df.iloc[positions[i] : positions[i+1]]
        songs.append(song)

    # return songs and lengths
    return pd.concat(songs), train_lengths.iloc[indices]

In [19]:
def ffill_obs(melody_obs: np.ndarray, unique_obs: dict) -> np.ndarray:
    # make a smaller array out of the unique observations
    possible_obs = list(set(unique_obs.flatten()))

    df_melody_obs = pd.Series(melody_obs)
    df_melody_obs[~df_melody_obs.isin(possible_obs)] = np.nan

    # fill forward first to fill all the holes
    df_melody_obs.ffill(inplace=True)

    # then fill backward to catch the case where
    # the beginning is empty
    df_melody_obs.bfill(inplace=True)

    return df_melody_obs.values.flatten()

In [20]:
def chord_accuracy(full_pred: np.array, true_states: np.array, num_chords: int=None, num_notes: int=None):
    '''
    Given the predicted matrix of states, compute the misclassification rate compared with the true_observations.
    Could be edited in the future to also compute the accuracy of our predicted note sequence.
    '''
    # check to make sure these are specified correctly
    if num_chords is None:
        raise ValueError("num_chords must be specified")
    if num_notes is None:
        raise ValueError("num_notes must be specified")
    
    # obtain the actual predicted chords 
    pred_chords = full_pred[:, num_chords-1]
    true_chords = true_states[:len(pred_chords), num_chords-1]

    # obtain the accuracy
    chord_acc = accuracy_score(true_chords, pred_chords)
    
    return chord_acc

In [21]:
def fit_model(train_df: pd.DataFrame, train_lengths: pd.Series, num_chords: int=1, num_notes: int=0, subset: bool=False, indices=None, lam: int=None, trans_prior: float=0.1, emissions_prior: float=0.1):
    """ 
    Takes in the train set and parameters for the state space and returns the trained model, along with all of the dictionaries needed to decode the model as a tuple.

    To train on a smaller subset of the full train set, use the subset argument and pass in the indices needed. Uses the load_song_subset function.
    """
    # check if we want to do a subset of the full train set; if so, perform it
    if subset:
        # check that indices are specified; raise and error if not
        if indices is None:
            raise ValueError("Indices must be specified if subset=True")
        train_df, _ = load_song_subset(train_df, train_lengths, indices)

    # obtain the states and observations from the songs
    true_states, true_observations = dataframe_to_states(train_df, num_chords, num_notes)
    
    # create the transition matrices for the model
    transition_matrix, emission_probs, unique_states, unique_obs, states_to_index, observation_to_index = states_to_transition(true_states, true_observations, lam, trans_prior, emissions_prior)

    # now initialize the model and set the matrices for it
    model = hmm.CategoricalHMM(n_components=transition_matrix.shape[0], init_params='')
    model.transmat_ = transition_matrix.T
    model.emissionprob_ = emission_probs.T

    # starting_state = np.zeros(unique_states.shape[1])
    # starting_state_index = states_to_index[tuple(starting_state)]

    # start_probs = np.zeros(transition_matrix.shape[0])
    # start_probs[starting_state_index] = 1
    
    model.startprob_ = np.ones(transition_matrix.shape[0]) / transition_matrix.shape[0]

    # return the model,  the dictionaries
    return model, (unique_states, unique_obs, states_to_index, observation_to_index)

In [22]:
def predict_states(model: hmm.CategoricalHMM, all_dicts: tuple, observation: np.ndarray, song_lengths: list):
    """
    Uses the model to decode an observation. The all_dicts tuple should contain the model dictionaries returned from fit_model
    Returns the predicted states.
    """
    # unpack the tuple to get what we need
    unique_states, unique_obs, _, observation_to_index = all_dicts

    # perform a forward fill on the observation in case there are any values in it that we have never seen before
    observation = ffill_obs(observation, unique_obs)
    
    # get the indices of the observation
    observation_indices = np.array([int(observation_to_index[(o,)]) for o in observation])

    # get the predicted state indices
    _, pred_indices = model.decode(observation_indices.reshape(-1, 1), lengths=song_lengths)

    # use the unique_states dictionary to take the indices to the actual states
    pred_states = unique_states[pred_indices, :]

    # return the predicted states
    return pred_states

In [23]:
def redact(seq, lam):
    """
    Redact a sequence of chords to only contain lam repetitions of any given chord in the sequence. Takes in a one-dimensional sequence and returns the 
    shortened sequence
    """
    if len(seq.shape) != 1:
        raise TypeError("array must be 1-dimensional")

    # start building a mask for the sequence: True if we are below lam repetitions, False otherwise
    curr_val = seq[0]
    length = 1
    mask = [True]

    # iterate through and create the mask
    for i in range(1, len(seq)):
        if seq[i] == curr_val:
            if length <= lam - 1:
                mask.append(True)
            else:
                mask.append(False)
            length += 1
        else:
            mask.append(True)
            curr_val = seq[i]
            length = 1

    # mask out the values and return
    mask = np.array(mask)
    return seq[mask]

In [24]:
def get_prediction(model, all_dicts, val_df: pd.DataFrame, val_lengths: pd.Series, num_chords: int=1, num_notes: int=0, subset: bool=False, indices=None, do_print: bool=True, print_accuracy: bool=True):
    if subset:
        val_df, val_lengths = load_song_subset(val_df, val_lengths, indices)
    
    true_states, new_song_obs = dataframe_to_states(val_df, num_chords, num_notes)

    # get the predicted states (chop off the first element of the songs because it added a 0)
    pred_states = predict_states(model, all_dicts, new_song_obs[1:], val_lengths.values.flatten().tolist())

    
    # print the results, then return the results and the accuracy
    if do_print:
        print("Pred\t\tTrue")
        cumul = np.cumsum(val_lengths.values)
        for i in range(len(pred_states)):
            if i in set(cumul):
                print("----- New Song -----")
            print(f"{pred_states[i]}\t\t{true_states[i]}")

    # get the accuracy
    accuracy = chord_accuracy(pred_states, true_states, num_chords, num_notes)

    if print_accuracy:
        print("Accuracy:", accuracy)

    return pred_states, accuracy

#### Major Performance

In [25]:
# fit the model
num_chords = 1
num_notes = 0

# run through the lambda values and get the accuracy on the validation set for that lambda value
for lam in [None, 4, 3, 2, 1]:

    # get the prediction
    print(f"\n\nlambda = {lam}")

    # train on major data only
    model, all_dicts = fit_model(major_train_df, major_train_lengths, num_chords, num_notes, lam=lam, trans_prior=0, emissions_prior=0)
    pred_states, major_model_major_acc = get_prediction(model, all_dicts, major_val_df, major_val_lengths, do_print=False)
    pred_states, major_model_all_key_acc = get_prediction(model, all_dicts, val_df, val_lengths, do_print=False)


    # train on all data
    model, all_dicts = fit_model(train_df, train_lengths, num_chords, num_notes, lam=lam, trans_prior=0, emissions_prior=0)
    pred_states, all_key_model_major_acc = get_prediction(model, all_dicts, major_val_df, major_val_lengths, do_print=False)
    pred_states, all_key_model_all_key_acc = get_prediction(model, all_dicts, val_df, val_lengths, do_print=False)

    print("Accuracy on Major only Dataset\n----------------------")
    print(f"major model: {major_model_major_acc:.4f}", f"all key model: {all_key_model_major_acc:.4f}")

    print("Accuracy on All Key Dataset\n----------------------")
    print(f"major model: {major_model_all_key_acc:.4f}", f"all key model: {all_key_model_all_key_acc:.4f}")
 




lambda = None


Processing states: 100%|██████████| 1931101/1931101 [02:12<00:00, 14567.27it/s]
Processing states: 100%|██████████| 495132/495132 [00:38<00:00, 12841.25it/s]


Accuracy: 0.11363232430947706


Processing states: 100%|██████████| 604396/604396 [00:41<00:00, 14616.11it/s]


Accuracy: 0.11216321749316673


Processing states: 100%|██████████| 2414976/2414976 [03:22<00:00, 11945.05it/s]
Processing states: 100%|██████████| 495132/495132 [00:31<00:00, 15967.11it/s]


Accuracy: 0.24673622387565336


Processing states: 100%|██████████| 604396/604396 [00:37<00:00, 15950.08it/s]


Accuracy: 0.242969840965195
Accuracy on Major only Dataset
----------------------
major model: 0.1136 all key model: 0.2467
Accuracy on All Key Dataset
----------------------
major model: 0.1122 all key model: 0.2430


lambda = 4


Processing states: 100%|██████████| 1931101/1931101 [01:59<00:00, 16158.15it/s]
Processing states: 100%|██████████| 495132/495132 [00:29<00:00, 16615.55it/s]


Accuracy: 0.24434696202224862


Processing states: 100%|██████████| 604396/604396 [00:36<00:00, 16589.81it/s]


Accuracy: 0.2516578534603141


Processing states: 100%|██████████| 2414976/2414976 [02:38<00:00, 15251.95it/s]
Processing states: 100%|██████████| 495132/495132 [00:30<00:00, 16093.15it/s]


Accuracy: 0.34373056073935837


Processing states: 100%|██████████| 604396/604396 [00:39<00:00, 15377.31it/s]


Accuracy: 0.3450684650460956
Accuracy on Major only Dataset
----------------------
major model: 0.2443 all key model: 0.3437
Accuracy on All Key Dataset
----------------------
major model: 0.2517 all key model: 0.3451


lambda = 3


Processing states: 100%|██████████| 1931101/1931101 [02:10<00:00, 14851.70it/s]
Processing states: 100%|██████████| 495132/495132 [00:32<00:00, 15069.42it/s]


Accuracy: 0.2681163810862558


Processing states: 100%|██████████| 604396/604396 [00:40<00:00, 14932.27it/s]


Accuracy: 0.2782397633339731


Processing states: 100%|██████████| 2414976/2414976 [02:43<00:00, 14784.86it/s]
Processing states: 100%|██████████| 495132/495132 [00:30<00:00, 16101.97it/s]


Accuracy: 0.3630950938335636


Processing states: 100%|██████████| 604396/604396 [00:36<00:00, 16446.69it/s]


Accuracy: 0.36602988768952804
Accuracy on Major only Dataset
----------------------
major model: 0.2681 all key model: 0.3631
Accuracy on All Key Dataset
----------------------
major model: 0.2782 all key model: 0.3660


lambda = 2


Processing states: 100%|██████████| 1931101/1931101 [02:17<00:00, 14042.31it/s]
Processing states: 100%|██████████| 495132/495132 [00:30<00:00, 16311.23it/s]


Accuracy: 0.2933258201853243


Processing states: 100%|██████████| 604396/604396 [00:38<00:00, 15634.42it/s]


Accuracy: 0.3041135282165997


Processing states: 100%|██████████| 2414976/2414976 [02:45<00:00, 14557.66it/s]
Processing states: 100%|██████████| 495132/495132 [00:32<00:00, 15176.45it/s]


Accuracy: 0.37633398770428894


Processing states: 100%|██████████| 604396/604396 [00:39<00:00, 15160.18it/s]


Accuracy: 0.3815230411849185
Accuracy on Major only Dataset
----------------------
major model: 0.2933 all key model: 0.3763
Accuracy on All Key Dataset
----------------------
major model: 0.3041 all key model: 0.3815


lambda = 1


Processing states: 100%|██████████| 1931101/1931101 [02:06<00:00, 15275.46it/s]
Processing states: 100%|██████████| 495132/495132 [00:35<00:00, 13978.53it/s]


Accuracy: 0.24422780187909487


Processing states: 100%|██████████| 604396/604396 [00:39<00:00, 15247.47it/s]


Accuracy: 0.24764889244799768


Processing states: 100%|██████████| 2414976/2414976 [02:38<00:00, 15224.76it/s]
Processing states: 100%|██████████| 495132/495132 [00:34<00:00, 14443.35it/s]


Accuracy: 0.2708974576476576


Processing states: 100%|██████████| 604396/604396 [00:41<00:00, 14499.13it/s]


Accuracy: 0.2731503848470208
Accuracy on Major only Dataset
----------------------
major model: 0.2442 all key model: 0.2709
Accuracy on All Key Dataset
----------------------
major model: 0.2476 all key model: 0.2732


In [26]:
# get the true states (should be the same no matter what)
num_chords = 1
num_notes = 0
true_states, _ = dataframe_to_states(major_val_df, num_chords, num_notes)

# get the accuracy of the all I sequence
baseline = accuracy_score(true_states[:, num_chords-1].flatten(), np.ones(len(true_states)))
print("Baseline Accuracy:", baseline)

Processing states: 100%|██████████| 495132/495132 [00:36<00:00, 13737.24it/s]

Baseline Accuracy: 0.3368670640009856





#### Minor Performance

In [27]:
# fit the model
num_chords = 1
num_notes = 0

# run through the lambda values and get the accuracy on the validation set for that lambda value
for lam in [None, 4, 3, 2, 1]:

    # get the prediction
    print(f"\n\nlambda = {lam}")

    # train on minor data only
    model, all_dicts = fit_model(minor_train_df, minor_train_lengths, num_chords, num_notes, lam=lam, trans_prior=0, emissions_prior=0)
    pred_states, minor_model_minor_acc = get_prediction(model, all_dicts, minor_val_df, minor_val_lengths, do_print=False)
    pred_states, minor_model_all_key_acc = get_prediction(model, all_dicts, val_df, val_lengths, do_print=False)


    # train on all data
    model, all_dicts = fit_model(train_df, train_lengths, num_chords, num_notes, lam=lam, trans_prior=0, emissions_prior=0)
    pred_states, all_key_model_minor_acc = get_prediction(model, all_dicts, minor_val_df, minor_val_lengths, do_print=False)
    pred_states, all_key_model_all_key_acc = get_prediction(model, all_dicts, val_df, val_lengths, do_print=False)

    print("Accuracy on Minor only Dataset\n----------------------")
    print(f"minor model: {minor_model_minor_acc:.4f}", f"all key model: {all_key_model_minor_acc:.4f}")

    print("Accuracy on All Key Dataset\n----------------------")
    print(f"minor model: {minor_model_all_key_acc:.4f}", f"all key model: {all_key_model_all_key_acc:.4f}")



lambda = None


Processing states: 100%|██████████| 328264/328264 [00:24<00:00, 13491.15it/s]
Processing states: 100%|██████████| 73259/73259 [00:06<00:00, 11269.83it/s]


Accuracy: 0.3087129226443167


Processing states: 100%|██████████| 604396/604396 [00:41<00:00, 14606.33it/s]


Accuracy: 0.30268565642393397


Processing states: 100%|██████████| 2414976/2414976 [02:34<00:00, 15650.53it/s]
Processing states: 100%|██████████| 73259/73259 [00:04<00:00, 16721.02it/s]


Accuracy: 0.2528290039449078


Processing states: 100%|██████████| 604396/604396 [00:38<00:00, 15702.25it/s]


Accuracy: 0.242969840965195
Accuracy on Minor only Dataset
----------------------
minor model: 0.3087 all key model: 0.2528
Accuracy on All Key Dataset
----------------------
minor model: 0.3027 all key model: 0.2430


lambda = 4


Processing states: 100%|██████████| 328264/328264 [00:20<00:00, 15648.97it/s]
Processing states: 100%|██████████| 73259/73259 [00:04<00:00, 16707.81it/s]


Accuracy: 0.29484431946928025


Processing states: 100%|██████████| 604396/604396 [00:39<00:00, 15220.63it/s]


Accuracy: 0.2897570467044785


Processing states: 100%|██████████| 2414976/2414976 [02:37<00:00, 15322.82it/s]
Processing states: 100%|██████████| 73259/73259 [00:04<00:00, 14788.42it/s]


Accuracy: 0.3529532207646842


Processing states: 100%|██████████| 604396/604396 [00:36<00:00, 16461.15it/s]


Accuracy: 0.3450684650460956
Accuracy on Minor only Dataset
----------------------
minor model: 0.2948 all key model: 0.3530
Accuracy on All Key Dataset
----------------------
minor model: 0.2898 all key model: 0.3451


lambda = 3


Processing states: 100%|██████████| 328264/328264 [00:20<00:00, 16279.50it/s]
Processing states: 100%|██████████| 73259/73259 [00:04<00:00, 16917.81it/s]


Accuracy: 0.3011780122578796


Processing states: 100%|██████████| 604396/604396 [00:36<00:00, 16741.87it/s]


Accuracy: 0.29485304336891704


Processing states: 100%|██████████| 2414976/2414976 [02:29<00:00, 16118.65it/s]
Processing states: 100%|██████████| 73259/73259 [00:04<00:00, 17078.40it/s]


Accuracy: 0.3752576475245362


Processing states: 100%|██████████| 604396/604396 [00:38<00:00, 15843.47it/s]


Accuracy: 0.36602988768952804
Accuracy on Minor only Dataset
----------------------
minor model: 0.3012 all key model: 0.3753
Accuracy on All Key Dataset
----------------------
minor model: 0.2949 all key model: 0.3660


lambda = 2


Processing states: 100%|██████████| 328264/328264 [00:20<00:00, 16183.53it/s]
Processing states: 100%|██████████| 73259/73259 [00:04<00:00, 17130.24it/s]


Accuracy: 0.3076209066462824


Processing states: 100%|██████████| 604396/604396 [00:36<00:00, 16665.07it/s]


Accuracy: 0.3036138558163853


Processing states: 100%|██████████| 2414976/2414976 [02:32<00:00, 15884.80it/s]
Processing states: 100%|██████████| 73259/73259 [00:04<00:00, 16651.93it/s]


Accuracy: 0.38613685690495364


Processing states: 100%|██████████| 604396/604396 [00:37<00:00, 15985.33it/s]


Accuracy: 0.3815230411849185
Accuracy on Minor only Dataset
----------------------
minor model: 0.3076 all key model: 0.3861
Accuracy on All Key Dataset
----------------------
minor model: 0.3036 all key model: 0.3815


lambda = 1


Processing states: 100%|██████████| 328264/328264 [00:20<00:00, 15788.10it/s]
Processing states: 100%|██████████| 73259/73259 [00:04<00:00, 16315.33it/s]


Accuracy: 0.14467846953957875


Processing states: 100%|██████████| 604396/604396 [00:37<00:00, 15962.56it/s]


Accuracy: 0.14789641228598469


Processing states: 100%|██████████| 2414976/2414976 [02:34<00:00, 15586.56it/s]
Processing states: 100%|██████████| 73259/73259 [00:04<00:00, 16412.04it/s]


Accuracy: 0.2695504989148091


Processing states: 100%|██████████| 604396/604396 [00:39<00:00, 15479.28it/s]


Accuracy: 0.2731503848470208
Accuracy on Minor only Dataset
----------------------
minor model: 0.1447 all key model: 0.2696
Accuracy on All Key Dataset
----------------------
minor model: 0.1479 all key model: 0.2732


In [28]:
# get the true states (should be the same no matter what)
num_chords = 1
num_notes = 0
true_states, _ = dataframe_to_states(minor_val_df, num_chords, num_notes)

# get the accuracy of the all I sequence
baseline = accuracy_score(true_states[:, num_chords-1].flatten(), np.ones(len(true_states)))
print("Baseline Accuracy:", baseline)

Processing states: 100%|██████████| 73259/73259 [00:04<00:00, 14958.27it/s]

Baseline Accuracy: 0.3389298389298389



