In [None]:
import wandb
import os
# Ask Weights & Biases to stay quiet (set before any wandb call is made).
os.environ.update(WANDB_SILENT="true")

# Key into the `links` table of the next cell; selects which checkpoint
# artifact gets downloaded and analysed in this notebook.
run_name = "noble-field-7"

In [None]:
# Map of human-readable run names to the W&B artifact holding that run's checkpoint.
links = {
    "GOOD_AVERAGE_glamorous-sweep-62": "mmil_vae_g2d/voice_distribution_and_genre_distribution_imbalance/model_epoch_100:v283",
    "GOOD_azure-sweep-54": "mmil_vae_g2d/voice_distribution_and_genre_distribution_imbalance/model_epoch_100:v279",
    "GOOD_apricot-sweep-17": "mmil_vae_g2d/voice_distribution_and_genre_distribution_imbalance/model_epoch_100:v242",
    "GOOD_hearty-sweep-60": "mmil_vae_g2d/voice_distribution_and_genre_distribution_imbalance/model_epoch_100:v280",
    "GOOD_worldly-sweep-22": "mmil_vae_g2d/voice_distribution_and_genre_distribution_imbalance/model_epoch_100:v245",
    "GOOD_legendary-sweep-5": "mmil_vae_g2d/voice_distribution_and_genre_distribution_imbalance/model_epoch_100:v230",
    "drawn_river_6": "mmil_vae_g2d/beta_annealing_study/model_epoch_100:v2",
    "worldly-firebrand-5": "mmil_vae_g2d/beta_annealing_study/model_epoch_100:v1",
    "noble-field-7": "mmil_vae_g2d/beta_annealing_study/model_epoch_100:v3",
    "young-violet-12": "mmil_vae_g2d/beta_annealing_study/model_epoch_700:v0",
    "kind-gorge-14": "mmil_vae_g2d/beta_annealing_study/model_epoch_500:v1",
}

# Validate the selection *before* starting a W&B run, so that a typo in
# `run_name` neither creates an orphan run nor surfaces as a bare KeyError.
if run_name not in links:
    raise KeyError(
        f"Unknown run_name {run_name!r}; expected one of: {sorted(links)}"
    )

# Start a run (no project/entity arguments are given, so wandb's defaults apply),
# register the chosen artifact as an input of that run and fetch its files.
run = wandb.init()
artifact = run.use_artifact(links[run_name], type='model')
# `artifact_dir` is the local directory the artifact files were downloaded to.
artifact_dir = artifact.download()

In [None]:
import sys

# Make the modules two directories up (e.g. `helpers`, `model`, `data`) importable.
# Guarded so that re-running this cell does not stack duplicate entries on sys.path.
if "../.." not in sys.path:
    sys.path.insert(0, "../..")

In [None]:
from helpers import load_variational_mgt_model
from model import GrooveTransformerEncoderVAE
import torch

In [None]:
from helpers import load_variational_mgt_model   
import os

In [None]:
# Resolve the checkpoint file inside the downloaded artifact directory.
# "100.pth" is kept as the preferred name (it is what this cell always loaded),
# but `links` also lists artifacts from other epochs (model_epoch_700,
# model_epoch_500) whose checkpoint file is presumably named differently —
# in that case fall back to the single .pth file found in the directory.
checkpoint_names = sorted(
    name for name in os.listdir(artifact_dir) if name.endswith(".pth")
)
if "100.pth" in checkpoint_names:
    checkpoint_name = "100.pth"
elif len(checkpoint_names) == 1:
    checkpoint_name = checkpoint_names[0]
else:
    raise FileNotFoundError(
        f"Expected '100.pth' or exactly one .pth file in {artifact_dir!r}, "
        f"found: {checkpoint_names}"
    )

model = load_variational_mgt_model(os.path.join(artifact_dir, checkpoint_name))

In [None]:
from data import load_gmd_hvo_sequences

# Dataset settings file, relative to this notebook's working directory.
dataset_settings_path = "../../data/dataset_json_settings/4_4_Beats_gmd.json"

# NOTE: despite the variable name, this loads the "validation" subset.
train_set = load_gmd_hvo_sequences(
    dataset_setting_json_path=dataset_settings_path,
    subset_tag="validation",
    force_regenerate=False,
)

In [None]:
# Use the first sequence of the loaded subset as a reference example.
gt_sample = train_set[0]

# Flatten the voices of the sample; wrapping the result in a list adds a
# leading axis of size 1 when it is converted to a float32 tensor.
flattened_hvo = gt_sample.flatten_voices(reduce_dim=True)
groove = torch.tensor([flattened_hvo], dtype=torch.float32)

# Show the metadata attached to the reference sample.
gt_sample.metadata

In [None]:
# Encode the reference groove into its (mu, logvar) pair, then obtain a latent
# vector from that pair through the model's reparametrization step.
encoded = model.encode_to_mu_logvar(groove)
mu, logvar = encoded
latent_z = model.reparametrize(mu, logvar)

In [None]:
# Inspect the latent vector obtained for the reference groove.
latent_z

In [None]:
# One threshold and one max-count entry per list position (9 entries each);
# presumably one per drum voice — confirm against `model.sample`.
voice_thresholds = [0.5 for _ in range(9)]
voice_max_count_allowed = [32 for _ in range(9)]

sampling_options = {
    "voice_thresholds": voice_thresholds,
    "voice_max_count_allowed": voice_max_count_allowed,
    "return_concatenated": False,  # result is unpacked into three parts below
    "sampling_mode": 0,
}

# Decode the latent vector of the reference groove and dump the three outputs.
h, v, o = model.sample(latent_z=latent_z, **sampling_options)
print(h, v, o)

In [None]:
# from sklearn.datasets import load_digits
# from sklearn.model_selection import train_test_split
# from sklearn.preprocessing import StandardScaler


In [None]:
import numpy as np

# Encode every (selected) sequence of `train_set` into a latent vector and keep
# the matching style label, metadata and an empty copy of the sequence.
latents = []
labels = []
metadatas = []
use_all_styles = True
selected_styles = ["rock", "funk", "afrobeat"]  # only consulted when use_all_styles is False
empty_hvo_seqs = []

# NOTE(review): the stored latents come from `model.reparametrize(mu, logvar)`,
# not from `mu` itself — presumably a random draw, so values may differ between
# runs unless a seed is set; confirm this is intended.
# Inference only: disable autograd so no graph is built for each encoded sample.
with torch.no_grad():
    for gt_sample in train_set:
        if not (use_all_styles or (gt_sample.metadata["style_primary"] in selected_styles)):
            continue

        empty_hvo_seqs.append(gt_sample.copy_empty())
        metadatas.append(gt_sample.metadata)
        labels.append(gt_sample.metadata["style_primary"])

        # Convert through a single ndarray (instead of a Python list holding an
        # ndarray, which PyTorch converts slowly), add the leading axis of size 1
        # and keep at most the first 32 time steps.
        flattened_ = torch.tensor(
            np.asarray(gt_sample.flatten_voices(reduce_dim=True)),
            dtype=torch.float32,
        ).unsqueeze(0)[:, :32, :]
        t_steps = flattened_.shape[1]

        # Zero-pad shorter sequences up to the fixed (1, 32, 3) input.
        groove = torch.zeros((1, 32, 3))
        groove[:, :t_steps, :] = flattened_

        mu, logvar = model.encode_to_mu_logvar(groove)
        latent_z = model.reparametrize(mu, logvar)
        latents.append(latent_z.detach().cpu().numpy())

# Stack the per-sample arrays and drop their size-1 axis at position 1
# (assumes each latent_z is (1, latent_dim), giving (n_samples, latent_dim)).
latents = np.array(latents).squeeze(1)
features = np.expand_dims(latents, -1) # we use each dimension of latent_z as a feature
feature_labels = [f"z_{dim}" for dim in range(features.shape[1])]

In [None]:
import pandas as pd

# One column per latent dimension (z_0, z_1, ...), one row per encoded sample.
latent_columns = {
    f"z_{dim_i}": latents[:, dim_i] for dim_i in range(features.shape[1])
}

# The style label comes first, followed by the latent columns.
data = {"style_primary": list(labels), **latent_columns}
df = pd.DataFrame(data)
df.head()

In [None]:
from bokeh.io import output_notebook
from bokeh.plotting import show
import IPython.display as ipd
import sys
# NOTE(review): disabled attempt to put a local fluidsynth location on sys.path;
# presumably no longer needed — confirm before removing.
# sys.path.insert(0, '/usr/local/bin/fluidsynth')
# Configure Bokeh so that `show(...)` renders its output inside notebook cells.
output_notebook()

In [None]:
# Thresholds handed to `model.sample` (9 entries, presumably one per drum
# voice — confirm); positions 1 and 3 are overridden by hand.
voice_thresholds = [0.3] * 9
voice_thresholds[1] = 0.5
voice_thresholds[3] = 0.1

# voice_thresholds[-2] = 0.01
# voice_thresholds[-3] = 0.01

# Draw one random latent vector: each dimension is sampled independently from a
# normal distribution fitted (mean / std) to that dimension's column in `df`.
# Alternative kept for reference — uniform draw within the observed range:
# random_z = [np.random.uniform(df[f"z_{i}"].min(), df[f"z_{i}"].max()) for i in range(len(latents[0, :]))]
latent_dim = latents.shape[1]
random_z = [
    np.random.normal(loc=df[f"z_{i}"].mean(), scale=df[f"z_{i}"].std())
    for i in range(latent_dim)
]

hvo = model.sample(latent_z=torch.tensor(random_z, dtype=torch.float32),
                   voice_thresholds=voice_thresholds,
                   voice_max_count_allowed=[32]*9,
                   return_concatenated=True,
                   sampling_mode=0)

# NOTE(review): this writes the generated pattern into the shared
# `empty_hvo_seqs[0]` object in place, so that entry is no longer empty afterwards.
hvo_seq_ = empty_hvo_seqs[0]
hvo_seq_.hvo = hvo.detach().cpu().numpy()[0]

# Reset on every run so an empty score can neither raise a NameError below nor
# replay audio left over from a previous execution of this cell.
audio = None

if hvo_seq_.get_number_of_active_voices() > 0:
    # draw and synthesize
    show(hvo_seq_.piano_roll())
    audio = hvo_seq_.synthesize(sf_path="../../hvo_sequence/soundfonts/TamaRockSTAR.sf2")
else:
    print("Empty Score")

# Last expression of the cell: shows the audio player only when something was synthesized.
ipd.Audio(audio, rate=44100, autoplay=True) if audio is not None else None