In [1]:
import numpy as np
import bnpy, os, util

import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = (18, 8)
plt.rcParams['font.size'] = 20

from bnpy.allocmodel.hmm.HMMUtil import runViterbiAlg
from bnpy.util.StateSeqUtil import alignEstimatedStateSeqToTruth
from bnpy.util.StateSeqUtil import calcHammingDistance

from scipy import stats

In [2]:
# Settings shared by every util.run_experiment call in this notebook.
alloc_model = 'FiniteHMM'  # allocation model name
alg = 'VB'  # learning algorithm name; plot_loss/plot_hamming read info_dict['KwArgs']['VB']

# MoCap6 dataset

In [3]:
# Load the MoCap6 dataset from bnpy's dataset directory (bnpy.DATASET_PATH)
mocap_path = os.path.join(bnpy.DATASET_PATH, 'mocap6', 'dataset.mat')
mocap_dataset = bnpy.data.GroupXData.read_mat(mocap_path)

# Define hyperparameters
obs_model = 'AutoRegGauss'  # observation model name passed to util.run_experiment
K = 20  # number of clusters (the run log prints "K = 20 (number of clusters)")
out_path = '/tmp/mocap6-K=%d' % (K)  # results directory; hard-coded under /tmp

# plot_loss / plot_hamming iterate the result as (model, info_dict, label) triples.
# NOTE(review): the stored output of this cell ends in KeyboardInterrupt, so
# mocap_experiment is only defined if the cell is re-run to completion.
mocap_experiment = util.run_experiment(mocap_dataset, alloc_model, obs_model, alg, K, out_path, n_task=1)

Training dense model
Dataset Summary:
GroupXData
  size: 6 units (documents)
  dimension: 12
Allocation Model:  None
Obs. Data  Model:  Auto-Regressive Gaussian with full covariance.
Obs. Data  Prior:  MatrixNormal-Wishart on each mean/prec matrix pair: A, Lam
  E[ A ] = 
  [[ 1.  0.]
   [ 0.  1.]] ...
  E[ Sigma ] = 
  [[ 1.  0.]
   [ 0.  1.]] ...
Initialization:
  initname = randexamples
  K = 20 (number of clusters)
  seed = 1607680
  elapsed_time: 0.1 sec
Learn Alg: VB | task  1/1 | alg. seed: 1607680 | data order seed: 8541952
task_output_path: /tmp/mocap6-K=20/dense/1
        1/500 after      0 sec. |    133.5 MiB | K   20 | loss  2.899921971e+00 |  
        2/500 after      1 sec. |    134.1 MiB | K   20 | loss  2.704977383e+00 | Ndiff   53.638 


KeyboardInterrupt: 

In [None]:
# NOTE: plot_loss is defined in the following cell; on a fresh kernel that
# cell must be executed before this one or this call raises NameError.
plot_loss(mocap_experiment)

In [None]:
def plot_loss(experiment_out, title=None):
    """Show two figures: loss per iteration for every run, then final loss vs. L.

    Parameters
    ----------
    experiment_out : iterable of (model, info_dict, label)
        Each info_dict must provide 'loss', 'lap_history', 'loss_history',
        'K_history' and info_dict['KwArgs']['VB'] with the keys
        'nnzPerRowLP' (L) and 'blockedLP'.
    title : str or None
        Optional title applied to both figures (skipped when None).

    Notes
    -----
    Runs with L equal to 0 or 1 contribute to both the one-pass and two-pass
    series. The last collected entry is drawn as the 'Dense model' horizontal
    line and is excluded from the two curves -- this presumably relies on the
    dense run being the last element of experiment_out; TODO confirm against
    util.run_experiment.
    """
    palette = ['red', 'green', 'blue', 'orange', 'purple']
    sparsity_levels = []
    one_pass_losses = []
    two_pass_losses = []

    # --- Figure 1: loss trace of every run ---
    for _, info, run_label in experiment_out:
        final_loss = info['loss']
        vb_kwargs = info['KwArgs']['VB']
        nnz = vb_kwargs['nnzPerRowLP']
        is_blocked = vb_kwargs['blockedLP']

        # Dashed lines mark blocked (two-pass) runs; color encodes L.
        if is_blocked:
            line_style = '--'
        else:
            line_style = '-'
        if nnz > 0:
            line_color = palette[nnz % 5]
        else:
            line_color = 'yellow'
        plt.plot(info['lap_history'], info['loss_history'],
                 color=line_color, linestyle=line_style,
                 linewidth=3, label=run_label)

        # Collect the final loss of this run for the loss-vs-L figure.
        if nnz in (0, 1):
            if nnz == 1:
                sparsity_levels.append(nnz)
            else:
                # L == 0: use the last recorded K as the x-position.
                sparsity_levels.append(info['K_history'][-1])
            two_pass_losses.append(final_loss)
            one_pass_losses.append(final_loss)
        elif is_blocked:
            sparsity_levels.append(nnz)
            two_pass_losses.append(final_loss)
        else:
            one_pass_losses.append(final_loss)

    plt.xlabel('iteration')
    plt.ylabel('loss')
    plt.legend()
    if title is not None:
        plt.title(title)
    plt.show()

    # --- Figure 2: final loss as a function of L ---
    curve_style = dict(marker='o', linewidth=3)
    plt.plot(sparsity_levels[:-1], one_pass_losses[:-1],
             label='One-pass sparse model', **curve_style)
    plt.plot(sparsity_levels[:-1], two_pass_losses[:-1],
             label='Two-pass sparse model', **curve_style)
    plt.hlines(one_pass_losses[-1], sparsity_levels[0], sparsity_levels[-2],
               label='Dense model')
    plt.xlabel('L')
    plt.ylabel('loss')
    plt.legend()
    if title is not None:
        plt.title(title)
    plt.show()

In [None]:
# NOTE: plot_hamming (and compute_hamming, which it calls) are defined in the
# following cell; on a fresh kernel that cell must be executed first.
plot_hamming(mocap_experiment)

In [None]:
def plot_hamming(experiment_out, title=None):
    """Show two figures: Hamming distance per iteration, then final distance vs. L.

    Parameters
    ----------
    experiment_out : iterable of (model, info_dict, label)
        Each info_dict is passed to compute_hamming and must also provide
        'lap_history', 'K_history' and info_dict['KwArgs']['VB'] with the keys
        'nnzPerRowLP' (L) and 'blockedLP'.
    title : str or None
        Optional title applied to both figures (skipped when falsy).

    Notes
    -----
    Runs with L equal to 0 or 1 contribute to both the one-pass and two-pass
    series. The last collected entry is drawn as the 'Dense model' horizontal
    line and is excluded from the two curves -- this presumably relies on the
    dense run being the last element of experiment_out; TODO confirm against
    util.run_experiment.
    """
    palette = ['red', 'green', 'blue', 'orange', 'purple']
    sparsity_levels = []
    one_pass_dists = []
    two_pass_dists = []

    # --- Figure 1: Hamming-distance trace of every run ---
    for _, info, run_label in experiment_out:
        vb_kwargs = info['KwArgs']['VB']
        nnz = vb_kwargs['nnzPerRowLP']
        is_blocked = vb_kwargs['blockedLP']

        dist_trace = compute_hamming(info)
        final_dist = dist_trace[-1]

        # Dashed lines mark blocked (two-pass) runs; color encodes L.
        if is_blocked:
            line_style = '--'
        else:
            line_style = '-'
        if nnz > 0:
            line_color = palette[nnz % 5]
        else:
            line_color = 'yellow'
        plt.plot(info['lap_history'], dist_trace,
                 color=line_color, linestyle=line_style,
                 linewidth=3, label=run_label)

        # Collect the final Hamming distance for the distance-vs-L figure.
        if nnz in (0, 1):
            if nnz == 1:
                sparsity_levels.append(nnz)
            else:
                # L == 0: use the last recorded K as the x-position.
                sparsity_levels.append(info['K_history'][-1])
            two_pass_dists.append(final_dist)
            one_pass_dists.append(final_dist)
        elif is_blocked:
            sparsity_levels.append(nnz)
            two_pass_dists.append(final_dist)
        else:
            one_pass_dists.append(final_dist)

    plt.xlabel('iteration')
    plt.ylabel('Hamming distance')
    plt.legend()
    if title:
        plt.title(title)
    plt.show()

    # --- Figure 2: final Hamming distance as a function of L ---
    curve_style = dict(marker='o', linewidth=3, markersize=3)
    plt.plot(sparsity_levels[:-1], one_pass_dists[:-1],
             label='One-pass sparse', **curve_style)
    plt.plot(sparsity_levels[:-1], two_pass_dists[:-1],
             label='Two-pass sparse', **curve_style)
    plt.hlines(one_pass_dists[-1], sparsity_levels[0], sparsity_levels[-2],
               label='Dense model')
    plt.xlabel('L')
    plt.ylabel('Hamming distance')
    plt.legend()
    if title:
        plt.title(title)
    plt.show()
    
def compute_hamming(model_dict):
    """Compute the Hamming distance to the true labels at every recorded lap.

    For each lap in ``model_dict['lap_history']`` the model snapshot stored
    under ``model_dict['task_output_path']`` is loaded, a Viterbi path is
    decoded separately for every sequence of the dataset, the decoded labels
    are passed through ``alignEstimatedStateSeqToTruth`` and the result of
    ``calcHammingDistance`` against the true labels is recorded.

    Parameters
    ----------
    model_dict : dict
        Must provide:
        'Data' -- dataset exposing ``X``, ``doc_range`` and ``TrueParams['Z']``;
        'task_output_path' -- directory the snapshots are loaded from;
        'lap_history' -- 1-D sequence of lap identifiers.

    Returns
    -------
    numpy.ndarray of float
        Same shape as ``lap_history``; entry ``i`` is the Hamming distance of
        the snapshot at ``lap_history[i]``, so it can be plotted directly
        against ``lap_history``.
    """
    dataset = model_dict['Data']
    N = dataset.X.shape[0]
    doc_range = dataset.doc_range
    ztrue = dataset.TrueParams['Z'].flatten()
    path = model_dict['task_output_path']
    lap_history = model_dict['lap_history']
    ham_dist_history = np.empty_like(lap_history, dtype=float)

    # Results are stored by position in lap_history. The previous version
    # prepended lap 0 and indexed by `lap - 1`, which only lines up when laps
    # are the integers 1..n (the lap-0 result then landed at index -1 and was
    # overwritten by the final lap) and fails for float-valued lap arrays.
    for idx, lap in enumerate(lap_history):
        # Load the model snapshot saved at this lap
        model, _ = bnpy.load_model_at_lap(path, lap)

        # Extract model parameters; logs are taken once per lap, not per sequence
        log_init = np.log(model.allocModel.get_init_prob_vector())
        log_trans = np.log(model.allocModel.get_trans_prob_matrix())
        evidence = model.obsModel.calcLogSoftEvMatrix_FromPost(dataset)

        # Run Viterbi independently on each sequence [start, end)
        zhat = np.empty(N)
        for start, end in zip(doc_range[:-1], doc_range[1:]):
            zhat[start:end] = runViterbiAlg(evidence[start:end],
                                            log_init, log_trans)
        zhat_aligned = alignEstimatedStateSeqToTruth(zhat, ztrue)

        # Compute Hamming distance
        ham_dist_history[idx] = calcHammingDistance(ztrue, zhat_aligned)

    return ham_dist_history

# Speaker diarization dataset

Following previous work, we chose meetings 11, 16, and 21.

In [None]:
# Speaker-diarization loader from bnpy (imported here rather than in the top import cell).
from bnpy.data import SpeakerDiar
num_meetings = len(SpeakerDiar.fileNames)  # length of SpeakerDiar.fileNames; not used by any cell visible here
meeting_ids = [11, 16, 21]
all_meeting_data = [SpeakerDiar.get_data(i) for i in meeting_ids]  # one dataset per id, in meeting_ids order

In [None]:
# Number of clusters for the speaker-diarization run (replaces the K = 20 set in the MoCap6 cell).
K = 25

In [None]:
spkr_dataset = all_meeting_data[0]  # only the first selected meeting (meeting_ids[0] == 11) is used
out_path = '/tmp/spkr-K=%d' % (K)  # results directory; hard-coded under /tmp
obs_model = 'Gauss'  # observation model name passed to util.run_experiment
spkr_experiment = util.run_experiment(spkr_dataset, alloc_model, obs_model, alg, K, out_path, n_task=1)

In [None]:
# Loss curves for the speaker-diarization experiment (uses the notebook-local plot_loss).
plot_loss(spkr_experiment)

In [None]:
# Hamming-distance curves for the speaker-diarization experiment (uses the notebook-local plot_hamming).
plot_hamming(spkr_experiment)

# Grid toy dataset

In [None]:
# Toy-grid sizes: the next cell reshapes obs to (N*T, D) and builds doc_range
# with stride T, i.e. N sequences of T steps with D-dimensional observations.
N, T, D = 100, 10, 2
n_rows = 3
n_cols = 3
K = n_rows * n_cols  # 9; presumably one state per grid cell -- confirm against util.sample_grid
states, obs = util.sample_grid(N, T, n_rows, n_cols)

In [None]:
X = obs.reshape((N*T, D))  # stack all observations into a single (N*T, D) array
doc_range = np.arange(0, N*T+1, T)  # sequence boundaries 0, T, 2T, ..., N*T
Z_true = states.flatten()  # true state labels as a flat vector
grid_dataset = bnpy.data.GroupXData(X, doc_range, TrueZ=Z_true)

In [None]:
# Scatter the two observation dimensions, colored by true state label.
plt.scatter(X[:, 0], X[:, 1], c=Z_true)
plt.show()

In [None]:
out_path = '/tmp/toy-K=%d' % (K)  # results directory; hard-coded under /tmp
obs_model = 'DiagGauss'  # observation model name passed to util.run_experiment
# Unlike the MoCap6 and speaker runs (n_task=1), the toy experiment uses n_task=5.
toy_experiment = util.run_experiment(grid_dataset, alloc_model, obs_model, alg, K, out_path, n_task=5)

In [None]:
# NOTE(review): the other sections call the notebook-local plot_loss; this one
# calls util.plot_loss -- confirm util provides it and that the difference is intended.
util.plot_loss(toy_experiment)

In [None]:
# NOTE(review): the other sections call the notebook-local plot_hamming; this one
# calls util.plot_hamming -- confirm util provides it and that the difference is intended.
util.plot_hamming(toy_experiment)

In [9]:
# True labels of the first MoCap6 sequence: rows 0:382, where 382 == doc_range[1]
# as shown by the mocap_dataset.doc_range cell below.
mocap_dataset.TrueParams['Z'][0:382]

array([ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  5.,
        5.,  5.,  5.,  5.,  5.,  5.,  5.,  5.,  5.,  5.,  5.,  5.,  5.,
        5.,  5.,  5.,  5.,  5.,  5.,  5.,  5.,  5.,  5.,  5.,  5.,  5.,
        5.,  5.,  5.,  5.,  5.,  5.,  5.,  5.,  5.,  5.,  5.,  5.,  5.,
        5.,  5.,  5.,  5.,  5.,  5.,  5.,  5.,  5.,  5.,  5.,  5.,  5.,
        5.,  5.,  5.,  5.,  5.,  5.,  5.,  5.,  5.,  4.,  4.,  4.,  4.,
        4.,  4.,  4.,  4.,  4.,  4.,  4.,  4.,  4.,  4.,  4.,  4.,  4.,
        4.,  4.,  4.,  4.,  4.,  4.,  4.,  4.,  4.,  4.,  4.,  4.,  4.,
        4.,  4.,  4.,  4.,  4.,  4.,  4.,  4.,  4.,  4.,  4.,  4.,  4.,
        4.,  4.,  4.,  4.,  4.,  4.,  4.,  4.,  4.,  4.,  4.,  4.,  4.,
        4.,  4.,  4.,  9.,  9.,  9.,  9.,  9.,  9.,  9.,  9.,  9.,  9.,
        9.,  9.,  9.,  9.,  9.,  9.,  9.,  9.,  9.,  9.,  9.,  9

In [8]:
# Sequence boundaries: compute_hamming treats rows doc_range[i]:doc_range[i+1] as sequence i.
mocap_dataset.doc_range

array([   0,  382,  587,  838, 1284, 1671, 2058], dtype=int32)