In [None]:
import random
import os
import torch
import numpy as np
import librosa
import uuid
import json
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.metrics.pairwise import cosine_similarity
from scipy.spatial.distance import pdist, squareform

# Load Model

In [None]:
# Location of the checkpoint to inspect.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
model_path = "C:/Users/admin/RVC-Voice-Lab/models/base/base.pth"

# NOTE(review): torch.load unpickles the file; only open checkpoints you trust.
# The checkpoint is mapped to CPU so no GPU is needed for this analysis.
m = torch.load(model_path, map_location="cpu")

# Pull the "emb_g.weight" tensor out of the checkpoint's "model" entry.
# Presumably one row per speaker — confirm against the training code.
state_dict = m["model"]
emb = state_dict["emb_g.weight"]

# Report whether the checkpoint carries a top-level 'voice_lab' key.
is_voice_lab = 'voice_lab' in m
print(f"Is a voice lab model: {is_voice_lab}")

# Plot PCA components

In [None]:
num_components = 32

# Fit PCA on every embedding row except the last one.
# NOTE(review): the last row is presumably an unused/placeholder slot — confirm.
pca = PCA(n_components=num_components)
reduced = pca.fit_transform(emb[:-1])

# Point colours for the scatter plots.
# Default: one colour for every point, so the cell runs on a fresh kernel.
# To highlight groups instead, uncomment ONE of the alternatives below; a list
# must contain exactly one matplotlib colour per row of `reduced`.
c = "b"

# Dataset highlight
#c = ["b"] * 86 + ["r"] * 62 + ["g"] * 110 + ["c"] * 20

# Male (b) Female (r) Highlight (Cyan: didn't have any speaker information)
#c = ["c"]* 86 + ["r"] * 62 + ["r","b","b","r","r","r","r","b","r","r","r","b","r","r","r","b","b","r","b","b","b","r","r","r","b","b","r","b","b","b","r","b","b","b","r","r","b","r","r","r","r","r","r","b","b","b","b","b","b","r","r","b","b","r","b","r","r","b","b","b","b","r","b","r","r","r","r","b","r","r","r","b","r","b","r","r","r","r","r","b","r","r","r","b","b","r","r","r","b","r","r","r","b","r","r","r","r","r","r","b","b","r","b","r","r","b","b","b","b","r"] +["r"] * 9 + ["b"] * 11

# Fail early with a readable message if a per-point colour list does not line
# up with the number of plotted embeddings.
if not isinstance(c, str) and len(c) != reduced.shape[0]:
    raise ValueError(
        f"Colour list has {len(c)} entries but {reduced.shape[0]} embeddings are plotted"
    )

fig, ax = plt.subplots(nrows=6, ncols=5)
fig.set_size_inches(120,120)

# One panel per component index i (row-major over the 6x5 grid):
# x axis is always component 0, y axis is component i.
for i, col in enumerate(ax.flat):
    col.set_xlabel('Principal Component '+str(0), fontsize = 15)
    col.set_ylabel('Principal Component '+str(i), fontsize = 15)
    col.scatter(reduced[:,0], reduced[:,i], c=c)
plt.show()

# Plot PCA components and newly generated embeddings

In [None]:
num_components = 32

num_new_emb = 250
scaling = 1
# Fixed seed so the sampled embeddings (and the figure) are identical on every run.
SAMPLING_SEED = 0

# Fit PCA on all embedding rows.
# NOTE(review): the previous cell fits on emb[:-1] instead — confirm which is intended.
pca = PCA(n_components=num_components)
reduced = pca.fit_transform(emb)

# Two ways of applying `scaling`: scale in PCA space and then invert, or invert
# and then scale the reconstruction. Not used further in this cell.
emb_up_first = pca.inverse_transform(reduced*scaling)
emb_up_after = pca.inverse_transform(reduced) *scaling

# Fit a multivariate normal to the embeddings in PCA space ...
mu_reduced = np.mean(reduced, axis=0)
sigma_reduced = np.cov(reduced, rowvar=False)

# ... draw new points from it with a local, seeded generator (the global NumPy
# RNG state is left untouched), and map them back to embedding space.
rng = np.random.default_rng(SAMPLING_SEED)
new_emb_reduced = rng.multivariate_normal(mu_reduced, sigma_reduced, num_new_emb)
new_emb = pca.inverse_transform(new_emb_reduced)

fig, ax = plt.subplots(nrows=4, ncols=4)
fig.set_size_inches(65,65)

# One panel per component index i (row-major over the 4x4 grid):
# x axis is always component 0, y axis is component i.
for i, col in enumerate(ax.flat):
    col.set_xlabel('Principal Component '+str(0), fontsize = 15)
    col.set_ylabel('Principal Component '+str(i), fontsize = 15)
    col.scatter(reduced[:,0], reduced[:,i], c="b", label="Original")
    col.scatter(new_emb_reduced[:,0], new_emb_reduced[:,i], c="g", label="New")
    col.legend(loc='upper right')

# Title the whole figure; plt.title would only label the last subplot.
fig.suptitle('PCA Analysis')
plt.show()

# Distance and similarity heatmaps

In [None]:
# Pairwise comparisons between all embedding rows:
# a square Euclidean distance matrix and a cosine similarity matrix.
euclidean_dist_matrix = squareform(pdist(emb, metric='euclidean'))
cos_sim_matrix = cosine_similarity(emb)

# Draw each matrix as its own heatmap figure, in the same order as before.
heatmaps = (
    (euclidean_dist_matrix, 'Euclidean Distance Matrix of Voice Embeddings'),
    (cos_sim_matrix, 'Cosine Similarity Matrix of Voice Embeddings'),
)
for matrix, title in heatmaps:
    _, heat_ax = plt.subplots()
    sns.heatmap(matrix, cmap='viridis', ax=heat_ax)
    heat_ax.set_title(title)
    plt.show()