In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import sys
sys.path.append('..')

In [None]:
from audio_file import read_samples, write_samples, read_concat_samples
from signal_processing.features import extract_features
from signal_processing.windowing import window_samples
from utils import get_files_of_types
import definitions

import pandas as pd
import random
import matplotlib.pyplot as plt
from IPython.display import Audio, display
import plotly.express as px
import os

from sklearn.cluster import KMeans
import numpy as np

In [None]:
# Gather the files under the raw Brass folder whose types are listed in
# definitions.AUDIO_FILE_TYPES (helper lives in utils; presumably returns a list of paths).
# NOTE(review): this is an absolute, machine-specific path, so the notebook only runs
# where this directory exists — consider deriving it from a configurable data root.
files = get_files_of_types("E:/tsvr-samplepack-tools/data/raw/Brass", definitions.AUDIO_FILE_TYPES)
print(len(files))

In [None]:
def display_audio(samples, title="", rate=definitions.SAMPLE_RATE):
    """Show an inline audio player for ``samples`` followed by a line plot of them.

    Args:
        samples: Audio samples, passed unchanged to ``IPython.display.Audio``
            and to ``plt.plot``.
        title: Title drawn above the plot.
        rate: Sample rate handed to the audio player; defaults to
            ``definitions.SAMPLE_RATE``.
    """
    player = Audio(samples, rate=rate)
    display(player)
    # Draw first, then label: both calls act on the same current axes.
    plt.plot(samples)
    plt.title(title)
    plt.show()

In [None]:
# Whole-file pass: build one feature record per audio file.
all_features = []
for f in files:
    print(f"loading sample {f}")
    samples = read_samples(f)
    features = extract_features(samples)
    # extract_features must return a mapping: it is unpacked into the record
    # alongside the file path.
    all_features.append({'file' : f, **features})
    # display_audio(samples, f)
# NOTE(review): the windowed pass in the next cell re-assigns both `all_features`
# and `df_features`, so this table is overwritten on a top-to-bottom run.
df_features = pd.DataFrame(all_features)

In [None]:
# Windowed pass: build one feature record per (file, window) pair.
# Window and hop are equal, so windows presumably do not overlap
# (depends on window_samples semantics — TODO confirm).
win_size = 8192
hop_size = 8192

all_features = []
# Only the first ten files are processed here.
for f in files[:10]:
    print(f"loading sample {f}")
    samples = read_samples(f)
    windowed_samples = window_samples(samples, win_size, hop_size, window_type="rectangular")
    for i, win in enumerate(windowed_samples):
        # Replace NaN with 0 and infinities with large finite values before extraction.
        win = np.nan_to_num(win)
        print(f"\textracting features for {f} window {i+1}/{len(windowed_samples)}")
        try:
            features = extract_features(win)
            all_features.append({'file' : f, 'window' : i, **features})
        except Exception as e:
            # Best effort: a window that fails is reported and skipped so the
            # remaining windows and files are still processed.
            print(f"failed to extract features for {f} window {i} | {e}")
# This replaces any `df_features` produced by an earlier cell.
df_features = pd.DataFrame(all_features)

In [None]:
# Display the feature table as the cell output.
# NOTE(review): this renders the whole frame; `.head()` or `.sample()` keeps the output small.
df_features

In [None]:
def kmeans_quantize_distances(df, cluster_by, cluster_ratio=0.3):
    """Cluster the rows of ``df`` with K-means and annotate each row with its cluster.

    Args:
        df: DataFrame whose ``cluster_by`` column holds one equal-length numeric
            vector (list or array) per row. Any index is accepted.
        cluster_by: Name of the column containing the vectors to cluster.
        cluster_ratio: Number of clusters as a fraction of the row count:
            ``K = int(len(df) * cluster_ratio)``, clamped to at least 1.

    Returns:
        A copy of ``df`` with two extra columns:
        ``label`` — integer index of the nearest cluster centre, and
        ``distance`` — the row's distance to that centre as reported by
        ``KMeans.transform``.
    """
    new_df = df.copy()
    X = np.array([np.asarray(x) for x in df[cluster_by]])
    # A small frame or ratio would otherwise truncate to zero clusters,
    # which KMeans rejects.
    K = max(1, int(len(X) * cluster_ratio))
    kmeans = KMeans(n_clusters=K, random_state=0).fit(X)
    transformed = kmeans.transform(X)
    labels = np.argmin(transformed, axis=1)
    # Assign whole columns positionally. Writing `new_df.loc[idx, ...]` with the
    # enumerate position only hits the right row when the index is exactly
    # 0..n-1; on a filtered or re-indexed frame it mislabels or appends rows.
    new_df['label'] = labels
    new_df['distance'] = transformed[np.arange(len(labels)), labels]
    return new_df

# Cluster the feature table on the 'mfccs' column, using 10% of the row count
# as the number of clusters.
df_dists = kmeans_quantize_distances(df_features, 'mfccs', 0.1)
# df_dists[df_dists.label == 4].sort_values('distance', ascending=True)

In [None]:
# Column to cluster on.
key = "mfccs"
df_dists = kmeans_quantize_distances(df_features, key, 0.1)

# Column (and the three components of it) that supply the 3-D plot coordinates.
key_pos = "mfccs"
x_idx, y_idx, z_idx = 0, 1, 2

# `assign` returns a new frame, so df_dists itself is left untouched.
df_fig = df_dists.assign(
    x=df_dists[key_pos].apply(lambda x: x[x_idx]),
    y=df_dists[key_pos].apply(lambda x: x[y_idx]),
    z=df_dists[key_pos].apply(lambda x: x[z_idx]),
)

# Last expression of the cell, so the figure is rendered as the cell output.
px.scatter_3d(
    df_fig,
    x='x',
    y='y',
    z='z',
    color='label',
    size="rms",
    opacity=0.5,
    hover_data=['file', 'distance'],
)

We could do several things:
- Take the n samples closest to the center of each cluster.
- Within a cluster, find the combination of N samples whose pairwise distances are the largest.

In [None]:
import itertools


def find_optimal_distances(df, key, k):
    """Rank every pair of rows inside cluster ``k`` by the distance between their vectors.

    Args:
        df: DataFrame with a ``label`` column (cluster id), a ``file`` column
            (path) and a ``key`` column holding one numeric vector per row.
        key: Name of the column containing the vectors (lists or arrays).
        k: Cluster label whose members are compared.

    Returns:
        DataFrame with columns ``dist`` (Euclidean distance between the two
        vectors), ``i`` and ``j`` (positions of the two rows *within the
        cluster subset*, not index labels of ``df``) and ``file_i`` (basename
        of row ``i``'s file), sorted by ``dist`` descending. The frame is empty
        but still has these columns when the cluster has fewer than two rows.
    """
    df_k = df[df.label == k]
    # Pull vectors and names out once instead of calling `iloc` twice per pair;
    # np.asarray also lets list-valued cells be subtracted.
    vectors = [np.asarray(v) for v in df_k[key]]
    names = [os.path.basename(f) for f in df_k['file']]
    distances = [
        {'dist': np.linalg.norm(vectors[i] - vectors[j]),
         'i': i,
         'j': j,
         'file_i': names[i]}
        for i, j in itertools.combinations(range(len(vectors)), 2)
    ]
    # Explicit columns keep `sort_values` from raising KeyError when no pair exists.
    result = pd.DataFrame(distances, columns=['dist', 'i', 'j', 'file_i'])
    return result.sort_values('dist', ascending=False)

def filter_optimal_distances(df, key, k, min_dist, max_dist):
    """Return the pairs of rows inside cluster ``k`` whose distance lies in a given band.

    Args:
        df: DataFrame with a ``label`` column (cluster id), a ``file`` column
            (path) and a ``key`` column holding one numeric vector per row.
        key: Name of the column containing the vectors (lists or arrays).
        k: Cluster label whose members are compared.
        min_dist: Exclusive lower bound on the pair distance.
        max_dist: Exclusive upper bound on the pair distance.

    Returns:
        DataFrame with columns ``dist`` (Euclidean distance between the two
        vectors), ``i`` and ``j`` (positions of the two rows *within the
        cluster subset*, not index labels of ``df``) and ``file_i`` (basename
        of row ``i``'s file), sorted by ``dist`` descending. The frame is empty
        but still has these columns when no pair falls inside the band.
    """
    df_k = df[df.label == k]
    # Pull vectors and names out once instead of calling `iloc` twice per pair;
    # np.asarray also lets list-valued cells be subtracted.
    vectors = [np.asarray(v) for v in df_k[key]]
    names = [os.path.basename(f) for f in df_k['file']]
    distances = []
    for i, j in itertools.combinations(range(len(vectors)), 2):
        dist = np.linalg.norm(vectors[i] - vectors[j])
        if min_dist < dist < max_dist:
            distances.append({'dist': dist,
                              'i': i,
                              'j': j,
                              'file_i': names[i]})
    # Explicit columns keep `sort_values` from raising KeyError when the
    # filter rejects every pair.
    result = pd.DataFrame(distances, columns=['dist', 'i', 'j', 'file_i'])
    return result.sort_values('dist', ascending=False)

# df_optimal = find_optimal_distances(df_dists, 'mfccs', 4)
# df_optimal


# Keep only the pairs inside cluster 4 whose 'mfccs' distance lies strictly
# between 0.5 and 1.5.
# NOTE(review): the cluster label and the distance band are hard-coded; label 4
# only exists if the clustering above produced at least five clusters.
df_filtered = filter_optimal_distances(df_dists, 'mfccs', 4, 0.5, 1.5)
df_filtered

In [None]:
df = df_features.copy()
key = "mfccs"
subset_size = 0.2

# Stack the per-row feature vectors into one 2-D array (one row per sample).
X = np.array([np.asarray(x) for x in df[key]])
# X = X[:, :2]
# A small table would otherwise truncate to zero clusters, which KMeans rejects.
K = max(1, int(len(X)*subset_size))

kmeans = KMeans(n_clusters=K, random_state=0).fit(X)

# Distance of every sample to every cluster centre; the nearest centre is its label.
transformed = kmeans.transform(X)
labels = np.argmin(transformed, axis=1)

# Group samples by cluster. Every label gets an entry up front so a cluster
# that ends up with no samples cannot raise KeyError further down.
sorted_dists = {label: [] for label in range(K)}
for idx, label in enumerate(labels):
    sorted_dists[label].append({'distance' : transformed[idx, label], 'row' : df.iloc[idx]})

# Order each cluster's members from nearest to farthest from the centre.
for members in sorted_dists.values():
    members.sort(key=lambda member: member['distance'])

# Plot the first two feature components: an 'x' per centre and an 'o' per sample,
# one random colour per cluster.
for label, center in enumerate(kmeans.cluster_centers_):
    members = sorted_dists[label]
    if not members:
        continue

    color = np.random.rand(3,)
    plt.scatter(center[0], center[1], color=color, marker='x', alpha=1)

    max_dist = members[-1]['distance']

    for sd in members:
        x = sd['row'][key][0]
        y = sd['row'][key][1]
        # Fade with distance from the centre (the farthest member is fully
        # transparent). When every member sits on the centre, max_dist is 0 and
        # the ratio would be NaN, which matplotlib rejects — draw it opaque.
        alpha = 1 - (sd['distance'] / max_dist) if max_dist > 0 else 1.0
        plt.scatter(x, y, color=color, marker='o', alpha=alpha)

plt.show()

In [None]:
# bing


def kmeans_subselection(df, key, subset_size):
    """Select the rows of ``df`` whose vectors lie farthest from their K-means centre.

    "Most dissimilar" is taken to mean the largest distance to the nearest
    cluster centre. The earlier version partitioned the full
    (samples x clusters) distance matrix along axis 0, which produced a 2-D
    index array, so the result was a frame of nested dicts, not selected rows.

    Args:
        df: DataFrame whose ``key`` column holds one equal-length numeric
            vector (list or array) per row.
        key: Name of the column containing the vectors.
        subset_size: Fraction of rows to return; the same count is used as
            the number of K-means clusters.

    Returns:
        DataFrame with ``int(len(df) * subset_size)`` rows of ``df`` (fresh
        0..n-1 index), ordered farthest-first. Empty when that count is zero.
    """
    # Convert list of vectors to numpy array
    X = np.array([np.asarray(x) for x in df[key]])
    n_select = int(len(X) * subset_size)
    if n_select < 1:
        # Nothing requested; KMeans cannot be fitted with zero clusters.
        return df.iloc[:0].reset_index(drop=True)
    # Run kmeans clustering
    kmeans = KMeans(n_clusters=n_select, random_state=0).fit(X)
    # Reduce the distance matrix to one value per sample: distance to its nearest centre.
    nearest = kmeans.transform(X).min(axis=1)
    # Positions of the n_select largest distances (argpartition leaves them unordered).
    indices = np.argpartition(nearest, -n_select)[-n_select:]
    # Order the selection farthest-first so the output is deterministic.
    indices = indices[np.argsort(nearest[indices])[::-1]]
    return df.iloc[indices].reset_index(drop=True)

# chatgpt

def select_dissimilar_vectors(vectors, subset_size):
    """Return the vectors that lie farthest from their K-means cluster centre.

    The earlier version sorted ascending and kept the head of the list, so it
    returned the vectors *closest* to their centres, contradicting the
    function's name. It also sorted ``(distance, vector)`` tuples, which
    compares the vectors themselves on a distance tie and raises for numpy arrays.

    Args:
        vectors: Sequence of equal-length numeric vectors (lists or arrays).
        subset_size: Fraction of the vectors to return; the same count is used
            as the number of K-means clusters.

    Returns:
        List of ``int(subset_size * len(vectors))`` elements of ``vectors``,
        ordered from largest to smallest distance to their assigned centre.
        Empty when that count is zero.
    """
    items = list(vectors)
    n_select = int(subset_size * len(items))
    if n_select < 1:
        # Nothing requested; KMeans cannot be fitted with zero clusters.
        return []
    # Convert list of vectors to numpy array
    X = np.array(items)
    # Initialize KMeans clustering algorithm
    kmeans = KMeans(n_clusters=n_select, random_state=0).fit(X)
    # Get the cluster labels for each vector
    labels = kmeans.predict(X)
    # Compute the centroid of each cluster
    centroids = kmeans.cluster_centers_
    # Compute the distance of each vector to its centroid
    distances = np.linalg.norm(X - centroids[labels], axis=1)
    # Rank by distance only, largest first; the stable sort keeps input order on ties.
    order = np.argsort(-distances, kind='stable')
    # Return a subset of the most dissimilar vectors
    return [items[i] for i in order[:n_select]]