In [4]:
from pathlib import Path

# Dataset description used throughout the notebook. These five values are
# exactly the arguments handed to `DataSet(...)` in the following cell.
name, audio_ext = "librispeech-dev-clean", ".flac"

# Input audio, alignment and feature directories, in that order. All three
# are relative paths, so they resolve against the current working directory.
in_dir, align_dir, feat_dir = (
    Path(location)
    for location in ("data/dev-clean", "data/alignments/dev-clean", "features")
)

In [5]:
from encode import sample_files
from utils.features import DataSet

# Run parameters for this experiment, declared before anything consumes them.
sample_size = 500
gamma = 0.2
# Results for this run live in a directory named after the sample size.
out_dir_dusted = Path(f"output/dusted/{sample_size}")

# Bundle the dataset description from the configuration cell into one object.
dataset = DataSet(name, in_dir, align_dir, feat_dir, audio_ext)

# Select the files to work on and report how many paths came back.
sampled_paths = sample_files(dataset, sample_size)
print(len(sampled_paths))

500


In [6]:
from utils.features import get_words_and_dist_mat

# One call yields two values: the "dusted" word units and the matrix bound to
# `dist_mat_dusted`, both for the sample size and gamma configured above.
dusted_words, dist_mat_dusted = get_words_and_dist_mat(
    dataset,
    "dusted",
    out_dir_dusted,
    sample_size,
    gamma,
)

Getting dusted words: 11876it [00:44, 266.91it/s]


In [7]:
from utils.features import display_words

# Display the words held in `dusted_words` (the recorded output is a
# "Sorted Word Counts" table). The return value is kept because the
# distance-matrix visualisation cell passes it as its third argument.
true_words = display_words(dusted_words)

Sorted Word Counts


Unnamed: 0,Word,Count
Loading ITables v2.2.5 from the internet... (need help?),,


In [None]:
from distance import calculate_distance

# Recompute the distance matrix from the dusted words. This rebinds
# `dist_mat_dusted`, which get_words_and_dist_mat already returned earlier.
# If calculate_distance is called with only the words, it does not save the matrices
# NOTE(review): the recorded progress bar for this cell estimates thousands of
# hours remaining, so it looks like an optional cell that is normally skipped
# in favour of the earlier one -- confirm before running.
# NOTE(review): the meaning of the third argument (8) is not visible here;
# presumably a worker/process count -- verify against distance.calculate_distance.
dist_mat_dusted = calculate_distance(dusted_words, out_dir_dusted, 8)

Calculating Distances:   0%|          | 160/70513750 [00:28<3139:01:36,  6.24it/s]

In [10]:
from eval import pairwise_edit_dist_mat

# Visualise the dusted distances under a descriptive title, handing over the
# value returned by display_words as the last argument.
# NOTE(review): the recorded output is an AttributeError about
# numpy._no_nep50_warning; that usually points at mismatched NumPy / dependent
# package versions in the kernel rather than at this call -- confirm.
pairwise_edit_dist_mat(
    dist_mat_dusted,
    "Pairwise Edit Distance Matrix Dusted",
    true_words,
)

AttributeError: module 'numpy' has no attribute '_no_nep50_warning'

In [36]:
from cluster import cluster, get_word_clusters

# Get the int_clusters for each of the matrices
# (only the dusted matrix is clustered in this cell).
# NOTE(review): the literal 0.2 equals the `gamma` defined earlier, but it is
# not visible here whether it is the same parameter -- confirm before replacing
# the literal with `gamma`.
dust_clusters = cluster(dist_mat_dusted, 0.2)

# Convert these to word clusters
# by pairing the cluster assignments with the word units they were built from.
dust_word_clusters = get_word_clusters(dust_clusters, dusted_words)

Clustering: 100%|██████████| 11876/11876 [00:00<00:00, 65052.26it/s]
Getting Word Clusters: 7848it [00:02, 2864.05it/s]


In [None]:
from cluster import get_loaded_clusters

# Alternative to the clustering cell above: obtain the word clusters through
# get_loaded_clusters (presumably from previously stored results -- confirm).
# This rebinds `dust_word_clusters`, so whichever of the two cells ran last
# decides what every later cell sees.
dust_word_clusters = get_loaded_clusters(dusted_words)

In [39]:
from eval import ned

# NED of the dusted word clusters, with per-cluster printing switched on for
# both the pure and the impure clusters. Per the original author's note, NED is
# currently computed only from clusters that contain more than one word.
ned_dusted = ned(
    dust_word_clusters,
    print_pure=True,
    print_inpure=True,
)
print(f"DUSTED NED: {ned_dusted}")

Cluster 1: 0
the, the

Cluster 2: 0
the, the, the

Cluster 6: 0
the, the, the, the, the, the, the, the, the

Cluster 7: 0
the, the

Cluster 9: 0
the, the

Cluster 10: 0
the, the

Cluster 14: 0
the, the, the

Cluster 23: 0
best, best

Cluster 27: 0
of, of, of, of, of

Cluster 28: 0
of, of

Cluster 29: 0.12121212121212122
of, of, of, of, of, of, of, of, of, of, of, of, of, of, of, of, of, of, a, of, of, of, of, of, of, of, of, of, of, of, of, of, of

Cluster 30: 0
of, of

Cluster 31: 2
a, of

Cluster 33: 0
of, of

Cluster 39: 0
of, of, of, of, of

Cluster 59: 0
of, of

Cluster 60: 0
move, move

Cluster 64: 0
_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _,

In [None]:
from cluster import get_best_clusters

# Refine the clusters for at most 10 iterations, starting from the current
# clusters and their NED. The call returns three values; note that both
# `ned_dusted` and `dust_word_clusters` are rebound to the refined results.
(
    ned_dusted,
    duplicate_count_dusted,
    dust_word_clusters,
) = get_best_clusters(
    dust_word_clusters,
    ned_dusted,
    max_iter=10,
)

Iteration 0: NED: 0.066423, Duplicates: 5351


Calculating Cluster Centroids: 7848it [00:00, 22936.70it/s]


In [14]:
from utils.features import store_words

# Store words associated with these clusters
# under `out_dir_dusted` (the recorded output reports a words.csv file).
# NOTE(review): the recorded output path ends in output/dusted/20, i.e. it was
# produced with a different sample_size than the 500 configured above --
# re-run this cell before relying on the displayed path.
store_words(dust_word_clusters, out_dir_dusted)

Wrote words to output/dusted/20/words.csv


In [None]:
from eval import words_from_word_units, clusters_purity

# Reduce each cluster of word units to its words, then measure purity over
# those word-only clusters.
dust_just_words_clusters = words_from_word_units(dust_word_clusters)
# clusters_purity returns two values, bound here as the impurity and the number
# of clusters. Impurity is multiplied by 100 for printing, so it is presumably
# a fraction in [0, 1] -- confirm against eval.clusters_purity.
inpurity, total = clusters_purity(dust_just_words_clusters)
print(f"{total} Dusted Clusters with inpurity: {round(inpurity*100, 3)}%")

77 Dusted Clusters with inpurity: 29.87%


In [15]:
from eval import calculate_duplicate_clusters

# Count clusters that duplicate one another and print them as a side effect of
# the call. Two values come back and are kept for later inspection.
cluster_counts_dusted, duplicate_counts_dusted = calculate_duplicate_clusters(
    dust_word_clusters,
    print_clusters=True,
)

Total duplicate clusters (considering word frequency): 25
Duplicate clusters and their counts:
{'you': 1}: 2 times
{'_': 2}: 3 times
{'my': 1}: 2 times
{'a': 1}: 4 times
{'white': 1}: 2 times
{'but': 1}: 2 times
{'_': 1}: 8 times
{'her': 1}: 2 times


In [16]:
import statistics

import editdistance

from cluster import get_cluster_centroids, get_distance_to_centroids
from utils.features import load_units_from_paths

# Query one fixed utterance (its transcript contains "lady" several times).
# The earlier `sample_files(dataset, sample_size=1)` call was removed: its
# result was overwritten by this assignment before ever being used.
query_path = [Path("data/dev-clean/174/50561/174-50561-0013.flac")]

# Load the dusted word units of the query and keep its reference transcript.
query_dusted_words = load_units_from_paths(dataset, "dusted", query_path)
query_dusted_sentence = " ".join([w.true_word for w in query_dusted_words])

# Build one centroid per existing cluster, then match the query words against
# them. `query_dusted_words` is deliberately rebound to the matched result so
# the name keeps the meaning it had after the original cell ran.
dust_centroids = get_cluster_centroids(dust_word_clusters)
query_dusted_words = get_distance_to_centroids([query_dusted_words], dust_centroids)

predictions = []
query_distances = []
for word in query_dusted_words:
    # The predicted word is the true word of the centroid the query word was
    # assigned to.
    prediction = dust_centroids[word.cluster_id].true_word
    longest = max(len(word.true_word), len(prediction))
    if longest:
        # Edit distance normalised by the longer string, so it lies in [0, 1].
        distance = editdistance.eval(word.true_word, prediction) / longest
    else:
        # Two empty strings are identical; avoid dividing by zero.
        distance = 0.0
    query_distances.append(distance)
    predictions.append(prediction)

prediction_sentence = " ".join(predictions)
if query_distances:
    print(f"NED for query: {statistics.mean(query_distances)}")
else:
    # statistics.mean raises StatisticsError on empty data.
    print("NED for query: n/a (query produced no words)")
print(query_dusted_sentence)
print(prediction_sentence)

Loading Units: 100%|██████████| 1/1 [00:00<00:00,  5.17it/s]
Calculating Cluster Centroids: 77it [00:00, 22276.27it/s]

NED for query: 0.34953703703703703
_ now _ my _ on the my _ fair lawn bough _ _ and i'll lady apple play _ so lady _ lady neath you dream shady the for gold o apple you lady shall
_ though _ my _ _ the my _ fair white bring _ _ the a lady a play _ o lady you lady me you me lady the from you o a you lady will



