In [None]:
from mutagen.mp3 import MP3


def read_lines(file_name):
    """Read a UTF-8 text file and return its lines without trailing blanks.

    Args:
        file_name: Path of the text file to read.

    Returns:
        A list with one string per line, each stripped of its trailing
        newline. Empty lines at the end of the file are dropped; empty
        lines elsewhere are kept. An empty file (or one that holds only
        empty lines) yields an empty list.
    """
    # The context manager closes the file even if decoding fails midway.
    with open(file_name, 'r', encoding='utf-8') as f:
        lines = [line.rstrip('\n') for line in f]
    # Guard on `lines` so an empty or all-blank file cannot raise IndexError.
    while lines and not lines[-1]:
        lines.pop()
    return lines

def pairwise_sim(sim, a_list, b_list):
    """Return the mean of ``sim[(a, b)]`` over every a in a_list, b in b_list.

    Args:
        sim: Mapping keyed by ``(a, b)`` tuples that holds the scores.
        a_list: Members of the first group.
        b_list: Members of the second group.

    Returns:
        The sum of the looked-up scores divided by the number of pairs.

    Raises:
        KeyError: If a pair is missing from ``sim``.
        ZeroDivisionError: If there are no pairs to average.
    """
    total = 0.0
    pair_count = 0
    for left in a_list:
        for right in b_list:
            total += sim[left, right]
            pair_count += 1
    return total / pair_count

def find_closest2(sim, clusters):
    """Find the pair of clusters with the highest mean pairwise similarity.

    Args:
        sim: Mapping keyed by ``(a, b)`` member tuples holding similarity
            scores; it is handed to ``pairwise_sim`` unchanged.
        clusters: List of clusters, each a list of members.

    Returns:
        A tuple ``((i, j), score)`` with ``i < j``: the indices of the best
        pair and its mean similarity. On ties the pair found first in scan
        order wins. If no pair can be ranked (fewer than two clusters) the
        placeholder ``((0, 1), 0.0)`` is returned.
    """
    best_pair = None
    # Start below every real score so that non-positive similarities are
    # still ranked instead of silently losing to a fabricated 0.0.
    best_score = float('-inf')
    for i in range(len(clusters)):
        for j in range(i + 1, len(clusters)):
            score = pairwise_sim(sim, clusters[i], clusters[j])
            if score > best_score:
                best_pair = (i, j)
                best_score = score
    if best_pair is None:
        # Nothing to compare: keep the historical placeholder result.
        return (0, 1), 0.0
    return best_pair, best_score

def next_gen(sim, clusters):
    """Perform one agglomeration step: merge the two most similar clusters.

    Args:
        sim: Mapping keyed by ``(a, b)`` member tuples holding similarity
            scores; it is handed to ``find_closest2`` unchanged.
        clusters: List of clusters, each a list of members.

    Returns:
        A tuple ``(next_clusters, score)``. ``next_clusters`` is a new list
        in which the chosen pair has been replaced by one sorted, merged
        cluster, ordered by cluster size, largest first. ``score`` is the
        similarity reported by ``find_closest2`` for the merged pair. With
        fewer than two clusters there is nothing to merge, so the input list
        is returned as-is together with a score of 0.0.
    """
    # `< 2` also covers the empty list, which previously crashed on indexing.
    if len(clusters) < 2:
        return clusters, 0.0
    (i, j), max_score = find_closest2(sim, clusters)
    merged = sorted(clusters[i] + clusters[j])
    untouched = [c for k, c in enumerate(clusters) if k != i and k != j]
    # The merged cluster goes first so that, because the sort is stable, it
    # precedes other clusters of the same size.
    next_clusters = sorted([merged] + untouched, key=len, reverse=True)
    return next_clusters, max_score

def total_hours(files, ids, mp3_dir='samples/mp3'):
    """Sum the duration, in hours, of the MP3 files that belong to ``ids``.

    Args:
        files: Iterable of file base names (without the ``.mp3`` extension).
            A file is selected when the part of its name before the first
            underscore is in ``ids``.
        ids: Iterable of id prefixes to select.
        mp3_dir: Directory that holds ``<name>.mp3`` for each file name.
            Defaults to the location that was previously hard-coded.

    Returns:
        The summed ``info.length`` of the selected files divided by 3600.
        NOTE(review): this treats ``info.length`` as seconds, as mutagen
        documents it. Returns 0.0 when no file matches.
    """
    # Build the set once so each membership test is O(1).
    idset = set(ids)
    names = [f for f in files if f.split('_')[0] in idset]
    seconds = sum(MP3(f'{mp3_dir}/{name}.mp3').info.length for name in names)
    return seconds / 3600.0

# Confirm that the helper definitions above have been evaluated.
print('Function ready!')

In [None]:
# Load the audio file names; an item's id is the part of its name before the
# first underscore.
files = read_lines('samples/huza_imvugo_speech_all.txt')
# Sorted so the initial cluster order, and therefore tie-breaking during
# merging, is the same on every run (plain set order varies with string
# hash randomisation).
ids = sorted({f.split('_')[0] for f in files})

# Each similarity row is "<name_a>\t<name_b>\t<score>"; the names are reduced
# to their id prefix and the score is stored under both key orders.
sim_scores = read_lines('samples/t4_huza_imvugo_speech_similarity.tsv')
sim = {}
for sc in sim_scores:
    t = sc.split('\t')
    a = t[0].split('_')[0]
    b = t[1].split('_')[0]
    s = float(t[2])
    sim[(a, b)] = s
    sim[(b, a)] = s

# Start with one singleton cluster per id; 1.0 seeds the merge loop's
# similarity threshold check.
clusters = [[file_id] for file_id in ids]
max_score = 1.0
print(f'Got {len(ids)} ids, {len(sim)} pairs!')

In [None]:

# Keep merging the closest pair of clusters while more than 50 clusters
# remain and the last merge still scored above 0.9.
while len(clusters) > 50 and max_score > 0.9:
    clusters, max_score = next_gen(sim, clusters)
    print(f'{len(clusters)} @ {max_score:.2f}')

# Report the size and the total audio duration of the first ten clusters.
print(list(map(len, clusters[:10])))
print([f'{hours:.1f}h' for hours in (total_hours(files, c) for c in clusters[:10])])

In [None]:
# List the members of the cluster at index 9, one per line.
print(*clusters[9], sep='\n')