In [1]:
import os
import glob
from math import floor

from IPython import display
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from tqdm import tqdm
import soundfile as sf
import librosa

In [2]:
# Sentence-embedding model (NLP) shared by the cells below
from sentence_transformers import SentenceTransformer, util

# Model card: https://huggingface.co/sentence-transformers/sentence-t5-large
model_name = 'sentence-transformers/sentence-t5-large'
model = SentenceTransformer(model_name)

In [3]:
# Input/output locations, relative to the notebook's working directory
sound_path = "./sound"  # directory scanned for *.wav files
audioset_path = './sound/audioset_files'  # one <name>.csv of segments per <name>.wav
highlight_path = './sound/highlights_v1'  # highlight clips are written here

alpha = 0.2  # weight for music class; the text similarity gets (1 - alpha)

# Sorted so the processing order is stable from run to run
sound_list = sorted(glob.glob(os.path.join(sound_path, "*.wav")))

# makedirs (not mkdir) also creates a missing parent directory, and
# exist_ok=True makes re-running this cell a no-op without a separate
# exists() check that could race with another process.
os.makedirs(highlight_path, exist_ok=True)

In [4]:
# For every sound file: score each transcript segment (music label + text
# similarity to a query built from the file name), pick the best-scoring
# segment that is long enough, record it, and write that part of the audio
# to `highlight_path`.
df_list = []  # all info: one scored segment table per sound file
df_highlight_list = []  # one summary record (dict) per sound file

MIN_SEGMENT_SEC = 20.0  # a highlight candidate must be at least this long
MAX_HIGHLIGHT_SEC = 4.0*60.0  # exported clips are capped at 4 min

for index, wav_path in enumerate(sound_list):
    print("{0} {1}".format(index, wav_path))
    filename = os.path.basename(wav_path)
    stem = os.path.splitext(filename)[0]
    transcript_file = os.path.join(audioset_path, stem+".csv")
    df_curr = pd.read_csv(transcript_file)
    if df_curr.empty:
        # No segments at all: there is nothing to rank or cut for this file.
        print("...no segments in {0}, skipping...".format(transcript_file))
        continue

    # Music score: 1.0 when the segment's label text contains "Music"
    print("...checking audio...")
    df_curr['score_music'] = [1.0 if "Music" in labels else 0.0
                              for labels in df_curr['class_labels']]

    # find the episode name: the part after the last '_' with '-' as spaces
    print("...checking texts...")
    episode_name = stem.split('_')[-1].replace("-", " ")
    # Query based on the file name: "What is about <EPISODE NAME>"
    query = "what is about "+episode_name+"?"
    print("Query: {0}".format(query))
    # Embedding of the query
    query_emb = model.encode(query)

    # Embedding of each segment's text. A blank CSV cell is read back as NaN,
    # so missing content is encoded as an empty string instead of a float.
    texts = df_curr['content'].fillna("").astype(str).tolist()
    doc_emb = model.encode(texts)
    scores = util.dot_score(query_emb, doc_emb)[0].cpu().tolist()
    df_curr['similarity'] = scores
    df_curr['sound_file'] = filename

    # Total score: weighted sum of the music flag and the text similarity
    print("...calculating scores...")
    df_curr['total_score'] = alpha*df_curr['score_music'] + (1.0-alpha)*df_curr['similarity']
    df_list.append(df_curr)

    # Best segment among those that are long enough. idxmax() raises
    # ValueError on an empty selection, so the "use the first segment"
    # fallback has to be decided before calling it.
    candidates = df_curr.loc[df_curr['duration'] >= MIN_SEGMENT_SEC, 'total_score']
    if candidates.empty:
        id_max = df_curr.index[0]
    else:
        id_max = candidates.idxmax()

    # idxmax() returns an index label, so look the row up with .loc
    best = df_curr.loc[id_max]

    start_time = best['start_time']
    end_time = best['end_time']
    duration = end_time - start_time
    if duration >= MAX_HIGHLIGHT_SEC:
        # Keep the beginning of an over-long segment
        end_time = start_time + MAX_HIGHLIGHT_SEC
        duration = MAX_HIGHLIGHT_SEC

    df_highlight_list.append({'sound_file': filename,
                              'id_max': id_max,
                              'score': best['total_score'],
                              'score_text': best['similarity'],
                              'score_music': best['score_music'],
                              'start_time': start_time,
                              'end_time': end_time,
                              'duration': duration,
                              'content': best['content'],
                             })

    # Cut [start_time, end_time) out of the audio, converting seconds to
    # sample indices; the slice is taken along the first axis.
    # NOTE(review): the original author assumed mono input - confirm for multi-channel files.
    samples, samplerate = sf.read(wav_path)
    samples_highlight = samples[floor(start_time*samplerate):floor(end_time*samplerate)]
    highlight_filename = os.path.join(highlight_path, filename)
    sf.write(highlight_filename, samples_highlight, samplerate)

0 ./sound/99-invisible_matters-of-time.wav
...checking audio...
...checking texts...
Query: what is about matters of time?
Ignored unknown kwarg option direction
Ignored unknown kwarg option direction
Ignored unknown kwarg option direction
Ignored unknown kwarg option direction
Ignored unknown kwarg option direction
Ignored unknown kwarg option direction
...calculating scores...
1 ./sound/99-invisible_stuff-the-british-stole.wav
...checking audio...
...checking texts...
Query: what is about stuff the british stole?
Ignored unknown kwarg option direction
Ignored unknown kwarg option direction
Ignored unknown kwarg option direction
Ignored unknown kwarg option direction
Ignored unknown kwarg option direction
...calculating scores...
2 ./sound/a16z_crypto.wav
...checking audio...
...checking texts...
Query: what is about crypto?
Ignored unknown kwarg option direction
Ignored unknown kwarg option direction
Ignored unknown kwarg option direction
...calculating scores...
3 ./sound/a16z_devel

In [5]:
# One row per sound file, built in a single step from the collected records
df_highlight = pd.DataFrame(data=df_highlight_list)

In [6]:
# Lift the row limit so the table below is rendered without truncation
pd.options.display.max_rows = None
df_highlight

Unnamed: 0,sound_file,id_max,score,score_text,score_music,start_time,end_time,duration,content
0,99-invisible_matters-of-time.wav,0,0.845439,0.806799,1.0,0.4,40.8,40.4,"This is ninety nine percent, invisible, I'm Ro..."
1,99-invisible_stuff-the-british-stole.wav,0,0.87357,0.841962,1.0,0.56,32.48,31.92,"This is ninety nine percent, invisible, I'm Ro..."
2,a16z_crypto.wav,0,0.663986,0.829982,0.0,0.0,178.48,178.48,Welcome to the sixteen seed podcast Zorn today...
3,a16z_developers-as-creatives.wav,13,0.686931,0.858664,0.0,669.68,752.16,82.48,That's a great question so on. I think the ans...
4,a16z_malaria-vaccine.wav,24,0.880818,0.851022,1.0,829.12,953.6,124.48,"Well, there are a couple of things to think ab..."
5,conan-obrien_cecily-strong.wav,36,0.795365,0.744206,1.0,462.4,581.2,118.8,No trust me better put some ice on that thing....
6,conan-obrien_joel-mchale.wav,1,0.841221,0.801526,1.0,30.16,96.72,66.56,My name is Sir Joel Mchale and I feel I'm a li...
7,masters-of-scale_build-the-right-flywheel.wav,110,0.798459,0.748074,1.0,2600.72,2631.28,30.56,There is no one magic bullet when it comes to ...
8,masters-of-scale_make-it-epic.wav,68,0.820989,0.776236,1.0,1488.24,1526.96,38.72,"If you don't know about the big you project, y..."
9,masters-of-scale_master-your-emotions.wav,18,0.860032,0.82504,1.0,400.4,492.08,91.68,"I'm Reid, Offffman, CO, founder of Linkedin pa..."


In [7]:
# Stack the per-file segment tables row-wise; each table keeps its own index labels
df_all = pd.concat(df_list, axis=0)

In [8]:
# Render the combined segment table as this cell's output
df_all

Unnamed: 0,speaker_id,class_labels,scores,start_time,end_time,duration,content,score_music,similarity,sound_file,total_score
0,0,"['Speech', 'Music']",[0.8772313 0.07477585],0.4,40.8,40.4,"This is ninety nine percent, invisible, I'm Ro...",1.0,0.806799,99-invisible_matters-of-time.wav,0.845439
1,1,['Speech'],[0.9992185],41.28,59.12,17.84,Real companies had to juggle all of these city...,0.0,0.729616,99-invisible_matters-of-time.wav,0.583693
2,2,['Speech'],[0.9962387],59.76,74.48,14.72,So this is a lovely clock with red letters and...,0.0,0.753295,99-invisible_matters-of-time.wav,0.602636
3,3,['Speech'],[0.99250525],74.8,127.36,52.56,We could definitely think that's the second ha...,0.0,0.766348,99-invisible_matters-of-time.wav,0.613079
4,4,"['Speech', 'Beatboxing']",[0.9625251 0.00961201],127.52,177.44,49.92,So that's the crazy part. Apparently they actu...,0.0,0.743952,99-invisible_matters-of-time.wav,0.595162
5,5,['Speech'],[0.9592903],177.52,189.92,12.4,Imagine all these people riding around trust m...,0.0,0.67555,99-invisible_matters-of-time.wav,0.54044
6,6,['Speech'],[0.99750066],190.08,208.8,18.72,I love it so much and and it's so hard for us....,0.0,0.739529,99-invisible_matters-of-time.wav,0.591623
7,7,['Silence'],[0.62599874],209.12,209.44,0.32,So really.,0.0,0.672893,99-invisible_matters-of-time.wav,0.538314
8,8,['Speech'],[0.99647635],209.6,236.48,26.88,"This, like three hundred o'clock, is a relic o...",0.0,0.766069,99-invisible_matters-of-time.wav,0.612855
9,9,['Speech'],[0.9898789],237.28,275.68,38.4,"And what do I mean by that. Well, they need lo...",0.0,0.778706,99-invisible_matters-of-time.wav,0.622964


In [9]:
# Inspect the first highlight record in full (the table view shortens long 'content' text)
df_highlight_list[0]

{'sound_file': '99-invisible_matters-of-time.wav',
 'id_max': 0,
 'score': 0.8454391002655031,
 'score_text': 0.8067988753318787,
 'score_music': 1.0,
 'start_time': 0.4,
 'end_time': 40.8,
 'duration': 40.4,
 'content': "This is ninety nine percent, invisible, I'm Roman Mars for the most board. We take time for granted. Maybe we don't have enough of it, but at least we know how it works. At least you know most of the time a lot of what we think about time and and how we keep track of. It is relatively recent and some aspects that we take for granted aren't actually all that universal and today we're going to be talking to a few of my young colleagues for a set of many stories about our evolving relationship with time and to get US started. It's Kirk Coolsa, the CO author of the nine hundred percent invisible city, and in our book we wrote about the standardization of time that came with the rise of railroads right and before Standard Times."}