# Speaker identity pipeline

In [None]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
from IPython.display import display, HTML, Markdown, Image, Video
from ipywidgets import interact, FloatSlider, IntSlider, Button, Output 
display(HTML("<style>.rendered_html.text_cell_render {max-width:600px; }</style>")) 

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
from cgnai.audio.embeddings import get_embedding
from cgnai.utils import cgnai_home
from cgnai.fileio import ls, load
from pathlib import Path
import torchaudio
import numpy as np
from matplotlib import pyplot as plt
import matplotlib
from cgnai.audio.diarization import (get_superpixel_sim_matrix, 
                                     optimize_labels, 
                                     make_speaker_map, 
                                     get_speaker_timeline)
import torch

# Select the torchaudio I/O backend before any audio is loaded.
# NOTE(review): `set_audio_backend` is deprecated in recent torchaudio releases —
# confirm it still exists in the torchaudio version this notebook is pinned to.
torchaudio.set_audio_backend("sox_io")
# Device handed to `get_embedding` below: the GPU when CUDA is available, else the CPU.
dev = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Directory that holds the podcast `mp3` files.
# Alternative data set:
# data_path = cgnai_home()/"shared/podverse/data/dlf_politik_podcast/"
data_path = cgnai_home() / "local/data/Lex_Fridman_Podcast/Examples"

# Bare file names (not full paths) of every mp3 found recursively below `data_path`.
files = [mp3.name for mp3 in data_path.rglob("*.mp3")]
print(len(files), files[0])

3 lex_#330.mp3


In [None]:
# Derived artefacts (super pixels, plots) are written next to the audio files.
results_dir = data_path
# Create the directory with pathlib instead of shelling out to `mkdir -p`:
# this works on every platform and is safe for paths containing spaces or
# shell metacharacters. It is a no-op when the directory already exists.
results_dir.mkdir(parents=True, exist_ok=True)

## Helper

**Note:** These helpers may be specific to the "DLF Politik" podcast.

In [None]:
import json
from urllib.parse import urlparse
from collections import defaultdict

# Read the episode metadata (keyed by episode id) from disk.
with open(data_path / "episode_info.json", "r") as info_file:
    episode_info = json.load(info_file)

# Map every audio file name to its integer episode id. The file name is the
# last path component of the episode's audio URL.
files_to_episode_ids = {}
for episode_id, info in episode_info.items():
    fname = Path(urlparse(info['audio_url']).path).name

    # Every episode listed in the metadata must have been found on disk.
    assert fname in files

    files_to_episode_ids[fname] = int(episode_id)
    
def get_episode_id(fname):
    """Return the integer episode id registered for the audio file name `fname`.

    Looks `fname` up in the module-level `files_to_episode_ids` mapping and
    raises `KeyError` when the name is not a key of that mapping.
    """
    episode_id = files_to_episode_ids[fname]
    return episode_id

## Step 1: Generate embeddings

Load the MP3 files and compute their vector embeddings.

In [None]:
# Compute one embedding array per audio file and store it as
# "<audio file>_emb.npy" next to the audio.
for file in files:
    print(file)
    audio_path = data_path / file
    wav, sr = torchaudio.load(audio_path, format="mp3")
    emb = get_embedding(wav, sr, device=dev)
    np.save(f"{audio_path}_emb.npy", emb)

lex_#330.mp3
lex_#329.mp3
lex_#328.mp3


## Step 2: Generate Super Pixels and similarity matrices

Take vector embeddings and compute similarity matrices and super pixels.

In [None]:
from cgnai.audio.superpixels import find_super_pixels, plot_super_pixels
matplotlib.use('Agg')

# For every audio file: build the similarity matrix of its embeddings, run the
# super-pixel heuristic on it, and save the result plus two diagnostic images.
for file in files:
    print(file)
    
    # Load the stored embeddings, scale every row to unit length, and take all
    # pairwise dot products; `d` is therefore a cosine-similarity matrix.
    emb = load(str(data_path / file) + "_emb.npy")
    emb = emb/(np.linalg.norm(emb, axis=1,keepdims=True))
    d   = emb@emb.T
    
    # Number of segments at start: 20 per 60 rows of `d`
    # (presumably one row per second of audio — TODO confirm).
    N0 = round(20 * d.shape[0]/60) 
    
    # Run super pixel heuristic
    # and save data
    I = find_super_pixels(d, N0 = N0, mu = 0.0001, min_pixels_per_min = 2)
    np.save(str(results_dir/file) + "_super_pixels.npy", np.array(I))
    
    
    # T: size of the similarity matrix, N0: initial segment count,
    # N: `len(I) - 1` (presumably `I` holds segment boundaries — TODO confirm).
    print(f"T:  {d.shape[0]}")
    print(f"N0: {N0}")
    print(f"N:  {len(I)-1}")
    # ------------------------------
    # Save the raw similarity matrix as an image; close the figure afterwards so
    # figures do not accumulate across loop iterations.
    fig = plt.figure(figsize=(10,10))
    plt.title(f"{file}")
    plt.imshow(d, interpolation="none")
    plt.savefig(str(results_dir / file) + "_sim.jpg")
    plt.close(fig)
    
    # Save the super-pixel visualisation produced by `plot_super_pixels`.
    fig = plot_super_pixels(I,d)
    plt.savefig(str(results_dir / file) + "_super_pixels.jpg")
    plt.close(fig)

lex_#330.mp3
INFO superpixels|  min_pixels: 337
T:  10120
N0: 3373
N:  2117
lex_#329.mp3
INFO superpixels|  min_pixels: 367
T:  11010
N0: 3670
N:  2429
lex_#328.mp3
INFO superpixels|  min_pixels: 577
T:  17316
N0: 5772
N:  3680


<img src="_temp/der_politik_podcast_folge_148_corona_geld_und_die_dlf_20200515_0117_44862193.mp3_super_pixels.jpg" width="30%">

## Step 3: Generate Diarization

Compute optimized speaker ids from super pixels

In [None]:
from matplotlib import cm
from matplotlib.colors import ListedColormap, LinearSegmentedColormap

# Colormap for the speaker-map visualizations below.
#
# `matplotlib.cm.get_cmap` is deprecated and removed in newer matplotlib
# releases; `pyplot.get_cmap` takes the same (name, lut) arguments and is
# available in both old and new versions.
# NOTE: the variable keeps its historical name `viridis` although it holds the
# 'tab10' map resampled to 256 entries.
viridis   = plt.get_cmap('tab10', 256)
newcolors = viridis(np.linspace(0, 1, 256))
white     = np.array([1,1,1, 1])
# Overwrite the first colormap entry so the lowest value is drawn in white.
newcolors[:1, :] = white
speaker_cm = ListedColormap(newcolors)

In [None]:
from cgnai.audio.diarization import (get_superpixel_sim_matrix, 
                                     optimize_labels, 
                                     make_speaker_map, 
                                     get_speaker_timeline)
from os.path import exists

# For every audio file: assign a speaker id to each super pixel, save the ids,
# and write two diagnostic images (speaker map and speaker histogram).
for i, file in enumerate(files):
    print(i, file)
    
    # Load embeddings
    # and similarity matrix (rows scaled to unit length, so `d` holds cosine similarities)
    emb = load(str(data_path / file) + "_emb.npy")
    emb = emb/(np.linalg.norm(emb, axis=1,keepdims=True))
    d   = emb@emb.T
    
    # load super pixels
    # and create speaker map
    # NOTE(review): the super pixels were saved under `results_dir` but are read
    # from `data_path`; this only works while both point to the same directory.
    I = load(str(data_path / file) + "_super_pixels.npy")
    M = get_superpixel_sim_matrix(d, I)
    ids, logp = optimize_labels(M, I, max_speaker=10)
    sm = make_speaker_map(I, ids)
    
    # Save speaker ids
    np.save(str(data_path / file) + "_speaker_ids.npy", ids)
    # -------------------------------
    # Speaker map image; the colour scale spans 0 .. max speaker id.
    fig = plt.figure(figsize=(10,10))
    plt.imshow(sm, interpolation="None", cmap=speaker_cm, vmin=0, vmax=np.max(ids))
    plt.savefig(str(data_path / file) + "_speaker_map.jpg")
    plt.close(fig)
    
    # Histogram of the speaker timeline with one bin per distinct speaker id.
    fig = plt.figure(figsize=(10,10))
    plt.hist(get_speaker_timeline(ids,I), bins=len(set(ids)));
    plt.savefig(str(data_path / file) + "_times.jpg")
    plt.close(fig)

0 lex_#330.mp3
1 lex_#329.mp3
2 lex_#328.mp3


## Step 4: Extract Authors

For each episode we want to define a superset of its speakers. That is, the list should contain at least the names of all speakers present in the episode, but it may contain more.

**Note.** This part of the pipeline is specific to the "DLF Politik" podcast.

In [None]:
import json

# Parse the episode metadata file into `episode_info`.
with open(data_path / "episode_info.json", "r") as info_file:
    episode_info = json.loads(info_file.read())

In [None]:
def normalize_author(author):
    """Return `author` with surrounding whitespace removed and any
    "Last, First" form turned into "First Last".

    A name that contains a comma is split at the FIRST comma; the part after
    the comma is placed in front of the part before it, separated by a single
    space. Names without a comma are only stripped.
    """
    family, comma, given = author.partition(',')
    if comma:
        author = given.strip() + ' ' + family.strip()
    return author.strip()

def parse_from_authors_field(info):
    """Extract author names from the `author` field of an episode record.

    Only the text in front of the first '|' is used; it is split on ';' and
    every piece is passed through `normalize_author`.

    Returns:
        A list of normalized names, or `None` when the field contains no '|'.
    """
    names_part, separator, _ = info['author'].partition('|')
    if not separator:
        return None
    return [normalize_author(raw_name) for raw_name in names_part.split(';')]

In [None]:
from flair.data import Sentence
from flair.models import SequenceTagger

# Load the "flair/ner-german" sequence tagger once at module level so that
# `parse_authors_from_details` can reuse it on every call.
tagger = SequenceTagger.load("flair/ner-german")

def parse_authors_from_details(info):
    """Collect person names mentioned in an episode's free-text fields.

    The `details` and `description` fields are joined with a space and run
    through the module-level NER `tagger`; the text of every span tagged 'PER'
    is normalized with `normalize_author`.

    Returns:
        A list of distinct normalized names (order is not defined).
    """
    sentence = Sentence(info['details'] + ' ' + info['description'])
    tagger.predict(sentence)
    person_names = [
        span.text
        for span in sentence.get_spans('ner')
        if span.tag == 'PER'
    ]
    return list(set(normalize_author(name) for name in person_names))

In [None]:
def parse_authors(info):
    """Return the candidate author names of one episode record.

    The structured `author` field is preferred. When it yields nothing
    (`parse_from_authors_field` returns `None`), the names are taken from the
    free-text fields instead, keeping only names that contain a space.
    """
    from_field = parse_from_authors_field(info)
    if from_field is None:
        return [name for name in parse_authors_from_details(info) if ' ' in name]
    return from_field

In [None]:
from pathlib import Path
from urllib.parse import urlparse
from collections import defaultdict


# Per-file candidate authors, the file -> episode id map, and the union of all
# author names seen in any episode.
all_authors = set()
files_to_authors = defaultdict(list)
files_to_episode_ids = {}

for episode_id, info in episode_info.items():
    # The audio file name is the last path component of the episode's audio URL.
    fname = Path(urlparse(info['audio_url']).path).name

    # Every episode listed in the metadata must have been found on disk.
    assert fname in files

    a = parse_authors(info)

    files_to_episode_ids[fname] = int(episode_id)
    files_to_authors[fname] = a
    all_authors.update(a)
    
def get_episode_id(fname):
    """Return the integer episode id registered for the audio file name `fname`.

    This re-binds the helper of the same name defined in the "Helper" section
    so that it reads the `files_to_episode_ids` mapping rebuilt in this cell.
    Raises `KeyError` when `fname` is not a key of that mapping.
    """
    episode_id = files_to_episode_ids[fname]
    return episode_id

In [None]:
import json

# Persist the file -> candidate-authors mapping as JSON.
with open(data_path / "episode_authors.json", "w") as authors_file:
    authors_file.write(json.dumps(files_to_authors))

## Step 5: Set up and solve factor graph

In [None]:
import json
from collections import defaultdict

# Load the file -> authors mapping from "episode_authors_clean.json".
# NOTE(review): this is a different file from the "episode_authors.json"
# written in Step 4 — presumably a manually curated copy; confirm.
with open(data_path / "episode_authors_clean.json", "r") as clean_file:
    files_to_authors = json.load(clean_file)

# Invert the mapping (author -> set of file names) and collect every author name.
all_authors = set()
authors_to_files = defaultdict(set)
for fname, authors in files_to_authors.items():
    all_authors.update(authors)
    for author in authors:
        authors_to_files[author].add(fname)

### Load super pixels, speaker IDs, and embeddings into memory

In [None]:
from cgnai.audio.similarities import load_ids, load_super_pixels, load_embedding

# Per-file caches, all keyed by audio file name.
ids, Is, embs = {}, {}, {}
for i, fname in enumerate(files):
    mp3_path = data_path / fname
    ids[fname] = load_ids(mp3_path)
    Is[fname] = load_super_pixels(mp3_path)
    embs[fname] = load_embedding(mp3_path)

### Solve factor graphs

In [None]:
def plot_timeline(episode_id, start=80, stop=86):
    """Plot a slice of the speaker timeline of one episode.

    The previous implementation ignored `episode_id` and always plotted the
    file whose name contains "folge_103". The file is now selected by
    `episode_id`.

    Args:
        episode_id: Number that follows "folge_" in the audio file name.
        start: First timeline index to plot (default 80, the former constant).
        stop: Timeline index at which to stop, exclusive (default 86).

    Raises:
        IndexError: If no entry of `files` belongs to `episode_id`.
    """
    import re

    # Require that no digit follows the id, so that e.g. 10 does not match
    # "folge_103".
    pattern = re.compile(r"folge_" + re.escape(str(episode_id)) + r"(?!\d)")
    matches = [f for f in files if pattern.search(f)]
    if not matches:
        raise IndexError(f"no audio file found for episode {episode_id!r}")
    fname = matches[0]
    plt.plot(get_speaker_timeline(ids[fname], Is[fname])[start:stop])

In [None]:
import factorgraph as fg
from cgnai.audio.similarities import get_clusters, get_cluster_similarity



def create_factor_graph(fnames, ids, Is, embs, rv_constraints, constrained_files):
    """Build a factor graph that links speaker clusters to author names.

    One random variable is created per (episode, cluster) pair. Its name is
    "<episode id>_<cluster>" and its states are the candidate authors of the
    episode. Three kinds of factors are added:

    * a unary one-hot factor for every variable listed in `rv_constraints`,
    * a pairwise factor for every pair of clusters within one episode,
    * a pairwise factor for every pair of clusters from two different episodes.

    Besides its arguments the function reads the module-level
    `files_to_authors` mapping and calls the module-level `get_episode_id`.

    Args:
        fnames: Audio file names of the episodes to include.
        ids: Mapping file name -> speaker ids, passed to `get_clusters`.
        Is: Mapping file name -> super pixels, passed to `get_clusters`.
        embs: Mapping file name -> embeddings, passed to `get_cluster_similarity`.
        rv_constraints: Mapping variable name -> author that the variable is
            pinned to.
        constrained_files: Not used by this function.

    Returns:
        The populated `fg.Graph`.

    Raises:
        ValueError: If a constrained author is not among the candidate authors
            of its episode (raised by `list.index`).
    """
    g = fg.Graph()
    
    print(f"processing {len(fnames)} episodes")
    constrained_authors_count = 0
    # Pass 1: variables, unary constraints and within-episode factors.
    for fname in fnames:
        emb = embs[fname]
        authors = files_to_authors[fname]
        cls, timeline = get_clusters(ids[fname], Is[fname])
        for cl in cls.keys():
            rv_id = f"{get_episode_id(fname)}_{cl}"
            g.rv(rv_id, len(authors), labels=authors, meta={'fname': fname, 'cl': cl})
            if rv_id in rv_constraints:
                # One-hot potential: all weight on the author this variable is pinned to.
                F = np.zeros((len(authors)))
                F[authors.index(rv_constraints[rv_id])] = 1.0
                g.factor([rv_id], potential=F)
                constrained_authors_count += 1
                
        
        # Similarity between the clusters of this episode and themselves.
        S = get_cluster_similarity(cls, emb, cls, emb)
        for i, cl1 in enumerate(cls.keys()):
            for j, cl2 in enumerate(cls.keys()):
                # Visit every unordered pair of distinct clusters once.
                if i >= j:
                    continue
                # Rescale the similarity by 0.6 and cap it at 1.0.
                s = min(S[i, j]/0.6, 1.0)
                # Potential `s` when both clusters get the same author (diagonal),
                # `1 - s` when they get different authors.
                F = np.ones((len(authors), len(authors))) * (1 - s)
                np.fill_diagonal(F, s)
                g.factor(
                    [f"{get_episode_id(fname)}_{cl1}",
                     f"{get_episode_id(fname)}_{cl2}"],
                    potential = F
                )
    print(f"added {constrained_authors_count} name constraints")
    # Pass 2: factors between clusters of two different episodes.
    for a, fname_a in enumerate(fnames):
        
        emb_a = embs[fname_a]
        authors_a = np.array(files_to_authors[fname_a])
        cls_a, _ = get_clusters(ids[fname_a], Is[fname_a])
        
        for b, fname_b in enumerate(fnames):
            # Visit every unordered pair of distinct episodes once.
            if a >= b:
                continue
            emb_b = embs[fname_b]
            authors_b = np.array(files_to_authors[fname_b])
            cls_b, _ = get_clusters(ids[fname_b], Is[fname_b])
            
            S = get_cluster_similarity(cls_a, emb_a, cls_b, emb_b)
            for i, cl_i in enumerate(cls_a.keys()):
                for j, cl_j in enumerate(cls_b.keys()):
                    # Same rescaling as for the within-episode factors.
                    s = min(S[i, j]/0.6, 1.0)
                    # Entry (p, q) is `s` where author p of episode a has the same
                    # name as author q of episode b, and `1 - s` elsewhere.
                    F = np.where(authors_a[:, None]==authors_b[None, :], s, 1-s)
                    g.factor(
                        [f"{get_episode_id(fname_a)}_{cl_i}",
                         f"{get_episode_id(fname_b)}_{cl_j}"],
                        potential = F
                    )
    return g

In [None]:
def split_given_size(a, size):
    """Split array `a` into consecutive chunks of `size` elements.

    The last chunk may be shorter than `size`. When it holds fewer than
    `size / 2` elements and is not the only chunk, it is appended to the
    chunk before it instead of being returned on its own.

    Returns:
        A list of arrays.
    """
    cut_points = np.arange(size, len(a), size)
    chunks = np.split(a, cut_points)

    tail_is_short = len(chunks[-1]) < size / 2
    if len(chunks) <= 1 or not tail_is_short:
        return chunks

    # Fold the short tail into its predecessor.
    *head, penultimate, tail = chunks
    return head + [np.concatenate((penultimate, tail))]

In [None]:

# Random-variable name => author that the variable has been pinned to.
rv_constraints = dict()

# Author => set of file names in which that author has been pinned.
constrained_authors_to_files = defaultdict(set)

In [None]:
import random

# Walk through the authors, starting with the one who appears in the most
# episodes, and solve one factor graph per batch of that author's episodes.
# Assignments found with high confidence are stored in `rv_constraints` and
# therefore become hard constraints for every graph built afterwards.
for author, fnames in sorted(authors_to_files.items(), key=lambda x: len(x[1]), reverse=True):
    # Set up factor graph, including rv constraints
    print(">",author, len(fnames))
    
    # Episodes per factor graph; a short remainder is merged by `split_given_size`.
    n_episodes = 5
    for fnames_ in split_given_size(np.array(list(fnames)), n_episodes):
        g = create_factor_graph(fnames_, ids, Is, embs, rv_constraints, constrained_authors_to_files[author])
        # Loopy belief propagation; batches that do not converge are skipped.
        iters, converged = g.lbp(normalize=True,  max_iters=100)
        if not converged:
            print(f"NOT CONVERGED!")
            continue
        marginals = g.rv_marginals(normalize=True)
        for rv, probs in marginals:
            for n in range(0,probs.shape[0]):
                authors = files_to_authors[rv.meta['fname']]
                fname = rv.meta['fname']
                cl = rv.meta['cl']
                # Accept an author only when its marginal exceeds 0.99.
                if probs[n] > 0.99: # authors[n] == author and 
                    rv_id = f"{get_episode_id(fname)}_{cl}"
                    # Report when an earlier assignment is replaced by a different author.
                    if rv_id in rv_constraints and authors[n] != rv_constraints[rv_id]:
                        print(f"{rv_constraints[rv_id]} -> {authors[n]}")
                    rv_constraints[rv_id] = authors[n]
                    constrained_authors_to_files[authors[n]].add(fname)
        print(f"# of identified speakers: {len(rv_constraints)}")

In [None]:
# Refinement: repeatedly solve a factor graph over a random subset of episodes
# and pin every (episode, cluster) variable whose marginal exceeds 0.99.
#
# Fixes compared with the previous version of this cell:
# * `random.sample` raises ValueError when asked for more items than exist, so
#   the sample size is capped at the number of available files.
# * The cell no longer depends on the loop variable `author` leaking from the
#   previous cell. `create_factor_graph` never reads its last argument, so an
#   empty set is passed.
# * Rounds in which no sampled episode is eligible are skipped.
# NOTE(review): no random seed is set, so results differ between runs.
n_rounds = 20
sample_size = min(10, len(files))

for _ in range(n_rounds):
    fnames_ = random.sample(files, sample_size)
    # Keep only episodes with known authors and more than one candidate.
    fnames_ = [f for f in fnames_ if f in files_to_authors and len(files_to_authors[f]) > 1]
    if not fnames_:
        continue

    g = create_factor_graph(fnames_, ids, Is, embs, rv_constraints, set())
    iters, converged = g.lbp(normalize=True,  max_iters=100)
    if not converged:
        print(f"NOT CONVERGED!")
        continue
    marginals = g.rv_marginals(normalize=True)
    for rv, probs in marginals:
        # These values depend only on the variable, not on the state index.
        fname = rv.meta['fname']
        cl = rv.meta['cl']
        authors = files_to_authors[fname]
        rv_id = f"{get_episode_id(fname)}_{cl}"
        for n in range(0, probs.shape[0]):
            if probs[n] > 0.99:
                # Report when an earlier assignment is replaced by a different author.
                if rv_id in rv_constraints and authors[n] != rv_constraints[rv_id]:
                    print(f"{rv_constraints[rv_id]} -> {authors[n]}")
                rv_constraints[rv_id] = authors[n]
                constrained_authors_to_files[authors[n]].add(fname)
    print(f"# of identified speakers: {len(rv_constraints)}")

# Playground

In [None]:
from datetime import datetime, timedelta

def parse_timestamp(timestamp):
    """Convert a timestamp string into a `timedelta` rounded to whole seconds.

    Args:
        timestamp: Text of the form "HH:MM:SS.fff" or "MM:SS.fff".

    Returns:
        A `timedelta` built from the hours, minutes and seconds of the
        timestamp, with the fractional part rounded to the nearest second.

    Raises:
        ValueError: If `timestamp` matches neither format.
    """
    try:
        t = datetime.strptime(timestamp, '%H:%M:%S.%f')
    except ValueError:
        # Only a format mismatch triggers the fallback to the short form
        # without hours; a bare `except` would also swallow unrelated
        # exceptions such as KeyboardInterrupt.
        t = datetime.strptime(timestamp, '%M:%S.%f')
    # 0 or 1: the fractional second rounded to a whole second.
    s = round(t.microsecond/1000000)
    return timedelta(hours=t.hour, minutes=t.minute, seconds=(t.second+s))

In [None]:
# Indices (into `files`) of the episodes in which this author has been pinned so far.
[files.index(f) for f in constrained_authors_to_files['Ann-Kathrin Büüsker']]

In [None]:
def get_fname(eid):
    """Return the first audio file name that belongs to episode number `eid`.

    A file belongs to the episode when its name contains "folge_<eid>" and no
    further digit follows. The former plain substring test let a short number
    such as 10 match "folge_103".

    Raises:
        IndexError: If no entry of `files` matches.
    """
    import re

    pattern = re.compile(r"folge_" + re.escape(str(eid)) + r"(?!\d)")
    return [f for f in files if pattern.search(f)][0]

def get_fname_index(eid):
    """Return the position in `files` of the audio file for episode number `eid`."""
    fname = get_fname(eid)
    return files.index(fname)

In [None]:
from collections import Counter

def print_speaker_annotate_transcript(transcript, timeline, ids_to_names):
    """Print every transcript line together with its most frequent speaker.

    For each line the timeline entries between the (shifted) start and end
    second are collected, and the most common id among them is taken as the
    speaker. Each output row shows the shifted start second, the speaker name
    (blank when unknown), the collected ids and the text.

    Two defects of the previous version are fixed:
    * a line whose time window contains no timeline entries raised IndexError;
      it is now printed with a blank speaker name,
    * an end time below 2 seconds produced a negative slice bound, which made
      the slice run to the end of the timeline; the bound is now clamped to 0.

    Args:
        transcript: Iterable of `((t0, t1), text)` with `timedelta` bounds.
        timeline: Sequence of speaker ids, indexed by whole seconds.
        ids_to_names: Mapping speaker id -> display name.
    """
    blank_name = f'                    '
    for (t0, t1), txt in transcript:
        # Both bounds are moved 2 seconds earlier.
        # NOTE(review): presumably this compensates for an offset between the
        # transcript and the embedding timeline — confirm.
        t0 = max(round(t0.total_seconds()) - 2, 0)
        t1 = max(min(round(t1.total_seconds()) - 2, len(timeline)), 0)
        ids = timeline[t0:t1]

        name = blank_name
        if len(ids) > 0:
            speaker_id = Counter(ids).most_common(1)[0][0]
            if speaker_id in ids_to_names:
                name = f'{ids_to_names[speaker_id]:>20}'
        ids = f'{str(ids):>20}'
        print(f'{t0:>5}', name,ids, txt)

In [None]:
def load_transcript(mp3_path):
    """Read the ".vtt" transcript stored next to an audio file.

    The file "<mp3_path>.vtt" is expected to have two header lines followed by
    groups of three lines: a "start --> end" timestamp line, one text line and
    one further line that is ignored.

    Args:
        mp3_path: Path of the audio file; ".vtt" is appended to it.

    Returns:
        A list of `((t0, t1), text)` tuples, where `t0` and `t1` are the
        `timedelta` values returned by `parse_timestamp`.
    """
    # WebVTT files are UTF-8 encoded; do not depend on the platform default.
    with open(str(mp3_path) + ".vtt", "r", encoding="utf-8") as f:
        # Skip the two header lines. The local name no longer shadows the
        # `ls` helper imported at the top of the notebook.
        lines = f.readlines()[2:]

    transcript = []
    for stamp_line, text_line in zip(lines[0::3], lines[1::3]):
        t0, t1 = stamp_line.split("-->")
        t0 = parse_timestamp(t0.strip())
        t1 = parse_timestamp(t1.strip())
        transcript.append(((t0, t1), text_line.strip()))
    return transcript

In [None]:
def show_episode(f):
    """Print the speaker-annotated transcript of one episode.

    Args:
        f: Either an index into `files` (int) or an audio file name.

    The candidate authors of the episode are printed, a histogram of the
    resolved speaker names is shown, and finally the annotated transcript is
    printed line by line.
    """
    fname = files[f] if isinstance(f, int) else f
    episode_id = get_episode_id(fname)
    mp3_path = data_path / fname
    transcript = load_transcript(mp3_path)
    I = load_super_pixels(mp3_path)
    ids = load_ids(mp3_path)
    timeline = get_speaker_timeline(ids, I)

    # Speaker id -> author name, for every cluster that has been pinned.
    ids_to_names = {}
    for speaker_id in set(ids):
        rv_id = f"{episode_id}_{speaker_id}"
        if rv_id in rv_constraints:
            ids_to_names[speaker_id] = rv_constraints[rv_id]

    print(files_to_authors[fname])
    resolved_names = [ids_to_names[s] for s in ids if s in ids_to_names]
    plt.hist(resolved_names)
    plt.show()
    print(f'============ {fname} ============')
    print_speaker_annotate_transcript(transcript, timeline, ids_to_names)
show_episode(101)

In [None]:
def create_transcript_document(transcript, timeline, ids_to_names):
    """Turn a transcript into a list of speaker-attributed segments.

    For each transcript line the timeline entries between the (shifted) start
    and end second are collected, and the most common id among them is taken
    as the speaker. Lines without timeline entries, and lines whose speaker
    has no entry in `ids_to_names`, are left out.

    Fix: an end time below 2 seconds used to produce a negative slice bound,
    which made the slice run to the end of the timeline and stored a negative
    "end". The bound is now clamped to 0, so such lines are skipped.

    Args:
        transcript: Iterable of `((t0, t1), text)` with `timedelta` bounds.
        timeline: Sequence of speaker ids, indexed by whole seconds.
        ids_to_names: Mapping speaker id -> speaker name.

    Returns:
        A list of dicts with the keys "start", "end", "speaker" and "text".
    """
    doc = []
    for (t0, t1), txt in transcript:
        # Both bounds are moved 2 seconds earlier.
        # NOTE(review): presumably this compensates for an offset between the
        # transcript and the embedding timeline — confirm.
        t0 = max(round(t0.total_seconds()) - 2, 0)
        t1 = max(min(round(t1.total_seconds()) - 2, len(timeline)), 0)
        ids = timeline[t0:t1]
        if len(ids) == 0:
            continue
        speaker_id = Counter(ids).most_common(1)[0][0]
        if speaker_id in ids_to_names:
            doc.append({
                "start": t0,
                "end": t1,
                "speaker": ids_to_names[speaker_id],
                "text": txt
            })
    return doc

In [None]:
def create_transcript_document_for_file_index(i):
    """Build the transcript document for the file at position `i` of `files`.

    Returns:
        A dict with the keys "filename", "episode_id" and "transcript"; the
        latter is the list produced by `create_transcript_document`.
    """
    fname = files[i]
    episode_id = get_episode_id(fname)
    mp3_path = data_path / fname
    transcript = load_transcript(mp3_path)
    I = load_super_pixels(mp3_path)
    ids = load_ids(mp3_path)
    timeline = get_speaker_timeline(ids, I)

    # Speaker id -> author name, for every cluster that has been pinned.
    ids_to_names = {}
    for speaker_id in set(ids):
        rv_id = f"{episode_id}_{speaker_id}"
        if rv_id in rv_constraints:
            ids_to_names[speaker_id] = rv_constraints[rv_id]

    document = {
        "filename": fname,
        "episode_id": episode_id,
        "transcript": create_transcript_document(transcript, timeline, ids_to_names),
    }
    return document

In [None]:
from itertools import groupby
from operator import itemgetter

# Speaker name -> list of (joined text, file name, start second), one entry
# for every run of consecutive transcript segments by the same speaker.
speaker_texts = defaultdict(list)

for i, file in enumerate(files):
    doc = create_transcript_document_for_file_index(i)['transcript']
    for speaker, run in groupby(doc, key=itemgetter('speaker')):
        run = list(run)
        text = ' '.join(segment['text'] for segment in run)
        # The start of the run is the start of its first segment.
        speaker_texts[speaker].append((text, file, run[0]['start']))

# Elastic

In [None]:
from elasticsearch import Elasticsearch

ModuleNotFoundError: No module named 'elasticsearch'

In [None]:
# Connect to the Elasticsearch node running on this machine.
client = Elasticsearch("http://localhost:9200")

# Displays the node information when the connection works.
client.info()

## Create Index

In [None]:
# Create the "transcripts" index only when it does not exist yet, so that
# re-running this cell does not fail with an "index already exists" error.
# Each document holds the episode id, the file name, and the transcript
# segments as nested objects (so that speaker and text can be matched within
# the same segment).
if not client.indices.exists(index="transcripts"):
    client.indices.create(index="transcripts", body={
        "mappings": {
            "properties": {
                "transcript": {
                    "type": "nested"
                },
                "episode_id": {"type": "integer"},
                "filename": {"type": "keyword"},
            }
        }
    })

## Insert Documents

In [None]:
# Index one document per audio file; the position in `files` is the document id.
for i, _ in enumerate(files):
    doc = create_transcript_document_for_file_index(i)
    resp = client.index(index="transcripts", id=i, document=doc)

## Query Index

In [None]:
import json

def query_transcripts(author, query):
    """Search the "transcripts" index for segments by `author` that match `query`.

    A nested query requires both conditions to hold within the same transcript
    segment.

    Returns:
        One dict per matching episode with the keys 'episode_id', 'filename'
        and 'hits'; 'hits' lists the matching segments as dicts with the keys
        'speaker', 'start', 'end' and 'text'.
    """
    must_clauses = [
        {"match": {"transcript.speaker": author}},
        {"match": {"transcript.text": query}},
    ]
    q = {
        "nested": {
            "path": "transcript",
            "query": {"bool": {"must": must_clauses}},
            "inner_hits": {},
        }
    }
    resp = client.search(index="transcripts", query=q, source=False, docvalue_fields=["episode_id", "filename"])

    results = []
    for r in resp['hits']['hits']:
        entry = {
            'episode_id': r['fields']['episode_id'][0],
            'filename': r['fields']['filename'][0],
        }
        segments = []
        for inner in r['inner_hits']['transcript']['hits']['hits']:
            source = inner['_source']
            segments.append({
                'speaker': source['speaker'],
                'start': source['start'],
                'end': source['end'],
                'text': source['text'],
            })
        entry['hits'] = segments
        results.append(entry)
    return results

In [None]:
# Example query: segments attributed to "Frank Capellan" whose text matches "Krieg".
query_transcripts("Frank Capellan", "Krieg")

In [None]:
# Show the annotated transcript of the episode whose file name contains "folge_272".
show_episode(get_fname(272))