In [1]:
import pickle

import numpy as np
import pandas as pd
import pyLDAvis

In [2]:
import warnings

# Install a catch-all "ignore" filter so no warning is shown for the rest
# of the session (this also hides deprecation notices from the libraries used).
warnings.simplefilter('ignore')

In [3]:
# Load the pickled model bundle. The context manager guarantees the file
# handle is closed even if unpickling raises.
# NOTE(security): pickle.load can execute arbitrary code -- only load
# pickle files that come from a trusted source.
with open("../base-model.pkl", "rb") as f:
    model = pickle.load(f)

In [4]:
# The loaded object is indexed by key; its 'pipeline' entry is the estimator
# pipeline that is fitted and inspected (via named_steps) further down.
P = model['pipeline']

In [5]:
def chunker(lines):
    """Merge consecutive transcript lines spoken by the same speaker.

    Parameters
    ----------
    lines : sequence of dict
        Transcript lines in order. Each entry must provide the keys
        ``'speaker_id'`` and ``'best_text'``.

    Returns
    -------
    list of str
        One string per uninterrupted speaker turn, in order of appearance.
        Every line's text is stripped and the pieces of a turn are joined
        with a single space, so chunks carry no leading/trailing whitespace.
        An empty ``lines`` yields an empty list.
    """
    # Nothing to chunk; avoids indexing lines[0] on an empty transcript.
    if not lines:
        return []

    chunks = []
    parts = []
    speaker_id = lines[0]['speaker_id']

    for line in lines:
        if line['speaker_id'] != speaker_id:
            # Speaker changed: close the current turn and start a new one.
            chunks.append(' '.join(parts))
            parts = []
            speaker_id = line['speaker_id']

        # Strip every line the same way (including the first line of a turn)
        # and skip blank fragments so no doubled spaces appear in the chunk.
        text = line['best_text'].strip()
        if text:
            parts.append(text)

    # Flush the final speaker turn.
    chunks.append(' '.join(parts))
    return chunks

In [6]:
from glob import glob

# Glob pattern matching every transcript JSON under the data-* directories.
PATHS = "../data/data-*/transcripts/json/*.json"

# Collect the matching paths, then order them so processing is deterministic.
files = glob(PATHS)
files.sort()

In [7]:
import json

# Build the corpus: one document per uninterrupted speaker turn, gathered
# across every transcript file in sorted path order.
docs = []
for file in files:
    # Use a context manager so each file handle is closed promptly instead of
    # leaking one open descriptor per transcript.
    with open(file, 'r') as fh:
        data = json.load(fh)
    docs.extend(chunker(data['lines']))

# A Series gives downstream code vectorised access to the documents.
docs = pd.Series(docs)

In [8]:
# Number of speaker-turn documents collected from all transcript files.
len(docs)

10325

In [9]:
# Fit the loaded pipeline on the chunked transcripts and keep its output;
# `y` is later passed to from_scikit as the document-topic weights.
# NOTE(review): fit_transform re-fits the pipeline that was loaded from the
# pickle rather than reusing its stored fit -- confirm that is intended.
y = P.fit_transform(docs)

In [10]:
# Pull the 'counts' and 'LDA' steps out of the pipeline; both are handed to
# from_scikit to build the visualisation inputs.
vect, lda = P.named_steps['counts'], P.named_steps['LDA']

In [11]:
def from_scikit(y, vect, lda, docs, **kwargs):
    """Prepare a pyLDAvis visualisation from a fitted vectorizer/LDA pair.

    Parameters
    ----------
    y : array-like, shape (n_docs, n_topics)
        Document-topic weights; each row is normalised to sum to 1.
    vect : fitted vectorizer
        Must expose ``transform`` and either ``get_feature_names_out`` or
        the older ``get_feature_names``.
    lda : fitted topic model
        Must expose ``components_`` (topic-term weights); each row is
        normalised to sum to 1.
    docs : sequence of str
        The raw documents, in the same order as the rows of ``y``.
    **kwargs
        Forwarded unchanged to ``pyLDAvis.prepare`` (e.g. ``mds``, ``R``).

    Returns
    -------
    The object returned by ``pyLDAvis.prepare``.
    """
    def norm(x):
        """Standardize rows to sum to 1"""
        return x / x.sum(axis=1).reshape(-1, 1)

    # Use transform (not fit_transform): the vectorizer is already fitted, and
    # re-fitting on every call is wasted work that could also change the
    # vocabulary so it no longer lines up with the columns of lda.components_.
    freqs = vect.transform(docs)

    # get_feature_names was removed in scikit-learn 1.2; prefer the newer
    # accessor when it exists and fall back for older installations.
    get_names = getattr(vect, 'get_feature_names_out', None) or vect.get_feature_names

    return pyLDAvis.prepare(
        # pyLDAvis expects the number of tokens per document, not the number
        # of characters, so sum the term counts of each row.
        doc_lengths = np.asarray(freqs.sum(axis=1)).ravel(),
        vocab = list(get_names()),
        term_frequency = np.asarray(freqs.sum(axis=0)).ravel(),
        topic_term_dists = norm(lda.components_),
        doc_topic_dists = norm(y),
        **kwargs)

In [13]:
# Prepare the same visualisation twice, once per multidimensional-scaling
# method, keeping the 20 most relevant terms per topic (R=20).
tsne, pcoa = [
    from_scikit(y, vect, lda, docs, mds=method, R=20)
    for method in ('tsne', 'pcoa')
]

In [14]:
# Render the visualisation that was prepared with mds='tsne'.
pyLDAvis.display(tsne)

In [15]:
# Render the visualisation that was prepared with mds='pcoa'.
pyLDAvis.display(pcoa)