# Latent Dirichlet Allocation for Topic Analysis

Performs LDA on literature-review abstracts for SGSMA. Two document representations are explored: raw term frequency (TF) and term frequency-inverse document frequency (TF-IDF).

# Imports

In [1]:
%matplotlib notebook
import csv
from collections import defaultdict

import matplotlib.pyplot as plt
import numpy as np 
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.externals import joblib
from nltk.probability import FreqDist

import warnings
warnings.filterwarnings('ignore')

# Read in doc metadata

In [2]:
# Index every metadata row by its document title; when a title repeats,
# the last row read wins.
with open("data/lit-review-doc-metadata.csv", "r") as f:
    doc_lookup = {row["Document Title"]: row for row in csv.DictReader(f)}

print(len(doc_lookup))

9734


# Filter docs

In [3]:
# Attach the reviewer-assigned domain to each known document and count the
# category rows whose title has no metadata entry.
not_found = 0
with open("data/lit-review-categories.csv") as f:
    reader = csv.DictReader(f)
    for row in reader:
        if row["Document Title"] not in doc_lookup:
            not_found += 1
            continue
        doc_lookup[row["Document Title"]]["Label"] = row["Domain"]

print("Missing docs: {}".format(not_found))

# Keep only documents that received a label outside the ignored set
# (a missing label is treated like the empty label, which is ignored).
ignore = ("", "Overview", "Exclude")
docs = [doc for doc in doc_lookup.values() if doc.get("Label", "") not in ignore]
print("Filtered docs: {}".format(len(docs)))

corpus = [doc["Abstract"] for doc in docs]

Missing docs: 296
Filtered docs: 6413


# Create and Fit LDA model

In [4]:
# Number of topics to fit and vocabulary size cap
no_topics = 50
no_features = 1000

# Raw term counts over uni-, bi- and tri-grams
tf_vectorizer = CountVectorizer(
    max_df=0.8,
    min_df=2,
    max_features=no_features,
    stop_words='english',
    ngram_range=(1, 3),
)
tf = tf_vectorizer.fit_transform(corpus)
tf_feature_names = tf_vectorizer.get_feature_names()

# TF-IDF vectorizer built from the count vectorizer's own parameters so the
# two representations share the same settings
tfidf_vectorizer = TfidfVectorizer(**tf_vectorizer.get_params())
tfidf = tfidf_vectorizer.fit_transform(corpus)
tfidf_feature_names = tfidf_vectorizer.get_feature_names()

# One LDA model per representation, both seeded identically
lda_tf = LatentDirichletAllocation(n_components=no_topics, random_state=0).fit(tf)

lda_tfidf = LatentDirichletAllocation(n_components=no_topics, random_state=0).fit(tfidf)

Save the term-frequency model and its related data (document-term matrix, vectorizer, documents) to disk for separate analysis

In [5]:
# Bundle the TF model with everything needed to reuse it in another notebook
joblib.dump(
    {
        "model": lda_tf,
        "dtm": tf,
        "vectorizer": tf_vectorizer,
        "docs": docs,
    },
    'lda_tf_review.joblib',
)

['lda_tf_review.joblib']

# Display top words per topic

In [6]:
no_top_words = 10


def display_topics(model, feature_names, no_top_words):
    """Print one line per topic listing its highest-weighted terms.

    model: fitted object exposing ``components_`` (one weight row per topic).
    feature_names: sequence mapping a column index to its term.
    no_top_words: how many terms to list for each topic.
    """
    for topic_idx, weights in enumerate(model.components_):
        # argsort is ascending, so walk backwards from the end to get the
        # largest weights first.
        ranked = weights.argsort()[:-no_top_words - 1:-1]
        top_terms = " ".join(feature_names[i] for i in ranked)
        print("Topic {}: {}".format(topic_idx, top_terms))

## Term Frequency

In [7]:
# Top terms per topic for the LDA model fitted on raw term counts
display_topics(lda_tf, tf_feature_names, no_top_words)

Topic 0: measurement units measurement units phasor phasor measurement units phasor measurement model power dynamic models
Topic 1: bus ieee proposed 14 method placement optimal ieee 14 test observability
Topic 2: parameters parameter measurements errors pmu estimation identification pmu measurements measurement error
Topic 3: islanding time real real time actions cascading based power detection risk
Topic 4: fault location algorithm fault location proposed line proposed algorithm faults using based
Topic 5: data detection event synchrophasor events analysis time pmu data paper synchrophasor data
Topic 6: standard synchrophasor ieee 118 c37 c37 118 measurement pmu performance test
Topic 7: power flow power flow network analysis model power network based simulation results
Topic 8: fault detection faults proposed protection technique scheme fault detection current using
Topic 9: real time real time hardware simulator simulation algorithm platform developed software
Topic 10: sampling me

## Term Frequency / Inverse Document Frequency

In [8]:
# Top terms per topic for the LDA model fitted on TF-IDF weights
display_topics(lda_tfidf, tfidf_feature_names, no_top_words)

Topic 0: zone failures field feeder feedback features feature feasibility faults fault location
Topic 1: hierarchical zone failure feeder feedback features feature feasibility faults fault location
Topic 2: zone failures field feeder feedback features feature feasibility faults fault location
Topic 3: virtual lead zone failures feeder feedback features feature feasibility faults
Topic 4: oscillations oscillation self known detecting control systems caused positioning gps authors
Topic 5: data pmu data synchrophasor synchrophasor data event pmu spectral samples synchrophasor measurements detection
Topic 6: sensor sensors factors individual wireless faster failure failures false fast
Topic 7: shedding load shedding load scheme frequency schemes adaptive protection local uncertainties
Topic 8: iec iec 61850 61850 protective c37 118 c37 individual substation feasibility fault detection
Topic 9: closed closed loop loop delays delay stability analysis feedback control scheme margin modeled
T

# Find predictions for training data

In [9]:
# Assign each training document to the topic with the largest weight in its
# document-topic distribution.
tf_predictions = lda_tf.transform(tf).argmax(axis=1)
tfidf_predictions = lda_tfidf.transform(tfidf).argmax(axis=1)

# Sanity check: one prediction per document for both models
tf_predictions.shape, tfidf_predictions.shape

((6413,), (6413,))

In [10]:
def plot_topic_distribution(predictions, title, n_topics=None):
    """Plot how many documents were assigned to each topic.

    Parameters
    ----------
    predictions : array-like
        Topic index assigned to each document (the call sites pass the
        argmax of a model's document-topic matrix).
    title : str
        Title shown on the figure.
    n_topics : int, optional
        Number of topics in the model. Defaults to the notebook-level
        ``no_topics`` setting.

    Returns
    -------
    None
        The figure is drawn as a side effect.
    """
    if n_topics is None:
        n_topics = no_topics

    fig, ax = plt.subplots()
    ax.set_xlabel("topics")
    ax.set_ylabel("document count")
    ax.set_title(title)

    # Use explicit edges at k - 0.5 so every topic id gets its own bin.
    # Passing only a bin count spreads the bins over the observed min..max,
    # which misaligns bars and topic ids whenever topic 0 or the last topic
    # has no documents.
    bin_edges = np.arange(n_topics + 1) - 0.5
    pd.Series(predictions).hist(bins=bin_edges, ax=ax)

In [11]:
# Document counts per assigned topic for the term-frequency model
plot_topic_distribution(tf_predictions, "TF LDA")

<IPython.core.display.Javascript object>

In [12]:
# Document counts per assigned topic for the TF-IDF model
plot_topic_distribution(tfidf_predictions, "TFIDF LDA")

<IPython.core.display.Javascript object>

# Interactive Topic Explorer

In [13]:
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
from ipywidgets import IntSlider, Output
from IPython.display import display, clear_output

## TF Explorer

In [14]:
# Number of documents assigned to each topic by the term-frequency model
tf_freq = FreqDist(tf_predictions)
out1 = Output()

def f_tf(topic):
    """Show the titles of all documents assigned to ``topic`` by the TF model.

    The listing is written into the ``out1`` widget, replacing whatever the
    previous selection printed there.
    """
    indexes = np.where(tf_predictions==topic)[0]
    with out1:
        clear_output()
        for idx in indexes:
            print("- {}".format(docs[idx]["Document Title"]))


# Dropdown options are (label, topic id) pairs, ordered by topic id
interact(f_tf, topic=sorted([("Topic {}: {} docs".format(k, v), k) for k, v in tf_freq.items()], key=lambda d: d[1]))
display(out1)

interactive(children=(Dropdown(description='topic', options=(('Topic 0: 83 docs', 0), ('Topic 1: 270 docs', 1)…

Output()

## TFIDF Explorer

In [15]:
# Per-topic document counts under the TF-IDF model
tfidf_freq = FreqDist(tfidf_predictions)
tfidf_out = Output()


def tfidf_f(topic):
    """List, inside ``tfidf_out``, the titles of documents assigned to ``topic``."""
    matching = np.flatnonzero(tfidf_predictions == topic)
    with tfidf_out:
        # Start from a clean widget so only the current selection is shown
        clear_output()
        for doc_idx in matching:
            print("- {}".format(docs[doc_idx]["Document Title"]))


# Build (label, topic id) dropdown entries and order them by topic id
interact(
    tfidf_f,
    topic=sorted(
        [("Topic {}: {} docs".format(k, v), k) for k, v in tfidf_freq.items()],
        key=lambda d: d[1],
    ),
)
display(tfidf_out)

interactive(children=(Dropdown(description='topic', options=(('Topic 0: 16 docs', 0), ('Topic 15: 12 docs', 15…

Output()

## Topic Word Visual Explorer

In [16]:
import pyLDAvis
import pyLDAvis.sklearn
# Switch pyLDAvis to notebook display so prepared visualisations render as cell output
pyLDAvis.enable_notebook()

In [17]:
# Visualise the term-frequency model with its own document-term matrix and vectorizer.
# MMDS and TSNE can be used for plotting rather than the default PCoA
pyLDAvis.sklearn.prepare(lda_tf, tf, tf_vectorizer, mds='mmds')

In [19]:
# Visualise the TF-IDF model. The model must be the one fitted on the TF-IDF
# matrix (lda_tfidf); pairing lda_tf with TF-IDF data would combine topics
# learned from one representation with statistics from the other.
pyLDAvis.sklearn.prepare(lda_tfidf, tfidf, tfidf_vectorizer, mds='mmds')