# Latent Dirichlet Allocation for Topic Analysis

Performs LDA on lit review abstracts for SGSMA.  Both term frequency and term frequency/inverse document frequency (TFIDF) are explored.

# Imports

In [58]:
%matplotlib notebook
import csv
from collections import defaultdict

import matplotlib.pyplot as plt
import numpy as np 
import pandas as pd
import seaborn as sns

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.externals import joblib
from nltk.probability import FreqDist

import warnings
warnings.filterwarnings('ignore')


sns.set_style('whitegrid')
sns.set_context('notebook')

# Read in doc metadata

In [4]:
doc_lookup = {}
with open("data/lit-review-doc-metadata.csv", "r") as f:
    reader = csv.DictReader(f)
    for row in reader:
        doc_lookup[row["Document Title"]] = row

print(len(doc_lookup.keys()))

9734


# Filter docs

In [6]:
not_found = 0
with open("data/lit-review-categories.csv") as f:
    reader = csv.DictReader(f)
    for row in reader:
        if row["Document Title"] in doc_lookup:
            doc_lookup[row["Document Title"]]["Label"] = row["Domain"]
        else:
            not_found += 1

print("Missing docs: {}".format(not_found))

ignore = [
    "Exclude",
    "Overview",
    "PMU Placement",
    "PMU",
    "PDC",
    "Unknown",
    "Undefined",
    "Control",
    "Communications",
    "Phasor Estimation",
    "Simulation",
    "Islanding",
    "Time Synchronization"
    "Testing",
    "Real Time Operations",
    "Tools",
    "EMS",
    "Standards",
    "",
    "Phasor estimation",
    "Protection Systems"
]

docs = [d for d in doc_lookup.values() if "Label" in d and d["Label"] not in ignore] 
print("Filtered docs: {}".format(len(docs)))

corpus = [d["Abstract"] for d in docs]

Missing docs: 296
Filtered docs: 3872


# Create and Fit LDA model

In [46]:
# params
no_topics = 40
no_features = 750

# vectorize words
tf_vectorizer = CountVectorizer(max_df=0.8, min_df=2, max_features=no_features, stop_words='english', ngram_range=(1,3))
tf = tf_vectorizer.fit_transform(corpus)
tf_feature_names = tf_vectorizer.get_feature_names()

# create separate tfidf vectorizer
tfidf_vectorizer = TfidfVectorizer(**tf_vectorizer.get_params())
tfidf = tfidf_vectorizer.fit_transform(corpus)
tfidf_feature_names = tfidf_vectorizer.get_feature_names()


# Create/fit models
lda_tf = LatentDirichletAllocation(
    n_components=no_topics, 
    random_state=0).fit(tf)

lda_tfidf = LatentDirichletAllocation(
    n_components=no_topics, 
    random_state=0).fit(tfidf)

Save model and related data to disk for separate analysis

In [47]:
joblib.dump({"model": lda_tf, "dtm": tf, "vectorizer": tf_vectorizer, "docs": docs}, 'lda_tf_review.joblib') 

['lda_tf_review.joblib']

# Display top words per topic

In [48]:
no_top_words = 10

def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic {}: {}".format(
            topic_idx,
            " ".join([feature_names[i] for i in topic.argsort()[:-no_top_words - 1:-1]])
        ))

## Term Frequency

In [49]:
display_topics(lda_tf, tf_feature_names, no_top_words)

Topic 0: sensor sensors propose respect effectiveness proposed best grid devices operators optimal
Topic 1: generator effective source shown method low frequency oscillations matrix frequency forced
Topic 2: bus ieee proposed test using method new 39 power systems
Topic 3: network neural networks power network neural network stage voltages nodes graph algorithm
Topic 4: current voltage method technique based impedance classification proposed learning using
Topic 5: problem algorithm optimization optimal approach proposed solution model set paper
Topic 6: signal signals event events small frequency small signal features processing time
Topic 7: pmu data phasor measurement unit phasor measurement pmu data measurement unit phasor measurement unit unit pmu
Topic 8: equipment technology data substation paper level proposed monitor statistical monitoring
Topic 9: model transmission line line transmission performance measures order network accuracy reliability
Topic 10: topology series time s

## Term Frequency / Inverse Document Frequency

In [50]:
display_topics(lda_tfidf, tfidf_feature_names, no_top_words)

Topic 0: parameters line parameter line parameters estimate considering static estimation state estimation method proposed
Topic 1: modes operators critical order pmu locations bus modal robust introduced
Topic 2: prediction model proposed dynamic risk based power ieee decomposition bus
Topic 3: model estimation network algorithms conditions power attacks evaluation measurements pmu
Topic 4: impedance method feeder power monitoring reactive estimates conditions strategy distribution
Topic 5: se matrix state proposed robust cyber physical pmu states method
Topic 6: bad data events bad signals signal time time synchronization features distance synchronization
Topic 7: state estimation state estimation measurements estimator measurement data method state estimator power
Topic 8: attacks data estimation currents method achieved voltages measurements robustness states
Topic 9: transmission line model transmission line uncertainty line parameters optimization robust models verify
Topic 10: s

# Find predictions for training data

In [51]:
tf_predictions = np.argmax(lda_tf.transform(tf), axis=1)
tfidf_predictions = np.argmax(lda_tfidf.transform(tfidf), axis=1)
(tf_predictions.shape, tfidf_predictions.shape)

((3872,), (3872,))

In [74]:
def plot_topic_distribution(predictions, title):
    fix, ax = plt.subplots()
    g = sns.distplot(predictions, bins=no_topics, norm_hist=False, kde=False, color="", ax=ax)
    ax.set_xlabel("topics")
    ax.set_ylabel("document count")
    ax.set_title(title)

#     series = pd.Series(predictions)
#     _ = series.hist(bins=no_topics, ax=ax)

In [93]:
def plot_topic_distribution(predictions, title, save=None):
    fix, ax = plt.subplots(figsize=(9,6))
    ax.set_xlabel("topics")
    ax.set_ylabel("document count")
    ax.set_title(title)
    ax.grid('false', which='x')
        
    series = pd.Series(predictions)
    _ = series.hist(bins=no_topics, ax=ax)
    
    if save:
        plt.savefig(save)
    

In [94]:
plot_topic_distribution(tf_predictions, "LDA Topic Distribution", save="topic_distribution.png")

<IPython.core.display.Javascript object>

In [54]:
plot_topic_distribution(tfidf_predictions, "TFIDF LDA")

<IPython.core.display.Javascript object>

# Interactive Topic Explorer

In [16]:
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
from ipywidgets import IntSlider, Output
from IPython.display import display, clear_output

## TF Explorer

In [14]:
tf_freq = FreqDist(tf_predictions)
out1 = Output()

def f_tf(topic):
    indexes = np.where(tf_predictions==topic)[0]
    text = []
    with out1:
        clear_output()
        for idx in indexes:
            print("- {}".format(docs[idx]["Document Title"]))
        

interact(f_tf, topic=sorted([("Topic {}: {} docs".format(k, v), k) for k, v in tf_freq.items()], key=lambda d: d[1]))
display(out1)

interactive(children=(Dropdown(description='topic', options=(('Topic 0: 83 docs', 0), ('Topic 1: 270 docs', 1)…

Output()

## TFIDF Explorer

In [15]:
tfidf_freq = FreqDist(tfidf_predictions)
tfidf_out = Output()

def tfidf_f(topic):
    indexes = np.where(tfidf_predictions==topic)[0]
    with tfidf_out:
        clear_output()
        for idx in indexes:
            print("- {}".format(docs[idx]["Document Title"]))
        

interact(tfidf_f, topic=sorted([("Topic {}: {} docs".format(k, v), k) for k, v in tfidf_freq.items()], key=lambda d: d[1]))
display(tfidf_out)

interactive(children=(Dropdown(description='topic', options=(('Topic 0: 16 docs', 0), ('Topic 15: 12 docs', 15…

Output()

## Topic Word Visual Explorer

In [17]:
import pyLDAvis
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()

In [76]:
# MMDS and TSNE can be used for plotting rather than the default PCoA
pyLDAvis.sklearn.prepare(lda_tf, tf, tf_vectorizer, mds='tsne')

In [19]:
pyLDAvis.sklearn.prepare(lda_tf, tfidf, tfidf_vectorizer, mds='mmds')