In [6]:
%matplotlib inline

# Topic extraction with Non-negative Matrix Factorization and Latent Dirichlet Allocation


This notebook applies `sklearn.decomposition.NMF` and
`sklearn.decomposition.LatentDirichletAllocation` to the Rx thorax report corpus to extract additive models of the topic structure of the
corpus.  The output is a list of topics, each represented as a list of
terms (weights are not shown).

Non-negative Matrix Factorization is applied with two different objective
functions: the Frobenius norm, and the generalized Kullback-Leibler divergence.
The latter is equivalent to Probabilistic Latent Semantic Indexing.

NMF has polynomial time complexity. For LDA, the time complexity is
proportional to (n_samples * iterations).


In [10]:
from time import time
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation


# --- Load the corpus ---------------------------------------------------------
# The text to model is taken from the 'v_preprocessed' column of the CSV.
# na_filter=False makes pandas keep empty cells as empty strings instead of
# converting them to NaN, so every entry handed to the vectorizers is a string.
print("Loading dataset...")
t0 = time()
df = pd.read_csv(
    'report_sentences_preprocessed.csv',
    encoding="ISO-8859-1",
    na_filter=False,
)
data = df['v_preprocessed'].tolist()
print("done in %0.3fs." % (time() - t0))


# --- Run configuration -------------------------------------------------------
n_features = 1000      # vocabulary cap (max_features) for both vectorizers
n_components = 14      # number of topics requested from NMF and LDA
n_top_words = 20       # terms printed per topic
n_samples = len(data)  # only used in the progress messages below

def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted terms of every topic in a fitted model.

    Parameters
    ----------
    model : object exposing ``components_``
        ``components_`` is iterated row by row (one row per topic); each row
        must support ``argsort()``.
    feature_names : sequence
        Term associated with each column index of ``components_``.
    n_top_words : int
        Number of terms to list for each topic.

    Prints one line per topic in the form ``Topic #<idx>: term term ...``,
    with terms ordered from highest to lowest weight, then one blank line.
    """
    for topic_idx, weights in enumerate(model.components_):
        # argsort() is ascending, so reverse it and keep the leading entries.
        descending = weights.argsort()[::-1]
        top_terms = [feature_names[col] for col in descending[:n_top_words]]
        print("Topic #%d: " % topic_idx + " ".join(top_terms))
    print()



# Vocabulary pruning, identical for both vectorizers:
#   * min_df=2    -> terms found in fewer than two documents are dropped;
#   * max_df=0.95 -> terms found in more than 95% of the documents are dropped;
#   * max_features=n_features caps the size of the remaining vocabulary.

# NMF is fitted on tf-idf weighted features.
print("Extracting tf-idf features for NMF...")
tfidf_vectorizer = TfidfVectorizer(
    max_df=0.95,
    min_df=2,
    max_features=n_features,
)
t0 = time()
tfidf = tfidf_vectorizer.fit_transform(data)
print("done in %0.3fs." % (time() - t0))

# LDA is fitted on raw term counts (tf).
print("Extracting tf features for LDA...")
tf_vectorizer = CountVectorizer(
    max_df=0.95,
    min_df=2,
    max_features=n_features,
)
t0 = time()
tf = tf_vectorizer.fit_transform(data)
print("done in %0.3fs." % (time() - t0))
print()

# Fit NMF on the tf-idf matrix with the default (Frobenius norm) objective.
print("Fitting the NMF model (Frobenius norm) with tf-idf features, "
      "n_samples=%d and n_features=%d..."
      % (n_samples, n_features))

# Compatibility: scikit-learn 1.0 replaced NMF's single `alpha` with
# `alpha_W` / `alpha_H`, and 1.2 removed `alpha`, so `NMF(alpha=.1)` raises a
# TypeError on current releases. Per the NMF documentation the new penalties
# are additionally multiplied by n_features (for W) and n_samples (for H), so
# the old alpha=.1 is divided by the matrix dimensions to keep a comparable
# regularization strength. On releases without `alpha_W` the original
# `alpha=.1` call is used unchanged.
if "alpha_W" in NMF().get_params():
    nmf_regularization = dict(alpha_W=.1 / tfidf.shape[1],
                              alpha_H=.1 / tfidf.shape[0])
else:
    nmf_regularization = dict(alpha=.1)

t0 = time()
nmf = NMF(n_components=n_components, random_state=1,
          l1_ratio=.5, **nmf_regularization).fit(tfidf)
print("done in %0.3fs." % (time() - t0))

print("\nTopics in NMF model (Frobenius norm):")
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# get_feature_names_out() when it exists. list(...) keeps the result a plain
# list, as the old method returned.
if hasattr(tfidf_vectorizer, "get_feature_names_out"):
    tfidf_feature_names = list(tfidf_vectorizer.get_feature_names_out())
else:
    tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_words)

# Fit NMF on the tf-idf matrix with the generalized Kullback-Leibler
# divergence; this loss requires the multiplicative-update ('mu') solver.
print("Fitting the NMF model (generalized Kullback-Leibler divergence) with "
      "tf-idf features, n_samples=%d and n_features=%d..."
      % (n_samples, n_features))

# Compatibility: scikit-learn 1.0 replaced NMF's single `alpha` with
# `alpha_W` / `alpha_H`, and 1.2 removed `alpha`, so `NMF(alpha=.1)` raises a
# TypeError on current releases. Per the NMF documentation the new penalties
# are additionally multiplied by n_features (for W) and n_samples (for H), so
# the old alpha=.1 is divided by the matrix dimensions to keep a comparable
# regularization strength. On releases without `alpha_W` the original
# `alpha=.1` call is used unchanged.
if "alpha_W" in NMF().get_params():
    nmf_regularization = dict(alpha_W=.1 / tfidf.shape[1],
                              alpha_H=.1 / tfidf.shape[0])
else:
    nmf_regularization = dict(alpha=.1)

t0 = time()
# NOTE: this rebinds `nmf`; the Frobenius-norm model fitted earlier is no
# longer reachable under that name after this statement.
nmf = NMF(n_components=n_components, random_state=1,
          beta_loss='kullback-leibler', solver='mu', max_iter=1000,
          l1_ratio=.5, **nmf_regularization).fit(tfidf)
print("done in %0.3fs." % (time() - t0))

print("\nTopics in NMF model (generalized Kullback-Leibler divergence):")
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# get_feature_names_out() when it exists. list(...) keeps the result a plain
# list, as the old method returned.
if hasattr(tfidf_vectorizer, "get_feature_names_out"):
    tfidf_feature_names = list(tfidf_vectorizer.get_feature_names_out())
else:
    tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_words)

# Fit online LDA on the raw term-count matrix.
print("Fitting LDA models with tf features, "
      "n_samples=%d and n_features=%d..."
      % (n_samples, n_features))
# NOTE(review): with max_iter=5, evaluate_every=128 looks like it never
# triggers a perplexity evaluation -- confirm this is intended.
lda = LatentDirichletAllocation(n_components=n_components, max_iter=5,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=0,
                                evaluate_every=128,
                                verbose=1)
t0 = time()
lda.fit(tf)
print("done in %0.3fs." % (time() - t0))

print("\nTopics in LDA model:")
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# get_feature_names_out() when it exists. list(...) keeps the result a plain
# list, as the old method returned.
if hasattr(tf_vectorizer, "get_feature_names_out"):
    tf_feature_names = list(tf_vectorizer.get_feature_names_out())
else:
    tf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(lda, tf_feature_names, n_top_words)

Loading dataset...
done in 0.740s.
Extracting tf-idf features for NMF...
done in 3.341s.
Extracting tf features for LDA...
done in 3.086s.

Fitting the NMF model (Frobenius norm) with tf-idf features, n_samples=206369 and n_features=1000...
done in 29.054s.

Topics in NMF model (Frobenius norm):
Topic #0: alter sin radiolog significativas signific grand no inspir plac identif hallazgos observ inter fdo rest estudi torax parenquim visualiz parenquimat
Topic #1: signific hallazg sin patologico fdo radiolog cardiomediastin entid siluet aspect patolog variacion dorsal escoliosis paciente respect parenquim normales espondilosis visualiz
Topic #2: con pleural izquierd derech derram aument relacion probabl densid bilateral atelectasi basal sen hemitorax costofren pinzamient hili valor bas infiltr
Topic #3: normal dentr limit siluet cardiomediastin normalidad estudi cardiotorac indic pulmon parenquim alto rang torax exploracion libres posteroanterior preoperatori cardi variant
Topic #4: epoc s