In [6]:
%matplotlib inline

# Topic extraction with Non-negative Matrix Factorization and Latent Dirichlet Allocation


This notebook applies `sklearn.decomposition.NMF` and
`sklearn.decomposition.LatentDirichletAllocation` to the Rx thorax (chest radiograph) report corpus and extracts additive models of the topic structure of the
corpus.  The output is a list of topics, each represented as a list of
terms (weights are not shown).

Non-negative Matrix Factorization is applied with two different objective
functions: the Frobenius norm, and the generalized Kullback-Leibler divergence.
The latter is equivalent to Probabilistic Latent Semantic Indexing.

NMF has polynomial time complexity. For LDA, the time complexity is
proportional to (n_samples * iterations).

Source: http://scikit-learn.org/0.18/auto_examples/applications/topics_extraction_with_nmf_lda.html


In [10]:
from time import time
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation


# Read the preprocessed report sentences and report how long the load took.
print("Loading dataset...")
t0 = time()
df = pd.read_csv(
    'report_sentences_preprocessed.csv',
    encoding="ISO-8859-1",
    na_filter=False,
)
# One corpus entry per CSV row, taken from the 'v_preprocessed' column.
data = df['v_preprocessed'].tolist()
print("done in %0.3fs." % (time() - t0))


# Settings shared by the vectorizers and topic models in this cell.
n_features = 1000      # passed to both vectorizers as max_features
n_components = 14      # number of topics requested from NMF and LDA
n_top_words = 20       # terms printed per topic
n_samples = len(data)  # only used in the progress messages

def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted terms of every topic of a fitted model.

    Parameters
    ----------
    model : object
        Fitted decomposition model exposing ``components_``; each row is
        treated as one topic and must support ``argsort()``.
    feature_names : sequence of str
        Term for each column index of ``model.components_``.
    n_top_words : int
        Number of terms to print per topic, strongest first.

    One line is printed per topic, followed by a single blank line.
    """
    for topic_idx, weights in enumerate(model.components_):
        # argsort() is ascending, so reverse it and keep the leading entries.
        strongest = weights.argsort()[::-1][:n_top_words]
        terms = [feature_names[column] for column in strongest]
        print("Topic #%d: " % topic_idx + " ".join(terms))
    print()



# Both vectorizers use the same vocabulary filters: terms found in fewer than
# 2 documents (min_df=2) or in more than 95% of the documents (max_df=0.95)
# are dropped, and at most n_features terms are kept (max_features).

# Use tf-idf features for NMF.
print("Extracting tf-idf features for NMF...")
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2,
                                   max_features=n_features)
t0 = time()
tfidf = tfidf_vectorizer.fit_transform(data)
print("done in %0.3fs." % (time() - t0))

# Use tf (raw term count) features for LDA.
print("Extracting tf features for LDA...")
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2,
                                max_features=n_features)
t0 = time()
tf = tf_vectorizer.fit_transform(data)
print("done in %0.3fs." % (time() - t0))
print()

# Fit the first NMF model (Frobenius norm objective, per the message below)
# on the tf-idf matrix and time the fit.
# NOTE(review): the `alpha` keyword and `get_feature_names()` used in this
# section are older scikit-learn names; newer releases presumably expect
# `alpha_W`/`alpha_H` and `get_feature_names_out()` - confirm against the
# installed version before re-running.
print("Fitting the NMF model (Frobenius norm) with tf-idf features, "
      "n_samples=%d and n_features=%d..."
      % (n_samples, n_features))
t0 = time()
nmf = NMF(n_components=n_components, random_state=1,
          alpha=.1, l1_ratio=.5).fit(tfidf)
print("done in %0.3fs." % (time() - t0))

# Show the n_top_words strongest terms of each Frobenius-norm topic.
print("\nTopics in NMF model (Frobenius norm):")
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_words)

# Fit the second NMF model: same tf-idf input, but with the generalized
# Kullback-Leibler divergence (beta_loss) and the 'mu' solver.
# `nmf` is rebound here, so the Frobenius-norm model is no longer reachable
# after this point.
print("Fitting the NMF model (generalized Kullback-Leibler divergence) with "
      "tf-idf features, n_samples=%d and n_features=%d..."
      % (n_samples, n_features))
t0 = time()
nmf = NMF(n_components=n_components, random_state=1,
          beta_loss='kullback-leibler', solver='mu', max_iter=1000, alpha=.1,
          l1_ratio=.5).fit(tfidf)
print("done in %0.3fs." % (time() - t0))

# Show the topics of the KL-divergence model. The feature names are fetched
# again from the same, unchanged vectorizer, so they match the list above.
print("\nTopics in NMF model (generalized Kullback-Leibler divergence):")
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_words)

# Fit LDA on the raw term counts (`tf`), using online learning for 5 passes.
# Only the call to fit() is timed; constructing the estimator is not.
# NOTE(review): evaluate_every=128 is larger than max_iter=5, so perplexity is
# presumably never evaluated during this fit - confirm whether that is intended.
print("Fitting LDA models with tf features, "
      "n_samples=%d and n_features=%d..."
      % (n_samples, n_features))
lda = LatentDirichletAllocation(n_components=n_components, max_iter=5,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=0,
                                evaluate_every = 128,
                                verbose = 1)
t0 = time()
lda.fit(tf)
print("done in %0.3fs." % (time() - t0))

# Show the n_top_words strongest terms of each LDA topic.
# NOTE(review): get_feature_names() is the older scikit-learn accessor; newer
# releases presumably expect get_feature_names_out() - confirm.
print("\nTopics in LDA model:")
tf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(lda, tf_feature_names, n_top_words)

Loading dataset...
done in 0.740s.
Extracting tf-idf features for NMF...
done in 3.341s.
Extracting tf features for LDA...
done in 3.086s.

Fitting the NMF model (Frobenius norm) with tf-idf features, n_samples=206369 and n_features=1000...
done in 29.054s.

Topics in NMF model (Frobenius norm):
Topic #0: alter sin radiolog significativas signific grand no inspir plac identif hallazgos observ inter fdo rest estudi torax parenquim visualiz parenquimat
Topic #1: signific hallazg sin patologico fdo radiolog cardiomediastin entid siluet aspect patolog variacion dorsal escoliosis paciente respect parenquim normales espondilosis visualiz
Topic #2: con pleural izquierd derech derram aument relacion probabl densid bilateral atelectasi basal sen hemitorax costofren pinzamient hili valor bas infiltr
Topic #3: normal dentr limit siluet cardiomediastin normalidad estudi cardiotorac indic pulmon parenquim alto rang torax exploracion libres posteroanterior preoperatori cardi variant
Topic #4: epoc s

# Topic extraction using doc2vec and k-means

Doc2Vec (also called Paragraph Vectors) is a generalization of Word2Vec that learns a vector for each document (https://cs.stanford.edu/~quocle/paragraph_vector.pdf). The K-means algorithm is then used to cluster those vectors into topics.

To generate the doc2vec vectors we use gensim.

In [2]:
import gensim
import os
import collections
import smart_open
import random

In [32]:
from time import time
import pandas as pd
import numpy as np
# Read the preprocessed sentences again and re-segment them for doc2vec.
print("Loading dataset...")
t0 = time()
df = pd.read_csv(
    'report_sentences_preprocessed.csv',
    encoding="ISO-8859-1",
    na_filter=False,
)
# Join every row into one long string, separated by single spaces ...
text = df['v_preprocessed'].str.cat(sep=' ')
# ... then cut it at every '.', trim each piece and drop the empty ones.
data = [segment for segment in map(str.strip, text.split('.')) if segment != '']
print("done in %0.3fs." % (time() - t0))

Loading dataset...
done in 1.786s.


In [69]:
def preprocess_corpus(data):
    """Lazily convert raw sentences into gensim ``TaggedDocument`` objects.

    Parameters
    ----------
    data : iterable of str
        One sentence per item.

    Yields
    ------
    gensim.models.doc2vec.TaggedDocument
        The whitespace-separated tokens of the sentence, tagged with a
        one-element list holding the sentence's position in ``data``.
    """
    for position, sentence in enumerate(data):
        # split() without a separator collapses runs of whitespace, so the
        # token list never contains empty strings.
        tokens = sentence.split()
        yield gensim.models.doc2vec.TaggedDocument(tokens, [position])
# Materialize the generator into a list: later cells index into the corpus
# and iterate over it more than once.
train_corpus = list(preprocess_corpus(data))   
          

In [70]:
# Create the (still untrained) Doc2Vec model. alpha and min_alpha are set to
# the same value so the learning rate stays constant.
# NOTE(review): `size` and `iter` are older gensim keyword names; newer
# releases presumably expect `vector_size` and `epochs` - confirm against the
# installed version.
model = gensim.models.Doc2Vec(size=300, window=10, min_count=5, workers=11,alpha=0.025, min_alpha=0.025, iter=55) # use fixed learning rate



In [71]:
# Build the vocabulary from the tagged corpus. train_corpus is already a
# list, so it is passed directly rather than copied with list(...).
model.build_vocab(train_corpus)

In [None]:
%time model.train(list(train_corpus), total_examples=model.corpus_count, epochs=model.iter)


In [37]:
# Persist the model to the file 'gensim_doc2vec' (relative to the working directory).
model.save('gensim_doc2vec')

In [38]:
# Rebind `model` to whatever is stored in 'gensim_doc2vec'.
# NOTE(review): if that file was written by an earlier run, this replaces the
# model configured above with an older one - confirm the file is current.
model = gensim.models.doc2vec.Doc2Vec.load('gensim_doc2vec')

In [39]:
# Infer a vector for a short, already tokenized sentence.
# NOTE(review): the saved output of this cell has 50 components, while the
# model above is configured with size=300; the output presumably comes from
# an earlier model - re-run to confirm.
model.infer_vector(['sin', 'hallazg', 'signific'])

array([ 0.15340352, -0.11914615,  0.01125181, -0.08972302, -0.06965642,
       -0.06225431,  0.00456697,  0.13876516, -0.03326254, -0.04685351,
        0.11211377,  0.12251745,  0.04491276, -0.06148997,  0.13704689,
       -0.05602381,  0.02089587,  0.03450793,  0.10800999, -0.09400132,
       -0.13894641, -0.16020191, -0.05783847,  0.07556529,  0.03521203,
       -0.11349928,  0.01542531,  0.2096909 ,  0.04083255, -0.09696309,
       -0.01472371, -0.0388328 ,  0.05001611, -0.07439538,  0.09838079,
       -0.01198428, -0.01821596, -0.11436306, -0.14083408,  0.00779142,
        0.0995015 ,  0.07938567,  0.03741743, -0.03562498,  0.23675995,
        0.07780671, -0.12604718,  0.12318231,  0.03296375, -0.02621838], dtype=float32)

Evaluating the model

In [40]:
# Self-similarity check: re-infer a vector for a training document and see at
# which position that same document appears among its nearest neighbours
# (rank 0 means the document is the closest match to itself).
ranks = []
second_ranks = []
# Only the first 10 documents are checked; the full-corpus variant would be:
#for doc_id in range(len(train_corpus)):
for doc_id in range(10):
    print(train_corpus[doc_id].words)
    inferred_vector = model.infer_vector(train_corpus[doc_id].words)
    
    # topn=len(model.docvecs) requests the complete ranking, so the lookup of
    # doc_id in the next statement can search every stored document.
    sims = model.docvecs.most_similar([inferred_vector], topn=len(model.docvecs))
    rank = [docid for docid, sim in sims].index(doc_id)
    print(rank)
    ranks.append(rank)
    
    # Keep the second entry of each ranking as a (doc id, similarity) pair.
    second_ranks.append(sims[1])
# `doc_id` and `sims` keep their values from the last iteration; the
# similar/dissimilar printout in a later cell reads them.

['sign', 'radiolog', 'epoc']
156111
['radiografi', 'torax', 'posteroanterior', 'lateral']
39595
['no', 'identif', 'sign', 'radiolog', 'derram', 'pleural', 'signific']
107
['sin', 'hallazg', 'signific']
204014
['cambi', 'engros', 'pleural', 'apical', 'bilateral', 'nodular', 'probabl', 'granulom', 'secundari', 'enfermed', 'residual', 'no', 'dispon', 'radiografi', 'previ', 'pod', 'compar']
1
['tub', 'endotraqueal', '1']
192
['granulom', 'calcific', 'vertic', 'derech', 'cardiomegali']
29591
['compar', 'con', 'estudi', 'previ', 'fech', '03', '01', '2001', 'objetiv', 'resolucion', 'parcial', 'infiltr', 'neumon', 'basal', 'derech', 'resolucion', 'practic', 'complet', 'basal', 'izquierd']
24
['sin', 'hallazg', 'significacion', 'patolog', 'cambi', 'secundari', 'cirugi', 'vertic', 'pulmon', 'derech']
26
['tub', 'endotraqueal', '2', '5', 'cm', 'carin']
58074


In [41]:
# Count how often each self-similarity rank occurred in the evaluation loop.
collections.Counter(ranks) 

Counter({1: 1,
         24: 1,
         26: 1,
         107: 1,
         192: 1,
         29591: 1,
         39595: 1,
         58074: 1,
         156111: 1,
         204014: 1})

In [42]:
# Show the most, median and least similar documents for one query document.
# `doc_id` and `sims` are not defined in this cell: they are the values left
# over from the last iteration of the evaluation loop, so that cell must have
# been run first.
# NOTE(review): the saved output prints a model repr containing "d50", which
# does not match the size=300 configuration above; the output presumably
# comes from an earlier model - re-run to confirm.
print('Document ({}): «{}»\n'.format(doc_id, ' '.join(train_corpus[doc_id].words)))
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model)
# sims is ordered from most to least similar, so index 0, the middle index and
# the last index give the three reference points.
for label, index in [('MOST', 0), ('MEDIAN', len(sims)//2), ('LEAST', len(sims) - 1)]:
    # sims[index] is a (doc id, similarity) pair; the id selects the words to print.
    print(u'%s %s: «%s»\n' % (label, sims[index], ' '.join(train_corpus[sims[index][0]].words)))

Document (9): «tub endotraqueal 2 5 cm carin»

SIMILAR/DISSIMILAR DOCS PER MODEL Doc2Vec(dm/m,d50,n5,w5,mc2,s0.001,t3):

MOST (267180, 0.9394081234931946): «via central subclavi izquierd termi»

MEDIAN (157240, 0.5713056325912476): «mastectomi izquierd»

LEAST (343335, -0.29552340507507324): «implantefis»



TODO: try TURI https://github.com/apple/turicreate/blob/master/userguide/text/intro.md