# Short tutorial on gensim for topic modeling

In [1]:
import nbimporter
from benchmark import ImageTags
import numpy as np
from IPython.core.display import display, HTML, Image
import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid1 import make_axes_locatable
import itertools
from sklearn import metrics
from gensim import corpora, models
from classifiers import Classifier

importing Jupyter notebook from benchmark.ipynb
importing Jupyter notebook from classifiers.ipynb


In [2]:
def cm_plot(ax, classes, CM, title, figure):
    im = ax.imshow(CM, interpolation='nearest', cmap=plt.cm.Blues)
    divider = make_axes_locatable(ax)
    cax = divider.append_axes('right', size='5%', pad=0.05)
    figure.colorbar(im, cax=cax, orientation='vertical')
    tick_marks = np.arange(len(classes))
    ax.set_xticks(tick_marks)
    ax.set_xticklabels(classes, rotation=90, fontsize=12)
    ax.set_yticks(tick_marks)
    ax.set_yticklabels(classes, rotation=0, fontsize=12)
    ax.set_title(title, fontsize=16)
    thresh = CM.max() / 2.
    for i, j in itertools.product(range(CM.shape[0]), range(CM.shape[1])):
        ax.text(j, i, CM[i, j], horizontalalignment="center",
                 color="white" if CM[i, j] > thresh else "black", fontsize=12)
    ax.set_ylabel('True label', fontsize=16)
    ax.set_xlabel('Predicted label', fontsize=16)

In [3]:
I = ImageTags('inforet', 'googleimages', url='image_thumbnail_url', 
              selection=None)
C = Classifier(I, 'category')

## 1. Create dictionary

In [4]:
dictionary = corpora.Dictionary(I.tag_stream())

In [None]:
print dictionary
print dictionary.token2id.items()[:10]

### Gensim bag of words

In [None]:
doc = [I.tags[j] for j, x in enumerate(I.M[0]) if x > 0]
print doc
print dictionary.doc2bow(doc)

## 2. Create a corpus

In [24]:
class IMG(object):
    
    def __init__(self, imagetags, dictionary):
        self.I, self.D = imagetags, dictionary
    
    def __iter__(self):
        for doc in self.I.tag_stream():
            yield self.D.doc2bow(doc)
    
    def __len__(self):
        return self.I.M.shape[0]

In [25]:
corpus = IMG(I, dictionary)

## 3. Apply transformations

In [26]:
tfidf = models.TfidfModel(corpus) # Model trained by corpus

In [27]:
corpus_tfidf = tfidf[corpus] # New corpus transformed

## 4. Create models

### LSI

In [28]:
lsi = models.LsiModel(corpus_tfidf, id2word=corpus.D, num_topics=4)
lsi_corpus = lsi[corpus_tfidf]

Term topic matrix with shape (num_topics, vocabulary_size)

In [29]:
lsi.get_topics()

array([[ -1.69941776e-01,  -1.17329690e-03,  -1.64761776e-01, ...,
         -5.25552514e-04,  -2.81119205e-04,  -5.28990705e-04],
       [ -1.75132826e-01,  -1.25554721e-03,  -1.73434011e-01, ...,
          1.24944331e-04,   6.05062660e-05,   3.29446833e-04],
       [  6.17300526e-02,   4.57329943e-04,   6.27342346e-02, ...,
         -1.00821391e-03,  -5.90353513e-04,  -9.60073818e-04],
       [  3.13567981e-02,   2.66717898e-04,   3.23121518e-02, ...,
         -1.19995424e-03,  -7.89869900e-04,   4.75069981e-04]])

In [None]:
print lsi.show_topic(0)
print lsi.show_topic(1)

Document to topics

In [30]:
lsi_predicted = []
for lsi_vec in lsi_corpus:
    values = np.array([np.abs(x) for i, x in lsi_vec])
    lsi_predicted.append(np.argmax(values))

### LDA

In [31]:
lda = models.LdaModel(corpus_tfidf, id2word=corpus.D, num_topics=4)
lda_corpus = lda[corpus_tfidf]

In [None]:
print lda.show_topic(0)
print lda.show_topic(1)

In [32]:
lda_predicted = []
for lda_vec in lda_corpus:
    values = np.array([x for i, x in lda_vec])
    lda_predicted.append(np.argmax(values))

### HDP

In [33]:
hdp = models.HdpModel(corpus_tfidf, id2word=corpus.D)
hdp_corpus = hdp[corpus_tfidf]

In [34]:
print hdp.show_topic(0)
print hdp.show_topic(1)

[(u'bungalow', 0.0044005352246404604), (u'etching', 0.0041877364635602076), (u'boxer', 0.0038920237975361757), (u'purse', 0.0033773252087090132), (u'cozy', 0.0033431603392006255), (u'departure', 0.0032750789529845206), (u'badge', 0.0030124071616789675), (u'batch', 0.0029556911073749912), (u'skyscraper', 0.0029470456268494045), (u'bill', 0.0029436477515863216), (u'water', 0.0029284407475684443), (u'closeup', 0.0029013830459323087), (u'merry', 0.0028318259530757169), (u'Venetian', 0.0028017365707845559), (u'tie', 0.0027531114972588035), (u'thirst', 0.002732921375695657), (u'royalty', 0.0027054147215845991), (u'wildlife', 0.0026413111029540441), (u'rice', 0.0026245225450399917), (u'rainforest', 0.0026090503407302295)]
[(u'pot', 0.0037253360575587964), (u'purse', 0.003638235498818948), (u'status badge', 0.0034510398788701714), (u'joy', 0.0033468015820072119), (u'prayer', 0.0031570528233270242), (u'fashion', 0.0031420561307019847), (u'document', 0.0030059262265220237), (u'gradient', 0.00299

In [35]:
hdp_predicted = []
for hdp_vec in hdp_corpus:
    values = np.array([x for i, x in hdp_vec])
    hdp_predicted.append(np.argmax(values))

# EVALUATION

In [37]:
experiments = ['LSI', 'LDA', 'HDP']
clusters = [lsi_predicted, lda_predicted, hdp_predicted]

rows = []
for i, e in enumerate(experiments):
    data = [
        e, 
        round(metrics.adjusted_mutual_info_score(C.categories, clusters[i]), 3),
        round(metrics.adjusted_rand_score(C.categories, clusters[i]), 3),
        round(metrics.homogeneity_score(C.categories, clusters[i]), 3)
    ]
    row = "<tr>" + "".join(["<td>{}</td>".format(x) for x in data]) + "</tr>"
    rows.append(row)
table = "<table>{}</table>".format("".join(rows))
display(HTML(table))

0,1,2,3
LSI,0.217,0.19,0.218
LDA,0.155,0.118,0.158
HDP,0.0,0.004,0.001
