# Short tutorial on gensim for topic modeling

In [1]:
import nbimporter
from benchmark import ImageTags
import numpy as np
from IPython.core.display import display, HTML, Image
import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid1 import make_axes_locatable
import itertools
from sklearn import metrics
from gensim import corpora, models
from classifiers import Classifier

importing Jupyter notebook from benchmark.ipynb
importing Jupyter notebook from classifiers.ipynb


In [2]:
def cm_plot(ax, classes, CM, title, figure):
    im = ax.imshow(CM, interpolation='nearest', cmap=plt.cm.Blues)
    divider = make_axes_locatable(ax)
    cax = divider.append_axes('right', size='5%', pad=0.05)
    figure.colorbar(im, cax=cax, orientation='vertical')
    tick_marks = np.arange(len(classes))
    ax.set_xticks(tick_marks)
    ax.set_xticklabels(classes, rotation=90, fontsize=12)
    ax.set_yticks(tick_marks)
    ax.set_yticklabels(classes, rotation=0, fontsize=12)
    ax.set_title(title, fontsize=16)
    thresh = CM.max() / 2.
    for i, j in itertools.product(range(CM.shape[0]), range(CM.shape[1])):
        ax.text(j, i, CM[i, j], horizontalalignment="center",
                 color="white" if CM[i, j] > thresh else "black", fontsize=12)
    ax.set_ylabel('True label', fontsize=16)
    ax.set_xlabel('Predicted label', fontsize=16)

In [3]:
I = ImageTags('inforet', 'googleimages', url='image_thumbnail_url', 
              selection=None)
C = Classifier(I, 'category')

## 1. Create dictionary

In [4]:
dictionary = corpora.Dictionary(I.tag_stream())

In [5]:
print dictionary
print dictionary.token2id.items()[:10]

Dictionary(1694 unique tokens: [u'mackerel', u'dynamic', u'dynasty', u'four', u'sleep']...)
[(u'mackerel', 1488), (u'dynamic', 1227), (u'dynasty', 576), (u'four', 986), (u'sleep', 201), (u'hanging', 1311), (u'captain', 1314), (u'aggression', 123), (u'cellular telephone', 1062), (u'sports fan', 532)]


### Gensim bag of words

In [6]:
doc = [I.tags[j] for j, x in enumerate(I.M[0]) if x > 0]
print doc
print dictionary.doc2bow(doc)

[u'balcony', u'step', u'tourist', u'town', u'window', u'house', u'sight', u'scene', u'urban', u'street', u'building', u'travel', u'outdoors', u'no person', u'tourism', u'architecture', u'sky', u'city', u'vacation', u'modern']
[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1)]


## 2. Create a corpus

In [7]:
class IMG(object):
    
    def __init__(self, imagetags, dictionary):
        self.I, self.D = imagetags, dictionary
    
    def __iter__(self):
        for doc in self.I.tag_stream():
            yield self.D.doc2bow(doc)
    
    def __len__(self):
        return self.I.M.shape[0]

In [8]:
corpus = IMG(I, dictionary)

## 3. Apply transformations

In [10]:
tfidf = models.TfidfModel(corpus) # Model trained by corpus

In [11]:
corpus_tfidf = tfidf[corpus] # New corpus transformed

## 4. Create models

### LSI

In [18]:
lsi = models.LsiModel(corpus_tfidf, id2word=corpus.D, num_topics=4)
lsi_corpus = lsi[corpus_tfidf]

Term topic matrix with shape (num_topics, vocabulary_size)

In [20]:
lsi.get_topics()[1]

array([  1.75126356e-01,   1.25651644e-03,   1.73422478e-01, ...,
        -1.27798017e-04,  -6.04423152e-05,  -3.34730780e-04])

In [21]:
print lsi.show_topic(0)
print lsi.show_topic(1)

[(u'architecture', -0.16993642655264315), (u'city', -0.16763504784756825), (u'building', -0.16475685422558198), (u'sky', -0.16325962862914245), (u'travel', -0.15877929023973955), (u'outdoors', -0.15737438274741994), (u'cityscape', -0.13729080175629554), (u'water', -0.13681424865454558), (u'urban', -0.13579093700875691), (u'tourism', -0.13034862699789343)]
[(u'architecture', 0.17512635562620024), (u'ball', -0.17361976931925477), (u'building', 0.17342247786649248), (u'athlete', -0.16899756745575856), (u'competition', -0.16650915762269461), (u'city', 0.16166427113283136), (u'game', -0.16055728563471894), (u'sky', 0.15937670108006333), (u'sports equipment', -0.15293968610044234), (u'cityscape', 0.1525133463040993)]


Document to topics

In [29]:
lsi_predicted = []
for lsi_vec in lsi_corpus:
    values = np.array([np.abs(x) for i, x in lsi_vec])
    lsi_predicted.append(np.argmax(values))

### LDA

In [31]:
lda = models.LdaModel(corpus_tfidf, id2word=corpus.D, num_topics=4)
lda_corpus = lda[corpus_tfidf]

In [32]:
print lda.show_topic(0)
print lda.show_topic(1)

[(u'desktop', 0.0088255135), (u'leather', 0.0086313877), (u'vector', 0.0086098295), (u'illustration', 0.0084475456), (u'sport', 0.0082818214), (u'exercise', 0.008073207), (u'isolated', 0.0079897512), (u'image', 0.007886583), (u'symbol', 0.0078473194), (u'recreation', 0.0078245196)]
[(u'woman', 0.0087118978), (u'group', 0.0070632999), (u'crowd', 0.0065643089), (u'many', 0.0065352451), (u'people', 0.0063774819), (u'portrait', 0.0062129386), (u'adult', 0.0061985068), (u'girl', 0.0061666495), (u'police', 0.0058442648), (u'wear', 0.005840438)]


In [33]:
lda_predicted = []
for lda_vec in lda_corpus:
    values = np.array([x for i, x in lda_vec])
    lda_predicted.append(np.argmax(values))

### HDP

In [35]:
hdp = models.HdpModel(corpus_tfidf, id2word=corpus.D)
hdp_corpus = hdp[corpus_tfidf]

In [36]:
print hdp.show_topic(0)
print hdp.show_topic(1)

[(u'road', 0.0045546237840529413), (u'electricity', 0.0039035546718500843), (u'tee', 0.0038708281713159668), (u'sparkling', 0.0035571509700796423), (u'aid', 0.0035143649771242767), (u'population', 0.003468160154639108), (u'bubble', 0.0034430965356653676), (u'splash', 0.0033961373739557314), (u'town', 0.0033733546155359745), (u'steps', 0.0032430862491531738), (u'marina', 0.0032105925212552932), (u'window', 0.0030610754268700279), (u'cap', 0.0030609016328958273), (u'drive', 0.0029993192467058492), (u'skyscraper', 0.0029848762293942575), (u'tennis player', 0.0029597627088877648), (u'site', 0.002915973467748465), (u'health', 0.0028844433178560093), (u'clean', 0.0028162584787746951), (u'barricade', 0.002814648132703444)]
[(u'disjunct', 0.004665056703012131), (u'net', 0.003455718469394126), (u'salmon', 0.0033995429374478951), (u'moment', 0.0033651450238194438), (u'sushi', 0.0033335410927258849), (u'mare', 0.0032570436231149414), (u'figurine', 0.0032355060654377832), (u'ban', 0.00322323249227

In [37]:
hdp_predicted = []
for hdp_vec in hdp_corpus:
    values = np.array([x for i, x in hdp_vec])
    hdp_predicted.append(np.argmax(values))

In [38]:
print hdp_predicted[:10]

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


# EVALUATION

In [39]:
experiments = ['LSI', 'LDA', 'HDP']
clusters = [lsi_predicted, lda_predicted, hdp_predicted]

rows = []
for i, e in enumerate(experiments):
    data = [
        e, 
        round(metrics.adjusted_mutual_info_score(C.categories, clusters[i]), 3),
        round(metrics.adjusted_rand_score(C.categories, clusters[i]), 3),
        round(metrics.homogeneity_score(C.categories, clusters[i]), 3)
    ]
    row = "<tr>" + "".join(["<td>{}</td>".format(x) for x in data]) + "</tr>"
    rows.append(row)
table = "<table>{}</table>".format("".join(rows))
display(HTML(table))

0,1,2,3
LSI,0.216,0.189,0.218
LDA,0.142,0.119,0.143
HDP,0.0,0.004,0.001
