# Topic modeling with LSI and LDA
Using `Gensim` on real datasets

In [1]:
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

In [2]:
import urllib.request
import json
import spacy
# Load the spaCy pipeline named "en_core_web_sm"; `nlp` is used below to parse
# the documents before tokenization.
nlp = spacy.load("en_core_web_sm")
from wikisearch.retrieval import WikiDataset

## Data preprocessing
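In this example, we work on the wiki dataset, either at the **sentence level** (first cell below) or at the document level (second cell below, which rebuilds the corpus used in the rest of the notebook). We keep only the lower-cased lemmas of `NOUN`, `PROPN`, `VERB`, `ADJ` and `ADV` tokens, and we skip sentences (or documents) that are empty after tokenization. The ground truth is given by the wiki entities (multi-class labeling).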

### Exercise
Wikipages contain multiple entities (links to wikidata entities). Try to use such information to build a ground truth for multi-label classification.

In [3]:
from string import punctuation, digits

In [4]:
def tokenize(text, keep_pos=frozenset({'NOUN', 'PROPN', 'VERB', 'ADJ', 'ADV'})):
    """Return the lower-cased lemmas of the tokens in ``text`` whose POS is kept.

    Args:
        text: Iterable of token-like objects exposing ``pos_`` and ``lemma_``
            (for example a spaCy ``Doc`` or ``Span``).
        keep_pos: Part-of-speech tags to keep. Defaults to nouns, proper
            nouns, verbs, adjectives and adverbs.

    Returns:
        list[str]: One lower-cased lemma per kept token, in input order.
    """
    return [token.lemma_.lower() for token in text if token.pos_ in keep_pos]

In [5]:
# Source of the wikipeople JSON file and the dataset object built from it.
# NOTE(review): the URL is plain http on a remote host; presumably WikiDataset
# downloads it when constructed — confirm, and consider caching a local copy.
url = 'http://island.ricerca.di.unimi.it/~alfio/shared/inforet/wikipeople.json'
data = WikiDataset(url)

In [6]:
# Distinct entities in first-seen order. dict.fromkeys de-duplicates like
# set() does, but iterating a set gives an arbitrary order, whereas this keeps
# the positions in `entities` identical on every run.
entities = list(dict.fromkeys(data.entities))

In [None]:
# Sentence-level corpus: one token list per non-empty sentence, labelled with
# the entity of the page the sentence comes from.
#
# The raw text is parsed first and punctuation/digits are stripped from the
# lemmas afterwards: deleting them from the text up front would remove the
# sentence-ending marks before `.sents` ever sees them.
# Assumes every document is a plain `str` — TODO confirm against WikiDataset.
# NOTE: the document-level cell that follows rebinds `corpus` and `assignment`.
strip_table = str.maketrans('', '', punctuation + digits)
corpus, assignment = [], []
for i, doc in tqdm(list(enumerate(data.documents))):
    for sentence in nlp(doc).sents:
        lemmas = (lemma.translate(strip_table) for lemma in tokenize(sentence))
        # Drop lemmas that were made only of punctuation/digits.
        tokens = [lemma for lemma in lemmas if lemma]
        if tokens:
            corpus.append(tokens)
            assignment.append(data.entities[i])

In [8]:
# Document-level corpus: one token list per page, labelled with that page's
# entity. Running this cell rebinds `corpus` and `assignment` from scratch.
# Assumes every document is a plain `str` — TODO confirm against WikiDataset.
strip_chars = str.maketrans('', '', punctuation + digits)
corpus, assignment = [], []
for i, doc in tqdm(list(enumerate(data.documents))):
    clean_doc = doc.translate(strip_chars)  # delete punctuation and digit characters
    tokens = tokenize(nlp(clean_doc))
    if not tokens:
        continue  # nothing survived the POS filter
    corpus.append(tokens)
    assignment.append(data.entities[i])

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1139.0), HTML(value='')))




In [9]:
# Inspect one arbitrarily chosen corpus entry: its tokens, the entity assigned
# to it, and that entity's entry in data.entity_label.
example = 345
ex_entity = assignment[example]
print(corpus[example], ex_entity, data.entity_label[ex_entity])

['city', 'chemnitz', 'consist', 'neighborhood', 'neighborhood', 'einsiedel', 'euba', 'grüna', 'klaffenbach', 'kleinolbersdorfaltenhain', 'mittelbach', 'röhrsdorf', 'wittgensdorf', 'same', 'time', 'locality', 'meaning', 'sections', 'saxon', 'municipal', 'code', 'neighborhood', 'come', 'wake', 'last', 'incorporation', 'wave', 'formerly', 'independent', 'municipality', 'city', 'chemnitz', 'therefore', 'enjoy', 'special', 'position', 'compare', 'other', 'part', 'city', 'locality', 'local', 'council', 'depend', 'number', 'inhabitant', 'locality', 'concern', 'comprise', 'member', 'as', 'well', 'chairman', 'same', 'local', 'council', 'hear', 'important', 'matter', 'concern', 'locality', 'final', 'decision', 'however', 'incumbent', 'city', 'council', 'city', 'chemnitz', 'official', 'identification', 'district', 'number', 'base', 'follow', 'principle', 'start', 'city', 'center', 'neighborhood', 'zentrum', 'schloßchemnitz', 'other', 'part', 'city', 'assign', 'clockwise', 'ascend', 'order', 'tent

## Prepare data for Gensim

In [10]:
from gensim.corpora import Dictionary
from collections import defaultdict

In [11]:
from collections import Counter

# Corpus-wide frequency of every token (used below to drop rare words).
# Counter does the counting in one pass and, like the defaultdict it replaces,
# yields 0 when indexed with an unseen token.
I = Counter(word for doc in corpus for word in doc)

In [12]:
# Keep only the tokens that occur more than 10 times in the whole corpus,
# then make the filtered version the working corpus.
fcorpus = [[token for token in doc if I[token] > 10] for doc in corpus]
corpus = fcorpus

In [14]:
# Build the gensim Dictionary (token <-> integer id mapping) from the corpus.
dictionary = Dictionary(corpus)

In [15]:
# doc2bow turns a token list into (token_id, count) pairs; a repeated token
# is reported once with its count.
dictionary.doc2bow(['philosopher', 'philosopher', 'ancient'])

[(1, 1), (27, 2)]

In [16]:
# Bag-of-words representation of every entry in the corpus.
C = [dictionary.doc2bow(doc) for doc in corpus]

In [18]:
def reverse(c, d):
    """Replace the ids in a list of ``(id, value)`` pairs with ``d[id]``.

    Args:
        c: Iterable of ``(id, value)`` pairs, e.g. a bag-of-words document.
        d: Object indexable by id (``d[id]``), e.g. a gensim ``Dictionary``.

    Returns:
        list[tuple]: ``(d[id], value)`` pairs, in the same order as ``c``.
    """
    return [(d[token_id], value) for token_id, value in c]

In [19]:
# Show entry 234 as (token, count) pairs instead of (token_id, count).
reverse(C[234], dictionary)

[('tradition', 1),
 ('when', 1),
 ('christian', 1),
 ('gain', 1),
 ('name', 1),
 ('accompany', 2),
 ('experience', 1),
 ('say', 1),
 ('state', 1),
 ('trade', 1),
 ('islamic', 1),
 ('career', 1),
 ('meet', 1),
 ('god', 1),
 ('prophet', 1),
 ('journey', 1),
 ('muhammad', 2),
 ('uncle', 1),
 ('muhammads', 1),
 ('meccans', 1)]

## Latent Semantic Indexing

In [20]:
from gensim.models import LsiModel

In [26]:
# Fit an LSI model with 10 topics on the bag-of-words corpus.
model = LsiModel(C, id2word=dictionary, num_topics=10)

In [27]:
# Apply the LSI model to the whole bag-of-words corpus.
vectors = model[C]

In [28]:
# Check what kind of object indexing the model with a corpus returns.
type(vectors)

gensim.interfaces.TransformedCorpus

### Topic vectors

In [29]:
# Topic matrix of the LSI model: one row per topic; presumably one column per
# dictionary term (confirm against the dictionary size).
topics = model.get_topics()
topics.shape

(10, 1247)

In [31]:
# Print the 10 highest-ranked terms of every LSI topic with their weights.
# Iterating over model.num_topics (instead of a literal 10) keeps this loop in
# sync with the number of topics the model was created with.
for topicno in range(model.num_topics):
    print(f'Topic {topicno}')
    print([(term, round(weight, 2)) for term, weight in model.show_topic(topicno, topn=10)], '\n')

Topic 0
[('jesus', 0.59), ('napoleon', 0.18), ('lincoln', 0.15), ('churchill', 0.14), ('muhammad', 0.13), ('john', 0.12), ('when', 0.12), ('war', 0.11), ('other', 0.11), ('would', 0.1)] 

Topic 1
[('jesus', -0.65), ('napoleon', 0.31), ('lincoln', 0.22), ('churchill', 0.21), ('war', 0.17), ('french', 0.15), ('army', 0.11), ('would', 0.09), ('force', 0.08), ('john', -0.08)] 

Topic 2
[('napoleon', -0.66), ('lincoln', 0.51), ('french', -0.24), ('churchill', 0.16), ('abraham', 0.13), ('army', -0.11), ('lincolns', 0.1), ('france', -0.09), ('battle', -0.08), ('carrier', 0.07)] 

Topic 3
[('muhammad', -0.64), ('lincoln', 0.38), ('napoleon', 0.25), ('jesus', 0.15), ('muslims', -0.13), ('ibn', -0.12), ('medina', -0.12), ('muhammads', -0.11), ('mecca', -0.11), ('french', 0.11)] 

Topic 4
[('churchill', -0.71), ('lincoln', 0.4), ('muhammad', 0.27), ('napoleon', 0.16), ('british', -0.13), ('war', -0.11), ('abraham', 0.11), ('minister', -0.1), ('government', -0.09), ('lincolns', 0.08)] 

Topic 5
[(

### Document vectors

In [32]:
# Print the (topic_id, weight) pairs of entry 10 in the LSI space.
for v in vectors[10]:
    print(v)

(0, 1.060786828925309)
(1, 0.5725640342744063)
(2, 0.271922196092288)
(3, -0.5747414827050337)
(4, -0.06076002347827541)
(5, 0.9374538855472313)
(6, 1.933458936341755)
(7, 1.720639600577701)
(8, 0.853552899736879)
(9, -0.2604155867322992)


### Map a new document on the model

In [33]:
# A new query, converted to bag-of-words with the same dictionary that was
# used to encode the corpus.
q = ['abraham', 'lincoln', 'president']
qbow = dictionary.doc2bow(q)

In [34]:
# Inspect the (token_id, count) pairs of the query.
qbow

[(781, 1), (1090, 1), (1204, 1)]

In [35]:
# Map the query into the LSI topic space (rebinds `v`, used as a loop
# variable earlier).
v = model[qbow]

In [36]:
# Dense query vector with one slot per topic. `v` is a list of
# (topic_id, weight) pairs, so each weight is written at its topic_id rather
# than at its position in the list; any topic missing from `v` stays at 0.
# NOTE(review): gensim sparse vectors can omit near-zero entries — confirm;
# positional conversion would then shift the remaining weights.
qvec = np.zeros(model.num_topics)
for topic_id, weight in v:
    qvec[topic_id] = weight
qvec

array([ 2.20502382e-01,  3.05937274e-01,  7.00335235e-01,  5.02944194e-01,
        5.30852142e-01, -2.05203769e-01, -9.59407729e-02, -5.31992735e-02,
       -5.93701868e-04, -1.82609826e-01])

## LDA

In [37]:
from gensim.models import LdaModel

In [38]:
# Fit a 10-topic LDA model. Training is randomised, so random_state is fixed
# to make the topics reproducible across kernel restarts.
lda = LdaModel(C, id2word=dictionary, num_topics=10, random_state=42)

In [39]:
# Apply the LDA model to the whole bag-of-words corpus.
lda_vectors = lda[C]

In [40]:
# Topics of the first entry as (topic_id, weight) pairs; the list need not
# contain every topic.
lda_vectors[0]

[(1, 0.4598989), (4, 0.5250003)]

In [41]:
# Topic-term matrix of the LDA model (rows: topics, columns: term ids).
lda_topics = lda.get_topics()

In [42]:
# (number of topics, number of terms)
lda_topics.shape

(10, 1247)

In [43]:
# Print the 10 top terms of the first six LDA topics with their weights.
for topicno in range(6):
    print(f'Topic {topicno}')
    top_terms = lda.show_topic(topicno, topn=10)
    print([(term, round(prob, 2)) for term, prob in top_terms], '\n')

Topic 0
[('lincoln', 0.02), ('aristotle', 0.01), ('first', 0.01), ('bear', 0.01), ('also', 0.01), ('play', 0.01), ('churchill', 0.01), ('support', 0.01), ('state', 0.01), ('give', 0.01)] 

Topic 1
[('muhammad', 0.02), ('napoleon', 0.01), ('lincoln', 0.01), ('year', 0.01), ('french', 0.01), ('film', 0.01), ('medina', 0.01), ('first', 0.01), ('when', 0.01), ('aristotle', 0.01)] 

Topic 2
[('churchill', 0.02), ('war', 0.01), ('also', 0.01), ('film', 0.01), ('make', 0.01), ('year', 0.01), ('muhammad', 0.01), ('become', 0.01), ('first', 0.01), ('new', 0.01)] 

Topic 3
[('jesus', 0.04), ('napoleon', 0.01), ('churchill', 0.01), ('first', 0.01), ('when', 0.01), ('force', 0.01), ('john', 0.01), ('year', 0.01), ('french', 0.01), ('would', 0.01)] 

Topic 4
[('lincoln', 0.02), ('jesus', 0.01), ('napoleon', 0.01), ('other', 0.01), ('use', 0.01), ('include', 0.01), ('army', 0.01), ('new', 0.01), ('more', 0.01), ('gospel', 0.01)] 

Topic 5
[('jesus', 0.02), ('lincoln', 0.02), ('city', 0.01), ('film',

### Words to topics

In [45]:
# Look up the id of 'jesus' and list the topics associated with that term as
# (topic_id, weight) pairs.
tid = dictionary.token2id['jesus']
lda.get_term_topics(tid)

[(3, 0.040068567), (4, 0.013906933), (5, 0.020271765), (6, 0.014605697)]

### Experiment: force words in their top topic only

In [46]:
from collections import defaultdict

In [47]:
# Weight of the term with id 8 in each topic, rounded to 4 decimals.
np.round(lda_topics[:,8], 4)

array([4.0e-04, 4.0e-04, 1.1e-03, 1.0e-04, 9.0e-04, 4.0e-04, 2.0e-04,
       3.0e-04, 8.0e-04, 6.0e-04], dtype=float32)

In [48]:
# Dictionary id of the token 'philosopher'.
dictionary.token2id['philosopher']

27

In [49]:
# Query the term with id 8 through get_term_topics.
# NOTE(review): an empty result presumably means every weight is below the
# method's minimum-probability threshold — confirm.
lda.get_term_topics(8)

[]

In [50]:
# Assign every word to the single topic in which its weight is highest.
#   topic2words: topic id -> [(word, weight), ...]
#   words2topic: word -> (topic id, weight)
best_topic_of = lda_topics.argmax(axis=0)  # best row (topic) for each column (term id)
topic2words = defaultdict(list)
words2topic = {}
for word, wid in dictionary.token2id.items():
    best_t = best_topic_of[wid]
    best_p = lda_topics[best_t, wid]
    words2topic[word] = (best_t, best_p)
    topic2words[best_t].append((word, best_p))

In [51]:
# For each topic, list the 10 words assigned to it with the highest weight.
for topic, words in topic2words.items():
    ranked = sorted(words, key=lambda pair: pair[1], reverse=True)
    print(topic, [word for word, _ in ranked[:10]])

1 ['muhammad', 'year', 'medina', 'take', 'great', 'know', 'slavery', 'bonaparte', 'leave', 'end']
4 ['other', 'include', 'army', 'more', 'gospel', 'later', 'bc', 'will', 'life', 'could']
0 ['lincoln', 'aristotle', 'bear', 'play', 'support', 'only', 'accord', 'lincolns', 'union', 'animal']
9 ['war', 'would', 'french', 'party', 'british', 'day', 'university', 'government', 'france', 'defeat']
5 ['city', 'film', 'god', 'abraham', 'state', 'new', 'family', 'prophet', 'father', 'roman']
2 ['churchill', 'make', 'become', 'general', 'us', 'victory', 'call', 'begin', 'united', 'minister']
7 ['napoleon', 'work', 'battle', 'win', 'brutus', 'caesars', 'history', 'group', 'so', 'rome']
6 ['when', 'john', 'give', 'people', 'many', 'man', 'see', 'most', 'aristotles', 'soul']
8 ['caesar', 'also', 'then', 'first', 'use', 'name', 'time', 'where', 'house', 'early']
3 ['jesus', 'force', 'matthew', 'mark', 'son', 'disciple', 'carrier', 'however', 'such', 'luke']


In [None]:
# Compare the top topic (and its weight) of two related words.
print(words2topic['french'], words2topic['france'])

# Exercise
1. Exploit the LSI and LDA models for query purposes
2. Exploit the LSI and LDA models for clustering
3. Compare the results against TF-IDF vectorization