# Topic modeling with LSI and LDA
Using `Gensim` on real datasets

In [1]:
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

In [2]:
import urllib.request
import json
import spacy
nlp = spacy.load("en_core_web_lg")
from wikisearch.retrieval import WikiDataset

## Data preprocessing
In this example, we work on the wikidataset at the **sentence level**. We work on `NOUN` and `PROPN` only and we skip empty sentences after tokenization. The ground truth is given by wikientities (multi-class labeling).

### Exercise
Wikipages contain multiple entities (links to wikidata entities). Try to use such information to build a ground truth for multi-label classification.

In [6]:
from string import punctuation, digits

In [4]:
tokenize = lambda text: [x.lemma_.lower() for x in text if x.pos_ in ['NOUN', 'PROPN', 'VERB', 'ADJ', 'ADV']]

In [7]:
url = 'http://island.ricerca.di.unimi.it/~alfio/shared/inforet/wikipeople.json'
data = WikiDataset(url)

In [8]:
entities = list(set(data.entities))

In [9]:
entities[:3]

['Q332528', 'Q587557', 'Q715814']

In [10]:
corpus, assignment = [], []
for i, doc in tqdm(list(enumerate(data.documents))):
    clean_doc = "".join([x for x in doc if x not in punctuation and x not in digits])
    for sentence in nlp(clean_doc).sents:
        tokens = tokenize(sentence)
        if len(tokens) > 0:
            corpus.append(tokens)
            assignment.append(data.entities[i])

  0%|          | 0/1139 [00:00<?, ?it/s]

In [11]:
corpus, assignment = [], []
for i, doc in tqdm(list(enumerate(data.documents))):
    clean_doc = "".join([x for x in doc if x not in punctuation and x not in digits])
    tokens = tokenize(nlp(clean_doc))
    if len(tokens) > 0:
        corpus.append(tokens)
        assignment.append(data.entities[i])

  0%|          | 0/1139 [00:00<?, ?it/s]

In [12]:
example = 345
ex_entity = assignment[example]
print(corpus[example], ex_entity, data.entity_label[ex_entity])

['city', 'chemnitz', 'consist', 'neighborhood', 'neighborhood', 'einsiedel', 'euba', 'grüna', 'klaffenbach', 'kleinolbersdorfaltenhain', 'mittelbach', 'röhrsdorf', 'wittgensdorf', 'be', 'same', 'time', 'locality', 'meaning', 'section', 'saxon', 'municipal', 'code', 'neighborhood', 'come', 'wake', 'last', 'incorporation', 'wave', 'formerly', 'independent', 'municipality', 'city', 'chemnitz', 'therefore', 'enjoy', 'special', 'position', 'compare', 'other', 'part', 'city', 'locality', 'have', 'local', 'council', 'depend', 'number', 'inhabitant', 'locality', 'concern', 'comprise', 'member', 'as', 'well', 'chairman', 'same', 'local', 'council', 'be', 'hear', 'important', 'matter', 'concern', 'locality', 'final', 'decision', 'be', 'however', 'incumbent', 'city', 'council', 'city', 'chemnitz', 'official', 'identification', 'district', 'number', 'base', 'following', 'principle', 'start', 'city', 'center', 'neighborhood', 'zentrum', 'schloßchemnitz', 'other', 'part', 'city', 'assign', 'clockwis

## Prepare data for Gensim

In [13]:
from gensim.corpora import Dictionary
from collections import defaultdict

In [14]:
I = defaultdict(lambda: 0)
for doc in corpus:
    for word in doc:
        I[word] += 1

In [15]:
fcorpus = []
for doc in corpus:
    newdoc = [x for x in doc if I[x] > 10]
    fcorpus.append(newdoc)
corpus = fcorpus

In [16]:
dictionary = Dictionary(corpus)

In [17]:
dictionary.doc2bow(['philosopher', 'philosopher', 'ancient'])

[(1, 1), (28, 2)]

In [18]:
C = [dictionary.doc2bow(doc) for doc in corpus]

In [19]:
reverse = lambda c, d: [(d[x], y) for x, y in c]

In [20]:
reverse(C[234], dictionary)

[('be', 1),
 ('tradition', 1),
 ('when', 1),
 ('christian', 1),
 ('gain', 1),
 ('name', 1),
 ('accompany', 2),
 ('experience', 1),
 ('say', 1),
 ('state', 1),
 ('trade', 1),
 ('islamic', 1),
 ('career', 1),
 ('meet', 1),
 ('god', 1),
 ('prophet', 1),
 ('journey', 1),
 ('muhammad', 2),
 ('uncle', 1),
 ('muhammads', 1),
 ('meccans', 1)]

## Latent Semantic Indexing

In [21]:
from gensim.models import LsiModel

In [22]:
model = LsiModel(C, id2word=dictionary, num_topics=10)

In [23]:
vectors = model[C]

In [24]:
type(vectors)

gensim.interfaces.TransformedCorpus

In [26]:
vectors[0]

[(0, 1.488921782540863),
 (1, -0.47945759098946195),
 (2, -0.7082532105642725),
 (3, -0.2761320117255319),
 (4, -0.4585511912655323),
 (5, -0.4690218443442252),
 (6, 0.2122506047414563),
 (7, -0.7317228312495568),
 (8, 1.4331343817028788),
 (9, 2.0199043964081564)]

### Topic vectors

In [27]:
topics = model.get_topics()
topics.shape

(10, 1241)

In [28]:
for topicno in range(10):
    print('Topic {}'.format(topicno))
    print([(x, round(y, 2)) for x, y in model.show_topic(topicno, topn=10)], '\n')

Topic 0
[('be', 0.59), ('jesus', 0.36), ('lincoln', 0.14), ('napoleon', 0.14), ('have', 0.13), ('churchill', 0.13), ('muhammad', 0.12), ('when', 0.11), ('war', 0.1), ('other', 0.1)] 

Topic 1
[('jesus', 0.82), ('be', -0.24), ('napoleon', -0.16), ('lincoln', -0.14), ('churchill', -0.13), ('john', 0.12), ('war', -0.1), ('matthew', 0.1), ('luke', 0.09), ('god', 0.09)] 

Topic 2
[('napoleon', 0.68), ('be', -0.42), ('french', 0.28), ('army', 0.17), ('jesus', 0.12), ('muhammad', -0.12), ('france', 0.1), ('war', 0.1), ('battle', 0.09), ('aristotle', -0.09)] 

Topic 3
[('lincoln', 0.71), ('muhammad', -0.28), ('napoleon', -0.26), ('abraham', 0.19), ('be', -0.17), ('lincolns', 0.14), ('war', 0.12), ('carrier', 0.1), ('state', 0.09), ('support', 0.09)] 

Topic 4
[('muhammad', 0.69), ('churchill', -0.27), ('lincoln', 0.27), ('be', -0.27), ('muslims', 0.14), ('ibn', 0.13), ('medina', 0.12), ('mecca', 0.12), ('god', 0.11), ('muslim', 0.11)] 

Topic 5
[('churchill', 0.69), ('be', -0.25), ('napoleon',

### Document vectors

In [29]:
for v in vectors[10]:
    print(v)

(0, 2.104251011335291)
(1, -0.7033590056898559)
(2, -1.0850356008985789)
(3, -0.5002738948153902)
(4, -0.6327588829925445)
(5, -0.6204359599119935)
(6, -0.15175312891704792)
(7, -0.5422335503967235)
(8, 0.999514744648123)
(9, 1.6994713814362343)


### Map a new document on the model

In [30]:
q = ['abraham', 'lincoln', 'president']
qbow = dictionary.doc2bow(q)

In [31]:
qbow

[(777, 1), (1087, 1), (1199, 1)]

In [32]:
v = model[qbow]

In [33]:
np.array([y for x, y in v])

array([ 0.19492349, -0.18403036,  0.05077765,  0.9628104 ,  0.3679299 ,
       -0.27037442, -0.11135121,  0.06071447, -0.0517115 , -0.04167743])

## LDA

In [None]:
from gensim.models import LdaModel

In [None]:
lda = LdaModel(C, id2word=dictionary, num_topics=10)

In [None]:
lda_vectors = lda[C]

In [None]:
lda_vectors[0]

In [None]:
lda_topics = lda.get_topics()

In [None]:
lda_topics.shape

In [None]:
for topicno in range(6):
    print('Topic {}'.format(topicno))
    print([(x, round(y, 2)) for x, y in lda.show_topic(topicno, topn=10)], '\n')

### Words to topics

In [None]:
tid = dictionary.token2id['jesus']
lda.get_term_topics(tid)

### Experiment: force words in their top topic only

In [None]:
from collections import defaultdict

In [None]:
np.round(lda_topics[:,8], 4)

In [None]:
dictionary.token2id['philosopher']

In [None]:
lda.get_term_topics(8)

In [None]:
topic2words = defaultdict(lambda: [])
words2topic = {}
for word, wid in dictionary.token2id.items():
    best_t = np.argmax(lda_topics[:,wid])
    best_p = lda_topics[best_t,wid]
    topic2words[best_t].append((word, best_p))
    words2topic[word] = (best_t, best_p)

In [None]:
for topic, words in topic2words.items():
    print(topic, [x for x, _ in sorted(words, key=lambda k: -k[1])[:10]])

In [None]:
print(words2topic['french'], words2topic['france'])

# Exercize
1. Exploit lsi and lda models for query purposes
2. Exploit lsi and lda models for clustering
3. Compare the results against TfIdf vectorization