# Topic modeling with LSI and LDA
Using `Gensim` on real datasets

In [1]:
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

In [2]:
import urllib.request
import json
import spacy
nlp = spacy.load("en_core_web_sm")
from wikisearch.retrieval import WikiDataset

## Data preprocessing
In this example, we work on the wikidataset at the **sentence level**. We work on `NOUN` and `PROPN` only and we skip empty sentences after tokenization. The ground truth is given by wikientities (multi-class labeling).

### Exercise
Wikipages contain multiple entities (links to wikidata entities). Try to use such information to build a ground truth for multi-label classification.

In [3]:
from string import punctuation, digits

In [4]:
tokenize = lambda text: [x.lemma_.lower() for x in text if x.pos_ in ['NOUN', 'PROPN']]

In [5]:
url = 'http://island.ricerca.di.unimi.it/~alfio/shared/inforet/wikipeople.json'
data = WikiDataset(url)

In [6]:
entities = list(set(data.entities))

In [7]:
corpus, assignment = [], []
for i, doc in tqdm(list(enumerate(data.documents))):
    clean_doc = "".join([x for x in doc if x not in punctuation and x not in digits])
    for sentence in nlp(clean_doc).sents:
        tokens = tokenize(sentence)
        if len(tokens) > 0:
            corpus.append(tokens)
            assignment.append(data.entities[i])

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1139.0), HTML(value='')))




In [8]:
example = 2345
ex_entity = assignment[example]
print(corpus[example], ex_entity, data.entity_label[ex_entity])

['day', 'may', 'abraham', 'lincoln', 'field', 'son', 'forest', 'ground'] Q2821841 Abraham Lincoln (grandfather of president Abraham Lincoln)


## Prepare data for Gensim

In [15]:
from gensim.corpora import Dictionary

In [16]:
dictionary = Dictionary(corpus)
C = [dictionary.doc2bow(doc) for doc in corpus]

In [20]:
reverse = lambda c, d: [(d[x], y) for x, y in c]

In [21]:
reverse(C[234], dictionary)

[('aristotle', 3),
 ('philosopher', 2),
 ('philosophy', 1),
 ('tradition', 1),
 ('scholar', 2),
 ('teacher', 2),
 ('medieval', 1),
 ('averroes', 1),
 ('dante', 1),
 ('exemplar', 1),
 ('muslim', 1),
 ('poem', 1),
 ('title', 1)]

## Latent Semantic Indexing

In [11]:
from gensim.models import LsiModel

In [25]:
model = LsiModel(C, id2word=dictionary, num_topics=len(entities))

In [26]:
vectors = model[C]

### Topic vectors

In [30]:
topics = model.get_topics()
topics.shape

(33, 8354)

In [36]:
for topicno in range(len(entities)):
    print('Topic {}'.format(topicno))
    print([(x, round(y, 2)) for x, y in model.show_topic(topicno, topn=10)], '\n')

Topic 0
[('jesus', 0.91), ('john', 0.14), ('god', 0.1), ('film', 0.09), ('gospel', 0.08), ('matthew', 0.08), ('luke', 0.08), ('disciple', 0.07), ('scholar', 0.07), ('mark', 0.07)] 

Topic 1
[('napoleon', 0.55), ('lincoln', 0.4), ('churchill', 0.3), ('war', 0.29), ('jesus', -0.18), ('army', 0.15), ('year', 0.12), ('france', 0.11), ('battle', 0.11), ('abraham', 0.1)] 

Topic 2
[('napoleon', 0.65), ('lincoln', -0.64), ('abraham', -0.16), ('churchill', -0.14), ('war', -0.09), ('lincolns', -0.08), ('france', 0.08), ('army', 0.07), ('battle', 0.06), ('john', -0.06)] 

Topic 3
[('churchill', -0.77), ('lincoln', 0.43), ('napoleon', 0.26), ('war', -0.16), ('abraham', 0.12), ('minister', -0.1), ('muhammad', -0.08), ('government', -0.08), ('prime', -0.06), ('year', -0.06)] 

Topic 4
[('muhammad', 0.82), ('churchill', -0.19), ('year', 0.16), ('mecca', 0.15), ('medina', 0.13), ('god', 0.13), ('muslims', 0.12), ('napoleon', -0.11), ('ibn', 0.11), ('jesus', -0.1)] 

Topic 5
[('caesar', -0.77), ('film

### Document vectors

In [38]:
for v in vectors[0]:
    print(v)

(0, 0.06146628616438651)
(1, 0.07756338994373224)
(2, -0.012168671663828566)
(3, -0.02733045450509954)
(4, 0.08362935097411132)
(5, -0.3006005441175099)
(6, 0.0066352702618239405)
(7, 0.12243976291150545)
(8, 0.9393588507114129)
(9, 0.4266297113070801)
(10, -0.04494656379003664)
(11, 0.13074951500076426)
(12, -0.18371933733791404)
(13, 0.06206086714540601)
(14, -0.1135035987605824)
(15, 0.07591684031177959)
(16, -0.09144335736916245)
(17, -0.2070544273795482)
(18, 0.05891951156978529)
(19, 0.09071300319567417)
(20, -0.14895229498404466)
(21, 0.07361159522794633)
(22, -0.10510853573562373)
(23, 0.056676445271941954)
(24, -0.030317383929361206)
(25, -0.16939201326209224)
(26, 0.09593532928812909)
(27, 0.05965468423235859)
(28, -0.10388720681338377)
(29, -0.16863701329086345)
(30, -0.04763563079047266)
(31, 0.03360323456898951)
(32, -0.025998864035574783)


### Map a new document on the model

In [39]:
q = ['abraham', 'lincoln', 'president']
v = model[dictionary.doc2bow(q)]

## LDA

In [77]:
from gensim.models import LdaModel

In [78]:
lda = LdaModel(C, id2word=dictionary, num_topics=len(entities))

In [79]:
lda_vectors = lda[C]

In [80]:
lda_vectors[2]

[(13, 0.1737541), (14, 0.34003782), (17, 0.4482893)]

In [81]:
lda_topics = lda.get_topics()

In [82]:
lda_topics.shape

(33, 8354)

In [83]:
for topicno in range(len(entities)):
    print('Topic {}'.format(topicno))
    print([(x, round(y, 2)) for x, y in lda.show_topic(topicno, topn=10)], '\n')

Topic 0
[('abraham', 0.07), ('army', 0.02), ('system', 0.02), ('general', 0.02), ('churchill', 0.01), ('ceremony', 0.01), ('unit', 0.01), ('marshal', 0.01), ('saint', 0.01), ('mile', 0.01)] 

Topic 1
[('war', 0.03), ('family', 0.02), ('man', 0.02), ('france', 0.02), ('size', 0.02), ('charles', 0.02), ('bank', 0.02), ('project', 0.02), ('union', 0.01), ('april', 0.01)] 

Topic 2
[('churchill', 0.02), ('day', 0.02), ('napoleon', 0.02), ('man', 0.01), ('war', 0.01), ('port', 0.01), ('hero', 0.01), ('july', 0.01), ('minute', 0.01), ('house', 0.01)] 

Topic 3
[('russians', 0.02), ('soldier', 0.02), ('court', 0.02), ('english', 0.02), ('us', 0.02), ('time', 0.01), ('abraham', 0.01), ('people', 0.01), ('plot', 0.01), ('century', 0.01)] 

Topic 4
[('song', 0.02), ('london', 0.01), ('churchill', 0.01), ('good', 0.01), ('york', 0.01), ('star', 0.01), ('complexion', 0.01), ('attention', 0.01), ('independence', 0.01), ('health', 0.01)] 

Topic 5
[('french', 0.06), ('austria', 0.02), ('year', 0.02)

### Words to topics

In [84]:
tid = dictionary.token2id['jesus']
lda.get_term_topics(tid)

[(6, 0.033375844),
 (7, 0.023949811),
 (9, 0.014820911),
 (10, 0.011981268),
 (12, 0.013095714),
 (13, 0.010792364),
 (15, 0.024703098),
 (16, 0.015780577),
 (17, 0.03333264),
 (19, 0.023893114),
 (30, 0.0157442),
 (31, 0.024752295)]

### Experiment: force words in their top topic only

In [85]:
from collections import defaultdict

In [95]:
np.round(lda_topics[:,8], 4)

array([0.    , 0.    , 0.    , 0.0007, 0.0006, 0.    , 0.    , 0.    ,
       0.    , 0.    , 0.    , 0.0006, 0.    , 0.    , 0.0006, 0.0016,
       0.    , 0.    , 0.    , 0.0047, 0.0023, 0.    , 0.0005, 0.    ,
       0.0006, 0.    , 0.0011, 0.    , 0.    , 0.0007, 0.0006, 0.0006,
       0.0004], dtype=float32)

In [94]:
dictionary.token2id['philosopher']

8

In [87]:
lda.get_term_topics(8)

[]

In [96]:
topic2words = defaultdict(lambda: [])
words2topic = {}
for word, wid in dictionary.token2id.items():
    best_t = np.argmax(lda_topics[:,wid])
    best_p = lda_topics[best_t,wid]
    topic2words[best_t].append((word, best_p))
    words2topic[word] = (best_t, best_p)

In [98]:
for topic, words in topic2words.items():
    print(topic, [x for x, _ in sorted(words, key=lambda k: -k[1])[:10]])

19 ['aristotle', 'son', 'motion', 'foot', 'event', 'rate', 'critic', 'parent', 'j', 'argument']
24 ['confederate', 'church', 'marriage', 'book', 'address', 'month', 'david', 'copy', 'friend', 'paper']
11 ['slavery', 'union', 'treaty', 'follower', 'compromise', 'van', 'hope', 'issue', 'poland', 'freedom']
7 ['brother', 'death', 'james', 'new', 'home', 'mother', 'child', 'testament', 'building', 'battlefield']
18 ['louis', 'congress', 'struggle', 'henry', 'nation', 'elizabeth', 'october', 'public', 'england', 'minister']
13 ['version', 'name', 'president', 'leipzig', 'bible', 'republic', 'national', 'november', 'library', 'america']
30 ['fort', 'american', 'de', 'house', 'portugal', 'portrait', 'way', 'number', 'member', 'military']
17 ['jesus', 'life', 'birth', 'white', 'news', 'science', 'subject', 'angel', 'father', 'casualty']
1 ['war', 'family', 'man', 'size', 'charles', 'bank', 'project', 'april', 'conflict', 'program']
14 ['proclamation', 'wife', 'course', 'medina', 'community', '

In [101]:
print(words2topic['french'], words2topic['france'])

(5, 0.060463376) (12, 0.04582151)


# Exercize
1. Exploit lsi and lda models for query purposes
2. Exploit lsi and lda models for clustering
3. Compare the results against TfIdf vectorization