# Example of the motivations for LSI and LDA
Build on top of wikipedia pages

In [1]:
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

In [2]:
import urllib.request
import json

In [4]:
import spacy
nlp = spacy.load("en_core_web_lg")

## Get the data

In [5]:
from wikisearch.retrieval import WikiDataset

In [6]:
url = 'http://island.ricerca.di.unimi.it/~alfio/shared/inforet/wikipeople.json'
data = WikiDataset(url)

In [7]:
Nap = [x for i, x in enumerate(data.documents) 
     if data.queries[i] == 'Napoleon general emperor']
Cit = [x for i, x in enumerate(data.documents) 
     if data.queries[i] == 'Napoleon city Lafayette County Missouri United States']

## Build the example small dataset

In [8]:
from sklearn.feature_extraction.text import CountVectorizer
from string import punctuation, digits

In [9]:
tokenize = lambda text: [x.lemma_.lower() for x in nlp(text) if x.pos_ in ['NOUN', 'PROPN']]
vectorizer = CountVectorizer(tokenizer=tokenize)

In [10]:
raw_corpus = [Nap[0], Cit[0]]
corpus = []
for doc in raw_corpus:
    for sentence in nlp(doc).sents:
        corpus.append("".join([x for x in sentence.text if x not in punctuation and x not in digits]))

In [11]:
corpus[9]

'east of the Independence city limits as of this writing'

In [12]:
X = vectorizer.fit_transform(corpus)
Xa = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names())



In [13]:
dictionary = ['napoleon', 'battle', 'france', 'french', 'commander', 'leader', 'history',
              'city', 'route', 'town', 'mile', 'south']
A = Xa[dictionary]

In [14]:
A

Unnamed: 0,napoleon,battle,france,french,commander,leader,history,city,route,town,mile,south
0,0,0,0,0,0,1,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0
2,1,0,0,1,0,0,0,0,0,0,0,0
3,1,0,1,0,0,0,0,0,0,0,0,0
4,0,1,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,1,0,1,0,0,0,0,0
6,0,0,0,0,0,0,1,0,0,0,0,0
7,1,0,0,0,0,0,0,0,1,0,0,0
8,0,0,0,0,0,0,0,0,0,0,2,0
9,0,0,0,0,0,0,0,1,0,0,0,0


# LSI from scratch

In [15]:
U, S, VT = np.linalg.svd(A)
Sigma = np.zeros((A.shape[0], A.shape[1]))
Sigma[:A.shape[0], :A.shape[0]] = np.diag(S)

In [16]:
np.round(U.dot(Sigma).dot(VT), 3)

array([[-0., -0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0., -0., -0.,  0.,  0., -0.,  0.,  0., -0.,  0.,  0.,  0.],
       [ 1.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 1., -0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  1.,  0., -0.,  0.,  0.,  0.,  0., -0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  1.,  0.,  1.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.],
       [ 1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0., -0.,  2., -0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  2.,  1.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.]])

### Reduce number of features

In [17]:
k = 2
Sigmak = np.zeros((A.shape[0], A.shape[1]))
Sigmak[:k, :k] = np.diag(S[:k])

In [18]:
lsi = np.round(U.dot(Sigmak).dot(VT), 3)

In [19]:
D = U.dot(Sigmak)[:,:k]
T = Sigmak.dot(VT)[:k, :]

### Documents

In [20]:
Dp = pd.DataFrame(D)

In [21]:
round(Dp, 3)

Unnamed: 0,0,1
0,0.0,0.0
1,0.0,0.0
2,0.0,-1.155
3,0.0,-1.155
4,0.0,0.0
5,0.0,0.0
6,0.0,0.0
7,0.0,-1.155
8,-1.859,0.0
9,0.0,0.0


In [22]:
print(corpus[1])
print(corpus[2])
print(corpus[5])
print(corpus[6])

He rose to prominence during the French Revolution and led several successful campaigns during the Revolutionary Wars
As Napoleon I he was Emperor of the French from  until  and again in 
One of the greatest commanders in history his wars and campaigns are studied at military schools worldwide
He also remains one of the most celebrated and controversial political figures in history



### Terminology

In [23]:
Tp = pd.DataFrame(T, columns=dictionary)

In [24]:
round(Tp, 2).T

Unnamed: 0,0,1
napoleon,0.0,-1.73
battle,0.0,0.0
france,0.0,-0.58
french,0.0,-0.58
commander,0.0,0.0
leader,0.0,-0.0
history,0.0,0.0
city,0.0,0.0
route,0.0,-0.58
town,-0.79,0.0


# Collapsed Gibbs sampling from scratch
- $n_{d,k}$ number of words of the document $d$ assigned to $k$
- $n_{k,w}$ number of times (instances of) word $w$ is assigned to $k$
- $n_k$ number of word instances assignements to $k$
- $z$ array of assignments to topics for each of the words instances

In [25]:
from IPython.display import display

In [26]:
K = 2
A

Unnamed: 0,napoleon,battle,france,french,commander,leader,history,city,route,town,mile,south
0,0,0,0,0,0,1,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0
2,1,0,0,1,0,0,0,0,0,0,0,0
3,1,0,1,0,0,0,0,0,0,0,0,0
4,0,1,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,1,0,1,0,0,0,0,0
6,0,0,0,0,0,0,1,0,0,0,0,0
7,1,0,0,0,0,0,0,0,1,0,0,0
8,0,0,0,0,0,0,0,0,0,0,2,0
9,0,0,0,0,0,0,0,1,0,0,0,0


In [27]:
N = A.values.sum()
z = np.zeros(N)
ndk = np.zeros((A.shape[0], K))
nkw = np.zeros((K, A.shape[1]))
nk = np.zeros(K)

In [28]:
docs = []
for i, row in A.iterrows():
    doc = []
    for k, v in row.items():
        if v > 0:
            doc.append(k)
    docs.append(doc)
words = list(A.columns)

In [29]:
docs

[['leader'],
 [],
 ['napoleon', 'french'],
 ['napoleon', 'france'],
 ['battle'],
 ['commander', 'history'],
 ['history'],
 ['napoleon', 'route'],
 ['mile'],
 ['city'],
 ['town', 'mile', 'south'],
 []]

## Random initialization

In [30]:
instance = 0
for i, doc in enumerate(docs):
    for w in doc:
        t = np.random.choice(range(0, K))
        z[instance] = t
        nk[t] += 1
        ndk[i,t] += 1
        nkw[t, words.index(w)] += 1
        instance += 1

In [31]:
display(pd.DataFrame(z).T)
display(pd.DataFrame(ndk).T)
display(pd.DataFrame(nkw, columns=A.columns))
display(pd.DataFrame(nk))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,1.0,0.0,0.0,2.0,0.0,1.0,1.0,2.0,0.0,0.0,2.0,0.0
1,0.0,0.0,2.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0


Unnamed: 0,napoleon,battle,france,french,commander,leader,history,city,route,town,mile,south
0,2.0,0.0,1.0,0.0,0.0,1.0,2.0,0.0,1.0,1.0,1.0,0.0
1,1.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0


Unnamed: 0,0
0,9.0
1,7.0


## Likelihood

In [32]:
alpha = np.array([1]*K)
beta = np.array([1]*len(words))

In [33]:
def theta(ndk, alpha, d_i, k_i):
    return (ndk[d_i, k_i] + alpha[k_i]) / (np.sum(ndk[d_i,:]) + alpha[k_i])

def phi(nkw, beta, w_i, k_i):
    return (nkw[k_i, w_i] + beta[w_i]) / (np.sum(nkw[k_i,:] + beta))

## Gibbs

In [34]:
def gibbs(docs, words, topics, ndk, nkw, nk, alpha, beta, iterations=10):
    for it in range(0, iterations):
        w_i = 0
        for doc_i, doc in enumerate(docs):
            for w in doc:
                word = [x for y in docs for x in y][w_i]
                topic = int(topics[w_i])
                # Remove current assignment
                ndk[doc_i, topic] -= 1
                nkw[topic, words.index(word)] -= 1
                nk[topic] -= 1
                p_z = np.zeros(len(alpha))
                for k_i in range(0, len(alpha)):
                    p_z[k_i] = theta(ndk, alpha, doc_i, k_i) * phi(nkw, beta, words.index(word), k_i)
                p_z = p_z / np.sum(p_z)
                # Sample from p_z
                new_topic = np.random.choice(len(p_z), 1, p=p_z)[0]
                # Update
                topics[w_i] = new_topic
                ndk[doc_i, new_topic] += 1
                nkw[new_topic, words.index(word)] += 1
                nk[new_topic] += 1
                w_i += 1

In [35]:
gibbs(docs, words, z, ndk, nkw, nk, alpha, beta, iterations=100)

In [36]:
display(pd.DataFrame(z).T)
display(pd.DataFrame(ndk).T)
display(pd.DataFrame(nkw, columns=A.columns))
display(pd.DataFrame(nk))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,0.0,0.0,2.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,3.0,0.0
1,1.0,0.0,0.0,2.0,1.0,1.0,1.0,2.0,0.0,0.0,0.0,0.0


Unnamed: 0,napoleon,battle,france,french,commander,leader,history,city,route,town,mile,south
0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,2.0,1.0
1,2.0,1.0,1.0,0.0,0.0,1.0,2.0,0.0,1.0,0.0,0.0,0.0


Unnamed: 0,0
0,8.0
1,8.0
