# Abstract

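This notebook investigates a suspected problem in the MoM likelihood derivation. It compares the gamma-function form of the topic-assignment term (`lls_check`) against the log of the point-estimate topic distributions and against a brute-force evaluation.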

# Prelude

## Imports

In [47]:
import numpy as np
import scipy as sp
import sys
import os
import pathlib
import logging
import scipy.special as fns
import pandas as pd
from IPython.display import display

In [9]:
# Put the parent of the current working directory on the import path.
# NOTE(review): presumably this is what makes the `sidetopics` package importable
# below; it depends on where the kernel was started -- confirm.
sys.path.append(str(pathlib.Path.cwd().parent))

In [10]:
from sidetopics.model import DataSet
from sidetopics.model.sklearn import *
from sidetopics.model.sklearn.lda_cvb import TopicModelType, ScoreMethod
from sidetopics.model.evals import perplexity_from_like

In [11]:
from sidetopics.model.sklearn.tests.test_lda_cvb import *

# Test Setup

In [16]:
from sidetopics.model.mom_em import ModelState, QueryState
from sidetopics.model.common import DataSet

def log_likelihood_expected(data: DataSet, model: ModelState, query: QueryState) -> float:
    """
    Reference implementation of the corpus log-likelihood, written for debugging.

    For every document d and topic k this accumulates

        lls[d, k] = log p(z = k) + log p(x_d | z = k, beta)

    then marginalises the topic with a max-shifted log-sum-exp over k and sums
    the per-document results.

    The topic term uses the *point estimate* of the topic distribution
    (``query.topicDists`` or ``corpusTopicDist(model)``). The gamma-function
    form of that same term is computed alongside it in ``lls_check`` purely as
    a diagnostic: it is never added to ``lls`` and does not affect the result.

    :param data: the documents to score; ``len(data)`` gives the number of rows
        and ``data.words`` the document-by-term count matrix.
    :param model: the trained model state; ``model.K`` and ``model.dtype`` are
        read directly, everything else goes through the helper accessors.
    :param query: per-document topic assignments, or ``None`` to fall back to
        the corpus-level topic distribution for every document.
    :return: the summed log-likelihood over all documents.
    """
    lls = np.zeros(shape=(len(data), model.K), dtype=model.dtype)

    # p(z=k | alpha)
    if query is None:  # use the prior, the standard likelihood formula
        logging.info("Returning likelihood based on prior over topic assignments")
        # Replicate the single corpus-level parameter vector for every document.
        alpha = np.ndarray(shape=(len(data), model.K), dtype=model.dtype)
        alpha[:, :] = corpusTopicDistDirichletParam(model)[np.newaxis, :]
        alpha_sum = np.ndarray(shape=(len(data),), dtype=model.dtype)
        alpha_sum[:] = corpusTopicDistDirichletParam(model).sum()
    else:  # Use a different distribution, e.g. the posterior from a doc-completion split
        logging.info("Returning likelihood based on give (e.g. posterior) assignments")
        alpha = topicDistsDirichletParam(query)  # DxK
        alpha_sum = alpha.sum(axis=1)            # one total per document

    # Diagnostic only: gamma-function form of log p(z=k | alpha).
    lls_check = np.zeros(shape=(len(data), model.K), dtype=model.dtype)
    lls_check += fns.loggamma(alpha_sum)[:, np.newaxis]
    lls_check -= fns.loggamma(alpha_sum + 1)[:, np.newaxis]
    # The broadcast below has entries [d, i, j] = alpha[d, i] + (1 if i == j else 0),
    # i.e. slice j is document d's parameter vector with component j incremented.
    # The sum therefore has to run over the component index i (axis=1) so that the
    # result is indexed by [d, j]; summing over axis=2 instead collapses the
    # "which topic was incremented" index and yields the wrong quantity.
    # FIXME this D x K x K intermediate is a memory explosion that needs to be
    # contained. Since loggamma(a + 1) - loggamma(a) == log(a) for a > 0, the whole
    # of lls_check reduces analytically to log(alpha) - log(alpha_sum)[:, np.newaxis].
    lls_check += np.sum(fns.loggamma(np.eye(model.K)[np.newaxis, :, :] + alpha[:, :, np.newaxis]), axis=1)
    lls_check -= np.sum(fns.loggamma(alpha), axis=1)[:, np.newaxis]

    # Use the point estimate of the topic distribution for the actual result, so
    # that any discrepancy can be isolated to the lls_check derivation above.
    if query is not None:
        lls += np.log(query.topicDists)
    else:
        lls += np.log(corpusTopicDist(model))[np.newaxis, :]

    # p(X = x|z=k, beta)
    doc_lens = np.squeeze(np.array(data.words.sum(axis=1)))

    beta = wordDistsDirichletParam(model)  # K x T
    beta_sum = beta.sum(axis=1)  # one total per topic

    lls += fns.loggamma(beta_sum)[np.newaxis, :]
    lls -= fns.loggamma(doc_lens[:, np.newaxis] + beta_sum[np.newaxis, :])
    for k in range(model.K):
        # One topic at a time, so only a D x T intermediate is ever alive.
        lls[:, k] += np.squeeze(np.array(
            fns.loggamma(data.words + (beta[k, :])[np.newaxis, :]).sum(axis=1)
        ))
    lls -= np.sum(fns.loggamma(beta), axis=1)[np.newaxis, :]

    # Log-sum-exp over topics, shifted by the row maximum for numerical stability.
    max_lls = lls.max(axis=1)
    lls -= max_lls[:, np.newaxis]
    np.exp(lls, out=lls)

    lls = max_lls + np.log(lls.sum(axis=1))

    return lls.sum()

In [18]:
# Build the sample corpus and wrap it as a dataset.
# NOTE(review): 0xBADB055 is presumably a seed for the fixture -- confirm.
full_data_raw = TopicModelTestSample.new_fixed(0xBADB055)
full_dataset = full_data_raw.as_dataset(debug=True)
# Hold out fold 0 of 4 as the test set, then split the test set again for
# document completion: one part to estimate topic assignments, one to evaluate.
train_data, test_data = full_dataset.cross_valid_split(test_fold_id=0, num_folds=4, debug=True)
est_data, eval_data = test_data.doc_completion_split(debug=True)

# Fit a MOM_VB topic model on the training fold.
model = TopicModel(kind=TopicModelType.MOM_VB, n_components=full_data_raw.n_components, seed=0xC0FFEE)
model.fit(train_data, iterations=100, persist_query_state=True)

# The transform result itself is discarded; the call is made so that the query
# state for est_data is persisted on the model and can be read back below.
_ = model.transform(est_data, persist_query_state=True)
q = model.query_state_
# NOTE(review): reaches into a private attribute of TopicModel.
m = model._model_state

common.py:__init__:36   :: Converting words to sparse CSR matrix
lda_cvb.py:fit:233   :: Creating new model-state at random
lda_cvb.py:make_or_resume_query_state:349   :: Creating new query state
lda_cvb.py:_set_or_clear_last_query_state:255   :: Persisting query state
lda_cvb.py:fit:250   :: Fit model to data after 100 iterations
lda_cvb.py:transform:321   :: Transforming data
lda_cvb.py:make_or_resume_query_state:349   :: Creating new query state
lda_cvb.py:_set_or_clear_last_query_state:255   :: Persisting query state
lda_cvb.py:transform:338   :: Obtained a (25, 5) topic-assignments matrix after 100 iterations


In [83]:
import sidetopics.model.mom_em as mom_em

# Re-run the `lls_check` computation from log_likelihood_expected at top level
# so that the intermediate values can be inspected.
# NOTE(review): `data` is eval_data while `q` came from transforming est_data;
# this assumes both have the same number of documents -- confirm.
data = eval_data

logging.info("Returning likelihood based on give (e.g. posterior) assignments")
alpha = mom_em.topicDistsDirichletParam(q)  # DxK
alpha_sum = alpha.sum(axis=1)               # Dx1
    
lls_check = np.zeros(shape=(len(data), m.K), dtype=m.dtype)
lls_check += fns.loggamma(alpha_sum)[:, np.newaxis]
lls_check -= fns.loggamma(alpha_sum + 1)[:, np.newaxis]
# FIXME this memory explosion needs to be contained
# The broadcast has entries [d, i, j] = alpha[d, i] + (1 if i == j else 0); summing
# over axis=1 (the component index i) leaves a result indexed by [d, j]. Note that
# log_likelihood_expected sums over axis=2 here instead.
lls_check += np.sum(fns.loggamma(np.eye(m.K)[np.newaxis, :, :] + alpha[:, :, np.newaxis]), axis=1)
lls_check -= np.sum(fns.loggamma(alpha), axis=1)[:, np.newaxis]
# FIXME at this point exp(lls) does not resemble the prior at all <-- This is probably the issue
# NOTE(review): with axis=1 as above, the soft-max tables displayed further down
# for lls_check and for log(topicDists) agree on the rows shown.


<ipython-input-83-e986b4819d58>:<module>:5   :: Returning likelihood based on give (e.g. posterior) assignments


In [84]:
# Log of the point-estimate topic distributions for `q`; the reference that
# lls_check is compared against.
lls = np.log(mom_em.topicDists(q))

In [85]:
def safe_soft_max(x: np.ndarray) -> np.ndarray:
    """
    Row-wise soft-max of a 2-D array, rounded to two decimal places.

    Each row is shifted by its own maximum before exponentiating so that large
    inputs do not overflow.

    :param x: a 2-D array of unnormalised log-values, one distribution per row.
    :return: an array of the same shape whose rows are the normalised
        exponentials, rounded to 2 decimals (so rows may not sum to exactly 1).
    """
    mx = x.max(axis=1)
    x = x - mx[:, np.newaxis]
    x = np.exp(x)
    return (x / x.sum(axis=1)[:, np.newaxis]).round(2)


def show_soft_max(x: np.ndarray) -> pd.DataFrame:
    """
    Row-wise soft-max of ``x`` as a DataFrame of integer percentages.

    :param x: a 2-D array of unnormalised log-values, one distribution per row.
    :return: a DataFrame of ``int32`` percentages with the same shape as ``x``.
    """
    s = safe_soft_max(x)
    # Round to the nearest integer before casting: a bare astype() truncates, and
    # products such as 0.29 * 100 == 28.999999999999996 would be shown as 28.
    s = np.rint(s * 100).astype(np.int32)
    return pd.DataFrame(s)

In [86]:
# Soft-max of the gamma-function form, as integer percentages, first 5 documents.
show_soft_max(lls_check).head(5)

Unnamed: 0,0,1,2,3,4
0,0,100,0,0,0
1,0,100,0,0,0
2,0,0,0,0,100
3,0,0,0,0,100
4,0,100,0,0,0


In [87]:
# Same view for the log of the point-estimate topic distributions, for comparison.
show_soft_max(lls).head(5)

Unnamed: 0,0,1,2,3,4
0,0,100,0,0,0
1,0,100,0,0,0
2,0,0,0,0,100
3,0,0,0,0,100
4,0,100,0,0,0


In [61]:
# Point-estimate topic distributions for the first rows.
# NOTE(review): casting to int32 after round(2) drops the fractional part, so any
# value below 1 is displayed as 0 -- confirm a 0/1 view is what was intended.
pd.DataFrame(mom_em.topicDists(q)).head().round(2).astype(np.int32)

Unnamed: 0,0,1,2,3,4
0,0,1,0,0,0
1,0,1,0,0,0
2,0,0,0,0,1
3,0,0,0,0,1
4,0,1,0,0,0


In [62]:
# Per-document Dirichlet parameters for the first rows.
# NOTE(review): casting to int32 after round(2) drops the fractional part, so
# parameters below 1 are displayed as 0 -- confirm this is intended.
pd.DataFrame(mom_em.topicDistsDirichletParam(q)).head().round(2).astype(np.int32)

Unnamed: 0,0,1,2,3,4
0,0,25,0,0,0
1,0,25,0,0,0
2,0,0,0,0,25
3,0,0,0,0,25
4,0,25,0,0,0


In [80]:
# Brute-force reference for the broadcast term in lls_check: for every document
# (row) and topic (col), increment that one component of the document's parameter
# vector and sum loggamma over the whole vector.
# NOTE: this rebinds `lls`, which earlier held log(topicDists).
lls = np.zeros(alpha.shape, dtype=alpha.dtype)
display(lls.shape)
for row in range(alpha.shape[0]):
    for col in range(alpha.shape[1]):
        ac = alpha[row].copy()  # copy so that `alpha` itself is left untouched
        ac[col] += 1
        lls[row, col] = float(fns.loggamma(ac).sum())

(25, 5)

In [81]:
# Brute-force values rounded to the nearest integer, first 5 documents.
pd.DataFrame(lls).round().astype(int).head(5)

Unnamed: 0,0,1,2,3,4
0,303,521,505,408,392
1,1234,1757,1513,1174,1397
2,1215,1126,1064,911,1424
3,949,836,772,743,1085
4,410,447,147,417,411


In [82]:
# Vectorised form of the same quantity, reducing over axis=1 (the component index).
# Its first rows match the brute-force table displayed above.
# NOTE(review): there is no .head() here, so the whole frame is displayed.
pd.DataFrame((fns.loggamma(np.eye(m.K)[np.newaxis, :, :] + alpha[:, :, np.newaxis])).sum(axis=1)).round().astype(int)

Unnamed: 0,0,1,2,3,4
0,303,521,505,408,392
1,1234,1757,1513,1174,1397
2,1215,1126,1064,911,1424
3,949,836,772,743,1085
4,410,447,147,417,411
5,92,159,180,156,176
6,652,750,678,562,865
7,275,547,517,610,634
8,395,272,319,307,332
9,927,619,807,703,696
