In [2]:
import sys

sys.path.append('../gtm/')
sys.path.append('../src/')

In [3]:
import importlib
%matplotlib inline
import matplotlib.pyplot as plt
import collections
import pickle
import numpy as np
from random import random
import torch
import pandas as pd
from tqdm import tqdm
from corpus import GTMCorpus
from patsy import dmatrix
from scipy.optimize import linear_sum_assignment

from corpus import GTMCorpus
from gtm import GTM

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
num_topics = 20
num_simulations = 5
doc_args = {
    "num_content_covs": 2,
    "num_prev_covs": 2,
    "min_words": 50,
    "max_words": 100,
    "num_docs": 10000,
    "voc_size": 1000}

- I checked the estimation difference between gtm(dirichlet*sage) and lda
- Mainly checked two parts
  - the cossim scores of true and estimated doc_topic matrix
    - pd.DataFrame of cossim_score 
      - like the pd.DataFramae of LDA, the diagonal elements should be higher than others. But, the element of GTM pd.DataFrame are all equally high.
  - the first 5 docs' topic proportions (true and estimated doc_topic)
    - lda estimated fairly well
    - In gtm, even though true topic proportions are unbalanced, the estimated proportions are almost equal

#### generating docs by GTM and estimating by GTM (dirichlet*SAGE)

In [9]:
gtm_model_args = {
    "n_topics": num_topics,
    "num_epochs": 10,
    "update_prior": True,
    "doc_topic_prior": "dirichlet",
    "decoder_type": "sage",
    "decoder_estimate_interactions": True,
    "encoder_hidden_layers":[512,256],
    "decoder_hidden_layers":[300],
    "seed":0
    }


### generating docs and creating true doc_topic dist
df_true_dist_list, docs = generate_docs_by_gtm(
    num_topics=20,
    doc_topic_prior=gtm_model_args["doc_topic_prior"],
    decoder_type=gtm_model_args["decoder_type"],
    seed=0,
    doc_args=doc_args,
    is_output=False,
)


### estimating doc_topic dist
df_test = pd.DataFrame(
    data={"doc":docs["doc"],
    "doc_clean":docs["doc"],
    "prevalence": docs["prevalence_covariates"],
    "content": docs["content_covariates"],
    }
)
test_dataset = GTMCorpus(
    df_test,
    prevalence="~ prevalence",
    content="~ content",
    embeddings_type = None,
)
tm_test = GTM(
    train_data = test_dataset,
    **gtm_model_args
)
df_doc_topic_gtm = pd.DataFrame(
        tm_test.get_doc_topic_distribution(test_dataset),
        index=["Doc{}".format(i) for i in range(doc_args["num_docs"])],
        columns=["Topic{}".format(i) for i in range(num_topics)],
    )

true_df = df_true_dist_list[0]
estimated_df = df_doc_topic_gtm


### matching the columns of estimated doc_topic dist with those of true doc_topic dist by maximizing dot-product
dotproduct_list = []
for true_col in true_df.columns:
    true_target_col = true_df.loc[:, true_col]
    dotproduct_list_per_row = []
    for col in estimated_df.columns:
        target_col = estimated_df.loc[:, col]
        dotproduct_list_per_row.append(np.dot(target_col, true_target_col))
    dotproduct_list.append(dotproduct_list_per_row)

corres_num_topic_dict = {}
dotproduct_matrix = pd.DataFrame(dotproduct_list)

true_topics, estimated_topics = linear_sum_assignment(-dotproduct_matrix)
for true_topic, estimated_topic in zip(true_topics, estimated_topics):
    corres_num_topic_dict["Topic{}".format(true_topic)] = "Topic{}".format(
        estimated_topic
    )

rearanged_df_gtm = estimated_df.loc[:, corres_num_topic_dict.values()]
rearanged_df_gtm.columns = corres_num_topic_dict.keys()


### calculating the cossim scores between true and estimated doc_topic dist
cossim_score = []
for true_col in true_df.columns:
    score_per_topic = []
    series_1 = true_df.loc[:, true_col]
    for col in rearanged_df_gtm.columns:
        series_2 = rearanged_df_gtm.loc[:, col]
        score_per_topic.append(
            np.dot(series_1.T, series_2)
            / (np.linalg.norm(series_1) * np.linalg.norm(series_2)))
    cossim_score.append(score_per_topic)

100%|██████████| 10000/10000 [01:53<00:00, 88.24it/s]


Epoch   1	Iter   10	Total Training Loss:2.5533109	Rec Loss:2.0433400	MMD Loss:0.5099514	Sparsity Loss:0.0000197	Pred Loss:0.0000000
Epoch   1	Iter   20	Total Training Loss:2.3821440	Rec Loss:2.0512173	MMD Loss:0.3309071	Sparsity Loss:0.0000196	Pred Loss:0.0000000
Epoch   1	Iter   30	Total Training Loss:2.4138753	Rec Loss:2.1047819	MMD Loss:0.3090737	Sparsity Loss:0.0000197	Pred Loss:0.0000000
Epoch   1	Iter   40	Total Training Loss:33.1842842	Rec Loss:34.1274490	MMD Loss:-0.9434826	Sparsity Loss:0.0003173	Pred Loss:0.0000000
Epoch   1	Training Loss:3.4876612
Epoch   2	Iter   10	Total Training Loss:2.0876143	Rec Loss:2.0320191	MMD Loss:0.0544250	Sparsity Loss:0.0011702	Pred Loss:0.0000000
Epoch   2	Iter   20	Total Training Loss:2.0815988	Rec Loss:2.0218239	MMD Loss:0.0586269	Sparsity Loss:0.0011480	Pred Loss:0.0000000
Epoch   2	Iter   30	Total Training Loss:2.1210263	Rec Loss:2.0837359	MMD Loss:0.0361493	Sparsity Loss:0.0011411	Pred Loss:0.0000000
Epoch   2	Iter   40	Total Training Loss

In [10]:
pd.DataFrame(cossim_score)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,0.752271,0.751229,0.761298,0.751523,0.766565,0.751728,0.76906,0.737417,0.74241,0.759901,0.757959,0.751319,0.758603,0.744842,0.357952,0.766733,0.770552,0.746464,0.738831,0.747116
1,0.780635,0.78216,0.790223,0.788325,0.800142,0.782879,0.797446,0.768733,0.772761,0.792489,0.788546,0.784901,0.791223,0.774411,0.370273,0.802016,0.800565,0.774615,0.770629,0.775385
2,0.689953,0.695435,0.698632,0.690806,0.708183,0.693315,0.7078,0.676871,0.681243,0.696759,0.700848,0.690445,0.69385,0.687467,0.319644,0.706978,0.709082,0.686702,0.684109,0.687682
3,0.732471,0.73705,0.738326,0.735207,0.75065,0.734854,0.750052,0.718546,0.725164,0.739827,0.737919,0.737553,0.738475,0.72662,0.34864,0.749198,0.751511,0.72783,0.720367,0.722881
4,0.562076,0.564055,0.568334,0.563814,0.580452,0.568154,0.575095,0.545004,0.559017,0.568083,0.570741,0.564188,0.56427,0.555949,0.274184,0.577628,0.582347,0.554892,0.555823,0.555543
5,0.740985,0.74072,0.749635,0.74081,0.760707,0.746254,0.756972,0.722755,0.731829,0.749482,0.746176,0.743371,0.744532,0.732666,0.352681,0.759527,0.760973,0.740236,0.728339,0.729854
6,0.574094,0.5717,0.5805,0.573072,0.58316,0.573615,0.588645,0.555034,0.568584,0.578782,0.576006,0.573363,0.57229,0.564578,0.28682,0.585879,0.588224,0.566051,0.566185,0.565868
7,0.824448,0.823203,0.831724,0.82432,0.843387,0.825834,0.843082,0.806591,0.812523,0.835337,0.830334,0.821272,0.828997,0.812183,0.396725,0.842093,0.846106,0.817856,0.809643,0.813436
8,0.80186,0.80498,0.812203,0.805285,0.821021,0.808518,0.821257,0.78887,0.794947,0.81349,0.804607,0.80489,0.810602,0.793211,0.387303,0.822005,0.824738,0.796094,0.791242,0.790139
9,0.612811,0.610599,0.617377,0.613851,0.627448,0.61711,0.624668,0.594216,0.604348,0.614943,0.615334,0.609933,0.619782,0.608199,0.292195,0.625188,0.629156,0.610355,0.601055,0.597505


In [11]:
for i in range(5):
    print("{} doc's true topic propotion is".format(i))
    print(df_true_dist_list[0].iloc[i,:])
    print()
    print("{} doc's estimated topic propotion is".format(i))
    print(pd.DataFrame(tm_test.get_doc_topic_distribution(test_dataset)[i,:],index=["Topic{}".format(j) for j in range(num_topics)]))
    print("------------")
    print()

0 doc's true topic propotion is
Topic0     0.036420
Topic1     0.019133
Topic2     0.004226
Topic3     0.058087
Topic4     0.004424
Topic5     0.170350
Topic6     0.045694
Topic7     0.076254
Topic8     0.110143
Topic9     0.009770
Topic10    0.138973
Topic11    0.073326
Topic12    0.000155
Topic13    0.093822
Topic14    0.002217
Topic15    0.011145
Topic16    0.004317
Topic17    0.007991
Topic18    0.048550
Topic19    0.085002
Name: Doc0, dtype: float64

0 doc's estimated topic propotion is
                0
Topic0   0.077826
Topic1   0.063446
Topic2   0.051647
Topic3   0.067337
Topic4   0.068527
Topic5   0.065586
Topic6   0.049080
Topic7   0.000175
Topic8   0.032614
Topic9   0.021860
Topic10  0.058073
Topic11  0.062439
Topic12  0.038634
Topic13  0.049347
Topic14  0.049147
Topic15  0.053196
Topic16  0.042833
Topic17  0.052828
Topic18  0.057532
Topic19  0.037874
------------

1 doc's true topic propotion is
Topic0     0.080140
Topic1     0.080401
Topic2     0.008970
Topic3     0.051392

#### generating docs by LDA and estimating by LDA

In [20]:
num_topics = 20
lda_model_args = {
        "update_every": 1,
        "chunksize": 100,
        "passes": 10,
        "alpha": 0.1,
        "eta": 0.1,
        "per_word_topics": True,
        "random_state":0,
}


### generating docs and creating true doc_topic dist
df_true_dist_list2, docs2 = generate_docs_by_lda(
    num_topics=num_topics,
    seed=0,
    alpha=None,
    beta=None,
    doc_args = {
        "min_words": 50,
        "max_words": 100,
        "num_docs": 10000,
        "voc_size": 1000,
    },
    is_output=False,
)


### estimating doc_topic dist
df_doc_topic_list2, df_topic_word_list2 = estimate_dist_by_lda(
    data=docs2,
    num_topics=num_topics,
    voc_size=doc_args["voc_size"],
    model_args=lda_model_args,
    is_output=False,
)
true_df = df_true_dist_list2[0]
estimated_df = df_doc_topic_lda


### matching the columns of estimated doc_topic dist with those of true doc_topic dist by maximizing dot-product
score_list = []
for true_col in true_df.columns:
    true_target_col = true_df.loc[:, true_col]
    score_list_per_row = []
    for col in estimated_df.columns:
        target_col = estimated_df.loc[:, col]
        score_list_per_row.append(np.dot(target_col, true_target_col))
    score_list.append(score_list_per_row)

corres_num_topic_dict2 = {}
score_matrix = pd.DataFrame(score_list)
true_topics, estimated_topics = linear_sum_assignment(-score_matrix)

for true_topic, estimated_topic in zip(true_topics, estimated_topics):
    corres_num_topic_dict2["Topic{}".format(true_topic)] = "Topic{}".format(
        estimated_topic
    )

reanged_df_lda = estimated_df.loc[:, corres_num_topic_dict2.values()]
reanged_df_lda.columns = corres_num_topic_dict2.keys()


### calculating the cossim scores between true and estimated doc_topic dist
cossim_score2 = []
for true_col in true_df.columns:
    inner_res = []
    series_1 = true_df.loc[:, true_col]
    for col in reanged_df_lda.columns:
        series_2 = reanged_df_lda.loc[:, col]
        inner_res.append(
            np.dot(series_1.T, series_2)
            / (np.linalg.norm(series_1) * np.linalg.norm(series_2)))
    cossim_score2.append(inner_res)

  0%|          | 0/10000 [00:00<?, ?it/s]

100%|██████████| 10000/10000 [02:28<00:00, 67.44it/s]


In [18]:
pd.DataFrame(cossim_score2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,0.966966,0.121119,0.099242,0.113975,0.111537,0.122923,0.112724,0.111915,0.113762,0.12509,0.10905,0.123663,0.119413,0.1639,0.129806,0.109139,0.115162,0.109029,0.101731,0.117981
1,0.114284,0.710227,0.127836,0.104996,0.120945,0.127966,0.11973,0.134096,0.130222,0.131551,0.097294,0.123424,0.14591,0.233797,0.187306,0.109312,0.103851,0.139006,0.100922,0.11335
2,0.107157,0.127806,0.944978,0.108187,0.105977,0.117449,0.106199,0.158459,0.125088,0.127137,0.097659,0.133701,0.110055,0.186784,0.22002,0.111379,0.116482,0.152096,0.09755,0.115047
3,0.098672,0.120999,0.112822,0.964052,0.14075,0.121427,0.107179,0.119113,0.113405,0.106576,0.103845,0.113477,0.115119,0.304165,0.123817,0.108146,0.107279,0.121264,0.107527,0.129112
4,0.101198,0.129164,0.114105,0.12127,0.963232,0.108907,0.105027,0.134819,0.109554,0.118439,0.10941,0.131092,0.117738,0.296529,0.136671,0.119528,0.12194,0.11614,0.105116,0.115803
5,0.102789,0.126431,0.114316,0.109599,0.097037,0.933576,0.102219,0.129103,0.112228,0.113907,0.113036,0.132567,0.121716,0.164539,0.142648,0.107807,0.119637,0.132677,0.112155,0.129122
6,0.109315,0.13765,0.105657,0.135617,0.103332,0.120398,0.965932,0.135093,0.112604,0.117044,0.102968,0.120223,0.164455,0.161982,0.136564,0.117254,0.120136,0.120935,0.091208,0.136144
7,0.113695,0.131437,0.127479,0.108811,0.108674,0.146671,0.111111,0.946163,0.107229,0.101332,0.116161,0.119235,0.121796,0.163405,0.206608,0.125066,0.126246,0.13558,0.105432,0.14462
8,0.095541,0.128628,0.128511,0.108916,0.104016,0.138914,0.112932,0.122005,0.965305,0.121792,0.100546,0.111954,0.114122,0.179128,0.131852,0.11234,0.111679,0.128807,0.108559,0.121694
9,0.102897,0.130497,0.111768,0.097845,0.109773,0.114695,0.110479,0.114144,0.111315,0.961691,0.113749,0.106808,0.115097,0.167513,0.123342,0.109668,0.106812,0.115505,0.107501,0.112195


In [19]:
for i in range(5):
    print("{} doc's true topic propotion is".format(i))
    print(df_true_dist_list2[0].iloc[i,:])
    print()
    print("{} doc's estimated topic propotion is".format(i))
    print(reanged_df_lda.iloc[i,:])
    print("------------")
    print()

0 doc's true topic propotion is
Topic0     6.968426e-04
Topic1     1.779735e-03
Topic2     5.236102e-05
Topic3     7.236574e-05
Topic4     2.846014e-01
Topic5     2.720386e-02
Topic6     9.833561e-04
Topic7     9.197617e-13
Topic8     3.208884e-18
Topic9     2.288536e-02
Topic10    4.768472e-01
Topic11    1.231430e-04
Topic12    1.505897e-10
Topic13    1.030299e-09
Topic14    4.210461e-04
Topic15    4.721245e-07
Topic16    1.096394e-04
Topic17    1.542085e-18
Topic18    2.075297e-03
Topic19    1.821479e-01
Name: Doc0, dtype: float64

0 doc's estimated topic propotion is
Topic0     0.000000
Topic1     0.000000
Topic2     0.000000
Topic3     0.000000
Topic4     0.249842
Topic5     0.000000
Topic6     0.000000
Topic7     0.000000
Topic8     0.055517
Topic9     0.012565
Topic10    0.318640
Topic11    0.000000
Topic12    0.000000
Topic13    0.136678
Topic14    0.054909
Topic15    0.000000
Topic16    0.000000
Topic17    0.048990
Topic18    0.027846
Topic19    0.084009
Name: Doc0, dtype: floa