In [1]:
import sys

sys.path.append('../gtm/')
sys.path.append('../src/')

In [7]:
import importlib
%matplotlib inline
import matplotlib.pyplot as plt
import collections
import pickle
import numpy as np
from random import random
import torch
import pandas as pd
from tqdm import tqdm
from corpus import GTMCorpus
from patsy import dmatrix
from scipy.optimize import linear_sum_assignment

from corpus import GTMCorpus
from gtm import GTM

In [4]:
import simulation
importlib.reload(simulation)
from simulation import *

In [5]:
num_topics = 20
num_simulations = 5
doc_args = {
    "num_content_covs": 2,
    "num_prev_covs": 2,
    "min_words": 50,
    "max_words": 100,
    "num_docs": 10000,
    "voc_size": 1000}

### generating docs by GTM and estimating by GTM (dirichlet*SAGE)

In [10]:
gtm_model_args = {
    "n_topics": num_topics,
    "num_epochs": 10,
    "update_prior": True,
    "doc_topic_prior": "dirichlet",
    "decoder_type": "sage",
    "decoder_estimate_interactions": True,
    "encoder_hidden_layers":[512,256],
    "decoder_hidden_layers":[300],
    }

df_true_dist_list, docs = generate_docs_by_gtm(
    num_topics=20,
    doc_topic_prior=gtm_model_args["doc_topic_prior"],
    decoder_type=gtm_model_args["decoder_type"],
    seed=0,
    doc_args=doc_args,
    is_output=False,
)

df_test = pd.DataFrame(
    data={"doc":docs["doc"],
    "doc_clean":docs["doc"],
    "prevalence": docs["prevalence_covariates"],
    "content": docs["content_covariates"],
    }
)
test_dataset = GTMCorpus(
    df_test,
    prevalence="~ prevalence",
    content="~ content",
    embeddings_type = None,
)
tm_test = GTM(
    train_data = test_dataset,
    **gtm_model_args
)

df_doc_topic_gtm = pd.DataFrame(
        tm_test.get_doc_topic_distribution(test_dataset),
        index=["Doc{}".format(i) for i in range(doc_args["num_docs"])],
        columns=["Topic{}".format(i) for i in range(num_topics)],
    )
df_topic_word_gtm = pd.DataFrame(
    tm_test.get_topic_word_distribution(doc_args["voc_size"]),
    index=["Topic{}".format(i) for i in range(num_topics)],
    columns=["word_{}".format(i) for i in range(doc_args["voc_size"])],
)

true_df = df_true_dist_list[0]
estimated_df = df_doc_topic_gtm

dotproduct_list = []
for true_col in true_df.columns:
    true_target_col = true_df.loc[:, true_col]
    dotproduct_list_per_row = []
    for col in estimated_df.columns:
        target_col = estimated_df.loc[:, col]
        dotproduct_list_per_row.append(np.dot(target_col, true_target_col))
    dotproduct_list.append(dotproduct_list_per_row)

corres_num_topic_dict = {}
dotproduct_matrix = pd.DataFrame(dotproduct_list)
true_topics, estimated_topics = linear_sum_assignment(-dotproduct_matrix)

for true_topic, estimated_topic in zip(true_topics, estimated_topics):
    corres_num_topic_dict["Topic{}".format(true_topic)] = "Topic{}".format(
        estimated_topic
    )

rearanged_df_gtm = estimated_df.loc[:, corres_num_topic_dict.values()]
rearanged_df_gtm.columns = corres_num_topic_dict.keys()

cossim_score = []
for true_col in true_df.columns:
    score_per_topic = []
    series_1 = true_df.loc[:, true_col]
    for col in rearanged_df_gtm.columns:
        series_2 = rearanged_df_gtm.loc[:, col]
        score_per_topic.append(
            np.dot(series_1.T, series_2)
            / (np.linalg.norm(series_1) * np.linalg.norm(series_2)))
    cossim_score.append(score_per_topic)

In [14]:
pd.DataFrame(cossim_score)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,0.762108,0.753446,0.772359,0.768716,0.777159,0.759487,0.781247,0.736088,0.745933,0.773786,0.759766,0.764992,0.767101,0.750592,0.358197,0.752664,0.372108,0.757692,0.74905,0.750687
1,0.791347,0.782681,0.802938,0.79935,0.811313,0.789516,0.816039,0.766256,0.775896,0.802465,0.791329,0.799198,0.8015,0.78245,0.371218,0.790205,0.382162,0.788364,0.780827,0.7834
2,0.700519,0.693927,0.71052,0.706104,0.717724,0.701145,0.719533,0.677049,0.682309,0.710429,0.699111,0.706947,0.708497,0.69348,0.329617,0.693782,0.341335,0.695738,0.686531,0.69393
3,0.739448,0.733527,0.752342,0.747148,0.760513,0.740086,0.763956,0.718383,0.725993,0.751444,0.741312,0.746635,0.749033,0.735812,0.346045,0.740094,0.356122,0.737455,0.73141,0.7364
4,0.569513,0.566845,0.577073,0.572789,0.584119,0.568835,0.58284,0.550275,0.557936,0.577347,0.5724,0.57569,0.57412,0.560457,0.260204,0.566767,0.281806,0.566914,0.564492,0.558726
5,0.749634,0.741881,0.757113,0.753731,0.768655,0.75152,0.769444,0.725624,0.735183,0.760688,0.750317,0.755003,0.756486,0.739347,0.34484,0.746882,0.360683,0.746105,0.740476,0.741422
6,0.57251,0.573957,0.584701,0.58502,0.594697,0.580984,0.594331,0.556314,0.570133,0.586225,0.580047,0.584817,0.590504,0.58015,0.26971,0.578594,0.288431,0.572568,0.564864,0.570893
7,0.835086,0.826288,0.843045,0.840141,0.852978,0.830236,0.855483,0.804228,0.815495,0.846005,0.832456,0.841192,0.841495,0.822644,0.390027,0.827499,0.397393,0.82846,0.82176,0.823826
8,0.812551,0.803002,0.821362,0.819405,0.833835,0.808882,0.835095,0.784799,0.797704,0.826842,0.813516,0.822154,0.822354,0.801765,0.38437,0.808114,0.393139,0.808155,0.801908,0.804488
9,0.621525,0.611155,0.622468,0.623322,0.633203,0.616199,0.633218,0.599675,0.606107,0.627529,0.62129,0.621323,0.624639,0.610041,0.293625,0.618122,0.281635,0.613712,0.612456,0.612469


In [12]:
for i in range(5):
    print("{} doc's true topic propotion is".format(i))
    print(df_true_dist_list[0].iloc[i,:])
    print()
    print("{} doc's estimated topic propotion is".format(i))
    print(tm_test.get_doc_topic_distribution(test_dataset)[i,:])
    print("------------")

0 doc's true topic propotion is
Topic0     0.036420
Topic1     0.019133
Topic2     0.004226
Topic3     0.058087
Topic4     0.004424
Topic5     0.170350
Topic6     0.045694
Topic7     0.076254
Topic8     0.110143
Topic9     0.009770
Topic10    0.138973
Topic11    0.073326
Topic12    0.000155
Topic13    0.093822
Topic14    0.002217
Topic15    0.011145
Topic16    0.004317
Topic17    0.007991
Topic18    0.048550
Topic19    0.085002
Name: Doc0, dtype: float64

0 doc's estimated topic propotion is
[0.03463147 0.06124951 0.02489123 0.0487724  0.00077826 0.04362777
 0.05183314 0.06042557 0.06596762 0.0625105  0.07044107 0.06398511
 0.07180298 0.05493277 0.03634851 0.00025997 0.06847683 0.0457205
 0.0847277  0.04861711]
------------
1 doc's true topic propotion is
Topic0     0.080140
Topic1     0.080401
Topic2     0.008970
Topic3     0.051392
Topic4     0.013120
Topic5     0.099699
Topic6     0.015678
Topic7     0.166390
Topic8     0.063746
Topic9     0.003052
Topic10    0.034895
Topic11    0.0

### generating docs by LDA and estimating by LDA

In [15]:
num_topics = 20
df_true_dist_list2, docs2 = generate_docs_by_lda(
    num_topics=num_topics,
    seed=0,
    alpha=None,
    beta=None,
    doc_args = {
        "min_words": 50,
        "max_words": 100,
        "num_docs": 10000,
        "voc_size": 1000,
    },
    is_output=False,
)

df_doc_topic_list2, df_topic_word_list2 = estimate_dist_by_lda(
    data=docs2,
    num_topics=num_topics,
    voc_size=doc_args["voc_size"],
    model_args=None,
    is_output=False,
)

df_doc_topic_lda = df_doc_topic_list2[0]
df_topic_word_lda = df_topic_word_list2[0]

true_df = df_true_dist_list2[0]
estimated_df = df_doc_topic_lda

score_list = []
for true_col in true_df.columns:
    true_target_col = true_df.loc[:, true_col]
    score_list_per_row = []
    for col in estimated_df.columns:
        target_col = estimated_df.loc[:, col]
        score_list_per_row.append(np.dot(target_col, true_target_col))
    score_list.append(score_list_per_row)

corres_num_topic_dict2 = {}
score_matrix = pd.DataFrame(score_list)
true_topics, estimated_topics = linear_sum_assignment(-score_matrix)

for true_topic, estimated_topic in zip(true_topics, estimated_topics):
    corres_num_topic_dict2["Topic{}".format(true_topic)] = "Topic{}".format(
        estimated_topic
    )

reanged_df_lda = estimated_df.loc[:, corres_num_topic_dict2.values()]
reanged_df_lda.columns = corres_num_topic_dict2.keys()
cossim_score2 = []
for true_col in true_df.columns:
    inner_res = []
    series_1 = true_df.loc[:, true_col]
    for col in reanged_df_lda.columns:
        series_2 = reanged_df_lda.loc[:, col]
        inner_res.append(
            np.dot(series_1.T, series_2)
            / (np.linalg.norm(series_1) * np.linalg.norm(series_2)))
    cossim_score2.append(inner_res)

100%|██████████| 10000/10000 [02:26<00:00, 68.31it/s]


In [16]:
pd.DataFrame(cossim_score2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,0.963989,0.11305,0.282799,0.099077,0.110413,0.116956,0.12839,0.119316,0.131743,0.105969,0.118922,0.114503,0.128776,0.095493,0.138916,0.116965,0.131565,0.112216,0.116629,0.116899
1,0.108835,0.955577,0.395318,0.105578,0.12797,0.103108,0.121774,0.142045,0.154814,0.104922,0.112955,0.1127,0.13865,0.115538,0.141567,0.132335,0.103601,0.127039,0.13734,0.115303
2,0.102711,0.118093,0.331815,0.100932,0.107119,0.104942,0.111044,0.165217,0.708898,0.13516,0.10398,0.105355,0.125541,0.098276,0.135023,0.125902,0.110499,0.142326,0.108052,0.119332
3,0.103931,0.100661,0.165751,0.968107,0.119866,0.11257,0.129625,0.124388,0.143833,0.116931,0.127561,0.103947,0.113765,0.103888,0.129701,0.100063,0.115903,0.146407,0.109678,0.117189
4,0.109271,0.114756,0.249833,0.112203,0.958375,0.145178,0.126058,0.14349,0.139397,0.115612,0.131242,0.105448,0.127274,0.100073,0.143087,0.114125,0.140174,0.122991,0.109541,0.107934
5,0.099335,0.104364,0.370981,0.111857,0.102016,0.956155,0.105705,0.150772,0.146728,0.10464,0.133125,0.097574,0.123571,0.101841,0.143679,0.113875,0.110828,0.120524,0.120649,0.129859
6,0.10776,0.117398,0.191334,0.1038,0.105069,0.10268,0.960068,0.131607,0.134945,0.106043,0.1074,0.102818,0.138268,0.112324,0.124976,0.112286,0.109521,0.114002,0.093303,0.118431
7,0.102346,0.106339,0.2145,0.096565,0.113185,0.130937,0.135664,0.876946,0.159746,0.105698,0.114583,0.103846,0.202383,0.120517,0.172614,0.114683,0.116744,0.140529,0.124633,0.107559
8,0.099582,0.104374,0.221648,0.101374,0.109143,0.10583,0.125182,0.146676,0.715806,0.108282,0.101253,0.109516,0.125556,0.101663,0.233378,0.109126,0.109457,0.163552,0.110732,0.114004
9,0.112448,0.109939,0.281209,0.097264,0.119077,0.102447,0.112597,0.12571,0.1313,0.961996,0.110897,0.098364,0.140078,0.105435,0.122876,0.109403,0.10245,0.12924,0.118803,0.126154


In [19]:
for i in range(5):
    print("{} doc's true topic propotion is".format(i))
    print(df_true_dist_list2[0].iloc[i,:])
    print()
    print("{} doc's estimated topic propotion is".format(i))
    print(reanged_df_lda.iloc[i,:])
    print("------------")

0 doc's true topic propotion is
Topic0     6.968426e-04
Topic1     1.779735e-03
Topic2     5.236102e-05
Topic3     7.236574e-05
Topic4     2.846014e-01
Topic5     2.720386e-02
Topic6     9.833561e-04
Topic7     9.197617e-13
Topic8     3.208884e-18
Topic9     2.288536e-02
Topic10    4.768472e-01
Topic11    1.231430e-04
Topic12    1.505897e-10
Topic13    1.030299e-09
Topic14    4.210461e-04
Topic15    4.721245e-07
Topic16    1.096394e-04
Topic17    1.542085e-18
Topic18    2.075297e-03
Topic19    1.821479e-01
Name: Doc0, dtype: float64

0 doc's estimated topic propotion is
Topic0     0.000000
Topic1     0.000000
Topic2     0.012626
Topic3     0.000000
Topic4     0.229526
Topic5     0.000000
Topic6     0.000000
Topic7     0.000000
Topic8     0.036789
Topic9     0.000000
Topic10    0.509283
Topic11    0.000000
Topic12    0.000000
Topic13    0.062218
Topic14    0.026646
Topic15    0.000000
Topic16    0.000000
Topic17    0.000000
Topic18    0.032386
Topic19    0.078524
Name: Doc0, dtype: floa