In [1]:
## Pre-setting
# automatically adjust the width of the notebook code cell
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

# if one module is changed, this line will automatically reload that module
%load_ext autoreload
%autoreload 2

# display the figure in the notebook
%matplotlib inline

# To change the font size in acrobat
import matplotlib as mpl
mpl.rcParams['pdf.fonttype'] = 42

In [2]:
## Add path
# Put the `src` folder located in the parent of the working directory at the
# front of the module search path (only if it is not listed already).
import os
import sys

src_dir = os.path.abspath(os.path.join(os.pardir, 'src'))
if sys.path.count(src_dir) == 0:
    sys.path.insert(0, src_dir)

In [3]:
## Private package

from corpora.pp_single_stopword import synthetic_single_stopword_terminal

from models.modelfront import topicmodel_inference_front

# create synthetic benchmark data

In [4]:
## Parameters for distribution

V = 400  # number of words
K = 5    # number of topics

# global word frequency: 'uni' or 'zipf'
dist_w = 'uni'
# dist_t = dist_w # number of words in each topic. uni or zipf

# probability of stopword appearance
dist_stop = 'uni'
# 100 * p_s is the percentage of stopwords
p_s = 0.1

# degree of structure for word mixing for each topic. 0: random; 1: structure
c_w = 0.8
# c_t = c_w # degree of structure for topic mixing for each document. 0: random; 1: structure

## Parameters for document

D = 2000  # number of documents
m = 100   # length of a document


## Parameters for random function
seed = 5
burstiness = None


In [5]:
%%time
dict_out_syn_stop = synthetic_single_stopword_terminal(V = V , K = K, D = D, m = m, dist_w =dist_w, dist_stop = dist_stop, p_s = p_s, c_w = c_w, seed = seed, burstiness = burstiness)

CPU times: user 2.19 s, sys: 48.8 ms, total: 2.24 s
Wall time: 2.25 s


In [6]:
# Show which entries the generated synthetic-corpus dictionary contains.
dict_out_syn_stop.keys()

dict_keys(['V_t', 'p_wt', 'p_t', 'n_wj', 'p_td', 'word_topic_assign_list', 'n_jd', 'p_w_td', 'p_w', 'document_topic_assign_list', 'n_wd', 'texts', 'state_dwz'])

In [7]:
# Example of the first document: its length and its first 20 tokens.
first_doc = dict_out_syn_stop['texts'][0]
len(first_doc), first_doc[:20]

(100,
 ['6',
  '9',
  '17',
  '18',
  '18',
  '18',
  '33',
  '33',
  '37',
  '42',
  '48',
  '49',
  '50',
  '53',
  '57',
  '74',
  '75',
  '89',
  '89',
  '94'])

In [8]:
# Pull out the documents and the 'state_dwz' entry (passed below as the true
# token labeling) so they can be fed to each topic model.
texts, state_dwz = dict_out_syn_stop['texts'], dict_out_syn_stop['state_dwz']

# run different topic modeling algorithms on synthetic benchmark corpora

Note that for the synthetic benchmark corpus we know the true topic assignment of each token, so we pass this token-level topic assignment as an additional input to the topic-modeling call. In this case, topic inference and the structure-overlap measurement are carried out together in a single step.

## run topic model: ldavb

In [9]:

# Input specification for the 'ldavb' topic model.
dict_input_ldavb = {
    ## choose topic model
    'topic_model': 'ldavb',

    ## provide corpus and number of topics if needed
    'texts': texts,
    'input_k': K,

    ## optional, only works for synthetic corpus with token labeling
    'state_dwz_true': state_dwz,
    'k_true': K,
    'input_v': V,  # only needed for ldavb token labeling

    ## optional
    'dN_opt': 0,
    'minimum_probability': 0,
}

In [10]:
%%time

dict_output_ldavb = topicmodel_inference_front( dict_input_ldavb )

CPU times: user 20.9 s, sys: 309 ms, total: 21.2 s
Wall time: 21.4 s


In [11]:
# Show which entries the ldavb result dictionary contains.
dict_output_ldavb.keys()

dict_keys(['token_labeling_model_nmi', 'token_labeling_perfect_nmi', 'state_dwz_infer', 'p_wt_infer', 'token_labeling_rand_nmi', 'token_labeling_normal_nmi', 'p_td_infer'])

In [12]:
# Show element 1 of the 'p_td_infer' entry
# (presumably the inferred topic proportions of one document — TODO confirm).
dict_output_ldavb['p_td_infer'][1]

array([ 0.00202538,  0.97245211,  0.00201647,  0.00203503,  0.021471  ])

In [13]:
# Print the 'token_labeling_model_nmi' entry of the ldavb result
# (None is printed when the entry is absent).
token_labeling_model_nmi = dict_output_ldavb.get('token_labeling_model_nmi')
print(token_labeling_model_nmi)


0.384272764544


## run topic model: ldags

In [14]:
# Input specification for the 'ldags' topic model.
dict_input_ldags = {
    ## choose topic model
    'topic_model': 'ldags',

    ## provide corpus and number of topics if needed
    'texts': texts,
    'input_k': K,  # only for ldavb and ldags

    ## optional, only works for synthetic corpus with token labeling
    'state_dwz_true': state_dwz,
    'k_true': K,

    ## optional (disabled here)
    # 'input_v': V,  # only need for ldavb token labeling
    # 'path_mallet': os.path.abspath(os.path.join(os.pardir,'src/external/mallet-2.0.8RC3/bin/mallet')),
    # 'dN_opt': 0,
    # 'N_iter': 1000,
}

In [15]:
%%time

# in order to run ldags, we need to create a folder 'tmp' to save the intermediate files generated during the inference process
path_tmp_file  = os.path.abspath(os.path.join(os.pardir,'tmp'))
print(path_tmp_file)
if not os.path.exists(path_tmp_file):
    os.makedirs(path_tmp_file)


dict_output_ldags = topicmodel_inference_front( dict_input_ldags )

/Users/hanyushi/projects/s11_synthetic_benchmark_topic_model_tutorial/tmp
CPU times: user 8.88 s, sys: 217 ms, total: 9.09 s
Wall time: 28.6 s


In [16]:
# Show which entries the ldags result dictionary contains.
dict_output_ldags.keys()

dict_keys(['token_labeling_perfect_nmi', 'token_labeling_model_nmi', 'p_wt_infer', 'token_labeling_rand_nmi', 'state_dwz_infer', 'token_labeling_normal_nmi', 'p_td_infer'])

In [17]:
# Show element 1 of the 'p_td_infer' entry
# (presumably the inferred topic proportions of one document — TODO confirm).
dict_output_ldags['p_td_infer'][1]

array([ 0.1       ,  0.54666667,  0.12666667,  0.12      ,  0.10666667])

In [18]:
# Print the 'token_labeling_model_nmi' entry of the ldags result
# (None is printed when the entry is absent).
token_labeling_model_nmi = dict_output_ldags.get('token_labeling_model_nmi')
print(token_labeling_model_nmi)

0.515504183298


## run topic model: hdp

In [19]:
# Input specification for the 'hdp' topic model (no 'input_k' is supplied here).
dict_input_hdp = {
    ## choose topic model
    'topic_model': 'hdp',

    ## provide corpus and number of topics if needed
    'texts': texts,

    ## optional, only works for synthetic corpus with token labeling
    'state_dwz_true': state_dwz,
    'k_true': K,

    ## optional (disabled here)
    # 'path_hdp': os.path.abspath(os.path.join(os.pardir,'src/external/hdp-bleilab/hdp-faster')),
}

In [20]:
%%time
dict_output_hdp = topicmodel_inference_front( dict_input_hdp )

CPU times: user 6.33 s, sys: 157 ms, total: 6.49 s
Wall time: 16.8 s


In [21]:
# Show which entries the hdp result dictionary contains.
dict_output_hdp.keys()

dict_keys(['k_infer', 'token_labeling_normal_nmi', 'token_labeling_model_nmi', 'p_td_infer', 'token_labeling_perfect_nmi', 'state_dwz_infer', 'p_wt_infer', 'token_labeling_rand_nmi'])

In [22]:
# Show element 1 of the 'p_td_infer' entry
# (presumably the inferred topic proportions of one document — TODO confirm).
dict_output_hdp['p_td_infer'][1]

array([ 0.22,  0.01,  0.11,  0.56,  0.  ,  0.1 ,  0.  ,  0.  ,  0.  ,
        0.  ,  0.  ,  0.  ])

In [23]:
# Print the 'token_labeling_model_nmi' entry of the hdp result
# (None is printed when the entry is absent).
token_labeling_model_nmi = dict_output_hdp.get('token_labeling_model_nmi')
print(token_labeling_model_nmi)


0.455796473424


In [24]:
# Show the 'k_infer' entry
# (presumably the number of topics inferred by hdp — TODO confirm).
dict_output_hdp['k_infer']

12

## run topic model: tm

In [25]:
# Input specification for the 'tm' topic model (no 'input_k' is supplied here).
dict_input_tm = {
    ## choose topic model
    'topic_model': 'tm',

    ## provide corpus and number of topics if needed
    'texts': texts,

    ## optional, only works for synthetic corpus with token labeling
    'state_dwz_true': state_dwz,
    'k_true': K,

    ## optional
    'path_tm': os.path.abspath(os.path.join(os.pardir, 'src/external/topicmapping')),
}

In [26]:
%%time
dict_output_tm = topicmodel_inference_front( dict_input_tm )

CPU times: user 6.78 s, sys: 184 ms, total: 6.96 s
Wall time: 28.9 s


In [27]:
# Show which entries the tm result dictionary contains.
dict_output_tm.keys()

dict_keys(['k_infer', 'token_labeling_normal_nmi', 'token_labeling_model_nmi', 'p_td_infer', 'token_labeling_perfect_nmi', 'state_dwz_infer', 'p_wt_infer', 'token_labeling_rand_nmi'])

In [28]:
# Show element 1 of the 'p_td_infer' entry
# (presumably the inferred topic proportions of one document — TODO confirm).
dict_output_tm['p_td_infer'][1]

array([  9.98886161e-01,   2.78407151e-04,   2.82944683e-04,
         2.72788160e-04,   2.79699309e-04])

In [29]:
# Print the 'token_labeling_model_nmi' entry of the tm result
# (None is printed when the entry is absent).
token_labeling_model_nmi = dict_output_tm.get('token_labeling_model_nmi')
print(token_labeling_model_nmi)


0.591038354937


In [30]:
# Show the 'k_infer' entry
# (presumably the number of topics inferred by tm — TODO confirm).
dict_output_tm['k_infer']

5

## end

In [31]:
# Scratch cell: evaluates the literal 1; nothing else in the notebook uses it.
1

1