<a href="https://colab.research.google.com/github/tomonari-masada/course-nlp2020/blob/master/11_topic_modeling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# 標準使用ライブラリー
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# Suppress warnings 
import warnings
warnings.filterwarnings('ignore')
import gc
import os
import shutil
from icecream import ic
from tqdm import tqdm_notebook as tqdm 



# 追記
import json
import datetime
import math

# debug
#%pdb on

import pixiedust #%pixie_debugger

# tfがエラーはかないため
# tfがエラーはかないため
#import tensorflow as tf
#import os
#os.environ["CUDA_VISIBLE_DEVICES"]="5"
#physical_devices = tf.config.experimental.list_physical_devices('GPU')
#tf.config.experimental.set_memory_growth(physical_devices[0], True)


Pixiedust database opened successfully


In [2]:
import pandas as pd
import re
import emoji
from wordcloud import WordCloud, STOPWORDS

# Load the spreadsheet whose "description" column holds the text to be modeled.
df = pd.read_excel("../data/result0802.xlsx")

# One entry per spreadsheet row; missing descriptions are still present here
# (they are filtered out later, during tokenization).
docs = df["description"].to_list()

In [3]:
# Number of rows read from the spreadsheet (rows with a missing description included).
print(len(docs))

93069


### Neologdを使ってtokenizeする

In [4]:

import subprocess
import MeCab

def make_neologd_tagger():
    """Create a MeCab tagger backed by the mecab-ipadic-NEologd dictionary.

    The dictionary root is obtained from ``mecab-config --dicdir`` and the
    NEologd dictionary is expected in its ``mecab-ipadic-neologd`` sub-directory.

    Returns:
        MeCab.Tagger: tagger configured with ``-Ochasen`` output.

    Raises:
        FileNotFoundError: if ``mecab-config`` is not on the PATH.
        subprocess.CalledProcessError: if ``mecab-config`` exits with an error.
    """
    # Call mecab-config directly instead of going through `echo` in a shell,
    # and strip the trailing newline so it does not end up inside the -d argument.
    dicdir = subprocess.check_output(["mecab-config", "--dicdir"]).decode('utf-8').strip()
    path_neologd = dicdir + "/mecab-ipadic-neologd"
    return MeCab.Tagger("-Ochasen -d " + path_neologd)


def neolog_prep_text( text, m):
    """Return the noun surface forms that the tagger ``m`` finds in ``text``.

    Assumes ``m`` emits ChaSen-style rows in which field 0 is the surface form
    and field 3 is the part of speech (the format requested by
    ``make_neologd_tagger``) -- TODO confirm if another tagger is passed in.

    Args:
        text: string to tokenize.
        m: object with a ``parse(text)`` method returning the tagger output.

    Returns:
        list[str]: surface forms whose part of speech starts with '名詞',
        excluding emoji and the placeholder tokens skipped below.
    """
    parsed = m.parse(text)
    if not parsed:
        # Nothing to tokenize (parse failed or produced no output).
        return []

    # emoji < 2.0 exposes UNICODE_EMOJI["en"]; newer releases only ship EMOJI_DATA.
    # Prefer the old table when present so existing environments behave as before.
    legacy_table = getattr(emoji, "UNICODE_EMOJI", None)
    emoji_table = legacy_table["en"] if legacy_table is not None else emoji.EMOJI_DATA

    return_words = []
    for line in parsed.split('\n'):
        fields = re.split('[\t,]', line)
        surface = fields[0]
        # Skip the end-of-sentence marker, empty rows and the long-vowel mark.
        # NOTE(review): 't' may have been intended as a tab character -- confirm.
        if surface in ('EOS', '', 't', 'ー'):
            continue
        # Rows without a part-of-speech field cannot be classified; skip them
        # instead of raising IndexError.
        if len(fields) < 4:
            continue
        if re.match('名詞', fields[3]) and surface not in emoji_table:
            return_words.append(surface)

    return return_words


* tokenizationの実行

In [5]:
from tqdm import tqdm


m = make_neologd_tagger()

# Tokenize every description into its noun list, skipping rows whose string
# form is "nan" (i.e. missing descriptions).
new_docs = [
    neolog_prep_text(str(doc), m)
    for doc in tqdm(docs)
    if str(doc) != "nan"
]
  

100%|██████████| 93069/93069 [00:23<00:00, 4004.35it/s]


* tokenizationの結果を確認

In [6]:
# Spot-check the noun tokens extracted for one document.
print(new_docs[5])

['ボカロ', '初音ミク', 'アニメ', 'ゲーム', '等', '好き', '20代', '社会人', 'ミク', '廃', 'さん', '達', '中心', 'フォロー', 'ミク', 'キャンバス', 'よろしくお願いします', '相方', '好き', 'カープ', '好き']


* 各文書を長い文字列で表しなおす（CountVectorizerを後で使うため）

In [7]:
# CountVectorizer expects one string per document, so re-join each token list with spaces.
corpus = list(map(' '.join, new_docs))

## 11-02 データ行列の作成
* LDAの場合、単に単語の出現頻度を重みとして各文書をベクトル化する。

### sklearnのCountVectorizerで疎行列化する

* 全文書の半分より多い文書に現れる単語は、高頻度語とみなして削除する。
* 30件未満の文書にしか現れない単語は、低頻度語とみなして削除する。

In [8]:
import os
import urllib.request
def download_stopwords(path):
    """Download the SlothLib Japanese stop-word list to ``path`` unless the file already exists.

    Prints which of the two branches was taken; returns nothing.
    """
    url = 'http://svn.sourceforge.jp/svnroot/slothlib/CSharp/Version1/SlothLib/NLP/Filter/StopWord/word/Japanese.txt'
    if not os.path.exists(path):
        print('Downloading...')
        # Fetch `url` and store it locally at `path`.
        urllib.request.urlretrieve(url, path)
        return
    print('File already exists.')

def create_stopwords(file_path):
    """Read a stop-word file (one word per line) into a list.

    Args:
        file_path: path of the text file to read. The file is read as UTF-8.

    Returns:
        list[str]: the non-empty lines of the file, in file order, without
        their trailing newline.
    """
    stop_words = []
    # Read from the argument (the original read the global `path` by mistake)
    # and close the handle deterministically.
    with open(file_path, "r", encoding="utf-8") as f:
        for w in f:
            w = w.replace('\n', '')
            if len(w) > 0:
                stop_words.append(w)
    return stop_words

# Local cache file for the stop-word list; it is downloaded only when missing.
path = "stop_words.txt"
download_stopwords(path)
stop_words = create_stopwords(path)

File already exists.


In [9]:
from sklearn.feature_extraction.text import CountVectorizer

# Terms appearing in fewer than MIN_DF documents are dropped as too rare.
MIN_DF = 30

# Terms appearing in more than half of the documents are dropped as too common.
vectorizer = CountVectorizer(
    stop_words=stop_words,
    min_df=MIN_DF,
    max_df=0.5,
)

# Sparse document-term count matrix.
X = vectorizer.fit_transform(corpus)

In [10]:
# Vocabulary size after min_df / max_df / stop-word filtering.
# vocabulary_ has one entry per kept term and, unlike get_feature_names()
# (removed in scikit-learn 1.2), is available in every scikit-learn version.
print(len(vectorizer.vocabulary_))

5050


* 文書数と語彙サイズを変数にセット

In [11]:
# Rows of X are documents and columns are vocabulary terms.
n_samples, n_features = X.shape

### TF-IDFで各文書における単語の重みを計算する

In [12]:
from sklearn.feature_extraction.text import TfidfTransformer

# Re-weight the raw counts in X with TF-IDF (default TfidfTransformer settings).
tfidf = TfidfTransformer()
Xtfidf = tfidf.fit_transform(X)

In [13]:
# Non-zero TF-IDF weights of the first document, shown as (row, column) -> weight.
print(Xtfidf[0])

  (0, 4595)	0.19276647176236258
  (0, 4587)	0.2674347536590711
  (0, 4375)	0.21747308136578852
  (0, 4170)	0.23235458283624436
  (0, 3856)	0.6269365304095426
  (0, 3044)	0.17976188275291347
  (0, 2944)	0.2683528929608284
  (0, 2666)	0.1882646033429383
  (0, 2603)	0.1657289328698403
  (0, 2423)	0.2209442473554379
  (0, 1546)	0.3033883831127719
  (0, 1427)	0.22979355219086722
  (0, 1415)	0.18995772072964873


In [14]:
# (number of documents, vocabulary size) of the TF-IDF matrix.
Xtfidf.shape

(87733, 5050)

### LDAのインポート

In [15]:
from sklearn.decomposition import NMF, LatentDirichletAllocation
from time import time
from matplotlib.backends.backend_pdf import PdfPages
import datetime
from tmtoolkit.topicmod.evaluate import metric_coherence_gensim

### トピックの重要語を取り出す関数の定義

In [16]:
def get_top_words(model, feature_names, n_top_words=30):
    """Collect the highest-weighted terms of every topic in a fitted model.

    Args:
        model: object whose ``components_`` yields one weight vector per topic.
        feature_names: sequence mapping a column index to its term.
        n_top_words: how many terms to keep per topic.

    Returns:
        tuple: ``(top_features, weights)`` -- per topic, the list of terms in
        descending weight order and the array of their weights.
    """
    top_features = []
    weights = []
    for topic in model.components_:
        # Indices sorted by ascending weight, reversed, then truncated.
        ranked = topic.argsort()[::-1][:n_top_words]
        top_features.append([feature_names[idx] for idx in ranked])
        weights.append(topic[ranked])
    return top_features, weights

# LDAでトピック抽出

### LDAによるトピック抽出の実行

In [17]:
def make_word_cloud(n_components, lda,
                    font_path="/usr/share/fonts/opentype/ipaexfont-mincho/ipaexm.ttf",
                    file_prefix=None):
    """Draw one word cloud per topic and save the grid as ``<file_prefix>topic.pdf``.

    Uses the notebook-level ``vectorizer`` (for the term names) and
    ``get_top_words``. The PDF is written relative to the current working
    directory.

    Args:
        n_components: number of topics to draw (topics ``0 .. n_components-1``).
        lda: fitted model exposing ``components_``.
        font_path: font file handed to WordCloud; the default is the
            Japanese-capable font path the notebook was written against.
        file_prefix: prefix of the PDF file name. When ``None`` the
            notebook-level ``now_`` run identifier is used, as before.
    """
    # Imported lazily, as in the original cell. The `%matplotlib inline` magic
    # and the unused seaborn import were dropped: a magic inside a function
    # body is not valid Python outside IPython.
    import matplotlib.pyplot as plt

    plt.style.use('dark_background')

    if file_prefix is None:
        file_prefix = now_  # notebook global set by the grid-search loop

    # get_feature_names() was removed in scikit-learn 1.2; use the newer
    # accessor when the installed version provides it.
    if hasattr(vectorizer, "get_feature_names_out"):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()

    top_words, weights = get_top_words(lda, feature_names)
    # One {term: weight} mapping per topic; str() normalizes numpy string scalars.
    topic_words = [
        {str(word): weight for word, weight in zip(top_words[i], weights[i])}
        for i in range(n_components)
    ]

    cloud = WordCloud(stopwords=STOPWORDS,
                      font_path=font_path,
                      background_color='white',
                      width=2500,
                      height=1800,
                      max_words=100,
                      colormap='tab10'
                      )

    # Two clouds per row; always keep at least one row so subplots() is valid.
    n_rows = max(1, math.ceil(len(topic_words) / 2))
    fig, axes = plt.subplots(n_rows, 2, figsize=(32, 50), sharex=True, sharey=True)

    for i, ax in enumerate(axes.flatten()):
        # Hide the frame of every panel, including the unused trailing panel
        # that appears when the number of topics is odd.
        ax.axis('off')
        if i >= len(topic_words):
            continue
        cloud.generate_from_frequencies(topic_words[i], max_font_size=500)
        ax.imshow(cloud)
        ax.set_title('Topic ' + str(i), fontdict=dict(size=16))
        ax.margins(x=0, y=0)

    fig.subplots_adjust(wspace=0, hspace=0)
    fig.tight_layout()

    # Save only this figure. The original saved every open figure, so during a
    # grid search each PDF also contained the figures of all earlier runs.
    with PdfPages(file_prefix + 'topic.pdf') as pdf:
        pdf.savefig(fig)

    # Render inline, then release the figure so repeated calls do not
    # accumulate large figures in memory.
    plt.show()
    plt.close(fig)


In [18]:
import logging
import pickle
import itertools

def lda_main(now_, batch_size, n_components, topic_word_prior, doc_topic_prior, max_iter=30,
             output_root="../experiment/0803expt/"):
    """Fit one LDA model, log its metrics, and store its artifacts in a per-run folder.

    Uses the notebook-level ``X`` (count matrix), ``Xtfidf``, ``vectorizer``,
    ``new_docs``, ``n_samples``, ``n_features``, ``MIN_DF`` and
    ``make_word_cloud``. Metrics are written to the root logger at WARNING level.

    Args:
        now_: run identifier; used as the folder name and as the file-name prefix.
        batch_size: mini-batch size for online LDA.
        n_components: number of topics.
        topic_word_prior: prior of the topic-word distribution.
        doc_topic_prior: prior of the document-topic distribution.
        max_iter: maximum number of passes over the data.
        output_root: directory in which the run folder ``now_`` is created.

    Returns:
        dict: ``{"perplexity": ..., "coherance": ...}`` for the fitted model.

    Raises:
        FileExistsError: if the run folder already exists.
    """
    # Root logger: the same object the notebook configures with its file
    # handler, so this function does not depend on a global `logger`.
    log = logging.getLogger()

    # Create the run folder and work inside it: make_word_cloud and the pickle
    # below write relative to the current working directory.
    run_dir = os.path.join(output_root, now_)
    start_dir = os.getcwd()
    os.mkdir(run_dir)
    os.chdir(run_dir)
    try:
        lda = LatentDirichletAllocation(n_components=n_components,
                                        max_iter=max_iter,
                                        # rule of thumb: about 1 / n_topics; try 0.01, 0.02, 0.05, 0.1, ...
                                        topic_word_prior=topic_word_prior,
                                        doc_topic_prior=doc_topic_prior,
                                        learning_method='online',
                                        learning_offset=50,
                                        batch_size=batch_size,  # use a large value
                                        learning_decay=0.7,
                                        mean_change_tol=1e-4,
                                        random_state=1,
                                        evaluate_every=1,
                                        verbose=1)
        print((f"Fitting LDA models with tf features, "
               f"n_samples={n_samples} and n_features={n_features}"))
        t0 = time()
        lda.fit(X)
        print(f"done in {time() - t0:0.3f}s.")

        # The vocabulary must be ordered by column index so that entry j names
        # column j of lda.components_. vocabulary_.keys() is in insertion
        # order, which does not match the column order.
        vocab_by_column = np.array(sorted(vectorizer.vocabulary_, key=vectorizer.vocabulary_.get))

        # Parameter settings are compared by perplexity; coherence is recorded as well.
        coherance = metric_coherence_gensim(measure='c_v',
                                            # top_n is left at its default of 20
                                            topic_word_distrib=lda.components_,
                                            dtm=Xtfidf,  # TF-IDF matrix
                                            vocab=vocab_by_column,
                                            texts=new_docs)

        results = {
            "perplexity": lda.perplexity(X),
            "coherance": coherance,
        }

        log.warning('TIME:{0}'.format(now_))
        log.warning('MIN_DF:{0}'.format(MIN_DF))
        log.warning('params:batch_size:{0}'.format(batch_size))
        log.warning('params:n_components:{0}'.format(n_components))
        log.warning('params:topic_word_prior:{0}'.format(topic_word_prior))
        log.warning('params:doc_topic_prior:{0}'.format(doc_topic_prior))
        log.warning('params:max_iter:{0}'.format(max_iter))
        log.warning('done n_iter:{0}'.format(lda.n_iter_))
        log.warning('perplexity:{0}'.format(results["perplexity"]))
        log.warning('coherance:{0}'.format(results["coherance"]))
        log.warning('check all params:{0}'.format(lda.get_params()))

        make_word_cloud(n_components, lda)

        # Persist the fitted model next to its word-cloud PDF.
        file_name = now_ + '_lda.pickle'
        with open(file_name, mode="wb") as f:
            pickle.dump(lda, f)
    finally:
        # Always return to where we started, even when fitting or saving fails.
        # The original `os.chdir("../../")` ended up one level short of the
        # starting directory.
        os.chdir(start_dir)

    return results
    

In [None]:
# logging
LOG_FILE = '../experiment/0803expt/mylog.log'
logger = logging.getLogger()
# Attach the file handler only once: re-running this cell used to add another
# handler each time, which duplicated every log line.
_log_path = os.path.abspath(LOG_FILE)
if not any(isinstance(h, logging.FileHandler) and h.baseFilename == _log_path
           for h in logger.handlers):
    fhandler = logging.FileHandler(filename=LOG_FILE, mode='a')
    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    fhandler.setFormatter(formatter)
    logger.addHandler(fhandler)
logger.setLevel(logging.WARNING)
import gc

# Hyper-parameter grid. The same candidate values are used for both priors.
BATCH_SIZES = [1500]
N_COMPONENTS_GRID = [6, 8, 10, 12, 15, 18, 22, 25, 30, 35, 40]
PRIOR_GRID = [0.005, 0.01, 0.03, 0.05, 0.15, 0.3, 0.4, 0.6, 0.8]

# Run identifiers are stamped in UTC+9 whatever the host's time zone is
# (the original added 9 hours to local time, which is only right on a UTC host).
JST = datetime.timezone(datetime.timedelta(hours=9))

for batch_size, n_components, topic_word_prior, doc_topic_prior in itertools.product(
        BATCH_SIZES, N_COMPONENTS_GRID, PRIOR_GRID, PRIOR_GRID):
    # Minute-resolution run identifier; lda_main uses it as the output folder name.
    now_ = datetime.datetime.now(JST).strftime('%m%d_%H%M')

    lda_main(now_, n_components=n_components, topic_word_prior=topic_word_prior,
             doc_topic_prior=doc_topic_prior, batch_size=batch_size, max_iter=25)
    # Release the memory of the previous fit before starting the next one.
    gc.collect()
    
    

Fitting LDA models with tf features, n_samples=87733 and n_features=5050
iteration: 1 of max_iter: 25, perplexity: 1971.6696
iteration: 2 of max_iter: 25, perplexity: 1885.3969
iteration: 3 of max_iter: 25, perplexity: 1851.2197
iteration: 4 of max_iter: 25, perplexity: 1835.7455
iteration: 5 of max_iter: 25, perplexity: 1827.7021
iteration: 6 of max_iter: 25, perplexity: 1822.1569
iteration: 7 of max_iter: 25, perplexity: 1818.6281
iteration: 8 of max_iter: 25, perplexity: 1816.2183
iteration: 9 of max_iter: 25, perplexity: 1814.3506
iteration: 10 of max_iter: 25, perplexity: 1812.6123
iteration: 11 of max_iter: 25, perplexity: 1811.5839
iteration: 12 of max_iter: 25, perplexity: 1810.6091
iteration: 13 of max_iter: 25, perplexity: 1810.0052
iteration: 19 of max_iter: 25, perplexity: 1807.7450
iteration: 20 of max_iter: 25, perplexity: 1807.6765
done in 533.810s.
Fitting LDA models with tf features, n_samples=87733 and n_features=5050
iteration: 1 of max_iter: 25, perplexity: 1870.665

iteration: 15 of max_iter: 25, perplexity: 1459.8759
iteration: 16 of max_iter: 25, perplexity: 1457.8454
iteration: 17 of max_iter: 25, perplexity: 1456.0565
iteration: 18 of max_iter: 25, perplexity: 1454.4730
iteration: 19 of max_iter: 25, perplexity: 1453.0584
iteration: 20 of max_iter: 25, perplexity: 1451.7848
iteration: 21 of max_iter: 25, perplexity: 1450.6342
iteration: 22 of max_iter: 25, perplexity: 1449.5894
iteration: 23 of max_iter: 25, perplexity: 1448.6388
iteration: 24 of max_iter: 25, perplexity: 1447.7694
iteration: 25 of max_iter: 25, perplexity: 1446.9680
done in 715.862s.
Fitting LDA models with tf features, n_samples=87733 and n_features=5050
iteration: 1 of max_iter: 25, perplexity: 1748.0655
iteration: 2 of max_iter: 25, perplexity: 1634.8427
iteration: 3 of max_iter: 25, perplexity: 1574.0995
iteration: 4 of max_iter: 25, perplexity: 1537.4725
iteration: 5 of max_iter: 25, perplexity: 1514.0645
iteration: 6 of max_iter: 25, perplexity: 1498.1108
iteration: 7 o

iteration: 5 of max_iter: 25, perplexity: 1478.1384
iteration: 6 of max_iter: 25, perplexity: 1470.4374
iteration: 7 of max_iter: 25, perplexity: 1465.0643
iteration: 8 of max_iter: 25, perplexity: 1461.0530
iteration: 9 of max_iter: 25, perplexity: 1457.9959
iteration: 10 of max_iter: 25, perplexity: 1455.6584
iteration: 11 of max_iter: 25, perplexity: 1453.7738
iteration: 12 of max_iter: 25, perplexity: 1452.2245
iteration: 13 of max_iter: 25, perplexity: 1450.9139
iteration: 14 of max_iter: 25, perplexity: 1449.8215
iteration: 15 of max_iter: 25, perplexity: 1448.8316
iteration: 16 of max_iter: 25, perplexity: 1447.9975
iteration: 17 of max_iter: 25, perplexity: 1447.2605
iteration: 18 of max_iter: 25, perplexity: 1446.5822
iteration: 19 of max_iter: 25, perplexity: 1446.0171
iteration: 20 of max_iter: 25, perplexity: 1445.4890
iteration: 21 of max_iter: 25, perplexity: 1445.0220
iteration: 22 of max_iter: 25, perplexity: 1444.6147
iteration: 23 of max_iter: 25, perplexity: 1444.238

iteration: 9 of max_iter: 25, perplexity: 1535.7635
iteration: 10 of max_iter: 25, perplexity: 1534.9653
iteration: 11 of max_iter: 25, perplexity: 1534.3496
iteration: 12 of max_iter: 25, perplexity: 1533.8058
iteration: 13 of max_iter: 25, perplexity: 1533.5381
iteration: 14 of max_iter: 25, perplexity: 1533.3136
iteration: 15 of max_iter: 25, perplexity: 1533.0454
iteration: 16 of max_iter: 25, perplexity: 1532.8765
iteration: 17 of max_iter: 25, perplexity: 1532.7461
iteration: 18 of max_iter: 25, perplexity: 1532.6323
iteration: 19 of max_iter: 25, perplexity: 1532.5136
iteration: 20 of max_iter: 25, perplexity: 1532.4609
done in 531.318s.
Fitting LDA models with tf features, n_samples=87733 and n_features=5050
iteration: 1 of max_iter: 25, perplexity: 1618.0820
iteration: 2 of max_iter: 25, perplexity: 1536.1750
iteration: 3 of max_iter: 25, perplexity: 1509.9227
iteration: 4 of max_iter: 25, perplexity: 1497.9706
iteration: 5 of max_iter: 25, perplexity: 1491.7516
iteration: 6 o

iteration: 18 of max_iter: 25, perplexity: 1420.0054
iteration: 19 of max_iter: 25, perplexity: 1418.8207
iteration: 20 of max_iter: 25, perplexity: 1417.7502
Fitting LDA models with tf features, n_samples=87733 and n_features=5050
iteration: 1 of max_iter: 25, perplexity: 1828.5819
iteration: 2 of max_iter: 25, perplexity: 1774.1494
iteration: 3 of max_iter: 25, perplexity: 1758.0469
iteration: 4 of max_iter: 25, perplexity: 1752.3661
iteration: 5 of max_iter: 25, perplexity: 1749.3460
iteration: 6 of max_iter: 25, perplexity: 1747.2013
iteration: 7 of max_iter: 25, perplexity: 1746.1004
iteration: 8 of max_iter: 25, perplexity: 1745.3692
iteration: 9 of max_iter: 25, perplexity: 1744.5999
iteration: 10 of max_iter: 25, perplexity: 1744.6473
done in 285.203s.
Fitting LDA models with tf features, n_samples=87733 and n_features=5050
iteration: 1 of max_iter: 25, perplexity: 1736.1680
iteration: 2 of max_iter: 25, perplexity: 1680.8836
iteration: 3 of max_iter: 25, perplexity: 1664.3173


iteration: 13 of max_iter: 25, perplexity: 1426.1748
iteration: 14 of max_iter: 25, perplexity: 1424.2641
iteration: 15 of max_iter: 25, perplexity: 1422.5867
iteration: 16 of max_iter: 25, perplexity: 1421.1025
iteration: 17 of max_iter: 25, perplexity: 1419.7778
iteration: 18 of max_iter: 25, perplexity: 1418.5873
iteration: 19 of max_iter: 25, perplexity: 1417.5126
iteration: 20 of max_iter: 25, perplexity: 1416.5382
iteration: 21 of max_iter: 25, perplexity: 1415.6508
iteration: 22 of max_iter: 25, perplexity: 1414.8386
iteration: 23 of max_iter: 25, perplexity: 1414.0918
iteration: 24 of max_iter: 25, perplexity: 1413.4020
iteration: 25 of max_iter: 25, perplexity: 1412.7612
done in 587.220s.
Fitting LDA models with tf features, n_samples=87733 and n_features=5050
iteration: 1 of max_iter: 25, perplexity: 1661.3757
iteration: 2 of max_iter: 25, perplexity: 1558.6993
iteration: 3 of max_iter: 25, perplexity: 1512.0679
iteration: 4 of max_iter: 25, perplexity: 1485.7921
iteration: 5

iteration: 25 of max_iter: 25, perplexity: 1428.8264
done in 837.684s.
Fitting LDA models with tf features, n_samples=87733 and n_features=5050
iteration: 1 of max_iter: 25, perplexity: 1612.1463
iteration: 2 of max_iter: 25, perplexity: 1536.9700
iteration: 3 of max_iter: 25, perplexity: 1503.9628
iteration: 4 of max_iter: 25, perplexity: 1485.0343
iteration: 5 of max_iter: 25, perplexity: 1472.7180
iteration: 6 of max_iter: 25, perplexity: 1463.9636
iteration: 7 of max_iter: 25, perplexity: 1457.2570
iteration: 8 of max_iter: 25, perplexity: 1451.9817
iteration: 9 of max_iter: 25, perplexity: 1447.8226
iteration: 10 of max_iter: 25, perplexity: 1444.5078
iteration: 11 of max_iter: 25, perplexity: 1441.8232
iteration: 12 of max_iter: 25, perplexity: 1439.6136
iteration: 13 of max_iter: 25, perplexity: 1437.7580
iteration: 14 of max_iter: 25, perplexity: 1436.1804
iteration: 15 of max_iter: 25, perplexity: 1434.8210
iteration: 16 of max_iter: 25, perplexity: 1433.6404
iteration: 17 of 

iteration: 5 of max_iter: 25, perplexity: 1431.2846
iteration: 6 of max_iter: 25, perplexity: 1427.2930
iteration: 7 of max_iter: 25, perplexity: 1424.5145
iteration: 8 of max_iter: 25, perplexity: 1422.6819
iteration: 9 of max_iter: 25, perplexity: 1421.2340
iteration: 10 of max_iter: 25, perplexity: 1420.0940
iteration: 11 of max_iter: 25, perplexity: 1418.9895
iteration: 12 of max_iter: 25, perplexity: 1418.2458
iteration: 13 of max_iter: 25, perplexity: 1417.4015
iteration: 14 of max_iter: 25, perplexity: 1416.8379
iteration: 15 of max_iter: 25, perplexity: 1416.3296
iteration: 16 of max_iter: 25, perplexity: 1415.9906
iteration: 17 of max_iter: 25, perplexity: 1415.6300
iteration: 18 of max_iter: 25, perplexity: 1415.2583
iteration: 19 of max_iter: 25, perplexity: 1414.9218
iteration: 20 of max_iter: 25, perplexity: 1414.6626
iteration: 21 of max_iter: 25, perplexity: 1414.4483
iteration: 22 of max_iter: 25, perplexity: 1414.2021
iteration: 23 of max_iter: 25, perplexity: 1413.985

iteration: 1 of max_iter: 25, perplexity: 1701.2107
iteration: 2 of max_iter: 25, perplexity: 1655.7932
iteration: 3 of max_iter: 25, perplexity: 1640.8384
iteration: 4 of max_iter: 25, perplexity: 1632.7260
iteration: 5 of max_iter: 25, perplexity: 1628.0257
iteration: 6 of max_iter: 25, perplexity: 1624.1466
iteration: 7 of max_iter: 25, perplexity: 1621.2112
iteration: 8 of max_iter: 25, perplexity: 1619.1679
iteration: 9 of max_iter: 25, perplexity: 1617.9922
iteration: 10 of max_iter: 25, perplexity: 1616.3806
iteration: 11 of max_iter: 25, perplexity: 1614.8770
iteration: 12 of max_iter: 25, perplexity: 1613.8841
iteration: 13 of max_iter: 25, perplexity: 1612.9307
iteration: 14 of max_iter: 25, perplexity: 1612.0047
iteration: 15 of max_iter: 25, perplexity: 1611.3351
iteration: 16 of max_iter: 25, perplexity: 1610.1694
iteration: 17 of max_iter: 25, perplexity: 1608.8346
iteration: 18 of max_iter: 25, perplexity: 1607.8596
iteration: 19 of max_iter: 25, perplexity: 1607.2013
it

iteration: 5 of max_iter: 25, perplexity: 1485.8833
iteration: 6 of max_iter: 25, perplexity: 1477.6576
iteration: 7 of max_iter: 25, perplexity: 1471.3576
iteration: 8 of max_iter: 25, perplexity: 1466.4515
iteration: 9 of max_iter: 25, perplexity: 1462.5555
iteration: 10 of max_iter: 25, perplexity: 1459.3977
iteration: 11 of max_iter: 25, perplexity: 1456.8177
iteration: 12 of max_iter: 25, perplexity: 1454.6668
iteration: 13 of max_iter: 25, perplexity: 1452.8399
iteration: 14 of max_iter: 25, perplexity: 1451.2699
iteration: 15 of max_iter: 25, perplexity: 1449.8998
iteration: 16 of max_iter: 25, perplexity: 1448.6950
iteration: 17 of max_iter: 25, perplexity: 1447.6327
iteration: 18 of max_iter: 25, perplexity: 1446.6807
iteration: 19 of max_iter: 25, perplexity: 1445.8214
iteration: 20 of max_iter: 25, perplexity: 1445.0421
iteration: 21 of max_iter: 25, perplexity: 1444.3287
iteration: 22 of max_iter: 25, perplexity: 1443.6696
iteration: 23 of max_iter: 25, perplexity: 1443.057

iteration: 4 of max_iter: 25, perplexity: 1423.4237
iteration: 5 of max_iter: 25, perplexity: 1418.8567
iteration: 6 of max_iter: 25, perplexity: 1416.1360
iteration: 7 of max_iter: 25, perplexity: 1413.9449
iteration: 8 of max_iter: 25, perplexity: 1412.3866
iteration: 9 of max_iter: 25, perplexity: 1411.3396
iteration: 10 of max_iter: 25, perplexity: 1410.5569
iteration: 11 of max_iter: 25, perplexity: 1409.7682
iteration: 12 of max_iter: 25, perplexity: 1409.0145
iteration: 13 of max_iter: 25, perplexity: 1408.3900
iteration: 14 of max_iter: 25, perplexity: 1407.8971
iteration: 15 of max_iter: 25, perplexity: 1407.5100
iteration: 16 of max_iter: 25, perplexity: 1407.1855
iteration: 17 of max_iter: 25, perplexity: 1406.8143
iteration: 18 of max_iter: 25, perplexity: 1406.5065
iteration: 19 of max_iter: 25, perplexity: 1406.1606
iteration: 20 of max_iter: 25, perplexity: 1405.8317
iteration: 21 of max_iter: 25, perplexity: 1405.5412
iteration: 22 of max_iter: 25, perplexity: 1405.3180

iteration: 4 of max_iter: 25, perplexity: 1595.4978
iteration: 5 of max_iter: 25, perplexity: 1591.5554
iteration: 6 of max_iter: 25, perplexity: 1587.1184
iteration: 7 of max_iter: 25, perplexity: 1583.6613
iteration: 8 of max_iter: 25, perplexity: 1581.5733
iteration: 9 of max_iter: 25, perplexity: 1578.9008
iteration: 10 of max_iter: 25, perplexity: 1576.4175
iteration: 11 of max_iter: 25, perplexity: 1574.5877
iteration: 12 of max_iter: 25, perplexity: 1572.8752
iteration: 13 of max_iter: 25, perplexity: 1571.1575
iteration: 14 of max_iter: 25, perplexity: 1568.6267
iteration: 15 of max_iter: 25, perplexity: 1566.8166
iteration: 16 of max_iter: 25, perplexity: 1565.3497
iteration: 17 of max_iter: 25, perplexity: 1564.4419
iteration: 18 of max_iter: 25, perplexity: 1563.2096
iteration: 19 of max_iter: 25, perplexity: 1562.3453
iteration: 20 of max_iter: 25, perplexity: 1561.4351
iteration: 21 of max_iter: 25, perplexity: 1560.8595
iteration: 22 of max_iter: 25, perplexity: 1560.1452

done in 850.413s.
Fitting LDA models with tf features, n_samples=87733 and n_features=5050
iteration: 1 of max_iter: 25, perplexity: 1625.9320
iteration: 2 of max_iter: 25, perplexity: 1564.3832
iteration: 3 of max_iter: 25, perplexity: 1537.2094
iteration: 4 of max_iter: 25, perplexity: 1521.7007
iteration: 5 of max_iter: 25, perplexity: 1511.2487
iteration: 6 of max_iter: 25, perplexity: 1503.6591
iteration: 7 of max_iter: 25, perplexity: 1498.0339
iteration: 8 of max_iter: 25, perplexity: 1493.7867
iteration: 9 of max_iter: 25, perplexity: 1490.4860
iteration: 10 of max_iter: 25, perplexity: 1487.8378
iteration: 11 of max_iter: 25, perplexity: 1485.6678
iteration: 12 of max_iter: 25, perplexity: 1483.8351
iteration: 13 of max_iter: 25, perplexity: 1482.2589
iteration: 14 of max_iter: 25, perplexity: 1480.8995
iteration: 15 of max_iter: 25, perplexity: 1479.7044
iteration: 16 of max_iter: 25, perplexity: 1478.6518
iteration: 17 of max_iter: 25, perplexity: 1477.7101
iteration: 18 of 

iteration: 11 of max_iter: 25, perplexity: 1520.4613
iteration: 12 of max_iter: 25, perplexity: 1518.9201
iteration: 13 of max_iter: 25, perplexity: 1517.6077
iteration: 14 of max_iter: 25, perplexity: 1516.7096
iteration: 15 of max_iter: 25, perplexity: 1515.9094
iteration: 16 of max_iter: 25, perplexity: 1515.2174
iteration: 17 of max_iter: 25, perplexity: 1514.6863
iteration: 18 of max_iter: 25, perplexity: 1514.2053
iteration: 19 of max_iter: 25, perplexity: 1513.7613
iteration: 20 of max_iter: 25, perplexity: 1513.3564
iteration: 21 of max_iter: 25, perplexity: 1513.0980
iteration: 22 of max_iter: 25, perplexity: 1512.8409
iteration: 23 of max_iter: 25, perplexity: 1512.5365
iteration: 24 of max_iter: 25, perplexity: 1512.3662
iteration: 25 of max_iter: 25, perplexity: 1512.1305
done in 666.525s.
Fitting LDA models with tf features, n_samples=87733 and n_features=5050
iteration: 1 of max_iter: 25, perplexity: 1772.0147
iteration: 2 of max_iter: 25, perplexity: 1614.4800
iteration:

iteration: 8 of max_iter: 25, perplexity: 1780.1629
iteration: 9 of max_iter: 25, perplexity: 1779.2858
iteration: 10 of max_iter: 25, perplexity: 1778.7901
iteration: 11 of max_iter: 25, perplexity: 1778.7557
done in 320.279s.
Fitting LDA models with tf features, n_samples=87733 and n_features=5050
iteration: 1 of max_iter: 25, perplexity: 1767.6182
iteration: 2 of max_iter: 25, perplexity: 1642.1833
iteration: 3 of max_iter: 25, perplexity: 1598.6132
iteration: 4 of max_iter: 25, perplexity: 1580.4743
iteration: 5 of max_iter: 25, perplexity: 1571.1350
iteration: 6 of max_iter: 25, perplexity: 1565.1157
iteration: 7 of max_iter: 25, perplexity: 1561.2719
iteration: 8 of max_iter: 25, perplexity: 1558.5169
iteration: 9 of max_iter: 25, perplexity: 1556.7366
iteration: 10 of max_iter: 25, perplexity: 1555.1126
iteration: 11 of max_iter: 25, perplexity: 1554.0790
iteration: 12 of max_iter: 25, perplexity: 1553.1479
iteration: 13 of max_iter: 25, perplexity: 1552.6154
iteration: 14 of ma

iteration: 5 of max_iter: 25, perplexity: 1480.9657
iteration: 6 of max_iter: 25, perplexity: 1458.6338
iteration: 7 of max_iter: 25, perplexity: 1443.0775
iteration: 8 of max_iter: 25, perplexity: 1431.6937
iteration: 9 of max_iter: 25, perplexity: 1423.0211
iteration: 10 of max_iter: 25, perplexity: 1416.1983
iteration: 11 of max_iter: 25, perplexity: 1410.6954
iteration: 12 of max_iter: 25, perplexity: 1406.1684
iteration: 13 of max_iter: 25, perplexity: 1402.3821
iteration: 14 of max_iter: 25, perplexity: 1399.1696
iteration: 15 of max_iter: 25, perplexity: 1396.4087
iteration: 16 of max_iter: 25, perplexity: 1394.0079
iteration: 17 of max_iter: 25, perplexity: 1391.8987
iteration: 18 of max_iter: 25, perplexity: 1390.0282
iteration: 19 of max_iter: 25, perplexity: 1388.3560
iteration: 20 of max_iter: 25, perplexity: 1386.8501
iteration: 21 of max_iter: 25, perplexity: 1385.4851
iteration: 22 of max_iter: 25, perplexity: 1384.2401
iteration: 23 of max_iter: 25, perplexity: 1383.098

iteration: 1 of max_iter: 25, perplexity: 1700.4186
iteration: 2 of max_iter: 25, perplexity: 1561.0627
iteration: 3 of max_iter: 25, perplexity: 1498.7118
iteration: 4 of max_iter: 25, perplexity: 1464.8148
iteration: 5 of max_iter: 25, perplexity: 1443.7944
iteration: 6 of max_iter: 25, perplexity: 1429.5765
iteration: 7 of max_iter: 25, perplexity: 1419.3644
iteration: 13 of max_iter: 25, perplexity: 1391.3366
iteration: 14 of max_iter: 25, perplexity: 1389.0801
iteration: 15 of max_iter: 25, perplexity: 1387.1331
iteration: 16 of max_iter: 25, perplexity: 1385.4300
iteration: 17 of max_iter: 25, perplexity: 1383.9221
iteration: 18 of max_iter: 25, perplexity: 1382.5759
iteration: 19 of max_iter: 25, perplexity: 1381.3659
iteration: 20 of max_iter: 25, perplexity: 1380.2717
iteration: 21 of max_iter: 25, perplexity: 1379.2784
iteration: 22 of max_iter: 25, perplexity: 1378.3730
iteration: 23 of max_iter: 25, perplexity: 1377.5449
iteration: 24 of max_iter: 25, perplexity: 1376.7845


iteration: 14 of max_iter: 25, perplexity: 1416.3233
iteration: 15 of max_iter: 25, perplexity: 1415.4435
iteration: 16 of max_iter: 25, perplexity: 1414.6631
iteration: 17 of max_iter: 25, perplexity: 1413.9949
iteration: 18 of max_iter: 25, perplexity: 1413.3791
iteration: 19 of max_iter: 25, perplexity: 1412.8680
iteration: 20 of max_iter: 25, perplexity: 1412.4161
iteration: 21 of max_iter: 25, perplexity: 1412.0063
iteration: 22 of max_iter: 25, perplexity: 1411.6395
iteration: 23 of max_iter: 25, perplexity: 1411.2951
iteration: 24 of max_iter: 25, perplexity: 1410.9981
iteration: 25 of max_iter: 25, perplexity: 1410.7479
done in 684.833s.
Fitting LDA models with tf features, n_samples=87733 and n_features=5050
iteration: 1 of max_iter: 25, perplexity: 1715.8223
iteration: 2 of max_iter: 25, perplexity: 1588.4840
iteration: 3 of max_iter: 25, perplexity: 1535.6196
iteration: 4 of max_iter: 25, perplexity: 1507.3873
iteration: 5 of max_iter: 25, perplexity: 1490.0227
iteration: 6 

iteration: 6 of max_iter: 25, perplexity: 1498.5070
iteration: 7 of max_iter: 25, perplexity: 1496.8676
iteration: 8 of max_iter: 25, perplexity: 1495.9869
iteration: 9 of max_iter: 25, perplexity: 1495.5474
iteration: 10 of max_iter: 25, perplexity: 1495.4804
done in 295.994s.
Fitting LDA models with tf features, n_samples=87733 and n_features=5050
iteration: 1 of max_iter: 25, perplexity: 1577.9904
iteration: 2 of max_iter: 25, perplexity: 1504.8039
iteration: 3 of max_iter: 25, perplexity: 1480.1319
iteration: 4 of max_iter: 25, perplexity: 1468.7955
iteration: 5 of max_iter: 25, perplexity: 1462.5392
iteration: 6 of max_iter: 25, perplexity: 1458.4777
iteration: 7 of max_iter: 25, perplexity: 1455.4741
iteration: 8 of max_iter: 25, perplexity: 1453.5939
iteration: 9 of max_iter: 25, perplexity: 1452.5742
iteration: 10 of max_iter: 25, perplexity: 1451.6996
iteration: 11 of max_iter: 25, perplexity: 1451.0183
iteration: 12 of max_iter: 25, perplexity: 1450.4416
iteration: 13 of max_

iteration: 24 of max_iter: 25, perplexity: 1389.4721
iteration: 25 of max_iter: 25, perplexity: 1388.8461
done in 539.655s.
Fitting LDA models with tf features, n_samples=87733 and n_features=5050
iteration: 1 of max_iter: 25, perplexity: 1868.5378
iteration: 2 of max_iter: 25, perplexity: 1786.5881
iteration: 3 of max_iter: 25, perplexity: 1752.2544
iteration: 4 of max_iter: 25, perplexity: 1734.8626
iteration: 5 of max_iter: 25, perplexity: 1727.0513
iteration: 6 of max_iter: 25, perplexity: 1721.8406
iteration: 7 of max_iter: 25, perplexity: 1718.4852
iteration: 8 of max_iter: 25, perplexity: 1717.1857
iteration: 9 of max_iter: 25, perplexity: 1716.1771
iteration: 10 of max_iter: 25, perplexity: 1715.7067
iteration: 11 of max_iter: 25, perplexity: 1715.0524
iteration: 12 of max_iter: 25, perplexity: 1715.1552
iteration: 13 of max_iter: 25, perplexity: 1715.0435
iteration: 14 of max_iter: 25, perplexity: 1714.5532
iteration: 15 of max_iter: 25, perplexity: 1714.3363
iteration: 16 of 

Fitting LDA models with tf features, n_samples=87733 and n_features=5050
iteration: 1 of max_iter: 25, perplexity: 1684.8755
iteration: 2 of max_iter: 25, perplexity: 1586.8514
iteration: 3 of max_iter: 25, perplexity: 1540.1055
iteration: 4 of max_iter: 25, perplexity: 1511.6272
iteration: 5 of max_iter: 25, perplexity: 1493.1332
iteration: 6 of max_iter: 25, perplexity: 1480.3041
iteration: 7 of max_iter: 25, perplexity: 1470.7553
iteration: 8 of max_iter: 25, perplexity: 1463.3138
iteration: 9 of max_iter: 25, perplexity: 1457.2103
iteration: 10 of max_iter: 25, perplexity: 1451.9905
iteration: 11 of max_iter: 25, perplexity: 1447.5318
iteration: 12 of max_iter: 25, perplexity: 1443.7759
iteration: 13 of max_iter: 25, perplexity: 1440.5568
iteration: 14 of max_iter: 25, perplexity: 1437.7990
iteration: 15 of max_iter: 25, perplexity: 1435.4376
iteration: 16 of max_iter: 25, perplexity: 1433.3772
iteration: 17 of max_iter: 25, perplexity: 1431.5460
iteration: 18 of max_iter: 25, perp

iteration: 1 of max_iter: 25, perplexity: 1546.0922
iteration: 2 of max_iter: 25, perplexity: 1478.4961
iteration: 3 of max_iter: 25, perplexity: 1454.5635
iteration: 4 of max_iter: 25, perplexity: 1442.7193
iteration: 5 of max_iter: 25, perplexity: 1434.9142
iteration: 6 of max_iter: 25, perplexity: 1429.7295
iteration: 7 of max_iter: 25, perplexity: 1426.0007
iteration: 8 of max_iter: 25, perplexity: 1422.8184
iteration: 9 of max_iter: 25, perplexity: 1420.1364
iteration: 10 of max_iter: 25, perplexity: 1418.0620
iteration: 11 of max_iter: 25, perplexity: 1415.9330
iteration: 12 of max_iter: 25, perplexity: 1414.1053
iteration: 13 of max_iter: 25, perplexity: 1412.6538
iteration: 14 of max_iter: 25, perplexity: 1411.2250
iteration: 15 of max_iter: 25, perplexity: 1410.0310
iteration: 16 of max_iter: 25, perplexity: 1409.0516
iteration: 17 of max_iter: 25, perplexity: 1408.1755
iteration: 18 of max_iter: 25, perplexity: 1407.4022
iteration: 19 of max_iter: 25, perplexity: 1406.7813
it

iteration: 5 of max_iter: 25, perplexity: 1644.1357
iteration: 6 of max_iter: 25, perplexity: 1639.4211
iteration: 7 of max_iter: 25, perplexity: 1635.8662
iteration: 8 of max_iter: 25, perplexity: 1633.7792
iteration: 9 of max_iter: 25, perplexity: 1632.4756
iteration: 10 of max_iter: 25, perplexity: 1630.8144
iteration: 11 of max_iter: 25, perplexity: 1629.5476
iteration: 12 of max_iter: 25, perplexity: 1627.9381
iteration: 13 of max_iter: 25, perplexity: 1627.0993
iteration: 14 of max_iter: 25, perplexity: 1626.3541
iteration: 15 of max_iter: 25, perplexity: 1625.1044
iteration: 16 of max_iter: 25, perplexity: 1623.8050
iteration: 17 of max_iter: 25, perplexity: 1623.1534
iteration: 18 of max_iter: 25, perplexity: 1622.1460
iteration: 19 of max_iter: 25, perplexity: 1621.1433
iteration: 20 of max_iter: 25, perplexity: 1620.1683
iteration: 21 of max_iter: 25, perplexity: 1619.1584
iteration: 22 of max_iter: 25, perplexity: 1618.1025
iteration: 23 of max_iter: 25, perplexity: 1617.328

iteration: 5 of max_iter: 25, perplexity: 1517.9574
iteration: 6 of max_iter: 25, perplexity: 1505.5486
iteration: 7 of max_iter: 25, perplexity: 1496.0829
iteration: 8 of max_iter: 25, perplexity: 1488.3281
iteration: 9 of max_iter: 25, perplexity: 1481.8833
iteration: 10 of max_iter: 25, perplexity: 1476.6802
iteration: 11 of max_iter: 25, perplexity: 1472.4305
iteration: 12 of max_iter: 25, perplexity: 1468.8752
iteration: 13 of max_iter: 25, perplexity: 1465.8367
iteration: 14 of max_iter: 25, perplexity: 1463.2419
iteration: 15 of max_iter: 25, perplexity: 1460.9849
iteration: 16 of max_iter: 25, perplexity: 1458.9961
iteration: 17 of max_iter: 25, perplexity: 1457.2349
iteration: 18 of max_iter: 25, perplexity: 1455.6754
iteration: 19 of max_iter: 25, perplexity: 1454.2764
iteration: 20 of max_iter: 25, perplexity: 1453.0068
iteration: 21 of max_iter: 25, perplexity: 1451.8556
iteration: 22 of max_iter: 25, perplexity: 1450.8011
iteration: 23 of max_iter: 25, perplexity: 1449.828

iteration: 1 of max_iter: 25, perplexity: 1533.9535
iteration: 2 of max_iter: 25, perplexity: 1472.7592
iteration: 3 of max_iter: 25, perplexity: 1451.1999
iteration: 4 of max_iter: 25, perplexity: 1439.3945
iteration: 5 of max_iter: 25, perplexity: 1432.3008
iteration: 6 of max_iter: 25, perplexity: 1427.1819
iteration: 7 of max_iter: 25, perplexity: 1423.3800
iteration: 8 of max_iter: 25, perplexity: 1419.9864
iteration: 9 of max_iter: 25, perplexity: 1417.3268
iteration: 10 of max_iter: 25, perplexity: 1414.8703
iteration: 11 of max_iter: 25, perplexity: 1412.5805
iteration: 12 of max_iter: 25, perplexity: 1410.4385
iteration: 13 of max_iter: 25, perplexity: 1408.7291
iteration: 14 of max_iter: 25, perplexity: 1407.3069
iteration: 15 of max_iter: 25, perplexity: 1406.0368
iteration: 16 of max_iter: 25, perplexity: 1405.0424
iteration: 17 of max_iter: 25, perplexity: 1404.3864
iteration: 18 of max_iter: 25, perplexity: 1403.8231
iteration: 19 of max_iter: 25, perplexity: 1403.2101
it

iteration: 24 of max_iter: 25, perplexity: 1487.6486
iteration: 25 of max_iter: 25, perplexity: 1487.2682
done in 567.449s.
Fitting LDA models with tf features, n_samples=87733 and n_features=5050
iteration: 1 of max_iter: 25, perplexity: 2361.5176
iteration: 2 of max_iter: 25, perplexity: 2196.5638
iteration: 3 of max_iter: 25, perplexity: 2132.3635
iteration: 4 of max_iter: 25, perplexity: 2106.9988
iteration: 5 of max_iter: 25, perplexity: 2097.2035
iteration: 6 of max_iter: 25, perplexity: 2093.8069
iteration: 7 of max_iter: 25, perplexity: 2092.0946
iteration: 8 of max_iter: 25, perplexity: 2092.2012
iteration: 9 of max_iter: 25, perplexity: 2091.8592
iteration: 10 of max_iter: 25, perplexity: 2091.7915
done in 296.891s.
Fitting LDA models with tf features, n_samples=87733 and n_features=5050
iteration: 1 of max_iter: 25, perplexity: 2174.7089
iteration: 2 of max_iter: 25, perplexity: 2009.8071
iteration: 3 of max_iter: 25, perplexity: 1943.1713
iteration: 4 of max_iter: 25, perpl

iteration: 11 of max_iter: 25, perplexity: 1446.9052
iteration: 12 of max_iter: 25, perplexity: 1442.2864
iteration: 13 of max_iter: 25, perplexity: 1438.4122
iteration: 14 of max_iter: 25, perplexity: 1435.1244
iteration: 15 of max_iter: 25, perplexity: 1432.3035
iteration: 16 of max_iter: 25, perplexity: 1429.8411
iteration: 17 of max_iter: 25, perplexity: 1427.6894
iteration: 18 of max_iter: 25, perplexity: 1425.7856
iteration: 19 of max_iter: 25, perplexity: 1424.0847
iteration: 20 of max_iter: 25, perplexity: 1422.5641
iteration: 21 of max_iter: 25, perplexity: 1421.1982
iteration: 22 of max_iter: 25, perplexity: 1419.9622
iteration: 23 of max_iter: 25, perplexity: 1418.8386
iteration: 24 of max_iter: 25, perplexity: 1417.8141
iteration: 25 of max_iter: 25, perplexity: 1416.8729
done in 686.559s.
Fitting LDA models with tf features, n_samples=87733 and n_features=5050
iteration: 1 of max_iter: 25, perplexity: 1915.7620
iteration: 2 of max_iter: 25, perplexity: 1696.6749
iteration:

iteration: 13 of max_iter: 25, perplexity: 1478.2097
iteration: 14 of max_iter: 25, perplexity: 1476.4626
iteration: 15 of max_iter: 25, perplexity: 1474.9859
iteration: 16 of max_iter: 25, perplexity: 1473.6914
iteration: 17 of max_iter: 25, perplexity: 1472.5331
iteration: 18 of max_iter: 25, perplexity: 1471.5120
iteration: 19 of max_iter: 25, perplexity: 1470.5957
iteration: 20 of max_iter: 25, perplexity: 1469.7747
iteration: 21 of max_iter: 25, perplexity: 1469.0259
iteration: 22 of max_iter: 25, perplexity: 1468.3694
iteration: 23 of max_iter: 25, perplexity: 1467.7505
iteration: 24 of max_iter: 25, perplexity: 1467.2141
iteration: 25 of max_iter: 25, perplexity: 1466.7122
done in 699.991s.
Fitting LDA models with tf features, n_samples=87733 and n_features=5050
iteration: 1 of max_iter: 25, perplexity: 1879.0793
iteration: 2 of max_iter: 25, perplexity: 1692.3374
iteration: 3 of max_iter: 25, perplexity: 1614.1216
iteration: 4 of max_iter: 25, perplexity: 1575.0056
iteration: 5

iteration: 10 of max_iter: 25, perplexity: 1827.4828
iteration: 11 of max_iter: 25, perplexity: 1827.3581
iteration: 12 of max_iter: 25, perplexity: 1827.2641
done in 350.881s.
Fitting LDA models with tf features, n_samples=87733 and n_features=5050
iteration: 1 of max_iter: 25, perplexity: 1802.9188
iteration: 2 of max_iter: 25, perplexity: 1696.4438
iteration: 3 of max_iter: 25, perplexity: 1659.5709
iteration: 4 of max_iter: 25, perplexity: 1641.3479
iteration: 5 of max_iter: 25, perplexity: 1630.8105
iteration: 6 of max_iter: 25, perplexity: 1624.0835
iteration: 7 of max_iter: 25, perplexity: 1619.8436
iteration: 8 of max_iter: 25, perplexity: 1617.2757
iteration: 9 of max_iter: 25, perplexity: 1615.6812
iteration: 10 of max_iter: 25, perplexity: 1614.2253
iteration: 11 of max_iter: 25, perplexity: 1613.1683
iteration: 12 of max_iter: 25, perplexity: 1612.4454
iteration: 13 of max_iter: 25, perplexity: 1611.9167
iteration: 14 of max_iter: 25, perplexity: 1611.2122
iteration: 15 of 

iteration: 25 of max_iter: 25, perplexity: 1364.9632
done in 572.891s.
Fitting LDA models with tf features, n_samples=87733 and n_features=5050
iteration: 1 of max_iter: 25, perplexity: 1861.0799
iteration: 2 of max_iter: 25, perplexity: 1640.0330
iteration: 3 of max_iter: 25, perplexity: 1541.3486
iteration: 4 of max_iter: 25, perplexity: 1488.7233
iteration: 5 of max_iter: 25, perplexity: 1457.4304
iteration: 6 of max_iter: 25, perplexity: 1437.0605
iteration: 7 of max_iter: 25, perplexity: 1422.7883
iteration: 8 of max_iter: 25, perplexity: 1412.2307
iteration: 9 of max_iter: 25, perplexity: 1404.1114
iteration: 10 of max_iter: 25, perplexity: 1397.6926
iteration: 11 of max_iter: 25, perplexity: 1392.5174
iteration: 12 of max_iter: 25, perplexity: 1388.2695
iteration: 13 of max_iter: 25, perplexity: 1384.7197
iteration: 14 of max_iter: 25, perplexity: 1381.7011
iteration: 15 of max_iter: 25, perplexity: 1379.0951
iteration: 16 of max_iter: 25, perplexity: 1376.8188
iteration: 17 of 

### results

### トピックごとの上位プロフィールを表示する

In [None]:
# Disabled helper, kept for reference: get_top_prof(topic_idx, top_n) would
# compute lda.transform(X), take the `top_n` row indices with the largest value
# in column `topic_idx` (in descending order), and return docs[d] for each of
# those indices.
# NOTE(review): the LDA logs report n_samples=87733 while len(docs) is 93069,
# so if X holds only the non-missing documents, indexing `docs` with row
# indices of X would return the wrong profiles — confirm before re-enabling.
# def get_top_prof (topic_idx, top_n = 100):
    
#     topics = lda.transform(X)
#     prof_idx_list = topics[:, topic_idx].argsort()[:-top_n - 1:-1]
#     return [docs[d] for d in prof_idx_list]

In [None]:
# get_top_prof(0, 20)

In [None]:
# get_top_prof(1, 20)

In [None]:
# get_top_prof(2, 20)

In [None]:
# get_top_prof(3, 20)

In [None]:
# get_top_prof(4, 20)

In [None]:
# get_top_prof(5, 20)

## corpusとauthorをpickleで保存する

In [21]:
# Number of documents in `corpus`; the recorded output (87733) equals the
# n_samples value reported by the LDA fitting logs.
len(corpus)

87733

In [31]:
# Collect the author id of every row whose description is not missing,
# preserving the row order of `df`.
# NOTE(review): presumably this mirrors the filter that produced `corpus`, so
# that author_nonan[k] is the author of corpus[k] — confirm that
# len(author_nonan) == len(corpus) before relying on the pairing.
id_column = df["id"]  # hoisted so the column is not re-selected on every iteration
author_nonan = []
# enumerate(tqdm(...)) instead of tqdm(enumerate(...)): tqdm can only read
# len(docs) when it wraps the list directly, which gives the bar a total/ETA.
for i, doc in enumerate(tqdm(docs)):
    # A missing description is recognised by its string form being "nan".
    if str(doc) != "nan":
        author_nonan.append(id_column.iloc[i])


print(author_nonan[:10])

93069it [00:01, 80988.49it/s]


[235517449,
 3004809181,
 960755580775276544,
 1326081427,
 1147981279,
 1018121710048964609,
 1165141243443068928,
 1171692111084699650,
 3265972872,
 849903469024518144]

In [41]:
# Directory that receives this experiment's pickled artifacts; later cells
# build their output paths from `dir_name` as well, so it keeps its trailing "/".
dir_name = "/home/input/twitter_api/experiment/0803expt/"
file_name = "author_nonan.pickle"

# Write the author ids collected above to disk in binary mode.
with open(f"{dir_name}{file_name}", "wb") as author_file:
    pickle.dump(author_nonan, author_file)
        

In [42]:
# Write `corpus` to disk in binary mode, inside the directory named by `dir_name`.
file_name = "corpus.pickle"
with open(f"{dir_name}{file_name}", "wb") as corpus_file:
    pickle.dump(corpus, corpus_file)