<a href="https://colab.research.google.com/github/tomonari-masada/course-nlp2020/blob/master/11_topic_modeling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# 標準使用ライブラリー
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# Suppress warnings 
import warnings
warnings.filterwarnings('ignore')
import gc
import os
import shutil
from icecream import ic
from tqdm import tqdm_notebook as tqdm 



# 追記
import json
import datetime
import math

# debug
#%pdb on

import pixiedust #%pixie_debugger

# tfがエラーはかないため
# tfがエラーはかないため
#import tensorflow as tf
#import os
#os.environ["CUDA_VISIBLE_DEVICES"]="5"
#physical_devices = tf.config.experimental.list_physical_devices('GPU')
#tf.config.experimental.set_memory_growth(physical_devices[0], True)


Pixiedust database opened successfully


In [2]:
import pandas as pd
import re
import emoji
from wordcloud import WordCloud, STOPWORDS

# Load the profile table; only the free-text "description" column is used below.
df = pd.read_csv("./result0605.csv", engine='python')

# Keep the descriptions as a plain Python list. Missing values are still
# present here and are skipped later by the tokenization loop.
docs = df["description"].to_list()

In [3]:
print(len(docs))

93794


### Neologdを使ってtokenizeする

In [4]:

import subprocess
import MeCab

def make_neologd_tagger():
    """Build a MeCab tagger that uses the mecab-ipadic-NEologd dictionary.

    The dictionary directory is the output of ``mecab-config --dicdir`` with
    ``/mecab-ipadic-neologd`` appended.

    Returns:
        MeCab.Tagger: tagger created with ``-Ochasen -d <dictionary path>``.

    Raises:
        FileNotFoundError: if ``mecab-config`` cannot be found.
        subprocess.CalledProcessError: if ``mecab-config`` exits non-zero.
    """
    # Call mecab-config directly (no shell needed) and strip the trailing
    # newline so it does not end up inside the tagger's option string.
    dicdir = subprocess.check_output(["mecab-config", "--dicdir"]).decode('utf-8').strip()
    path_neologd = dicdir + "/mecab-ipadic-neologd"
    return MeCab.Tagger("-Ochasen -d " + path_neologd)


def neolog_prep_text( text, m):
    """Extract noun surfaces from ``text`` using the tagger ``m``.

    Each line returned by ``m.parse(text)`` is split on tabs and commas.
    A token is kept when its fourth field starts with '名詞' (noun) and its
    surface form is not a key of ``emoji.UNICODE_EMOJI["en"]``. Lines whose
    first field is 'EOS', empty, 't' or 'ー' are ignored.

    Args:
        text: string to tokenize.
        m: object with a ``parse(text)`` method returning one token per line.

    Returns:
        list: surface forms of the kept tokens, in order of appearance.
    """
    skipped_surfaces = ('EOS', '', 't', 'ー')
    nouns = []

    for raw_line in m.parse(text).split('\n'):
        fields = re.split('[\t,]', raw_line)
        surface = fields[0]
        if surface in skipped_surfaces:
            continue
        # Field 3 is checked first so the emoji table is only consulted for nouns.
        if re.match('名詞', fields[3]) and surface not in emoji.UNICODE_EMOJI["en"]:
            nouns.append(surface)

    return nouns


* tokenizationの実行

In [5]:
from tqdm import tqdm


m = make_neologd_tagger()

# Tokenize every description, skipping entries whose string form is "nan"
# (missing values). Because those entries are dropped, positions in
# `new_docs` do not line up with positions in `docs`.
new_docs = [
    neolog_prep_text(str(doc), m)
    for doc in tqdm(docs)
    if str(doc) != "nan"
]
  

100%|██████████| 93794/93794 [00:24<00:00, 3836.75it/s]


* tokenizationの結果を確認

In [6]:
print(new_docs[5])

['過去', 'ジャパリカート', '動画', 'TSUMURI', 'KART', 'VRChat', 'ワリスノ', 'MK', '8', 'DX', '一位', 'りし', 'た人', '社会', '出て', '配信', 'https', 'co', 'FJoitl', '8', 'JHE', 'ヘッダ', '猫', '飼い主', 'smmmmm']


* 各文書を長い文字列で表しなおす（CountVectorizerを後で使うため）

In [7]:
corpus = [' '.join(doc) for doc in new_docs]

## 11-02 データ行列の作成
* LDAの場合、単に単語の出現頻度を重みとして各文書をベクトル化する。

### sklearnのCountVectorizerで疎行列化する

* 全文書の半分より多い文書に現れる単語は、高頻度語とみなして削除する。
* 30件未満の文書にしか現れない単語は、低頻度語とみなして削除する。

In [8]:
import os
import urllib.request
def download_stopwords(path):
    """Download the SlothLib Japanese stop-word list to ``path`` if it is not there yet.

    Args:
        path: local file name to check for and, if absent, to save the list under.
    """
    url = 'http://svn.sourceforge.jp/svnroot/slothlib/CSharp/Version1/SlothLib/NLP/Filter/StopWord/word/Japanese.txt'
    if not os.path.exists(path):
        print('Downloading...')
        # Fetch `url` and store it locally under `path`.
        urllib.request.urlretrieve(url, path)
        return
    print('File already exists.')

def create_stopwords(file_path):
    """Read a stop-word file and return its non-empty lines.

    Args:
        file_path: path of a text file holding one stop word per line.

    Returns:
        list: the lines of the file with the newline removed, empty lines
        dropped, in file order.
    """
    # Read from the argument (not a module-level variable) and close the file
    # when done. Assumes the list is UTF-8 encoded — TODO confirm.
    with open(file_path, "r", encoding="utf-8") as f:
        words = (line.replace('\n', '') for line in f)
        return [w for w in words if len(w) > 0]

path = "stop_words.txt"
download_stopwords(path)
stop_words = create_stopwords(path)

File already exists.


In [9]:
from sklearn.feature_extraction.text import CountVectorizer

# Minimum number of documents a term must appear in to stay in the vocabulary.
MIN_DF = 30

# Terms that occur in more than half of the documents (max_df=0.5) or in fewer
# than MIN_DF documents are dropped, along with the loaded stop words.
# NOTE(review): token_pattern is left at its default, which presumably drops
# single-character tokens — confirm this is intended for Japanese nouns.
vectorizer = CountVectorizer(max_df=0.5, min_df= MIN_DF, stop_words=stop_words)

# Sparse document-term count matrix built from `corpus`.
X = vectorizer.fit_transform(corpus)

In [10]:
print(len(vectorizer.get_feature_names()))

5100


* 文書数と語彙サイズを変数にセット

In [11]:
n_samples, n_features = X.shape

### TF-IDFで各文書における単語の重みを計算する

In [12]:
from sklearn.feature_extraction.text import TfidfTransformer

tfidf = TfidfTransformer()
Xtfidf = tfidf.fit_transform(X)

In [13]:
print(Xtfidf[0])

  (0, 4227)	0.603862693464167
  (0, 4148)	0.4706802938434637
  (0, 3490)	0.1847556021921528
  (0, 3414)	0.3341859275888348
  (0, 3402)	0.3582132311126841
  (0, 1747)	0.24047370992039763
  (0, 1109)	0.2860956441170824


In [14]:
Xtfidf.shape

(88481, 5100)

### LDAのインポート

In [15]:
from sklearn.decomposition import NMF, LatentDirichletAllocation
from time import time
from matplotlib.backends.backend_pdf import PdfPages
import datetime
from tmtoolkit.topicmod.evaluate import metric_coherence_gensim

### トピックの重要語を取り出す関数の定義

In [16]:
def get_top_words(model, feature_names, n_top_words=30):
  """Return the highest-weighted words of every topic in ``model``.

  Args:
    model: object whose ``components_`` holds one weight row per topic.
    feature_names: sequence mapping a column index to its word.
    n_top_words: how many words to keep per topic.

  Returns:
    tuple: ``(top_features, weights)`` where ``top_features[k]`` lists the
    words of topic ``k`` by descending weight and ``weights[k]`` holds the
    matching weights.
  """
  # Column indices of each topic row, largest weight first.
  ranked = [row.argsort()[::-1][:n_top_words] for row in model.components_]
  top_features = [[feature_names[col] for col in cols] for cols in ranked]
  weights = [row[cols] for row, cols in zip(model.components_, ranked)]
  return top_features, weights

# LDAでトピック抽出

### LDAによるトピック抽出の実行

In [17]:
def make_word_cloud(n_components, lda):
    """Draw one word cloud per topic and save the figure to a timestamped PDF.

    The PDF is written to the current working directory as
    ``<MMDD_HHMM>topic.pdf``. Only the figure created by this call is saved,
    and it is closed afterwards so repeated calls do not accumulate figures.

    Args:
        n_components: number of topics to draw.
        lda: fitted model exposing ``components_`` (passed to ``get_top_words``).
    """
    # Imported here because pyplot is not imported at notebook level. The
    # inline backend is the Jupyter default, so no `%matplotlib inline` here.
    import matplotlib.pyplot as plt

    plt.style.use('dark_background')

    # get_feature_names() was replaced by get_feature_names_out() in newer
    # scikit-learn releases; use whichever this installation provides.
    get_names = getattr(vectorizer, "get_feature_names_out", None) or vectorizer.get_feature_names
    feature_names = list(get_names())

    top_words, weights = get_top_words(lda, feature_names)
    topic_words = [dict(zip(top_words[i], weights[i])) for i in range(n_components)]
    FONT_PATH = "/usr/share/fonts/opentype/ipaexfont-mincho/ipaexm.ttf"
    cloud = WordCloud(stopwords=STOPWORDS,
                  font_path=FONT_PATH,
                  background_color='white',
                  width=2500,
                  height=1800,
                  max_words=100,
                  colormap='tab10'
                  )

    # Two clouds per row.
    n_rows = math.ceil(len(topic_words) / 2)
    fig, axes = plt.subplots(n_rows, 2, figsize=(32, 50), sharex=True, sharey=True)

    for i, ax in enumerate(axes.flatten()):
        # Hide the frame on every cell, including the unused one that remains
        # when the number of topics is odd.
        ax.axis('off')
        if i >= len(topic_words):
            continue

        cloud.generate_from_frequencies(topic_words[i], max_font_size=500)
        ax.imshow(cloud)
        ax.set_title('Topic ' + str(i), fontdict=dict(size=16))
        ax.margins(x=0, y=0)

    fig.subplots_adjust(wspace=0, hspace=0)
    fig.tight_layout()

    # NOTE(review): the +9h offset presumably converts a UTC clock to JST — confirm.
    pdf_name = (datetime.datetime.now() + datetime.timedelta(hours=9) ) .strftime('%m%d_%H%M') + 'topic.pdf'

    try:
        # The context manager finalizes the PDF even if saving fails.
        with PdfPages(pdf_name) as pdf:
            pdf.savefig(fig)
        # Render the figure in the notebook before it is released.
        plt.show()
    finally:
        plt.close(fig)


In [18]:
import logging
import pickle
import itertools

def lda_main (batch_size ,n_components, topic_word_prior,doc_topic_prior  ,max_iter=30):
    """Fit one online LDA model on ``X`` and store its artifacts in a run folder.

    A folder ``./0701expt/<MMDD_HHMM>`` is created and used as the working
    directory for the duration of the call, so the log file (``mylog.log``),
    the PDF written by ``make_word_cloud`` and the pickled model all land
    there. The previous working directory is restored and the log handler is
    removed when the call ends, whether it succeeds or raises.

    Args:
        batch_size: mini-batch size passed to ``LatentDirichletAllocation``.
        n_components: number of topics.
        topic_word_prior: prior of the topic-word distribution.
        doc_topic_prior: prior of the document-topic distribution.
        max_iter: maximum number of iterations.

    Returns:
        dict: ``{"perplexity": ..., "coherance": ...}`` for the fitted model.
    """
    folder_name = (datetime.datetime.now() + datetime.timedelta(hours=9) ) .strftime('%m%d_%H%M')

    # Create the run folder (and its parent if missing) and work inside it.
    original_cwd = os.getcwd()
    run_dir = "./0701expt/" + folder_name
    os.makedirs(run_dir, exist_ok=True)
    os.chdir(run_dir)

    logger = logging.getLogger()
    fhandler = None
    try:
        # Per-run log file, attached to the root logger for this call only.
        fhandler = logging.FileHandler(filename='mylog.log', mode='a')
        formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
        fhandler.setFormatter(formatter)
        logger.addHandler(fhandler)
        logger.setLevel(logging.WARNING)

        # NOTE(review): scikit-learn documents (0.5, 1.0] as the learning_decay
        # range that guarantees asymptotic convergence; 1.3 lies outside it —
        # confirm this is intentional.
        lda = LatentDirichletAllocation(n_components=n_components,
                                        max_iter=max_iter,
                                        topic_word_prior=topic_word_prior, # rule of thumb: about 1 / number of topics; try 0.01, 0.02, 0.05, 0.1, ...
                                        doc_topic_prior =  doc_topic_prior,
                                        learning_method='online',
                                        learning_offset=50,
                                        batch_size= batch_size, # use a large batch
                                        learning_decay = 1.3,
                                        mean_change_tol=1e-4,
                                        random_state=1,
                                        evaluate_every=1,
                                        verbose=1)
        print((f"Fitting LDA models with tf features, "
        f"n_samples={n_samples} and n_features={n_features}"))
        t0 = time()
        lda.fit(X)
        print(f"done in {time() - t0:0.3f}s.")
        # Parameter settings are compared by perplexity (and coherence below).

        # The vocabulary must follow the column order of lda.components_.
        # vectorizer.vocabulary_ is a term -> column dict whose key order is
        # not the column order, so take the ordered feature names instead.
        get_names = getattr(vectorizer, "get_feature_names_out", None) or vectorizer.get_feature_names
        vocab = np.array(list(get_names()))

        coherance = metric_coherence_gensim(measure='c_v',
                            # top_n is left at its default of 20
                            topic_word_distrib=lda.components_,
                            dtm=Xtfidf,  # TF-IDF weighted matrix
                            vocab=vocab,
                            texts=new_docs)

        results = {
                "perplexity" : lda.perplexity(X) ,
                "coherance": coherance,
            }

        logger.warning('MIN_DF:{0}'.format(MIN_DF) )
        logger.warning('params:batch_size:{0}'.format(batch_size))
        logger.warning('params:n_components:{0}'.format(n_components))
        logger.warning('params:topic_word_prior:{0}'.format(topic_word_prior))
        logger.warning('params:doc_topic_prior:{0}'.format(doc_topic_prior))
        logger.warning('params:max_iter:{0}'.format(max_iter))
        logger.warning('done n_iter:{0}'.format(lda.n_iter_))
        logger.warning('perplexity:{0}'.format(results["perplexity"]))
        logger.warning('coherance:{0}'.format(results["coherance"]) )
        logger.warning('check all params:{0}'.format(lda.get_params() ))
        make_word_cloud(n_components, lda)

        # Persist the fitted model next to the log and the PDF.
        file_name = (datetime.datetime.now() + datetime.timedelta(hours=9) ) .strftime('%m%d_%H%M') + '_lda.pickle'
        with open(file_name, mode="wb") as f:
            pickle.dump(lda, f)

        return results
    finally:
        # Detach and close this run's handler so later runs do not also write
        # into this run's log file, then go back to where we started.
        if fhandler is not None:
            logger.removeHandler(fhandler)
            fhandler.close()
        os.chdir(original_cwd)
    

In [None]:
# Exhaustive grid search: 2 batch sizes x 11 topic counts x 9 x 9 prior values
# (1,782 fits). The settings and scores of every run are kept in `tmp_results`
# so they can be inspected after the loop.
prior_grid = [0.01, 0.03,0.05, 0.15, 0.3, 0.4, 0.5, 0.6, 0.8]
tmp_results = []
for batch_size,n_components, topic_word_prior, doc_topic_prior in itertools.product([1500,3000 ] ,[6,8,10,12,15,18,22,25,30,35,40],
                                                                                    prior_grid, prior_grid ):

    run_results = lda_main(n_components=n_components, topic_word_prior=topic_word_prior, doc_topic_prior = doc_topic_prior, batch_size=batch_size, max_iter=50)
    tmp_results.append({
        "batch_size": batch_size,
        "n_components": n_components,
        "topic_word_prior": topic_word_prior,
        "doc_topic_prior": doc_topic_prior,
        **run_results,
    })
    
    
    

Fitting LDA models with tf features, n_samples=88481 and n_features=5100
iteration: 1 of max_iter: 50, perplexity: 1873.1893
iteration: 2 of max_iter: 50, perplexity: 1813.8260
iteration: 3 of max_iter: 50, perplexity: 1790.2595
iteration: 4 of max_iter: 50, perplexity: 1776.9134
iteration: 5 of max_iter: 50, perplexity: 1767.9834
iteration: 6 of max_iter: 50, perplexity: 1761.9745
iteration: 7 of max_iter: 50, perplexity: 1757.5418
iteration: 8 of max_iter: 50, perplexity: 1753.9461
iteration: 9 of max_iter: 50, perplexity: 1750.6256
iteration: 14 of max_iter: 50, perplexity: 1740.8586
iteration: 15 of max_iter: 50, perplexity: 1739.6057
iteration: 16 of max_iter: 50, perplexity: 1738.3970
iteration: 17 of max_iter: 50, perplexity: 1737.3847
iteration: 18 of max_iter: 50, perplexity: 1736.4357
iteration: 19 of max_iter: 50, perplexity: 1735.5080
iteration: 20 of max_iter: 50, perplexity: 1734.7204
iteration: 21 of max_iter: 50, perplexity: 1733.9279
iteration: 22 of max_iter: 50, perp

iteration: 5 of max_iter: 50, perplexity: 1869.5752
iteration: 6 of max_iter: 50, perplexity: 1858.2278
iteration: 7 of max_iter: 50, perplexity: 1849.4780
iteration: 8 of max_iter: 50, perplexity: 1842.3684
iteration: 9 of max_iter: 50, perplexity: 1836.4543
iteration: 10 of max_iter: 50, perplexity: 1831.5159
iteration: 11 of max_iter: 50, perplexity: 1827.2192
iteration: 12 of max_iter: 50, perplexity: 1823.5000
iteration: 13 of max_iter: 50, perplexity: 1820.1872
iteration: 14 of max_iter: 50, perplexity: 1817.2461
iteration: 15 of max_iter: 50, perplexity: 1814.6321
iteration: 16 of max_iter: 50, perplexity: 1812.2227
iteration: 17 of max_iter: 50, perplexity: 1810.0508
iteration: 18 of max_iter: 50, perplexity: 1808.0230
iteration: 19 of max_iter: 50, perplexity: 1806.1826
iteration: 20 of max_iter: 50, perplexity: 1804.4885
iteration: 21 of max_iter: 50, perplexity: 1802.8884
iteration: 22 of max_iter: 50, perplexity: 1801.4169
iteration: 23 of max_iter: 50, perplexity: 1800.059

iteration: 8 of max_iter: 50, perplexity: 1910.7550
iteration: 9 of max_iter: 50, perplexity: 1903.5300
iteration: 10 of max_iter: 50, perplexity: 1897.4686
iteration: 11 of max_iter: 50, perplexity: 1892.2888
iteration: 12 of max_iter: 50, perplexity: 1887.7955
iteration: 13 of max_iter: 50, perplexity: 1883.8489
iteration: 14 of max_iter: 50, perplexity: 1880.3461
iteration: 15 of max_iter: 50, perplexity: 1877.2093
iteration: 16 of max_iter: 50, perplexity: 1874.3785
iteration: 17 of max_iter: 50, perplexity: 1871.8066
iteration: 18 of max_iter: 50, perplexity: 1869.4561
iteration: 19 of max_iter: 50, perplexity: 1867.2967
iteration: 20 of max_iter: 50, perplexity: 1865.3035
iteration: 21 of max_iter: 50, perplexity: 1863.4561
iteration: 22 of max_iter: 50, perplexity: 1861.7373
iteration: 23 of max_iter: 50, perplexity: 1860.1326
iteration: 24 of max_iter: 50, perplexity: 1858.6298
iteration: 25 of max_iter: 50, perplexity: 1857.2184
iteration: 26 of max_iter: 50, perplexity: 1855.

iteration: 12 of max_iter: 50, perplexity: 1680.7594
iteration: 13 of max_iter: 50, perplexity: 1679.3355
iteration: 14 of max_iter: 50, perplexity: 1677.9104
iteration: 15 of max_iter: 50, perplexity: 1676.6028
iteration: 16 of max_iter: 50, perplexity: 1675.4568
iteration: 17 of max_iter: 50, perplexity: 1674.4239
iteration: 18 of max_iter: 50, perplexity: 1673.5735
iteration: 19 of max_iter: 50, perplexity: 1672.7888
iteration: 20 of max_iter: 50, perplexity: 1671.9201
iteration: 21 of max_iter: 50, perplexity: 1671.2140
iteration: 22 of max_iter: 50, perplexity: 1670.5005
iteration: 23 of max_iter: 50, perplexity: 1669.7670
iteration: 24 of max_iter: 50, perplexity: 1669.0491
iteration: 25 of max_iter: 50, perplexity: 1668.6325
iteration: 26 of max_iter: 50, perplexity: 1667.9406
iteration: 27 of max_iter: 50, perplexity: 1667.5614
iteration: 28 of max_iter: 50, perplexity: 1666.7094
iteration: 29 of max_iter: 50, perplexity: 1666.2089
iteration: 30 of max_iter: 50, perplexity: 166

iteration: 12 of max_iter: 50, perplexity: 1758.0319
iteration: 13 of max_iter: 50, perplexity: 1754.8584
iteration: 14 of max_iter: 50, perplexity: 1752.0169
iteration: 15 of max_iter: 50, perplexity: 1749.5124
iteration: 16 of max_iter: 50, perplexity: 1747.2363
iteration: 17 of max_iter: 50, perplexity: 1745.1307
iteration: 18 of max_iter: 50, perplexity: 1743.2056
iteration: 19 of max_iter: 50, perplexity: 1741.4323
iteration: 20 of max_iter: 50, perplexity: 1739.8126
iteration: 21 of max_iter: 50, perplexity: 1738.2905
iteration: 22 of max_iter: 50, perplexity: 1736.8640
iteration: 23 of max_iter: 50, perplexity: 1735.5882
iteration: 24 of max_iter: 50, perplexity: 1734.3566
iteration: 25 of max_iter: 50, perplexity: 1733.2067
iteration: 26 of max_iter: 50, perplexity: 1732.1225
iteration: 27 of max_iter: 50, perplexity: 1731.0668
iteration: 28 of max_iter: 50, perplexity: 1730.0983
iteration: 29 of max_iter: 50, perplexity: 1729.1739
iteration: 30 of max_iter: 50, perplexity: 172

iteration: 12 of max_iter: 50, perplexity: 1819.8848
iteration: 13 of max_iter: 50, perplexity: 1816.1004
iteration: 14 of max_iter: 50, perplexity: 1812.7418
iteration: 15 of max_iter: 50, perplexity: 1809.7342
iteration: 16 of max_iter: 50, perplexity: 1807.0203
iteration: 17 of max_iter: 50, perplexity: 1804.5547
iteration: 18 of max_iter: 50, perplexity: 1802.3016
iteration: 19 of max_iter: 50, perplexity: 1800.2317
iteration: 20 of max_iter: 50, perplexity: 1798.3213
iteration: 21 of max_iter: 50, perplexity: 1796.5506
iteration: 22 of max_iter: 50, perplexity: 1794.9033
iteration: 23 of max_iter: 50, perplexity: 1793.3654
iteration: 24 of max_iter: 50, perplexity: 1791.9252
iteration: 25 of max_iter: 50, perplexity: 1790.5727
iteration: 26 of max_iter: 50, perplexity: 1789.2991
iteration: 27 of max_iter: 50, perplexity: 1788.0970
iteration: 28 of max_iter: 50, perplexity: 1786.9598
iteration: 29 of max_iter: 50, perplexity: 1785.8818
iteration: 30 of max_iter: 50, perplexity: 178

iteration: 12 of max_iter: 50, perplexity: 1655.4807
iteration: 13 of max_iter: 50, perplexity: 1654.0743
iteration: 14 of max_iter: 50, perplexity: 1652.7439
iteration: 15 of max_iter: 50, perplexity: 1651.6469
iteration: 16 of max_iter: 50, perplexity: 1650.5525
iteration: 17 of max_iter: 50, perplexity: 1649.6242
iteration: 18 of max_iter: 50, perplexity: 1648.7073
iteration: 19 of max_iter: 50, perplexity: 1647.8897
iteration: 20 of max_iter: 50, perplexity: 1647.1447
iteration: 21 of max_iter: 50, perplexity: 1646.4389
iteration: 22 of max_iter: 50, perplexity: 1645.7747
iteration: 23 of max_iter: 50, perplexity: 1645.1901
iteration: 24 of max_iter: 50, perplexity: 1644.6275
iteration: 25 of max_iter: 50, perplexity: 1643.9876
iteration: 26 of max_iter: 50, perplexity: 1643.4516
iteration: 27 of max_iter: 50, perplexity: 1642.8287
iteration: 28 of max_iter: 50, perplexity: 1642.3513
iteration: 29 of max_iter: 50, perplexity: 1641.8597
iteration: 30 of max_iter: 50, perplexity: 164

iteration: 17 of max_iter: 50, perplexity: 1718.6222
iteration: 18 of max_iter: 50, perplexity: 1716.7087
iteration: 19 of max_iter: 50, perplexity: 1714.9857
iteration: 20 of max_iter: 50, perplexity: 1713.3983
iteration: 21 of max_iter: 50, perplexity: 1711.9178
iteration: 22 of max_iter: 50, perplexity: 1710.5349
iteration: 23 of max_iter: 50, perplexity: 1709.2692
iteration: 24 of max_iter: 50, perplexity: 1708.0795
iteration: 25 of max_iter: 50, perplexity: 1706.9467
iteration: 26 of max_iter: 50, perplexity: 1705.8955
iteration: 27 of max_iter: 50, perplexity: 1704.8979
iteration: 28 of max_iter: 50, perplexity: 1703.9333
iteration: 29 of max_iter: 50, perplexity: 1703.0379
iteration: 30 of max_iter: 50, perplexity: 1702.2035
iteration: 31 of max_iter: 50, perplexity: 1701.3764
iteration: 32 of max_iter: 50, perplexity: 1700.5744
iteration: 33 of max_iter: 50, perplexity: 1699.8306
iteration: 34 of max_iter: 50, perplexity: 1699.1310
iteration: 35 of max_iter: 50, perplexity: 169

iteration: 21 of max_iter: 50, perplexity: 1769.1372
iteration: 22 of max_iter: 50, perplexity: 1767.5244
iteration: 23 of max_iter: 50, perplexity: 1766.0188
iteration: 24 of max_iter: 50, perplexity: 1764.6089
iteration: 25 of max_iter: 50, perplexity: 1763.2849
iteration: 26 of max_iter: 50, perplexity: 1762.0381
iteration: 27 of max_iter: 50, perplexity: 1760.8614
iteration: 28 of max_iter: 50, perplexity: 1759.7483
iteration: 29 of max_iter: 50, perplexity: 1758.6931
iteration: 30 of max_iter: 50, perplexity: 1757.6910
iteration: 31 of max_iter: 50, perplexity: 1756.7375
iteration: 32 of max_iter: 50, perplexity: 1755.8287
iteration: 33 of max_iter: 50, perplexity: 1754.9613
iteration: 34 of max_iter: 50, perplexity: 1754.1321
iteration: 35 of max_iter: 50, perplexity: 1753.3383
iteration: 36 of max_iter: 50, perplexity: 1752.5774
iteration: 37 of max_iter: 50, perplexity: 1751.8473
iteration: 38 of max_iter: 50, perplexity: 1751.1457
iteration: 39 of max_iter: 50, perplexity: 175

iteration: 42 of max_iter: 50, perplexity: 1593.1653
iteration: 43 of max_iter: 50, perplexity: 1592.9930
iteration: 44 of max_iter: 50, perplexity: 1592.7847
iteration: 45 of max_iter: 50, perplexity: 1592.4645
iteration: 46 of max_iter: 50, perplexity: 1592.2465
iteration: 47 of max_iter: 50, perplexity: 1592.0909
iteration: 48 of max_iter: 50, perplexity: 1591.8762
iteration: 49 of max_iter: 50, perplexity: 1591.6279
iteration: 50 of max_iter: 50, perplexity: 1591.5501
done in 1537.382s.
Fitting LDA models with tf features, n_samples=88481 and n_features=5100
iteration: 1 of max_iter: 50, perplexity: 1716.1992
iteration: 2 of max_iter: 50, perplexity: 1655.0351
iteration: 3 of max_iter: 50, perplexity: 1630.8101
iteration: 4 of max_iter: 50, perplexity: 1616.8932
iteration: 5 of max_iter: 50, perplexity: 1607.7668
iteration: 6 of max_iter: 50, perplexity: 1601.3444
iteration: 7 of max_iter: 50, perplexity: 1596.2584
iteration: 8 of max_iter: 50, perplexity: 1592.5455
iteration: 9 of

iteration: 45 of max_iter: 50, perplexity: 1650.1715
iteration: 46 of max_iter: 50, perplexity: 1649.7454
iteration: 47 of max_iter: 50, perplexity: 1649.3228
iteration: 48 of max_iter: 50, perplexity: 1648.9238
iteration: 49 of max_iter: 50, perplexity: 1648.5396
iteration: 50 of max_iter: 50, perplexity: 1648.1693
done in 1727.193s.
Fitting LDA models with tf features, n_samples=88481 and n_features=5100
iteration: 1 of max_iter: 50, perplexity: 1965.3490
iteration: 2 of max_iter: 50, perplexity: 1863.4540
iteration: 3 of max_iter: 50, perplexity: 1821.1535
iteration: 19 of max_iter: 50, perplexity: 1720.3836
iteration: 20 of max_iter: 50, perplexity: 1718.8433
iteration: 21 of max_iter: 50, perplexity: 1717.4156
iteration: 22 of max_iter: 50, perplexity: 1716.0900
iteration: 23 of max_iter: 50, perplexity: 1714.8509
iteration: 24 of max_iter: 50, perplexity: 1713.6929
iteration: 25 of max_iter: 50, perplexity: 1712.6041
iteration: 26 of max_iter: 50, perplexity: 1711.5812
iteration:

iteration: 11 of max_iter: 50, perplexity: 1775.6856
iteration: 12 of max_iter: 50, perplexity: 1771.5271
iteration: 13 of max_iter: 50, perplexity: 1767.8781
iteration: 14 of max_iter: 50, perplexity: 1764.6421
iteration: 15 of max_iter: 50, perplexity: 1761.7463
iteration: 16 of max_iter: 50, perplexity: 1759.1348
iteration: 17 of max_iter: 50, perplexity: 1756.7636
iteration: 18 of max_iter: 50, perplexity: 1754.5978
iteration: 19 of max_iter: 50, perplexity: 1752.6090
iteration: 20 of max_iter: 50, perplexity: 1750.7742
iteration: 21 of max_iter: 50, perplexity: 1749.0743
iteration: 22 of max_iter: 50, perplexity: 1747.4934
iteration: 23 of max_iter: 50, perplexity: 1746.0181
iteration: 24 of max_iter: 50, perplexity: 1744.6369
iteration: 25 of max_iter: 50, perplexity: 1743.3402
iteration: 26 of max_iter: 50, perplexity: 1742.1196
iteration: 27 of max_iter: 50, perplexity: 1740.9677
iteration: 28 of max_iter: 50, perplexity: 1739.8783
iteration: 29 of max_iter: 50, perplexity: 173

iteration: 34 of max_iter: 50, perplexity: 1585.2437
iteration: 35 of max_iter: 50, perplexity: 1584.8154
iteration: 36 of max_iter: 50, perplexity: 1584.4090
iteration: 37 of max_iter: 50, perplexity: 1583.9970
iteration: 38 of max_iter: 50, perplexity: 1583.5975
iteration: 39 of max_iter: 50, perplexity: 1583.2457
iteration: 40 of max_iter: 50, perplexity: 1582.8719
iteration: 41 of max_iter: 50, perplexity: 1582.5064
iteration: 42 of max_iter: 50, perplexity: 1582.1912
iteration: 43 of max_iter: 50, perplexity: 1581.8673
iteration: 44 of max_iter: 50, perplexity: 1581.5399
iteration: 45 of max_iter: 50, perplexity: 1581.2411
iteration: 46 of max_iter: 50, perplexity: 1580.9303
iteration: 47 of max_iter: 50, perplexity: 1580.6567
iteration: 48 of max_iter: 50, perplexity: 1580.3668
iteration: 49 of max_iter: 50, perplexity: 1580.0497
iteration: 50 of max_iter: 50, perplexity: 1579.7786
done in 1623.557s.
Fitting LDA models with tf features, n_samples=88481 and n_features=5100
iterati

iteration: 39 of max_iter: 50, perplexity: 1692.1948
iteration: 40 of max_iter: 50, perplexity: 1691.6084
iteration: 41 of max_iter: 50, perplexity: 1691.0434
iteration: 42 of max_iter: 50, perplexity: 1690.4984
iteration: 43 of max_iter: 50, perplexity: 1689.9723
iteration: 44 of max_iter: 50, perplexity: 1689.4639
iteration: 45 of max_iter: 50, perplexity: 1688.9723
iteration: 46 of max_iter: 50, perplexity: 1688.4966
iteration: 47 of max_iter: 50, perplexity: 1688.0359
iteration: 48 of max_iter: 50, perplexity: 1687.5894
iteration: 49 of max_iter: 50, perplexity: 1687.1564
iteration: 50 of max_iter: 50, perplexity: 1686.7362
done in 1715.241s.
Fitting LDA models with tf features, n_samples=88481 and n_features=5100
iteration: 1 of max_iter: 50, perplexity: 1990.0500
iteration: 2 of max_iter: 50, perplexity: 1874.0394
iteration: 3 of max_iter: 50, perplexity: 1826.4956
iteration: 4 of max_iter: 50, perplexity: 1799.6174
iteration: 5 of max_iter: 50, perplexity: 1781.9400
iteration: 6

### results

In [None]:
tmp_results

### 各トピックの代表的なプロフィールを取り出す

In [None]:
def get_top_prof (topic_idx, top_n = 100, model=None):
    """Return the raw profile texts that load most heavily on one topic.

    Args:
        topic_idx: column index of the topic in the document-topic matrix.
        top_n: number of profiles to return.
        model: fitted model exposing ``transform``; when omitted, the global
            ``lda`` is used.

    Returns:
        list: up to ``top_n`` entries of ``docs``, highest topic weight first.

    Raises:
        ValueError: if the number of non-missing entries in ``docs`` differs
            from the number of rows of ``X``.
    """
    if model is None:
        model = lda

    # The rows of X were built only from descriptions that are not missing, so
    # row numbers must be looked up in the same filtered list, not in `docs`.
    kept_docs = [d for d in docs if str(d) != "nan"]

    topics = model.transform(X)
    if len(kept_docs) != topics.shape[0]:
        raise ValueError(
            "docs and X are out of sync: {0} non-missing docs vs {1} rows".format(
                len(kept_docs), topics.shape[0]))

    prof_idx_list = topics[:, topic_idx].argsort()[:-top_n - 1:-1]
    return [kept_docs[d] for d in prof_idx_list]

In [None]:
# get_top_prof(0, 20)

In [None]:
# get_top_prof(1, 20)

In [None]:
# get_top_prof(2, 20)

In [None]:
# get_top_prof(3, 20)

In [None]:
# get_top_prof(4, 20)

In [None]:
# get_top_prof(5, 20)

In [None]:
nownow_file = (datetime.datetime.now() + datetime.timedelta(hours=9) ).strftime('%m%d_%H%M')+"topic_modeling.ipynb"

!cp ./topic_modeling.ipynb ./jupyter_backup_for_param/$nownow_file

In [None]:
file_name = (datetime.datetime.now() + datetime.timedelta(hours=9) ) .strftime('%m%d_%H%M')+"_output.txt"