<a href="https://colab.research.google.com/github/tomonari-masada/course-nlp2020/blob/master/11_topic_modeling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# 標準使用ライブラリー
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# Suppress warnings 
import warnings
warnings.filterwarnings('ignore')
import gc
import os
import shutil
from icecream import ic
from tqdm import tqdm_notebook as tqdm 



# 追記
import json
import datetime
import math

# debug
#%pdb on

import pixiedust #%pixie_debugger

# tfがエラーはかないため
# tfがエラーはかないため
#import tensorflow as tf
#import os
#os.environ["CUDA_VISIBLE_DEVICES"]="5"
#physical_devices = tf.config.experimental.list_physical_devices('GPU')
#tf.config.experimental.set_memory_growth(physical_devices[0], True)


Pixiedust database opened successfully


In [2]:
import pandas as pd
import re
import emoji
from wordcloud import WordCloud, STOPWORDS

# Load the scraped results; only the "description" column is used below.
df = pd.read_csv("./result0605.csv", engine='python')

# NOTE(review): this expression has no effect when it is not the last line of the cell.
type(df["description"])
# One entry per CSV row; missing descriptions are still present here (as NaN).
docs = df["description"].to_list()

In [3]:
print(len(docs))

93794


### Neologdを使ってtokenizeする

In [4]:

import subprocess
import MeCab

def make_neologd_tagger():
    """Build a MeCab tagger backed by the mecab-ipadic-neologd dictionary.

    The dictionary directory is derived from ``mecab-config --dicdir`` with
    ``/mecab-ipadic-neologd`` appended.

    Returns:
        MeCab.Tagger: tagger created with ``-Ochasen -d <dictionary path>``.

    Raises:
        FileNotFoundError: if ``mecab-config`` is not on the PATH.
        subprocess.CalledProcessError: if ``mecab-config`` exits with an error.
    """
    # Call mecab-config directly (no shell) and strip the trailing newline,
    # which would otherwise end up inside the tagger argument string.
    dicdir = subprocess.check_output(["mecab-config", "--dicdir"]).decode('utf-8').strip()
    path_neologd = dicdir + "/mecab-ipadic-neologd"
    return MeCab.Tagger("-Ochasen -d " + path_neologd)


def neolog_prep_text( text, m):
    """Return the noun surface forms found in ``text``.

    Args:
        text: string handed to ``m.parse``.
        m: object exposing ``parse(text)`` that returns one token per line
            (the notebook passes a MeCab tagger created with ``-Ochasen``).

    Returns:
        list[str]: first field of every kept line, in order of appearance.
    """
    skipped_surfaces = ('EOS', '', 't', 'ー')
    nouns = []

    for line in m.parse(text).split('\n'):
        # Fields are separated by tabs or commas; field 0 is the surface form.
        fields = re.split('[\t,]', line)
        surface = fields[0]
        if surface in skipped_surfaces:
            continue
        # Field 3 is matched against '名詞' (noun) - presumably the part-of-speech
        # column of the ChaSen-style output; emoji surfaces are dropped as well.
        if re.match('名詞', fields[3]) and surface not in emoji.UNICODE_EMOJI["en"]:
            nouns.append(surface)

    return nouns


* tokenizationの実行

In [5]:
from tqdm import tqdm


# Build the tagger once and reuse it for every document.
m = make_neologd_tagger()

# Tokenize each description. Missing values stringify to "nan" and are skipped,
# so new_docs can be shorter than docs and their indices do not line up.
new_docs = []
for doc in tqdm(docs):
  doc_text = str(doc)
  if doc_text != "nan":
    new_docs.append(neolog_prep_text(doc_text, m))
  

100%|██████████| 93794/93794 [00:23<00:00, 3931.52it/s]


* tokenizationの結果を確認

In [6]:
print(new_docs[5])

['過去', 'ジャパリカート', '動画', 'TSUMURI', 'KART', 'VRChat', 'ワリスノ', 'MK', '8', 'DX', '一位', 'りし', 'た人', '社会', '出て', '配信', 'https', 'co', 'FJoitl', '8', 'JHE', 'ヘッダ', '猫', '飼い主', 'smmmmm']


* 各文書を長い文字列で表しなおす（CountVectorizerを後で使うため）

In [7]:
corpus = [' '.join(doc) for doc in new_docs]

## 11-02 データ行列の作成
* LDAの場合、単に単語の出現頻度を重みとして各文書をベクトル化する。

### sklearnのCountVectorizerで疎行列化する

* 全文書の半分より多い文書に現れる単語は、高頻度語とみなして削除する。
* 30件未満の文書にしか現れない単語は、低頻度語とみなして削除する。

In [8]:
import os
import urllib.request
def download_stopwords(path):
    """Download the SlothLib Japanese stop-word list to ``path`` if it is absent.

    Args:
        path: local file name to check and, when missing, to write.

    Prints a short status message in either case and returns None.
    """
    url = 'http://svn.sourceforge.jp/svnroot/slothlib/CSharp/Version1/SlothLib/NLP/Filter/StopWord/word/Japanese.txt'
    if not os.path.exists(path):
        print('Downloading...')
        # Fetch `url` and store it locally under `path`.
        urllib.request.urlretrieve(url, path)
        return
    print('File already exists.')

def create_stopwords(file_path):
    """Read a stop-word list (one word per line) from ``file_path``.

    Args:
        file_path: path of the UTF-8 text file to read.

    Returns:
        list[str]: the non-empty lines with their line break removed, in file order.
    """
    stop_words = []
    # Read from the argument (the original read the global `path` instead) and
    # close the file deterministically.
    with open(file_path, "r", encoding="utf-8") as stopword_file:
        for line in stopword_file:
            w = line.replace('\n','')
            if len(w) > 0:
                stop_words.append(w)
    return stop_words

# Local cache file for the stop-word list; it is downloaded only when missing.
path = "stop_words.txt"
download_stopwords(path)
# List of stop words, later passed to CountVectorizer.
stop_words = create_stopwords(path)

File already exists.


In [9]:
from sklearn.feature_extraction.text import CountVectorizer

# Minimum document frequency: words appearing in fewer than this many documents are dropped.
MIN_DF = 30

# max_df=0.5 drops words that occur in more than half of the documents.
vectorizer = CountVectorizer(max_df=0.5, min_df= MIN_DF, stop_words=stop_words)

# Sparse document-term count matrix; one row per entry of `corpus`.
X = vectorizer.fit_transform(corpus)

In [10]:
print(len(vectorizer.get_feature_names()))

5100


* 文書数と語彙サイズを変数にセット

In [11]:
n_samples, n_features = X.shape

### TF-IDFで各文書における単語の重みを計算する

In [12]:
from sklearn.feature_extraction.text import TfidfTransformer

# Re-weight the raw counts in X with tf-idf (default TfidfTransformer settings).
tfidf = TfidfTransformer()
# Same shape as X; passed as `dtm` to the coherence metric in lda_main.
Xtfidf = tfidf.fit_transform(X)

In [13]:
print(Xtfidf[0])

  (0, 4227)	0.603862693464167
  (0, 4148)	0.4706802938434637
  (0, 3490)	0.1847556021921528
  (0, 3414)	0.3341859275888348
  (0, 3402)	0.3582132311126841
  (0, 1747)	0.24047370992039763
  (0, 1109)	0.2860956441170824


In [14]:
Xtfidf.shape

(88481, 5100)

### LDAのインポート

In [15]:
from sklearn.decomposition import NMF, LatentDirichletAllocation
from time import time
from matplotlib.backends.backend_pdf import PdfPages
import datetime
from tmtoolkit.topicmod.evaluate import metric_coherence_gensim

### トピックの重要語を取り出す関数の定義

In [16]:
def get_top_words(model, feature_names, n_top_words=30):
  """Collect the highest-weighted words of every topic in ``model``.

  Args:
    model: fitted object whose ``components_`` holds one weight row per topic.
    feature_names: sequence mapping a column index to its word.
    n_top_words: how many words to keep per topic.

  Returns:
    (top_features, weights): two lists with one entry per topic - the words
    ordered by descending weight, and the matching weight arrays.
  """
  top_features = []
  weights = []
  for topic in model.components_:
    # Ascending argsort reversed, then truncated: indices of the largest weights first.
    ranked = topic.argsort()[::-1][:n_top_words]
    words = []
    for column in ranked:
      words.append(feature_names[column])
    top_features.append(words)
    weights.append(topic[ranked])
  return top_features, weights

# LDAでトピック抽出

### LDAによるトピック抽出の実行

In [17]:
def make_word_cloud(n_components, lda):
    # matplotlib and seaborn for plotting
    import matplotlib.pyplot as plt
    %matplotlib inline
    import seaborn as sns
    plt.style.use('dark_background')
    top_words, weights = get_top_words(lda, vectorizer.get_feature_names())
    topic_words = [dict(zip(top_words[i], weights[i])) for i in range(n_components)]
    FONT_PATH = "/usr/share/fonts/opentype/ipaexfont-mincho/ipaexm.ttf"
    cloud = WordCloud(stopwords=STOPWORDS,
                  font_path=FONT_PATH,
                  background_color='white',
                  width=2500,
                  height=1800,
                  max_words=100,
                  colormap='tab10'
                  )

    tate = math.ceil(n_components / 2)
    fig, axes = plt.subplots(tate, 2, figsize=(32, 50), sharex=True, sharey=True)

    for i, ax in enumerate(axes.flatten()):

        if i > len(topic_words)-1:
            break

        fig.add_subplot(ax)
        cloud.generate_from_frequencies(topic_words[i], max_font_size=500)
        plt.gca().imshow(cloud)
        plt.gca().set_title('Topic ' + str(i), fontdict=dict(size=16))
        plt.gca().axis('off')

    plt.subplots_adjust(wspace=0, hspace=0)
    plt.axis('off')
    plt.margins(x=0, y=0)
    plt.tight_layout()

    pdf = PdfPages( 
        (datetime.datetime.now() + datetime.timedelta(hours=9) ) .strftime('%m%d_%H%M') + 'topic.pdf')


    fignums = plt.get_fignums()
    for fignum in fignums:
        plt.figure(fignum)
        pdf.savefig()

    pdf.close()



In [18]:
import logging
import pickle
import itertools

def lda_main(batch_size, n_components, topic_word_prior, doc_topic_prior, max_iter=30):
    """Fit one online LDA model on the notebook-level ``X`` and store its artifacts.

    A folder named after the current timestamp (``mmdd_HHMM``, now + 9 hours) is
    created under the working directory. The run writes ``mylog.log``, the topic
    word-cloud PDF and a pickle of the fitted model into it.

    Args:
        batch_size: mini-batch size for online learning.
        n_components: number of topics.
        topic_word_prior: prior of the topic-word distribution.
        doc_topic_prior: prior of the document-topic distribution.
        max_iter: maximum number of passes over ``X``.

    Returns:
        dict: ``{"perplexity": ..., "coherance": ...}`` for the fitted model.

    Raises:
        FileExistsError: if the timestamped folder already exists.

    The working directory and the root logger's handlers are restored even when
    the run fails, so one run never writes into another run's log file.
    """
    folder_name = (datetime.datetime.now() + datetime.timedelta(hours=9) ) .strftime('%m%d_%H%M')

    # Create the per-run output folder and work inside it.
    start_dir = os.getcwd()
    os.mkdir("./"+folder_name)
    os.chdir("./"+folder_name)

    logger = logging.getLogger()
    fhandler = None
    try:
        # logging: a per-run file handler attached to the root logger
        fhandler = logging.FileHandler(filename='mylog.log', mode='a')
        formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
        fhandler.setFormatter(formatter)
        logger.addHandler(fhandler)
        logger.setLevel(logging.WARNING)

        lda = LatentDirichletAllocation(n_components=n_components,
                                        max_iter=max_iter,
                                        # rule of thumb: about 1 / n_components; try 0.01, 0.02, 0.05, 0.1, ...
                                        topic_word_prior=topic_word_prior,
                                        doc_topic_prior=doc_topic_prior,
                                        learning_method='online',
                                        learning_offset=50,
                                        batch_size=batch_size,  # use larger values
                                        mean_change_tol=1e-4,
                                        random_state=1,
                                        evaluate_every=1,
                                        verbose=1)
        print((f"Fitting LDA models with tf features, "
        f"n_samples={n_samples} and n_features={n_features}"))
        t0 = time()
        lda.fit(X)
        print(f"done in {time() - t0:0.3f}s.")

        # vocab[j] must be the word of column j of lda.components_.
        # vocabulary_.keys() is in insertion order, not column order, so sort
        # the words by their column index.
        vocab = np.array(sorted(vectorizer.vocabulary_, key=vectorizer.vocabulary_.get))
        coherance = metric_coherence_gensim(measure='c_v',
                            # top_n is left at its default of 20
                            topic_word_distrib=lda.components_,
                            dtm=Xtfidf,  # tf-idf weighted matrix
                            vocab=vocab,
                            texts=new_docs)

        # Parameter settings are compared by perplexity (computed on the training matrix X).
        results = {
                "perplexity" : lda.perplexity(X) ,
                "coherance": coherance,
            }

        logger.warning('MIN_DF:{0}'.format(MIN_DF) )
        logger.warning('params:batch_size:{0}'.format(batch_size))
        logger.warning('params:n_components:{0}'.format(n_components))
        logger.warning('params:topic_word_prior:{0}'.format(topic_word_prior))
        logger.warning('params:doc_topic_prior:{0}'.format(doc_topic_prior))
        logger.warning('params:max_iter:{0}'.format(max_iter))
        logger.warning('done n_iter:{0}'.format(lda.n_iter_))
        logger.warning('perplexity:{0}'.format(results["perplexity"]))
        logger.warning('coherance:{0}'.format(results["coherance"]) )
        logger.warning('check all params:{0}'.format(lda.get_params() ))
        make_word_cloud(n_components, lda)

        # Persist the fitted model next to its log and figure.
        file_name = (datetime.datetime.now() + datetime.timedelta(hours=9) ) .strftime('%m%d_%H%M') + '_lda.pickle'
        with open(file_name, mode="wb") as f:
            pickle.dump(lda, f)

        return(results)
    finally:
        # Detach and close this run's handler so the next run does not also
        # write into this run's log, and no file handle is leaked.
        if fhandler is not None:
            logger.removeHandler(fhandler)
            fhandler.close()
        # Return to the directory the run started in, even on failure.
        os.chdir(start_dir)
    

In [None]:
# Grid search over batch size, number of topics and both Dirichlet priors.
# Every run's scores are kept with its parameters so the results cell can read
# `tmp_results`; the original discarded the return value of lda_main.
tmp_results = []
for batch_size,n_components, topic_word_prior, doc_topic_prior in itertools.product([1500, 2500, 4000,5000 ] ,[6,7,8,9,10],
                                                                                    [0.05, 0.15, 0.3, 0.4, 0.5, 0.6, 0.8],[0.05, 0.15, 0.3, 0.4, 0.5, 0.6, 0.8] ):

    run_scores = lda_main(n_components=n_components, topic_word_prior=topic_word_prior, doc_topic_prior = doc_topic_prior, batch_size=batch_size, max_iter=50)
    tmp_results.append(dict(batch_size=batch_size,
                            n_components=n_components,
                            topic_word_prior=topic_word_prior,
                            doc_topic_prior=doc_topic_prior,
                            **run_scores))
    

Fitting LDA models with tf features, n_samples=88481 and n_features=5100
iteration: 1 of max_iter: 50, perplexity: 1646.4481
iteration: 2 of max_iter: 50, perplexity: 1572.3380
iteration: 3 of max_iter: 50, perplexity: 1546.3431
iteration: 4 of max_iter: 50, perplexity: 1533.4905
iteration: 5 of max_iter: 50, perplexity: 1526.0198
iteration: 6 of max_iter: 50, perplexity: 1521.8285
iteration: 7 of max_iter: 50, perplexity: 1518.9320
iteration: 8 of max_iter: 50, perplexity: 1517.0897
iteration: 9 of max_iter: 50, perplexity: 1515.6519
iteration: 10 of max_iter: 50, perplexity: 1514.5505
iteration: 15 of max_iter: 50, perplexity: 1511.7782
iteration: 16 of max_iter: 50, perplexity: 1511.4006
iteration: 17 of max_iter: 50, perplexity: 1511.1495
iteration: 18 of max_iter: 50, perplexity: 1510.9151
iteration: 19 of max_iter: 50, perplexity: 1510.7403
iteration: 20 of max_iter: 50, perplexity: 1510.6029
iteration: 21 of max_iter: 50, perplexity: 1510.4800
iteration: 22 of max_iter: 50, perp

iteration: 6 of max_iter: 50, perplexity: 1467.1818
iteration: 7 of max_iter: 50, perplexity: 1458.7508
iteration: 8 of max_iter: 50, perplexity: 1452.4079
iteration: 9 of max_iter: 50, perplexity: 1447.4833
iteration: 10 of max_iter: 50, perplexity: 1443.5597
iteration: 11 of max_iter: 50, perplexity: 1440.3666
iteration: 12 of max_iter: 50, perplexity: 1437.7084
iteration: 13 of max_iter: 50, perplexity: 1435.4660
iteration: 14 of max_iter: 50, perplexity: 1433.5452
iteration: 15 of max_iter: 50, perplexity: 1431.9043
iteration: 16 of max_iter: 50, perplexity: 1430.4874
iteration: 17 of max_iter: 50, perplexity: 1429.2475
iteration: 18 of max_iter: 50, perplexity: 1428.1510
iteration: 19 of max_iter: 50, perplexity: 1427.1729
iteration: 20 of max_iter: 50, perplexity: 1426.2944
iteration: 21 of max_iter: 50, perplexity: 1425.5011
iteration: 22 of max_iter: 50, perplexity: 1424.7818
iteration: 23 of max_iter: 50, perplexity: 1424.1267
iteration: 24 of max_iter: 50, perplexity: 1423.52

iteration: 11 of max_iter: 50, perplexity: 1479.9714
iteration: 12 of max_iter: 50, perplexity: 1479.3370
iteration: 13 of max_iter: 50, perplexity: 1478.9155
iteration: 14 of max_iter: 50, perplexity: 1478.7006
Fitting LDA models with tf features, n_samples=88481 and n_features=5100
iteration: 1 of max_iter: 50, perplexity: 1602.2171
iteration: 2 of max_iter: 50, perplexity: 1529.5834
iteration: 3 of max_iter: 50, perplexity: 1499.9005
iteration: 4 of max_iter: 50, perplexity: 1483.8807
iteration: 5 of max_iter: 50, perplexity: 1474.6127
iteration: 6 of max_iter: 50, perplexity: 1468.6322
iteration: 7 of max_iter: 50, perplexity: 1464.4918
iteration: 8 of max_iter: 50, perplexity: 1461.5046
iteration: 9 of max_iter: 50, perplexity: 1459.3353
iteration: 10 of max_iter: 50, perplexity: 1457.6720
iteration: 11 of max_iter: 50, perplexity: 1456.3882
iteration: 12 of max_iter: 50, perplexity: 1455.3643
iteration: 13 of max_iter: 50, perplexity: 1454.5423
iteration: 14 of max_iter: 50, perp

iteration: 22 of max_iter: 50, perplexity: 1429.3765
iteration: 23 of max_iter: 50, perplexity: 1428.8204
iteration: 24 of max_iter: 50, perplexity: 1428.3092
iteration: 25 of max_iter: 50, perplexity: 1427.8374
iteration: 26 of max_iter: 50, perplexity: 1427.4026
iteration: 27 of max_iter: 50, perplexity: 1427.0015
iteration: 28 of max_iter: 50, perplexity: 1426.6313
iteration: 29 of max_iter: 50, perplexity: 1426.2890
iteration: 30 of max_iter: 50, perplexity: 1425.9714
iteration: 31 of max_iter: 50, perplexity: 1425.6759
iteration: 32 of max_iter: 50, perplexity: 1425.3998
iteration: 33 of max_iter: 50, perplexity: 1425.1409
iteration: 34 of max_iter: 50, perplexity: 1424.8974
iteration: 35 of max_iter: 50, perplexity: 1424.6676
iteration: 36 of max_iter: 50, perplexity: 1424.4502
iteration: 37 of max_iter: 50, perplexity: 1424.2443
iteration: 38 of max_iter: 50, perplexity: 1424.0492
iteration: 39 of max_iter: 50, perplexity: 1423.8640
iteration: 40 of max_iter: 50, perplexity: 142

iteration: 31 of max_iter: 50, perplexity: 1430.2317
iteration: 32 of max_iter: 50, perplexity: 1430.0419
iteration: 33 of max_iter: 50, perplexity: 1429.8400
iteration: 34 of max_iter: 50, perplexity: 1429.6003
iteration: 35 of max_iter: 50, perplexity: 1429.4254
iteration: 36 of max_iter: 50, perplexity: 1429.2478
iteration: 37 of max_iter: 50, perplexity: 1429.0682
iteration: 38 of max_iter: 50, perplexity: 1428.9367
iteration: 39 of max_iter: 50, perplexity: 1428.8398
done in 969.194s.
Fitting LDA models with tf features, n_samples=88481 and n_features=5100
iteration: 1 of max_iter: 50, perplexity: 1589.7248
iteration: 2 of max_iter: 50, perplexity: 1519.6018
iteration: 3 of max_iter: 50, perplexity: 1490.4346
iteration: 4 of max_iter: 50, perplexity: 1474.4665
iteration: 5 of max_iter: 50, perplexity: 1464.1777
iteration: 6 of max_iter: 50, perplexity: 1457.2610
iteration: 7 of max_iter: 50, perplexity: 1452.4240
iteration: 8 of max_iter: 50, perplexity: 1448.8771
iteration: 9 of 

iteration: 49 of max_iter: 50, perplexity: 1462.5236
iteration: 50 of max_iter: 50, perplexity: 1462.3845
done in 1275.039s.
Fitting LDA models with tf features, n_samples=88481 and n_features=5100
iteration: 1 of max_iter: 50, perplexity: 1631.1887
iteration: 2 of max_iter: 50, perplexity: 1556.8661
iteration: 7 of max_iter: 50, perplexity: 1471.4922
iteration: 8 of max_iter: 50, perplexity: 1466.3827
iteration: 9 of max_iter: 50, perplexity: 1462.2446
iteration: 10 of max_iter: 50, perplexity: 1458.6641
iteration: 11 of max_iter: 50, perplexity: 1455.6580
iteration: 12 of max_iter: 50, perplexity: 1453.2020
iteration: 13 of max_iter: 50, perplexity: 1451.1386
iteration: 14 of max_iter: 50, perplexity: 1449.3722
iteration: 15 of max_iter: 50, perplexity: 1447.8550
iteration: 16 of max_iter: 50, perplexity: 1446.5431
iteration: 17 of max_iter: 50, perplexity: 1445.3950
iteration: 18 of max_iter: 50, perplexity: 1444.3755
iteration: 19 of max_iter: 50, perplexity: 1443.4538
iteration: 2

iteration: 3 of max_iter: 50, perplexity: 1475.7122
iteration: 4 of max_iter: 50, perplexity: 1460.5632
iteration: 5 of max_iter: 50, perplexity: 1451.0180
iteration: 6 of max_iter: 50, perplexity: 1444.4273
iteration: 7 of max_iter: 50, perplexity: 1439.4571
iteration: 8 of max_iter: 50, perplexity: 1436.0518
iteration: 9 of max_iter: 50, perplexity: 1433.3714
iteration: 10 of max_iter: 50, perplexity: 1431.0577
iteration: 11 of max_iter: 50, perplexity: 1428.9141
iteration: 12 of max_iter: 50, perplexity: 1427.4503
iteration: 13 of max_iter: 50, perplexity: 1426.2329
iteration: 14 of max_iter: 50, perplexity: 1425.2725
iteration: 15 of max_iter: 50, perplexity: 1424.4396
iteration: 16 of max_iter: 50, perplexity: 1423.7945
iteration: 17 of max_iter: 50, perplexity: 1423.2381


### results

In [None]:
tmp_results

### 各トピックの代表的なプロフィールを取り出す

In [None]:
def get_top_prof (topic_idx, top_n = 100, model=None):
    """Return the original descriptions that load most heavily on one topic.

    Args:
        topic_idx: column of the document-topic matrix to rank by.
        top_n: number of descriptions to return.
        model: fitted topic model exposing ``transform``; when omitted, the
            notebook-level ``lda`` is used.

    Returns:
        list: up to ``top_n`` entries of ``docs``, highest topic weight first.
    """
    if model is None:
        # Falls back to a global `lda`, which must be defined by the caller's session.
        model = lda

    topics = model.transform(X)
    prof_idx_list = topics[:, topic_idx].argsort()[:-top_n - 1:-1]

    # Rows of X were built only from descriptions that are not NaN, so row i of X
    # is the i-th non-NaN entry of docs, not docs[i]. Apply the same filter
    # before indexing.
    kept_docs = [d for d in docs if str(d) != "nan"]
    return [kept_docs[d] for d in prof_idx_list]

In [None]:
# get_top_prof(0, 20)

In [None]:
# get_top_prof(1, 20)

In [None]:
# get_top_prof(2, 20)

In [None]:
# get_top_prof(3, 20)

In [None]:
# get_top_prof(4, 20)

In [None]:
# get_top_prof(5, 20)

In [None]:
# Timestamped file name (mmdd_HHMM, now + 9 hours) for a backup copy of this notebook.
# NOTE(review): the +9h offset presumably converts a UTC clock to JST - confirm.
nownow_file = (datetime.datetime.now() + datetime.timedelta(hours=9) ).strftime('%m%d_%H%M')+"topic_modeling.ipynb"

# Shell escape: copy the notebook into the backup folder under the name built above.
!cp ./topic_modeling.ipynb ./jupyter_backup_for_param/$nownow_file

In [None]:
file_name = (datetime.datetime.now() + datetime.timedelta(hours=9) ) .strftime('%m%d_%H%M')+"_output.txt"