In [2]:
""" import all modules """

' import all modules '

In [3]:
# tools for importing files 
import os, io
# Chinese tokenization tool
import jieba
# vectorization tool
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
# topic modeling tool(LDA)
from sklearn.decomposition import LatentDirichletAllocation as LDA
# interactive visualization tool
import pyLDAvis
import pyLDAvis.sklearn

In [4]:
""" ALL DEFs """

' ALL DEFs '

In [5]:
# read single file
def read_txt(filepath):
    """Read a GB18030-encoded text file and return its full content.

    Parameters
    ----------
    filepath : str
        Path of the file to read.

    Returns
    -------
    str
        The decoded content of the file.
    """
    # The context manager closes the handle even when read() raises
    # (for example on a decoding error), so no file descriptor is leaked.
    with io.open(filepath, 'r', encoding='GB18030') as f:
        return f.read()

In [6]:
# read the stopword file (UTF-8 encoded, unlike the GB18030 files handled by 'read_txt')
def read_stopword(filepath):
    """Read a UTF-8 encoded stopword file and return its full content.

    Parameters
    ----------
    filepath : str
        Path of the stopword file.

    Returns
    -------
    str
        The decoded content of the file; callers split it into words.
    """
    # The context manager closes the handle even when read() raises
    # (for example on a decoding error), so no file descriptor is leaked.
    with io.open(filepath, 'r', encoding='utf-8') as f:
        return f.read()


In [7]:
# read all files
def read_dir_txt(dirpath):
    """Read every file located directly inside ``dirpath`` and return the non-empty texts.

    Each file is read with ``read_txt``. Files whose content is the empty
    string are left out of the result.

    Parameters
    ----------
    dirpath : str
        Directory containing the text files. It may be given with or without
        a trailing path separator.

    Returns
    -------
    list of str
        One entry per non-empty file, in the order returned by ``os.listdir``.
    """
    result_list = []
    for filename in os.listdir(dirpath):
        # os.path.join inserts the separator only when it is missing, so the
        # caller no longer has to remember a trailing slash on dirpath.
        filepath = os.path.join(dirpath, filename)
        # os.listdir also returns sub-directories; those cannot be read as text.
        if not os.path.isfile(filepath):
            continue
        text = read_txt(filepath)
        # Drop blank poems while reading instead of repeatedly scanning the list afterwards.
        if text != '':
            result_list.append(text)
    return result_list

In [8]:
# tokenization
def poem_cut(text):
    """Tokenize ``text`` with jieba and return the tokens as a list of strings.

    The jieba tokens are joined with single spaces and the joined string is
    then split on whitespace, so whitespace-only tokens do not appear in the
    returned list.
    """
    segments = jieba.cut(text, cut_all=False)  # cut_all=False means accurate mode
    spaced = " ".join(segments)
    return spaced.split()

In [9]:
# top words in topics
def top_words(model, feature_names, words):
    """Print the ``words`` highest-weighted feature names of every topic.

    Parameters
    ----------
    model : object
        Object exposing ``components_``; each row is treated as one topic's
        weights and must support ``argsort()``.
    feature_names : sequence of str
        Feature name for each column index of ``components_``.
    words : int
        Number of feature names to print per topic.

    Returns
    -------
    None
        The report is written to standard output, followed by one blank line.
    """
    for topic_idx, weights in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        # argsort() is ascending, so slicing backwards from the end yields the
        # indices of the largest weights first.
        best_indices = weights.argsort()[:-words - 1:-1]
        best_names = [feature_names[i] for i in best_indices]
        print(" ".join(best_names))
    print()

In [10]:
""" process """

' process '

In [11]:
# read all Tang poems into a list
# NOTE(review): `wd` is an absolute, machine-specific Windows path; it has to be
# adjusted before the notebook can run on another machine.
wd='C:\\Users\\lenovo\\Desktop\\final project'
# change the working directory so the relative `data_path` below resolves against `wd`
os.chdir(wd)
# directory holding the poem files, relative to `wd`, written with a trailing separator
data_path = 'all-TANG-poems from zhengzhou uni\\'
# one string per non-empty file, as returned by read_dir_txt
Tang_poem = read_dir_txt(data_path)

In [12]:
# clean txt: tokenize every poem and drop the stopwords
clean_tokens = []
# A set makes each membership test below constant-time instead of a linear
# scan of the whole stopword list for every single token.
stopword = set(read_stopword('C:\\Users\\lenovo\\Desktop\\final project\\stopwords.txt').split())

for poem in Tang_poem:
    tokens = poem_cut(poem)
    tokens_nostop = [token for token in tokens if token not in stopword]
    # each poem is stored as one space-separated string, the form later passed to the vectorizer
    clean_tokens.append(' '.join(tokens_nostop))

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\lenovo\AppData\Local\Temp\jieba.cache
Loading model cost 3.382 seconds.
Prefix dict has been built succesfully.


In [13]:
# vectorization: turn the cleaned poems into a document-term count matrix
n_features = 2000
tf_vectorizer = CountVectorizer(
    strip_accents='unicode',
    max_features=n_features,  # cap on the vocabulary size
    max_df=0.5,               # per scikit-learn docs: a float is a proportion of documents
    min_df=10                 # per scikit-learn docs: an int is an absolute document count
)
tf = tf_vectorizer.fit_transform(clean_tokens)

In [14]:
# topic modeling
# NOTE(review): the `n_topics` keyword matches the scikit-learn release that
# produced the recorded output; newer releases call this argument
# `n_components` - confirm the installed version before re-running.
n_topics = 6
lda = LDA(
    n_topics=n_topics,
    max_iter=50,
    learning_method='online',
    learning_offset=50.,
    random_state=0  # fixed seed so repeated runs give the same topics
)
lda.fit(tf)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='online', learning_offset=50.0,
             max_doc_update_iter=100, max_iter=50, mean_change_tol=0.001,
             n_jobs=1, n_topics=6, perp_tol=0.1, random_state=0,
             topic_word_prior=None, total_samples=1000000.0, verbose=0)

In [18]:
# print top words of topics
tf_feature_names = tf_vectorizer.get_feature_names()
words = 15
# top_words prints its report itself and returns None, so wrapping the call
# in print() would only append a stray "None" line to the output.
top_words(lda, tf_feature_names, words)

Topic #0:
人间 相逢 黄金 天子 东风 白发 行人 洞庭 浮云 风流 先生 文章 天下 风尘 潇湘
Topic #1:
白日 归去 天涯 山川 寂寥 白云 故乡 殷勤 西风 明日 时节 憔悴 江山 桃花 鸳鸯
Topic #2:
青山 江南 风吹 芙蓉 长安 回首 笙歌 万里 山水 明朝 千载 天地 楼台 百年 扁舟
Topic #3:
春风 千里 万里 悠悠 白云 相思 惆怅 寂寞 故人 主人 落日 十年 将军 苍苍 夕阳
Topic #4:
明月 可怜 流水 日暮 杨柳 落花 芳草 白头 春色 山中 凤凰 一枝 烟霞 君王 离别
Topic #5:
秋风 清风 萧条 草木 春草 沧海 人生 乾坤 四海 一朝 歌舞 桃李 太守 知己 太平

None


In [16]:
# interactive visualization
# presumably switches pyLDAvis to inline notebook rendering - confirm against the pyLDAvis docs
pyLDAvis.enable_notebook()
# kept as the last expression of the cell so the notebook displays the returned object;
# it receives the fitted model, the count matrix and the vectorizer built in the earlier cells
pyLDAvis.sklearn.prepare(lda, tf, tf_vectorizer)