# In [None]:
# -*- coding: utf-8 -*-
"""
Created on Wed Apr 28 14:04:52 2021

@author: Liuhuiyun
"""
from __future__ import print_function
from time import time
import matplotlib.pyplot as plt
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.datasets import fetch_20newsgroups
import pylab as mpl

##############################################

def plot_top_words(model, feature_names, n_top_words, title):
    """Draw one horizontal bar chart per topic and save the figure to disk.

    For every row of ``model.components_`` the ``n_top_words`` entries with
    the largest weights are plotted, largest at the top. Subplots are laid
    out five per row; the grid has at least two rows and grows when the
    model has more than ten topics. The figure is written to
    ``testplot/Topics_in_LDA_model.tif`` (the directory is created when it
    is missing).

    Args:
        model: Object exposing ``components_``, a 2-D array with one row of
            feature weights per topic.
        feature_names: Sequence mapping a feature index to its label.
        n_top_words: Number of highest-weighted features shown per topic.
        title: Text used as the figure's super-title.
    """
    # Local import: only needed here, to make sure the output folder exists.
    import os

    n_cols = 5
    n_topics = len(model.components_)
    # Ceiling division; never fewer than the original two rows, so the
    # layout for up to ten topics is unchanged.
    n_rows = max(2, -(-n_topics // n_cols))

    # Global matplotlib settings do not depend on the topic, so set them once.
    # SimHei is requested as the sans-serif font (presumably so that Chinese
    # labels render); unicode_minus=False makes matplotlib use an ASCII hyphen
    # for minus signs.
    mpl.rcParams['font.sans-serif'] = ['SimHei']
    mpl.rcParams['axes.unicode_minus'] = False

    # 12.5 units of height per row keeps the original 30x25 size for two rows.
    fig, axes = plt.subplots(n_rows, n_cols,
                             figsize=(30, 12.5 * n_rows), sharex=True)
    axes = axes.flatten()

    for topic_idx, topic in enumerate(model.components_):
        # argsort is ascending; the reversed slice takes the last
        # n_top_words indices, i.e. the largest weights in descending order.
        top_features_ind = topic.argsort()[:-n_top_words - 1:-1]
        top_features = [feature_names[i] for i in top_features_ind]
        weights = topic[top_features_ind]

        ax = axes[topic_idx]
        ax.barh(top_features, weights, height=0.7)
        ax.set_title(f'Topic {topic_idx +1}', fontdict={'fontsize': 30})
        # barh puts the first bar at the bottom; invert so it is on top.
        ax.invert_yaxis()
        ax.tick_params(axis='both', which='major', labelsize=20)
        for side in ('top', 'right', 'left'):
            ax.spines[side].set_visible(False)

    fig.suptitle(title, fontsize=40)
    plt.subplots_adjust(top=0.90, bottom=0.05, wspace=0.90, hspace=0.3)

    # savefig does not create missing directories on its own.
    os.makedirs('testplot', exist_ok=True)
    plt.savefig('testplot/Topics_in_LDA_model.tif')

##############################################

# Collect the text of every 'K1'-tagged line of the CNKI export into `data`.
print("Loading dataset...")
t0 = time()
data = []
# The context manager closes the file even if reading raises.
with open("CNKI-20210429002635507.txt", "r", encoding='utf-8') as f:
    f.readline()  # the first line of the file is always skipped
    for line in f:
        if not line.startswith('K1'):
            continue
        # Slice off exactly the two-character tag. str.lstrip('K1') would
        # strip *any* leading run of 'K' and '1' characters and could eat
        # the beginning of the value itself.
        text = line[len('K1'):].strip()
        if text:  # ignore tags with an empty value
            data.append(text)
print("done in %0.3fs." % (time() - t0))

data_samples = data

##############################################

# Vectorize the documents, fit an LDA topic model and plot its top words.
n_samples = 20000    # kept for compatibility; the data is not truncated to this size
n_features = 10000   # upper bound on the vocabulary size (max_features)
n_components = 10    # n-topics
n_top_words = 30     # key-words
# Use tf (raw term count) features for LDA.
print("Extracting tf features for LDA...")
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2,
                                max_features=n_features)
t0 = time()
tf = tf_vectorizer.fit_transform(data_samples)
print("done in %0.3fs." % (time() - t0))
print(tf_vectorizer)


# Report the real shape of the matrix being fitted rather than the
# n_samples / n_features constants, which need not match it.
print('\n' * 2,
      "Fitting LDA models with tf features, n_samples=%d and n_features=%d..."
      % (tf.shape[0], tf.shape[1]))
lda = LatentDirichletAllocation(n_components=n_components, max_iter=5,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=0)
t0 = time()
lda.fit(tf)
print("done in %0.3fs." % (time() - t0))

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back on older releases.
if hasattr(tf_vectorizer, "get_feature_names_out"):
    # Convert the returned array to a list so the type matches the old API.
    tf_feature_names = list(tf_vectorizer.get_feature_names_out())
else:
    tf_feature_names = tf_vectorizer.get_feature_names()
plot_top_words(lda, tf_feature_names, n_top_words, 'Topics in LDA model')

##############################################

# Show the vocabulary and the number of words plotted per topic.
print(tf_feature_names)
print(n_top_words)

import pyLDAvis
# pyLDAvis 3.4 renamed the scikit-learn adapter from `pyLDAvis.sklearn` to
# `pyLDAvis.lda_model`; try the new name first and fall back to the old one.
try:
    import pyLDAvis.lda_model as pyldavis_sklearn
except ImportError:
    import pyLDAvis.sklearn as pyldavis_sklearn

pyLDAvis.enable_notebook()
# Left as the final bare expression so a notebook cell displays the result.
pyldavis_sklearn.prepare(lda, tf, tf_vectorizer)