# LDA 统计模型对新闻进行主题分析

在LDA模型中，一篇文档生成的方式如下：
1. 从狄利克雷分布中取样生成文档 i 的主题分布
2. 从主题的多项式分布中取样生成文档i第 j 个词的主题
3. 从狄利克雷分布中取样生成主题对应的词语分布
4. 从词语的多项式分布中采样最终生成词语

In [7]:
# 新闻爬取xlxs文件目录：
NEWS_PATH = '../news/'
#语料库文件路径：
DATA_PATH = '../data/news_words.txt'
#标记数据路径
LABEL_PATH = '../labels/'

#coding=utf-8  
import codecs  
from gensim import corpora  
from gensim.models import LdaModel  
from gensim.corpora import Dictionary  


import jieba
import csv
import numpy as np  
import os  
import pandas as pd
from sklearn.naive_bayes import GaussianNB  
   
import time    
from sklearn import metrics    
import pickle as pickle    
import pandas as pd  
  
import random




In [8]:
# # 读取语料库，载入字典

# print("Start reading corpus file...")
# start_time = time.time()    
# fr=open(DATA_PATH,'r')  
# train=[]  
# for line in fr.readlines():  
#     line=line.split(' ')  
#     train.append(line)  
# print(len(train))
# print(' '.join(train[2]))

# dictionary = corpora.Dictionary(train)  
# print('Composing dictionary took %fs!' % (time.time() - start_time)) 
# start_time = time.time()
# corpus = [ dictionary.doc2bow(text) for text in train ]  
# print('Loading corpus took %fs!' % (time.time() - start_time)) 


In [9]:
# 遍历语料库文件，逐步增加dictionary
print("Start reading corpus file...")
start_time = time.time()    
fr=open(DATA_PATH,'r')  
train=[]  
dictionary = corpora.Dictionary()
for line in fr.readlines():  
    line=line.split(' ')  
    dictionary.add_documents([line]) 
#dictionary.doc2bow(["军工","金融"])
print('Composing dictionary took %fs!' % (time.time() - start_time)) 
start_time = time.time()
corpus = [ dictionary.doc2bow(text) for text in train ]  
print('Loading corpus took %fs!' % (time.time() - start_time)) 


Start reading corpus file...
Composing dictionary took 249.582738s!
Loading corpus took 0.000119s!


In [10]:
#训练LDA模型
start_time = time.time()    
lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=200)  
lda.save('../model/lda.model')
print('LDA training took %fs!' % (time.time() - start_time)) 


LDA training took 93.062531s!


In [4]:
lda = LdaModel.load('../model/lda.model')

In [11]:
# 打印LDA模型相关结果
topic_list=lda.print_topics(20)  
print(type(lda.print_topics(20)))
print(len(lda.print_topics(20)))
  
for topic in topic_list:  
    print(topic) 
print("第一主题")
print(lda.print_topic(1))

<class 'list'>
20
(160, '0.000*"90.377" + 0.000*"徐书楠" + 0.000*"对运来" + 0.000*"1722.2" + 0.000*"0.07001458" + 0.000*"王宝敏" + 0.000*"城中心" + 0.000*"7825.2525" + 0.000*"ICN" + 0.000*"以发"')
(65, '0.000*"125403.97" + 0.000*"岑赛" + 0.000*"衔接" + 0.000*"4030" + 0.000*"40.38" + 0.000*"19.5010" + 0.000*"同退" + 0.000*"摆在首位" + 0.000*"顶头上司" + 0.000*"横移类"')
(82, '0.000*"文著" + 0.000*"止马营" + 0.000*"20.74300059" + 0.000*"骗税" + 0.000*"70.719" + 0.000*"4984.27" + 0.000*"863.71002153" + 0.000*"37000682" + 0.000*"259.389" + 0.000*"科佩茨"')
(197, '0.000*"455.395" + 0.000*"1440.1" + 0.000*"叶贤林" + 0.000*"247.544" + 0.000*"20160808653542465" + 0.000*"15413.74" + 0.000*"立户" + 0.000*"96257.6323421" + 0.000*"OHSASI8001" + 0.000*"094533.88"')
(92, '0.000*"5.8900338" + 0.000*"InstituteofInternationalFinance" + 0.000*"35.0025" + 0.000*"9033.597" + 0.000*"8661.651" + 0.000*"23.12000831" + 0.000*"317.9256" + 0.000*"使内" + 0.000*"103.11300406" + 0.000*"德敖东"')
(130, '0.000*"82.1645" + 0.000*"价来" + 0.000*"3465.20" + 0.000*"罗地亚" 

## 标记数据导入与分类训练

标记数据来源： 第一财经新闻
标记格式：新闻标题/正文/关键词
利用LDA的模型，将标题+正文作为字符串，对标记数据进行向量化，标签为关键词。




In [3]:



# 加载停用词，输入停用词文件，输出停用词list
def stopwordslist(filepath):  
    stopwords = [line.strip() for line in open(filepath, 'r', encoding='utf-8').readlines()]  
    return stopwords

temp_stop_list = ['\u3000','\xa0','\t']
stop_words = stopwordslist("../utils/stopwords.txt") + temp_stop_list
# 为结巴分词词库加载股票名词汇
jieba.load_userdict('../data/user_dict.txt')

 
## jieba分词：输入string & 停用词文件，输出分词结果list
def jieba_split(content):
    '''
    content: 输入文本（string）
    stop_path: 停用词字典文件路径（string）
    返回：list，jieba分词结果
    '''
    str_content = str(content).replace('\t', '').replace('\n', '').replace(' ','')
    str_words = ','.join(jieba.cut_for_search(str_content)).split(",")
    ret_list = []
    for word in str_words:
        if word not in stop_words:
            if word[-1] != '%':
                ret_list.append(word)
    return ret_list



# turn lda result into list
def lda2list(lda,topic_n):
    lda_dict = dict(lda)
    lda_list = [0] * topic_n
    for i in range(topic_n):
        lda_list[i] = lda_dict.get(i,0)
    return lda_list


Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 1.143 seconds.
Prefix dict has been built succesfully.


In [9]:


# 对于label文件夹下的CSV，读取后将标题+正文转换为分词结果，再转换为LDA结果，形成200维向量
# 对于新闻的标签，直接保留其表格中的“关键词”地段

#df_label = pd.DataFrame(columns=['words','label'])  
news_vec = []
news_labels = []


files = os.listdir(LABEL_PATH)
print(files)
for fname in files:
    fpath = LABEL_PATH + fname
    if 'csv' in fpath:
        print(fpath)
        file_data = pd.read_csv(fpath)
        file_data.rename(columns={'标题':'title', '正文':'content','正文1':'content',"字段1_文本":"title","关键词":"plate"}, inplace = True)
        for index, row in file_data.iterrows():
        #print(row.content)
            news_word = jieba_split(str(row.title) + str(row.content))
            news_bow = dictionary.doc2bow(news_word)      #文档转换成bow  
            news_lda = lda2list(lda[news_bow],201) #得到lda向量
            news_vec.append(news_lda)
            news_labels.append(row.plate)
    print("done")

            

['第一财经板块对应新闻-0423-2.csv', '第一财经板块对应新闻-0423-1.csv', '第一财经板块对应新闻-0419-2.csv', '第一财经板块对应新闻-0413.csv', '第一财经板块对应新闻.csv', '第一财经板块对应新闻-0419-1.csv', '第一财经板块对应新闻-0416.csv']
../labels/第一财经板块对应新闻-0423-2.csv


NameError: name 'dictionary' is not defined

In [49]:
print(len(news_labels))

55793


In [56]:
#高斯朴素贝叶斯  
X=np.array([np.array(xi) for xi in news_vec])
Y = np.array(news_labels)
print(type(X[0]))
print(type(Y))
print(len(X))
#X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])  
#Y = np.array([1, 1, 1, 2, 2, 2])  
clf = GaussianNB().fit(X, Y)  
#print(clf.predict([[-0.8,-1]]))
  
''''' 
partial_fit说明：增量的训练一批样本 
这种方法被称为连续几次在不同的数据集，从而实现核心和在线学习，这是特别有用的，当数据集很大的时候，不适合在内存中运算 
该方法具有一定的性能和数值稳定性的开销，因此最好是作用在尽可能大的数据块（只要符合内存的预算开销） 
'''  
#clf_pf = GaussianNB().partial_fit(X, Y, np.unique(Y))  


<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
55793


"'' \npartial_fit说明：增量的训练一批样本 \n这种方法被称为连续几次在不同的数据集，从而实现核心和在线学习，这是特别有用的，当数据集很大的时候，不适合在内存中运算 \n该方法具有一定的性能和数值稳定性的开销，因此最好是作用在尽可能大的数据块（只要符合内存的预算开销） \n"

In [62]:
# sklearn中几个基本的分类其的训练函数

# Multinomial Naive Bayes Classifier    
def naive_bayes_classifier(train_x, train_y):    
    from sklearn.naive_bayes import MultinomialNB    
    model = MultinomialNB(alpha=0.01)    
    model.fit(train_x, train_y)    
    return model    
    
    
# KNN Classifier    
def knn_classifier(train_x, train_y):    
    from sklearn.neighbors import KNeighborsClassifier    
    model = KNeighborsClassifier()    
    model.fit(train_x, train_y)    
    return model    
    
    
# Logistic Regression Classifier    
def logistic_regression_classifier(train_x, train_y):    
    from sklearn.linear_model import LogisticRegression    
    model = LogisticRegression(penalty='l2')    
    model.fit(train_x, train_y)    
    return model    
    
    
# Random Forest Classifier    
def random_forest_classifier(train_x, train_y):    
    from sklearn.ensemble import RandomForestClassifier    
    model = RandomForestClassifier(n_estimators=8)    
    model.fit(train_x, train_y)    
    return model    
    
    
# Decision Tree Classifier    
def decision_tree_classifier(train_x, train_y):    
    from sklearn import tree    
    model = tree.DecisionTreeClassifier()    
    model.fit(train_x, train_y)    
    return model    
    
    
# GBDT(Gradient Boosting Decision Tree) Classifier    
def gradient_boosting_classifier(train_x, train_y):    
    from sklearn.ensemble import GradientBoostingClassifier    
    model = GradientBoostingClassifier(n_estimators=200)    
    model.fit(train_x, train_y)    
    return model    
    
    
# SVM Classifier    
def svm_classifier(train_x, train_y):    
    from sklearn.svm import SVC    
    model = SVC(kernel='rbf', probability=True)    
    model.fit(train_x, train_y)    
    return model    
    
# SVM Classifier using cross validation    
def svm_cross_validation(train_x, train_y):    
    from sklearn.grid_search import GridSearchCV    
    from sklearn.svm import SVC    
    model = SVC(kernel='rbf', probability=True)    
    param_grid = {'C': [1e-3, 1e-2, 1e-1, 1, 10, 100, 1000], 'gamma': [0.001, 0.0001]}    
    grid_search = GridSearchCV(model, param_grid, n_jobs = 1, verbose=1)    
    grid_search.fit(train_x, train_y)    
    best_parameters = grid_search.best_estimator_.get_params()    
    for para, val in list(best_parameters.items()):    
        print(para, val)    
    model = SVC(kernel='rbf', C=best_parameters['C'], gamma=best_parameters['gamma'], probability=True)    
    model.fit(train_x, train_y)    
    return model    
    
def read_data(data_file):    
    data = pd.read_csv(data_file)  
    train = data[:int(len(data)*0.9)]  
    test = data[int(len(data)*0.9):]  
    train_y = train.label  
    train_x = train.drop('label', axis=1)  
    test_y = test.label  
    test_x = test.drop('label', axis=1)  
    return train_x, train_y, test_x, test_y  
        

  return f(*args, **kwds)


In [67]:
# 将读取label新闻转换的数据存成numpy array X， Y
# 利用X Y 进行分类训练，并查看关键词预测结果

X = np.array([np.array(xi) for xi in news_vec])
Y = np.array(news_labels)

test_classifiers = ['NB', 'KNN', 'LR', 'RF', 'DT', 'SVM','SVMCV', 'GBDT']    
classifiers = {'NB':naive_bayes_classifier,     
              'KNN':knn_classifier,    
               'LR':logistic_regression_classifier,    
               'RF':random_forest_classifier,    
               'DT':decision_tree_classifier,    
              'SVM':svm_classifier,    
            'SVMCV':svm_cross_validation,    
             'GBDT':gradient_boosting_classifier    
}    

def test_classifier(classifier):
    print("********classifier: ",classifier,"***********")
    start_time = time.time()    
    model = classifiers[classifier](X, Y)
    print('training took %fs!' % (time.time() - start_time)) 
    for i in range(100):
        n = random.randrange(1,len(X))
        # predict_proba(x)：给出带有概率值的结果。每个点在所有label的概率和为1.  
        # predict(x)：直接给出预测结果。内部还是调用的predict_proba()，根据概率的结果看哪个类型的预测值最高就是哪个类型。  
        print(model.predict([X[n]]))
        print(Y[n])

In [None]:
from sklearn.externals import joblib
#lr是一个LogisticRegression模型
joblib.dump(model, 'rf.model')
lr = joblib.load('lr.model')

In [68]:
test_classifier("KNN")

********classifier:  KNN ***********


  return f(*args, **kwds)


training took 15.636353s!
['创投']
创投
['互联网彩票']
金融IC
['军民融合']
车联网
['农机']
太阳能
['PM2.5']
金改
['上海自贸区']
福建自贸区
['举牌']
共享单车
['迪士尼']
迪士尼
['互联网医疗']
互联网医疗
['上海自贸区']
上海自贸区
['共享单车']
太阳能
['基因测序']
节能环保
['军工']
黄金
['京津冀一体化']
金改
['PM2.5']
风电
['广东自贸区']
广东自贸区
['互联网金融']
航运
['农机']
钛白粉
['基因测序']
基因测序
['军工']
军工
['区块链']
锂电池
['农村电商']
车联网
['阿里巴巴']
阿里巴巴
['供应链金融']
供应链金融
['电子信息']
量子通信
['基因测序']
航运
['电子竞技']
白酒
['啤酒']
大数据
['大飞机']
太阳能
['军工']
军工
['举牌']
举牌
['PM2.5']
PM2.5
['人脸识别']
迪士尼
['人工智能']
黄金
['阿里巴巴']
手机游戏
['深港通']
深港通
['核电']
核电
['互联网金融']
互联网金融
['阿里巴巴']
阿里巴巴
['人脸识别']
互联网金融
['工业4.0']
工业4.0
['电子商务']
广东自贸区
['锂电池']
锂电池
['农业现代化']
风电
['供应链金融']
供应链金融
['跨境电商']
跨境电商
['锂电池']
锂电池
['互联网+']
生物医药
['高端装备']
煤化工
['共享单车']
区块链
['互联网+']
军民融合
['上海自贸区']
汽车电子
['迪士尼']
迪士尼
['啤酒']
啤酒
['石墨烯']
石墨烯
['上海自贸区']
上海自贸区
['高送转']
节能环保
['广东自贸区']
广东自贸区
['OLED']
水泥
['PM2.5']
PM2.5
['高端装备']
高端装备
['互联网金融']
工业4.0
['电子信息']
跨境电商
['核电']
核电
['农业现代化']
美丽中国
['PM2.5']
PM2.5
['安防']
大数据
['共享单车']
钛白粉
['区块链']
区块链
['阿里巴巴']
人工智能
['ST板块']
风电
['迪士尼']
区块链
['工业4.0']
工业4.0
['啤酒'

In [69]:
test_classifier("RF")

********classifier:  RF ***********


  return f(*args, **kwds)


training took 5.989762s!
['高端装备']
高端装备
['水泥']
水泥
['融资融券']
融资融券
['白酒']
白酒
['创投']
创投
['黄金']
黄金
['风电']
风电
['生态农业']
生态农业
['海工装备']
海工装备
['航运']
航运
['建筑节能']
建筑节能
['生物质能']
生物质能
['量子通信']
量子通信
['乳业']
黄金
['风电']
风电
['能源互联网']
能源互联网
['乳业']
乳业
['电子发票']
电子发票
['量子通信']
量子通信
['军民融合']
高校
['集成电路']
集成电路
['白马股']
白马股
['期货概念']
期货概念
['冷链物流']
冷链物流
['阿里巴巴']
阿里巴巴
['水利']
水利
['军民融合']
军民融合
['农业现代化']
农业现代化
['阿里巴巴']
阿里巴巴
['食品安全']
食品安全
['高端装备']
高端装备
['基因测序']
基因测序
['区块链']
区块链
['充电桩']
充电桩
['太阳能']
太阳能
['举牌']
举牌
['基因测序']
基因测序
['农机']
农机
['大飞机']
大飞机
['共享单车']
共享单车
['阿里巴巴']
阿里巴巴
['太阳能']
太阳能
['航运']
航运
['生物医药']
生物医药
['工业4.0']
工业4.0
['电子商务']
电子商务
['工业4.0']
工业4.0
['大飞机']
大飞机
['核电']
核电
['大数据']
大数据
['互联网+']
互联网+
['创投']
创投
['农村电商']
农村电商
['创投']
创投
['食品安全']
食品安全
['农村电商']
农村电商
['阿里巴巴']
阿里巴巴
['基因测序']
基因测序
['美丽中国']
美丽中国
['高铁']
高铁
['水泥']
高校
['钛白粉']
钛白粉
['车联网']
车联网
['水泥']
水泥
['太阳能']
太阳能
['人脸识别']
人脸识别
['白酒']
白酒
['军民融合']
军民融合
['互联网金融']
互联网金融
['农机']
农机
['高铁']
高铁
['量子通信']
量子通信
['融资融券']
融资融券
['深港通']
深港通
['乳业']
乳业
['迪士尼']
黄金
['区块链']
区块链
['高送转']
高送

In [1]:
test_classifier("SVM")

NameError: name 'test_classifier' is not defined