# 由CSV生成语料库

* 读取csv文件
* 对每一行资讯进行分词
* 将分词结果写入新的txt文件

In [13]:
# Constant definitions

# Path of the corpus file; write_news_words appends segmented text to it.
DATA_PATH = '../data/news_words.txt'

import time
import pandas as pd
import word_segmentation as ws
# Module-level segmenter instance used by write_news_words for every row.
word_segmentor = ws.WordSegmentation()


def write_news_words(file_path):
    """Segment every row of one news CSV and append the result to the corpus.

    The CSV is loaded with pandas, its title/body columns are renamed to
    ``title`` / ``content``, and for each row the concatenation
    ``str(title) + str(content)`` is passed to
    ``word_segmentor.word_segmentation``. The returned words are joined with
    single spaces and appended to ``DATA_PATH`` as one line per row.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        None. The only effect is the text appended to ``DATA_PATH``.
    """
    file_data = pd.read_csv(file_path)
    # Different crawls use different headers; map all of them onto the two
    # attribute names read below.
    file_data.rename(columns={'标题':'title', '正文':'content','正文1':'content',"字段1_文本":"title"}, inplace = True)
    # `with` closes the corpus file even when segmentation or a write raises,
    # which the previous manual open()/close() pair did not guarantee.
    # NOTE(review): the file is opened with the platform default encoding,
    # the same way the corpus reader opens it — keep both in sync if an
    # explicit encoding is ever introduced.
    with open(DATA_PATH, "a") as corpus_file:
        for _, row in file_data.iterrows():
            row_words = word_segmentor.word_segmentation(str(row.title) + str(row.content))
            corpus_file.write(" ".join(row_words) + ' \n')
    

def append_csv2txt(new_path):
    """Append the segmented content of every CSV file in a folder to the corpus.

    Lists ``new_path``, prints the listing, and calls ``write_news_words``
    on each entry whose name ends with ``.csv`` (case-insensitive), printing
    the path of each file before it is processed.

    Args:
        new_path: Folder containing the crawled CSV files. A trailing path
            separator is optional.

    Returns:
        None.
    """
    # Local import: the cell that defines this function does not import `os`.
    import os

    files = os.listdir(new_path)
    print(files)

    for fname in files:
        # Test the extension of the file name only. The old substring test
        # on the full path also matched "csv" inside the folder name or in
        # the middle of a file name.
        if not fname.lower().endswith('.csv'):
            continue
        # os.path.join works whether or not new_path ends with a separator.
        fpath = os.path.join(new_path, fname)
        print(fpath)
        write_news_words(fpath)
            

In [None]:
# Convert every crawled CSV under ../news/ into corpus lines and report the
# wall-clock time the whole run took.
start_time = time.time()
append_csv2txt('../news/')
elapsed_seconds = time.time() - start_time
print('Loading CSV files took %fs!' % elapsed_seconds)


['news0312.csv', '东财关键词对应新闻-0426-6.csv', '东方财经关键词板块对应新闻-0425-2.csv', '行业股票频道东方财富网---600621行业要闻-券商信托资讯-0302.csv', '东方财经关键词板块对应新闻-0425-1.csv', '行业股票频道东方财富网---600621行业要闻-券商信托研报-0312-2.csv', '第一财经板块对应新闻-0423-2.csv', '东方财富网---个股要闻(1).csv', '第一财经板块对应新闻-0428-1.csv', '东方财富网---个股要闻-TO603099.csv', '东财关键词对应新闻-0427.csv', '第一财经板块对应新闻-0428-3.csv', '行业股票频道东方财富网---600621行业要闻-券商信托资讯-0305.csv', '行业研究报告数据中心东方财富网-0502-2.csv', '第一财经板块对应新闻-0423-1.csv', '东方财富网---个股要闻-TO600689.csv', '第一财经板块对应新闻-0419-2.csv', '行业研究报告数据中心东方财富网-0502-1.csv', '东财行业板块关键词.csv', '第一财经板块对应新闻-0413.csv', '东财关键词对应新闻-0426-7.csv', '东财关键词对应新闻-0426-4.csv', '东财关键词对应新闻-0426-3.csv', '东财关键词对应新闻-0426-8.csv', '东方财经关键词板块对应新闻-0427.csv', '第一财经板块对应新闻.csv', '第一财经板块对应新闻-0428-2.csv', '东财关键词对应新闻-0426-1.csv', '东方财富网中国财经门户提供专业的财经股票.csv', '第一财经板块对应新闻-0419-1.csv', '东财关键词对应新闻-0426-5.csv', '东财关键词对应新闻-0426-2.csv', '第一财经板块对应新闻-0416.csv', '行业股票频道东方财富网---600621行业要闻-券商信托研报-0312.csv']
../news/news0312.csv
../news/东财关键词对应新闻-0426-6.csv
../news/东方财经关键词板块对应新闻-0425-2.csv
.

# 离线语料分析建模及板块预测模型优化

LDA：

* 生成语料库（初期由csv文件生成，后期由数据集中获取）
* 读取语料库（每行一条资讯，已分词，空格分隔）
* 利用语料库训练LDA模型并保存

RF：

* 读取标记数据库（每行一条资讯，第一列为标签，第二列为资讯，已分词，空格分隔）
* 利用LDA模型转换资讯为Vector-标记 格式
* 训练RF模型并保存

In [1]:
# Directory of the crawled news files (the original note calls them xlsx files):
NEWS_PATH = '../news/'
# Path of the corpus file:
DATA_PATH = '../data/news_words.txt'
# Path of the labelled data
LABEL_PATH = '../labels/'
# Path of the LDA model:
LDA_PATH = '../model/lda.model'

#coding=utf-8  
import codecs  
from gensim import corpora  
from gensim.models import LdaModel  
from gensim.corpora import Dictionary  


import jieba
import csv
import numpy as np  
import os  
import pandas as pd
from sklearn.naive_bayes import GaussianNB  
   
import time    
from sklearn import metrics    
import pickle as pickle    
import pandas as pd  
  
import random

## 语料库载入及LDA模型训练

In [None]:
# Read the corpus file (one pre-segmented document per line, words separated
# by whitespace), build the gensim dictionary from it, convert the documents
# to bag-of-words vectors, then train and save the LDA model.
print("Start reading corpus file...")
start_time = time.time()
train = []
# `with` closes the corpus file once reading is done.
with open(DATA_PATH, 'r') as fr:
    for line in fr:
        # split() without an argument drops the trailing newline and the
        # empty strings that split(' ') used to leave in the vocabulary.
        tokens = line.split()
        if tokens:
            # Keep the tokenised document: the corpus below is built from
            # `train`, which was previously left empty, so the model was
            # trained on no documents at all.
            train.append(tokens)
dictionary = corpora.Dictionary()
dictionary.add_documents(train)
print('Composing dictionary took %fs!' % (time.time() - start_time))

start_time = time.time()
corpus = [dictionary.doc2bow(text) for text in train]
print('Loading corpus took %fs!' % (time.time() - start_time))

# Train the LDA model and save it to LDA_PATH.
start_time = time.time()
lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=200)
lda.save(LDA_PATH)
print('LDA training took %fs!' % (time.time() - start_time))


In [5]:
# Rename the saved model file by appending today's local date (YYYY-MM-DD)
# to its path.
# NOTE(review): only the file at LDA_PATH itself is renamed — confirm whether
# the model save step writes companion files next to it that need the same
# suffix.
today = time.strftime('%Y-%m-%d')
archived_model_path = LDA_PATH + today
os.rename(LDA_PATH, archived_model_path)

'2018-05-07'