In [1]:
import ast
import pickle
from datetime import datetime

import keras
import numpy as np
import pandas as pd
from keras.datasets import mnist
from keras.layers import Dense, Dropout
from keras.models import Sequential
from keras.optimizers import RMSprop
from sklearn.model_selection import train_test_split

Using TensorFlow backend.


In [2]:
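# Prerequisite: make sure both data files are in the same directory as this notebook:
# 'Beta-01-Matrix-20160101-20190430.csv'
# 'News_WallstreetCN_Seg_20160101_20190330.csv'

# Note: read the file names carefully - the WallstreetCN file name ends in 20190330.csv.
# NOTE(review): the news-loading cell below actually reads
# 'News-WallstreetCN-Seg-20160101-20190430.csv' (hyphens, 20190430); confirm which name is correct.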

In [3]:
###############################################################
#####
##### 预处理 单只股票 日数据 针对 000001.SZ
#####
###############################################################

In [4]:
# Load the beta matrix CSV. Judging by how it is sliced below, each row is one stock and
# every column after the first two is a trading date - TODO confirm against the file.
df = pd.read_csv('Beta-01-Matrix-20160101-20190430.csv')

In [5]:
# Collect every trading date: the column labels that follow the first two columns
trade_dates = list(df.columns)[2:]
len(trade_dates)

810

In [6]:
# Build y_train: a one-hot label per trading day for a single stock
# IMPORTANT: to train on a different stock, change the row index below.
# Row 0 is used to pull all daily values for 000001.SZ.
betas = list(df.iloc[0])[2:]  # drop the first two columns so labels line up with trade_dates

# beta value -> one-hot class vector
one_hot_by_beta = {-1: [1, 0, 0], 0: [0, 1, 0], 1: [0, 0, 1]}
y_train = []
for position, beta in enumerate(betas):
    if beta not in one_hot_by_beta:
        # Fail loudly: silently skipping a value (e.g. NaN) would leave y_train shorter
        # than trade_dates and shift every later label onto the wrong day.
        raise ValueError(
            'unexpected beta value {!r} at position {} (expected -1, 0 or 1)'.format(beta, position))
    y_train.append(one_hot_by_beta[beta])
y_train = np.array(y_train).astype('float32')
# Free the label frame; the name df is reused for the news data later on.
del betas
del df
y_train.shape

(810, 3)

In [7]:
###############################################################
#####
##### 预处理 新闻 
#####
##### 注意 若训练不同股票 无需重复运行这块代码 耗时
##### 原因: 训练不同股票 x_train 永远一样，只是y_train和神经元权值不同
#####
###############################################################

In [8]:
# Load the news CSV. The cells below read its 'display_time' and 'content_text_seg' columns.
df = pd.read_csv('News-WallstreetCN-Seg-20160101-20190430.csv')

In [9]:
# Word filter
def legal_word(word):
    """Decide whether a (token, tag) pair should be kept as a feature word.

    Args:
        word: Indexable pair. word[0] is the token text; word[1] is its tag
            (presumably a part-of-speech tag - TODO confirm against the data).

    Returns:
        True when the token is not on the removal list, contains no digit
        character, and its tag is on the whitelist; False otherwise.
    """
    # Tokens rejected outright, whatever their tag.
    word_remove_list = ['【','[',']','】','&','联播','会见','快讯','中国','为','简讯']
    # Only these tags are accepted. Everything else is rejected by default, so no
    # separate tag reject list is needed.
    prop_whitelist = ['n','ns','nt','nz','v','vd','vn']

    token = word[0]
    if token in word_remove_list:
        return False
    if any(char.isdigit() for char in token):
        return False
    # The tag is read only after the token checks pass, as in the original flow.
    return word[1] in prop_whitelist

In [10]:
# Merge the news by date: news maps 'YYYYMMDD' -> set of kept words seen that day
news = {}
trade_date_lookup = set(trade_dates)  # constant-time membership instead of scanning the list per row
for index in range(len(df)):
    # NOTE(review): fromtimestamp() converts using the local time zone of the machine running
    # the notebook, so the day a story lands on depends on that setting - confirm intended.
    yyyymmdd = datetime.fromtimestamp(df['display_time'][index]).date().strftime("%Y%m%d")
    if yyyymmdd not in trade_date_lookup:
        continue  # rows dated outside the trading-date list are ignored
    # The cell is expected to hold the text of a Python list of (token, tag) pairs.
    # Parse it as a literal: exec() on CSV content would run whatever code the file contains.
    tagged_words = ast.literal_eval(df['content_text_seg'][index].strip())
    words = set(map(lambda x: x[0], filter(legal_word, tagged_words)))
    if yyyymmdd in news:
        news[yyyymmdd] |= words
    else:
        news[yyyymmdd] = words

In [11]:
# Number of dates that ended up with news.
# NOTE(review): an earlier note here said only 789 dates had news and that y_train had to be
# trimmed to 789 to match; the output recorded for this cell shows 810 - confirm which is current.
len(news)

810

In [12]:
# Collect the vocabulary across all days. IMPORTANT: it is used to build x_train
allwords = set().union(*news.values())

In [13]:
# Vocabulary size.
# NOTE(review): the earlier note here said 61275 distinct words, but the output recorded
# for this cell is 63136 - the count evidently changes with the data or the word filter.
len(allwords)

63136

In [14]:
# Build x_train: one bag-of-words row per trading date (1 = the word appeared that day)
# Column order follows enumerate(allwords), i.e. the set's iteration order.
# NOTE(review): that order is only stable while allwords is left unmodified, and string
# hashing can change it between kernel sessions - confirm before reusing saved weights.
word_column = {w: index for index, w in enumerate(allwords)}
x_train = []
for date in trade_dates:
    x_train_single = [0] * len(allwords)
    # Visit only the words seen that day instead of testing the whole vocabulary.
    # news[date] raises KeyError when a trading date has no news at all.
    for w in news[date]:
        column = word_column.get(w)
        if column is not None:  # words outside the vocabulary are ignored
            x_train_single[column] = 1
    x_train.append(x_train_single)

In [15]:
# Turn the nested lists into a float32 matrix in a single conversion
x_train = np.array(x_train, dtype='float32')
x_train.shape

(810, 63136)

In [16]:
###############################################################
#####
##### 训练
#####
##### 喜闻乐见的神经网络
##### 
#####
###############################################################

In [17]:
# Keep the full feature matrix and labels under X / Y; the split cell rebinds x_train / y_train
X, Y = x_train, y_train

In [56]:
# Split into training and validation sets
seed = 7
np.random.seed(seed)  # seeds NumPy's global RNG; NOTE(review): any separate backend RNG is not seeded here
# Alternative split kept for reference (test_size=0.551):
# x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.551, random_state=seed)
# Hold out 30% of the rows; random_state is fixed to the same seed
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=seed)

In [57]:
# Final shapes at a glance, in the order (x_train, x_test, y_train, y_test)
x_train.shape, x_test.shape, y_train.shape, y_test.shape

((567, 63136), (243, 63136), (567, 3), (243, 3))

In [58]:
# Neural network: architecture, compilation and training
batch_size = 256
epochs = 30
# Size the input layer from the data. The vocabulary size changes whenever the news data
# or the word filter changes, and a stale hard-coded width makes fit() reject x_train.
n_features = x_train.shape[1]

model = Sequential()
model.add(Dense(512, activation='relu', input_shape=(n_features,)))
model.add(Dropout(0.2))
model.add(Dense(256, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.2))
# Three softmax units, matching the three-element one-hot labels
model.add(Dense(3, activation='softmax'))

model.summary()

model.compile(loss='categorical_crossentropy',
              optimizer=RMSprop(),
              metrics=['accuracy'])

# The held-out split is passed as validation data for fit()
history = model.fit(x_train, y_train,
                    batch_size=batch_size,
                    epochs=epochs,
                    verbose=2,
                    validation_data=(x_test, y_test))

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_57 (Dense)             (None, 512)               32326144  
_________________________________________________________________
dropout_43 (Dropout)         (None, 512)               0         
_________________________________________________________________
dense_58 (Dense)             (None, 256)               131328    
_________________________________________________________________
dropout_44 (Dropout)         (None, 256)               0         
_________________________________________________________________
dense_59 (Dense)             (None, 128)               32896     
_________________________________________________________________
dropout_45 (Dropout)         (None, 128)               0         
_________________________________________________________________
dense_60 (Dense)             (None, 3)                 387       
Total para