# Library Import
The goal of this file is to predict the opening price for DJIA with the help of CNN and LSTM individually.

In [1]:
import re
import nltk
import pandas as pd
import numpy as np

from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
english_stemmer=nltk.stem.SnowballStemmer('english')

from sklearn.feature_selection.univariate_selection import SelectKBest, chi2, f_classif
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from sklearn.linear_model import SGDClassifier, SGDRegressor,LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import random
import itertools

import sys
import os
import argparse
from sklearn.pipeline import Pipeline
from scipy.sparse import csr_matrix
from sklearn.feature_extraction.text import CountVectorizer
import six
from abc import ABCMeta
from scipy import sparse
from scipy.sparse import issparse
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils import check_X_y, check_array
from sklearn.utils.extmath import safe_sparse_dot
from sklearn.preprocessing import normalize, binarize, LabelBinarizer
from sklearn.svm import LinearSVC

from keras.preprocessing import sequence
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation, Lambda
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM, SimpleRNN, GRU
from keras.preprocessing.text import Tokenizer
from collections import defaultdict
from keras.layers.convolutional import Convolution1D
from keras import backend as K
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import cm
%matplotlib inline
plt.style.use('ggplot')

Using TensorFlow backend.


# Data Import
We will divide the data into traning and testing 
- Traiing dat < 2014
- Test data >= 2014

In [2]:
data = pd.read_csv('processed_data/Combined_News_DJIA.csv')
train = data[data['Date'] < '2015-01-01']
test = data[data['Date'] > '2014-12-31']

# Data Process
- Transform the news headlines into number of words as input.

In [3]:
# headlines for training
trainheadlines = []
for row in range(0,len(train.index)):
    trainheadlines.append(' '.join(str(x) for x in train.iloc[row,2:27]))

In [4]:
# headlines for testing
testheadlines = []
for row in range(0,len(test.index)):
    testheadlines.append(' '.join(str(x) for x in test.iloc[row,2:27]))

In [5]:
# vectorizing train headlines
basicvectorizer = CountVectorizer()
basictrain = basicvectorizer.fit_transform(trainheadlines)
print(basictrain.shape)

(1611, 31675)


## Deep Learning
- tf-idf vectorizer is used

In [6]:
batch_size = 32
nb_classes = 2
advancedvectorizer = TfidfVectorizer( min_df=0.04, max_df=0.3, max_features = 200000, ngram_range = (2, 2))
advancedtrain = advancedvectorizer.fit_transform(trainheadlines)
testheadlines = []
for row in range(0,len(test.index)):
    testheadlines.append(' '.join(str(x) for x in test.iloc[row,2:27]))
advancedtest = advancedvectorizer.transform(testheadlines)
print(advancedtrain.shape)

(1611, 401)


- Label the data for target values

In [7]:
X_train = advancedtrain.toarray()
X_test = advancedtest.toarray()

print('X_train shape:', X_train.shape)
print('X_test shape:', X_test.shape)
y_train = np.array(train["Label"])
y_test = np.array(test["Label"])

Y_train = np_utils.to_categorical(y_train, nb_classes)
Y_test = np_utils.to_categorical(y_test, nb_classes)

X_train shape: (1611, 401)
X_test shape: (378, 401)


In [8]:
# pre-processing: divide by max and substract mean
scale = np.max(X_train)
X_train /= scale
X_test /= scale

mean = np.mean(X_train)
X_train -= mean
X_test -= mean

input_dim = X_train.shape[1]

In [9]:
'''
# Here's a Deep Dumb MLP (DDMLP)
model = Sequential()
model.add(Dense(256, input_dim=input_dim))
model.add(Activation('relu'))
model.add(Dropout(0.4))
model.add(Dense(128))
model.add(Activation('relu'))
model.add(Dropout(0.4))
model.add(Dense(nb_classes))
model.add(Activation('softmax'))

# we'll use categorical xent for the loss, and RMSprop as the optimizer
model.compile(loss='categorical_crossentropy', optimizer='rmsprop')

print("Training...")
model.fit(X_train, Y_train, nb_epoch=2, batch_size=16, validation_split=0.15)#, show_accuracy=True)

print("Generating test predictions...")
preds14 = model.predict_classes(X_test, verbose=0)
acc14 = accuracy_score(test["Label"], preds14)

print('prediction accuracy: ', acc14)
'''

'\n# Here\'s a Deep Dumb MLP (DDMLP)\nmodel = Sequential()\nmodel.add(Dense(256, input_dim=input_dim))\nmodel.add(Activation(\'relu\'))\nmodel.add(Dropout(0.4))\nmodel.add(Dense(128))\nmodel.add(Activation(\'relu\'))\nmodel.add(Dropout(0.4))\nmodel.add(Dense(nb_classes))\nmodel.add(Activation(\'softmax\'))\n\n# we\'ll use categorical xent for the loss, and RMSprop as the optimizer\nmodel.compile(loss=\'categorical_crossentropy\', optimizer=\'rmsprop\')\n\nprint("Training...")\nmodel.fit(X_train, Y_train, nb_epoch=2, batch_size=16, validation_split=0.15)#, show_accuracy=True)\n\nprint("Generating test predictions...")\npreds14 = model.predict_classes(X_test, verbose=0)\nacc14 = accuracy_score(test["Label"], preds14)\n\nprint(\'prediction accuracy: \', acc14)\n'

### Applying first model LSTM 
- Following are the Hyperparameters

In [10]:
max_features = 10000
EMBEDDING_DIM = 100
VALIDATION_SPLIT = 0.1
maxlen = 200
batch_size = 32
nb_classes = 2

In [11]:
# vectorize the text samples into a 2D integer tensor
tokenizer = Tokenizer(nb_words=max_features)
tokenizer.fit_on_texts(trainheadlines)
sequences_train = tokenizer.texts_to_sequences(trainheadlines)
sequences_test = tokenizer.texts_to_sequences(testheadlines)



In [12]:
# padding the sequences
print('Pad sequences (samples x time)')
X_train = sequence.pad_sequences(sequences_train, maxlen=maxlen)
X_test = sequence.pad_sequences(sequences_test, maxlen=maxlen)

Y_train = np_utils.to_categorical(y_train, nb_classes)
Y_test = np_utils.to_categorical(y_test, nb_classes)


print('X_train shape:', X_train.shape)
print('X_test shape:', X_test.shape)

Pad sequences (samples x time)
X_train shape: (1611, 200)
X_test shape: (378, 200)


### Model
- Build the model
- Training of model
- Evaluate the model accuracy
- Generating test predictions

In [13]:
print('Build model...')
model = Sequential()
model.add(Embedding(max_features, 128, dropout=0.2))
model.add(LSTM(128, dropout_W=0.2, dropout_U=0.2)) 
model.add(Dense(nb_classes))
model.add(Activation('softmax'))

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

print('Train...')
model.fit(X_train, Y_train, batch_size=batch_size, nb_epoch=3,
          validation_data=(X_test, Y_test))
score, acc = model.evaluate(X_test, Y_test,
                            batch_size=batch_size)
print('Test score:', score)
print('Test accuracy:', acc)


print("Generating test predictions...")
preds_lstm = model.predict_classes(X_test, verbose=0)
acc_lstm = accuracy_score(test['Label'], preds_lstm)

Build model...
Instructions for updating:
Colocations handled automatically by placer.


  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.


Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Train...
Instructions for updating:
Use tf.cast instead.


  


Train on 1611 samples, validate on 378 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3
Test score: 0.981368827441382
Test accuracy: 0.5476190477767319
Generating test predictions...


In [27]:
print('prediction accuracy: ', acc_lstm)

prediction accuracy:  0.5476190476190477


### Get Precision Recall and F1 score

In [15]:
from sklearn.metrics import precision_recall_fscore_support
precision_recall_fscore_support(test['Label'], preds_lstm, average='weighted')

(0.5474056459971952, 0.5476190476190477, 0.5458146791480124, None)

In [16]:
from sklearn.metrics import f1_score
f1_score(test['Label'], preds_lstm, average='weighted')

0.5458146791480124

## Applying second model CNN
- Hyperparameters are following

In [17]:
nb_filter = 120
filter_length = 2
hidden_dims = 120
nb_epoch = 2

### Model
- Build the model
- Training of model
- Evaluate the model accuracy
- Generating test predictions

In [18]:
print('Build model...')
model = Sequential()
model.add(Embedding(max_features, 128, dropout=0.2))
# we add a Convolution1D, which will learn nb_filter
# word group filters of size filter_length:
model.add(Convolution1D(nb_filter=nb_filter,
                        filter_length=filter_length,
                        border_mode='valid',
                        activation='relu',
                        subsample_length=1))

def max_1d(X):
    return K.max(X, axis=1)

model.add(Lambda(max_1d, output_shape=(nb_filter,)))
model.add(Dense(hidden_dims)) 
model.add(Dropout(0.2)) 
model.add(Activation('relu'))
model.add(Dense(nb_classes))
model.add(Activation('sigmoid'))

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

Build model...


  This is separate from the ipykernel package so we can avoid doing imports until
  # Remove the CWD from sys.path while we load stuff.


In [19]:
print('Train...')
model.fit(X_train, Y_train, batch_size=32, nb_epoch=1,
          validation_data=(X_test, Y_test))
score, acc = model.evaluate(X_test, Y_test,
                            batch_size=batch_size)
print('Test score:', score)
print('Test accuracy:', acc)


print("Generating test predictions...")
preds_cnn = model.predict_classes(X_test, verbose=0)
acc_cnn = accuracy_score(test['Label'], preds_cnn)

Train...


  This is separate from the ipykernel package so we can avoid doing imports until


Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
Train on 1611 samples, validate on 378 samples
Epoch 1/1
Test score: 0.6962450723799448
Test accuracy: 0.5079365098287189
Generating test predictions...


### Get Precision Recall and F1 score

In [28]:
print('prediction accuracy: ', acc_cnn)

prediction accuracy:  0.5079365079365079


In [29]:
from sklearn.metrics import precision_recall_fscore_support
precision_recall_fscore_support(test['Label'], preds_cnn, average='weighted')

  'precision', 'predicted', average, warn_for)


(0.25799949609473416, 0.5079365079365079, 0.3421888053467001, None)