### Imports

In [None]:
import numpy as np
import pandas as pd

import os

from keras.preprocessing.text import Tokenizer
from keras.utils.np_utils import to_categorical
from keras.preprocessing.sequence import pad_sequences

from keras.layers.embeddings import Embedding
from keras.initializers import Constant
from keras.layers import Input

from keras.layers import Flatten
from keras.layers import Dropout
from keras.layers import Conv1D, MaxPooling1D, GlobalMaxPooling1D
from keras.layers import Dense
from keras.models import Model
from keras.layers import Bidirectional, GRU, BatchNormalization

from keras.models import load_model
from sklearn.metrics import confusion_matrix
from sklearn.metrics import matthews_corrcoef

In [None]:
# Load the pickled Reuters dataset (bz2-compressed); later cells read its
# `news` and `Y` columns.
reuters = pd.read_pickle('reuters_news_concatenated.pkl', compression='bz2')

In [None]:
reuters

### Vectorize News into a 2D integer Tensor

In [None]:
# Fixed sequence width: passed as `maxlen` to pad_sequences and as the
# Input / Embedding sequence length.
MAX_SEQUENCE_LENGTH = 512
# Vocabulary cap: passed as `num_words` to the Tokenizer and used as the
# Embedding layer's input dimension.
MAX_NUM_WORDS = 20000

In [None]:
# Fit the vocabulary on the whole corpus; num_words caps the ids used when
# encoding texts.
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(reuters.news)
word_index = tokenizer.word_index

# Encode every article as a list of word ids, then pad at the end
# (padding='post') so all rows have MAX_SEQUENCE_LENGTH columns.
reuters_sequences = pad_sequences(
    tokenizer.texts_to_sequences(reuters.news),
    maxlen=MAX_SEQUENCE_LENGTH,
    padding='post',
)

In [None]:
len(word_index)

In [None]:
reuters_sequences

### Cluster Labels

0 - Strong Sell
1 - Sell
2 - Buy
3 - Strong Buy

In [None]:
clusters = 4
labels = np.copy(reuters.Y)
# Print the percentile cut points that separate the `clusters` label bins.
cut_points = np.percentile(reuters.Y, [100*i/clusters for i in range(1, clusters)])
for cut in cut_points:
    print(cut)

In [None]:
clusters = 4
# Percentile cut points between consecutive clusters (25th/50th/75th for 4).
thresholds = np.percentile(reuters.Y, [100 * i / clusters for i in range(1, clusters)])
# side='left' places a value v in bin k when thresholds[k-1] < v <= thresholds[k].
# Every value therefore receives a label in 0..clusters-1, including values that
# sit exactly on the first cut point (the previous `<` / `>` masks matched
# neither side, leaving such values as their raw number).
labels = np.searchsorted(thresholds, np.asarray(reuters.Y), side='left')
# NOTE: this overwrites the raw Y column with class ids; re-running the cell
# would bin the class ids rather than the original values.
reuters.Y = labels.astype("int")

In [None]:
# Print one row per class label: [label, number of samples].
unique, counts = np.unique(reuters.Y, return_counts=True)
print(np.column_stack((unique, counts)))

In [None]:
# Convert the integer class ids in reuters.Y to the categorical (one-hot) form
# consumed by the softmax / categorical_crossentropy models below.
reuters_labels = to_categorical(reuters.Y)

In [None]:
reuters_labels

0 - Sell
1 - Buy

In [None]:
# clusters = 2
# labels = np.copy(reuters.Y)
# for i in range(1, clusters):
#     print(np.percentile(reuters.Y, 100*i/clusters))

In [None]:
# clusters = 2
# labels = np.copy(reuters.Y)
# labels[reuters.Y<np.percentile(reuters.Y, 100/clusters)] = 0
# for i in range(1, clusters):
#     labels[reuters.Y>np.percentile(reuters.Y, 100*i/clusters)] = i
# reuters['binY'] = labels.astype("int")

In [None]:
# unique, counts = np.unique(reuters.binY, return_counts=True)
# print(np.asarray((unique, counts)).T)

In [None]:
# reuters_bin_labels = to_categorical(reuters.binY)

In [None]:
# reuters_bin_labels

### Train/Val Split

In [None]:
# Report the dimensions of the model inputs and targets before splitting.
for caption, tensor in (('Shape of news tensor:', reuters_sequences),
                        ('Shape of label tensor:', reuters_labels)):
    print(caption, tensor.shape)
# print('Shape of binary label tensor:', reuters_bin_labels.shape)

In [None]:
# Draw the ~80/20 train/validation mask from a fixed-seed generator so the split
# is identical on every run. The training cells reload saved .h5 models and keep
# fitting them, so a split that changed between runs would put rows the model
# has already been trained on into the validation set.
SPLIT_SEED = 42
split_rng = np.random.RandomState(SPLIT_SEED)
mask = split_rng.rand(len(reuters)) < 0.8

train_X = reuters_sequences[mask]
train_Y = reuters_labels[mask]
# train_binY = reuters_bin_labels[mask]
val_X = reuters_sequences[~mask]
val_Y = reuters_labels[~mask]
# val_binY = reuters_bin_labels[~mask]

### Prepare Embedding Matrix

In [None]:
import urllib.request
import zipfile

# Download the GloVe archive only when neither the zip nor the extracted text
# file is already in the working directory.
zip_present = os.path.isfile('glove.42B.300d.zip')
txt_present = os.path.isfile('glove.42B.300d.txt')
if not (zip_present or txt_present):
    urllib.request.urlretrieve('http://nlp.stanford.edu/data/glove.42B.300d.zip',
                               os.path.join(os.getcwd(), 'glove.42B.300d.zip'))

# Unpack into the working directory unless the text file already exists.
if not txt_present:
    with zipfile.ZipFile("glove.42B.300d.zip", "r") as archive:
        archive.extractall(os.getcwd())

# Each line is "<word> <number> <number> ..."; split once on the first space
# and parse the remainder as the word's vector.
glove_wordmap = {}
with open('glove.42B.300d.txt', "r", encoding='utf8') as glove_file:
    for row in glove_file:
        token, coefficients = row.split(" ", 1)
        glove_wordmap[token] = np.fromstring(coefficients, sep=" ")

# def sentence2sequence(tokens):
#     global glove_wordmap
   
#     feature = np.zeros([0, 300])
#     for token in tokens:
#         try:
#             feature = np.vstack((feature, glove_wordmap[token]))
#         except:
#             pass
   
#     return feature

In [None]:
# Row r holds the pre-trained vector of the word whose tokenizer id is r.
# The matrix has MAX_NUM_WORDS rows so it matches the Embedding layer it
# initialises (input_dim=MAX_NUM_WORDS). Tokenizer ids start at 1, so the
# previous len(word_index)-row matrix could not hold the last id and the
# resulting IndexError was hidden by a bare `except`.
embedding_matrix = np.zeros((MAX_NUM_WORDS, 300))
# `count` = number of vocabulary words that have a GloVe vector at all
# (used in the next cell to report coverage over the full word_index).
count = 0
for word, i in word_index.items():
    word_vector = glove_wordmap.get(word)
    if word_vector is None:
        # words not found in embedding index will be all-zeros.
        continue
    count += 1
    # Only ids below MAX_NUM_WORDS have a row; skip (rather than crash on)
    # any vector whose length does not match the matrix width.
    if i < MAX_NUM_WORDS and word_vector.shape == (embedding_matrix.shape[1],):
        embedding_matrix[i] = word_vector

In [None]:
count/len(word_index)

In [None]:
embedding_matrix

In [None]:
# Model input: one row of MAX_SEQUENCE_LENGTH integer word ids per sample.
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')

# Embedding lookup initialised from the pre-trained matrix; trainable=False
# keeps the vectors fixed during training.
embedding_layer = Embedding(
    input_dim=MAX_NUM_WORDS,
    output_dim=300,
    embeddings_initializer=Constant(embedding_matrix),
    input_length=MAX_SEQUENCE_LENGTH,
    trainable=False,
)
embedded_sequences = embedding_layer(sequence_input)

### Neural Architecture (1D convnet with global maxpooling)

In [None]:
def CNN_1D_GMP(clusters=4):
    """Build and compile the 1D-convnet classifier with global max pooling.

    The network is wired onto the module-level ``embedded_sequences`` tensor
    and uses the module-level ``sequence_input`` as its input.

    Args:
        clusters: Number of units in the final softmax layer (one per class).

    Returns:
        A compiled Keras ``Model`` (categorical cross-entropy loss, RMSprop
        optimizer, accuracy metric).
    """
    # First convolution + pooling stage over the embedded tokens.
    features = Conv1D(128, 5, activation='relu')(embedded_sequences)
    features = MaxPooling1D(5)(features)
    # features = Dropout(0.2)(features)

    # Second stage, followed by the only active dropout.
    features = Conv1D(128, 5, activation='relu')(features)
    features = MaxPooling1D(5)(features)
    features = Dropout(0.2)(features)

    # Final convolution reduced to one vector per sample by global max pooling.
    features = Conv1D(128, 5, activation='relu')(features)
    features = GlobalMaxPooling1D()(features)
    # features = Dropout(0.2)(features)
    features = Dense(128, activation='relu')(features)

    class_probabilities = Dense(clusters, activation='softmax')(features)
    classifier = Model(sequence_input, class_probabilities)
    classifier.compile(loss='categorical_crossentropy',
                       optimizer='rmsprop',
                       metrics=['acc'])
    return classifier

### Neural Architecture (Bidirectional GRU)

In [None]:
def BI_DIR_GRU(clusters=4):
    """Build and compile the stacked bidirectional-GRU classifier.

    The network is wired onto the module-level ``embedded_sequences`` tensor
    and uses the module-level ``sequence_input`` as its input.

    Args:
        clusters: Number of units in the final softmax layer (one per class).

    Returns:
        A compiled Keras ``Model`` (categorical cross-entropy loss, RMSprop
        optimizer, accuracy metric).
    """
    # Two stacked bidirectional GRU layers, both emitting the full sequence.
    hidden = embedded_sequences
    for _ in range(2):
        hidden = Bidirectional(
            GRU(128, return_sequences=True, activation='relu'))(hidden)

    # Normalise, flatten the per-timestep outputs, then a dense head.
    hidden = BatchNormalization(axis=-1)(hidden)
    hidden = Flatten()(hidden)
    hidden = Dense(128, activation='relu')(hidden)
    hidden = BatchNormalization(axis=-1)(hidden)
    # hidden = Dropout(0.2)(hidden)
    # hidden = Dense(128,activation='relu')(hidden)
    # hidden = BatchNormalization(axis=-1)(hidden)

    class_probabilities = Dense(clusters, activation='softmax')(hidden)
    classifier = Model(sequence_input, class_probabilities)
    classifier.compile(loss='categorical_crossentropy',
                       optimizer='rmsprop',
                       metrics=['acc'])
    return classifier

### Train CNN_1D_GMP_bin

In [None]:
# model = CNN_1D_GMP(clusters=2)
# model = load_model('CNN_1D_GMP_bin.h5')
# model.fit(train_X, train_binY,
#           batch_size=128,
#           epochs=12,
#           validation_data=(val_X, val_binY))
# model.save('CNN_1D_GMP_bin.h5')

### Evaluate CNN_1D_GMP_bin

In [None]:
# predictions = np.argmax(model.predict(val_X), axis=-1)
# conf = confusion_matrix(np.argmax(val_binY, axis=-1), predictions)

In [None]:
# predictions

In [None]:
# np.argmax(val_binY, axis=-1)

In [None]:
# pd.DataFrame(conf,
#              index = [i for i in ['Sell', 'Buy']],
#              columns = [i for i in ['Sell', 'Buy']])

In [None]:
# conf = np.array(conf)
# for i in range(2):
#     print("Label %d Precision: %.2f%%" % (i, conf[i,i] * 100.0 / sum(conf[:,i])))

In [None]:
# from pandas_ml import ConfusionMatrix
# ConfusionMatrix(np.argmax(val_Y, axis=-1), predictions)

In [None]:
# matthews_corrcoef(np.argmax(val_binY, axis=-1), predictions)  

### Train CNN_1D_GMP

In [None]:
# Resume from the saved checkpoint when one exists; otherwise build a fresh
# model. Previously the first run crashed in load_model unless the
# commented-out constructor line was swapped in by hand.
cnn_checkpoint = 'CNN_1D_GMP.h5'
if os.path.isfile(cnn_checkpoint):
    model = load_model(cnn_checkpoint)
else:
    model = CNN_1D_GMP(clusters=4)
model.fit(train_X, train_Y,
          batch_size=128,
          epochs=6,
          validation_data=(val_X, val_Y))
# Persist the weights so the next run continues from here.
model.save(cnn_checkpoint)

### Evaluate CNN_1D_GMP

In [None]:
# Reduce the model outputs and the one-hot targets to class ids, then
# cross-tabulate them (first argument = true labels, second = predictions).
val_scores = model.predict(val_X)
predictions = np.argmax(val_scores, axis=-1)
val_truth = np.argmax(val_Y, axis=-1)
conf = confusion_matrix(val_truth, predictions)

In [None]:
predictions

In [None]:
np.argmax(val_Y, axis=-1)

In [None]:
# Display the confusion matrix with readable class names on both axes.
class_names = ['Strong Sell', 'Sell', 'Buy', 'Strong Buy']
pd.DataFrame(conf, index=class_names, columns=class_names)

In [None]:
conf = np.array(conf)
# Precision of label k = diagonal entry k divided by the total of column k.
predicted_totals = conf.sum(axis=0)
for label in range(4):
    precision_pct = conf[label, label] * 100.0 / predicted_totals[label]
    print("Label %d Precision: %.2f%%" % (label, precision_pct))

In [None]:
matthews_corrcoef(np.argmax(val_Y, axis=-1), predictions) 

### Train BI_DIR_GRU

In [None]:
# Resume from the saved checkpoint when one exists; otherwise build a fresh
# model. Previously the first run crashed in load_model unless the
# commented-out constructor line was swapped in by hand.
gru_checkpoint = 'BI_DIR_GRU.h5'
if os.path.isfile(gru_checkpoint):
    model = load_model(gru_checkpoint)
else:
    model = BI_DIR_GRU(clusters=4)
model.fit(train_X, train_Y,
          batch_size=128,
          epochs=6,
          validation_data=(val_X, val_Y))
# Persist the weights so the next run continues from here.
model.save(gru_checkpoint)

### Evaluate BI_DIR_GRU

In [None]:
# Reduce the model outputs and the one-hot targets to class ids, then
# cross-tabulate them (first argument = true labels, second = predictions).
val_scores = model.predict(val_X)
predictions = np.argmax(val_scores, axis=-1)
val_truth = np.argmax(val_Y, axis=-1)
conf = confusion_matrix(val_truth, predictions)

In [None]:
predictions

In [None]:
np.argmax(val_Y, axis=-1)

In [None]:
# Display the confusion matrix with readable class names on both axes.
class_names = ['Strong Sell', 'Sell', 'Buy', 'Strong Buy']
pd.DataFrame(conf, index=class_names, columns=class_names)

In [None]:
conf = np.array(conf)
# Precision of label k = diagonal entry k divided by the total of column k.
predicted_totals = conf.sum(axis=0)
for label in range(4):
    precision_pct = conf[label, label] * 100.0 / predicted_totals[label]
    print("Label %d Precision: %.2f%%" % (label, precision_pct))

In [None]:
matthews_corrcoef(np.argmax(val_Y, axis=-1), predictions) 

### Test

In [None]:
test = '''U.S. judge rules Qualcomm owes Apple nearly $1 billion rebate payment

A U.S. federal judge has issued a preliminary ruling that Qualcomm Inc owes Apple Inc nearly $1 billion in patent royalty rebate payments, though the decision is unlikely to result in Qualcomm writing a check to Apple because of other developments in the dispute.
'''

In [None]:
test = '''France's Monoprix working on expanding grocery alliance with Amazon

PARIS Casino's upmarket Monoprix supermarket chain is working to expand its partnership with E-commerce giant Amazon in France, following a successful launch in Paris, Monoprix's Chief Executive said on Thursday.
'''


In [None]:
test = '''Microsoft workers demand it drop $480 million U.S. Army contract

SAN FRANCISCO Some Microsoft Corp employees on Friday demanded that the company cancel a $480 million hardware contract to supply the U.S. Army, with 94 workers signing a petition calling on the company to stop developing "any and all weapons technologies."

'''

In [None]:
from nltk import word_tokenize
import pickle

# NOTE(review): pickle.load can execute arbitrary code; only load this file
# from a trusted source.
# The context manager closes the file handle, which the bare open() leaked.
with open('reuters_top_fin_words.pkl', "rb") as fin_words_file:
    top_fin_words = pickle.load(fin_words_file)

# Keep only the tokens that appear in the loaded word collection.
test_tokens = [w for w in word_tokenize(test.lower()) if w in top_fin_words]
# Each kept token is encoded separately and may yield zero or more ids;
# flatten them into one sequence. A plain list also handles the case where no
# token survives the filter, where np.concatenate raised ValueError.
test_ids = [word_id
            for token_ids in tokenizer.texts_to_sequences(test_tokens)
            for word_id in token_ids]
# NOTE: `test` is rebound from the raw text to the padded id matrix, so a new
# `test = '''...'''` cell must be run before executing this cell again.
test = pad_sequences([test_ids],
                     maxlen=MAX_SEQUENCE_LENGTH,
                     padding='post')
np.argmax(model.predict(test), axis=-1)