<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Modelling" data-toc-modified-id="Modelling-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Modelling</a></span></li><li><span><a href="#Basic-MLP" data-toc-modified-id="Basic-MLP-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Basic MLP</a></span></li></ul></div>

In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.preprocessing import MultiLabelBinarizer, StandardScaler
from sklearn.model_selection import train_test_split
from tensorflow import keras
from tensorflow.python.client import device_lib
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Dense, LSTM, Embedding, Bidirectional, GRU
from tensorflow.keras.layers import Flatten, Dropout, Input, concatenate, BatchNormalization
from tensorflow.keras import backend as K

import re
import gc
from ast import literal_eval

print(device_lib.list_local_devices())

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 16041283509116965127
, name: "/device:GPU:0"
device_type: "GPU"
memory_limit: 6672629760
locality {
  bus_id: 1
  links {
  }
}
incarnation: 10095192725992032046
physical_device_desc: "device: 0, name: GeForce GTX 1070 Ti, pci bus id: 0000:01:00.0, compute capability: 6.1"
]


In [2]:
price_df = pd.read_csv("../inputs/preprocessed_news.csv")

In [3]:
cols = ['price_change_short', "price_change_mid", "price_change_long"]
labels = price_df['signal']
labels = pd.get_dummies(columns=['signal'], data=labels)
price_feat_data = price_df[cols]
aux_shape = len(price_feat_data.columns)
docs = price_df["combined_tokens"].apply(literal_eval)

In [4]:
print(aux_shape)

3


In [5]:
price_df['combined_len'].describe()

count    6660.000000
mean       25.458709
std         8.469920
min         0.000000
25%        19.000000
50%        26.000000
75%        30.000000
max        71.000000
Name: combined_len, dtype: float64

In [6]:
max_words = 40

In [7]:
def load_embedding(filename):
    # load embedding into memory, skip first line
    print("Loading Glove pre-trained model")
    # create a map of words to vectors
    embedding = dict()
    with open(filename, "r", encoding="utf-8") as file:
        for line in file:
            parts = line.split()
            word = parts[0]
            vector = np.array(parts[1:], dtype="float32")
            # key is string word, value is numpy array for vector
            embedding[word] = vector
    
    return embedding


def tokenize_and_pad(docs, max_words=max_words):
    
    t = Tokenizer()
    t.fit_on_texts(docs)
    docs = pad_sequences(sequences=t.texts_to_sequences(docs), maxlen=max_words, padding='post')
    
    vocab = t.word_index
    
    return docs, vocab


# create a weight matrix for the Embedding layer from a loaded embedding
def get_weight_matrix(embedding, vocab, embed_dim=100):
    # total vocabulary size plus 0 for unknown words
    words_not_found = []
    
    vocab_size = len(vocab) + 1
    # define weight matrix dimensions with all 0
    weight_matrix = np.zeros((vocab_size, embed_dim))
    # step vocab, store vectors using the Tokenizer's integer mapping
    for word, i in vocab.items():
        embedding_vector = embedding.get(word)
        if embedding_vector is not None:
            weight_matrix[i] = embedding_vector
        else:
            words_not_found.append(word)
            
    return weight_matrix, words_not_found


# # get vectors in the right order
# # embedding_vectors = get_weight_matrix(raw_embedding, t.word_index)

In [8]:
docs, vocab = tokenize_and_pad(docs)

In [9]:
embedding_index = load_embedding("../inputs/glove.6B.100d.txt")
embedding_matrix, words_not_found = get_weight_matrix(embedding_index, vocab)
print(f"number of null embeddings {np.sum(np.sum(embedding_matrix, axis=1) == 0)}")

Loading Glove pre-trained model
number of null embeddings 2003


In [10]:
del embedding_index
gc.collect()

80

In [11]:
price_train, price_test, label_train, label_test, docs_train, docs_test = train_test_split(
    price_feat_data, labels, docs,
    stratify=labels,
    test_size=0.2,
    random_state=42)

In [12]:
aux_features = ['price_change_short', "price_change_mid", "price_change_long"]
scaler = StandardScaler()
price_train = scaler.fit_transform(price_train)
price_test = scaler.transform(price_test)

In [13]:
## define roc auc as our evaluation 

# https://stackoverflow.com/questions/41032551/how-to-compute-receiving-operating-characteristic-roc-and-auc-in-keras
def roc_auc(y_true, y_pred):
    value, update_op = tf.contrib.metrics.streaming_auc(y_pred, y_true)

    # find all variables created for this metric
    metric_vars = [i for i in tf.local_variables() if 'auc_roc' in i.name.split('/')[1]]

    # Add metric variables to GLOBAL_VARIABLES collection.
    # They will be initialized for new session.
    for v in metric_vars:
        tf.add_to_collection(tf.GraphKeys.GLOBAL_VARIABLES, v)

    # force to update metric values
    with tf.control_dependencies([update_op]):
        value = tf.identity(value)
        return value

## Modelling

## Basic MLP

In [14]:
def build_model()

SyntaxError: invalid syntax (<ipython-input-14-21987976bdcf>, line 1)