In [25]:
#importing dependencies

import numpy
import sys
import nltk
nltk.download('stopwords')
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM
from keras.utils import np_utils
from tensorflow.keras.callbacks import ModelCheckpoint

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
# load the data

# Read the whole headlines file into memory. The context manager closes the
# file handle as soon as the read finishes instead of leaving it open until
# the object happens to be garbage-collected.
with open("cnbc_headlines.csv") as headlines_file:
    file = headlines_file.read()

In [7]:
# lowercase the text, split it into word tokens, and drop English stopwords

def tokenize_words(input):
    """Lowercase ``input``, tokenize it, and remove English stopwords.

    Args:
        input: Raw text to normalise.

    Returns:
        A single string containing the surviving tokens joined by one space.
    """
    input = input.lower()
    # r'\w+' keeps runs of word characters, so punctuation is discarded.
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(input)
    # Fetch the stopword list once (not once per token) and hold it in a set
    # so each membership check is a constant-time lookup.
    stop_set = set(stopwords.words('english'))
    filtered = (token for token in tokens if token not in stop_set)
    return " ".join(filtered)

In [10]:
# Preprocess the raw file contents with tokenize_words; the result is one
# long string of space-separated tokens that every later cell works from.
processed_inputs = tokenize_words(file)

In [11]:
# Vocabulary: every distinct character of the cleaned text in sorted order,
# plus a lookup from each character to its position in that ordering.
chars = sorted(set(processed_inputs))
char_to_num = {c: i for i, c in enumerate(chars)}

In [12]:
# Corpus length in characters and number of distinct characters.
input_len, vocab_len = len(processed_inputs), len(chars)
print("Total number of characters:", input_len)
print("Total vocab:", vocab_len)

Total number of characters: 509007
Total vocab: 39


In [13]:
# Number of characters in each input window, and empty containers for the
# encoded windows (x_data) and their next-character targets (y_data).
seq_length = 100
x_data, y_data = [], []

In [14]:
# Encode the corpus to integer ids once, then cut the sliding windows out of
# that list. Both lists are rebuilt from scratch here, so re-running this
# cell cannot append a second copy of every pattern.
encoded = [char_to_num[char] for char in processed_inputs]

# x_data[i] holds the ids of the seq_length characters starting at position i.
# Each slice is its own list, so the windows do not share storage.
x_data = [encoded[i:i + seq_length] for i in range(0, input_len - seq_length, 1)]

# y_data[i] is the id of the character that immediately follows window i.
y_data = encoded[seq_length:input_len]

In [15]:
# Number of (input window, next character) training pairs that were built.
n_patterns = len(x_data)
print ("Total Patterns:", n_patterns)

Total Patterns: 508907


In [16]:
# Arrange the windows as (patterns, timesteps, 1 feature) and divide the ids
# by the vocabulary size so every value falls in [0, 1).
X = numpy.reshape(x_data, (n_patterns, seq_length, 1)) / float(vocab_len)

In [17]:
# One-hot encode the targets. num_classes is pinned to the vocabulary size so
# the output width always matches char_to_num / num_to_char, even if the
# highest-indexed character never occurs as a target.
y = np_utils.to_categorical(y_data, num_classes=vocab_len)

In [26]:
# creating the model

# Three stacked LSTM layers (256, 256, 128 units), each followed by 20%
# dropout, ending in a softmax over the one-hot target width. The first two
# LSTMs return full sequences so the next LSTM receives one vector per step.
model = Sequential([
    LSTM(256, input_shape=X.shape[1:], return_sequences=True),
    Dropout(0.2),
    LSTM(256, return_sequences=True),
    Dropout(0.2),
    LSTM(128),
    Dropout(0.2),
    Dense(y.shape[1], activation='softmax'),
])

In [27]:
# Categorical cross-entropy matches the one-hot targets in y; Adam is used
# with its default settings.
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [28]:
# Checkpoint callback: monitors the training loss (mode='min') and, because
# save_best_only=True, writes the model to `filepath` only on an improvement.
filepath = "model_weights_saved.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')
desired_callbacks = [checkpoint]

In [29]:
# Train for 4 epochs in batches of 256, with the checkpoint callback active.
# No validation data is passed, so the only metric reported is training loss.
model.fit(X, y, epochs=4, batch_size=256, callbacks=desired_callbacks)

Epoch 1/4
Epoch 00001: loss improved from inf to 2.59788, saving model to model_weights_saved.hdf5
Epoch 2/4
Epoch 00002: loss improved from 2.59788 to 2.05652, saving model to model_weights_saved.hdf5
Epoch 3/4
Epoch 00003: loss improved from 2.05652 to 1.88972, saving model to model_weights_saved.hdf5
Epoch 4/4
Epoch 00004: loss improved from 1.88972 to 1.75549, saving model to model_weights_saved.hdf5


<tensorflow.python.keras.callbacks.History at 0x7f9a037e7b00>

In [30]:
# Reload the weights from the checkpoint file written during training, then
# compile again with the same loss and optimizer before generating text.
filename = "model_weights_saved.hdf5"
model.load_weights(filename)
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [31]:
# Inverse of char_to_num: maps an integer id back to its character.
num_to_char = dict(enumerate(chars))

In [32]:
# Pick a random training window to seed generation. randint's upper bound is
# exclusive, so passing len(x_data) lets every pattern, including the last
# one, be selected.
start = numpy.random.randint(0, len(x_data))
# Work on a copy so later changes to `pattern` never alter the training data.
pattern = list(x_data[start])
print("Random Seed:")
print("\"", ''.join([num_to_char[value] for value in pattern]), "\"")

Random Seed:
" stores weed cramer floats idea gloomy etf wall street downtrodden stocks 10 59 et tue 19 june 2018 j "


In [33]:
# Generate 1000 characters one at a time. Each step feeds the current window
# to the model, emits the most probable next character (greedy argmax), and
# slides the window forward by one position.
for i in range(1000):
    # Same shaping and scaling that was applied to the training inputs.
    x = numpy.reshape(pattern, (1, len(pattern), 1))
    x = x / float(vocab_len)
    prediction = model.predict(x, verbose=0)
    # Convert the numpy scalar to a plain int so the window stays a list of
    # ordinary Python integers.
    index = int(numpy.argmax(prediction))
    result = num_to_char[index]

    sys.stdout.write(result)

    # Drop the oldest id and add the new one by building a fresh list, so the
    # list this window was originally taken from is never modified in place.
    pattern = list(pattern[1:]) + [index]

im cramer says cramer lightning round bell means giving answers callers stock questions rapid speed cramer remix cramer remix cramer remix cramer remix cramer remix cramer remix cramer remix cramer remix cramer remix cramer remix cramer remix cramer remix cramer remix cramer remix cramer remix cramer remix cramer remix cramer remix cramer remix cramer remix cramer remix cramer remix cramer remix cramer remix cramer remix cramer remix cramer remix cramer remix cramer remix cramer remix cramer remix cramer remix cramer remix cramer remix cramer remix cramer remix cramer remix cramer remix cramer remix cramer remix cramer remix cramer remix cramer remix cramer remix cramer remix cramer remix cramer remix cramer remix cramer remix cramer remix cramer remix cramer remix cramer remix cramer remix cramer remix cramer remix cramer remix cramer remix cramer remix cramer remix cramer remix cramer remix cramer remix cramer remix cramer remix cramer remix cramer remix cramer remix cramer remix cra