In [0]:
import re
import sqlite3
import numpy as np
import pandas as pd
import tensorflow as tf
from keras.optimizers import Adam
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.layers import Embedding, LSTM, Dense, Dropout
from keras.callbacks import EarlyStopping, TensorBoard
from keras.models import Sequential
import keras.utils as ku 
import numpy as np 


""" Custom Libs """
import Cleaner as c
from embedding import ngram, generate_text_ngram

Using TensorFlow backend.


In [0]:
#Hyperparameters

epochs = 100          # Epoch setting; NOTE: create_model does not read this name, it passes its own epoch count to fit
hidden_1 = 200        # Units of the first LSTM layer in create_model
hidden_2 = 150        # Units of the second LSTM layer in create_model
embedding_layer = 64  # Output dimension of the Embedding layer in create_model
dropout_1 = 0.1       # Dropout rate applied between the two LSTM layers
seed = 0 #For reproducibility

# Seed every RNG the training run draws from. Seeding NumPy alone does not
# cover the TensorFlow backend that Keras runs on in this notebook.
np.random.seed(seed)
if hasattr(tf, "random") and hasattr(tf.random, "set_seed"):
    tf.random.set_seed(seed)  # TensorFlow 2.x API
else:
    tf.set_random_seed(seed)  # TensorFlow 1.x API

In [0]:
#Import Data
sqlite_file = 'deeplearning.sqlite'
table_name  = 'tweets'
cd          = c.CleanData(sqlite_file, table_name)
# The author name is a string literal, so it is single-quoted. SQLite treats a
# double-quoted token as an identifier first and only falls back to a string
# as a legacy quirk (which can be disabled at build time).
q           = "SELECT * FROM {} WHERE Author == 'simonlporter';".format(table_name)

cd.set_table(q)
# Keep only the cleaned text column as an array of strings
# (presumably one cleaned tweet per entry; verify against Cleaner.CleanData).
raw_data = cd.get_clean_table().CleanText.values

# Build the newline-terminated corpus in a single pass. Repeated `+=` on a
# string re-copies the accumulated text on every iteration.
data = ''.join(text + "\n" for text in raw_data)

In [0]:
def create_model(predictors, label, max_sequence_len, total_words, n_epochs=200):
    """Build, train and return the stacked-LSTM next-word model.

    Architecture: Embedding -> LSTM (returning sequences) -> Dropout -> LSTM ->
    Dense softmax over ``total_words`` classes. Layer sizes come from the
    notebook-level hyperparameters ``embedding_layer``, ``hidden_1``,
    ``hidden_2`` and ``dropout_1``.

    Args:
        predictors: Training inputs handed to ``model.fit``. The Embedding layer
            is declared with ``input_length=max_sequence_len - 1``, so each row
            is expected to have that many token ids.
        label: Training targets handed to ``model.fit``. The loss is
            categorical cross-entropy, so presumably one-hot rows of width
            ``total_words`` (as ``ngram`` produces) -- confirm for other callers.
        max_sequence_len: Length of the longest n-gram sequence, label included.
        total_words: Vocabulary size used for the Embedding input dimension and
            the output layer width.
        n_epochs: Upper bound on training epochs. Defaults to 200, the value
            previously hard-coded here. NOTE: the notebook-level ``epochs``
            hyperparameter is not consulted; pass ``n_epochs=epochs`` to apply it.

    Returns:
        The fitted ``Sequential`` model.
    """
    model = Sequential()
    model.add(Embedding(total_words, embedding_layer, input_length=max_sequence_len - 1))
    model.add(LSTM(hidden_1, return_sequences=True))
    model.add(Dropout(dropout_1))
    model.add(LSTM(hidden_2))
    model.add(Dense(total_words, activation='softmax'))

    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    # No validation data is passed to fit, so early stopping watches the
    # training loss and stops after 5 epochs without improvement.
    earlystop = EarlyStopping(monitor='loss', min_delta=0, patience=5, verbose=0, mode='min')
    model.fit(predictors, label, epochs=n_epochs, verbose=2, callbacks=[earlystop])

    # summary() prints the table itself and returns None, so wrapping it in
    # print() would emit a stray "None" line.
    model.summary()
    return model


In [0]:
# Turn the corpus into training arrays, then fit the network on them.
# In top-to-bottom order, `ngram` here is still the function imported from
# `embedding`; the notebook-local definition only appears in a later cell.
training_arrays = ngram(data)
predictors, label, max_sequence_len, total_words = training_arrays
model = create_model(predictors, label, max_sequence_len, total_words)

Epoch 1/200
 - 29s - loss: 6.3031 - acc: 0.0395
Epoch 2/200
 - 27s - loss: 5.9646 - acc: 0.0447
Epoch 3/200
 - 27s - loss: 5.8761 - acc: 0.0426
Epoch 4/200
 - 27s - loss: 5.6902 - acc: 0.0486
Epoch 5/200
 - 28s - loss: 5.5299 - acc: 0.0641
Epoch 6/200
 - 27s - loss: 5.3672 - acc: 0.0727
Epoch 7/200
 - 27s - loss: 5.2061 - acc: 0.0799
Epoch 8/200
 - 27s - loss: 5.0552 - acc: 0.0943
Epoch 9/200
 - 27s - loss: 4.9071 - acc: 0.1019
Epoch 10/200
 - 27s - loss: 4.7726 - acc: 0.1139
Epoch 11/200
 - 27s - loss: 4.6373 - acc: 0.1173
Epoch 12/200
 - 27s - loss: 4.5042 - acc: 0.1333
Epoch 13/200
 - 27s - loss: 4.3651 - acc: 0.1531
Epoch 14/200
 - 27s - loss: 4.2299 - acc: 0.1663
Epoch 15/200
 - 27s - loss: 4.0928 - acc: 0.1881
Epoch 16/200
 - 27s - loss: 3.9689 - acc: 0.2056
Epoch 17/200
 - 27s - loss: 3.8479 - acc: 0.2264
Epoch 18/200
 - 16s - loss: 3.7281 - acc: 0.2402
Epoch 19/200
 - 15s - loss: 3.6134 - acc: 0.2527
Epoch 20/200
 - 15s - loss: 3.5121 - acc: 0.2651
Epoch 21/200
 - 15s - loss: 3

In [0]:
tokenizer = Tokenizer()
def ngram(data):
    
    # basic cleanup
    corpus = data.lower().split("\n") #Lower to reduce vocab size

    # tokenization
    tokenizer.fit_on_texts(corpus) #Creates a dictionary of all words and its index, {}'the' : 1, 'to': 2, ...} in tokenizer.word_index
    total_words = len(tokenizer.word_index) + 1 #Embedding layer expects input_dim to be vocabulary size + 1

    # create input sequences using list of tokens
    input_sequences = []
    for line in corpus:
        token_list = tokenizer.texts_to_sequences([line])[0]
        for i in range(1, len(token_list)):
            n_gram_sequence = token_list[:i+1]
            input_sequences.append(n_gram_sequence)

    #The above basially creates a list of integers representing each sentence from n = 2 to n = len(sentence)

    # pad sequences 
    max_sequence_len = max([len(x) for x in input_sequences])
    input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))

    # create predictors and label
    predictors, label = input_sequences[:,:-1],input_sequences[:,-1]
    label = ku.to_categorical(label, num_classes=total_words)

    return predictors, label, max_sequence_len, total_words

# Run the notebook-local ngram so the module-level `tokenizer` defined above is
# fitted on `data`; generate_text encodes and decodes words through that same
# tokenizer. This also rebinds predictors, label, max_sequence_len and total_words.
predictors, label, max_sequence_len, total_words = ngram(data)

In [0]:
from keras.preprocessing.text import Tokenizer

def generate_text(seed_text, next_words, max_sequence_len, model):
    """Greedily extend ``seed_text`` by ``next_words`` predicted words.

    On each step the current text is encoded with the module-level
    ``tokenizer``, left-padded (or truncated) to ``max_sequence_len - 1``
    tokens, and the class with the highest predicted score is appended as the
    next word.

    Args:
        seed_text: Starting text.
        next_words: Number of words to append.
        max_sequence_len: Sequence length used at training time, label
            included; the model input is one token shorter.
        model: Object whose ``predict(x, verbose=0)`` returns one row of
            per-class scores for each input row.

    Returns:
        ``seed_text`` followed by the generated words, space separated. A
        predicted index with no vocabulary entry (e.g. 0) contributes an empty
        word, which leaves only the separating space.
    """
    # Invert the vocabulary once instead of scanning word_index for every
    # generated word.
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}

    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len - 1, padding='pre')
        # argmax over predict() replaces Sequential.predict_classes, which
        # newer Keras/TensorFlow releases no longer provide. Taking [0] gives a
        # plain int for the single input row instead of a length-1 array.
        scores = model.predict(token_list, verbose=0)
        predicted = int(np.argmax(scores, axis=-1)[0])

        seed_text += " " + index_to_word.get(predicted, "")
    return seed_text

In [0]:
# Generate one sample per seed phrase. The number of words requested equals
# max_sequence_len, exactly as in the individual calls this loop replaces.
for seed_phrase in ("cloud", "amazon", "google", "trend", "institute", "data science"):
    print(generate_text(seed_phrase, max_sequence_len, max_sequence_len, model))

cloud vendors on the rise in 2019 and beyond the world of data science and go straight to the source are anything but where are anything
amazon your boss is now more likely to train you up thanks to a dwindling talent pool more time operating exit outcomes via feeling can job
google your boss is now more likely to train you up thanks to a dwindling talent pool more time operating exit outcomes via feeling can job
trend of the future will need these 5 skills culture of the next 5 years ago at being ai for the best of all that ai
institute a key factor between success and failure study suggests simonlporter at work today than 5 years ago at being human models should know data successful
data science and robotics the next big area of study than 5 years ago at the answer are at thanks to a fourday week job your job
