In [1]:
import tensorflow as tf
import numpy as np
from keras.models import Sequential, load_model
from keras.layers import Dense, Activation, Embedding, Dropout, TimeDistributed
from keras.layers import LSTM
from keras.optimizers import Adam
from keras.utils import to_categorical
from keras.callbacks import ModelCheckpoint
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import keras.utils as ku
import pandas as pd
import collections
import sqlite3
import re
import os
import sys

# Lift the pandas display column-width limit (presumably so full tweet text is visible when frames are shown).
# NOTE(review): -1 is the legacy "no limit" value; newer pandas releases expect None here — confirm against the installed pandas version before changing.
pd.set_option('display.max_colwidth', -1)

Using TensorFlow backend.


In [2]:
# Load every row of the tweet table into a DataFrame.
sqlite_file = '../../data/database/deeplearning.sqlite'
table_name  = 'donald'
# Table names cannot be bound as SQL parameters, so the name is formatted in;
# it is a constant defined right above, not user input.
q    ='SELECT * FROM {};'.format(table_name)
cnxn = sqlite3.connect(sqlite_file)
try:
    data = pd.read_sql_query(q, cnxn)
finally:
    # The query result is fully materialised in `data`, so release the
    # database handle instead of leaving it open for the kernel's lifetime.
    cnxn.close()
def strip_links(txt):
    """Remove twitter.com links and generic http/https/ftp URLs from ``txt``.

    Parameters
    ----------
    txt : str
        Raw text to clean.

    Returns
    -------
    str
        ``txt`` with link tokens removed. Surrounding text is left in place,
        so removing a link from the middle of a sentence can leave two
        adjacent spaces behind.
    """
    # Sub-domain style links such as "pic.twitter.com/abc". Only the link token
    # itself (plus one trailing whitespace character, if any) is consumed; the
    # previous greedy ".*\s" swallowed every word up to the last whitespace on
    # the line.
    txt = re.sub(r'(?:\w+|\.+|/+)\.twitter\.com/\S*\s?', '', txt, flags=re.I)
    # Remaining twitter.com references, with or without a path.
    txt = re.sub(r'(?:\w+|\@\w+|\#\w+|\s).twitter\.com\/\w*', '', txt, flags=re.I)
    txt = re.sub(r'(?:\w+|\@\w+|\#\w+|\s).twitter\.com\w*', '', txt, flags=re.I)
    # Generic scheme-prefixed URLs.
    return re.sub(
        r'(?:http|ftp|https)://(?:[\w_-]+(?:(?:\.[\w_-]+)+))(?:[\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?',
        '',
        txt,
    )

def strip_whitespace(txt):
    """Trim surrounding spaces, drop space-prefixed hyphens and collapse space runs.

    Parameters
    ----------
    txt : str
        Text to normalise.

    Returns
    -------
    str
        ``txt`` with leading/trailing spaces removed, every " - " or " -"
        removed (replaced by nothing), and runs of spaces collapsed to a
        single space. Only the space character is handled; tabs and newlines
        are left as they are.
    """
    txt = txt.strip(' ')
    # Raw string: "\-" is not a valid Python string escape, so the non-raw
    # literal triggers an invalid-escape warning on current interpreters.
    txt = re.sub(r'( \- | \-)', '', txt)
    return re.sub(r' +', ' ', txt)

def strip_metachar(txt):
    """Drop every character that is not an ASCII letter, a digit, '@', '#' or a space."""
    disallowed = re.compile(r"[^a-zA-Z0-9\@\# ]+")
    return disallowed.sub('', txt)

def strip_ats(txt):
    """Remove every '@' or '#' marker together with the word characters that follow it."""
    mention_or_tag = r'(\@|\#)\w*'
    cleaned = re.sub(mention_or_tag, '', txt)
    return cleaned

def detect_empty(txt):
    """Map the empty string to NaN; return any other value unchanged."""
    return np.nan if txt == '' else txt

# Run the raw text through each cleaning step in order; empty results end up as NaN.
data['CleanText'] = (
    data['Text']
    .apply(strip_links)
    .apply(strip_ats)
    .apply(strip_metachar)
    .apply(strip_whitespace)
    .apply(str.lower)
    .apply(detect_empty)
)
# Whitespace-only cells anywhere in the frame also count as missing.
data = data.replace(r'(^\s+$)', np.nan, regex=True)
print(len(data))
# Keep only rows that still have usable cleaned text; the two printed counts
# show how many rows were dropped.
data = data.dropna(subset=['CleanText'])
print(len(data))

499
499


In [4]:
# Module-level tokenizer shared across cells: get_sequence_of_tokens fits it on the
# corpus, and generate_text reuses its vocabulary to encode seeds and decode predictions.
tokenizer = Tokenizer()

def get_sequence_of_tokens(corpus):
    """Fit the shared tokenizer on ``corpus`` and expand each text into n-gram prefixes.

    Every text is encoded as a list of token ids, and each prefix of length
    two or more of that list becomes one sequence. Texts that encode to fewer
    than two tokens contribute nothing.

    Returns a tuple ``(input_sequences, total_words)`` where ``total_words``
    is the size of the tokenizer's word index plus one.
    """
    tokenizer.fit_on_texts(corpus)
    vocabulary_size = len(tokenizer.word_index) + 1
    encoded_texts = tokenizer.texts_to_sequences(corpus)
    prefixes = [
        token_ids[:end]
        for token_ids in encoded_texts
        for end in range(2, len(token_ids) + 1)
    ]
    return (prefixes, vocabulary_size)

# Build the n-gram training sequences from the cleaned text and preview the first ten.
corpus = data['CleanText'].tolist()
inp_sequences, total_words = get_sequence_of_tokens(corpus)
inp_sequences[:10]

[[555, 181],
 [555, 181, 8],
 [555, 181, 8, 556],
 [555, 181, 8, 556, 557],
 [251, 8],
 [251, 8, 39],
 [251, 8, 39, 558],
 [251, 8, 39, 558, 40],
 [251, 8, 39, 558, 40, 13],
 [251, 8, 39, 558, 40, 13, 6]]

In [13]:
def generate_padded_sequences(input_sequences, num_classes=None):
    """Left-pad the sequences to a common length and split them into inputs and labels.

    Parameters
    ----------
    input_sequences : list of list of int
        Token-id sequences of varying length.
    num_classes : int, optional
        Width of the one-hot label vectors. When omitted, the notebook-level
        ``total_words`` is used, which matches the previous behaviour.

    Returns
    -------
    tuple
        ``(predictors, label, max_sequence_len)``: all tokens but the last of
        every padded sequence, the one-hot encoded last token, and the padded
        sequence length.

    Raises
    ------
    ValueError
        If ``input_sequences`` is empty.
    """
    if len(input_sequences) == 0:
        raise ValueError("input_sequences is empty; nothing to pad")
    if num_classes is None:
        # Fall back to the vocabulary size computed when the tokenizer was fitted.
        num_classes = total_words

    max_sequence_len = max(len(seq) for seq in input_sequences)
    padded = np.asarray(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))

    # The final token of each sequence is the prediction target; the rest is the input.
    predictors, label = padded[:, :-1], padded[:, -1]
    label = ku.to_categorical(label, num_classes=num_classes)
    return (predictors, label, max_sequence_len)

# Pad the n-gram sequences to a common length and split each one into its
# predictor tokens and a one-hot label for the final token.
predictors, label, max_sequence_len = generate_padded_sequences(inp_sequences)

In [33]:
def create_model(max_sequence_len, total_words, embedding_dim=24,
                 lstm_units=(256, 1024), dropout_rate=0.2):
    """Build and compile the next-word prediction network.

    Parameters
    ----------
    max_sequence_len : int
        Length of the padded sequences including the label token; the model
        input length is one less.
    total_words : int
        Vocabulary size; used for the embedding input dimension and for the
        width of the softmax output.
    embedding_dim : int, optional
        Size of the embedding vectors.
    lstm_units : tuple of two ints, optional
        Units of the first (sequence-returning) and second LSTM layers.
    dropout_rate : float, optional
        Dropout applied after the second LSTM.

    Returns
    -------
    keras.models.Sequential
        Model compiled with categorical cross-entropy and the Adam optimizer.
    """
    input_len = max_sequence_len - 1
    first_units, second_units = lstm_units

    model = Sequential()
    model.add(Embedding(total_words, embedding_dim, input_length=input_len))
    model.add(LSTM(first_units, return_sequences=True))
    # The second LSTM returns only its final state, i.e. a 2-D tensor.
    model.add(LSTM(second_units))
    model.add(Dropout(dropout_rate))
    # One prediction per sequence: a plain Dense layer is required here.
    # TimeDistributed needs an input with a time axis (at least 3-D), which the
    # layer above does not provide, and the labels are one vector per sample.
    model.add(Dense(total_words, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam')
    return model

# Build the network for this sequence length and vocabulary size, then print its layer summary.
model = create_model(max_sequence_len, total_words)
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_12 (Embedding)     (None, 24, 24)            25056     
_________________________________________________________________
lstm_23 (LSTM)               (None, 24, 256)           287744    
_________________________________________________________________
lstm_24 (LSTM)               (None, 1024)              5246976   
_________________________________________________________________
dense_9 (Dense)              (None, 1044)              1070100   
Total params: 6,629,876
Trainable params: 6,629,876
Non-trainable params: 0
_________________________________________________________________


In [34]:
# Train on the padded predictors and one-hot labels for up to 100 epochs.
# The History object returned by fit() is the cell's displayed value.
model.fit(predictors, label, epochs=100, verbose=2)

Epoch 1/100
 - 13s - loss: 6.3077
Epoch 2/100
 - 11s - loss: 5.9472
Epoch 3/100
 - 11s - loss: 5.7258
Epoch 4/100
 - 18s - loss: 5.3497
Epoch 5/100
 - 13s - loss: 4.8522
Epoch 6/100
 - 12s - loss: 4.1485
Epoch 7/100
 - 11s - loss: 3.3428
Epoch 8/100
 - 12s - loss: 2.5462
Epoch 9/100
 - 12s - loss: 1.8550
Epoch 10/100
 - 12s - loss: 1.3643
Epoch 11/100
 - 12s - loss: 1.0397
Epoch 12/100
 - 12s - loss: 0.8521
Epoch 13/100
 - 12s - loss: 0.7420
Epoch 14/100
 - 12s - loss: 0.6605
Epoch 15/100
 - 12s - loss: 0.6095
Epoch 16/100
 - 12s - loss: 0.5668
Epoch 17/100
 - 12s - loss: 0.5454
Epoch 18/100
 - 12s - loss: 0.5118
Epoch 19/100
 - 11s - loss: 0.4880
Epoch 20/100
 - 12s - loss: 0.4785
Epoch 21/100
 - 12s - loss: 0.4745
Epoch 22/100
 - 12s - loss: 0.4567
Epoch 23/100
 - 12s - loss: 0.4462
Epoch 24/100
 - 12s - loss: 0.4241
Epoch 25/100
 - 12s - loss: 0.4085
Epoch 26/100
 - 12s - loss: 0.4044
Epoch 27/100
 - 12s - loss: 0.3957
Epoch 28/100
 - 12s - loss: 0.3838
Epoch 29/100
 - 12s - loss: 0

<keras.callbacks.History at 0x7fcfdd359080>

In [35]:
def generate_text(seed_text, next_words, model, max_sequence_len):
    """Extend ``seed_text`` by repeatedly appending the model's most likely next word.

    Parameters
    ----------
    seed_text : str
        Starting text; encoded with the notebook-level ``tokenizer``.
    next_words : int
        Number of words to append.
    model : keras model
        Trained model whose ``predict`` returns one score per vocabulary index.
    max_sequence_len : int
        Padded training sequence length; the model input is one token shorter.

    Returns
    -------
    str
        The seed followed by the generated words, title-cased.
    """
    # Build the id -> word lookup once instead of scanning the whole vocabulary
    # for every generated word.
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}

    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
        # predict_classes is not available on current Keras models; taking the
        # argmax of predict() over the last axis yields the same class index.
        scores = model.predict(token_list, verbose=0)
        predicted = int(np.argmax(scores, axis=-1)[0])

        # An index with no vocabulary entry (e.g. the padding id 0) contributes
        # an empty word, as before.
        output_word = index_to_word.get(predicted, "")
        seed_text += " "+output_word
    return seed_text.title()

In [36]:
# (seed text, number of words to append) pairs used to sample the trained model.
seed_prompts = [
    ("big data", 8),
    ("analytics", 10),
    ("business", 10),
    ("hr", 15),
    ("the future", 10),
    ("new", 10),
    ("success", 10),
    ("next generation of", 10),
]
for seed, word_count in seed_prompts:
    print(generate_text(seed, word_count, model, max_sequence_len))

Big Data Thinking Myths Debunked That More Doesnt Their Dust
Analytics Experiences Building Relationships One Conversation At A Time About Years
Business Success Factors For Each Phase Of Digital Transformation And Where
Hr And Security The New Bffs On The Block Automation For
The Future Of Work In The Age Of Ai Policy World On
New Technology New Rules Reimagining The Modern Finance Workforce Via Via
Success Technology New Rules Reimagining The Modern Finance Workforce Via Via
Next Generation Of Of Make In In To A World Wave Years Hype


In [43]:
# Generate a longer continuation (29 words) from a single seed word.
print (generate_text("Analysis", 29, model, max_sequence_len))

Analysis Vendors Is The New Currency And Other Insights From People Leaders Leaders Do To Prepare In Accuracy The World About Years 2020 Will People To 10 Innovators Csuite Execs
