In [35]:
import re
import sqlite3
import numpy as np
import pandas as pd
import tensorflow as tf
from keras.optimizers import Adam
from keras.preprocessing.sequence import pad_sequences
from keras.regularizers import l2
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential, load_model
from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import ModelCheckpoint, TensorBoard
from keras.layers import Dense, Activation, Embedding, Dropout, TimeDistributed, LSTM
from keras.callbacks import EarlyStopping
from keras.models import Sequential
import keras.utils as ku
import numpy as np
from time import time

In [36]:
def fetch_profiles(filename, n):
    """Return the distinct entries found in the first ``n`` lines of a file.

    Args:
        filename: Path to a text file with one profile per line.
        n: Number of leading lines to consider.

    Returns:
        A list of the unique lines among the first ``n``, in the order they
        first appear in the file.
    """
    # The context manager closes the file even if reading raises.
    with open(filename, 'r') as f:
        profiles = f.read().splitlines()
    # dict.fromkeys de-duplicates like set() did, but keeps first-seen order
    # so repeated runs yield the same list.
    return list(dict.fromkeys(profiles[:n]))

In [39]:
""" Custom Libs """
import Cleaner as c

# Read tweets
profilename = '../../data/profiles.txt'
sqlite_file = '../../data/database/deeplearning.sqlite'
table_name = 'tweets'

# Take the first two profile lines and drop any '@' at either end of each.
profiles = [p.strip('@') for p in fetch_profiles(profilename, 2)]

cd = c.CleanData(sqlite_file, table_name)

# NOTE(review): the query is assembled by string formatting; this is only
# safe while the profiles file is trusted input.
q = 'SELECT * FROM {} WHERE AUTHOR IN ("{}");'.format(table_name, '", "'.join(profiles))
cd.set_table(q)

# Collapse the CleanText values into one string, each entry followed by a
# newline (including the last one).
data_2 = ''.join(x + "\n" for x in cd.get_clean_table().CleanText.values)
data = data_2

# Seed NumPy's global random generator.
np.random.seed(0)

# Shared tokenizer used by the preparation and generation functions.
tokenizer = Tokenizer()

In [40]:
# Display the newline-joined corpus string assembled in the previous cell.
data

'top 20 digital experts by via rt\nrescue mt\nthis is what will look like in 2027\nhow do we implement\n2018 trends ht\nthe revolution mt\nthe nature of and in supply and demand mt\nwhat are some advantages of blockchain mt\nprotecting your against and v\nthis cute is a s ultimate nemesis\ncool or creepy this is judging you\nintroducing earth 2018\n7 ways is transforming\nstate of penetration of in the\nin 4 years your will translate your to text\nflying trains two words we never thought wed hear together\nmoves to give social scores to reward or punish citizens ht\nreadies to your brain ht\nwhat are some interesting paths in mt\nthe 7 levels of the internet of things infographic\nwhat exactly are blockchains\nwhat are some federal usecases with\nupdated technology timeline to 2020\nis this the sports of the mt\nthe future of control centers mt\ncontrol everything with one hand\nthis startup wants to digitize your brainv ht\nwhat are the basic concepts of a\nexecuting digital transform

In [58]:
def dataset_preparation(data):
    """Turn a newline-separated corpus into padded n-gram training arrays.

    The module-level ``tokenizer`` is fitted on the lower-cased lines as a
    side effect. Every line contributes one sequence per token prefix of
    length two or more; the sequences are left-padded to a common length and
    the final token of each becomes its one-hot label.

    Args:
        data: Corpus text with one sample per line.

    Returns:
        Tuple ``(predictors, label, max_sequence_len, total_words)``.
    """
    # Basic cleanup: lower-case and split into lines.
    lines = data.lower().split("\n")

    # Fit the vocabulary; +1 because index 0 is used for padding.
    tokenizer.fit_on_texts(lines)
    total_words = len(tokenizer.word_index) + 1

    # Collect every prefix of length >= 2 from each tokenized line.
    input_sequences = []
    for tokens in tokenizer.texts_to_sequences(lines):
        input_sequences.extend(tokens[:end] for end in range(2, len(tokens) + 1))

    # Left-pad all prefixes to the length of the longest one.
    max_sequence_len = max(map(len, input_sequences))
    padded = pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')
    input_sequences = np.array(padded)
    print(input_sequences)

    # Everything but the last column is the input; the last column is the target.
    predictors = input_sequences[:, :-1]
    label = ku.to_categorical(input_sequences[:, -1], num_classes=total_words)

    return predictors, label, max_sequence_len, total_words

In [46]:
# Training schedule
epochs = 300
batch_sz = 64

# Regularisation
dropout = 0.1
l2_reg = 1e-4

# Optimizer settings (names mirror the arguments of the Adam optimizer)
learn_rate = 1e-3
beta_1 = 0.9
beta_2 = 0.999
epsilon = None
decay_rate = 0
amsgrad = False

# Run switch
run_model = True

In [47]:
def create_model(predictors, label, max_sequence_len, total_words):
    """Build and compile the stacked-LSTM next-word model and its callbacks.

    Reads the module-level settings ``dropout``, ``l2_reg``, ``learn_rate``,
    ``beta_1``, ``beta_2``, ``epsilon``, ``decay_rate`` and ``amsgrad``.

    Args:
        predictors: Not used by this function; kept so existing calls work.
        label: Not used by this function; kept so existing calls work.
        max_sequence_len: Length of the padded n-gram sequences. The network
            input is one token shorter, matching the predictors produced by
            ``dataset_preparation`` (which split off the final token).
        total_words: Vocabulary size; used as the embedding input dimension
            and the width of the softmax output.

    Returns:
        Tuple ``(model, checkpointer, tensorboard, earlystop)``.
    """
    model = Sequential()
    model.add(Embedding(total_words, 500, input_length=max_sequence_len - 1))
    model.add(LSTM(512, return_sequences=True))
    # Dropout between the two recurrent layers is optional.
    if dropout != 0:
        model.add(Dropout(dropout))
    model.add(LSTM(256))

    # The output layer only gets a bias regularizer when l2_reg is non-zero.
    output_kwargs = {'activation': 'softmax'}
    if l2_reg != 0:
        output_kwargs['bias_regularizer'] = l2(l2_reg)
    model.add(Dense(total_words, **output_kwargs))

    # Build the optimizer from the configured hyper-parameters; passing the
    # string 'adam' would silently ignore them and use library defaults.
    optimizer = Adam(lr=learn_rate, beta_1=beta_1, beta_2=beta_2,
                     epsilon=epsilon, decay=decay_rate, amsgrad=amsgrad)
    model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])

    # One checkpoint file per epoch under ./model.
    checkpointer = ModelCheckpoint(filepath='model/single-user-model-{epoch:02d}.hdf5', verbose=1)
    # Each run logs to its own timestamped TensorBoard directory.
    tensorboard = TensorBoard(log_dir='tb-logs/{}'.format(time()))
    # Stop once the training loss has not improved for 50 epochs.
    earlystop = EarlyStopping(monitor='loss', min_delta=0, patience=50, verbose=0, mode='min')
    return (model, checkpointer, tensorboard, earlystop)

In [48]:
def generate_text(seed_text, next_words, max_sequence_len):
    """Extend ``seed_text`` by repeatedly appending the predicted next word.

    Uses the module-level ``tokenizer`` and ``model``.

    Args:
        seed_text: Starting text.
        next_words: Number of prediction steps to run.
        max_sequence_len: Padded sequence length used when the training data
            was prepared; inputs are padded to one token less than this.

    Returns:
        The seed text followed by one space-separated word per step. A step
        whose predicted index has no vocabulary entry appends an empty word.
    """
    # Invert the vocabulary once instead of scanning it on every step.
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}

    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len - 1, padding='pre')
        predicted = model.predict_classes(token_list, verbose=0)

        # predict_classes returns an array for the batch; take the single
        # class id as a plain int rather than comparing against the array.
        predicted_index = int(np.asarray(predicted).ravel()[0])
        seed_text += " " + index_to_word.get(predicted_index, "")
    return seed_text

In [55]:
# Inspect the prepared arrays, the padded sequence length and the vocabulary size.
# NOTE(review): in the visible notebook these names are first assigned by the
# dataset_preparation(...) call in a later cell, so on a fresh kernel this cell
# only works after that cell has been run.
predictors, label, max_sequence_len, total_words

(array([[  0,   0,   0, ...,   0,   0,  26],
        [  0,   0,   0, ...,   0,  26, 647],
        [  0,   0,   0, ...,  26, 647,   9],
        ...,
        [  0,   0,   0, ...,  70,   8,  60],
        [  0,   0,   0, ...,   8,  60,   6],
        [  0,   0,   0, ...,  60,   6, 173]], dtype=int32),
 array([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]], dtype=float32),
 25,
 1223)

In [59]:
# Prepare the training arrays from the corpus string.
(predictors, label,
 max_sequence_len, total_words) = dataset_preparation(data)

# Build the network together with its training callbacks.
(model, checkpointer,
 tensorboard, earlystop) = create_model(predictors, label, max_sequence_len, total_words)

model.summary()

[[  0   0   0 ...   0  26 647]
 [  0   0   0 ...  26 647   9]
 [  0   0   0 ... 647   9 648]
 ...
 [  0   0   0 ...   8  60   6]
 [  0   0   0 ...  60   6 173]
 [  0   0   0 ...   6 173  31]]
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_10 (Embedding)     (None, 24, 500)           611500    
_________________________________________________________________
lstm_19 (LSTM)               (None, 24, 512)           2074624   
_________________________________________________________________
dropout_10 (Dropout)         (None, 24, 512)           0         
_________________________________________________________________
lstm_20 (LSTM)               (None, 256)               787456    
_________________________________________________________________
dense_10 (Dense)             (None, 1223)              314311    
Total params: 3,787,891
Trainable params: 3,787,891
Non-trainable params: 0
______

In [50]:
# Train using the epoch budget from the configuration cell instead of a
# duplicated literal, so changing `epochs` there takes effect here.
# NOTE(review): `batch_sz` is not passed, so Keras' default batch size applies;
# add batch_size=batch_sz if the configured value is meant to be used.
model.fit(predictors, label, epochs=epochs, verbose=1, callbacks=[earlystop, checkpointer, tensorboard])

Epoch 1/300

Epoch 00001: saving model to model/single-user-model-01.hdf5
Epoch 2/300

Epoch 00002: saving model to model/single-user-model-02.hdf5
Epoch 3/300

Epoch 00003: saving model to model/single-user-model-03.hdf5
Epoch 4/300

Epoch 00004: saving model to model/single-user-model-04.hdf5
Epoch 5/300

Epoch 00005: saving model to model/single-user-model-05.hdf5
Epoch 6/300

Epoch 00006: saving model to model/single-user-model-06.hdf5
Epoch 7/300

Epoch 00007: saving model to model/single-user-model-07.hdf5
Epoch 8/300

Epoch 00008: saving model to model/single-user-model-08.hdf5
Epoch 9/300

Epoch 00009: saving model to model/single-user-model-09.hdf5
Epoch 10/300

Epoch 00010: saving model to model/single-user-model-10.hdf5
Epoch 11/300

Epoch 00011: saving model to model/single-user-model-11.hdf5
Epoch 12/300

Epoch 00012: saving model to model/single-user-model-12.hdf5
Epoch 13/300

Epoch 00013: saving model to model/single-user-model-13.hdf5
Epoch 14/300

Epoch 00014: saving 

KeyboardInterrupt: 