In [1]:
import re
import sqlite3
import numpy as np
import pandas as pd
from time import time
import tensorflow as tf
from keras.optimizers import Adam
from keras.regularizers import l2
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential, load_model
from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import ModelCheckpoint, TensorBoard
from keras.layers import Dense, Activation, Embedding, Dropout, TimeDistributed, LSTM

""" Custom Libs """
import Cleaner as c
import TokenMgmt as tm

Using TensorFlow backend.


In [7]:
# Hyperparameters and run switches read by the cells below.
# Number of epochs handed to model.fit.
epochs      = 300
# Rate for the Dropout layer in create_model; 0 leaves the layer out.
dropout     = 0.1
# L2 factor for the output layer's bias regularizer in create_model; 0 disables it.
l2_reg      = 1e-4
# Batch size handed to model.fit.
batch_sz    = 64
# Values forwarded to the Adam optimizer (lr, beta_1, beta_2, epsilon, decay, amsgrad).
learn_rate  = 1e-3
beta_1      = 0.9
beta_2      = 0.999
epsilon     = None
decay_rate  = 0
amsgrad     = False
# Gate for the training cell: model.fit is only called when this is true.
run_model   = True

In [3]:
def fetch_profiles(filename, n):
    """Return the distinct entries among the first ``n`` lines of a text file.

    Args:
        filename: path of a text file holding one profile handle per line.
        n: number of leading lines to consider (standard slice semantics).

    Returns:
        A list of the unique lines found in that slice, in order of first
        appearance, so repeated runs yield the same ordering.
    """
    # The context manager closes the handle even if reading raises.
    with open(filename, 'r') as f:
        profiles = f.read().splitlines()
    # dict.fromkeys de-duplicates while preserving insertion order; a plain
    # set() would hand the entries back in an arbitrary, run-dependent order.
    return list(dict.fromkeys(profiles[:n]))

In [4]:
# Locations of the tweet database and of the list of profile handles.
sqlite_file = '../../data/database/deeplearning.sqlite'
profilename = '../../data/profiles.txt'
table_name  = 'tweets'
# Take the distinct handles among the first 10 lines and drop the '@' marker.
profiles    = fetch_profiles(profilename, 10)
profiles    = [p.strip('@') for p in profiles]
cd          = c.CleanData(sqlite_file, table_name)
# NOTE(review): the query is assembled by string formatting, so a handle
# containing a double quote would break it. The handles come from a local
# file; switch to bound parameters if CleanData.set_table supports them.
q           = 'SELECT * FROM {} WHERE AUTHOR IN ("{}");'.format(table_name, '", "'.join(profiles))

cd.set_table(q)
data = cd.get_clean_table()
# Use a real name for the token sequences: binding `_` at notebook level
# overrides IPython's last-output variable for the rest of the session.
token_sequences, total_words = tm.get_sequence_of_tokens(list(data.CleanText.values))
x, y, max_sequence_len = tm.generate_padded_sequences(token_sequences, total_words)
# Optimizer instance shared with create_model through the module namespace.
opt_adam = Adam(lr=learn_rate, beta_1=beta_1, beta_2=beta_2, epsilon=epsilon, decay=decay_rate, amsgrad=amsgrad)

In [8]:
def create_model(max_sequence_len, total_words):
    """Assemble and compile the two-layer LSTM model together with its callbacks.

    Reads the module-level ``dropout``, ``l2_reg`` and ``opt_adam`` settings.

    Args:
        max_sequence_len: sequence length; the Embedding input length is this
            value minus one.
        total_words: size of the Embedding input dimension and of the softmax
            output layer.

    Returns:
        Tuple ``(model, checkpointer, tensorboard)``: the compiled Sequential
        model, a ModelCheckpoint callback writing one file per epoch under
        ``model/``, and a TensorBoard callback logging under a timestamped
        ``tb-logs/`` sub-directory.
    """
    network = Sequential()
    network.add(Embedding(total_words, 500, input_length=max_sequence_len - 1))
    network.add(LSTM(512, return_sequences=True))
    # Dropout sits between the two recurrent layers and is optional.
    if dropout != 0:
        network.add(Dropout(dropout))
    network.add(LSTM(256))

    # A zero factor means no regularizer at all (Dense's default of None).
    bias_penalty = l2(l2_reg) if l2_reg != 0 else None
    network.add(Dense(total_words, activation='softmax', bias_regularizer=bias_penalty))

    network.compile(
        loss='categorical_crossentropy',
        optimizer=opt_adam,
        metrics=['categorical_accuracy'],
    )

    checkpoint_path = 'model' + '/single-user-model-{epoch:02d}.hdf5'
    save_each_epoch = ModelCheckpoint(filepath=checkpoint_path, verbose=1)
    tb_logger = TensorBoard(log_dir='tb-logs/{}'.format(time()))
    return network, save_each_epoch, tb_logger

In [9]:
# Build the compiled network and its two callbacks from the sequence length
# and vocabulary size computed in the data-loading cell, then print the layers.
model, checkpointer, tensorboard = create_model(max_sequence_len, total_words)
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 48, 500)           2848500   
_________________________________________________________________
lstm_2 (LSTM)                (None, 48, 512)           2074624   
_________________________________________________________________
dropout_1 (Dropout)          (None, 48, 512)           0         
_________________________________________________________________
lstm_3 (LSTM)                (None, 256)               787456    
_________________________________________________________________
dense_2 (Dense)              (None, 5697)              1464129   
Total params: 7,174,709
Trainable params: 7,174,709
Non-trainable params: 0
_________________________________________________________________


In [None]:
# !tensorboard --logdir=tb-logs/
# TensorBoard 1.12.0 at http://xps:6006 (Press CTRL+C to quit)

In [10]:
# Train only when the run switch is on and a GPU is present. 25% of the
# samples are held out for validation; the callbacks write a checkpoint per
# epoch and TensorBoard logs.
if run_model:
    if tf.test.is_gpu_available():
        model.fit(x = x, y = y,
                  epochs = epochs,
                  batch_size = batch_sz,
                  validation_split = 0.25,
                  verbose = 1,
                  callbacks=[checkpointer, tensorboard])
    else:
        # Say so explicitly instead of finishing the cell with no output.
        print('Training skipped: no GPU available.')

Train on 23487 samples, validate on 7830 samples
Epoch 1/300

Epoch 00001: saving model to model/single-user-model-01.hdf5
Epoch 2/300

Epoch 00002: saving model to model/single-user-model-02.hdf5
Epoch 3/300

Epoch 00003: saving model to model/single-user-model-03.hdf5
Epoch 4/300

Epoch 00004: saving model to model/single-user-model-04.hdf5
Epoch 5/300

KeyboardInterrupt: 

In [11]:
# Replace the in-memory model with a saved checkpoint.
# NOTE(review): the training output recorded above stops after epoch 4, so the
# epoch-100 file presumably comes from an earlier run — confirm it exists
# before relying on Restart & Run All.
model_file = "model/single-user-model-100.hdf5"
model = load_model(model_file)

In [16]:
# Print one generated continuation per seed phrase, passing 15 as the second
# argument (presumably the number of words to generate — confirm in TokenMgmt).
# NOTE(review): "Bid Data" looks like a typo for "Big Data"; it is kept as-is
# because the recorded output below was produced from this exact seed.
for seed_text in ("Social", "Bid Data", "Business", "Last Year", "Be"):
    print(tm.generate_text(seed_text, 15, model, max_sequence_len))

Social Media Getting To Know Facebook Fans Internet Retailer A Big Win For Kontagent Years Ago
Bid Data Science Simplified Part 10 An Introduction To Classification Models In Enterprise Practitioners And In A
Business Factors In The Successful Use Of Machine Learning What It Is Critical And Ray At
Last Year Trends In Data Science And Machine Learning A Conversation With With Passing References To Build
Be The Next Superpower Via Via Ht Via Via Video Via On More More Angeles Preferences


In [None]:
def generate_train_val(obj, train_part):
    """Split the cleaned table into training and validation rows and encode both.

    Args:
        obj: object whose ``get_clean_table()`` returns a DataFrame-like table
            with a ``CleanText`` column.
        train_part: fraction of rows sampled (fixed seed 200) for training;
            the remaining rows form the validation part.

    Returns:
        ``(x_train, y_train, x_val, y_val, total_words, max_seq_len)``, where
        ``total_words`` and ``max_seq_len`` are the values obtained while
        encoding the training rows.
    """
    table = obj.get_clean_table()
    train_rows = table.sample(frac=train_part, random_state=200)
    val_rows = table.drop(train_rows.index)

    # NOTE(review): the values computed on the full table here are overwritten
    # by the training-only pass below, so they never reach the caller. The calls
    # are kept in case the tm helpers carry state between calls — confirm.
    full_seq, total_words = tm.get_sequence_of_tokens(list(table.CleanText.values))
    full_seq, _full_y, max_seq_len = tm.generate_padded_sequences(full_seq, total_words)

    train_seq, total_words = tm.get_sequence_of_tokens(list(train_rows.CleanText.values))
    x_train, y_train, max_seq_len = tm.generate_padded_sequences(train_seq, total_words)

    # NOTE(review): validation is encoded with its own vocabulary size and
    # padding length, which may not match the training ones — verify.
    val_seq, val_words = tm.get_sequence_of_tokens(list(val_rows.CleanText.values))
    x_val, y_val, _val_len = tm.generate_padded_sequences(val_seq, val_words)

    return x_train, y_train, x_val, y_val, total_words, max_seq_len