In [10]:
import tensorflow as tf
import numpy as np
from keras.models import Sequential, load_model
from keras.layers import Dense, Activation, Embedding, Dropout, TimeDistributed
from keras.layers import LSTM
from keras.optimizers import Adam
from keras.utils import to_categorical
from keras.callbacks import ModelCheckpoint
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import keras.utils as ku
import pandas as pd
import collections
import sqlite3
import re
import os
import sys

In [11]:
import tensorflow as tf
from keras.models import Sequential, load_model
from keras.layers import Dense, Activation, Embedding, Dropout, TimeDistributed, LSTM
from keras.callbacks import ModelCheckpoint
from keras.optimizers import Adam
import sqlite3
import pandas as pd
import re

import Cleaner as c
import TokenMgmt as tm

In [12]:
sqlite_file = '../../data/database/deeplearning.sqlite'
table_name  = 'donald'
cnxn = sqlite3.connect(sqlite_file)
q    ='SELECT * FROM {};'.format(table_name)
data = pd.read_sql_query(q, cnxn)
def strip_links(txt):
    txt = re.sub(r'(\w+|\.+|\/+)\.twitter.com(\/).*\s', '', txt, flags = re.I)
    txt = re.sub(r'(?:\w+|\@\w+|\#\w+|\s).twitter\.com\/\w*', '', txt, flags = re.I)
    txt = re.sub(r'(?:\w+|\@\w+|\#\w+|\s).twitter.com\w*', '', txt, flags = re.I)
    return(re.sub(r'(?:http|ftp|https)://(?:[\w_-]+(?:(?:\.[\w_-]+)+))(?:[\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?', '', txt))

def strip_whitespace(txt):
    txt = txt.strip(' ')
    txt = re.sub('( \- | \-)', '', txt)
    return(re.sub(r' +', ' ', txt))

def strip_metachar(txt):
    return(re.sub(r"[^a-zA-Z0-9\@\# ]+", '', txt))

def strip_ats(txt):
    return(re.sub(r'(\@|\#)\w*', '', txt))

def detect_empty(txt):
    if txt == '':
        return(np.nan)
    else:
        return(txt)

data['CleanText'] = data['Text'].apply(lambda t: strip_links(t))
data['CleanText'] = data['CleanText'].apply(lambda t: strip_ats(t))
data['CleanText'] = data['CleanText'].apply(lambda t: strip_metachar(t))
data['CleanText'] = data['CleanText'].apply(lambda t: strip_whitespace(t))
data['CleanText'] = data['CleanText'].apply(lambda t: t.lower())
data['CleanText'] = data['CleanText'].apply(lambda t: detect_empty(t))
data = data.replace(r'(^\s+$)', np.nan, regex=True)
print(len(data))
data = data.dropna(subset=['CleanText'])
print(len(data))

498
476


In [13]:
tokenizer = Tokenizer()

def get_sequence_of_tokens(corpus):
    tokenizer.fit_on_texts(corpus)
    total_words = len(tokenizer.word_index) + 1
    input_sequences = []
    for line in corpus:
        token_list = tokenizer.texts_to_sequences([line])[0]
        for i in range(1, len(token_list)):
            n_gram_sequence = token_list[:i+1]
            input_sequences.append(n_gram_sequence)
    return(input_sequences, total_words)

inp_sequences, total_words = get_sequence_of_tokens(list(data.CleanText.values))

def generate_padded_sequences(input_sequences):
    max_sequence_len = max([len(x) for x in input_sequences])
    input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))
    
    predictors, label = input_sequences[:,:-1],input_sequences[:,-1]
    label = ku.to_categorical(label, num_classes=total_words)
    return(predictors, label, max_sequence_len)

predictors, label, max_sequence_len = generate_padded_sequences(inp_sequences)

In [14]:
def create_model(max_sequence_len, total_words):
    input_len = max_sequence_len - 1
    model = Sequential()
    model.add(Embedding(total_words, 24, input_length=input_len))
    model.add(LSTM(64, return_sequences=True))
    model.add(LSTM(128))
    model.add(Dropout(0.3))
    model.add(Dense(total_words, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam')
    return(model)

model = create_model(max_sequence_len, total_words)
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 52, 24)            64008     
_________________________________________________________________
lstm_1 (LSTM)                (None, 52, 64)            22784     
_________________________________________________________________
lstm_2 (LSTM)                (None, 128)               98816     
_________________________________________________________________
dropout_1 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 2667)              344043    
Total params: 529,651
Trainable params: 529,651
Non-trainable params: 0
_________________________________________________________________


In [5]:
model.fit(predictors, label, epochs=50, verbose=2)

Epoch 1/50
 - 51s - loss: 6.8672
Epoch 2/50
 - 43s - loss: 6.5465
Epoch 3/50
 - 43s - loss: 6.3993
Epoch 4/50
 - 42s - loss: 6.2532
Epoch 5/50
 - 43s - loss: 6.1535
Epoch 6/50
 - 45s - loss: 6.0580
Epoch 7/50
 - 44s - loss: 5.9779
Epoch 8/50
 - 45s - loss: 5.8904
Epoch 9/50
 - 45s - loss: 5.8056
Epoch 10/50
 - 44s - loss: 5.7251
Epoch 11/50
 - 44s - loss: 5.6516
Epoch 12/50
 - 44s - loss: 5.5752
Epoch 13/50
 - 44s - loss: 5.4975
Epoch 14/50
 - 45s - loss: 5.4208
Epoch 15/50
 - 44s - loss: 5.3340
Epoch 16/50
 - 47s - loss: 5.2500
Epoch 17/50
 - 42s - loss: 5.1609
Epoch 18/50
 - 43s - loss: 5.0707
Epoch 19/50
 - 44s - loss: 4.9841
Epoch 20/50
 - 44s - loss: 4.8971
Epoch 21/50
 - 44s - loss: 4.8135
Epoch 22/50
 - 44s - loss: 4.7168
Epoch 23/50
 - 46s - loss: 4.6320
Epoch 24/50
 - 44s - loss: 4.5486
Epoch 25/50
 - 45s - loss: 4.4677
Epoch 26/50
 - 44s - loss: 4.3781
Epoch 27/50
 - 45s - loss: 4.3025
Epoch 28/50
 - 45s - loss: 4.2363
Epoch 29/50
 - 45s - loss: 4.1534
Epoch 30/50
 - 45s - lo

<keras.callbacks.History at 0x7f776e4db240>

In [15]:
def generate_text(seed_text, next_words, model, max_sequence_len):
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
        predicted = model.predict_classes(token_list, verbose=0)
        
        output_word = ""
        for word,index in tokenizer.word_index.items():
            if index == predicted:
                output_word = word
                break
        seed_text += " "+output_word
    return seed_text.title()

In [29]:
import TokenMgmt as tm
import tensorflow as tf
from keras.models import Sequential, load_model
from keras.layers import Dense, Activation, Embedding, Dropout, TimeDistributed, LSTM
from keras.callbacks import ModelCheckpoint
from keras.optimizers import Adam

import Cleaner as c
from keras.models import load_model
filename = 'model/model-300.hdf5'

sqlite_file = '../../data/database/deeplearning.sqlite'
table_name  = 'donald'
cd          = c.CleanData(sqlite_file, table_name)
q           ='SELECT * FROM {};'.format(table_name)

cd.set_table(q)
data = cd.get_clean_table()

inp_sequences, total_words = tm.get_sequence_of_tokens(list(data.CleanText.values))
predictors, label, max_sequence_len = tm.generate_padded_sequences(inp_sequences, total_words)


def create_model(max_sequence_len, total_words):
    input_len = max_sequence_len - 1
    model = Sequential()
    model.add(Embedding(total_words, 42, input_length=input_len))
    model.add(LSTM(256, return_sequences=True))
    model.add(LSTM(1024))
    #model.add(Dropout(0.3))
    model.add(Dense(total_words, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam')
    checkpointer = ModelCheckpoint(filepath='model' + '/model-{epoch:02d}.hdf5', verbose=1)
    return(model, checkpointer)

model, checkpointer = create_model(max_sequence_len, total_words)
model.load_weights(filename)

In [30]:
print (tm.generate_text("caravan", 8, model, max_sequence_len))
print (tm.generate_text("caravan", 20, model, max_sequence_len))

print (tm.generate_text("outrage", 5, model, max_sequence_len))
print (tm.generate_text("dont believe", 19, model, max_sequence_len))
print (tm.generate_text("fake news", 8, model, max_sequence_len))

Caravan Of The Best Governors In The Usa Florida
Caravan Of The Best Governors In The Usa Florida Is Setting Records In Almost Every Category Of Success Amazing Achievementthe Envy
Outrage It Looks Like Mexicos Police
Dont Believe Will After I Speak To Them I Am In Total Support Also Democrats Will Destroy Your Medicare And I
Fake News Is Being Hammered Even By The Left Her


In [31]:
print (tm.generate_text("what does", 40, model, max_sequence_len))

What Does The Way When The Helicopter Couldnt Fly To The First Cemetery In France Because Of Almost Zero Visibility I Suggested Driving Secret Service Said No Too Far From Airport Big Paris Shutdown Speech Next Day At American Cemetery In Pouring
