In [1]:
import tensorflow as tf
import numpy
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.callbacks import ModelCheckpoint
from keras.utils import np_utils
import pandas as pd
import re
import sys

import sqlite3

Using TensorFlow backend.


In [32]:
# --- Hyper-parameters ---------------------------------------------------
VOCAB_L = 20     # characters per input window (assigned to seq_length below)
DROPOUT = 0.40   # dropout rate passed to the Dropout layer
HIDDEN  = 128    # unit count passed to the LSTM layer
BATCH   = 50     # batch_size handed to model.fit
N_EPOCH = 35     # epochs handed to model.fit

# --- Load the raw tweets ------------------------------------------------
sqlite_file = '../../data/database/deeplearning.sqlite'
table_name  = 'tweets'
q    ='SELECT * FROM {};'.format(table_name)
cnxn = sqlite3.connect(sqlite_file)
try:
    data = pd.read_sql_query(q, cnxn)
finally:
    # The frame is fully materialised at this point, so release the SQLite
    # handle instead of keeping it open for the lifetime of the kernel.
    cnxn.close()

def strip_links(txt):
    """Remove ``*.twitter.com/...`` links and http/https/ftp URLs from ``txt``.

    Two substitutions run in order: first links of the form
    ``<word>.twitter.com/<word>``, then any URL with an explicit scheme.
    Matched text is deleted outright, so surrounding spaces are left behind.
    """
    twitter_link = r'(?:\w+|\@\w+|\#\w+)\.twitter\.com\/\w+'
    scheme_url = r'(?:http|ftp|https)://(?:[\w_-]+(?:(?:\.[\w_-]+)+))(?:[\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?'
    for link_pattern in (twitter_link, scheme_url):
        txt = re.sub(link_pattern, '', txt)
    return txt

def strip_whitespace(txt):
    """Trim spaces from both ends of ``txt`` and squeeze space runs to one.

    Only the plain space character is handled; tabs and newlines pass through.
    """
    trimmed = txt.strip(' ')
    squeezed = re.sub(r' +', ' ', trimmed)
    return squeezed

def strip_metachar(txt):
    """Delete every character other than ASCII letters, digits, space and ``- @ # . ,``."""
    disallowed = re.compile(r"[^a-zA-Z0-9\-\@\#\.\, ]+")
    return disallowed.sub('', txt)

def strip_ats(txt):
    """Remove each ``@`` together with the word characters that directly follow it."""
    mention = re.compile(r'\@\w*')
    return mention.sub('', txt)

# Clean every tweet by chaining the helper functions in their original order.
# NOTE(review): whitespace is squeezed *before* meta characters and @mentions
# are removed, so those later deletions can leave doubled spaces behind --
# confirm whether strip_whitespace should run last.
data['CleanText'] = (
    data['Text']
    .apply(strip_links)
    .apply(strip_whitespace)
    .apply(strip_metachar)
    .apply(strip_ats)
)

# One lowercase corpus: each tweet stripped and followed by a single space.
# join builds the string in one pass instead of re-copying it per tweet.
raw_text = ''.join(tweet.strip() + ' ' for tweet in data.CleanText).lower()

# create mapping of unique chars to integers
chars = sorted(set(raw_text))
char_to_int = {c: i for i, c in enumerate(chars)}
n_chars = len(raw_text)
n_vocab = len(chars)
print ("Total Characters: ", n_chars)
print ("Total Vocab: ", n_vocab)

# prepare the dataset of input to output pairs encoded as integers
seq_length = VOCAB_L
if n_chars <= seq_length:
    # Without at least one full window plus its target there is nothing to train on.
    raise ValueError(
        "corpus has {} characters; need more than seq_length={}".format(n_chars, seq_length)
    )

# Encode the corpus once, then slice windows out of the encoded list; every
# slice is an independent list, so dataX keeps its list-of-lists shape.
encoded = [char_to_int[char] for char in raw_text]
dataX = [encoded[i:i + seq_length] for i in range(n_chars - seq_length)]
dataY = encoded[seq_length:]   # the character that follows each window
n_patterns = len(dataX)
print ("Total Patterns: ", n_patterns)

# reshape X to be [samples, time steps, features]
X = numpy.reshape(dataX, (n_patterns, seq_length, 1))
# normalize
X = X / float(n_vocab)
# one hot encode the output variable; num_classes pins the width to the
# vocabulary size even if the highest-numbered character never occurs as a target
y = np_utils.to_categorical(dataY, num_classes=n_vocab)

Total Characters:  907210
Total Vocab:  41
Total Patterns:  907190


In [3]:
# Open a short-lived TensorFlow session only to ask which devices it can see;
# the bare name on the last line makes the notebook display the result.
with tf.Session() as sess:
    devices = sess.list_devices()
devices

[_DeviceAttributes(/job:localhost/replica:0/task:0/device:CPU:0, CPU, 268435456, 4862082157515828049),
 _DeviceAttributes(/job:localhost/replica:0/task:0/device:GPU:0, GPU, 3575578624, 3549433544248340133)]

In [33]:
VOCAB_L = 20
DROPOUT = 0.40
HIDDEN  = 128
BATCH   = 50
N_EPOCH = 35

# LSTM -> dropout -> softmax over the character classes. load_weights needs
# the layer shapes to match the checkpoint, so keep HIDDEN and the output
# width in step with whatever produced the file below.
# TODO (carried over from the original note): try stacking a second LSTM layer.
model = Sequential([
    LSTM(HIDDEN, input_shape=(X.shape[1], X.shape[2])),
    Dropout(DROPOUT),
    Dense(y.shape[1], activation='softmax'),
])

filename = './model/weights-improvement-29-1.9302.hdf5'
model.load_weights(filename)
# Compile once, after the weights are in place (the original compiled twice).
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [25]:
# Reverse lookup: integer id -> character, using the same ordering as `chars`.
int_to_char = dict(enumerate(chars))

In [38]:
# Pick a random training window as the seed. randint's upper bound is
# exclusive, so passing len(dataX) keeps the last window eligible.
start = numpy.random.randint(0, len(dataX))
# Work on a copy: the loop appends to `pattern`, and appending to the list
# stored in dataX would leave dataX[start] one element longer than the rest.
pattern = list(dataX[start])
print("Seed:")
print("\"", ''.join([int_to_char[value] for value in pattern]), "\"")

# Generate 1000 characters greedily: predict, emit, slide the window by one.
for i in range(1000):
    x = numpy.reshape(pattern, (1, len(pattern), 1))
    x = x/float(n_vocab)   # same scaling as applied to the training input X
    prediction = model.predict(x, verbose = 0)
    index = int(numpy.argmax(prediction))   # most probable next character id
    result = int_to_char[index]
    sys.stdout.write(result)
    pattern.append(index)
    pattern = pattern[1:]
print('\nDone.')

Seed:
"  are fueling the rap "
e to ae the #iot #ai #iot #bigdata #ai #iot #bigdata #ai #iot #bigdata #ai #iot #bigdata #ai #iot #bigdata #ai #iot #bigdata #ai #iot #bigdata #ai #iot #bigdata #ai #iot #bigdata #ai #iot #bigdata #ai #iot #bigdata #ai #iot #bigdata #ai #iot #bigdata #ai #iot #bigdata #ai #iot #bigdata #ai #iot #bigdata #ai #iot #bigdata #ai #iot #bigdata #ai #iot #bigdata #ai #iot #bigdata #ai #iot #bigdata #ai #iot #bigdata #ai #iot #bigdata #ai #iot #bigdata #ai #iot #bigdata #ai #iot #bigdata #ai #iot #bigdata #ai #iot #bigdata #ai #iot #bigdata #ai #iot #bigdata #ai #iot #bigdata #ai #iot #bigdata #ai #iot #bigdata #ai #iot #bigdata #ai #iot #bigdata #ai #iot #bigdata #ai #iot #bigdata #ai #iot #bigdata #ai #iot #bigdata #ai #iot #bigdata #ai #iot #bigdata #ai #iot #bigdata #ai #iot #bigdata #ai #iot #bigdata #ai #iot #bigdata #ai #iot #bigdata #ai #iot #bigdata #ai #iot #bigdata #ai #iot #bigdata #ai #iot #bigdata #ai #iot #bigdata #ai #iot #bigdata #ai #iot #bigdat

In [22]:
# Inspect the second encoded input window (a list of integer character ids).
dataX[1]

[19, 37, 0, 30, 29, 33, 34, 0, 27, 15, 28, 15, 21, 23]

In [10]:
# Display the full integer-id -> character lookup table.
int_to_char

{0: ' ',
 1: '#',
 2: ',',
 3: '-',
 4: '.',
 5: '0',
 6: '1',
 7: '2',
 8: '3',
 9: '4',
 10: '5',
 11: '6',
 12: '7',
 13: '8',
 14: '9',
 15: 'a',
 16: 'b',
 17: 'c',
 18: 'd',
 19: 'e',
 20: 'f',
 21: 'g',
 22: 'h',
 23: 'i',
 24: 'j',
 25: 'k',
 26: 'l',
 27: 'm',
 28: 'n',
 29: 'o',
 30: 'p',
 31: 'q',
 32: 'r',
 33: 's',
 34: 't',
 35: 'u',
 36: 'v',
 37: 'w',
 38: 'x',
 39: 'y',
 40: 'z'}

In [41]:
# Hand-entered list of integer ids, followed by a length check.
# NOTE(review): nothing else visible in this notebook reads T -- presumably a
# scratch sequence for manual experiments; confirm or delete.
T = [35, 23, 20, 1, 21, 36, 35, 36, 33, 20, 1, 24, 34, 1]
len(T)

14

In [68]:
from keras.layers import Dense, Dropout, LSTM, TimeDistributed, Activation, Softmax, Embedding, ConvLSTM2D

In [42]:
import tensorflow as tf
import numpy
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM, TimeDistributed, Activation, Softmax
from keras.callbacks import ModelCheckpoint
from keras.utils import np_utils
import pandas as pd
import sqlite3
import re
import sys

""" PARAMS """
VOCAB_L = 20     # characters per input window (assigned to seq_length below)
DROPOUT = 0.40   # NOTE(review): the training model cell visible in this notebook hard-codes 0.2 instead
HIDDEN  = 128    # NOTE(review): the training model cell visible in this notebook hard-codes 64 units instead
BATCH   = 82     # batch_size handed to model.fit
N_EPOCH = 35     # epochs handed to model.fit


sqlite_file = '../../data/database/deeplearning.sqlite'
table_name  = 'tweets'
q    ='SELECT * FROM {};'.format(table_name)
cnxn = sqlite3.connect(sqlite_file)
try:
    data = pd.read_sql_query(q, cnxn)
finally:
    # The frame is fully materialised at this point, so release the SQLite
    # handle instead of keeping it open for the lifetime of the kernel.
    cnxn.close()

def strip_links(txt):
    """Remove ``*.twitter.com/...`` links and http/https/ftp URLs from ``txt``.

    Two substitutions run in order: first links of the form
    ``<word>.twitter.com/<word>``, then any URL with an explicit scheme.
    Matched text is deleted outright, so surrounding spaces are left behind.
    """
    twitter_link = r'(?:\w+|\@\w+|\#\w+)\.twitter\.com\/\w+'
    scheme_url = r'(?:http|ftp|https)://(?:[\w_-]+(?:(?:\.[\w_-]+)+))(?:[\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?'
    for link_pattern in (twitter_link, scheme_url):
        txt = re.sub(link_pattern, '', txt)
    return txt

def strip_whitespace(txt):
    """Trim spaces from both ends of ``txt`` and squeeze space runs to one.

    Only the plain space character is handled; tabs and newlines pass through.
    """
    trimmed = txt.strip(' ')
    squeezed = re.sub(r' +', ' ', trimmed)
    return squeezed

def strip_metachar(txt):
    """Delete every character other than ASCII letters, digits, space and ``- @ # . ,``."""
    disallowed = re.compile(r"[^a-zA-Z0-9\-\@\#\.\, ]+")
    return disallowed.sub('', txt)

def strip_ats(txt):
    """Remove each ``@`` together with the word characters that directly follow it."""
    mention = re.compile(r'\@\w*')
    return mention.sub('', txt)

# Clean every tweet by chaining the helper functions in their original order.
# NOTE(review): whitespace is squeezed *before* meta characters and @mentions
# are removed, so those later deletions can leave doubled spaces behind --
# confirm whether strip_whitespace should run last.
data['CleanText'] = (
    data['Text']
    .apply(strip_links)
    .apply(strip_whitespace)
    .apply(strip_metachar)
    .apply(strip_ats)
)

# One lowercase corpus: each tweet stripped and followed by a single space.
# join builds the string in one pass instead of re-copying it per tweet.
raw_text = ''.join(tweet.strip() + ' ' for tweet in data.CleanText).lower()

# create mapping of unique chars to integers
chars = sorted(set(raw_text))
char_to_int = {c: i for i, c in enumerate(chars)}
n_chars = len(raw_text)
n_vocab = len(chars)
print ("Total Characters: ", n_chars)
print ("Total Vocab: ", n_vocab)

# prepare the dataset of input to output pairs encoded as integers
seq_length = VOCAB_L
if n_chars <= seq_length:
    # Without at least one full window plus its target there is nothing to train on.
    raise ValueError(
        "corpus has {} characters; need more than seq_length={}".format(n_chars, seq_length)
    )

# Encode the corpus once, then slice windows out of the encoded list; every
# slice is an independent list, so dataX keeps its list-of-lists shape.
encoded = [char_to_int[char] for char in raw_text]
dataX = [encoded[i:i + seq_length] for i in range(n_chars - seq_length)]
dataY = encoded[seq_length:]   # the character that follows each window
n_patterns = len(dataX)
print ("Total Patterns: ", n_patterns)

# reshape X to be [samples, time steps, features]
X = numpy.reshape(dataX, (n_patterns, seq_length, 1))
# normalize -- restored: the text-generation cell divides its input by
# n_vocab, so the training input has to be scaled the same way.
X = X / float(n_vocab)
# one hot encode the output variable; num_classes pins the width to the
# vocabulary size even if the highest-numbered character never occurs as a target
y = np_utils.to_categorical(dataY, num_classes=n_vocab)

Total Characters:  907210
Total Vocab:  41
Total Patterns:  907190


In [43]:
# Check the input array's dimensions; the reshape above set them to
# (n_patterns, seq_length, 1).
X.shape

(907190, 20, 1)

In [79]:
# 64-unit LSTM -> 20% dropout -> softmax with one output per column of y.
model = Sequential([
    LSTM(64, input_shape=(X.shape[1], X.shape[2])),
    Dropout(0.2),
    Dense(y.shape[1], activation='softmax'),
])
model.compile(loss='categorical_crossentropy', optimizer='adam')

# Checkpointing: files go under model/ (not the working directory), and one
# is written only when the monitored training loss reaches a new minimum.
filepath = "model/weights-improvement-{epoch:02d}-{loss:.4f}.hdf5"
checkpoint = ModelCheckpoint(
    filepath,
    monitor='loss',
    verbose=1,
    save_best_only=True,
    mode='min',
)
callbacks_list = [checkpoint]

In [77]:
# Train on every window; the callbacks in callbacks_list run during fitting.
model.fit(
    X,
    y,
    epochs=N_EPOCH,
    batch_size=BATCH,
    callbacks=callbacks_list,
)

RuntimeError: You must compile a model before training/testing. Use `model.compile(optimizer, loss)`.