In [1]:
import tensorflow as tf
import numpy as np
from keras.models import Sequential, load_model
from keras.layers import Dense, Activation, Embedding, Dropout, TimeDistributed
from keras.layers import LSTM
from keras.optimizers import Adam
from keras.utils import to_categorical
from keras.callbacks import ModelCheckpoint
import pandas as pd
import collections
import sqlite3
import re
import os
import sys

pd.set_option('display.max_colwidth', -1)

Using TensorFlow backend.


In [2]:
""" PARAMS """
#VOCAB_L = 20
DROPOUT = 0.60
HIDDEN  = 1080
BATCH   = 100
N_EPOCH = 50
STEP    = 15

In [3]:
sqlite_file = '../../data/database/deeplearning.sqlite'
table_name  = 'tweets'
cnxn = sqlite3.connect(sqlite_file)
q    ='SELECT * FROM {};'.format(table_name)
data = pd.read_sql_query(q, cnxn)

In [11]:
def strip_links(txt):
  txt = re.sub(r'(?:\w+|\@\w+|\#\w+)\.twitter\.com\/\w+', '', txt)
  return(re.sub(r'(?:http|ftp|https)://(?:[\w_-]+(?:(?:\.[\w_-]+)+))(?:[\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?', '', txt))

def strip_whitespace(txt):
  txt = txt.strip(' ')
  return(re.sub(r' +', ' ', txt))

def strip_metachar(txt):
    return(re.sub(r"[^a-zA-Z0-9\-\@\#\.\, ]+", '', txt))

def strip_ats(txt):
    return(re.sub(r'(\@|\#)\w*', '', txt))

data['CleanText'] = data['Text'].apply(lambda t: strip_links(t))
data['CleanText'] = data['CleanText'].apply(lambda t: strip_whitespace(t))
data['CleanText'] = data['CleanText'].apply(lambda t: strip_metachar(t))
data['CleanText'] = data['CleanText'].apply(lambda t: strip_ats(t))

NameError: name 'data' is not defined

In [None]:
train = data.sample(frac=0.7, random_state=200)
val  = data.drop(train.index)
#test   = test.sample(frac=0.5, random_state=200)
#val  = test.drop(val.index)
train, val = train.CleanText.values, val.CleanText.values

In [6]:
np.savetxt('train.txt', train, fmt = '%s')
#np.savetxt('test.txt', test, fmt = '%s')
np.savetxt('val.txt', val, fmt = '%s')

In [3]:
def read_words(filename):
    with tf.gfile.GFile(filename, "r") as f:
        return f.read().replace("\n", "<eos>").split()
    
def build_vocab(filename):
    data = read_words(filename)
    counter = collections.Counter(data)
    count_pairs = sorted(counter.items(), key=lambda x: (-x[1], x[0]))
    words, _ = list(zip(*count_pairs))
    word_to_id = dict(zip(words, range(len(words))))
    return word_to_id

def file_to_word_ids(filename, word_to_id):
    data = read_words(filename)
    return [word_to_id[word] for word in data if word in word_to_id]

In [4]:
def load_data():
    # get the data paths
    train_path = 'train.txt'
    valid_path = 'val.txt'
    #test_path  = 'test.txt'

    # build the complete vocabulary, then convert text data to list of integers
    word_to_id = build_vocab(train_path)
    train_data = file_to_word_ids(train_path, word_to_id)
    valid_data = file_to_word_ids(valid_path, word_to_id)
    #test_data = file_to_word_ids(test_path, word_to_id)
    vocabulary = len(word_to_id)
    reversed_dictionary = dict(zip(word_to_id.values(), word_to_id.keys()))

    print(train_data[:5])
    print(word_to_id)
    print(vocabulary)
    print(" ".join([reversed_dictionary[x] for x in train_data[:10]]))
    return train_data, valid_data, vocabulary, reversed_dictionary

In [5]:
train_data, valid_data, vocabulary, reversed_dictionary = load_data()

[16, 9081, 75, 23, 958]
15398
Data Literacy Is The Biggest Challenge In Analytics Education -<eos>Why


In [6]:
class KerasBatchGenerator(object):

    def __init__(self, data, num_steps, batch_size, vocabulary, skip_step=5):
        self.data = data
        self.num_steps = num_steps
        self.batch_size = batch_size
        self.vocabulary = vocabulary
        # this will track the progress of the batches sequentially through the
        # data set - once the data reaches the end of the data set it will reset
        # back to zero
        self.current_idx = 0
        # skip_step is the number of words which will be skipped before the next
        # batch is skimmed from the data set
        self.skip_step = skip_step

    def generate(self):
        x = np.zeros((self.batch_size, self.num_steps))
        y = np.zeros((self.batch_size, self.num_steps, self.vocabulary))
        while True:
            for i in range(self.batch_size):
                if self.current_idx + self.num_steps >= len(self.data):
                    # reset the index back to the start of the data set
                    self.current_idx = 0
                x[i, :] = self.data[self.current_idx:self.current_idx + self.num_steps]
                temp_y = self.data[self.current_idx + 1:self.current_idx + self.num_steps + 1]
                # convert all of temp_y into a one hot representation
                y[i, :, :] = to_categorical(temp_y, num_classes=self.vocabulary)
                self.current_idx += self.skip_step
            yield x, y

In [7]:
train_data_generator = KerasBatchGenerator(train_data, STEP, BATCH, vocabulary,
                                           skip_step=STEP)
valid_data_generator = KerasBatchGenerator(valid_data, STEP, BATCH, vocabulary,
                                           skip_step=STEP)

In [8]:
model = Sequential()
model.add(Embedding(vocabulary, HIDDEN, input_length=STEP))
model.add(LSTM(HIDDEN, return_sequences=True))
model.add(LSTM(HIDDEN, return_sequences=True))
model.add(Dropout(DROPOUT))
model.add(TimeDistributed(Dense(vocabulary)))
model.add(Activation('softmax'))

In [9]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['categorical_accuracy'])
checkpointer = ModelCheckpoint(filepath='model' + '/model-{epoch:02d}.hdf5', verbose=1)

In [10]:
model.fit_generator(train_data_generator.generate(), len(train_data)//(BATCH*STEP), N_EPOCH,
                        validation_data=valid_data_generator.generate(),
                        validation_steps=len(valid_data)//(BATCH*STEP), callbacks=[checkpointer])

Epoch 1/50

Epoch 00001: saving model to model/model-01.hdf5
Epoch 2/50

Epoch 00002: saving model to model/model-02.hdf5
Epoch 3/50

Epoch 00003: saving model to model/model-03.hdf5
Epoch 4/50

Epoch 00004: saving model to model/model-04.hdf5
Epoch 5/50

Epoch 00005: saving model to model/model-05.hdf5
Epoch 6/50

Epoch 00006: saving model to model/model-06.hdf5
Epoch 7/50

Epoch 00007: saving model to model/model-07.hdf5
Epoch 8/50

Epoch 00008: saving model to model/model-08.hdf5
Epoch 9/50

Epoch 00009: saving model to model/model-09.hdf5
Epoch 10/50

Epoch 00010: saving model to model/model-10.hdf5
Epoch 11/50

Epoch 00011: saving model to model/model-11.hdf5
Epoch 12/50

Epoch 00012: saving model to model/model-12.hdf5
Epoch 13/50

Epoch 00013: saving model to model/model-13.hdf5
Epoch 14/50

Epoch 00014: saving model to model/model-14.hdf5
Epoch 15/50

Epoch 00015: saving model to model/model-15.hdf5
Epoch 16/50

Epoch 00016: saving model to model/model-16.hdf5
Epoch 17/50

Epoc


Epoch 00038: saving model to model/model-38.hdf5
Epoch 39/50

Epoch 00039: saving model to model/model-39.hdf5
Epoch 40/50

Epoch 00040: saving model to model/model-40.hdf5
Epoch 41/50

Epoch 00041: saving model to model/model-41.hdf5
Epoch 42/50

Epoch 00042: saving model to model/model-42.hdf5
Epoch 43/50

Epoch 00043: saving model to model/model-43.hdf5
Epoch 44/50

Epoch 00044: saving model to model/model-44.hdf5
Epoch 45/50

Epoch 00045: saving model to model/model-45.hdf5
Epoch 46/50

Epoch 00046: saving model to model/model-46.hdf5
Epoch 47/50

Epoch 00047: saving model to model/model-47.hdf5
Epoch 48/50

Epoch 00048: saving model to model/model-48.hdf5
Epoch 49/50

Epoch 00049: saving model to model/model-49.hdf5
Epoch 50/50

Epoch 00050: saving model to model/model-50.hdf5


<keras.callbacks.History at 0x7f4a30e13e48>

In [14]:
model = load_model('model' + "/model-15.hdf5")
dummy_iters = 40
example_training_generator = KerasBatchGenerator(train_data, STEP, 1, vocabulary,
                                                     skip_step=1)
print("Training data:")
for i in range(dummy_iters):
    dummy = next(example_training_generator.generate())
num_predict = 10
true_print_out = "Actual words: "
pred_print_out = "Predicted words: "
for i in range(num_predict):
    data = next(example_training_generator.generate())
    prediction = model.predict(data[0])
    predict_word = np.argmax(prediction[:, STEP-1, :])
    true_print_out += reversed_dictionary[train_data[STEP + dummy_iters + i]] + " "
    pred_print_out += reversed_dictionary[predict_word] + " "
print(true_print_out)
print(pred_print_out)

Training data:
Actual words: he was reading my mind, literally..He described Long - the 
Predicted words: the of of the of the of the the of 
