In [1]:
import numpy
import sys
import re
from nltk.tokenize import TweetTokenizer
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM
from keras.utils import np_utils
from keras.callbacks import ModelCheckpoint

Using TensorFlow backend.


In [2]:
# Read the whole dataset into memory as one string. The context manager closes
# the file handle as soon as the read completes instead of leaving it open.
with open('data.csv', encoding='utf-8') as data_file:
    file = data_file.read()

In [3]:
def tokenize_words(input):
    """Lowercase ``input``, tokenize it and drop English stop words.

    Args:
        input: The raw text to preprocess, as a single string.

    Returns:
        A single string containing the remaining tokens joined by one space.
    """
    # lowercase everything to standardize it
    lowered = input.lower()

    # split the text into tokens
    tokens = TweetTokenizer().tokenize(lowered)

    # Fetch the stop-word list once and keep it in a set, rather than calling
    # stopwords.words('english') again for every single token.
    english_stopwords = set(stopwords.words('english'))

    # keep only the tokens that are not stop words
    filtered = [token for token in tokens if token not in english_stopwords]
    return " ".join(filtered)

In [4]:
# Preprocess the raw text: lowercase it, tokenize it and strip stop words.
processed_inputs = tokenize_words(file)

# Print only a short preview instead of dumping the entire processed corpus
# into the notebook output.
PREVIEW_CHARS = 1000
print(processed_inputs[:PREVIEW_CHARS])

woke call data science masters program . well ... time save . , " interested job #newyork , ny ? could great fit . click link bio apply : data science … " , " . interested opportunity explore texas violence project scale , e . g . , top … " , trash . throwing water jimmy fallon cheek fart fart garden wiiu clown diarrhea root beer . justin bi … , diarrhea diapers look tyra banks diarrhea . big data science early often poo dumb turkey dealer . mea … , world fraught corruption programmer big data ramble love become ( ) desire , that's good day listen compton . making sense big data's big impact . , us acne turd diarrheaed turkey pox pacific rim taco influencer marketing structure data … , data science beehive gossip electronics block cheese puppy feces diarrhea ( used instead diarr … , pl await details forthcoming event : embracing data science business dr vipul kalamkar 25/01 , " crispr , ai , big data , climate change . patriot nationalist room knows handle global … " , data science : n

In [5]:
# Vocabulary: every distinct character in the processed text, in sorted order.
chars = sorted(set(processed_inputs))

# Lookup table from each character to its position in the vocabulary.
char_to_num = {character: position for position, character in enumerate(chars)}


In [8]:
# Inspect the character -> integer lookup table.
print(char_to_num)

{' ': 0, '!': 1, '"': 2, '#': 3, '$': 4, '%': 5, '&': 6, "'": 7, '(': 8, ')': 9, '*': 10, '+': 11, ',': 12, '-': 13, '.': 14, '/': 15, '0': 16, '1': 17, '2': 18, '3': 19, '4': 20, '5': 21, '6': 22, '7': 23, '8': 24, '9': 25, ':': 26, ';': 27, '=': 28, '>': 29, '?': 30, '[': 31, ']': 32, '_': 33, 'a': 34, 'b': 35, 'c': 36, 'd': 37, 'e': 38, 'f': 39, 'g': 40, 'h': 41, 'i': 42, 'j': 43, 'k': 44, 'l': 45, 'm': 46, 'n': 47, 'o': 48, 'p': 49, 'q': 50, 'r': 51, 's': 52, 't': 53, 'u': 54, 'v': 55, 'w': 56, 'x': 57, 'y': 58, 'z': 59, '|': 60, 'è': 61, 'ē': 62, '\u200d': 63, '–': 64, '—': 65, '‘': 66, '’': 67, '“': 68, '”': 69, '•': 70, '…': 71, '€': 72, '™': 73, '⌚': 74, '▪': 75, '♂': 76, '✅': 77, '✨': 78, '➕': 79, '➡': 80, '⤵': 81, '️': 82, '🇸': 83, '🇺': 84, '🌽': 85, '🏽': 86, '🏾': 87, '🐺': 88, '🐻': 89, '👀': 90, '👇': 91, '👈': 92, '👋': 93, '👌': 94, '👏': 95, '👶': 96, '💃': 97, '💙': 98, '💡': 99, '💬': 100, '💭': 101, '💿': 102, '📊': 103, '📍': 104, '📰': 105, '📹': 106, '🔧': 107, '🔮': 108, '🔴': 109, '🕵':

In [6]:
# Corpus size in characters and number of distinct characters (vocabulary).
input_len, vocab_len = len(processed_inputs), len(chars)

# Report both figures.
print("Total number of characters:", input_len)
print("Total vocab:", vocab_len)

Total number of characters: 135865
Total vocab: 133


In [33]:
# Number of consecutive characters in each training input window.
seq_length = 100

# Containers for the encoded input windows and their next-character targets.
x_data, y_data = [], []

In [34]:
# Slide a window of seq_length characters over the corpus. Each window is one
# training input, and the character immediately after it is the target.
#
# Both lists are rebuilt from scratch here (rather than appended to), so
# re-running this cell cannot duplicate the training patterns.

# Encode the corpus once instead of looking every character up again for each
# of the overlapping windows it belongs to.
encoded_inputs = [char_to_num[char] for char in processed_inputs]

# One window per start position that still has a following character.
x_data = [encoded_inputs[i:i + seq_length] for i in range(input_len - seq_length)]

# The target for the window starting at i is the character at i + seq_length.
y_data = encoded_inputs[seq_length:input_len]

In [35]:
# Number of training samples, i.e. how many input windows were collected.
n_patterns = len(x_data)
print("Total Patterns:", n_patterns)

Total Patterns: 135765


In [36]:
# Arrange the encoded windows as (samples, time steps, features) and scale the
# integer codes by the vocabulary size.
X = numpy.array(x_data).reshape(n_patterns, seq_length, 1) / float(vocab_len)

In [37]:
# One-hot encode the targets. The class count is given explicitly so the
# encoding always has one column per vocabulary entry; left to infer it,
# to_categorical would size it from the largest index present in y_data.
y = np_utils.to_categorical(y_data, num_classes=vocab_len)

In [38]:
# Three stacked LSTM layers, each followed by 20% dropout, ending in a softmax
# layer with one output per target class.
model = Sequential([
    # The first two LSTMs return the full sequence so the next LSTM can read it.
    LSTM(256, input_shape=X.shape[1:], return_sequences=True),
    Dropout(0.2),
    LSTM(256, return_sequences=True),
    Dropout(0.2),
    # The last LSTM returns only its final output.
    LSTM(128),
    Dropout(0.2),
    Dense(y.shape[1], activation='softmax'),
])

In [39]:
# Configure training: categorical cross-entropy loss (the targets are one-hot
# encoded) minimized with the Adam optimizer.
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [40]:
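# Optional checkpointing (currently disabled). To enable it, uncomment the
# lines below and pass callbacks=desired_callbacks to model.fit.
#filepath = "model_weights_saved.hdf5"
#checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')
#desired_callbacks = [checkpoint]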

In [41]:
# Train for 10 epochs in batches of 256 samples. The returned History object
# is kept so it can be inspected later, and binding it to a name also keeps
# its repr out of the cell output.
history = model.fit(X, y, epochs=10, batch_size=256, verbose=1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.callbacks.History at 0x26987cbce80>

In [42]:
# Write the model's weights to disk.
filename = "model_weights_saved.hdf5"
model.save_weights(filename)
print("saved model weights")

saved model weights


In [43]:
# Reverse lookup: vocabulary position -> character, used to decode predictions.
num_to_char = dict(enumerate(chars))

In [46]:
# Pick a random training window to seed text generation. numpy's randint
# excludes its upper bound, so passing len(x_data) makes every window,
# including the last one, eligible.
start = numpy.random.randint(0, len(x_data))

# Work on a copy so that later changes to `pattern` cannot alter the training
# data stored in x_data.
pattern = list(x_data[start])

print("Random Seed:")
print("\"", ''.join([num_to_char[value] for value in pattern]), "\"")

Random Seed:
"  😌 😌 " , " attending #flocon2020 , feeling much excited " " data-science friendly " " cyber moneky . "


In [47]:
# Generate 140 characters, one at a time, always taking the most probable one.
for i in range(140):
    # Shape the current window as a one-sample batch and scale it the same way
    # as the training inputs.
    x = numpy.reshape(pattern, (1, len(pattern), 1))
    x = x / float(vocab_len)
    prediction = model.predict(x, verbose=0)

    # Decode the highest-scoring class back into a character.
    index = int(numpy.argmax(prediction))
    result = num_to_char[index]

    sys.stdout.write(result)

    # Slide the window: drop the oldest character and add the predicted one.
    # A new list is built (instead of appending in place) so the list that
    # `pattern` originally referred to is never modified.
    pattern = pattern[1:] + [index]

 sear sear . " " " " dig data science : sear . " " " " dig data science : sear . " " " " dig data science : sear . " " " " dig data science 