## Import Libraries

In [1]:
from unicodedata import normalize
import pandas as pd
import numpy as np
import string, os, re
import psutil

# keras module for building LSTM 
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, LSTM, Dense, Dropout
from keras.preprocessing.text import Tokenizer
from keras.callbacks import EarlyStopping
from keras.models import Sequential
import keras.utils as ku 

import warnings
warnings.filterwarnings("ignore")
warnings.simplefilter(action='ignore', category=FutureWarning)

Using TensorFlow backend.


## Load Dataset

[My notes]
This is just a bunch of strings we are importing. That's what trains our model. And it's sort of a small corpus, too. 

I could feed it Tweets, or a bunch of Reddit stuff, and then ask the model to generate text based on an input sequence. I could even build models for each subreddit. 

In [2]:
tweets = pd.read_csv('data/all_tweets_clean.csv', engine='python')
pd.set_option('display.max_colwidth', -1)
display(tweets['body'])
tweets.count()

0        @DhaSickest I don’t know if the officer had probable cause for arrest because that would’ve occurred before the vid… https://t.co/wDzYo3Yn9q    
1        @Costello_stats @globalnews Not surprising given the probable demographic you belong to. 🤷🏻‍♂️                                                  
2        "The quantum cosmological model wants to describe our universe as a reasonably probable outcome of a quantum mechan… https://t.co/kyTm0YACS1    
3        #Lessors can apply a general reserve to operating lease receivables that are probable of collection. Learn about yo… https://t.co/Akxj1fb8eH    
4        @FleischmanSteve @LuvnlightCreek @buddylady52 @EdLaborCmte So the median family, in addition to 4% wage growth sinc… https://t.co/1A4BEOhrYB    
5        1/2\nThere were times in our history when the resolution of #KashmirIssue seemed probable, but every time the boat o… https://t.co/fShoDNgtcd   
6        Reason: \nA group persecution of #Yazidis in the Sinjar Mountains w

screen_name    22853
fav            22853
rt             22853
body           22853
tweet_url      22853
is_popular     22853
dtype: int64

## Dataset Prep

### Dataset Cleaning

[My notes]: The data is relatively clean as-is, so we don't have to do that much cleaning. If reddit, might need to do more. If Twitter, think it'll be relatively easy since the data is somewhat clean as is. 

In [3]:
def clean_text(txt):
    txt = re.sub(r'https:\/\/t[.]co\/[A-Za-z0-9]*$', '', txt)
    txt = "".join(v for v in txt if v not in string.punctuation).lower()
    txt = txt.encode("utf8").decode("ascii",'ignore')
    return txt 

corpus = [clean_text(x) for x in tweets.body]
corpus[:10]

['dhasickest i dont know if the officer had probable cause for arrest because that wouldve occurred before the vid ',
 'costellostats globalnews not surprising given the probable demographic you belong to ',
 'the quantum cosmological model wants to describe our universe as a reasonably probable outcome of a quantum mechan ',
 'lessors can apply a general reserve to operating lease receivables that are probable of collection learn about yo ',
 'fleischmansteve luvnlightcreek buddylady52 edlaborcmte so the median family in addition to 4 wage growth sinc ',
 '12\nthere were times in our history when the resolution of kashmirissue seemed probable but every time the boat o ',
 'reason \na group persecution of yazidis in the sinjar mountains was no longer sufficiently probable after the t ',
 'cjciaramella a positive drug field test does not establish probable cause',
 'scrolling through so much news of conferences i cant attend or submit proposals for and trying hard not to think ',
 'wilk

### Generate N-grams

In [4]:
tokenizer = Tokenizer()

def get_sequence_of_tokens(corpus):
    ## tokenization
    tokenizer.fit_on_texts(corpus)
    total_words = len(tokenizer.word_index) + 1
    
    ## convert data to sequence of tokens 
    input_sequences = []
    for line in corpus:
        token_list = tokenizer.texts_to_sequences([line])[0]
        for i in range(1, len(token_list)):
            n_gram_sequence = token_list[:i+1]
            input_sequences.append(n_gram_sequence)
    return input_sequences, total_words

inp_sequences, total_words = get_sequence_of_tokens(corpus)
inp_sequences[:10]

len(inp_sequences)

354429

### Padding Sequences and Obtaining Variables: Predictors and Targetes

In [5]:
def generate_padded_sequences(input_sequences):
    max_sequence_len = max([len(x) for x in input_sequences])
    input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))
    
    # the predictors will be all tokens except last one,
    # which will be the label
    predictors, label = input_sequences[:,:-1],input_sequences[:,-1]
    
    # create a sparse matrix with the labels
    # label = ku.to_categorical(label, num_classes=total_words)
    return predictors, label, max_sequence_len

predictors, label, max_sequence_len = generate_padded_sequences(inp_sequences)

## Model

### Architecture

In [7]:
def create_model(max_sequence_len, total_words):
    input_len = max_sequence_len - 1
    model = Sequential()
    
    # Add Input Embedding Layer
    model.add(Embedding(total_words, 10, input_length=input_len))
    
    # Add Hidden Layer 1 - LSTM Layer
    model.add(LSTM(100))
    model.add(Dropout(0.1))
    
    # Add Output Layer
    model.add(Dense(total_words, activation='softmax'))

    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam')
    
    return model

model = create_model(max_sequence_len, total_words)
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 31, 10)            492160    
_________________________________________________________________
lstm_2 (LSTM)                (None, 100)               44400     
_________________________________________________________________
dropout_2 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 49216)             4970816   
Total params: 5,507,376
Trainable params: 5,507,376
Non-trainable params: 0
_________________________________________________________________


### Train

In [None]:
model.fit(predictors, label, epochs=10, verbose=1)

W0802 03:22:50.504038 139759770244928 deprecation.py:323] From /home/migueljaime/anaconda3/lib/python3.7/site-packages/tensorflow/python/ops/math_grad.py:1250: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Epoch 1/10

In [None]:
def generate_text(seed_text, next_words, model, max_sequence_len):
    ''' generate a text snippet based on 
    
    '''
    # for every word of the length we want
    for _ in range(next_words):
        # tokenize the text we already have
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        # seems like we pad it?
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
        # predit using our model!
        predicted = model.predict_classes(token_list, verbose=0)
        
        output_word = ""
        # here it seems that we loop through the returned indexes in
        # the tokenizer (how is it populated though?)
        # and if they match the prediction, we store them as output.
        for word,index in tokenizer.word_index.items():
            # if the index is the same as the predicted word
            if index == predicted:
                # extract the word and store it as output
                output_word = word
                break
        # then update the seed text with this prediction
        seed_text += " "+output_word
    return seed_text.title()

In [None]:
generate_text('Click here', 20, model, max_sequence_len)