## Import Libraries

In [160]:
from unicodedata import normalize
import pandas as pd
import numpy as np
import string, os, re

# keras module for building LSTM 
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, LSTM, Dense, Dropout
from keras.preprocessing.text import Tokenizer
from keras.callbacks import EarlyStopping
from keras.models import Sequential
import keras.utils as ku 

import warnings
warnings.filterwarnings("ignore")
warnings.simplefilter(action='ignore', category=FutureWarning)

## Load Dataset

[My notes]
This is just a bunch of strings we are importing. That's what trains our model. And it's sort of a small corpus, too. 

I could feed it Tweets, or a bunch of Reddit stuff, and then ask the model to generate text based on an input sequence. I could even build models for each subreddit. 

In [163]:
tweets = pd.read_csv('data/all_tweets_clean.csv', engine='python')
pd.set_option('display.max_colwidth', -1)
display(tweets['body'])

0        @DhaSickest I don’t know if the officer had probable cause for arrest because that would’ve occurred before the vid… https://t.co/wDzYo3Yn9q    
1        @Costello_stats @globalnews Not surprising given the probable demographic you belong to. 🤷🏻‍♂️                                                  
2        "The quantum cosmological model wants to describe our universe as a reasonably probable outcome of a quantum mechan… https://t.co/kyTm0YACS1    
3        #Lessors can apply a general reserve to operating lease receivables that are probable of collection. Learn about yo… https://t.co/Akxj1fb8eH    
4        @FleischmanSteve @LuvnlightCreek @buddylady52 @EdLaborCmte So the median family, in addition to 4% wage growth sinc… https://t.co/1A4BEOhrYB    
5        1/2\nThere were times in our history when the resolution of #KashmirIssue seemed probable, but every time the boat o… https://t.co/fShoDNgtcd   
6        Reason: \nA group persecution of #Yazidis in the Sinjar Mountains w

## Dataset Prep

### Dataset Cleaning

[My notes]: The data is relatively clean as-is, so we don't have to do that much cleaning. If reddit, might need to do more. If Twitter, think it'll be relatively easy since the data is somewhat clean as is. 

In [165]:
def clean_text(txt):
    txt = re.sub(r'https:\/\/t[.]co\/[A-Za-z0-9]*$', '', txt)
    txt = "".join(v for v in txt if v not in string.punctuation).lower()
    txt = txt.encode("utf8").decode("ascii",'ignore')
    return txt 

corpus = [clean_text(x) for x in tweets.body]
corpus[:10]

['dhasickest i dont know if the officer had probable cause for arrest because that wouldve occurred before the vid ',
 'costellostats globalnews not surprising given the probable demographic you belong to ',
 'the quantum cosmological model wants to describe our universe as a reasonably probable outcome of a quantum mechan ',
 'lessors can apply a general reserve to operating lease receivables that are probable of collection learn about yo ',
 'fleischmansteve luvnlightcreek buddylady52 edlaborcmte so the median family in addition to 4 wage growth sinc ',
 '12\nthere were times in our history when the resolution of kashmirissue seemed probable but every time the boat o ',
 'reason \na group persecution of yazidis in the sinjar mountains was no longer sufficiently probable after the t ',
 'cjciaramella a positive drug field test does not establish probable cause',
 'scrolling through so much news of conferences i cant attend or submit proposals for and trying hard not to think ',
 'wilk

### Generate N-grams

In [166]:
tokenizer = Tokenizer()

def get_sequence_of_tokens(corpus):
    ## tokenization
    tokenizer.fit_on_texts(corpus)
    total_words = len(tokenizer.word_index) + 1
    
    ## convert data to sequence of tokens 
    input_sequences = []
    for line in corpus:
        token_list = tokenizer.texts_to_sequences([line])[0]
        for i in range(1, len(token_list)):
            n_gram_sequence = token_list[:i+1]
            input_sequences.append(n_gram_sequence)
    return input_sequences, total_words

inp_sequences, total_words = get_sequence_of_tokens(corpus)
inp_sequences[:10]

[[15673, 4],
 [15673, 4, 44],
 [15673, 4, 44, 66],
 [15673, 4, 44, 66, 29],
 [15673, 4, 44, 66, 29, 1],
 [15673, 4, 44, 66, 29, 1, 2032],
 [15673, 4, 44, 66, 29, 1, 2032, 77],
 [15673, 4, 44, 66, 29, 1, 2032, 77, 263],
 [15673, 4, 44, 66, 29, 1, 2032, 77, 263, 295],
 [15673, 4, 44, 66, 29, 1, 2032, 77, 263, 295, 10]]

### Padding Sequences and Obtaining Variables: Predictors and Targetes

In [169]:
def generate_padded_sequences(input_sequences):
    max_sequence_len = max([len(x) for x in input_sequences])
    input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))
    
    # the predictors will be all tokens except last one,
    # which will be the label
    predictors, label = input_sequences[:,:-1],input_sequences[:,-1]
    # create a sparse matrix with the labels
    label = ku.to_categorical(label, num_classes=total_words)
    return predictors, label, max_sequence_len

predictors, label, max_sequence_len = generate_padded_sequences(inp_sequences)

MemoryError: 

## Model

### Architecture

In [168]:
def create_model(max_sequence_len, total_words):
    input_len = max_sequence_len - 1
    model = Sequential()
    
    # Add Input Embedding Layer
    model.add(Embedding(total_words, 10, input_length=input_len))
    
    # Add Hidden Layer 1 - LSTM Layer
    model.add(LSTM(100))
    model.add(Dropout(0.1))
    
    # Add Output Layer
    model.add(Dense(total_words, activation='softmax'))

    model.compile(loss='categorical_crossentropy', optimizer='adam')
    
    return model

model = create_model(max_sequence_len, total_words)
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 28, 10)            492160    
_________________________________________________________________
lstm_3 (LSTM)                (None, 100)               44400     
_________________________________________________________________
dropout_3 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 49216)             4970816   
Total params: 5,507,376
Trainable params: 5,507,376
Non-trainable params: 0
_________________________________________________________________


### Train

In [118]:
model.fit(predictors, label, epochs=100, verbose=5)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.callbacks.History at 0x7f5b3800aa20>

In [132]:
def generate_text(seed_text, next_words, model, max_sequence_len):
    ''' generate a text snippet based on 
    
    '''
    # for every word of the length we want
    for _ in range(next_words):
        # tokenize the text we already have
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        # seems like we pad it?
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
        # predit using our model!
        predicted = model.predict_classes(token_list, verbose=0)
        
        output_word = ""
        # here it seems that we loop through the returned indexes in
        # the tokenizer (how is it populated though?)
        # and if they match the prediction, we store them as output.
        for word,index in tokenizer.word_index.items():
            # if the index is the same as the predicted word
            if index == predicted:
                # extract the word and store it as output
                output_word = word
                break
        # then update the seed text with this prediction
        seed_text += " "+output_word
    return seed_text.title()

In [159]:
generate_text('Click here', 20, model, max_sequence_len)

'Click Here In The Day But I Am Im What Have Down Anything Down Down We Are Rt And It As Your'

Unnamed: 0,screen_name,fav,rt,tweet,tweet_url
0,Calum5SOS,65617,12662,"Looking through the mentions, there has been a...",https://twitter.com/Calum5SOS/status/115500969...
1,peaceforchange,9945,2184,The clip of Air Marshal is revealed to be doct...,https://twitter.com/peaceforchange/status/1155...
2,GOP,13555,4874,“We've gone through roughly 200 days of this D...,https://twitter.com/GOP/status/115519016241612...
3,Nigel_Farage,9483,2801,If @BorisJohnson tries to put through a reheat...,https://twitter.com/Nigel_Farage/status/115504...
4,PeterStefanovi2,5551,3519,We now hear that through retirement &amp; natu...,https://twitter.com/PeterStefanovi2/status/115...
5,glennkirschner2,5668,2658,Listen up Republicans - these words from Trump...,https://twitter.com/glennkirschner2/status/115...
6,Imamofpeace,1503,765,Wow: “Retired Dean of Qatar’s Intelligence Age...,https://twitter.com/Imamofpeace/status/1155369...
7,JoyceMeyer,10096,2146,"No matter what you are going through, lift it ...",https://twitter.com/JoyceMeyer/status/11552303...
8,SkySportsNews,13501,2328,Gareth Bale's move to Chinese Super League clu...,https://twitter.com/SkySportsNews/status/11554...
9,Truman_Black,18979,2312,People want attention. You have it. You are po...,https://twitter.com/Truman_Black/status/115521...
