## Import Libraries

In [3]:
from unicodedata import normalize
import pandas as pd
import numpy as np
import string, os, re
import psutil

# keras module for building LSTM 
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, LSTM, Dense, Dropout
from keras.preprocessing.text import Tokenizer
from keras.callbacks import EarlyStopping
from keras.models import Sequential
from keras.callbacks import ModelCheckpoint
import keras.utils as ku 

# for pre-trained embeddings
import gensim
from gensim.models.word2vec import Word2Vec
from gensim.models.phrases import Phraser, Phrases
from gensim.models import KeyedVectors

import warnings
warnings.filterwarnings("ignore")
warnings.simplefilter(action='ignore', category=FutureWarning)

## Load Dataset

[My notes]
This is just a bunch of strings we are importing. That's what trains our model. And it's sort of a small corpus, too. 

I could feed it Tweets, or a bunch of Reddit stuff, and then ask the model to generate text based on an input sequence. I could even build models for each subreddit. 

In [13]:
from google.colab import drive
drive.mount('/content/drive')

KeyError: 'CLOUDSDK_CONFIG'

In [11]:
url = 'https://raw.githubusercontent.com/callmeandre/generative_language_model/master/data/with_trunc_popular_tweets_13828.csv'
tweets = pd.read_csv(url)

# only use non-truncated tweets
print(tweets.count())
tweets = tweets[tweets['truncated'] == False]
print(tweets.count())

screen_name    13828
fav            13828
rt             13828
tweet          13828
truncated      13828
tweet_url      13828
dtype: int64
screen_name    3156
fav            3156
rt             3156
tweet          3156
truncated      3156
tweet_url      3156
dtype: int64


In [None]:
# load embeddings

# Google News embeddings based on 3M words in 300 dimensions
filename = 'data/GoogleNews-vectors-negative300.bin'
gensim_embeddings = KeyedVectors.load_word2vec_format(filename, binary=True)

pretrained_weights = gensim_embeddings.wv.syn0
vocab_size, embedding_size = pretrained_weights.shape

## Dataset Prep

### Dataset Cleaning

[My notes]: The data is relatively clean as-is, so we don't have to do that much cleaning. If reddit, might need to do more. If Twitter, think it'll be relatively easy since the data is somewhat clean as is. 

In [None]:
def clean_text(txt):
    txt = re.sub(r'https:\/\/t[.]co\/[A-Za-z0-9]*$', '', txt)
    txt = re.sub(r'\n', ' ', txt)
    txt = "".join(v for v in txt if v not in string.punctuation).lower()
    txt = txt.encode("utf8").decode("ascii",'ignore')
    txt = re.sub(' +', ' ', txt)
    return txt 

corpus = [clean_text(x) for x in tweets.tweet]
corpus[:10]

### Generate N-grams

In [None]:
tokenizer = Tokenizer()

def get_sequence_of_tokens(corpus):
    ## tokenization
    tokenizer.fit_on_texts(corpus)
    total_words = len(tokenizer.word_index) + 1
    
    ## convert data to sequence of tokens 
    input_sequences = []
    for line in corpus:
        token_list = tokenizer.texts_to_sequences([line])[0]
        for i in range(1, len(token_list)):
            n_gram_sequence = token_list[:i+1]
            input_sequences.append(n_gram_sequence)
    return input_sequences, total_words

inp_sequences, total_words = get_sequence_of_tokens(corpus)
inp_sequences[:10]

len(inp_sequences)

### Padding Sequences and Obtaining Variables: Predictors and Targetes

In [None]:
def generate_padded_sequences(input_sequences):
    max_sequence_len = max([len(x) for x in input_sequences])
    input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))
    
    # the predictors will be all tokens except last one,
    # which will be the label
    predictors, label = input_sequences[:,:-1],input_sequences[:,-1]
    
    # not needed since we're using sparse_categorical_crossentropy 
    # as the loss function
    # label = ku.to_categorical(label, num_classes=total_words)
    return predictors, label, max_sequence_len

predictors, label, max_sequence_len = generate_padded_sequences(inp_sequences)

## Model

### Architecture

In [None]:
def create_model(max_sequence_len, total_words):
    input_len = max_sequence_len - 1
    model = Sequential()
    
    model.add(Embedding(input_dim=vocab_size,
                       output_dim=embedding_size,
                       weights=[pretrained_weights],
                       trainable=False,
                       name='embedding_layer'))
    
    # Add Hidden Layer 1 - LSTM Layer
    model.add(LSTM(100))
    model.add(Dropout(0.1))
    
    # Add Output Layer
    model.add(Dense(total_words, activation='softmax'))

    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam')
    
    return model

model = create_model(max_sequence_len, total_words)
model.summary()

### Train

In [None]:
model.fit(predictors, label, epochs=100, verbose=1)

In [None]:
def generate_text(seed_text, next_words, model, max_sequence_len):
    ''' generate a text snippet based on 
    
    '''
    # for every word of the length we want
    for _ in range(next_words):
        # tokenize the text we already have
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        # seems like we pad it?
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
        # predit using our model!
        predicted = model.predict_classes(token_list, verbose=0)
        
        output_word = ""
        # here it seems that we loop through the returned indexes in
        # the tokenizer (how is it populated though?)
        # and if they match the prediction, we store them as output.
        for word,index in tokenizer.word_index.items():
            # if the index is the same as the predicted word
            if index == predicted:
                # extract the word and store it as output
                output_word = word
                break
        # then update the seed text with this prediction
        seed_text += " "+output_word
    return seed_text.title()

In [None]:
generate_text('Click here', 20, model, max_sequence_len)