In [0]:
# Keras 
import keras.utils as ku 
# The Layers we will Use
from keras.layers import Embedding, LSTM, Dense
# Tokenizer
from keras.preprocessing.text import Tokenizer
# Our model
from keras.models import Sequential
# To address overfitting
from keras.callbacks import EarlyStopping
# To enable sequencing in our data
from keras.preprocessing.sequence import pad_sequences
# Seeds for reproducibility
from tensorflow import set_random_seed
from numpy.random import seed
set_random_seed(2)
seed(1)

import pandas as pd
import numpy as np
import string, os 

Using TensorFlow backend.


In [0]:
# Load the data from your files.
# The context manager closes the file handle as soon as the read finishes,
# instead of leaving it open until the object happens to be garbage collected.
with open("kanye.txt") as lyrics_file:
    text = lyrics_file.read()


In [0]:
# See what it looks like, get the first 50 characters
text[0:50]
# Safe to say Kanye is a bit over the top

In [0]:
# The raw text contains a lot of \n characters; each one marks where a new line starts.
# Splitting on "\n" gets rid of them and gives us a list with one entry per line of the file.
splitted_text = text.split("\n")

In [0]:
# Visualize how this looks
splitted_text[0:10]

In [0]:
# The text may contain characters we are not used to (such as è), punctuation and
# mixed casing, so every line is normalised before it is tokenized.
def remove_unwanted_text(new_text):
    """Return `new_text` with punctuation removed, lower-cased and reduced to ASCII.

    Steps, in order:
      1. delete every character listed in `string.punctuation`,
      2. lower-case what is left,
      3. drop every non-ASCII character (e.g. "è" disappears entirely).
    """
    without_punctuation = new_text.translate(str.maketrans("", "", string.punctuation))
    lowered = without_punctuation.lower()
    # Encode to UTF-8 bytes, then decode as ASCII while ignoring errors: all bytes
    # that belong to a non-ASCII character are silently discarded.
    return lowered.encode("utf8").decode("ascii", 'ignore')


In [0]:
# Run every line through remove_unwanted_text to get the cleaned lines
final_text = list(map(remove_unwanted_text, splitted_text))


In [0]:
# Visualize it once again
final_text[0:10]

We now want to find a way to "encode" our words as numbers, because our model is only able to work with numbers. If we assign a specific number to each word, then our algorithm will be able to learn!

In [0]:
# Keras has a great way to do this with its Tokenizer object
# Creating one gives us access to the tokenizer capabilities from Keras
tokenizer = Tokenizer()

In [0]:
# Now we fit the tokenizer on our cleaned lines so it learns a number for every word
tokenizer.fit_on_texts(final_text)
# Try it out: tokenize our first sentence
token_list = tokenizer.texts_to_sequences([final_text[0]])[0]


In [0]:
#Visualize it
print(token_list)
print(final_text[0])


You can see each word has become a unique number 

In [0]:
# Now we do that for all of our sentences

In [0]:
def get_sequences(text):
    """Turn a list of sentences into growing n-gram token sequences.

    The notebook-level `tokenizer` is fitted on `text` and then used to encode
    each sentence. For every sentence, one sequence is produced per prefix of
    at least two tokens, e.g. "hello new world" gives
        hello new
        hello new world
    Sentences with fewer than two known tokens contribute nothing.

    Returns:
        (sequences, total_words): the list of token-id lists, and the size of
        the vocabulary plus one. The extra slot exists because Keras word
        indices start at 1; index 0 is reserved (it is what padding uses).
    """
    # Encode our words (updates the shared tokenizer)
    tokenizer.fit_on_texts(text)
    total_words = len(tokenizer.word_index) + 1

    sequences = []
    for sentence in text:
        # Token ids for this sentence
        encoded = tokenizer.texts_to_sequences([sentence])[0]
        # Every prefix of length 2..len(encoded): a word together with all the words before it
        sequences.extend(encoded[:end] for end in range(2, len(encoded) + 1))
    return sequences, total_words


In [0]:
# Give our function get_sequences our kanye text
sequences, total_words = get_sequences(final_text)

In [0]:
# See how many UNIQUE words we have
print(total_words)

In [0]:
# Check our first 10 sequences
sequences[0:10]

In [0]:
# Compare to our first 2 sentences
final_text[:2]
# Try and find patterns

In [0]:
def padded_sequences(sequences, num_classes=None):
    """Pad the token sequences to one length and split them into inputs and labels.

    The model cannot take inputs of different sizes, so every sequence is
    left-padded with 0's up to the length of the longest one. The last token of
    each padded sequence is the word to predict (the label) and everything
    before it is the input. For the phrase "hello new world" that gives:
        hello     -> new
        hello new -> world

    Labels are one-hot encoded so that no word looks more important just
    because its index is a bigger number.

    Args:
        sequences: list of token-id lists (as produced by get_sequences).
        num_classes: length of each one-hot label vector. When omitted, the
            notebook-level `total_words` is used, which is what this function
            always relied on before the parameter existed.

    Returns:
        (data, label, max_sequence_length): the 2-D array of inputs, the 2-D
        array of one-hot labels, and the padded length of a full sequence
        (input tokens plus the label token).

    Raises:
        ValueError: if `sequences` is empty.
    """
    if len(sequences) == 0:
        raise ValueError("padded_sequences needs at least one sequence")
    if num_classes is None:
        # Backward-compatible fallback to the vocabulary size computed earlier in the notebook
        num_classes = total_words

    # Longest sequence decides the fixed length; shorter ones get 0's in front
    max_sequence_length = max(len(x) for x in sequences)
    # pad_sequences already returns a numpy array, so no extra conversion is needed
    padded = pad_sequences(sequences, maxlen=max_sequence_length, padding='pre')

    # Inputs: all the words up to (not including) the last one
    data = padded[:, :-1]
    # Label: the last word, turned into an array of 0's with a single 1
    label = ku.to_categorical(padded[:, -1], num_classes=num_classes)
    return data, label, max_sequence_length



In [0]:
data, label, max_sequence_length = padded_sequences(sequences)

In [0]:
# Now visualize our first sequence: the padded model input next to the original tokens
print(data[0])
print(sequences[0])
# The input has a bunch of 0's in front (this is the padding, which is only added before the words)
# and it is missing the last token of the original sequence, because that token became the label

# Now We Build The Model

In [0]:
# Declare a sequential model as one ordered stack of layers:
#   1. Embedding: takes the inputs and maps each word index to a vector of size 10.
#      The input length is max_sequence_length - 1 because the last token of every
#      padded sequence is used as the label, not as input.
#   2. LSTM layer with 100 units.
#   3. Dense output layer with softmax activation: one score per word in the vocabulary.
model = Sequential([
    Embedding(total_words, 10, input_length=max_sequence_length - 1),
    LSTM(100),
    Dense(total_words, activation='softmax'),
])
# Compile model with categorical cross-entropy loss and the adam optimizer
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [0]:
# See what the model looks like
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 18, 10)            49380     
_________________________________________________________________
lstm_2 (LSTM)                (None, 100)               44400     
_________________________________________________________________
dense_2 (Dense)              (None, 4938)              498738    
Total params: 592,518
Trainable params: 592,518
Non-trainable params: 0
_________________________________________________________________


In [0]:
# Now train it with our sequences and labels for 100 epochs. verbose is just how we want to see the progress
# CAUTION: THIS WILL TAKE ~3 HOURS TO RUN USING GOOGLE COLAB'S GPU
model.fit(data, label, epochs=100, verbose=2)

Nevertheless, I took the time to run it and we can see the outputs here. **This part will not run locally on your computer because you do not have the file model_with_verses2.h5. This file will be uploaded to the Turing Club FB page so you can test it out.**

In [0]:
import tensorflow as tf
# Rebind `model` to the previously trained network stored in model_with_verses2.h5
model = tf.keras.models.load_model('model_with_verses2.h5')


This function generates the text. Disregard it. **Taken from Stack Overflow.**

In [0]:
def generate_text(seed_text, next_words, model, max_sequence_len):
    """Extend `seed_text` by `next_words` words predicted one at a time by `model`.

    On every step the text generated so far is encoded with the notebook-level
    `tokenizer`, left-padded to `max_sequence_len - 1` tokens, and fed to the
    model; the highest-scoring word index is turned back into a word and
    appended to the text.

    Args:
        seed_text: the text to start from.
        next_words: how many words to append.
        model: a model whose `predict` returns one score per word index.
        max_sequence_len: full sequence length used in training (inputs plus
            label); the model input is one token shorter.

    Returns:
        The seed text followed by the generated words, in title case.
    """
    # Build the index -> word lookup once instead of scanning the whole
    # vocabulary for every generated word.
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}

    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len - 1, padding='pre')
        # argmax over the scores is what predict_classes did for a softmax output;
        # predict_classes itself is no longer available in recent Keras releases.
        scores = model.predict(token_list, verbose=0)
        predicted = int(np.argmax(scores, axis=-1)[0])

        # An index with no word (e.g. 0, which no word uses) adds an empty word, as before
        output_word = index_to_word.get(predicted, "")
        seed_text += " " + output_word
    return seed_text.title()

Put your input here! In the first parameter put the text you want to start with; in the second, the number of words to generate.

In [0]:
print (generate_text("How", 15, model, max_sequence_length))

How Anything A Cause I Right You Leave How Do Jesus This Supposed A Dude The


In [0]:
# The eminem one
# NOTE(review): generate_text still uses the tokenizer that was fitted on kanye.txt.
# Unless this model was trained with that same word index, its predicted indices
# will be mapped back to the wrong words — confirm.
model = tf.keras.models.load_model('model_with_eminem.h5')


In [0]:
# NOTE(review): 33 is presumably the max sequence length the Eminem model was trained with — confirm
print (generate_text("Go", 15, model, 33))

Go The Say Tv Pryor You Slaves Mistakes But You Ups Ghetto She Go To Wait


We could make these models much better if you are more patient and are willing to wait 6-7 hours for them to train on much more data.
For now these seem like pretty good results

Thank you for coming!