In [1]:
import numpy as np
import pandas as pd
import keras
import re
import nltk                               
nltk.download('brown')      
from nltk.corpus import brown 
from sklearn.model_selection import train_test_split
import tensorflow
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, GRU, Embedding
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.preprocessing.text import Tokenizer
from numpy.random import choice

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
[nltk_data] Downloading package brown to /Users/atreish/nltk_data...
[nltk_data]   Package brown is already up-to-date!


# Load Corpus

In [2]:
# Flatten the Brown corpus into a list of sentence strings, category by category.
brown_cat = brown.categories()  # list of category labels

docs = []
for cat in brown_cat:
    # each sentence arrives as a sequence of tokens; join them back with spaces
    docs.extend(' '.join(tokens) for tokens in brown.sents(categories=cat))
# docs now holds the raw corpus used for training

# Preprocessing Data

In [3]:
def text_cleaner(text):
    """Normalise one sentence into lowercase, letters-only words.

    The text is lowercased, possessive ``'s`` suffixes are removed, every
    non-letter character is replaced by a space, and words shorter than
    three characters are discarded.

    Args:
        text: the raw sentence as a string.

    Returns:
        The surviving words joined by single spaces (``""`` if none survive).
    """
    lowered = text.lower()
    # drop possessive endings before punctuation is blanked out,
    # otherwise "dog's" would leave a stray "s" token behind
    without_possessive = re.sub(r"'s\b", "", lowered)
    letters_only = re.sub("[^a-zA-Z]", " ", without_possessive)
    kept_words = [word for word in letters_only.split() if len(word) >= 3]
    return " ".join(kept_words)

In [4]:
# Only the first MAX_SENTENCES sentences are used (a larger sample crashed the
# notebook), so slice *before* cleaning instead of cleaning the whole corpus
# and throwing most of the work away. The resulting list is unchanged.
MAX_SENTENCES = 5000
data_new = [text_cleaner(sentence) for sentence in docs[:MAX_SENTENCES]]

# Creating Sequences

In [5]:
# Flatten the cleaned sentences into a single stream of word tokens.
data_new1 = [word for sentence in data_new for word in sentence.split()]

In [6]:
def create_seq(text, length=2):
    """Slide a window over ``text`` and collect every run of ``length + 1`` tokens.

    Each returned sequence holds ``length`` context tokens followed by the
    token that comes next (the prediction target).

    Args:
        text: a sliceable sequence of tokens (e.g. a list of words).
        length: number of context tokens per sequence. Defaults to 2, the
            value that used to be hard-coded.

    Returns:
        A list of slices of ``text``, each ``length + 1`` tokens long. Empty
        when ``text`` has ``length`` tokens or fewer.

    Raises:
        ValueError: if ``length`` is negative.
    """
    if length < 0:
        raise ValueError('length must be non-negative, got %d' % length)
    # window ending at position i covers text[i - length] .. text[i]
    sequences = [text[i - length:i + 1] for i in range(length, len(text))]
    print('Total Sequences: %d' % len(sequences))
    return sequences

In [7]:
# Build the training sequences from the flat word list and show the first
# one as a sanity check.
sequences = create_seq(data_new1)
print(sequences[:1])

Total Sequences: 52917
[['dan', 'morgan', 'told']]


# Encoding a sequence

In [8]:
# Build the vocabulary index: every distinct word, in sorted order, mapped to
# its position. (Despite the name, `chars` holds whole words, not characters.)
chars = sorted(set(data_new1))
mapping = {word: index for index, word in enumerate(chars)}

def encode_seq(seq, word_to_id=None):
    """Integer-encode every word of every sequence.

    Args:
        seq: iterable of sequences, each an iterable of words.
        word_to_id: dict mapping each word to its integer index. When omitted,
            the module-level ``mapping`` is used, which is what the original
            one-argument form always did.

    Returns:
        A list with one list of integer indices per input sequence.

    Raises:
        KeyError: if a word is missing from the lookup table.
    """
    lookup = mapping if word_to_id is None else word_to_id
    return [[lookup[word] for word in line] for line in seq]

# Turn every word sequence into a sequence of integer indices using the
# module-level `mapping`.
sequences_encoded = encode_seq(sequences)

In [9]:
# Display the number of encoded sequences (encode_seq emits one per input sequence).
len(sequences_encoded)

52917

# Training and Validation set

In [10]:
# Vocabulary size: number of distinct words, and the width of the one-hot targets.
vocab = len(mapping)
sequences_encoded = np.array(sequences_encoded)

# Inputs are all columns but the last; the last column is the word to predict,
# one-hot encoded over the vocabulary.
X = sequences_encoded[:, :-1]
y = to_categorical(sequences_encoded[:, -1], num_classes=vocab)

# Hold out 20% of the sequences for validation; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1
)

print('Training shape:', X_train.shape, 'Validation shape:', X_test.shape)

Training shape: (42333, 2) Validation shape: (10584, 2)


# Create Model

In [11]:
# define model: embed each context word, summarise the context with a GRU,
# then score every vocabulary word with a softmax layer
model = Sequential()
# input_length follows the training data instead of repeating the literal 2
model.add(Embedding(vocab, 50, input_length=X_train.shape[1], trainable=True))
model.add(GRU(150, recurrent_dropout=0.1, dropout=0.1))
model.add(Dense(vocab, activation='softmax'))
# summary() prints the table itself and returns None; wrapping it in print()
# added a stray "None" line to the cell output
model.summary()

# compile the model
model.compile(loss='categorical_crossentropy', metrics=['acc'], optimizer='adam')
# fit the model; keep the History object so the loss/accuracy curves can be
# inspected later instead of leaving only its repr as the cell output
history = model.fit(X_train, y_train, epochs=5, verbose=1, validation_data=(X_test, y_test))

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 2, 50)             434650    
_________________________________________________________________
gru (GRU)                    (None, 150)               90450     
_________________________________________________________________
dense (Dense)                (None, 8693)              1312643   
Total params: 1,837,743
Trainable params: 1,837,743
Non-trainable params: 0
_________________________________________________________________
None
Train on 42333 samples, validate on 10584 samples
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as 

<tensorflow.python.keras.callbacks.History at 0x7fab6de60fd0>

# Generate Sequences

In [12]:
# generate a sequence from a language model
def generate_seq(model, mapping, seq_length, seed_text, n_words):
    """Sample ``n_words`` words from ``model``, starting from ``seed_text``.

    At every step the last ``seq_length`` word indices are fed to
    ``model.predict`` and the next word is drawn at random from the returned
    probability distribution.

    Args:
        model: object whose ``predict(batch)`` returns one row of
            probabilities per input row, indexed by word index.
        mapping: dict from word to integer index.
        seq_length: number of context words the model expects.
        seed_text: whitespace-separated starting words; each must be a key of
            ``mapping``.
        n_words: how many words to generate.

    Returns:
        The seed words followed by the generated words, joined by spaces.

    Raises:
        KeyError: if a seed word is not in ``mapping``.

    Note:
        A seed shorter than ``seq_length`` is left-padded with index 0. If
        ``mapping`` assigns index 0 to a real word, the model sees that word
        as the padding.
    """
    result = seed_text.split()
    unknown = [word for word in result if word not in mapping]
    if unknown:
        raise KeyError('seed words not in vocabulary: %s' % ', '.join(unknown))

    # Invert the mapping once so a sampled index is translated by value,
    # not by the position of a key in the dict's iteration order.
    index_to_word = {index: word for word, index in mapping.items()}

    context = [mapping[word] for word in result]
    # generate a fixed number of words
    for _ in range(n_words):
        # Only the last seq_length indices matter; keeping just those avoids
        # re-encoding the whole, ever-growing text on every step.
        context = context[max(0, len(context) - seq_length):]
        # Left-pad with zeros to a (1, seq_length) int32 batch.
        batch = np.zeros((1, seq_length), dtype='int32')
        if context:
            batch[0, seq_length - len(context):] = context
        # predict probabilities for each word
        probability = np.asarray(model.predict(batch)[0], dtype='float64')
        # choice() checks that p sums to 1 within a tolerance; renormalising
        # in float64 keeps float32 rounding in the softmax from tripping it.
        probability = probability / probability.sum()
        # Choose a random word index based on the model's probabilities
        out_index = int(choice(len(probability), p=probability))
        out_word = index_to_word[out_index]
        # append to input
        context.append(out_index)
        result.append(out_word)
    return ' '.join(result)
  
def get_key(val):
    """Reverse lookup in the module-level ``mapping``.

    Returns the first key whose value equals ``val``, or ``None`` when no
    entry matches.
    """
    return next((key for key, value in mapping.items() if val == value), None)

In [13]:
# Print five 50-word continuations of the same seed, one per line.
for attempt in range(5):
    print(generate_seq(model, mapping, 2, "the news", 50))

the news hurt haven walked can get giving her mind did not conservative them you you can any the fragment became lalaurie that close explosive upwards carmer managed gambits over three bourbons sparks glimpse painful supplicating and dignity force until matter you don ain hunting yourself money all into the pacific position
the news and stringed start upon coat and they don ever want the far black had wound the ignored the kid day stood any world meredith could made here was some command julia unfamiliar swinging air and began aaa methods pig ago but was the long shirt toward the door was dusty
the news the arrogant door and walked nate wrist saw stopped off the devastating course filled later these problem pulled her eyes had been stave packed let into exclusive before the stockade rapture his neck badge runaway belongings were his watching out man would come possible brassnose when coiled the dipper the
the news grazer her with the country claimants against the febrile occasion sudde