In [43]:
import numpy as np
import pandas as pd
import re
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import LSTM, Dense, GRU, Embedding
from keras.callbacks import EarlyStopping, ModelCheckpoint

from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.optimizers import SGD
from keras.utils import to_categorical


In [44]:
# Read the whole corpus into memory. The context manager guarantees the file
# handle is closed even if reading fails.
with open('../../Data/text-data/rnn-text-file.txt', 'r', encoding='utf-8') as f:
    data_text = f.read()

In [45]:
'''
    We perform only basic text preprocessing, since this data does not have much noise: the text is lowercased so that
    words are treated uniformly, and every non-letter character is replaced by a space.
'''

def text_cleaner(text):
    """Normalise raw text into lowercase, single-space-separated words.

    Steps, in order:
      1. lowercase the text;
      2. drop possessive ``'s`` suffixes (this must happen while the
         apostrophe is still present, i.e. before punctuation is removed);
      3. replace every character that is not ``a``-``z`` with a space;
      4. collapse runs of spaces and trim both ends.

    Trimming last guarantees that ``text_cleaner(x).split(' ')`` never yields
    an empty-string token for text that starts or ends with punctuation.

    Args:
        text: the raw input string.

    Returns:
        The cleaned string containing only ``a``-``z`` and single spaces.
    """
    cleaned = text.lower()
    # remove possessives first: once punctuation is stripped the apostrophe is
    # gone and "john's" would otherwise become the two tokens "john" and "s"
    cleaned = re.sub(r"['’]s\b", " ", cleaned)
    # remove punctuation, digits and anything else that is not a letter
    cleaned = re.sub(r"[^a-z]", " ", cleaned)
    # collapse repeated spaces, then trim the ends
    cleaned = re.sub(r" +", " ", cleaned)
    return cleaned.strip()
   
# preprocess the raw corpus; data_new is the cleaned text that the sequence
# and vocabulary cells below split into words
data_new = text_cleaner(data_text)
# print(data_new)

In [46]:
'''
    We take 30 words as context and ask the model to predict the next word. The value 30 was found by trial and
    error; any other context length can be used.

'''

def create_seq(text, length=30):
    """Slice the text into overlapping word windows for next-word prediction.

    The text is split on single spaces and a window of ``length + 1`` words is
    taken at every position: the first ``length`` words are the context and
    the last word is the prediction target.

    Args:
        text: cleaned text whose words are separated by single spaces.
        length: number of context words per window (default 30).

    Returns:
        A list of word lists, each of size ``length + 1``. The list is empty
        when the text has ``length`` words or fewer.

    Raises:
        ValueError: if ``length`` is smaller than 1.
    """
    if length < 1:
        raise ValueError('length must be at least 1, got %r' % (length,))
    words = text.split(' ')
    # window i covers words[i-length .. i]: `length` context words + 1 target
    sequences = [words[i - length:i + 1] for i in range(length, len(words))]
    print('Total Sequences: %d' % len(sequences))
    return sequences

# create the word windows (context words + 1 target word each) from the cleaned text
sequences = create_seq(data_new)
# print(sequences)

Total Sequences: 5877


In [47]:
'''
    Once the sequences are generated, the next step is to encode each word. After encoding, each word is
    represented by a unique integer, which is the form the Keras library expects.
'''

# build the word -> integer lookup; sorting the distinct words first makes the
# assigned ids deterministic for a given corpus
words = list(set(data_new.split(' ')))
words.sort()
mapping = {word: idx for idx, word in enumerate(words)}

In [48]:
def encode_seq(seq):
    """Integer-encode every token of every sequence.

    Each token is looked up in the module-level ``mapping`` dictionary, so a
    token that is not a key of ``mapping`` raises ``KeyError``.

    Args:
        seq: an iterable of token lists.

    Returns:
        A list with one list of integer ids per input sequence, in order.
    """
    return [[mapping[token] for token in tokens] for tokens in seq]

# encode the sequences
# NOTE: this rebinds `sequences` from word lists to integer lists, so the cell
# is not idempotent -- running it a second time looks integers up in `mapping`
# (whose keys are words) and fails. Re-run the create_seq cell first.
sequences = encode_seq(sequences)

In [8]:
# vocabulary size = number of distinct words in the mapping
vocab = len(mapping)
sequences = np.array(sequences)

# every token but the last is the model input; the last token is the target
X = sequences[:, :-1]
y = sequences[:, -1]

# one hot encode the target over the whole vocabulary
y = to_categorical(y, num_classes=vocab)

# hold out 10% of the windows for validation (fixed seed for repeatability)
X_tr, X_val, y_tr, y_val = train_test_split(X, y, test_size=0.1, random_state=42)

print('Train shape:', X_tr.shape, 'Val shape:', X_val.shape)

Train shape: (5289, 30) Val shape: (588, 30)


In [9]:
# define model
def train_model(model_name, epochs, X_train, y_train, X_val, y_val):
    """Build, train and save a next-word GRU language model.

    Architecture: a trainable 50-dimensional Embedding, a 150-unit GRU with
    10% input and recurrent dropout, and a softmax Dense layer over the
    vocabulary. The model is compiled with categorical cross-entropy and the
    Adam optimizer, reports accuracy, and is written to ``model_name`` after
    fitting.

    The sequence length and vocabulary size are read from the training arrays
    instead of being hard-coded, so the function follows the data it is given.

    Args:
        model_name: path the trained model is saved to.
        epochs: number of training epochs.
        X_train: 2-D integer array of shape (samples, sequence_length).
        y_train: 2-D one-hot array of shape (samples, vocabulary_size).
        X_val: validation inputs, same layout as ``X_train``.
        y_val: validation targets, same layout as ``y_train``.

    Returns:
        None.
    """
    seq_length = X_train.shape[1]
    # the targets are one-hot encoded over the full vocabulary, so their width
    # is both the softmax size and the number of distinct input token ids
    vocab_size = y_train.shape[1]

    model = Sequential()
    model.add(Embedding(vocab_size, 50, input_length=seq_length, trainable=True))
    model.add(GRU(150, recurrent_dropout=0.1, dropout=0.1))
    model.add(Dense(vocab_size, activation='softmax'))
    # summary() prints the table itself and returns None, so it must not be
    # wrapped in print(); one call is enough because fitting does not change
    # the architecture
    model.summary()

    # compile the model
    model.compile(loss='categorical_crossentropy', metrics=['acc'], optimizer='adam')
    # fit the model
    model.fit(X_train, y_train, epochs=epochs, verbose=2, validation_data=(X_val, y_val))
    model.save(model_name)

# train for 5 epochs on the train split, validate on the held-out split, and
# save the result to model1.h5
train_model('model1.h5', 5, X_tr, y_tr, X_val, y_val)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 30, 50)            83950     
                                                                 
 gru (GRU)                   (None, 150)               90900     
                                                                 
 dense (Dense)               (None, 1679)              253529    
                                                                 
Total params: 428379 (1.63 MB)
Trainable params: 428379 (1.63 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None
Epoch 1/5
166/166 - 9s - loss: 6.7457 - acc: 0.0664 - val_loss: 6.5971 - val_acc: 0.0697 - 9s/epoch - 54ms/step
Epoch 2/5
166/166 - 6s - loss: 6.1760 - acc: 0.0679 - val_loss: 6.5478 - val_acc: 0.0748 - 6s/epoch - 34ms/step
Epoch 3/5
166/166 - 6s - loss: 5.9144 - acc: 0.0862 - v

  saving_api.save_model(


In [64]:
from keras.models import load_model

# generate a sequence of words with a language model
def generate_seq(model, mapping, seq_length, seed_text, n_words):
    """Extend ``seed_text`` by ``n_words`` words predicted by ``model``.

    The seed is cleaned with ``text_cleaner``. On every step the words
    generated so far are integer-encoded, cut/padded to the last
    ``seq_length`` tokens, and the most probable next word is appended.

    Args:
        model: a trained model whose ``predict`` returns one probability
            vector over the vocabulary per input row.
        mapping: dict mapping each vocabulary word to its integer id.
        seq_length: number of context tokens fed to the model.
        seed_text: raw text to continue.
        n_words: number of words to generate.

    Returns:
        The cleaned seed text followed by the generated words, separated by
        single spaces.

    Raises:
        ValueError: if the model predicts an id that has no word in
            ``mapping`` (the mapping does not belong to this model).
    """
    # invert the mapping once instead of scanning it for every generated word
    index_to_word = {index: word for word, index in mapping.items()}

    in_text = text_cleaner(seed_text)
    # generate a fixed number of words
    for _ in range(n_words):
        # encode the words as integers; words the model has never seen are
        # skipped rather than raising KeyError on user-supplied seed text
        encoded = [mapping[word] for word in in_text.split(' ') if word in mapping]
        # keep only the last seq_length tokens (shorter inputs are left-padded)
        encoded = pad_sequences([encoded], maxlen=seq_length, truncating='pre')

        # verbose=0 keeps the per-call progress bar out of the cell output
        probabilities = model.predict(encoded, verbose=0)
        # take a plain int so the dictionary lookup is exact
        yhat = int(np.argmax(probabilities, axis=1)[0])

        # reverse map integer to word
        out_word = index_to_word.get(yhat)
        if out_word is None:
            raise ValueError(
                'predicted index %d has no word in mapping; '
                'the mapping does not match the model' % yhat
            )
        # append to input
        in_text = in_text + ' ' + out_word

    return in_text


In [65]:
# reload the model saved by the training cell and extend the seed text by 10
# words, feeding the model a 30-token context (the input length it was built with)
model = load_model('model1.h5')
seed_text = 'However, after 35 minutes, he made the first controversial decision. Pritam Kotal had the ball at his feet in the box.Roy Krishna reached the ball before the Mohun Bagan captain bowled it. The Bengaluru footballer got the ball. Pritam tackled his legs from behind. Krishna fell into the box. Bengaluru footballers continued to appeal for penalties. But the referee did not give a penalty'
txt =  generate_seq(model, mapping, 30, seed_text, 10)
print(txt)

however after minutes he made the first controversial decision pritam kotal had the ball at his feet in the box roy krishna reached the ball before the mohun bagan captain bowled it the bengaluru footballer got the ball pritam tackled his legs from behind krishna fell into the box bengaluru footballers continued to appeal for penalties but the referee did not give a penalty of the prime minister had been a large breakout of
