<a href="https://colab.research.google.com/github/aditichak22/nlp-rnn/blob/main/PostRNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# install necessary packages using pip
!pip install keras numpy wget

Collecting wget
  Downloading wget-3.2.zip (10 kB)
Building wheels for collected packages: wget
  Building wheel for wget (setup.py) ... [?25l[?25hdone
  Created wheel for wget: filename=wget-3.2-py3-none-any.whl size=9672 sha256=e33727fc7036577d60321279b27be5f156847fe1b0d20a7924b7f9743f1baf43
  Stored in directory: /root/.cache/pip/wheels/a1/b6/7c/0e63e34eb06634181c63adacca38b79ff8f35c37e3c13e3c02
Successfully built wget
Installing collected packages: wget
Successfully installed wget-3.2


In [None]:
import os
import io
import sys
import pandas as pd

In [None]:
def load_corpus(path):
    """Load a directory of ``word/TAG`` annotated text files.

    Every entry directly inside ``path`` is opened and read line by line.
    Each line is split on whitespace, and each token is split at its LAST
    "/" so that words which themselves contain a slash (e.g. ``1/2/NUM``)
    keep their tag in position 1. Lines without tokens are skipped.

    Args:
        path: Directory that contains the corpus files.

    Returns:
        A list of sentences; each sentence is a list of tuples, one per
        token. A token without any "/" yields a 1-tuple.

    Raises:
        SystemExit: If ``path`` is not a directory, or if an entry inside it
            cannot be opened or read.
    """
    # Check if the path is a directory.
    if not os.path.isdir(path):
        sys.exit("Input path is not a directory")

    corpusList = []
    # os.listdir order is unspecified; sorting keeps the sentence order
    # (and therefore any later train/validation split) reproducible.
    for entry in sorted(os.listdir(path)):
        filename = os.path.join(path, entry)
        try:
            # The context manager closes the handle even if parsing fails.
            with io.open(filename) as reader:
                for line in reader:
                    tagged = [tuple(token.rsplit("/", 1)) for token in line.split()]
                    if tagged:
                        corpusList.append(tagged)
        except IOError:
            sys.exit("Cannot read file")
    return corpusList
    

# Load the tagged corpus. The relative path presumably points into a mounted
# Google Drive folder (TODO confirm the mount step runs before this cell).
# `data` is the list of tagged sentences passed to create_dataset() later on.
path = "drive/MyDrive/modified_brown"
data = load_corpus(path)

In [None]:
import numpy as np 


# Creates the dataset with train_X (words) and train_y (tag).
def _as_sequence_array(sequences):
    """Wrap a list of index lists in a NumPy array, tolerating ragged input."""
    if len({len(seq) for seq in sequences}) <= 1:
        # Uniform (or empty) input converts to a regular array as before.
        return np.array(sequences)
    # Ragged input: np.array() on nested lists of different lengths raises
    # ValueError on recent NumPy releases, so build the 1-D object array
    # (one Python list per sentence) explicitly.
    ragged = np.empty(len(sequences), dtype=object)
    for position, seq in enumerate(sequences):
        ragged[position] = seq
    return ragged


# Creates the dataset with train_X (words) and train_y (tag).
def create_dataset(sentences):
    """Convert tagged sentences into integer index sequences.

    Args:
        sentences: Iterable of sentences, each a sequence of items whose
            element 0 is the word and element 1 is the tag.

    Returns:
        A 7-tuple ``(train_X, train_y, vocab_size, tag_size, tag_list,
        vocab_dict, tags_dict)`` where

        * ``train_X`` / ``train_y`` are arrays holding, per sentence, the word
          indices and tag indices (a 1-D object array of lists when the
          sentences differ in length),
        * ``vocab_size`` / ``tag_size`` count the distinct words / tags
          (the "PAD" entry is not counted),
        * ``tag_list`` is the sorted list of tags, so ``tag_list[i - 1]`` is
          the tag with index ``i``,
        * ``vocab_dict`` / ``tags_dict`` map word / tag to its index, with
          index 0 reserved for the key "PAD".
    """
    vocab = set()
    tags = set()
    for sentence in sentences:
        for word in sentence:
            vocab.add(word[0])
            tags.add(word[1])

    # Sort before numbering: set iteration order for strings can change from
    # one interpreter run to the next, which would silently renumber the
    # vocabulary and tags.
    tag_list = sorted(tags)
    vocab_dict = {token: i for i, token in enumerate(sorted(vocab), start=1)}
    tags_dict = {tag: i for i, tag in enumerate(tag_list, start=1)}

    # Index 0 is reserved for padding.
    vocab_dict["PAD"] = 0
    tags_dict["PAD"] = 0

    train_X = [[vocab_dict[word[0]] for word in sentence] for sentence in sentences]
    train_y = [[tags_dict[word[1]] for word in sentence] for sentence in sentences]

    return (
        _as_sequence_array(train_X),
        _as_sequence_array(train_y),
        len(vocab),
        len(tags),
        tag_list,
        vocab_dict,
        tags_dict,
    )


# Build the integer-encoded dataset from the loaded corpus.
# vocab_dict and tags_dict are read as globals by define_model() and test().
train_X, train_y, vocab_size, tag_size, tag_list, vocab_dict, tags_dict = create_dataset(data)



In [None]:
from keras.preprocessing.sequence import pad_sequences as pad


# Pad the sequences with 0s to the max length.
def pad_sequences(train_X, train_y):
    """Pad the word and tag sequences at the end to a common length.

    Args:
        train_X: Sequence of word-index sequences.
        train_y: Sequence of tag-index sequences.

    Returns:
        ``(X_padded, Y_padded, MAX_LENGTH)`` where ``MAX_LENGTH`` is the
        length of the longest sequence in ``train_X``. The padded tag
        matrix is also printed.
    """
    # MAX_LENGTH records the length of the longest word sequence.
    MAX_LENGTH = max(len(sequence) for sequence in train_X)

    # Pad words first, then tags, both to the same length.
    X_padded, Y_padded = (
        pad(sequences, MAX_LENGTH, padding='post')
        for sequences in (train_X, train_y)
    )

    print(Y_padded)
    return X_padded, Y_padded, MAX_LENGTH

# Pad the dataset. NOTE: this rebinds train_X / train_y to the padded arrays,
# so the unpadded sequences from create_dataset() are no longer reachable
# under these names after this cell has run.
train_X, train_y, MAX_LENGTH = pad_sequences(train_X, train_y)

[[ 5 10  7 ...  0  0  0]
 [ 9  9  2 ...  0  0  0]
 [ 9  5  2 ...  0  0  0]
 ...
 [ 4  7  3 ...  0  0  0]
 [ 2  8  7 ...  0  0  0]
 [ 6  2  1 ...  0  0  0]]


In [None]:
from keras.models import Sequential
from keras.layers import InputLayer, Activation
from keras.layers import Dense, LSTM, InputLayer, Bidirectional, TimeDistributed, Embedding
from tensorflow.keras.optimizers import Adam 

# Define the Keras model.
def define_model(MAX_LENGTH, embedding_dim=128, lstm_units=256, learning_rate=0.001):
    """Build and compile the bidirectional LSTM tagger.

    The vocabulary size and the number of output classes are taken from the
    notebook-level ``vocab_dict`` and ``tags_dict`` (both include the "PAD"
    entry at index 0).

    Args:
        MAX_LENGTH: Length of the (padded) input sequences.
        embedding_dim: Size of the word embedding vectors.
        lstm_units: Number of units in each direction of the LSTM.
        learning_rate: Learning rate passed to the Adam optimizer.

    Returns:
        The compiled Keras ``Sequential`` model. Its summary is printed as a
        side effect.
    """
    model = Sequential()
    model.add(Embedding(len(vocab_dict), embedding_dim, input_length=MAX_LENGTH))
    # return_sequences=True keeps one output per time step (one tag per word).
    model.add(Bidirectional(LSTM(lstm_units, return_sequences=True)))
    model.add(TimeDistributed(Dense(len(tags_dict))))
    model.add(Activation('softmax'))
    # categorical_crossentropy expects one-hot targets (see to_categorical).
    model.compile(loss='categorical_crossentropy',
                  optimizer=Adam(learning_rate), metrics=['accuracy'])
    # summary() prints the table itself and returns None, so it must not be
    # wrapped in print() (that emitted a stray "None" line).
    model.summary()
    return model

# Build the model for the padded sequence length computed by pad_sequences().
model = define_model(MAX_LENGTH)

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 180, 128)          7165952   
_________________________________________________________________
bidirectional (Bidirectional (None, 180, 512)          788480    
_________________________________________________________________
time_distributed (TimeDistri (None, 180, 12)           6156      
_________________________________________________________________
activation (Activation)      (None, 180, 12)           0         
Total params: 7,960,588
Trainable params: 7,960,588
Non-trainable params: 0
_________________________________________________________________
None


In [None]:
# Returns the one-hot encoding of the sequence.
from keras.utils.np_utils import to_categorical as categorical
def to_categorical(train_y, categories):
    """One-hot encode the tag indices.

    Thin wrapper around the Keras helper imported as ``categorical``.

    Args:
        train_y: Integer tag indices to encode.
        categories: Total number of classes, forwarded as ``num_classes``.

    Returns:
        The result of the Keras helper for ``train_y``.
    """
    one_hot = categorical(train_y, num_classes=categories)
    return one_hot


# One-hot encode the padded tag indices; the number of classes is
# len(tags_dict), which includes the "PAD" class at index 0.

train_y1 = to_categorical(train_y, len(tags_dict))


In [None]:
import tensorflow as tf

# Trains the model.
def train(model, train_X, train_y, epochs=40, batch_size=128, validation_split=0.2):
    """Fit ``model`` on the given data.

    Args:
        model: Object exposing a Keras-style ``fit`` method.
        train_X: Input sequences.
        train_y: Targets matching ``train_X``.
        epochs: Number of passes over the training data.
        batch_size: Number of samples per gradient update.
        validation_split: Fraction of the data passed to ``fit`` as the
            validation split.

    Returns:
        Whatever ``model.fit`` returns. For Keras models this is a training
        history object, NOT the model; keep using the ``model`` argument for
        predictions.
    """
    history = model.fit(
        train_X,
        train_y,
        validation_split=validation_split,
        batch_size=batch_size,
        epochs=epochs,
    )
    return history

# Train on the one-hot encoded tags. Despite its name, `trained_model` holds
# the value returned by model.fit() inside train(), not the model itself;
# later cells keep using `model` for predictions.
trained_model = train(model, train_X, train_y1)

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


In [None]:
def logits_to_tokens(sequences, index):
    """Map per-step score vectors to tokens.

    Args:
        sequences: Iterable of sequences, each an iterable of score vectors.
        index: Mapping from class position to token.

    Returns:
        A list with one list per input sequence, holding for every step the
        token looked up at the position of that step's highest score.
    """
    return [
        [index[np.argmax(step_scores)] for step_scores in sequence]
        for sequence in sequences
    ]

In [None]:
# Test a sentence using the given model.
def test(model, sentence):
    """Tag a whitespace-tokenised sentence and print the predicted tags.

    Words missing from the notebook-level ``vocab_dict`` are encoded with
    the "PAD" index. The sentence is padded at the end to ``MAX_LENGTH``,
    and only the predictions for the actual words are printed.

    Args:
        model: Object whose ``predict`` method returns per-step class scores.
        sentence: Sentence whose tokens are separated by whitespace.
    """
    words = sentence.split()
    encoded = [
        vocab_dict[token] if token in vocab_dict else vocab_dict["PAD"]
        for token in words
    ]

    batch = pad([encoded], MAX_LENGTH, padding='post')
    scores = model.predict(batch)

    # Invert tag -> index so predicted class positions map back to tag names.
    index_to_tag = {position: tag for tag, position in tags_dict.items()}
    decoded = logits_to_tokens(scores, index_to_tag)

    # Drop the predictions made for the padding positions.
    print(decoded[0][:len(words)])


# Tag a handful of sample sentences with the trained model.
for sample_sentence in (
    "the secretariat is expected to race tomorrow .",
    "people continue to enquire the reason for the race for outer space .",
    "the planet jupiter and its moons are in effect a mini solar system .",
    "computers process programs accurately .",
):
    test(model, sample_sentence)



['DETERMINER', 'NOUN', 'VERB', 'VERB', 'PREPOSITION', 'VERB', 'NOUN', 'PUNCT']
['NOUN', 'VERB', 'PREPOSITION', 'VERB', 'DETERMINER', 'NOUN', 'PREPOSITION', 'DETERMINER', 'NOUN', 'PREPOSITION', 'ADJECTIVE', 'NOUN', 'PUNCT']
['DETERMINER', 'NOUN', 'ADJECTIVE', 'CONJUNCTION', 'PRONOUN', 'NOUN', 'VERB', 'PREPOSITION', 'NOUN', 'DETERMINER', 'ADJECTIVE', 'ADJECTIVE', 'NOUN', 'PUNCT']
['VERB', 'NOUN', 'NOUN', 'ADVERB', 'PUNCT']
