In [None]:
import warnings
warnings.filterwarnings("ignore")

#Numpy library for performing mathematical calculations
import numpy as np

#matplotlib for plot functions
from matplotlib import pyplot as plt
	
#nltk library to import the treebank dataset
import nltk
# Read the NLTK treebank corpus with tags mapped to the universal tagset and
# materialise it as a list of sentences, each a sequence of (word, tag) pairs.
treebank_sentences = nltk.corpus.treebank.tagged_sents(tagset='universal')
nltk_data = list(treebank_sentences)

# Show the first tagged sentence as a quick sanity check.
print(nltk_data[0])


In [None]:
#Split tagged sentences into parallel word / tag sequences
def load_sentences(tagged_sentences):
    """Separate words from tags for every sentence.

    Parameters
    ----------
    tagged_sentences : iterable
        Each item is one sentence: an iterable of pairs where element 0 is
        the word and element 1 is its tag.

    Returns
    -------
    tuple of (list, list)
        Two lists of equal length. The first holds, per sentence, the list of
        words; the second holds, per sentence, the list of matching tags.
    """
    words_per_sentence, tags_per_sentence = [], []

    for tagged_sentence in tagged_sentences:
        # Walk the sentence once, then fan the pairs out into two lists.
        pairs = [(token[0], token[1]) for token in tagged_sentence]
        words_per_sentence.append([word for word, _ in pairs])
        tags_per_sentence.append([tag for _, tag in pairs])

    return words_per_sentence, tags_per_sentence

# Split the tagged corpus into parallel lists of word sequences and tag sequences.
X_train, Y_train = load_sentences(nltk_data)

num_sentences = len(X_train)
print("Total number of tagged sentences: {}".format(num_sentences))


In [None]:
from keras.preprocessing.text import Tokenizer

#Fit a tokenizer on the words and turn every sentence into a list of integer ids
word_tokenizer = Tokenizer()
word_tokenizer.fit_on_texts(X_train)
X_encoded = word_tokenizer.texts_to_sequences(X_train)

#Fit a second, independent tokenizer on the tags and encode them the same way
tag_tokenizer = Tokenizer()
tag_tokenizer.fit_on_texts(Y_train)
Y_encoded = tag_tokenizer.texts_to_sequences(Y_train)

#Show the first data point before and after encoding
print("Original data:", "\n")
for label, first_sample in (('X: ', X_train[0]), ('Y: ', Y_train[0])):
    print(label, first_sample, '\n')

print("Encoded data:", "\n")
for label, first_sample in (('X: ', X_encoded[0]), ('Y: ', Y_encoded[0])):
    print(label, first_sample, '\n')


In [None]:
import gensim
from gensim.models import Word2Vec, KeyedVectors

#Location of the pre-trained word2vec binary
path = '../input/googlenewsvectors/GoogleNews-vectors-negative300.bin'

word2vec = KeyedVectors.load_word2vec_format(path, binary=True)

#Build an embedding matrix whose row i holds the word2vec vector of the word with tokenizer id i

EMBEDDING_SIZE  = 300  #Dimensionality of each word2vec vector
VOCABULARY_SIZE = len(word_tokenizer.word_index) + 1  #One extra row beyond the tokenizer's vocabulary

#Start from all zeros
embedding_weights = np.zeros((VOCABULARY_SIZE, EMBEDDING_SIZE))

#Word -> integer id mapping produced by the tokenizer
word2id = word_tokenizer.word_index

#Fill in the rows for every corpus word that word2vec knows about
for word, index in word2id.items():
    try:
        vector = word2vec[word]
    except KeyError:
        #Word is missing from word2vec: its row stays all zeros
        continue
    embedding_weights[index] = vector

print("Embeddings shape: {}".format(embedding_weights.shape))

from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical

# Fixed sequence length fed to the models. Sentences are padded or truncated to
# this many tokens so X and Y become rectangular arrays.
# NOTE: sentences longer than this lose their tail; raise the value to keep them.
MAX_SEQ_LENGTH = 100

# Number of output classes: one per tag id produced by tag_tokenizer, plus one
# for id 0, which pad_sequences uses as the padding value.
NUM_CLASSES = len(tag_tokenizer.word_index) + 1

# Pad the integer-encoded sequences (not the raw string lists) so that words and
# tags stay aligned position by position.
X_padded = pad_sequences(X_encoded, maxlen=MAX_SEQ_LENGTH, padding='pre', truncating='post')
Y_padded = pad_sequences(Y_encoded, maxlen=MAX_SEQ_LENGTH, padding='pre', truncating='post')

# One-hot encode the padded tag ids; num_classes is passed explicitly so the
# last dimension always equals NUM_CLASSES.
Y_onehot = to_categorical(Y_padded, num_classes=NUM_CLASSES)

# Report the shape of the one-hot encoded targets.
print("Shape of Y: {}".format(Y_onehot.shape))

# Hand the model-ready arrays to the train/validation/test split.
X, Y = X_padded, Y_onehot


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

# First hold out 20% of the data as the test set ...
X_trainval, X_test, Y_trainval, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)
# ... then hold out 20% of the remainder as the validation set.
X_train, X_validation, Y_train, Y_validation = train_test_split(
    X_trainval, Y_trainval, test_size=0.2, random_state=4
)


In [None]:
from keras.utils.np_utils import to_categorical
from keras.models import Sequential
from keras.layers import Embedding
from keras.layers import Dense, Input
from keras.layers import TimeDistributed
from keras.layers import LSTM, GRU, Bidirectional, SimpleRNN, RNN

#Creating the architecture: embedding -> SimpleRNN -> per-timestep softmax

rnn_model = Sequential([
    #Embedding layer: no pre-trained weights are supplied and it is not trainable
    Embedding(input_dim=VOCABULARY_SIZE,
              output_dim=EMBEDDING_SIZE,
              input_length=MAX_SEQ_LENGTH,
              trainable=False),
    #Recurrent layer with 64 units that returns the whole output sequence
    SimpleRNN(64, return_sequences=True),
    #Softmax over NUM_CLASSES applied at every position of the sequence
    TimeDistributed(Dense(NUM_CLASSES, activation='softmax')),
])

rnn_model.compile(optimizer='adamax',
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])

#Summary of the model
rnn_model.summary()

#Train, evaluating on the validation split after every epoch
rnn_training = rnn_model.fit(
    X_train, Y_train,
    validation_data=(X_validation, Y_validation),
    batch_size=32,
    epochs=20,
)


In [None]:
#Same architecture, but the embedding layer is trainable
rnn_model = Sequential([
    #Embedding layer: no pre-trained weights are supplied; updated during training
    Embedding(input_dim=VOCABULARY_SIZE,
              output_dim=EMBEDDING_SIZE,
              input_length=MAX_SEQ_LENGTH,
              trainable=True),
    #Recurrent layer with 64 units that returns the whole output sequence
    SimpleRNN(64, return_sequences=True),
    #Softmax over NUM_CLASSES applied at every position of the sequence
    TimeDistributed(Dense(NUM_CLASSES, activation='softmax')),
])

rnn_model.compile(optimizer='adamax',
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])

#Train, evaluating on the validation split after every epoch
rnn_training = rnn_model.fit(
    X_train, Y_train,
    validation_data=(X_validation, Y_validation),
    batch_size=32,
    epochs=20,
)


In [None]:
#Same architecture, with the embedding layer initialised from embedding_weights and trainable
rnn_model = Sequential([
    #Embedding layer: starts from the word2vec-derived matrix and is updated during training
    Embedding(input_dim=VOCABULARY_SIZE,
              output_dim=EMBEDDING_SIZE,
              input_length=MAX_SEQ_LENGTH,
              weights=[embedding_weights],
              trainable=True),
    #Recurrent layer with 64 units that returns the whole output sequence
    SimpleRNN(64, return_sequences=True),
    #Softmax over NUM_CLASSES applied at every position of the sequence
    TimeDistributed(Dense(NUM_CLASSES, activation='softmax')),
])

rnn_model.compile(optimizer='adamax',
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])

rnn_model.summary()

#Train, evaluating on the validation split after every epoch
rnn_training = rnn_model.fit(
    X_train, Y_train,
    validation_data=(X_validation, Y_validation),
    batch_size=32,
    epochs=20,
)
