In [None]:
import numpy as np
import nltk
from nltk.corpus import treebank, brown, conll2000
import tensorflow as tf
# Fetch the three tagged corpora used for training.
for corpus_name in ("treebank", "brown", "conll2000"):
    nltk.download(corpus_name)
# The universal tagset mapping is needed for tagged_sents(tagset='universal').
# Kept as the last expression so the cell still echoes the download status.
nltk.download("universal_tagset")

[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data]   Unzipping corpora/treebank.zip.
[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.
[nltk_data] Downloading package conll2000 to /root/nltk_data...
[nltk_data]   Unzipping corpora/conll2000.zip.
[nltk_data] Downloading package universal_tagset to /root/nltk_data...
[nltk_data]   Unzipping taggers/universal_tagset.zip.


True

In [None]:
# Read every corpus as (word, tag) sentences mapped onto the universal tagset.
treebank_corpus, brown_corpus, conll_corpus = (
    corpus.tagged_sents(tagset='universal')
    for corpus in (treebank, brown, conll2000)
)

# Training data is the three corpora back to back: treebank, then brown, then conll2000.
# (For a quick experiment on the treebank sample alone with its native tags,
# treebank.tagged_sents() can be used instead.)
sentences = treebank_corpus + brown_corpus + conll_corpus


In [None]:
# Split every tagged sentence into two parallel lists:
#   X[k] holds the words of sentence k, Y[k] holds the matching tags.
X, Y = [], []
for tagged_sentence in sentences:
    X.append([token for token, _ in tagged_sentence])
    Y.append([label for _, label in tagged_sentence])

# Vocabulary sizes, counted case-insensitively.
num_words = len({token.lower() for tokens in X for token in tokens})
num_tags = len({label.lower() for labels in Y for label in labels})

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
# One tokenizer per side: words (model input) and tags (model output).
word_tokenizer, tag_tokenizer = Tokenizer(), Tokenizer()

# Learn the word -> integer and tag -> integer vocabularies from the data.
word_tokenizer.fit_on_texts(X)
tag_tokenizer.fit_on_texts(Y)

# Replace every word / tag with its integer index.
X_encoded = word_tokenizer.texts_to_sequences(X)
Y_encoded = tag_tokenizer.texts_to_sequences(Y)

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
# Every sentence is forced to exactly this many timesteps.
MAX_SEQ_LENGTH = 100

# Shorter sentences get trailing zeros; longer ones lose their tail.
pad_options = {"maxlen": MAX_SEQ_LENGTH, "padding": "post", "truncating": "post"}
X_padded = pad_sequences(X_encoded, **pad_options)
Y_padded = pad_sequences(Y_encoded, **pad_options)

In [None]:
from tensorflow.keras.utils import to_categorical
# One-hot encode the padded tag ids. The class count is pinned to num_tags + 1
# (every tag plus index 0 for padding) so the target width always matches the
# model's softmax layer instead of being inferred from the largest id present.
Y_cat = to_categorical(Y_padded, num_classes=num_tags + 1)

In [None]:
from keras import backend as K
 
def ignore_class_accuracy(to_ignore=0):
    """Build an accuracy metric that skips timesteps whose TRUE class is `to_ignore`.

    Args:
        to_ignore: class index to exclude from the accuracy (0 is the padding
            class in this notebook).

    Returns:
        A metric function `ignore_accuracy(y_true, y_pred)` taking one-hot
        targets and per-class scores, and returning the fraction of
        non-ignored timesteps whose predicted class equals the true class.
    """
    def ignore_accuracy(y_true, y_pred):
        y_true_class = K.argmax(y_true, axis=-1)
        y_pred_class = K.argmax(y_pred, axis=-1)

        # Mask on the ground truth, not on the prediction: a position is
        # excluded because it *is* padding, not because the model *guessed*
        # padding. Masking on the prediction would hide real tokens that were
        # wrongly tagged as padding and count padding positions as errors.
        keep_mask = K.cast(K.not_equal(y_true_class, to_ignore), 'float32')
        matches = K.cast(K.equal(y_true_class, y_pred_class), 'float32') * keep_mask
        # The maximum() guard avoids 0/0 when a batch holds only ignored steps.
        accuracy = K.sum(matches) / K.maximum(K.sum(keep_mask), 1.0)
        return accuracy
    return ignore_accuracy

In [None]:
# Load the pre-trained GloVe vectors into a {word: vector} lookup table.
embeddings_index = {}
GLOVE_PATH = "/content/drive/MyDrive/MZ/Glove embeddings/glove.6B.300d.txt"
# The context manager closes the file even if a line fails to parse, and the
# explicit encoding keeps the read independent of the platform default.
with open(GLOVE_PATH, encoding="utf-8") as f:
    for line in f:
        # Each line is "<word> <v1> <v2> ... <vN>".
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

In [None]:
embedding_dim = 300
word_index = word_tokenizer.word_index
# Start from a matrix of zeros. Row r will hold the vector of the word whose
# tokenizer index is r. The row count (num_words + 2) is kept in step with the
# Embedding layer's input_dim; the spare rows simply stay at zero.
embedding_matrix = np.zeros((num_words + 2, embedding_dim))

# For each word known to the tokenizer, copy its GloVe vector if one exists.
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # The Embedding layer looks up row `i` for token id `i`, so the vector
        # must be stored at exactly that row (storing it at i + 1 would hand
        # every word the vector of a different word).
        embedding_matrix[i] = embedding_vector
    # Words without a GloVe vector keep their all-zero row; since the layer is
    # trainable, those rows are learned from scratch during training.

In [None]:
# BiLSTM tagger: token ids -> GloVe-initialised embeddings -> BiLSTM -> per-token softmax.
model = tf.keras.Sequential([
  # One padded sentence of MAX_SEQ_LENGTH token ids per sample.
  tf.keras.layers.Input(shape=(MAX_SEQ_LENGTH,)),
  # Initialised from the GloVe matrix and fine-tuned during training.
  # input_dim must equal the number of rows in embedding_matrix.
  tf.keras.layers.Embedding(
      input_dim=num_words+2,
      output_dim=embedding_dim,
      weights=[embedding_matrix],
      trainable=True,
  ),
  # return_sequences=True keeps one output per timestep, i.e. one tag per token.
  # (No input_shape here: the layer's input shape comes from the Embedding output.)
  tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True)),
  # num_tags + 1 classes: every tag plus index 0 used for padding.
  tf.keras.layers.Dense(num_tags+1, activation='softmax')
])

# 'acc' counts padding positions too; ignore_class_accuracy(0) excludes class 0.
model.compile(loss='categorical_crossentropy', optimizer='adam',metrics=['acc',ignore_class_accuracy(0)])

model.summary()

Model: "sequential_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      (None, 100, 300)          17835000  
_________________________________________________________________
bidirectional_6 (Bidirection (None, 100, 128)          186880    
_________________________________________________________________
dense_6 (Dense)              (None, 100, 13)           1677      
Total params: 18,023,557
Trainable params: 18,023,557
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Keras takes the validation_split from the *end* of the arrays, before any
# shuffling, and the data is three corpora concatenated in order. Without a
# shuffle the validation set would come only from the last corpora, so apply a
# fixed-seed permutation first to get a reproducible, representative split.
shuffle_order = np.random.default_rng(42).permutation(len(X_padded))
history = model.fit(
    X_padded[shuffle_order],
    Y_cat[shuffle_order],
    epochs=5,
    batch_size=128,
    validation_split=0.2,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
def logits_to_tokens(sequences):
    """Decode per-timestep class scores back into tag strings.

    Args:
        sequences: iterable of sequences, each an iterable of per-class score
            vectors (e.g. the array returned by model.predict).

    Returns:
        A list with one entry per input sequence. Each entry is a list with one
        item per timestep, and each item is whatever
        tag_tokenizer.sequences_to_texts returns for the single highest-scoring
        class index: a one-element list such as ['noun'], or [''] when that
        index has no tag (the padding class).
    """
    def decode_step(scores):
        # Highest-scoring class index -> its text via the tag tokenizer.
        return tag_tokenizer.sequences_to_texts([[np.argmax(scores)]])

    return [[decode_step(scores) for scores in sequence] for sequence in sequences]

In [None]:
# Smoke test: tag one hand-written, already space-tokenised sentence.
test_sent = "I am a boy ."
test = [test_sent.split()]  # a batch containing a single sentence

# Apply the same encoding and padding that were used for the training data.
test_input = word_tokenizer.texts_to_sequences(test)
test_input_padded = pad_sequences(
    test_input,
    maxlen=MAX_SEQ_LENGTH,
    padding="post",
    truncating="post",
)

# One row of class scores per timestep, padding positions included.
predictions = model.predict(test_input_padded)
print(logits_to_tokens(predictions))

[[['pron'], ['verb'], ['det'], ['noun'], ['.'], [''], [''], [''], [''], [''], [''], [''], [''], [''], [''], [''], [''], [''], [''], [''], [''], [''], [''], [''], [''], [''], [''], [''], [''], [''], [''], [''], [''], [''], [''], [''], [''], [''], [''], [''], [''], [''], [''], [''], [''], [''], [''], [''], [''], [''], [''], [''], [''], [''], [''], [''], [''], [''], [''], [''], [''], [''], [''], [''], [''], [''], [''], [''], [''], [''], [''], [''], [''], [''], [''], [''], [''], [''], [''], [''], [''], [''], [''], [''], [''], [''], [''], [''], [''], [''], [''], [''], [''], [''], [''], [''], [''], [''], [''], ['']]]
