In [None]:
import xml.etree.ElementTree as ET
from collections import Counter
import matplotlib.pyplot as plt
import pandas as pd
import pprint
import pickle
import os
import json
import numpy as np
from numpy import zeros
import multiprocessing
import time
from sklearn import metrics
import seaborn as sn
from numpy import asarray
from numpy import savetxt
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
import tensorflow.keras.layers as layers
import tensorflow.keras.optimizers as optimizers
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras import backend as K
# Shared pretty-printer; nested structures are indented by four spaces.
pp = pprint.PrettyPrinter(4)

In [None]:
# Enable on-demand GPU memory allocation on every visible GPU.
# Iterating (instead of indexing [0]) keeps the cell runnable on CPU-only
# machines, where the list is empty, and applies the same setting to all
# GPUs on multi-GPU hosts.
physical_devices = tf.config.list_physical_devices('GPU')
print(physical_devices)
for gpu in physical_devices:
    tf.config.experimental.set_memory_growth(gpu, enable=True)

In [None]:
def _token_from_element(element):
    """Return the ``(word, tag)`` pair encoded by one XML element, or ``None``.

    * ``<w>`` / ``<c>``: the element text with spaces removed, tagged with its
      ``c5`` attribute. Elements without text yield ``None``.
    * ``<mw>``: the concatenation of the texts of its direct ``<w>`` children
      (spaces removed), tagged with the ``c5`` attribute of the ``<mw>`` itself.
      Children without text contribute an empty string instead of raising.
    * anything else: ``None``.

    Raises:
        KeyError: if a token-bearing element has no ``c5`` attribute.
    """
    if element.tag in ('w', 'c'):
        if element.text is None:
            return None
        return element.text.replace(" ", ""), element.attrib['c5']

    if element.tag == 'mw':
        tag = element.attrib['c5']
        word = "".join(
            (w_tag.text or "").replace(" ", "") for w_tag in element.iterfind('w')
        )
        return word, tag

    return None


def parse_data(file):
    """Parse one XML corpus file into tokenised sentences and their tags.

    Every ``<s>`` element found anywhere in the document becomes one sentence.
    Tokens are read from the direct children of ``<s>``; ``<hi>`` and ``<corr>``
    children act as transparent wrappers whose own direct children are read
    the same way (one level only).

    Args:
        file: path or file object accepted by ``xml.etree.ElementTree.parse``.

    Returns:
        ``(data, labels)``: two parallel lists with one entry per sentence.
        ``data[i]`` is the list of words and ``labels[i]`` the list of ``c5``
        tags of sentence ``i``. Sentences without tokens are kept as empty
        lists.
    """
    root = ET.parse(file).getroot()

    data = []
    labels = []

    for s_tag in root.iter('s'):
        sentence = []
        tags = []

        for e_tag in s_tag:
            # Wrapper elements contribute their children; everything else is
            # itself a token candidate.
            candidates = e_tag if e_tag.tag in ('hi', 'corr') else (e_tag,)

            for candidate in candidates:
                token = _token_from_element(candidate)
                if token is None:
                    continue
                word, tag = token
                sentence.append(word)
                tags.append(tag)

        data.append(sentence)
        labels.append(tags)

    return data, labels

In [None]:
def load_dataset(path):
    """Parse every file below ``path`` and concatenate the results.

    The directory tree is walked in sorted order so that the order of the
    returned sentences is the same on every machine and every run
    (``os.walk`` itself yields entries in an unspecified order).

    Args:
        path: root directory of the corpus.

    Returns:
        ``(data, labels)``: parallel lists of sentences and tag sequences, as
        produced by ``parse_data`` for each file and joined together.
    """
    data = []
    labels = []

    for subdir, dirs, files in os.walk(path):
        dirs.sort()  # in-place sort steers the order in which os.walk descends
        for file in sorted(files):
            file_data, file_labels = parse_data(os.path.join(subdir, file))
            data.extend(file_data)
            labels.extend(file_labels)

    return data, labels

In [None]:
# Load Dataset
# Corpus locations are relative to the notebook's working directory.
train_path, test_path = 'Train-corpus/', 'Test-corpus/'

# Each call returns parallel lists: sentences and their tag sequences.
data, labels = load_dataset(train_path)
test_data, test_labels = load_dataset(test_path)

In [None]:
# Sanity check: sentence/label counts plus one sample from each split.
print(len(data), len(labels), sep='\n')
print(data[0], labels[0], sep='\n')

print(len(test_data), len(test_labels), sep='\n')
print(test_data[11], test_labels[11], sep='\n')

In [None]:
def len_cap(cap, data, labels):
    """Keep only the sentences strictly shorter than ``cap`` tokens.

    Args:
        cap: exclusive upper bound on sentence length.
        data: iterable of sentences (each supporting ``len``).
        labels: sequence indexed in parallel with ``data``.

    Returns:
        ``(kept_sentences, kept_labels)`` as two new lists, in input order.
    """
    kept = [
        (sentence, labels[position])
        for position, sentence in enumerate(data)
        if len(sentence) < cap
    ]
    return [sentence for sentence, _ in kept], [tags for _, tags in kept]

In [None]:
# Keep references to the uncapped corpora so the length filter in the next
# cell can be re-run without reloading the XML files.
a, b = data, labels
c, d = test_data, test_labels

In [None]:
# Drop every sentence with 32 or more tokens from both splits.
data, labels = len_cap(cap=32, data=a, labels=b)
test_data, test_labels = len_cap(cap=32, data=c, labels=d)

In [None]:
# Sizes of both splits after the length filter.
print(len(data), len(labels), sep='\n')
print(len(test_data), len(test_labels), sep='\n')

In [None]:
# Vocabulary of the training split: lower-cased word forms and raw tags.
words = {word.lower() for sentence in data for word in sentence}
tags = {tag for tag_sequence in labels for tag in tag_sequence}

In [None]:
# Number the vocabularies in sorted order. Iterating a set of strings yields
# a different order in each interpreter session (string hashing is
# randomised), so numbering straight from the set would give every kernel
# restart a different word/tag <-> id mapping. That silently breaks any model
# reloaded from disk, whose output units were trained against the old
# numbering.
word2index = {w: i + 2 for i, w in enumerate(sorted(words))}

word2index['-PAD-'] = 0  # The special value used for padding
word2index['-OOV-'] = 1  # The special value used for OOVs

tag2index = {t: i + 1 for i, t in enumerate(sorted(tags))}
tag2index['-PAD-'] = 0  # The special value used for padding

In [None]:
def convert_to_categorical():
    """Encode the global train/test corpora as lists of integer ids.

    Words are lower-cased and looked up in ``word2index``; words missing from
    it are replaced by the id of ``'-OOV-'``. Tags are looked up in
    ``tag2index`` (a tag missing from it raises ``KeyError``).

    Reads the module-level ``data``, ``test_data``, ``labels``,
    ``test_labels``, ``word2index`` and ``tag2index``.

    Returns:
        ``(train_sentences_X, train_tags_y, test_sentences_X, test_tags_y)``.
    """
    def encode_word(word):
        key = word.lower()
        return word2index[key] if key in word2index else word2index['-OOV-']

    def encode_sentences(sentences):
        return [[encode_word(word) for word in sentence] for sentence in sentences]

    def encode_tags(tag_sequences):
        return [[tag2index[tag] for tag in sequence] for sequence in tag_sequences]

    train_sentences_X = encode_sentences(data)
    test_sentences_X = encode_sentences(test_data)
    train_tags_y = encode_tags(labels)
    test_tags_y = encode_tags(test_labels)

    return train_sentences_X, train_tags_y, test_sentences_X, test_tags_y
    

In [None]:
# Integer-encode both splits (order of the returned tuple: train X, train y,
# test X, test y).
(
    data_int,
    labels_int,
    test_data_int,
    test_labels_int,
) = convert_to_categorical()

In [None]:
# First encoded sentence and tag sequence of each split.
print(data_int[0], labels_int[0], sep='\n')
print(test_data_int[0], test_labels_int[0], sep='\n')

In [None]:
# Length of the longest training sentence; used as the padded sequence length.
MAX_LENGTH = max(len(sentence) for sentence in data)
print(MAX_LENGTH)

In [None]:
# Right-pad the training ids and tag ids with zeros up to MAX_LENGTH.
data_int = pad_sequences(
    sequences=data_int,
    maxlen=MAX_LENGTH,
    padding='post',
)
labels_int = pad_sequences(
    sequences=labels_int,
    maxlen=MAX_LENGTH,
    padding='post',
)

In [None]:
# Right-pad the test ids and tag ids with zeros up to MAX_LENGTH.
test_data_int = pad_sequences(
    sequences=test_data_int,
    maxlen=MAX_LENGTH,
    padding='post',
)
test_labels_int = pad_sequences(
    sequences=test_labels_int,
    maxlen=MAX_LENGTH,
    padding='post',
)

In [None]:
# Fit the Keras tokenizer on the (already tokenised) training sentences.
#
# An explicit OOV token is required: without one, `texts_to_sequences`
# silently drops every word it has not seen during fitting, which makes an
# encoded test sentence shorter than its tag sequence and shifts all
# following words against their labels.
#
# NOTE: the OOV token takes index 1, so every word id moves up by one
# compared with a tokenizer built without it. Models saved against the old
# numbering must be retrained.
t = Tokenizer(oov_token='-OOV-')
t.fit_on_texts(data)

In [None]:
# One extra row on top of the tokenizer vocabulary: id 0 is not assigned to
# any word and is the value used for padding.
vocab_size = 1 + len(t.word_index)
print(vocab_size)

In [None]:
# Encode both splits with the fitted tokenizer, then right-pad to MAX_LENGTH.
encoded_docs = t.texts_to_sequences(data)
encoded_docs_test = t.texts_to_sequences(test_data)

padded_docs = pad_sequences(
    sequences=encoded_docs, maxlen=MAX_LENGTH, padding='post'
)
padded_docs_test = pad_sequences(
    sequences=encoded_docs_test, maxlen=MAX_LENGTH, padding='post'
)

# Compare the tokenizer encoding with the word2index encoding of the first
# sentence, and show the container types involved.
print(padded_docs[0], data_int[0], sep='\n')
print(type(padded_docs), type(data_int), sep='\n')
print(type(padded_docs_test.tolist()), type(test_data_int), sep='\n')

In [None]:
# word -> embedding vector; filled from the GloVe file in the next cell.
embeddings_index = {}

In [None]:
# Read the GloVe vectors into `embeddings_index` (word -> float32 vector).
# The file is opened with an explicit UTF-8 encoding so the result does not
# depend on the platform's default encoding, and inside a `with` block so the
# handle is closed even if a line fails to parse.
with open('glove.6B.100d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            continue  # tolerate blank lines (e.g. a trailing newline)
        word = values[0]
        coefs = asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
print(len(embeddings_index))

In [None]:
# Row i holds the 100-d GloVe vector of the word with tokenizer id i.
# Words without a GloVe entry (and the unused row 0) stay all-zero.
embedding_matrix = zeros((vocab_size, 100))
for token, row in t.word_index.items():
    if embeddings_index.get(token) is not None:
        embedding_matrix[row] = embeddings_index[token]

In [None]:
def to_categorical(sequences, categories):
    """One-hot encode every integer of every sequence.

    Args:
        sequences: iterable of integer sequences.
        categories: length of each one-hot vector.

    Returns:
        ``np.ndarray`` built from one list per sequence, each containing one
        ``categories``-long float vector per item, with 1.0 at the item's
        index and 0.0 elsewhere.
    """
    def one_hot(position):
        vector = np.zeros(categories)
        vector[position] = 1.0
        return vector

    return np.array([[one_hot(item) for item in seq] for seq in sequences])

In [None]:
# One-hot targets for training: one vector of len(tag2index) per position.
one_hot_labels = to_categorical(sequences=labels_int, categories=len(tag2index))
# print(one_hot_labels[0])

In [None]:
def ignore_class_accuracy(to_ignore=0):
    """Build an accuracy metric that skips positions of one true class.

    Args:
        to_ignore: class index whose positions are excluded from the metric
            (0 by default).

    Returns:
        A function ``ignore_accuracy(y_true, y_pred)`` that takes the argmax
        over the last axis of both tensors and returns the fraction of
        matching positions among those whose TRUE class is not ``to_ignore``.
    """
    def ignore_accuracy(y_true, y_pred):
        y_true_class = K.argmax(y_true, axis=-1)
        y_pred_class = K.argmax(y_pred, axis=-1)

        # The mask must come from the ground truth. Masking on the prediction
        # would (a) still count positions whose true class is `to_ignore`
        # whenever the model predicts something else there, and (b) drop real
        # positions that the model wrongly predicts as `to_ignore`, hiding
        # those errors from the metric.
        ignore_mask = K.cast(K.not_equal(y_true_class, to_ignore), 'int32')
        matches = K.cast(K.equal(y_true_class, y_pred_class), 'int32') * ignore_mask
        # maximum(..., 1) avoids a division by zero when everything is masked.
        accuracy = K.sum(matches) / K.maximum(K.sum(ignore_mask), 1)
        return accuracy
    return ignore_accuracy

In [None]:
def create_model():
    """Build and compile the tagger network.

    Architecture, in order: input of ``MAX_LENGTH`` ids -> frozen 100-d
    embedding initialised from ``embedding_matrix`` -> bidirectional LSTM
    (256 units per direction, full sequence returned) -> per-timestep dense
    layer with ``len(tag2index)`` units -> softmax.

    Compiled with categorical cross-entropy, Adam (learning rate 0.001) and
    two metrics: plain accuracy and ``ignore_class_accuracy(0)``.

    Returns:
        The compiled ``tf.keras.Sequential`` model.
    """
    # Alternative embedding kept for reference: a trainable 128-d layer over
    # the word2index vocabulary -> layers.Embedding(len(word2index), 128)
    network = tf.keras.Sequential([
        layers.InputLayer(input_shape=(MAX_LENGTH, )),
        layers.Embedding(vocab_size, 100, weights=[embedding_matrix], trainable=False),
        layers.Bidirectional(layers.LSTM(256, return_sequences=True)),
        layers.TimeDistributed(layers.Dense(len(tag2index))),
        layers.Activation('softmax'),
    ])

    network.compile(
        loss='categorical_crossentropy',
        optimizer=optimizers.Adam(0.001),
        metrics=['accuracy', ignore_class_accuracy(0)],
    )

    return network

In [None]:
# Instantiate a freshly initialised, compiled model.
model = create_model()

In [None]:
# Print the layer-by-layer overview of the model.
model.summary()

In [None]:
# Resume from a previously saved model when one exists; otherwise keep the
# model that is already in `model`. The guard lets the notebook run top to
# bottom on a fresh checkout, where 'weights/' is only written by the save
# cell further down. The custom metric is passed explicitly so the compiled
# state can be restored without relying on name lookup.
if os.path.isdir('weights/'):
    model = tf.keras.models.load_model(
        'weights/',
        custom_objects={'ignore_accuracy': ignore_class_accuracy(0)},
    )

In [None]:
# Train on the tokenizer-encoded sentences against the one-hot tag targets.
history = model.fit(
    x=padded_docs,
    y=one_hot_labels,
    batch_size=4096,
    epochs=400,
)

In [None]:
# Persist the trained model to the 'weights/' directory.
model.save(filepath='weights/')

In [None]:
# One-hot targets for the test split, same width as the training targets.
one_hot_labels_test = to_categorical(
    sequences=test_labels_int,
    categories=len(tag2index),
)

In [None]:
# Container type of one training target vs. one test target.
print(type(one_hot_labels[0]), type(one_hot_labels_test[0]), sep='\n')

In [None]:
# Element type of the training inputs vs. the test inputs.
print(type(padded_docs[0][0]), type(padded_docs_test[0][0]), sep='\n')

In [None]:
# Evaluate on the test split and report the third entry of the results
# (see model.metrics_names for what each position holds) as a percentage.
scores = model.evaluate(
    x=padded_docs_test,
    y=one_hot_labels_test,
    batch_size=4096,
)
print("{}: {}".format(model.metrics_names[2], scores[2] * 100))

In [None]:
# Predict on the tokenizer-encoded test sentences. The model is trained and
# evaluated on the Tokenizer ids (`padded_docs` / `padded_docs_test`);
# `test_data_int` uses the unrelated `word2index` numbering, so feeding it to
# the model would look up the wrong (or out-of-range) embedding rows.
predictions = model.predict(padded_docs_test)
print(predictions, predictions.shape)

In [None]:
def logits_to_tokens(sequences, index):
    """Map per-position score vectors to tokens via their argmax.

    Args:
        sequences: iterable of sequences of score vectors.
        index: mapping from class position to token.

    Returns:
        A list with one list per input sequence, containing
        ``index[argmax(vector)]`` for every score vector.
    """
    return [
        [index[np.argmax(scores)] for scores in sequence]
        for sequence in sequences
    ]

In [None]:
# Decode the predictions back to tag strings using the inverted tag2index.
print(logits_to_tokens(predictions, dict((i, t) for t, i in tag2index.items())))

In [None]:
def get_accuracy(test_data, test_labels, preds):
    """Print word-level accuracy of predicted tags against the true labels.

    A prediction counts as correct when ``pred_label in true_label`` holds
    for the label at the same position (containment, not strict equality).

    Each predicted sequence is compared only up to the length of its true
    label sequence, so predictions that carry trailing padding do not raise
    or count as errors.

    Args:
        test_data: the test sentences; only their count is reported.
        test_labels: true label sequences, indexed like ``preds``.
        preds: predicted label sequences.

    Returns:
        None. Counts, elapsed CPU time and the final accuracy are printed.
        With nothing to evaluate, the accuracy is reported as 0.
    """
    correct = 0
    incorrect = 0

    print("Total: %d" % len(test_data))

    t0 = time.process_time()

    for index, pred_labels in enumerate(preds):
        true_labels = test_labels[index]

        # zip stops at the shorter sequence: positions beyond the real
        # sentence (padding) are ignored.
        for pred_label, true_label in zip(pred_labels, true_labels):
            if pred_label in true_label:
                correct = correct + 1
            else:
                incorrect = incorrect + 1

    total = correct + incorrect

    print("Evaluated Words: %d " % (total))
    print("Correct: %d " % (correct))
    print("Incorrect: %d " % (incorrect))
    print("Time Taken: %.2f \n " % (time.process_time()-t0))

    # Guard against ZeroDivisionError when no word was evaluated.
    accuracy = correct / total if total else 0.0
    print("Final Accuracy = %.06f"  % (accuracy))

In [None]:
# Score the decoded predictions against the true test tags.
# The predictions cover MAX_LENGTH positions per sentence, so each decoded
# sequence is cut back to the length of its sentence before scoring.
predicted_tags = [
    sentence_tags[:len(sentence)]
    for sentence_tags, sentence in zip(
        logits_to_tokens(predictions, {i: t for t, i in tag2index.items()}),
        test_data,
    )
]
get_accuracy(test_data, test_labels, predicted_tags)

In [None]:
# Load JSON Files
# Each file is read fully and decoded into its own dictionary.

with open('words.json') as f:
    word_dict = json.loads(f.read())
with open('tags.json') as f:
    tag_dict = json.loads(f.read())
with open('word_tags.json') as f:
    word_tags_dict = json.loads(f.read())

In [None]:
# Number of entries in each loaded dictionary.
print(len(tag_dict), len(word_dict), len(word_tags_dict), sep='\n')