In [1]:
from sklearn import preprocessing
import numpy as np
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Input, Embedding, Flatten, Dense, LSTM, TimeDistributed, Bidirectional
from nervaluate import Evaluator
from keras.utils import to_categorical
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [38]:
class txtReader:
    """Reader for tagged corpora stored as one ``token<TAB>tag`` pair per line.

    Sentences are separated by one or more empty lines.
    """

    def __init__(self, filename):
        """Remember the path of the file to read; nothing is opened here."""
        self.filename = filename

    def read_split(self):
        """Read the file and split it into sentences.

        Returns:
            tuple: ``(text, text_id)`` where ``text`` is a list of sentences
            (each a list of the first tab-separated field of every line) and
            ``text_id`` is the parallel list holding the second field.

        Raises:
            IndexError: if a non-empty line does not contain a tab separator.
        """
        text = []
        text_id = []
        tokens = []
        tags = []
        # Explicit encoding so the result does not depend on the platform locale.
        with open(self.filename, 'r', encoding='utf-8') as f:
            for raw_line in f:
                line = raw_line.rstrip('\n')
                if line == '':
                    # An empty line closes the current sentence (if any).
                    if tokens:
                        text.append(tokens)
                        text_id.append(tags)
                        tokens = []
                        tags = []
                    continue
                fields = line.split('\t')
                tokens.append(fields[0])
                tags.append(fields[1])

        # Flush the last sentence when the file does not end with an empty line;
        # otherwise it would be silently lost.
        if tokens:
            text.append(tokens)
            text_id.append(tags)

        return text, text_id

In [49]:
class alphabet:
    """Builds the word and label vocabularies for a train/dev/test corpus.

    Words and labels are replaced in place by integer ids. ``self.data`` maps
    word -> id (ids start at 1) and ``self.labels`` maps label -> id (ids
    start at 0).
    """

    def __init__(self, train_file, dev_file, test_file):
        """Store the three file paths and create empty vocabularies."""
        self.train_file = train_file
        self.dev_file = dev_file
        self.test_file = test_file
        self.data = dict()
        self.labels = dict()

    def read_split(self):
        """Read the three files with ``txtReader``.

        Returns:
            tuple: ``(train, train_id, dev, dev_id, test, test_id)``.
        """
        splits = []
        for text_file in (self.train_file, self.dev_file, self.test_file):
            text, text_id = txtReader(text_file).read_split()
            splits.append(text)
            splits.append(text_id)

        return tuple(splits)

    def _tagger(self, dataset, cnt, dictionary):
        """Replace every item of ``dataset`` by its id, growing ``dictionary``.

        Args:
            dataset: list of sentences (lists); modified in place.
            cnt: next free id to assign to an unseen item.
            dictionary: item -> id mapping; modified in place.

        Returns:
            tuple: ``(dataset, cnt, dictionary)`` with ``cnt`` advanced past
            the ids that were handed out.
        """
        for phrase in dataset:
            # enumerate gives the position directly; looking it up with
            # list.index() for every token made this loop quadratic.
            for pos, item in enumerate(phrase):
                if item not in dictionary:
                    dictionary[item] = cnt
                    cnt += 1
                phrase[pos] = dictionary[item]

        return dataset, cnt, dictionary

    def _encode_with_fallback(self, dataset, dictionary, unknown_id):
        """Encode ``dataset`` in place, mapping unseen items to ``unknown_id``.

        Unseen items are also recorded in ``dictionary`` with that id.
        """
        for phrase in dataset:
            for pos, item in enumerate(phrase):
                if item not in dictionary:
                    dictionary[item] = unknown_id
                phrase[pos] = dictionary[item]

    def labelEncoder(self):
        """Read the corpus and encode words and labels as integers.

        Train and dev grow the vocabularies. Items that appear only in test
        are all mapped to a single id equal to the number of tokens in train.

        Returns:
            tuple: ``(train, train_id, dev, dev_id, test, test_id, data, labels)``.
        """
        train, train_id, dev, dev_id, test, test_id = self.read_split()
        cnt = 1  # word ids start at 1 (0 is used as padding by the taggers)
        cnt_id = 0

        train, cnt, self.data = self._tagger(train, cnt, self.data)
        train_id, cnt_id, self.labels = self._tagger(train_id, cnt_id, self.labels)
        dev, cnt, self.data = self._tagger(dev, cnt, self.data)
        dev_id, cnt_id, self.labels = self._tagger(dev_id, cnt_id, self.labels)

        # Total number of tokens in train; used as the id of unknown items.
        len_train = sum(len(phrase) for phrase in train)

        self._encode_with_fallback(test, self.data, len_train)
        # NOTE(review): a label seen only in test also receives len_train as
        # its id, which is larger than the number of classes — confirm this
        # never happens with the datasets in use.
        self._encode_with_fallback(test_id, self.labels, len_train)

        return train, train_id, dev, dev_id, test, test_id, self.data, self.labels

In [40]:
# Load and integer-encode the PartTUT corpus.
(
    train_PartTUT,
    train_id_PartTUT,
    dev_PartTUT,
    dev_id_PartTUT,
    test_PartTUT,
    test_id_PartTUT,
    PartTUT_data,
    PartTUT_labels,
) = alphabet(
    'materiales_practica/datasets/PartTUT/train.txt',
    'materiales_practica/datasets/PartTUT/dev.txt',
    'materiales_practica/datasets/PartTUT/test.txt',
).labelEncoder()

In [172]:
# Load and integer-encode the MITMovie corpus.
(
    train_MITMovie,
    train_id_MITMovie,
    dev_MITMovie,
    dev_id_MITMovie,
    test_MITMovie,
    test_id_MITMovie,
    MITMovie_data,
    MITMovie_labels,
) = alphabet(
    'materiales_practica/datasets/MITMovie/train.txt',
    'materiales_practica/datasets/MITMovie/dev.txt',
    'materiales_practica/datasets/MITMovie/test.txt',
).labelEncoder()

In [50]:
# Load and integer-encode the MITRestaurant corpus.
(
    train_MITRestaurant,
    train_id_MITRestaurant,
    dev_MITRestaurant,
    dev_id_MITRestaurant,
    test_MITRestaurant,
    test_id_MITRestaurant,
    MITRestaurant_data,
    MITRestaurant_labels,
) = alphabet(
    'materiales_practica/datasets/MITRestaurant/train.txt',
    'materiales_practica/datasets/MITRestaurant/dev.txt',
    'materiales_practica/datasets/MITRestaurant/test.txt',
).labelEncoder()

In [94]:
class FFTagger():
    """Feed-forward tagger that labels each token from a window of 2n+1 word ids.

    Every sentence is padded with ``n`` zeros on both sides and one window is
    produced per token; the label of the central token is the target.
    """

    def __init__(self, train, train_id, dev, dev_id, test, test_id, labels_dict, n, loss, optimizer, metrics): # weighted_metrics
        """Store the encoded splits and the training configuration.

        Args:
            train, dev, test: lists of sentences, each a list of integer word ids.
            train_id, dev_id, test_id: parallel lists of integer label ids.
            labels_dict: mapping label string -> integer label id.
            n: number of context tokens taken on each side of the target token.
            loss, optimizer, metrics: forwarded to ``model.compile``.
        """
        self.model = Sequential()
        self.train = train
        self.train_id = train_id
        self.dev = dev
        self.dev_id = dev_id
        self.test = test
        self.test_id = test_id
        self.labels_dict = labels_dict
        self.n = n
        # Number of train tokens, +1 for the id given to unknown test words.
        self.vocab_size = sum(len(phrase) for phrase in train) + 1
        self.classes = labels_dict.keys()
        self.num_classes = len(labels_dict)
        self.loss = loss
        self.optimizer = optimizer
        self.metrics = metrics
        self.train_windows = []
        self.dev_windows = []
        self.test_windows = []
        self.batch_size = 64

    def build_model(self):
        """Add the layers: embedding -> flatten -> dense(64, relu) -> softmax."""
        self.model.add(Input(shape=(self.n*2+1,), dtype=tf.int32))
        self.model.add(Embedding(input_dim = self.vocab_size, output_dim=20, mask_zero=True, input_length=self.n*2+1))
        self.model.add(Flatten())
        self.model.add(Dense(64, activation='relu'))
        self.model.add(Dense(self.num_classes, activation='softmax'))

    def _windows_and_targets(self, phrases, phrase_ids):
        """Build one window and one one-hot target per token.

        Returns:
            tuple: ``(windows, targets)``; ``windows`` is a list of lists of
            ``2n+1`` word ids and ``targets`` the parallel list of one-hot rows.
        """
        padding = [0] * self.n
        width = 2 * self.n + 1
        windows = []
        targets = []
        for phrase, ids in zip(phrases, phrase_ids):
            padded_phrase = padding + list(phrase) + padding
            # Window starting at `start` is centred on token `start` of the phrase.
            for start in range(len(phrase)):
                windows.append(padded_phrase[start:start + width])
            targets.extend(to_categorical(ids, num_classes=self.num_classes))
        return windows, targets

    def train_model(self):
        """Compile the model and fit it for one epoch on train, validating on dev.

        The windows are rebuilt from scratch on every call and ``self.train`` /
        ``self.dev`` are left untouched, so the method can be re-run safely.
        """
        self.train_windows, train_targets = self._windows_and_targets(self.train, self.train_id)
        self.dev_windows, dev_targets = self._windows_and_targets(self.dev, self.dev_id)

        train_tensor = tf.data.Dataset.from_tensor_slices((self.train_windows, train_targets))
        train_tensor = train_tensor.batch(self.batch_size)
        dev_tensor = tf.data.Dataset.from_tensor_slices((self.dev_windows, dev_targets))
        dev_tensor = dev_tensor.batch(self.batch_size)

        self.model.compile(loss=self.loss, optimizer=self.optimizer, metrics=self.metrics)
        self.model.fit(train_tensor, epochs=1, validation_data=dev_tensor, verbose=1)

    def _entity_types(self, id_to_label):
        """Return the entity types (tag without its 2-char prefix) seen in train/dev."""
        seen_ids = set()
        for phrase in self.train_id:
            seen_ids.update(phrase)
        for phrase in self.dev_id:
            seen_ids.update(phrase)

        entity_types = []
        for label_id in sorted(seen_ids):
            tag = id_to_label.get(label_id)
            if tag is None or tag == "O" or tag == '':
                continue
            entity_type = tag[2:]  # strip the "B-" / "I-" prefix
            if entity_type not in entity_types:
                entity_types.append(entity_type)
        return entity_types

    def evaluate_model(self, task):
        """Evaluate the model on the test split.

        Args:
            task: ``"PoS"`` or ``"NER"``.

        Returns:
            ``(loss, accuracy)`` for ``"PoS"``;
            ``(loss, accuracy, results, results_by_tag)`` for ``"NER"``, where the
            last two come from ``nervaluate.Evaluator.evaluate``;
            the string ``"Task not found"`` for any other task.

        Raises:
            KeyError: if a gold or predicted label id is missing from ``labels_dict``.
        """
        if task != "PoS" and task != "NER":
            return "Task not found"

        # Rebuilt on every call so repeated evaluations stay aligned with the labels.
        self.test_windows, test_targets = self._windows_and_targets(self.test, self.test_id)
        test_tensor = tf.data.Dataset.from_tensor_slices((self.test_windows, test_targets))
        test_tensor = test_tensor.batch(self.batch_size)

        loss, accuracy = self.model.evaluate(test_tensor, verbose=1)
        if task == "PoS":
            return loss, accuracy

        pred = self.model.predict(test_tensor).astype(np.float32)

        id_to_label = {label_id: label for label, label_id in self.labels_dict.items()}
        test_labels = [[id_to_label[label_id] for label_id in phrase] for phrase in self.test_id]

        # The argmax of each prediction row is already the label id. Every
        # prediction is kept, including classes that never occur in the gold
        # test labels, so predictions stay aligned with the tokens.
        flat_pred_labels = [id_to_label[int(np.argmax(row))] for row in pred]

        # Regroup the flat token predictions into sentences.
        pred_labels = []
        start = 0
        for phrase in self.test_id:
            end = start + len(phrase)
            pred_labels.append(flat_pred_labels[start:end])
            start = end

        evaluator_tags = self._entity_types(id_to_label)

        evaluator = Evaluator(test_labels, pred=pred_labels, tags=evaluator_tags, loader="list")
        results, results_by_tag = evaluator.evaluate()

        return loss, accuracy, results, results_by_tag

In [95]:
# Feed-forward tagger for PartTUT with a context of 2 tokens on each side.
modelPartTUT = FFTagger(
    train=train_PartTUT,
    train_id=train_id_PartTUT,
    dev=dev_PartTUT,
    dev_id=dev_id_PartTUT,
    test=test_PartTUT,
    test_id=test_id_PartTUT,
    labels_dict=PartTUT_labels,
    n=2,
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
modelPartTUT.build_model()
modelPartTUT.train_model()

43494 vocab_size


In [67]:
# Test-set (loss, accuracy) of the feed-forward PartTUT tagger.
modelPartTUT.evaluate_model(task="PoS")



(0.5380165576934814, 0.8398101329803467)

In [174]:
# Feed-forward tagger for MITMovie with a context of 2 tokens on each side.
modelMITMovie = FFTagger(
    train=train_MITMovie,
    train_id=train_id_MITMovie,
    dev=dev_MITMovie,
    dev_id=dev_id_MITMovie,
    test=test_MITMovie,
    test_id=test_id_MITMovie,
    labels_dict=MITMovie_labels,
    n=2,
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
modelMITMovie.build_model()
modelMITMovie.train_model()



In [183]:
# modelMITMovie.evaluate_model("NER")

In [95]:
# Feed-forward tagger for MITRestaurant with a context of 2 tokens on each side.
modelMITRestaurant = FFTagger(
    train=train_MITRestaurant,
    train_id=train_id_MITRestaurant,
    dev=dev_MITRestaurant,
    dev_id=dev_id_MITRestaurant,
    test=test_MITRestaurant,
    test_id=test_id_MITRestaurant,
    labels_dict=MITRestaurant_labels,
    n=2,
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
modelMITRestaurant.build_model()
modelMITRestaurant.train_model()



In [96]:
# Test-set loss/accuracy plus nervaluate entity-level results for MITRestaurant.
modelMITRestaurant.evaluate_model(task="NER")

1521 test_labels 1521 pred_labels


(0.5831315517425537,
 0.8437149524688721,
 {'ent_type': {'correct': 2411,
   'incorrect': 569,
   'partial': 0,
   'missed': 171,
   'spurious': 605,
   'possible': 3151,
   'actual': 3585,
   'precision': 0.6725244072524407,
   'recall': 0.7651539193906697,
   'f1': 0.7158551068883611},
  'partial': {'correct': 2238,
   'incorrect': 0,
   'partial': 742,
   'missed': 171,
   'spurious': 605,
   'possible': 3151,
   'actual': 3585,
   'precision': 0.7277545327754533,
   'recall': 0.827991113932085,
   'f1': 0.774643705463183},
  'strict': {'correct': 1950,
   'incorrect': 1030,
   'partial': 0,
   'missed': 171,
   'spurious': 605,
   'possible': 3151,
   'actual': 3585,
   'precision': 0.5439330543933054,
   'recall': 0.6188511583624247,
   'f1': 0.578978622327791},
  'exact': {'correct': 2238,
   'incorrect': 742,
   'partial': 0,
   'missed': 171,
   'spurious': 605,
   'possible': 3151,
   'actual': 3585,
   'precision': 0.6242677824267783,
   'recall': 0.7102507140590288,
   'f1':

In [106]:
class LSTMTagger():
    """Sequence tagger: embedding -> (bi)LSTM -> per-timestep softmax.

    Call order: ``preprocessing()`` (fixes ``maxlen`` and pads the splits),
    then ``build_model()``, ``train_model()`` and ``evaluate_model()``.
    """

    def __init__(self, train, train_id, dev, dev_id, test, test_id, data_dict, labels_dict, loss, optimizer, metrics):
        """Store the encoded splits and the training configuration.

        Args:
            train, dev, test: lists of sentences, each a list of integer word ids.
            train_id, dev_id, test_id: parallel lists of integer label ids.
            data_dict: mapping word -> integer id (stored, not used by this class).
            labels_dict: mapping label string -> integer label id.
            loss, optimizer, metrics: forwarded to ``model.compile``.
        """
        self.model = Sequential()
        self.train = train
        self.train_id = train_id
        self.dev = dev
        self.dev_id = dev_id
        self.test = test
        self.test_id = test_id
        self.data_dict = data_dict
        self.labels_dict = labels_dict
        # Number of train tokens, +1 for the id given to unknown test words.
        self.vocab_size = sum(len(phrase) for phrase in train) + 1
        self.classes = labels_dict.keys()
        self.num_classes = len(labels_dict)
        self.loss=loss
        self.optimizer=optimizer
        self.metrics=metrics
        self.train_windows = []
        self.train_windows_id = []
        self.dev_windows = []
        self.dev_windows_id = []
        self.test_windows = []
        self.test_windows_id = []
        self.maxlen = 0
        self.batch_size = 64

    def build_model(self, bidirectional=False):
        """Add the layers; uses ``self.maxlen``, so run ``preprocessing()`` first.

        Args:
            bidirectional: wrap the LSTM in ``Bidirectional`` when True.
        """
        self.model.add(Embedding(input_dim=self.vocab_size, output_dim=20, mask_zero=True, input_length=self.maxlen))
        if bidirectional:
            self.model.add(Bidirectional(LSTM(64, return_sequences=True)))
        else:
            self.model.add(LSTM(64, return_sequences=True))
        self.model.add(TimeDistributed(Dense(units=self.num_classes, activation='softmax')))

    def _padder(self, data, data_id):
        """Pad sentences and labels to ``self.maxlen`` and one-hot encode the labels.

        ``self.maxlen`` is raised first if ``data`` contains a longer sentence.

        Returns:
            tuple: ``(padded_data, one_hot_data_id)``.
        """
        for phrase in data:
            if len(phrase) > self.maxlen:
                self.maxlen = len(phrase)

        padded_data = pad_sequences(data, maxlen=self.maxlen, padding='post')
        padded_data_id = pad_sequences(data_id, maxlen=self.maxlen, padding='post')
        one_hot_data_id = to_categorical(padded_data_id, num_classes=self.num_classes)

        return padded_data, one_hot_data_id

    def preprocessing(self):
        """Pad the three splits to one common length and one-hot encode the labels."""
        # Fix the length over ALL splits before padding anything. Growing it
        # split by split would pad train shorter than dev/test whenever they
        # contain a longer sentence, and the model is built with the final value.
        longest = max(
            (len(phrase) for split in (self.train, self.dev, self.test) for phrase in split),
            default=0,
        )
        self.maxlen = max(self.maxlen, longest)

        self.train_windows, self.train_windows_id = self._padder(self.train, self.train_id)
        self.dev_windows, self.dev_windows_id = self._padder(self.dev, self.dev_id)
        self.test_windows, self.test_windows_id = self._padder(self.test, self.test_id)

    def train_model(self):
        """Compile the model and fit it for one epoch on train, validating on dev."""
        train_tensor = tf.data.Dataset.from_tensor_slices((self.train_windows, self.train_windows_id))
        train_tensor = train_tensor.batch(self.batch_size)
        dev_tensor = tf.data.Dataset.from_tensor_slices((self.dev_windows, self.dev_windows_id))
        dev_tensor = dev_tensor.batch(self.batch_size)

        self.model.compile(loss=self.loss, optimizer=self.optimizer, metrics=self.metrics)
        self.model.fit(train_tensor, epochs=1, validation_data=dev_tensor)

    def _entity_types(self, id_to_label):
        """Return the entity types (tag without its 2-char prefix) seen in train/dev."""
        seen_ids = set()
        for phrase in self.train_id:
            seen_ids.update(phrase)
        for phrase in self.dev_id:
            seen_ids.update(phrase)

        entity_types = []
        for label_id in sorted(seen_ids):
            tag = id_to_label.get(label_id)
            if tag is None or tag == "O" or tag == '':
                continue
            entity_type = tag[2:]  # strip the "B-" / "I-" prefix
            if entity_type not in entity_types:
                entity_types.append(entity_type)
        return entity_types

    def evaluate_model(self, task):
        """Evaluate the model on the (already preprocessed) test split.

        Args:
            task: ``"PoS"`` or ``"NER"``.

        Returns:
            ``(loss, accuracy)`` for ``"PoS"``;
            ``(loss, accuracy, results, results_by_tag)`` for ``"NER"``, where the
            last two come from ``nervaluate.Evaluator.evaluate``;
            the string ``"Task not found"`` for any other task.

        Raises:
            KeyError: if a gold or predicted label id is missing from ``labels_dict``.
        """
        if task != "PoS" and task != "NER":
            return "Task not found"

        test_tensor = tf.data.Dataset.from_tensor_slices((self.test_windows, self.test_windows_id))
        test_tensor = test_tensor.batch(self.batch_size)

        loss, accuracy = self.model.evaluate(test_tensor, verbose=1)
        if task == "PoS":
            return loss, accuracy

        pred = self.model.predict(test_tensor).astype(np.float32)

        id_to_label = {label_id: label for label, label_id in self.labels_dict.items()}
        test_labels = [[id_to_label[label_id] for label_id in phrase] for phrase in self.test_id]

        # For each sentence keep only the real (unpadded) timesteps; the argmax
        # of each timestep is directly the predicted label id.
        pred_labels = []
        for phrase_pred, phrase in zip(pred, self.test_id):
            pred_labels.append(
                [id_to_label[int(np.argmax(step))] for step in phrase_pred[:len(phrase)]]
            )

        evaluator_tags = self._entity_types(id_to_label)

        evaluator = Evaluator(test_labels, pred=pred_labels, tags=evaluator_tags, loader="list")
        results, results_by_tag = evaluator.evaluate()

        return loss, accuracy, results, results_by_tag

In [148]:
# Bidirectional LSTM tagger for PartTUT.
BDLSTMPartut = LSTMTagger(
    train=train_PartTUT,
    train_id=train_id_PartTUT,
    dev=dev_PartTUT,
    dev_id=dev_id_PartTUT,
    test=test_PartTUT,
    test_id=test_id_PartTUT,
    data_dict=PartTUT_data,
    labels_dict=PartTUT_labels,
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
BDLSTMPartut.preprocessing()
BDLSTMPartut.build_model(bidirectional=True)
BDLSTMPartut.train_model()



In [109]:
# BDLSTMPartut.evaluate_model("PoS")

In [None]:
# Unidirectional LSTM tagger for PartTUT.
LSTMPartut = LSTMTagger(
    train=train_PartTUT,
    train_id=train_id_PartTUT,
    dev=dev_PartTUT,
    dev_id=dev_id_PartTUT,
    test=test_PartTUT,
    test_id=test_id_PartTUT,
    data_dict=PartTUT_data,
    labels_dict=PartTUT_labels,
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
LSTMPartut.preprocessing()
LSTMPartut.build_model(bidirectional=False)
LSTMPartut.train_model()

In [None]:
# LSTMPartut.evaluate_model("PoS")

In [None]:
# Bidirectional LSTM tagger for MITMovie.
BDLSTMMITMovie = LSTMTagger(
    train=train_MITMovie,
    train_id=train_id_MITMovie,
    dev=dev_MITMovie,
    dev_id=dev_id_MITMovie,
    test=test_MITMovie,
    test_id=test_id_MITMovie,
    data_dict=MITMovie_data,
    labels_dict=MITMovie_labels,
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
BDLSTMMITMovie.preprocessing()
BDLSTMMITMovie.build_model(bidirectional=True)
BDLSTMMITMovie.train_model()

In [None]:
# BDLSTMMITMovie.evaluate_model("NER")

In [None]:
# Unidirectional LSTM tagger for MITMovie.
LSTMMITMovie = LSTMTagger(
    train=train_MITMovie,
    train_id=train_id_MITMovie,
    dev=dev_MITMovie,
    dev_id=dev_id_MITMovie,
    test=test_MITMovie,
    test_id=test_id_MITMovie,
    data_dict=MITMovie_data,
    labels_dict=MITMovie_labels,
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
LSTMMITMovie.preprocessing()
LSTMMITMovie.build_model(bidirectional=False)
LSTMMITMovie.train_model()

In [None]:
# LSTMMITMovie.evaluate_model("NER")

In [107]:
# Bidirectional LSTM tagger for MITRestaurant.
BDLSTMMITRestaurant = LSTMTagger(
    train=train_MITRestaurant,
    train_id=train_id_MITRestaurant,
    dev=dev_MITRestaurant,
    dev_id=dev_id_MITRestaurant,
    test=test_MITRestaurant,
    test_id=test_id_MITRestaurant,
    data_dict=MITRestaurant_data,
    labels_dict=MITRestaurant_labels,
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
BDLSTMMITRestaurant.preprocessing()
BDLSTMMITRestaurant.build_model(bidirectional=True)
BDLSTMMITRestaurant.train_model()



In [110]:
# BDLSTMMITRestaurant.evaluate_model("NER")

In [152]:
# Unidirectional LSTM tagger for MITRestaurant.
LSTMMITRestaurant = LSTMTagger(
    train=train_MITRestaurant,
    train_id=train_id_MITRestaurant,
    dev=dev_MITRestaurant,
    dev_id=dev_id_MITRestaurant,
    test=test_MITRestaurant,
    test_id=test_id_MITRestaurant,
    data_dict=MITRestaurant_data,
    labels_dict=MITRestaurant_labels,
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
LSTMMITRestaurant.preprocessing()
LSTMMITRestaurant.build_model(bidirectional=False)
LSTMMITRestaurant.train_model()



In [111]:
# LSTMMITRestaurant.evaluate_model("NER")