In [7]:
from sklearn import preprocessing
import numpy as np
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Input, Embedding, Flatten, Dense, LSTM
from nervaluate import Evaluator

In [2]:
class txtReader:
    def __init__(self, filename):
        self.filename = filename

    def read_split(self):
        with open(self.filename, 'r') as f:
            file_read = f.read()
        efg = []
        lines = file_read.split('\n')
        efg.append(lines)
        text = []
        text_id = []
        for i in lines:
            if i != '':
                word = i.split('\t')
                text.append(word[0])
                text_id.append(word[1])

        return text, text_id

In [3]:
class alphabet:
    def __init__(self, train_file, dev_file, test_file):
        self.train_file = train_file
        self.dev_file = dev_file
        self.test_file = test_file
        self.data = dict()
        self.labels = dict()

    def read_split(self):
        text_files = []
        for text_file in [self.train_file, self.dev_file, self.test_file]:
            txt = txtReader(text_file)
            text, text_id = txt.read_split()
            text_files.append(text)
            text_files.append(text_id)

        return text_files[0], text_files[1], text_files[2], text_files[3], text_files[4], text_files[5]
    
    def _tagger(self, dataset, cnt, dictionary):
        for i in dataset:
            # obtener indice de i en dataset
            pos = dataset.index(i)
            if i not in dictionary:
                dictionary[i] = cnt
                dataset[pos] = cnt
                cnt += 1
            else:
                dataset[pos] = dictionary[i]

        return dataset, cnt, dictionary 

    def labelEncoder(self):
        train, train_id, dev, dev_id, test, test_id = self.read_split()
        cnt = 1
        cnt_id = 1

        train, cnt, self.data = self._tagger(train, cnt, self.data) 
        train_id, cnt_id, self.labels = self._tagger(train_id, cnt_id, self.labels)
        dev, cnt, self.data = self._tagger(dev, cnt, self.data)
        dev_id, cnt_id, self.labels = self._tagger(dev_id, cnt_id, self.labels)
        
        for te in test:
            pos = test.index(te)
            if te not in self.data:
                self.data[te] = -1 # -1 indica que la palabra es desconocida
                test[pos] = -1
            else:
                test[pos] = self.data[te]

        for te_id in test_id:
            pos_id = test_id.index(te_id)
            if te_id not in self.labels:  
                self.labels[te_id] = -1
                test_id[pos_id] = self.labels[te_id]
            else:
                test_id[pos_id] = self.labels[te_id]  

        return train, train_id, dev, dev_id, test, test_id

In [23]:
train_PartTUT, train_id_PartTUT, dev_PartTUT, dev_id_PartTUT, test_PartTUT, test_id_PartTUT = alphabet('materiales_practica/datasets/PartTUT/train.txt', 'materiales_practica/datasets/PartTUT/dev.txt', 'materiales_practica/datasets/PartTUT/test.txt').labelEncoder()

In [13]:
train_MITMovie, train_id_MITMovie, dev_MITMovie, dev_id_MITMovie, test_MITMovie, test_id_MITMovie = alphabet('materiales_practica/datasets/MITMovie/train.txt', 'materiales_practica/datasets/MITMovie/dev.txt', 'materiales_practica/datasets/MITMovie/test.txt').labelEncoder()

In [None]:
train_MITRestaurant, train_id_MITRestaurant, dev_MITRestaurant, dev_id_MITRestaurant, test_MITRestaurant, test_id_MITRestaurant = alphabet('materiales_practica/datasets/MITRestaurant/train.txt', 'materiales_practica/datasets/MITRestaurant/dev.txt', 'materiales_practica/datasets/MITRestaurant/test.txt').labelEncoder()

In [74]:
class FFTagger():
    def __init__(self):
        self.model = Sequential()

    def build_model(self, n): 
        self.model.add(Input(shape=(n*2+1,)))
        self.model.add(Embedding(input_dim=1, output_dim=20, input_length=n*2+1))
        self.model.add(Flatten())
        self.model.add(Dense(64, activation='relu'))
        self.model.add(Dense(1, activation='softmax'))

    def train(self, train, train_id, dev, dev_id, n, loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'], weighted_metrics=[]):
        # train = np.concatenate((np.zeros(n), train, np.zeros(n)))
        # dev = np.concatenate((np.zeros(n), dev, np.zeros(n)))
        padding = []
        for i in range(n):
            padding.append(0)
        train = padding + train + padding
        dev = padding + dev + padding

        # almacenar ventanas de tama√±o n*2+1 en una lista de listas para cada conjunto
        train_windows = []
        dev_windows = []

        for i in range(n, len(train) - n):
            data = train[i-n:i+n+1]
            train_windows.append(data)
        
        for i in range(n, len(dev) - n):
            data = dev[i-n:i+n+1]
            dev_windows.append(data) 

        print(type(train_windows))
        print(type(train_windows[0]))

        print("train_windows: ", len(train_windows))
        print("id", len(train_id))
        print("dev_windows: ", len(dev_windows))
        print("id", len(dev_id))

        self.model.compile(loss=loss, optimizer=optimizer, metrics=metrics, weighted_metrics=weighted_metrics)
        self.model.fit(train_windows, train_id, epochs=10, validation_data=(dev_windows, dev_id), verbose=0)

    def evaluate(self, test, test_id, n, task):
        test = np.concatenate((np.zeros(n), test, np.zeros(n)))
        if task == "PoS":
            return self.model.evaluate(test, test_id)
        elif task == "NER":
            evaluator = Evaluator(test_id, test, tags=['ent_type', 'partial', 'exact', 'strict']).evaluate()
            
            return self.model.evaluate(test, test_id), evaluator['ent_type']['f1'], evaluator['partial']['f1'], evaluator['exact']['f1'], evaluator['strict']['f1']
        else:
            return "Task not found"

In [75]:
modelPartTUT = FFTagger()
modelPartTUT.build_model(2)
modelPartTUT.train(train_PartTUT, train_id_PartTUT, dev_PartTUT, dev_id_PartTUT, 2)


<class 'list'>
<class 'list'>
train_windows:  43511
id 43503
dev_windows:  2730
id 2722


ValueError: Data cardinality is ambiguous:
  x sizes: 43511
  y sizes: 43503
Make sure all arrays contain the same number of samples.