In [30]:
from sklearn import preprocessing
import numpy as np
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Input, Embedding, Flatten, Dense, LSTM
from nervaluate import Evaluator
from keras.utils import to_categorical

In [2]:
class txtReader:
    def __init__(self, filename):
        self.filename = filename

    def read_split(self):
        with open(self.filename, 'r') as f:
            file_read = f.read()
        efg = []
        lines = file_read.split('\n')
        efg.append(lines)
        text = []
        text_id = []
        for i in lines:
            if i != '':
                word = i.split('\t')
                text.append(word[0])
                text_id.append(word[1])

        return text, text_id

In [135]:
class alphabet:
    def __init__(self, train_file, dev_file, test_file):
        self.train_file = train_file
        self.dev_file = dev_file
        self.test_file = test_file
        self.data = dict()
        self.labels = dict()

    def read_split(self):
        text_files = []
        for text_file in [self.train_file, self.dev_file, self.test_file]:
            txt = txtReader(text_file)
            text, text_id = txt.read_split()
            text_files.append(text)
            text_files.append(text_id)

        return text_files[0], text_files[1], text_files[2], text_files[3], text_files[4], text_files[5]
    
    def _tagger(self, dataset, cnt, dictionary):
        for i in dataset:
            # obtener indice de i en dataset
            pos = dataset.index(i)
            if i not in dictionary:
                dictionary[i] = cnt
                dataset[pos] = cnt
                cnt += 1
            else:
                dataset[pos] = dictionary[i]

        return dataset, cnt, dictionary 

    def labelEncoder(self):
        train, train_id, dev, dev_id, test, test_id = self.read_split()
        cnt = 1
        cnt_id = 0

        train, cnt, self.data = self._tagger(train, cnt, self.data) 
        train_id, cnt_id, self.labels = self._tagger(train_id, cnt_id, self.labels)
        dev, cnt, self.data = self._tagger(dev, cnt, self.data)
        dev_id, cnt_id, self.labels = self._tagger(dev_id, cnt_id, self.labels)
        
        for te in test:
            pos = test.index(te)
            if te not in self.data:
                self.data[te] = -1 # -1 indica que la palabra es desconocida
                test[pos] = -1
            else:
                test[pos] = self.data[te]

        for te_id in test_id:
            pos_id = test_id.index(te_id)
            if te_id not in self.labels:  
                self.labels[te_id] = -1
                test_id[pos_id] = self.labels[te_id]
            else:
                test_id[pos_id] = self.labels[te_id]  

        return train, train_id, dev, dev_id, test, test_id

In [136]:
train_PartTUT, train_id_PartTUT, dev_PartTUT, dev_id_PartTUT, test_PartTUT, test_id_PartTUT = alphabet('materiales_practica/datasets/PartTUT/train.txt', 'materiales_practica/datasets/PartTUT/dev.txt', 'materiales_practica/datasets/PartTUT/test.txt').labelEncoder()

In [13]:
train_MITMovie, train_id_MITMovie, dev_MITMovie, dev_id_MITMovie, test_MITMovie, test_id_MITMovie = alphabet('materiales_practica/datasets/MITMovie/train.txt', 'materiales_practica/datasets/MITMovie/dev.txt', 'materiales_practica/datasets/MITMovie/test.txt').labelEncoder()

In [None]:
train_MITRestaurant, train_id_MITRestaurant, dev_MITRestaurant, dev_id_MITRestaurant, test_MITRestaurant, test_id_MITRestaurant = alphabet('materiales_practica/datasets/MITRestaurant/train.txt', 'materiales_practica/datasets/MITRestaurant/dev.txt', 'materiales_practica/datasets/MITRestaurant/test.txt').labelEncoder()

In [141]:
class FFTagger():
    def __init__(self, train, train_id, dev, dev_id, test, test_id, n, loss, optimizer, metrics):
        self.model = Sequential()
        self.train = train
        self.train_id = train_id
        self.dev = dev
        self.dev_id = dev_id
        self.test = test
        self.test_id = test_id
        self.n = n
        self.m = len(self.train)
        self.num_classes = 0
        self.loss=loss
        self.optimizer=optimizer
        self.metrics=metrics
        # self.weighted_metrics=weighted_metrics
        self.train_windows = []
        self.dev_windows = []
        self.test_windows = []

    def build_model(self): 
        padding = []
        for i in range(self.n):
            padding.append(0)
        self.train = padding + self.train + padding
        self.dev = padding + self.dev + padding

        # almacenar ventanas de tamaño n*2+1 en una lista de listas para cada conjunto

        for i in range(self.n, len(self.train) - self.n):
            data = self.train[i-self.n:i+self.n+1]
            self.train_windows.append(data)
        
        for i in range(self.n, len(self.dev) - self.n):
            data = self.dev[i-self.n:i+self.n+1]
            self.dev_windows.append(data) 

        classes = set(set(self.train_id) | set(self.dev_id))
        self.num_classes = len(classes)

        one_hot_train_id = to_categorical(self.train_id, num_classes=self.num_classes)
        one_hot_dev_id = to_categorical(self.dev_id, num_classes=self.num_classes)
        
        batch_size = 10
        train_tensor = tf.data.Dataset.from_tensor_slices((self.train_windows, one_hot_train_id))
        train_tensor = train_tensor.batch(batch_size)
        dev_tensor = tf.data.Dataset.from_tensor_slices((self.dev_windows, one_hot_dev_id))
        dev_tensor = dev_tensor.batch(batch_size)

        self.model.add(Input(shape=(self.n*2+1,), dtype=tf.int32))
        self.model.add(Embedding(input_dim = self.m, output_dim=20, mask_zero=True, input_length=self.n*2+1))
        self.model.add(Flatten())
        self.model.add(Dense(64, activation='relu'))
        self.model.add(Dense(self.num_classes, activation='softmax'))

        self.model.compile(loss=self.loss, optimizer=self.optimizer, metrics=self.metrics)
        self.model.fit(train_tensor, epochs=1, validation_data=dev_tensor, verbose=1)

    # def train_model(self):

    #     self.model.compile(loss=self.loss, optimizer=self.optimizer, metrics=self.metrics, weighted_metrics=self.weighted_metrics)
    #     self.model.fit(self.train_windows, self.train_id, epochs=10, validation_data=(self.dev_windows, self.dev_id), verbose=0)

    def evaluate(self, task):
        padding = []
        for i in range(self.n):
            padding.append(0)
        self.test = padding + self.test + padding

        self.test_windows = []
        for i in range(self.n, len(self.test) - self.n):
            data = self.test[i-self.n:i+self.n+1]
            self.test_windows.append(data)

        batch_size = 10
        one_hot_test_id = to_categorical(self.test_id, num_classes=self.num_classes+1)
        test_tensor = tf.data.Dataset.from_tensor_slices((self.test_windows, one_hot_test_id))
        test_tensor = test_tensor.batch(batch_size)
        
        if task == "PoS":
            return self.model.evaluate(test_tensor, verbose=1)
        elif task == "NER":
            evaluator = Evaluator(test_tensor, tags=['ent_type', 'partial', 'exact', 'strict']).evaluate()
            
            return self.model.evaluate(test_tensor, verbose=1), evaluator['ent_type']['f1'], evaluator['partial']['f1'], evaluator['exact']['f1'], evaluator['strict']['f1']
        else:
            return "Task not found"

self.m es en realidad vocab_size??????

In [140]:
modelPartTUT = FFTagger(train_PartTUT, train_id_PartTUT, dev_PartTUT, dev_id_PartTUT, test_PartTUT, test_id_PartTUT, 2, 'categorical_crossentropy', 'adam', ['accuracy'])
modelPartTUT.build_model()
# modelPartTUT.train_model()



In [142]:
modelPartTUT.evaluate("PoS")

InvalidArgumentError: Graph execution error:

Detected at node 'categorical_crossentropy/softmax_cross_entropy_with_logits' defined at (most recent call last):
    File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.10_3.10.3056.0_x64__qbz5n2kfra8p0\lib\runpy.py", line 196, in _run_module_as_main
      return _run_code(code, main_globals, None,
    File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.10_3.10.3056.0_x64__qbz5n2kfra8p0\lib\runpy.py", line 86, in _run_code
      exec(code, run_globals)
    File "C:\Users\Administrador\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\ipykernel_launcher.py", line 17, in <module>
      app.launch_new_instance()
    File "C:\Users\Administrador\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\traitlets\config\application.py", line 976, in launch_instance
      app.start()
    File "C:\Users\Administrador\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\ipykernel\kernelapp.py", line 712, in start
      self.io_loop.start()
    File "C:\Users\Administrador\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\tornado\platform\asyncio.py", line 215, in start
      self.asyncio_loop.run_forever()
    File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.10_3.10.3056.0_x64__qbz5n2kfra8p0\lib\asyncio\base_events.py", line 603, in run_forever
      self._run_once()
    File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.10_3.10.3056.0_x64__qbz5n2kfra8p0\lib\asyncio\base_events.py", line 1909, in _run_once
      handle._run()
    File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.10_3.10.3056.0_x64__qbz5n2kfra8p0\lib\asyncio\events.py", line 80, in _run
      self._context.run(self._callback, *self._args)
    File "C:\Users\Administrador\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\ipykernel\kernelbase.py", line 510, in dispatch_queue
      await self.process_one()
    File "C:\Users\Administrador\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\ipykernel\kernelbase.py", line 499, in process_one
      await dispatch(*args)
    File "C:\Users\Administrador\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\ipykernel\kernelbase.py", line 406, in dispatch_shell
      await result
    File "C:\Users\Administrador\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\ipykernel\kernelbase.py", line 730, in execute_request
      reply_content = await reply_content
    File "C:\Users\Administrador\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\ipykernel\ipkernel.py", line 383, in do_execute
      res = shell.run_cell(
    File "C:\Users\Administrador\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\ipykernel\zmqshell.py", line 528, in run_cell
      return super().run_cell(*args, **kwargs)
    File "C:\Users\Administrador\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\IPython\core\interactiveshell.py", line 2881, in run_cell
      result = self._run_cell(
    File "C:\Users\Administrador\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\IPython\core\interactiveshell.py", line 2936, in _run_cell
      return runner(coro)
    File "C:\Users\Administrador\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\IPython\core\async_helpers.py", line 129, in _pseudo_sync_runner
      coro.send(None)
    File "C:\Users\Administrador\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\IPython\core\interactiveshell.py", line 3135, in run_cell_async
      has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
    File "C:\Users\Administrador\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\IPython\core\interactiveshell.py", line 3338, in run_ast_nodes
      if await self.run_code(code, result, async_=asy):
    File "C:\Users\Administrador\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\IPython\core\interactiveshell.py", line 3398, in run_code
      exec(code_obj, self.user_global_ns, self.user_ns)
    File "C:\Users\Administrador\AppData\Local\Temp\ipykernel_12848\201230881.py", line 1, in <cell line: 1>
      modelPartTUT.evaluate("PoS")
    File "C:\Users\Administrador\AppData\Local\Temp\ipykernel_12848\1328941117.py", line 82, in evaluate
      return self.model.evaluate(test_tensor, verbose=1)
    File "C:\Users\Administrador\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\keras\utils\traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "C:\Users\Administrador\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\keras\engine\training.py", line 1947, in evaluate
      tmp_logs = self.test_function(iterator)
    File "C:\Users\Administrador\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\keras\engine\training.py", line 1727, in test_function
      return step_function(self, iterator)
    File "C:\Users\Administrador\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\keras\engine\training.py", line 1713, in step_function
      outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "C:\Users\Administrador\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\keras\engine\training.py", line 1701, in run_step
      outputs = model.test_step(data)
    File "C:\Users\Administrador\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\keras\engine\training.py", line 1667, in test_step
      self.compute_loss(x, y, y_pred, sample_weight)
    File "C:\Users\Administrador\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\keras\engine\training.py", line 1052, in compute_loss
      return self.compiled_loss(
    File "C:\Users\Administrador\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\keras\engine\compile_utils.py", line 265, in __call__
      loss_value = loss_obj(y_t, y_p, sample_weight=sw)
    File "C:\Users\Administrador\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\keras\losses.py", line 152, in __call__
      losses = call_fn(y_true, y_pred)
    File "C:\Users\Administrador\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\keras\losses.py", line 272, in call
      return ag_fn(y_true, y_pred, **self._fn_kwargs)
    File "C:\Users\Administrador\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\keras\losses.py", line 1990, in categorical_crossentropy
      return backend.categorical_crossentropy(
    File "C:\Users\Administrador\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\keras\backend.py", line 5535, in categorical_crossentropy
      return tf.nn.softmax_cross_entropy_with_logits(
Node: 'categorical_crossentropy/softmax_cross_entropy_with_logits'
logits and labels must be broadcastable: logits_size=[10,17] labels_size=[10,16]
	 [[{{node categorical_crossentropy/softmax_cross_entropy_with_logits}}]] [Op:__inference_test_function_319330]