<a href="https://colab.research.google.com/github/alexisdr/uned-tfg/blob/main/UNED-TFG-6-macro-average.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Parameters


In [13]:
import os

# Root folder of the corpus inside the mounted Google Drive.
base_path = '/drive/My Drive/CorpusPFG/'

# Processed datasets
dataset_path = base_path + 'Dataset'

# Model parameters
CHECKPOINT = "alexisdr/uned-tfg-08.30_MasFrecuentes"
# Access tokens must never be committed to a notebook. The token is read from
# the HUGGING_FACE_TOKEN environment variable and is None when it is not set.
# Any token that was previously committed here should be revoked.
HUGGING_FACE_TOKEN = os.environ.get("HUGGING_FACE_TOKEN")

# When True, the "MasFrecuentes" dataset splits are used.
MAS_FRECUENTES = True

## Set-up environment

First, we install the libraries which we'll use: HuggingFace Transformers and Datasets.

In [14]:
!pip install -q datasets transformers[sentencepiece] 

## Load dataset

Next, let's load a multi-label text classification dataset from files.

In [15]:
from google.colab import drive

# Mount Google Drive at /drive; base_path points inside this mount.
drive.mount('/drive')

Drive already mounted at /drive; to attempt to forcibly remount, call drive.mount("/drive", force_remount=True).


In [16]:
from datasets import DatasetDict

# Load the previously saved DatasetDict from dataset_path.
dataset = DatasetDict.load_from_disk(dataset_path)

In [17]:
dataset

# Names of the splits to use: the "MasFrecuentes" variants are selected when
# MAS_FRECUENTES is truthy, the plain splits otherwise.
train, validation, test = (
    ("trainMasFrecuentes", "validationMasFrecuentes", "testMasFrecuentes")
    if MAS_FRECUENTES
    else ("train", "validation", "test")
)

In [18]:
from datasets import ClassLabel

# "label" feature of the selected training split; the rest of the notebook uses
# its int2str / str2int / num_classes members to map between class ids and names.
class2label = dataset[train].features["label"]
print(class2label)

ClassLabel(names=['T38.0X5A', 'T45.515A', 'T50.2X5A', 'Y95', 'P01.1', 'T81.4XXA', 'T45.1X5A', 'Y83.1'], id=None)


In [19]:
class2label.num_classes

8

## Preprocess test data

In [24]:
from transformers import AutoTokenizer

# Load the tokenizer that belongs to CHECKPOINT, authenticating with the token.
# NOTE(review): newer transformers releases take `token=` instead of
# `use_auth_token=` — confirm against the installed version.
tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT, use_auth_token=HUGGING_FACE_TOKEN)

def preprocess_data(examples):
  """Tokenize examples["text"] with the module-level tokenizer.

  The text is padded to the maximum length, truncated if needed, and the
  tokenizer output is requested as PyTorch tensors.
  """
  texto = examples["text"]
  codificado = tokenizer(
      texto,
      padding="max_length",
      truncation=True,
      return_tensors="pt",
  )
  return codificado

ReadTimeout: ignored

## Cálculo de la métrica S

In [None]:
# Computes the common prefix shared by two codes
def calculo_lcs(codigo_i, codigo_j):
  """Return the longest common prefix of two codes, ignoring dots.

  Returns the empty string when either code is None.
  """
  if codigo_i is None or codigo_j is None:
    return ""

  # Dots are dropped from both codes before comparing them
  limpio_i = codigo_i.replace(".", "")
  limpio_j = codigo_j.replace(".", "")

  # zip stops at the end of the shorter code, so no explicit length is needed
  prefijo = []
  for caracter_i, caracter_j in zip(limpio_i, limpio_j):
    if caracter_i != caracter_j:
      break
    prefijo.append(caracter_i)

  return "".join(prefijo)


# Returns the length of the string when it is at least 3, and 0 when it is
# shorter than 3. This is because the shortest ICD-10 codes contain at least
# 3 characters.
def calculo_ic(codigo):
  """Return the length of a code without dots, or 0 if it is shorter than 3.

  None is treated as an empty code and yields 0, matching calculo_lcs, which
  also accepts None instead of raising.
  """
  if codigo is None:
    return 0

  # Dots are dropped from the code before measuring it
  tamanyo = len(codigo.replace(".", ""))
  return tamanyo if tamanyo >= 3 else 0

# Similarity between two ICD-10 codes i and j
def calculo_c(codigo_i, codigo_j):
  """Return the similarity of two codes, rounded to 6 decimals.

  The value is 2 * calculo_ic(common prefix) divided by the sum of
  calculo_ic of both codes; 0 is returned when that sum is 0.
  """
  suma_longitudes = calculo_ic(codigo_i) + calculo_ic(codigo_j)
  if suma_longitudes == 0:
    return 0

  prefijo_comun = calculo_lcs(codigo_i, codigo_j)
  return round(2 * calculo_ic(prefijo_comun) / suma_longitudes, 6)

def metrica_s(lista_codigos_i, lista_codigos_j):
  """Return the S metric between two lists of codes, rounded to 6 decimals.

  For every code in lista_codigos_j the best calculo_c similarity against
  the codes of lista_codigos_i is taken; those maxima are summed and divided
  by the length of the longer list. Returns 0 when either list is empty.
  """
  cantidad_i = len(lista_codigos_i)
  cantidad_j = len(lista_codigos_j)

  # Both lists must contain values
  if cantidad_i == 0 or cantidad_j == 0:
    return 0

  suma_maximos = 0
  for codigo_j in lista_codigos_j:
    mejor = max(calculo_c(codigo_i, codigo_j) for codigo_i in lista_codigos_i)
    # The per-code maximum never drops below 0
    suma_maximos += max(mejor, 0)

  # max(Ng, Ns) is at least 1 here because neither list is empty
  return round(suma_maximos / max(cantidad_i, cantidad_j), 6)

def metrica_s_train (y_true, y_pred):
    """Return metrica_s between the labels marked in y_true and in y_pred.

    Each vector is scanned position by position; every position whose value
    equals 1.0 is turned into a label name via class2label.int2str.
    """
    def _codigos_activos(vector):
        # int2str is called with a one-element list; element 0 of its result is kept
        return [
            class2label.int2str([indice])[0]
            for indice, valor in enumerate(vector)
            if valor == 1.0
        ]

    y_true_labels = _codigos_activos(y_true)
    y_pred_labels = _codigos_activos(y_pred)
    return metrica_s(y_true_labels, y_pred_labels)

## Inference

Let's run the model on every report in the test split and score each prediction with the S metric.

In [None]:
from transformers import AutoModelForSequenceClassification
import torch

# Load the fine-tuned classifier from CHECKPOINT with one output per class in
# class2label, configured for multi-label classification.
# NOTE(review): newer transformers releases take `token=` instead of
# `use_auth_token=` — confirm against the installed version.
model = AutoModelForSequenceClassification.from_pretrained(
    CHECKPOINT, 
    num_labels=class2label.num_classes, 
    problem_type = "multi_label_classification",
    use_auth_token=HUGGING_FACE_TOKEN)

In [None]:
import numpy as np
import pandas as pd
# Empty results table: one row per test document with its true labels,
# predicted labels and S-metric score.
df_resultados = pd.DataFrame(columns=["y_true", "y_pred", "metrica_s"])

In [None]:
dataset[test].num_rows

In [None]:
from tqdm.notebook import tqdm
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# Probability above which a label counts as predicted.
UMBRAL_PREDICCION = 0.9

# One record per document is collected and the DataFrame is built once at the
# end: DataFrame.append was removed in pandas 2.0 and growing a frame row by
# row is quadratic. Rebuilding the frame also makes this cell idempotent, so
# re-running it no longer duplicates rows.
registros = []

for i in tqdm(range(dataset[test].num_rows)):
  # Fetch the row once instead of indexing the dataset for every field
  ejemplo = dataset[test][i]
  inputs = preprocess_data(ejemplo)

  # Inference only: skip building the autograd graph
  with torch.no_grad():
    outputs = model(**inputs)

  # apply sigmoid + threshold; squeeze(0) drops only the batch dimension
  probs = torch.sigmoid(outputs.logits.squeeze(0).cpu())
  predicted_ids = (probs > UMBRAL_PREDICCION).nonzero(as_tuple=True)[0].tolist()

  # turn predicted ids into actual label names
  predicted_labels = [class2label.int2str([idx])[0] for idx in predicted_ids]

  metrica_s_value = metrica_s(ejemplo['label_list_str'], predicted_labels)
  registros.append({
      'y_true': ejemplo['label_list_str'],
      'y_pred': predicted_labels,
      'metrica_s': metrica_s_value})

df_resultados = pd.DataFrame(registros, columns=["y_true", "y_pred", "metrica_s"])


In [None]:
# Macro-average of the S metric: the mean of the per-document scores.
# The sum is divided by the number of rows actually scored, so the result stays
# correct even when df_resultados does not hold exactly one row per test
# document (e.g. after re-running the scoring cell). An empty table yields 0.0
# instead of a division by zero.
suma_metrica_s = df_resultados['metrica_s'].astype(float).sum()

macro_average = round(suma_metrica_s / max(len(df_resultados), 1), 6)

print(macro_average)

In [None]:
class2label.str2int('T38.0X5A')

In [None]:
df_resultados