In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
!pip install transformers
!pip install datasets
!pip install torch
!pip install evaluate

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.24.0-py3-none-any.whl (5.5 MB)
[K     |████████████████████████████████| 5.5 MB 7.3 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 73.7 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 51.2 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.11.1 tokenizers-0.13.2 transformers-4.24.0
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.7.1-py3-none-any.whl (451 kB)
[K     |████████████████████████████████| 451 kB 7.0 MB/

In [4]:
# imports
import pandas as pd
from copy import deepcopy
import os
from datasets import Dataset, Features, Value, ClassLabel
import re
from transformers import AutoTokenizer, DataCollatorWithPadding, AutoModelForSequenceClassification, TrainingArguments, Trainer
import torch
from evaluate import evaluator

# rutas
#parent_dir = #os.path.dirname(os.getcwd())
path_datos = "/content/drive/Shareddrives/Proyecto PNL/Proyecto PLN 20221125/Proyecto PLN 2022-2/sentencias/Clasificación de Parrafos/scripts_python/parrafos/parrafos.xlsx"#os.path.join(parent_dir, "parrafos", "parrafos.xlsx")
path_resultados = "/content/drive/Shareddrives/Proyecto PNL/Proyecto PLN 20221125/Proyecto PLN 2022-2/sentencias/Clasificación de Parrafos/scripts_python/resultados"#os.path.join(parent_dir, "resultados")

In [5]:
# funciones

def categoria(old_label):
  new_label = None
  if old_label in ["Antecedentes de hecho y hechos probados", "Fundamentos de Derecho"]:
    new_label = 0
  if old_label == "Encabezamiento":
    new_label = 1
  if old_label == "Parte dispositiva y fallo":
    new_label = 2
  return(new_label)


def split_df(df_datos, train = 0.8):
  df = deepcopy(df_datos)
  df_train = df.sample(frac = train)
  df_validation = df.drop(df_train.index)
  df_train.reset_index(drop = True, inplace = True)
  df_validation.reset_index(drop = True, inplace = True)
  return(df_train, df_validation)


def recortar(parrafo, max_len):
    split_parrafo = parrafo.split(" ")
    if len(split_parrafo) > max_len:
        new_parrafo = " ".join(split_parrafo[:max_len])
        return new_parrafo
    else:
        return parrafo

def tokenize_function(parrafos):
    #return tokenizer(parrafos["text"], padding = "max_length", truncation = True)
    return tokenizer(parrafos["text"], padding = True, truncation = True, max_length = 512, add_special_tokens = True)
    #return tokenizer(parrafos["text"], truncation = True)


In [6]:
# carga de datos
datos = pd.read_excel(path_datos, sheet_name = "Datos", dtype = {"Radicado": "string", "Categoria": "category"})
datos["label"] = datos["Categoria"].apply(lambda x: categoria(x)).astype("category")
datos["Parrafo"] = datos["Parrafo"].apply(lambda x: re.sub("\s{2,}", " ", x)).str.strip()
datos["Parrafo"] = datos["Parrafo"].apply(lambda x: recortar(x, 511))
datos.dtypes


Radicado          string
ID Categoria       int64
No. Parrafo        int64
Parrafo           object
Categoria       category
label           category
dtype: object

In [7]:
datos.head()

Unnamed: 0,Radicado,ID Categoria,No. Parrafo,Parrafo,Categoria,label
0,86001312100120200017500,0,1,JUZGADO PRIMERO CIVIL DEL CIRCUITO\nESPECIALIZ...,Encabezamiento,1
1,86001312100120200017500,0,2,Juez: JUAN JACOBO BURBANO PADILLA,Encabezamiento,1
2,86001312100120200017500,0,3,Sentencia No. 009,Encabezamiento,1
3,86001312100120200017500,0,4,"Mocoa, cinco (05) de abril de dos mil veintidó...",Encabezamiento,1
4,86001312100120200017500,0,5,Referencia: Solicitud Restitución Y Formalizac...,Encabezamiento,1


In [8]:
# dividir los datos
train_df, validation_df = split_df(datos[["Parrafo", "label"]], train = 0.8)
validation_df, test_df = split_df(validation_df, train = 0.5)
print("""El número de registros en los datos de entrenamiento son: {}""".format(train_df.shape[0]))
print("""El número de registros en los datos de prueba son: {}""".format(test_df.shape[0]))
print("""El número de registros en los datos de validación son: {}""".format(validation_df.shape[0]))

El número de registros en los datos de entrenamiento son: 1841
El número de registros en los datos de prueba son: 230
El número de registros en los datos de validación son: 230


In [9]:
# cargar en el formato adecuado
train_df.rename(columns = {"Parrafo": "text"}, inplace = True)
validation_df.rename(columns = {"Parrafo": "text"}, inplace = True)
test_df.rename(columns = {"Parrafo": "text"}, inplace = True)

class_names = [0, 1, 2]
text_features = Features({'text': Value('string'), 'label': ClassLabel(names=class_names)})

train_data = Dataset.from_pandas(train_df, features = text_features)
validation_data = Dataset.from_pandas(validation_df, features = text_features)
test_data = Dataset.from_pandas(test_df, features = text_features)

In [10]:
# Fine tune
tokenizer = AutoTokenizer.from_pretrained("bertin-project/bertin-base-xnli-es")

# tokenize los datasets

tokenized_train = train_data.map(tokenize_function, batched = True)
tokenized_validation = validation_data.map(tokenize_function, batched = True)
tokenized_test = test_data.map(tokenize_function, batched = True)

Downloading:   0%|          | 0.00/339 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/855k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/514k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.47M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/239 [00:00<?, ?B/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [11]:
# modelo
data_collator = DataCollatorWithPadding(tokenizer = tokenizer)
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-multilingual-cased", num_labels = 3)

# "bertin-project/bertin-base-xnli-es",


Downloading:   0%|          | 0.00/466 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/542M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-multilingual-cased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['pre_classifier.weight', 'pre_cla

In [12]:
# entrenamiento ajuste
training_args = TrainingArguments(
    output_dir = path_resultados,
    learning_rate = 2e-5,
    per_device_train_batch_size = 16,
    per_device_eval_batch_size = 16,
    num_train_epochs = 5,
    weight_decay = 0.01
)

trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = tokenized_train,
    eval_dataset = tokenized_test,
    tokenizer = tokenizer,
    data_collator = data_collator
)

trainer.train()

The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 1841
  Num Epochs = 5
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 580
  Number of trainable parameters = 135326979
You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
500,0.3482


Saving model checkpoint to /content/drive/Shareddrives/Proyecto PNL/Proyecto PLN 20221125/Proyecto PLN 2022-2/sentencias/Clasificación de Parrafos/scripts_python/resultados/checkpoint-500
Configuration saved in /content/drive/Shareddrives/Proyecto PNL/Proyecto PLN 20221125/Proyecto PLN 2022-2/sentencias/Clasificación de Parrafos/scripts_python/resultados/checkpoint-500/config.json
Model weights saved in /content/drive/Shareddrives/Proyecto PNL/Proyecto PLN 20221125/Proyecto PLN 2022-2/sentencias/Clasificación de Parrafos/scripts_python/resultados/checkpoint-500/pytorch_model.bin
tokenizer config file saved in /content/drive/Shareddrives/Proyecto PNL/Proyecto PLN 20221125/Proyecto PLN 2022-2/sentencias/Clasificación de Parrafos/scripts_python/resultados/checkpoint-500/tokenizer_config.json
Special tokens file saved in /content/drive/Shareddrives/Proyecto PNL/Proyecto PLN 20221125/Proyecto PLN 2022-2/sentencias/Clasificación de Parrafos/scripts_python/resultados/checkpoint-500/speci

TrainOutput(global_step=580, training_loss=0.3164880456595585, metrics={'train_runtime': 458.308, 'train_samples_per_second': 20.085, 'train_steps_per_second': 1.266, 'total_flos': 1219384150225920.0, 'train_loss': 0.3164880456595585, 'epoch': 5.0})

In [13]:
task_evaluator = evaluator("text-classification")
results = task_evaluator.compute(
    model_or_pipeline = model,
    data = validation_data,
    metric = "accuracy",
    label_mapping={"LABEL_0": 0.0, "LABEL_1": 1.0, "LABEL_2": 2.0},
    strategy="bootstrap",
    n_resamples = 10,
    random_state = 0,
    tokenizer = tokenizer
)

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Disabling tokenizer parallelism, we're using DataLoader multithreading already
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [14]:
results

{'accuracy': {'confidence_interval': (0.8697346014652945, 0.9217391304347826),
  'standard_error': 0.02043940743224898,
  'score': 0.9043478260869565},
 'total_time_in_seconds': 1.5126401139998507,
 'samples_per_second': 152.0520300045558,
 'latency_in_seconds': 0.006576696147825438}