In [1]:
import pandas as pd
import numpy as np
import os
import re


In [2]:
data_dir='/Users/achain/Documents/github/web-mining/pages/'

In [3]:
plain_txt=[ data_dir+i for i in os.listdir(data_dir) if i.find('plain')!=-1]

In [4]:
plain_txt

['/Users/achain/Documents/github/web-mining/pages/eco_content_plain',
 '/Users/achain/Documents/github/web-mining/pages/el-mun_content_plain',
 '/Users/achain/Documents/github/web-mining/pages/socie_content_plain',
 '/Users/achain/Documents/github/web-mining/pages/el-pai_content_plain']

In [5]:
labels = []
texts = []
dates = []
date_pattern = re.compile(r'(\d{4}-\d{2}-\d{2})')

# Iterate over folders
for folder_path in plain_txt:
    label = os.path.basename(folder_path)  # Extract label from folder name
    for filename in os.listdir(folder_path):
        if filename.endswith(".txt"):
            # Extract date using regular expression
            match = date_pattern.search(filename)
            date = match.group(1) if match else None
            
            with open(os.path.join(folder_path, filename), 'r', encoding='utf-8') as file:
                text = file.read()
                
                # If the title starts with "None", assign null date
                if date and not text.startswith("None"):
                    labels.append(label)
                    texts.append(text)
                    dates.append(date)

# Create a DataFrame
df = pd.DataFrame({'label': labels, 'text': texts, 'date': dates})



In [6]:
df['label']=df.label.str.split('_',expand=True).iloc[:,0]

In [7]:
df.groupby('label')['text'].count()

label
eco       486
el-mun    545
el-pai    499
socie     544
Name: text, dtype: int64

In [9]:
df.groupby('label').agg({'date':'min','date':'min'})

Unnamed: 0_level_0,date
label,Unnamed: 1_level_1
eco,2023-08-05
el-mun,2023-07-05
el-pai,2023-09-04
socie,2023-08-16


In [10]:
length_text=np.array([len(i) for i in df.text])

In [11]:
length_text.mean()

4130.912729026037

In [12]:
length_text.max()

37619

In [13]:
length_text.min()

287

In [None]:
### Probamos hacer fine tunning del modelo con bert

In [14]:
#!pip install transformers==4.31.0
#!pip install torch
import torch

In [15]:
# Hay una GPU disponible?
is_gpu = torch.cuda.is_available()
if is_gpu:
    # Decile a PyTorch que use la GPU.
    device = torch.device("cuda")
    print(f'\nHay {torch.cuda.device_count()} GPU(s) disponible(s).')
    print(f'Vamos a usar la GPU: {torch.cuda.get_device_name(0)}.\n')
# si no hay GPU...
else:
    print('\nNo hay GPU disponible, vamos a usar la CPU.\n')
    device = torch.device("cpu")


No hay GPU disponible, vamos a usar la CPU.



In [16]:
# Cargar el tokenizador correspondiente al modelo de BERT que vamos a usar.
# TIENE que ser exactamente el mismo.
from transformers import BertTokenizer

In [17]:
#Probamos con este modelo primero...
## este probamos despues "mrm8488/bert-spanish-cased-finetuned-ner"
# Este es el nombre en Huggingface del modelo de BERT ya entrenado que vamos a usar.
# Ver https://huggingface.co/models?language=es&sort=downloads para mas modelos en castellano.
BERT_MODEL = 'Recognai/bert-base-spanish-wwm-cased-xnli'

In [18]:
print('Cargando tokenizador de BERT...')
tokenizer = BertTokenizer.from_pretrained(BERT_MODEL)
print('\nTokenizador de BERT listo.')

Cargando tokenizador de BERT...

Tokenizador de BERT listo.


In [19]:
my_dataset=df.copy()

In [20]:
my_dataset.columns=['label', 'sentence', 'date']

In [21]:
# IMPORTANTE:
# 1) Su dataset debe ser un dataframe de Pandas con 2 columnas: 'sentence' conteniendo el texto sin tokenizar, y 'label' conteniendo la categoria correspondiente.
# 2) BERT no acepta mas de 512 tokens por frase. MY_DATASET_MAX_TOKENS es el maximo de tokens por frase que vamos a usar, el resto de descarta.
# Si hay menos de 512 tokens el tokenizer va a llenar el resto con el token PAD.
MY_DATASET_MAX_TOKENS = 512
if MY_DATASET_MAX_TOKENS>512:
  raise ValueError(f"ERROR: BERT no puede codificar frases con mas de 512 tokens, pero MY_DATASET_MAX_TOKENS = {MY_DATASET_MAX_TOKENS}.")

# Si LAYERS_TO_FINETUNE==2, entonces conservamos los pesos de todas excepto la ultma capa de encoding y
# hacemos finetuning the la ultima capa de encodign y  la capa de clasificacion.
# Si es LAYERS_TO_FINETUNE>2, entonces hacemos finetuning the las ultimas LAYERS_TO_FINETUNE (hasta 12) capa de encoding y de la capa de clasificacion.
# Si LAYERS_TO_FINETUNE==1, hacemos finetuning solo de la capa de clasificacion, entrena mas rápido pero adapta menos pesos.
LAYERS_TO_FINETUNE=1
if LAYERS_TO_FINETUNE<1 or LAYERS_TO_FINETUNE>13:
  raise ValueError("LAYERS_TO_FINETUNE no puede ser menor a 1 o mayor a 13, poque BERT tiene 12 capas de encoder + 1 de clasificacion.")

# Que porcentaje del dataset vamos a usar para entrenar ?
TRAIN_FOR_PCT = 0.7
TRAIN_FOLD_SIZE = int(TRAIN_FOR_PCT * len(my_dataset))

# Number of training epochs. The BERT authors recommend between 2 and 4
# for over 1000 examples.
# We chose to run for 4, but this may be over-fitting the
# training data.
EPOCHS = 8

In [22]:
# Crear las listas de frases labels numéricosx
sentences = my_dataset.sentence.values
# labels en BERT DEBEN SER enteros; aqui transformos los valores de los labels
# a ints (mismo int, mismo valor), comenzando por 0.
labels, label_texts = pd.factorize(my_dataset.label)

# reordenar filas del dataset al azar; importante para el descenso de gradiente estocastico
my_dataset = my_dataset.iloc[np.random.permutation(len(my_dataset))]
my_dataset.reset_index(drop=True)

# MI_DATASET_NUM_LABELS es la cantidad de categorias distintas en su columna 'label'.
MY_DATASET_NUM_LABELS = labels.max() + 1

In [None]:
#!pip install protobuf==3.20.0
#!pip install --upgrade pip
os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"

In [37]:
# codificar frases en su dataset
input_ids = []
attention_masks = []

# Por cada frase ...
for sent in sentences:
    # `encode_plus` va a :
    #   (1) Tokenizar la frase.
    #   (2) Poner el token `[CLS]` al comienzo de la frase.
    #   (3) Poner el token `[SEP]` al final de la frase.
    #   (4) Mapear tokens a sus IDs.
    #   (5) Rellenar o truncar la frase a `max_length`
    #   (6) Crear attention masks para los tokens [PAD].
    encoded_dict = tokenizer.encode_plus(
                        sent,                      # frase a codificar.
                        add_special_tokens = True, # Agregar '[CLS]' y '[SEP]'
                        max_length = MY_DATASET_MAX_TOKENS,  # Rellenar frases cortas, truncar largas
                        padding='max_length',
                        truncation=True,
                        return_token_type_ids=True,
                        return_attention_mask = True,   # Construir attn. masks  (diferencian padding de non-padding).
                        return_tensors = 'pt',     # retornar los vectores de pytorchxs.
                   )

    # Agregar la frase codificada a la lista de frases
    input_ids.append(encoded_dict['input_ids'])

    # Agregar su correspondiente attention mask
    attention_masks.append(encoded_dict['attention_mask'])

# Convertiar las listas a tensores de Pytorch
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(labels)

In [33]:
## este es para ver la frase
tokenizer.decode(tokenizer.encode_plus( sentences[0],
                        sent,                      # frase a codificar.
                        add_special_tokens = True, # Agregar '[CLS]' y '[SEP]'
                        max_length = MY_DATASET_MAX_TOKENS,  # Rellenar frases cortas, truncar largas
                        padding='max_length',
                        truncation=True)['input_ids'])

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


'[CLS] [UNK] El Banco Central compró este jueves 9 millones de dólares en el mercado de cambios, con lo que registró 23 jornadas consecutivas con resultado positivo. La autoridad monetaria, al mismo tiempo, no modificó la tasa de interés en pesos, luego de los datos de inflación de agosto, cuando se anotó un alza del 12, 4 por ciento de aumentos en los precios minoristas. El dólar blue registró un retroceso de 10 pesos a 725 pesos por unidad. " En septiembre, el Banco Central acumula compras por 350 millones de dólares y totaliza desde el 24 de julio más de 2. 500 millones ", según detalló Gustavo Quintana, analista de PR Corredores de Cambio. En lo referido al tipo de cambio, el dólar minorista cerró a 365, 50 pesos. En el segmento bursátil, el dólar contado con liquidación ( CCL ) bajó 0, 2 por ciento, a 739, 25 pesos [UNK] mientras que el MEP subió 0, 8 por ciento, a 681, 88 pesos, en el tramo final de la rueda. En el mercado mayorista, la divisa estadounidense finalizó con una caíd

# Cargamos el BERT en español pre-entrenado

In [44]:
# Cargar modelo pre-entrenado

from transformers import BertForSequenceClassification, AdamW, BertConfig

# Cargar un BertForSequenceClassification, que es un modelo de BERT pre entrenado
# con una capa de clasificación al final.
print('Cargando pesos de modelo BERT pre-entrenado...\n')
model = BertForSequenceClassification.from_pretrained(
    BERT_MODEL, # Modelo a usar
    num_labels=MY_DATASET_NUM_LABELS, # cant. de categorias de mi dataset
    ignore_mismatched_sizes=True, # si el modelo pre-entrenado tiene distinta cant. de categorias que mi dataset, ignorar las categorias del pre-entrenado, total las vamos a cambiar.
    output_attentions = False, # El modelo debe retornat attention weights?
    output_hidden_states = False # El modelo debe retornar todos los pesos de las capas de NN?
)
print('\nEl modelo BERT pre-entrenado está listo.')

Cargando pesos de modelo BERT pre-entrenado...



Downloading model.safetensors:   0%|          | 0.00/439M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at Recognai/bert-base-spanish-wwm-cased-xnli and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([4]) in the model instantiated
- classifier.weight: found shape torch.Size([3, 768]) in the checkpoint and torch.Size([4, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



El modelo BERT pre-entrenado está listo.


# Setup del codigo para el fine tunning

In [51]:
#  setup para entrenar

from torch.utils.data import TensorDataset, random_split
# Combinar todos los ejemplos de entrenamiento en un TensorDataset.
dataset = TensorDataset(input_ids, attention_masks, labels)
# Crear un split de validacion 90-10.

# Calcular la cant. de ejmplos en train y test
train_size = int(TRAIN_FOLD_SIZE)
val_size = len(dataset) - train_size

# Dividir el dataset seleccionado ejemplos al azar.
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

print(f'Tamaño dataset de entrenamiento: {train_size} muestras')
print(f'Tamaño dataset de validacion: {val_size} muestras')

from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

# El DataLoader necesita conocer el  batch size para entrenar, asi que aquí
# lo definimos. Para hacer fine-tuning de BERT, los autores recomienzan un
# batch size de 16 o 32.
batch_size = 32

# Crear los DataLoaders para los dataset de entrenamiento y validacion.
# Usaremos ejemplos de entrenamiento ordenados al azar
train_dataloader = DataLoader(
            train_dataset,  # The training samples.
            sampler = RandomSampler(train_dataset), # Select batches randomly
            batch_size = batch_size # Trains with this batch size.
        )

# Para validación el orde no importa, asi que lo hacemo secuencialmente
validation_dataloader = DataLoader(
            val_dataset, # The validation samples.
            sampler = SequentialSampler(val_dataset), # Pull out batches sequentially.
            batch_size = batch_size # Evaluate with this batch size.
        )

from transformers import AdamW, BertConfig

#if FINETUNE_ONLY_CLASSIFICATION_LAYER:
  # BERT is made of 3 modules: bert, dropout, and classifier
  # here we freeze the weights of all layers but the classifier module
#  for name, param in model.named_parameters():
#    if 'classifier' not in name: # classifier module
#      param.requires_grad = False

# Tell pytorch to run this model on the GPU if possible
if is_gpu:
    model.cuda()

# optimizer and learning rate
# Note: AdamW is a class from the huggingface library (as opposed to pytorch)
# I believe the 'W' stands for 'Weight Decay fix"
optimizer = AdamW(model.parameters(),
                  lr = 2e-5, # args.learning_rate - default is 5e-5, our notebook had 2e-5
                  eps = 1e-8 # args.adam_epsilon  - default is 1e-8.
                )

from transformers import get_linear_schedule_with_warmup

# Total number of training steps is [number of batches] x [number of epochs].
# (Note that this is not the same as the number of training samples).
total_steps = len(train_dataloader) * EPOCHS

# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps = 0, # Default value in run_glue.py
                                            num_training_steps = total_steps)

Tamaño dataset de entrenamiento: 1451 muestras
Tamaño dataset de validacion: 623 muestras




In [None]:
# loop de entrenamiento
import numpy as np

def flat_accuracy(preds, labels):
    """
    Esta funcion calcula accuracy de predicción de labels
    """
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

import time
import datetime

def format_time(elapsed):
    """
    Recibe un timestamp en segundos y retorna un string hh:mm:ss
    """
    # Redonda a segundos
    elapsed_rounded = int(round((elapsed)))
    # Formatear como hh:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))

import random
import numpy as np

# This training code is based on the `run_glue.py` script here:
# https://github.com/huggingface/transformers/blob/5bfcd0485ece086ebcbed2d008813037968a9e58/examples/run_glue.py#L128

# Set the seed value all over the place to make this reproducible.
seed_val = 42

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# We'll store a number of quantities such as training and validation loss,
# validation accuracy, and timings.
training_stats = []

# Measure the total training time for the whole run.
total_t0 = time.time()

# Para cada epoch...
for epoch_i in range(0, EPOCHS):

    # ========================================
    #               Training
    # ========================================

    # llevar a cabo 1 pasada sobre todo el dataset de entrenamiento.

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, EPOCHS))
    print('Training...')

    # Measure how long the training epoch takes.
    t0 = time.time()

    # Resetear la perdida (the total loss) de este epoch
    total_train_loss = 0

    # Pasar al modelo a "training mode". Ojo! esto setea el modo en "training"
    # pero no ejecuta el  entrenamiento.
    # `dropout` and `batchnorm` layers behave differently during training
    # vs. test (source: https://stackoverflow.com/questions/51433378/what-does-model-train-do-in-pytorch)
    model.train()

    # Por cada batch de datos de entrenamiento...
    for step, batch in enumerate(train_dataloader):

        # Progress update every 40 batches.
        if step % 40 == 0 and not step == 0:
            # Calculate elapsed time in minutes.
            elapsed = format_time(time.time() - t0)

            # Report progress.
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        # Unpack this training batch from our dataloader.
        #
        # As we unpack the batch, we'll also copy each tensor to the GPU using the
        # `to` method.
        #
        # `batch` contains three pytorch tensors:
        #   [0]: input ids
        #   [1]: attention masks
        #   [2]: labels
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        # Always clear any previously calculated gradients before performing a
        # backward pass. PyTorch doesn't do this automatically because
        # accumulating the gradients is "convenient while training RNNs".
        # (source: https://stackoverflow.com/questions/48001598/why-do-we-need-to-call-zero-grad-in-pytorch)
        model.zero_grad()

        # Perform a forward pass (evaluate the model on this training batch).
        # The documentation for this `model` function is here:
        # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
        # It returns different numbers of parameters depending on what arguments
        # arge given and what flags are set. For our useage here, it returns
        # the loss (because we provided labels) and the "logits"--the model
        # outputs prior to activation.
        #(loss_tensor, logits)
        trainstep_output  = model(b_input_ids,
                             token_type_ids=None,
                             attention_mask=b_input_mask,
                             labels=b_labels)

        # Accumulate the training loss over all of the batches so that we can
        # calculate the average loss at the end. `loss` is a Tensor containing a
        # single value; the `.item()` function just returns the Python value
        # from the tensor.
        total_train_loss += trainstep_output.loss.item()

        # Perform a backward pass to calculate the gradients.
        trainstep_output.loss.backward()

        # Clip the norm of the gradients to 1.0.
        # This is to help prevent the "exploding gradients" problem.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Update parameters and take a step using the computed gradient.
        # The optimizer dictates the "update rule"--how the parameters are
        # modified based on their gradients, the learning rate, etc.
        optimizer.step()

        # Update the learning rate.
        scheduler.step()

    # Calculate the average loss over all of the batches.
    avg_train_loss = total_train_loss / len(train_dataloader)

    # Measure how long this epoch took.
    training_time = format_time(time.time() - t0)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epcoh took: {:}".format(training_time))

    # ========================================
    #               Validation
    # ========================================
    # After the completion of each training epoch, measure our performance on
    # our validation set.

    print("")
    print("Running Validation...")

    t0 = time.time()

    # Put the model in evaluation mode--the dropout layers behave differently
    # during evaluation.
    model.eval()

    # Tracking variables
    total_eval_accuracy = 0
    total_eval_loss = 0
    nb_eval_steps = 0

    # Evaluate data for one epoch
    for batch in validation_dataloader:

        # Unpack this training batch from our dataloader.
        #
        # As we unpack the batch, we'll also copy each tensor to the GPU using
        # the `to` method.
        #
        # `batch` contains three pytorch tensors:
        #   [0]: input ids
        #   [1]: attention masks
        #   [2]: labels
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        # Tell pytorch not to bother with constructing the compute graph during
        # the forward pass, since this is only needed for backprop (training).
        with torch.no_grad():

            # Forward pass, calculate logit predictions.
            # token_type_ids is the same as the "segment ids", which
            # differentiates sentence 1 and 2 in 2-sentence tasks.
            # The documentation for this `model` function is here:
            # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
            # Get the "logits" output by the model. The "logits" are the output
            # values prior to applying an activation function like the softmax.
            trainstep_output = model(b_input_ids,
                                   token_type_ids=None,
                                   attention_mask=b_input_mask,
                                   labels=b_labels)

        # Accumulate the validation loss.
        total_eval_loss += trainstep_output.loss.item()

        # Move logits and labels to CPU
        logits = trainstep_output.logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        # Calculate the accuracy for this batch of test sentences, and
        # accumulate it over all batches.
        total_eval_accuracy += flat_accuracy(logits, label_ids)


    # Report the final accuracy for this validation run.
    avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
    print("  Accuracy: {0:.2f}".format(avg_val_accuracy))

    # Calculate the average loss over all of the batches.
    avg_val_loss = total_eval_loss / len(validation_dataloader)

    # Measure how long the validation run took.
    validation_time = format_time(time.time() - t0)

    print("  Validation Loss: {0:.2f}".format(avg_val_loss))
    print("  Validation took: {:}".format(validation_time))

    # Record all statistics from this epoch.
    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Valid. Accur.': avg_val_accuracy,
            'Training Time': training_time,
            'Validation Time': validation_time
        }
    )

print("")
print("Training complete!")

print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))


Training...
