# Fake News Detector
Notebook para entrenar un modelo LLM en español para detectar si una noticia es una *fake new*.

Se ha usado el siguiente dataset de [Kaggle](https://www.kaggle.com/datasets/javieroterovizoso/spanish-political-fake-news?resource=download&select=D57000_complete.csv)

## Librerías
Instalamos las librerías necesarias

In [8]:
%pip install torch transformers pandas matplotlib datasets accelerate numpy scikit-learn



Importamos las librerías

In [2]:
from transformers import (TrainingArguments, Trainer, AutoTokenizer,
                          AutoModelForSequenceClassification,
                          DataCollatorWithPadding)

from sklearn.model_selection import train_test_split
from datasets import Dataset
from sklearn.metrics import (
    precision_score,
    accuracy_score,
    recall_score,
    f1_score)

import pandas as pd
import numpy as np
import torch
import json
import re

# Funciones relacionados con los datasets
Se han definido tres funciones para trabajar con el dataset:
- *processing_text* -> separar signos de puntuación de las palabras.
- *stats_dataset* -> función que imprime diferentes estadísticas del dataset
- *split_dataset_train_dev_test* -> función para generar tres subconjuntos a partir del dataset. Se ha seguido por una división 80-10-10

In [3]:
def processing_text(text: str):
    punctuation = r'[,;.:¡!¿?@#$%&[\](){}<>~=+\-*/|\\_^`"\'“”º]'
    text = re.sub(punctuation, ' ', text)
    text = re.sub(' +', ' ', text)
    return text

In [4]:
def stats_dataset(df: pd.DataFrame):
    df['TOTAL_WORDS_TITULO'] = df['Titulo'].str.count(' ') + 1
    df['TOTAL_WORDS_DESCP'] = df['Descripcion'].str.count(' ') + 1
    df['TOTAL_WORDS_TITULO_DESCP'] = df['title_descp'].str.count(' ') + 1

    print(f"COUNT FAKE NEWS: {len(df[df['Label'] == 1])}")
    print(f"COUNT REAL NEWS: {len(df[df['Label'] == 0])}")
    print(f"MEAN_WORDS_TITULO: {df.loc[:, 'TOTAL_WORDS_TITULO'].mean()}")
    print(f"MEAN_WORDS_DESCP: {df.loc[:, 'TOTAL_WORDS_DESCP'].mean()}")
    print(f"MEAN_WORDS_TITULO_DESCP: {df.loc[:, 'TOTAL_WORDS_TITULO_DESCP'].mean()}")

In [5]:
def split_dataset_train_dev_test(df: pd.DataFrame):
    X_train, X_test, y_train, y_test = train_test_split(df, df['Label'],
                                                        test_size=0.2, random_state=42, stratify=df['Label'])

    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train,
                                                      test_size=0.25, random_state=42, stratify=y_train)

    return X_train, X_val, X_val

In [7]:
    path_dataset = './dataset/D57000_complete.csv' # descargar y añadir el dataset en el path
    df = pd.read_csv(path_dataset, sep=';')
    df = df.sample(n=3000) # get 3000 texts bc Google colab is too slow
    df['title_descp'] = df['Titulo'] + ' ' + df['Descripcion']
    df['title_descp'] = df['title_descp'].apply(lambda x: processing_text(x))
    stats_dataset(df)

    train_df, val_df, test_df = split_dataset_train_dev_test(df)

COUNT FAKE NEWS: 1688
COUNT REAL NEWS: 1312
MEAN_WORDS_TITULO: 16.105666666666668
MEAN_WORDS_DESCP: 37.87566666666667
MEAN_WORDS_TITULO_DESCP: 55.133


# Funciones relacionadas con el entrenamiento
En este caso contemplamos dos funciones:
- tokenize: se encarga de tokenizar con el tokenizador del modelo cada uno de los textos
- train: función para entrenar el modelo
- evaluate: función para evaluar el modelo durante la fase de entrenamiento y testeo


In [8]:
def tokenize(dt: Dataset, tokenizer: AutoTokenizer):
    params = {'add_special_tokens': True,
              'truncation': True,
              'padding': 'max_length',
              'return_token_type_ids': True}

    return tokenizer(dt['title_descp'], **params)

In [18]:
def train(train_df: pd.DataFrame, val_df: pd.DataFrame, model: AutoModelForSequenceClassification,
          tokenizer: AutoTokenizer, lr=3e-5, epochs=10,
          batch_size=16, save_model='../modelos/fine-tuning',
          name_model='fake_news_detector'):

    train_dt = Dataset.from_pandas(train_df).rename_column("Label", "labels")
    val_dt = Dataset.from_pandas(val_df).rename_column("Label", "labels")

    train_dt = train_dt.map(lambda dt: tokenize(dt, tokenizer), batched=True)
    val_dt = val_dt.map(lambda dt: tokenize(dt, tokenizer), batched=True)

    data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding="max_length")

    training_args = TrainingArguments(
        output_dir=save_model,  # "HF repository or path model"
        learning_rate=lr,  # 1e-3
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=epochs,
        weight_decay=0.01,
        evaluation_strategy="steps",
        save_strategy="no",
        load_best_model_at_end=False,
        optim="adamw_torch",
        seed=42,
        logging_dir=f"./{save_model}/logs/{name_model}",
        logging_strategy="steps",
        eval_steps=100,
        report_to=["tensorboard"],
        use_cpu=False,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        tokenizer=tokenizer,
        train_dataset=train_dt,
        eval_dataset=val_dt,
        data_collator=data_collator,
        compute_metrics=evaluate
    )

    trainer.args._n_gpu = 1  # set one gpu
    torch.cuda.empty_cache()

    print("Starting the training process...")
    train_results = trainer.train()
    print("training process finished...")

    with open(save_model + f"evaluation_metrics.json", 'w') as file:
        json.dump(trainer.state.log_history, file, indent=4)

    # save training metrics
    metrics = train_results.metrics
    trainer.log_metrics("all", metrics)
    trainer.save_metrics("all", metrics)


    print("Saving model...")
    trainer.save_model(save_model)
    trainer.model.roberta.config.to_json_file(save_model + '/config.json')

In [9]:
def evaluate(preds):
    logits, tags = preds
    preds = [y_pred.argmax().item() for y_pred in logits]  # longSeq

    tags = np.array(tags)
    preds = np.array(preds)

    # matrix
    tp = ((tags == 1) & (preds == 1)).sum()
    tn = ((tags == 0) & (preds == 0)).sum()
    fp = ((tags == 0) & (preds == 1)).sum()
    fn = ((tags == 1) & (preds == 0)).sum()

    matrix = {'tp': tp, 'fn': fn, 'fp': fp, 'tn': tn}
    metrics = {'accuracy': accuracy_score(tags, preds),
               'precision': precision_score(tags, preds),
               'recall': recall_score(tags, preds),
               'f1': f1_score(tags, preds)
              }

    result = {'matrix': matrix, 'metrics': metrics}

    print(result)

    return result

# Entrenamiento
Se define el modelo y se inicia el proceso de entrenamiento

In [19]:
  model_repo = "PlanTL-GOB-ES/roberta-base-bne"
  tokenizer = AutoTokenizer.from_pretrained(model_repo)
  model = AutoModelForSequenceClassification.from_pretrained(model_repo)
  train(train_df, val_df, model, tokenizer, epochs=3, batch_size=16, save_model='./trained')

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at PlanTL-GOB-ES/roberta-base-bne and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/1800 [00:00<?, ? examples/s]

Map:   0%|          | 0/600 [00:00<?, ? examples/s]



Starting the training process...


Step,Training Loss,Validation Loss,Matrix,Metrics
100,No log,0.172847,"{'tp': 320, 'fn': 24, 'fp': 14, 'tn': 242}","{'accuracy': 0.9366666666666666, 'precision': 0.9580838323353293, 'recall': 0.9302325581395349, 'f1': 0.943952802359882}"
200,No log,0.137984,"{'tp': 328, 'fn': 16, 'fp': 12, 'tn': 244}","{'accuracy': 0.9533333333333334, 'precision': 0.9647058823529412, 'recall': 0.9534883720930233, 'f1': 0.9590643274853802}"
300,No log,0.153472,"{'tp': 328, 'fn': 16, 'fp': 15, 'tn': 241}","{'accuracy': 0.9483333333333334, 'precision': 0.956268221574344, 'recall': 0.9534883720930233, 'f1': 0.9548762736535662}"


Trainer is attempting to log a value of "{'tp': 320, 'fn': 24, 'fp': 14, 'tn': 242}" of type <class 'dict'> for key "eval/matrix" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'accuracy': 0.9366666666666666, 'precision': 0.9580838323353293, 'recall': 0.9302325581395349, 'f1': 0.943952802359882}" of type <class 'dict'> for key "eval/metrics" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.


{'matrix': {'tp': 320, 'fn': 24, 'fp': 14, 'tn': 242}, 'metrics': {'accuracy': 0.9366666666666666, 'precision': 0.9580838323353293, 'recall': 0.9302325581395349, 'f1': 0.943952802359882}}


Trainer is attempting to log a value of "{'tp': 328, 'fn': 16, 'fp': 12, 'tn': 244}" of type <class 'dict'> for key "eval/matrix" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'accuracy': 0.9533333333333334, 'precision': 0.9647058823529412, 'recall': 0.9534883720930233, 'f1': 0.9590643274853802}" of type <class 'dict'> for key "eval/metrics" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.


{'matrix': {'tp': 328, 'fn': 16, 'fp': 12, 'tn': 244}, 'metrics': {'accuracy': 0.9533333333333334, 'precision': 0.9647058823529412, 'recall': 0.9534883720930233, 'f1': 0.9590643274853802}}


Trainer is attempting to log a value of "{'tp': 328, 'fn': 16, 'fp': 15, 'tn': 241}" of type <class 'dict'> for key "eval/matrix" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'accuracy': 0.9483333333333334, 'precision': 0.956268221574344, 'recall': 0.9534883720930233, 'f1': 0.9548762736535662}" of type <class 'dict'> for key "eval/metrics" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.


{'matrix': {'tp': 328, 'fn': 16, 'fp': 15, 'tn': 241}, 'metrics': {'accuracy': 0.9483333333333334, 'precision': 0.956268221574344, 'recall': 0.9534883720930233, 'f1': 0.9548762736535662}}
training process finished...
***** all metrics *****
  epoch                    =        3.0
  total_flos               =  1323222GF
  train_loss               =     0.1525
  train_runtime            = 0:08:28.05
  train_samples_per_second =     10.629
  train_steps_per_second   =      0.667
Saving model...


# Testing
Desarrollamos una función para testear el modelo entrenado

In [10]:
def test(df: pd.DataFrame, trained_model_path: str):
    tokenizer = AutoTokenizer.from_pretrained(trained_model_path)
    model = AutoModelForSequenceClassification.from_pretrained(trained_model_path)
    tok_params = {'add_special_tokens': True,
                  'truncation': True,
                  'padding': 'max_length',
                  'return_token_type_ids': True}

    preds, tags = [], []
    for idx, row in df.iterrows():
        text = row['Titulo'] + ' ' + row['Descripcion']
        text = processing_text(text)
        inputs = tokenizer(text, **tok_params)

        params = {'input_ids': torch.tensor([inputs["input_ids"]]),
                  'attention_mask': torch.tensor([inputs["attention_mask"]]),
                  'token_type_ids': torch.tensor([inputs["token_type_ids"]])}

        outputs = model(**params)

        pred = outputs.logits.argmax().item()
        preds.append(int(pred))
        tags.append(row['Label'])

    metrics = {
        'accuracy': accuracy_score(tags, preds),
        'precision': precision_score(tags, preds),
        'recall': recall_score(tags, preds),
        'f1': f1_score(tags, preds)
    }
    print(metrics)

In [12]:
test(test_df, './trained')

{'accuracy': 0.9416666666666667, 'precision': 0.9520958083832335, 'recall': 0.9436201780415431, 'f1': 0.9478390461997019}
