<a href="https://colab.research.google.com/github/carlosprr29/ai-progetto-spagnoli/blob/main/notebooks/03_3_BERT_Training_Maestro.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

We have built a 'fusion' of WELFake and ISOT (after cleaning up the Reuters-specific biases) to see whether it yields the definitive model. If this is not enough, we will look for a third dataset; in any case, this whole trial-and-error process is valuable material to document in the project report.

In [None]:
# =================================================================
# FILE 03.3: TRAINING THE MASTER MODEL (FUSION)
# =================================================================

# 1. INSTALLATION AND LIBRARIES
!pip install -q transformers datasets torch scikit-learn
import pandas as pd
import numpy as np
import random
import torch
import re
import shutil
from google.colab import drive
from datasets import load_dataset
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [None]:
# 2. LOADING (WELFake + ISOT)
# Google Drive holds the two ISOT CSV files read further down.
drive.mount('/content/drive')

# --- WELFake: pulled from the Hugging Face Hub; keep only the three columns used later.
print("ðŸ“¦ 1/3: Loading WELFake...")
ds_welfake = load_dataset("davanstrien/WELFake")
df_w = (
    ds_welfake["train"]
    .to_pandas()
    .loc[:, ['title', 'text', 'label']]
    .dropna()
)

# --- ISOT: two CSV files, one per class.
print("ðŸ“¦ 2/3: Loading ISOT...")
# Adjust the paths if your files are in another folder.
path_true = '/content/drive/MyDrive/Project_IA/data/True.csv'
path_fake = '/content/drive/MyDrive/Project_IA/data/Fake.csv'

# ISOT rows are labelled here as 0 = real (True.csv) and 1 = fake (Fake.csv).
# NOTE(review): confirm that WELFake's `label` column follows the same convention
# before the two frames are merged.
df_t = pd.read_csv(path_true).assign(label=0)
df_f = pd.read_csv(path_fake).assign(label=1)

# Stack both classes and discard any row that has a missing value in any column.
df_i = pd.concat([df_t, df_f], axis=0).dropna(how='any')

# MASTER CLEAN-UP FUNCTION (FOR BOTH DATASETS)

# Leading dateline, anchored at the very start of the text. Two shapes are accepted:
#   1. optional short location + parenthesised agency + separator, e.g. "WASHINGTON (Reuters) - "
#   2. ALL-CAPS location/tag + separator, e.g. "NEW YORK - " or "BREAKING: "
# Every quantifier is bounded so an ordinary sentence that merely contains a hyphen
# or colon somewhere ("well-known", "https://...") is never truncated.
_DATELINE_RE = re.compile(
    r"^\s*(?:"
    r"[^\n():]{0,60}\([^()\n]{1,40}\)\s*[-:\u2013\u2014]"
    r"|"
    r"[A-Z][A-Z .,/'&-]{1,40}?(?:\s+[-\u2013\u2014](?=\s)|\s*:)"
    r")\s*"
)
_URL_RE = re.compile(r'https?://\S+|www\.\S+')
_MENTION_RE = re.compile(r'@\S+')
# Outlet names are replaced by a neutral placeholder so the model cannot key on brands.
_OUTLET_RE = re.compile(
    r'\b(Reuters|Breitbart|InfoWars|CNN|Fox News|BBC|Associated Press|AP)\b',
    re.IGNORECASE,
)
_READ_MORE_RE = re.compile(r'\bread more\b', re.IGNORECASE)
# A "Source: ..." credit only counts as a signature when it closes the text
# (at most 200 characters, no line break, up to the end of the string).
_TRAILING_SOURCE_RE = re.compile(r'\bsource\s*[:\-][^\n]{0,200}\s*\Z', re.IGNORECASE)


def clean_master(text):
    """Strip source-revealing artefacts from a news article body.

    Steps, applied in order:
      A. remove a leading dateline such as ``WASHINGTON (Reuters) - ``;
      B. remove URLs and ``@mentions``;
      C. replace known outlet names with ``'the news outlet'``;
      D. remove "read more" teasers and a trailing ``Source: ...`` credit.

    Args:
        text: Raw article text. Any non-string value (e.g. NaN) is accepted.

    Returns:
        The cleaned text with surrounding whitespace stripped, or ``""`` when
        ``text`` is not a string.
    """
    if not isinstance(text, str):
        return ""

    # A. Dateline at the very beginning only (bounded, see _DATELINE_RE).
    text = _DATELINE_RE.sub('', text, count=1)

    # B. Web noise: URLs and mentions.
    text = _URL_RE.sub('', text)
    text = _MENTION_RE.sub('', text)

    # C. Outlet names -> neutral placeholder.
    text = _OUTLET_RE.sub('the news outlet', text)

    # D. "Read more" teasers anywhere; source credit only when it ends the text.
    text = _READ_MORE_RE.sub('', text)
    text = _TRAILING_SOURCE_RE.sub('', text)

    return text.strip()

print("ðŸ§¹ 3/3: Applying Unified Master Cleaning...")
df_w['text'] = df_w['text'].apply(clean_master)
df_i['text'] = df_i['text'].apply(clean_master)

#  MERGE AND PREPARATION
print("ðŸ§ª Generating Merge and Ablation Study...")
# De-duplicate on the cleaned text BEFORE splitting: the two corpora may share
# articles (NOTE(review): overlap presumed, not measured here), and any article
# present twice could otherwise land in both train and test and inflate accuracy.
df_fusion = (
    pd.concat([df_w, df_i])
    .drop_duplicates(subset='text')
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Safety filter: remove news items that have been left almost empty after cleaning.
# `.copy()` makes the filtered frame independent, so adding a column below is a
# plain assignment rather than a write onto a slice of the unfiltered frame.
df_fusion = df_fusion[df_fusion['text'].str.len() > 100].copy()

# Combined column used by the "title + text" model variant.
df_fusion['total'] = df_fusion['title'] + " " + df_fusion['text']

# Stratified 80/20 split; the working subsets are then simple random draws from
# each side (capped at what is available so a small corpus does not raise).
train_df, test_df = train_test_split(df_fusion, test_size=0.2, stratify=df_fusion['label'], random_state=42)
train_sample = train_df.sample(n=min(8000, len(train_df)), random_state=42)  # 8k because the merged data is harder
test_sample = test_df.sample(n=min(2000, len(test_df)), random_state=42)

In [None]:
# 3. TRAINING FUNCTION
def train_master(x_train, x_test, y_train, y_test, name,
                 max_length=128, num_epochs=2, batch_size=16):
    """Fine-tune ``bert-base-uncased`` as a 2-class classifier and evaluate it each epoch.

    Args:
        x_train, x_test: Sequences of raw strings exposing ``.tolist()``
            (the callers pass pandas Series).
        y_train, y_test: Integer class labels aligned with ``x_train`` / ``x_test``,
            also exposing ``.tolist()``.
        name: Tag for this run; used in the progress message and in the
            ``./res_{name}`` output directory.
        max_length: Maximum number of tokens kept per example (longer inputs are truncated).
        num_epochs: Number of training epochs.
        batch_size: Per-device training batch size.

    Returns:
        ``(model, tokenizer)`` - the fine-tuned model and the tokenizer used to encode the data.
    """
    print(f"\nðŸš€ Training variant: {name}")
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    train_enc = tokenizer(x_train.tolist(), truncation=True, padding=True, max_length=max_length)
    test_enc = tokenizer(x_test.tolist(), truncation=True, padding=True, max_length=max_length)

    class DatasetTorch(torch.utils.data.Dataset):
        """Pairs the tokenizer output with its labels, one example per index."""

        def __init__(self, encodings, labels):
            self.encodings = encodings
            self.labels = labels

        def __getitem__(self, idx):
            item = {key: torch.tensor(values[idx]) for key, values in self.encodings.items()}
            item["labels"] = torch.tensor(self.labels[idx])
            return item

        def __len__(self):
            return len(self.labels)

    def compute_accuracy(eval_pred):
        """Accuracy of the arg-max class against the reference labels."""
        predicted = np.argmax(eval_pred.predictions, axis=1)
        return {"accuracy": accuracy_score(eval_pred.label_ids, predicted)}

    # Mixed precision needs CUDA; fall back to full precision when no GPU is attached
    # instead of crashing on a CPU-only runtime.
    use_gpu = torch.cuda.is_available()

    # No explicit `.to("cuda")`: Trainer moves the model to the training device itself.
    model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

    args = TrainingArguments(output_dir=f"./res_{name}", num_train_epochs=num_epochs,
                             per_device_train_batch_size=batch_size, fp16=use_gpu,
                             eval_strategy="epoch", save_strategy="no")

    trainer = Trainer(model=model, args=args,
                      train_dataset=DatasetTorch(train_enc, y_train.tolist()),
                      eval_dataset=DatasetTorch(test_enc, y_test.tolist()),
                      compute_metrics=compute_accuracy)
    trainer.train()
    return model, tokenizer

In [None]:
# 4. RUNNING THE ABLATION STUDY ON THE MERGED DATA
# MODEL A: titles only (WELFake + ISOT merge)
model_fus_tit, token_fus_tit = train_master(
    x_train=train_sample["title"],
    x_test=test_sample["title"],
    y_train=train_sample["label"],
    y_test=test_sample["label"],
    name="FUSION_TITLES",
)

# Write the weights and the tokenizer files into the same local folder.
model_fus_tit.save_pretrained('./model_fusion_titles')
token_fus_tit.save_pretrained('./model_fusion_titles')

# Mirror that folder to Google Drive.
import shutil
from google.colab import drive

drive.mount('/content/drive')
# dirs_exist_ok=True lets the copy succeed when this cell is executed again.
shutil.copytree(
    src='./model_fusion_titles',
    dst='/content/drive/MyDrive/Project_IA/model_fusion_titles',
    dirs_exist_ok=True,
)

In [None]:
# MODEL B: title + body text (the full, cleaned merge)
model_fus_full, token_fus_full = train_master(
    x_train=train_sample["total"],
    x_test=test_sample["total"],
    y_train=train_sample["label"],
    y_test=test_sample["label"],
    name="FUSION_FULL_TEXT",
)

# Write the weights and the tokenizer files into the same local folder.
model_fus_full.save_pretrained('./model_fusion_full')
token_fus_full.save_pretrained('./model_fusion_full')

In [None]:
# 5. Export the full-text model to Google Drive
import shutil
from google.colab import drive

drive.mount('/content/drive')
# Copy the locally saved folder into the project folder on Drive;
# dirs_exist_ok=True allows overwriting an earlier export.
shutil.copytree(
    src='./model_fusion_full',
    dst='/content/drive/MyDrive/Project_IA/model_fusion_full',
    dirs_exist_ok=True,
)

print("\nâœ… Fusion ablation study completed and models exported!")