<a href="https://colab.research.google.com/github/caesarcc/python-tcc-url-fakenews-check/blob/main/jupyter/bertimbau_fakebr_treino.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Abrir no Colab"/></a>

In [None]:
# Install the dependency that is not part of the default environment,
# in case the notebook is run directly on Colab.
!pip install -q transformers

In [None]:
# Imports
import pandas as pd
import torch
import numpy as np
from timeit import default_timer as timer
from sklearn.metrics import cohen_kappa_score, accuracy_score
from transformers import (BertTokenizer,
                          BertForSequenceClassification,
                          TrainingArguments,
                          Trainer)
import os
import re

%matplotlib inline
import matplotlib.pyplot as plt
from IPython.display import display

### Baixar os dados do [Fake.br-Corpus](https://github.com/roneysco/Fake.br-Corpus)

*Atenção:* Para rodar localmente, é preciso salvar o pacote Fake.br-Corpus no diretório do projeto.

In [None]:
# Clone the Fake.br-Corpus repository into the current working directory.
!git clone https://github.com/roneysco/Fake.br-Corpus

### Carregando os dados

Vou utilizar a amostra "size_normalized_texts" do Corpus, dado que ela ajuda a evitar vieses no treino, conforme informação disponível na documentação. Cada par de textos (verdadeiro e falso) possui a mesma quantidade de palavras, pois o texto maior é truncado para o tamanho do menor.

In [None]:
# Directory (relative to the working directory) of the size-normalized texts
# inside the cloned corpus; it is joined with "true" and "fake" below.
DADOS_CAMINHO = "./Fake.br-Corpus/size_normalized_texts"

def load_txts(path):
    """Read every numbered text file in ``path`` and return their contents.

    Files are ordered by the integer prefix of their name, so ``2.txt`` comes
    before ``10.txt``. Entries whose name does not start with a digit (for
    example ``.DS_Store``) and sub-directories are skipped instead of raising.

    Args:
        path: Directory that contains the text files.

    Returns:
        list[str]: The file contents, sorted by numeric file-name prefix.
    """
    numbered = []
    for filename in os.listdir(path):
        prefix = re.match("[0-9]+", filename)
        if prefix is None or not os.path.isfile(os.path.join(path, filename)):
            continue
        numbered.append((int(prefix.group()), filename))

    txts = []
    # Sorting on (number, name) keeps the order deterministic even when two
    # files share the same numeric prefix.
    for _, filename in sorted(numbered):
        # Decode explicitly as UTF-8 instead of relying on the platform default,
        # which differs between operating systems.
        with open(os.path.join(path, filename), encoding="utf-8") as f:
            txts.append(f.read())
    return txts


true_txts = load_txts(os.path.join(DADOS_CAMINHO, "true"))
fake_txts = load_txts(os.path.join(DADOS_CAMINHO, "fake"))

# Sanity check: both classes must contribute the same number of texts.
assert len(true_txts) == len(fake_txts)

# Label convention: 0 = true news, 1 = fake news.
# The text column is named "text" because the exploration cells and the
# dataset class all read data["text"] / df["text"].
# sample(frac=1) shuffles the rows; the fixed seed keeps the order reproducible.
data = pd.DataFrame(
    [{"text": t, "label": 0} for t in true_txts]
    + [{"text": f, "label": 1} for f in fake_txts]
).sample(frac=1, random_state=42)

pd.set_option('max_colwidth', 200)

display(data)

### Explorando os dados

In [None]:
# Print one randomly drawn row: its "text" value followed by its "label" value.
sample = data.sample(n=1)
text_value = sample["text"].values[0]
label_value = sample["label"].values[0]
print(f"> TEXT: \n{text_value}")
print(f"> LABEL: {label_value}")

In [None]:
def count_words(texts):
    """Count word and punctuation tokens in each text.

    A token is either a run of word characters/apostrophes or a single
    punctuation mark from ``. , ! ? ; : / "``.

    Args:
        texts: Iterable of strings.

    Returns:
        numpy.ndarray: One token count per input text, in input order.
    """
    token_pattern = re.compile(r"[\w']+|[.,!?;:/\"]")
    return np.array([len(token_pattern.findall(text)) for text in texts])


counts = count_words(data["text"].values)
intervals = list(range(0, 601, 50))

# Apply the global font size before the axes are created.
plt.rcParams.update({'font.size': 14})
fig, ax = plt.subplots(figsize=(8, 6))

# One bar per 50-token interval from 0 to 600.
ax.hist(counts, bins=intervals, rwidth=0.9, edgecolor='black', linewidth=1)
ax.set_xticks(intervals)

ax.set_xlabel("Number of words and punctuation", fontsize=14, fontweight="bold")
ax.set_ylabel("Number of texts", fontsize=14, fontweight="bold");

### Separando treino, teste e validação

In [None]:
# Fractions for the train, validation and test partitions, in that order.
DATA_SPLIT_PC = [0.75, 0.125, 0.125]
# Compare with a tolerance: fractions that are not exactly representable in
# binary floating point would fail an exact `== 1` check.
assert np.isclose(sum(DATA_SPLIT_PC), 1.0)

# Fixed seed so the same rows land in each partition on every run.
SPLIT_SEED = 42

# DataFrame.sample draws rows by position, so the rows are put in index order
# first; the split then does not depend on how `data` happens to be shuffled.
ordered = data.sort_index()
train_df = ordered.sample(frac=DATA_SPLIT_PC[0], random_state=SPLIT_SEED)
remaining = ordered.drop(train_df.index)
val_df = remaining.sample(n=int(len(data) * DATA_SPLIT_PC[1]), random_state=SPLIT_SEED)
# Everything that is neither train nor validation becomes the test partition.
test_df = remaining.drop(val_df.index)

# The three partitions must cover the data set without overlap.
assert len(train_df) + len(val_df) + len(test_df) == len(data)

print("> Training samples: %d" % len(train_df))
print("> Validation samples: %d" % len(val_df))
print("> Test samples: %d" % len(test_df))

### Ajustar para o dataset ser compatível com BERT

In [None]:
# Tokenizer of the cased Portuguese BERT checkpoint; input casing is preserved.
bert_tokenizer = BertTokenizer.from_pretrained(
    "neuralmind/bert-base-portuguese-cased",
    do_lower_case=False,
)

In [None]:
class BertFakeNewsDataset(torch.utils.data.Dataset):
    """Torch dataset that tokenizes a data frame of texts once, up front.

    Every row of ``df["text"]`` is encoded with ``tokenizer.encode_plus`` and
    padded or truncated to ``max_seq_length`` tokens. ``df["label"]`` becomes
    a ``torch.long`` tensor.

    Args:
        df: Data frame with a "text" column and a "label" column.
        tokenizer: Object exposing ``encode_plus``; defaults to the
            module-level ``bert_tokenizer``.
        max_seq_length: Length every encoded sequence is padded/truncated to.
    """

    def __init__(self, df, tokenizer=bert_tokenizer, max_seq_length=512):
        self._labels = torch.tensor(df["label"].values, dtype=torch.long)
        # One list of per-example tensors for each model input.
        self._encodings = {
            field: []
            for field in ("input_ids", "token_type_ids", "attention_mask")
        }

        for document in df["text"].values:
            encoded = tokenizer.encode_plus(
                text=document,
                add_special_tokens=True,
                max_length=max_seq_length,
                truncation=True,
                padding="max_length",
                return_token_type_ids=True,
                return_attention_mask=True,
                return_tensors="pt",
            )
            for field, batch in encoded.items():
                # Keep only the first entry of each returned tensor.
                self._encodings[field].append(batch[0])

    def __len__(self):
        """Return the number of examples."""
        return len(self._labels)

    def __getitem__(self, idx):
        """Return the encoded inputs of example ``idx`` plus its "labels" entry."""
        item = {field: tensors[idx] for field, tensors in self._encodings.items()}
        item["labels"] = self._labels[idx]
        return item

In [None]:
# Tokenize each partition (train, validation, test) in that order.
train_ds, val_ds, test_ds = (
    BertFakeNewsDataset(split) for split in (train_df, val_df, test_df)
)

In [None]:
# Display the first encoded training example (model inputs plus "labels").
train_ds[0]

### Carregando o modelo BERT

In [None]:
# Same checkpoint as the tokenizer, with a two-class classification head
# (label 0 and label 1).
model = BertForSequenceClassification.from_pretrained(
    "neuralmind/bert-base-portuguese-cased", num_labels=2
)

In [None]:
# Smoke test: push one training example through the model before fine-tuning.
# unsqueeze(0) adds a leading batch axis of size 1 to every tensor.
sample = {k: torch.unsqueeze(v, 0)
          for k, v in train_ds[0].items()}

# Gradients are not needed for this check, and the timer is stopped before
# printing so that formatting the output is not counted in the measured time.
start_time = timer()
with torch.no_grad():
    output = model(**sample)
elapsed = timer() - start_time

print(f"Output: {output}")
print(f"Time: {elapsed:.5f}s")

### Treinando o modelo

In [None]:
def compute_metrics(eval_pred):
    """Compute accuracy and quadratic-weighted kappa for an evaluation pass.

    Args:
        eval_pred: Pair ``(predictions, labels)``; the predicted class is the
            argmax of ``predictions`` along axis 1.

    Returns:
        dict: ``{"accuracy": ..., "qwk": ...}``.
    """
    scores, gold = eval_pred
    predicted = np.argmax(scores, axis=1)
    accuracy = accuracy_score(gold, predicted)
    kappa = cohen_kappa_score(gold, predicted, weights="quadratic")
    return {"accuracy": accuracy, "qwk": kappa}

In [None]:
train_args = TrainingArguments(
    output_dir="drive/MyDrive/Colab Notebooks/saved_models/fake_news_bertimbau",
    overwrite_output_dir=True,
    # NOTE(review): newer transformers releases name this argument
    # `eval_strategy` — confirm against the installed version.
    evaluation_strategy="epoch",
    learning_rate=2.5e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.01,
    # Restore the checkpoint with the best validation accuracy after training.
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    # load_best_model_at_end requires the save strategy to match the evaluation
    # strategy; a mismatch ("epoch" vs "steps") raises ValueError when the
    # arguments are constructed. Checkpoints are therefore written per epoch.
    save_strategy="epoch",
    # At most this many checkpoints are kept in output_dir; lower it if disk
    # space is limited, since one checkpoint is now written per epoch.
    save_total_limit=10,
)

trainer = Trainer(
    model=model,
    args=train_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    tokenizer=bert_tokenizer,
    compute_metrics=compute_metrics
)

In [None]:
# Run fine-tuning with the model, datasets and arguments held by `trainer`.
trainer.train()

### Validando o modelo

In [None]:
# Metrics on the training partition (data the model was fitted on).
trainer.evaluate(train_ds)

In [None]:
# Metrics on the validation partition (the one passed to Trainer as eval_dataset).
trainer.evaluate(val_ds)

In [None]:
# Metrics on the held-out test partition.
trainer.evaluate(test_ds)

### Salvando o melhor modelo

In [None]:
# Save `model` under the training output directory.
# NOTE(review): with load_best_model_at_end=True the trainer is expected to have
# restored the best checkpoint into `model` by this point — confirm.
model.save_pretrained("drive/MyDrive/Colab Notebooks/saved_models/fake_news_bertimbau/best_model")