In [None]:
!pip install torch transformers scikit-learn pandas wandb




In [None]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("uciml/sms-spam-collection-dataset")

print("Path to dataset files:", path)


Path to dataset files: /root/.cache/kagglehub/datasets/uciml/sms-spam-collection-dataset/versions/1


In [None]:
!pip install wandb
import wandb

wandb.login()



[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [None]:
# Inicializar o W&B
wandb.init(
    project="distilbert-phishing-classifier",  # Nome do projeto
    name="experiment-1",                      # Nome do experimento
    config={                                  # Hiperparâmetros a registrar
        "model_name": "distilbert-base-uncased",
        "epochs": 3,
        "batch_size": 16,
        "learning_rate": 5e-5,
        "max_length": 128,
    },
)

# Acessar os hiperparâmetros como wandb.config["param"]
config = wandb.config


[34m[1mwandb[0m: Currently logged in as: [33mangelolimamiranda[0m ([33mangelolimamiranda-universidade-federal-do-rio-grande-do-[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [None]:
import pandas as pd

df = pd.read_csv('/root/.cache/kagglehub/datasets/uciml/sms-spam-collection-dataset/versions/1/spam.csv', encoding='latin1')

df.columns = ['label', 'text', 'Unnamed1', 'Unnamed2', 'Unnamed3']

df = df.drop(columns=['Unnamed1', 'Unnamed2', 'Unnamed3'])

print(df.head())



  label                                               text
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...


In [None]:
from sklearn.model_selection import train_test_split

texts = df['text']
labels = df['label'].map({'ham': 0, 'spam': 1})

train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts, labels, test_size=0.2, random_state=42
)


In [None]:
from transformers import DistilBertTokenizer

# Carregar o tokenizer do DistilBERT
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

# Tokenizar os textos
train_encodings = tokenizer(list(train_texts), truncation=True, padding=True, max_length=128)
val_encodings = tokenizer(list(val_texts), truncation=True, padding=True, max_length=128)


In [None]:
import torch

class SpamDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

# Criar datasets PyTorch
train_dataset = SpamDataset(train_encodings, train_labels.tolist())
val_dataset = SpamDataset(val_encodings, val_labels.tolist())


In [None]:
from transformers import DistilBertForSequenceClassification

# Carregar modelo pré-treinado
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from torch.utils.data import DataLoader
from transformers import AdamW

# Configurar DataLoaders
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)

# Configurar otimizador
optimizer = AdamW(model.parameters(), lr=5e-5)

# Enviar o modelo para GPU, se disponível
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)




DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


In [None]:
from tqdm import tqdm
from sklearn.metrics import accuracy_score, f1_score

for epoch in range(config["epochs"]):
    model.train()
    total_loss = 0
    all_labels = []
    all_preds = []

    for batch in tqdm(train_loader):
        inputs = {key: val.to(device) for key, val in batch.items()}

        optimizer.zero_grad()
        outputs = model(**inputs)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        preds = torch.argmax(outputs.logits, dim=1)
        all_labels.extend(inputs["labels"].cpu().tolist())
        all_preds.extend(preds.cpu().tolist())

    # Calcular métricas
    acc = accuracy_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds, average="weighted")

    # Registrar no W&B
    wandb.log({
        "epoch": epoch + 1,
        "loss_epoch": total_loss / len(train_loader),
        "accuracy": acc,
        "f1_score": f1,
    })

    print(f"Epoch {epoch + 1} - Loss: {total_loss / len(train_loader)}, Accuracy: {acc}, F1: {f1}")


100%|██████████| 279/279 [51:34<00:00, 11.09s/it]


Epoch 1 - Loss: 0.0731421991540987, Accuracy: 0.9789095804352703, F1: 0.9785984186500756


100%|██████████| 279/279 [50:57<00:00, 10.96s/it]


Epoch 2 - Loss: 0.023292417035690145, Accuracy: 0.994166479694862, F1: 0.9941456181953092


100%|██████████| 279/279 [50:46<00:00, 10.92s/it]

Epoch 3 - Loss: 0.007858070268568593, Accuracy: 0.9973076060130133, F1: 0.9973056959585955





In [None]:
# Avaliação no conjunto de validação
model.eval()
all_preds, all_labels = [], []

with torch.no_grad():
    for batch in val_loader:
        inputs = {key: val.to(device) for key, val in batch.items()}
        outputs = model(**inputs)
        preds = torch.argmax(outputs.logits, dim=1)

        all_preds.extend(preds.cpu().tolist())
        all_labels.extend(batch['labels'].tolist())

# Calcular métricas de validação
accuracy = accuracy_score(all_labels, all_preds)
f1 = f1_score(all_labels, all_preds)

# Registrar métricas de validação no W&B
wandb.log({
    "val_accuracy": accuracy,
    "val_f1_score": f1,
})
print(f"Validation Accuracy: {accuracy}, F1-Score: {f1}")


Validation Accuracy: 0.9946188340807175, F1-Score: 0.9798657718120806


In [None]:
from torch.nn.functional import softmax

# Avaliar com o modelo
model.eval()  # Colocar o modelo em modo de avaliação
with torch.no_grad():  # Desativar cálculo de gradientes
    outputs = model(**inputs)
    logits = outputs.logits  # Saída dos logits

# Calcular probabilidades
probs = softmax(logits, dim=1)
print("Probabilidades:", probs)

# Determinar classes preditas
predicted_classes = torch.argmax(probs, dim=1)
print("Classes preditas:", predicted_classes)

Probabilidades: tensor([[9.9985e-01, 1.5253e-04],
        [9.9984e-01, 1.6314e-04],
        [6.5765e-04, 9.9934e-01],
        [9.9983e-01, 1.7461e-04],
        [9.9985e-01, 1.5002e-04],
        [9.9984e-01, 1.5508e-04],
        [9.9984e-01, 1.6061e-04],
        [9.9985e-01, 1.4752e-04],
        [9.9985e-01, 1.4596e-04],
        [9.9985e-01, 1.5048e-04],
        [2.5658e-03, 9.9743e-01]])
Classes preditas: tensor([0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1])


In [None]:
# Salvar modelo
model.save_pretrained("./distilbert_phishing")
tokenizer.save_pretrained("./distilbert_phishing")

# Registrar modelo como artefato no W&B
artifact = wandb.Artifact("distilbert_phishing_model", type="model")
artifact.add_dir("./distilbert_phishing")
wandb.log_artifact(artifact)


[34m[1mwandb[0m: Adding directory to artifact (./distilbert_phishing)... Done. 6.7s


<Artifact distilbert_phishing_model>