In [None]:
import kagglehub

# Fetch the most recent version of the dataset; `path` is the local directory
# where kagglehub stored the downloaded files
dataset_handle = "shashwatwork/web-page-phishing-detection-dataset"
path = kagglehub.dataset_download(dataset_handle)

print("Path to dataset files:", path)

Path to dataset files: /root/.cache/kagglehub/datasets/shashwatwork/web-page-phishing-detection-dataset/versions/2


In [None]:
import os

import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout, Input
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer


In [None]:
# Load the dataset from the directory returned by kagglehub, so the cell does
# not depend on a hard-coded cache location or dataset version number
csv_path = os.path.join(path, 'dataset_phishing.csv')
raw_df = pd.read_csv(csv_path, encoding='latin1')

# Keep only the relevant columns and drop rows with a missing URL or label
df = raw_df[['url', 'status']].dropna().copy()

# Encode the target as integers: legitimate -> 0, phishing -> 1.
# Fail loudly on any other label instead of letting .map() turn it into NaN.
label_map = {'legitimate': 0, 'phishing': 1}
unexpected_labels = set(df['status'].unique()) - set(label_map)
if unexpected_labels:
    raise ValueError(f"Unexpected status values: {sorted(map(str, unexpected_labels))}")
df['status'] = df['status'].map(label_map)


In [None]:
# Model inputs (raw URL strings) and targets (0/1 labels)
urls = df['url'].values
labels = df['status'].values

# Character-level tokenization: every character of a URL becomes one integer token
tokenizer = Tokenizer(char_level=True)
tokenizer.fit_on_texts(urls)
sequences = tokenizer.texts_to_sequences(urls)

# No maxlen is given, so every sequence is padded (at the end) up to the length
# of the longest URL in the dataset
padded_sequences = pad_sequences(sequences, truncating='post', padding='post')

# Two successive 80/20 splits with the same seed:
# first (train + validation) vs. test, then train vs. validation
split_options = dict(test_size=0.2, random_state=42)
X_trainval, X_test, y_trainval, y_test = train_test_split(padded_sequences, labels, **split_options)
X_train, X_val, y_train, y_val = train_test_split(X_trainval, y_trainval, **split_options)


In [None]:
# Model hyperparameters
vocab_size = len(tokenizer.word_index) + 1  # +1: tokenizer indices start at 1, 0 is the padding value
embedding_dim = 50  # Size of each character embedding vector
max_length = X_train.shape[1]  # Padded sequence length, taken from the training matrix

# Build the model: character embeddings -> 1D convolution -> global max pooling
# -> dense classifier head.
# The input shape is declared with an explicit Input layer (the `input_length`
# argument of Embedding is deprecated), so the model is built immediately and
# model.summary() can report output shapes and parameter counts.
model = Sequential([
    Input(shape=(max_length,), dtype='int32'),
    Embedding(input_dim=vocab_size, output_dim=embedding_dim),
    Conv1D(filters=128, kernel_size=5, activation='relu'),
    GlobalMaxPooling1D(),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')  # Binary output: probability of phishing
])

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

model.summary()


In [None]:
# Stop training once the validation loss has not improved for 3 consecutive
# epochs, and roll the model back to the weights of the best epoch
early_stopping = EarlyStopping(
    monitor='val_loss',
    restore_best_weights=True,
    patience=3,
)

# Fit on the training split, evaluating on the validation split after each epoch
fit_options = dict(
    validation_data=(X_val, y_val),
    epochs=15,
    batch_size=32,
    callbacks=[early_stopping],
    verbose=1,
)
history = model.fit(X_train, y_train, **fit_options)


Epoch 1/15
[1m229/229[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 347ms/step - accuracy: 0.7012 - loss: 0.5827 - val_accuracy: 0.8338 - val_loss: 0.3678
Epoch 2/15
[1m229/229[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m79s[0m 337ms/step - accuracy: 0.8459 - loss: 0.3571 - val_accuracy: 0.8808 - val_loss: 0.2758
Epoch 3/15
[1m229/229[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m78s[0m 342ms/step - accuracy: 0.8976 - loss: 0.2625 - val_accuracy: 0.8939 - val_loss: 0.2489
Epoch 4/15
[1m229/229[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 340ms/step - accuracy: 0.9220 - loss: 0.2032 - val_accuracy: 0.9103 - val_loss: 0.2087
Epoch 5/15
[1m229/229[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 353ms/step - accuracy: 0.9511 - loss: 0.1451 - val_accuracy: 0.9185 - val_loss: 0.1995
Epoch 6/15
[1m229/229[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m77s[0m 338ms/step - accuracy: 0.9580 - loss: 0.1207 - val_accuracy: 0.9136 - val_loss: 0.2116
Epoch 7/15

In [None]:
# Score the trained model on the held-out test split;
# evaluate() returns the loss followed by the compiled metrics (accuracy)
test_metrics = model.evaluate(X_test, y_test)
loss, accuracy = test_metrics
for report_line in (f"Test Loss: {loss:.4f}", f"Test Accuracy: {accuracy:.4f}"):
    print(report_line)


[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.9298 - loss: 0.1873
Test Loss: 0.2033
Test Accuracy: 0.9221


In [None]:
# Sample URLs to score
test_urls = ["https://www.southbankmosaics.com", "https://grassform-my.sharepoint.com/:b:/g/personal/accounts_grassform_co_uk/EYsmX_DhwSVAlooW1ETKf50B..."]

# Apply the same preprocessing as for training. truncating='post' matches the
# training call: a URL longer than max_length keeps its beginning rather than
# its end (the pad_sequences default, 'pre', would cut the start of the URL).
test_sequences = tokenizer.texts_to_sequences(test_urls)
test_padded = pad_sequences(test_sequences, maxlen=max_length, padding='post', truncating='post')

# Predict: each row of `predictions` holds the sigmoid output for one URL
predictions = model.predict(test_padded)
for url, pred in zip(test_urls, predictions):
    print(f"URL: {url} - Probabilidade de phishing: {pred[0]:.4f}")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
URL: https://www.southbankmosaics.com - Probabilidade de phishing: 0.0300
URL: https://grassform-my.sharepoint.com/:b:/g/personal/accounts_grassform_co_uk/EYsmX_DhwSVAlooW1ETKf50B... - Probabilidade de phishing: 0.9788
