In [None]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import layers, models
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from google.colab import drive
# Mount Google Drive at /content/drive (Colab-only helper); the dataset
# loaded below is read from a path under this mount point.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Read the phishing dataset CSV from the mounted Drive folder
DATASET_PATH = "/content/drive/MyDrive/Colab Notebooks/Dataset 2 v0/phishing-dataset-variation.csv"
url_dataset = pd.read_csv(DATASET_PATH)
url_dataset  # bare last expression -> notebook renders the frame

Unnamed: 0,qty_dot_url,qty_hyphen_url,qty_underline_url,qty_slash_url,qty_questionmark_url,qty_equal_url,qty_at_url,qty_and_url,qty_exclamation_url,qty_space_url,...,qty_ip_resolved,qty_nameservers,qty_mx_servers,ttl_hostname,tls_ssl_certificate,qty_redirects,url_google_index,domain_google_index,url_shortened,phishing
0,3,0,0,1,0,0,0,0,0,0,...,1,2,0,892,0,0,0,0,0,1
1,5,0,1,3,0,3,0,2,0,0,...,1,2,1,9540,1,0,0,0,0,1
2,2,0,0,1,0,0,0,0,0,0,...,1,2,3,589,1,0,0,0,0,0
3,4,0,2,5,0,0,0,0,0,0,...,1,2,0,292,1,0,0,0,0,1
4,2,0,0,0,0,0,0,0,0,0,...,1,2,1,3597,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
88642,3,1,0,0,0,0,0,0,0,0,...,1,3,1,3597,0,0,0,0,0,0
88643,2,0,0,0,0,0,0,0,0,0,...,1,2,2,591,0,2,0,0,0,0
88644,2,1,0,5,0,0,0,0,0,0,...,1,2,5,14391,1,0,0,0,0,1
88645,2,0,0,1,0,0,0,0,0,0,...,1,1,1,52,1,0,0,0,0,1


In [None]:
# Fill missing cells with an empty string ("" - not whitespace)
url_dataset = url_dataset.fillna(value="")

In [None]:
# Build one text "document" per row from the feature columns and extract the labels.
last_column_index = -1  # Assuming the last column is the target feature

# Select every column except the target by position, so that the features and
# the labels are both driven by last_column_index.
is_feature_column = np.ones(url_dataset.shape[1], dtype=bool)
is_feature_column[last_column_index] = False
feature_frame = url_dataset.iloc[:, is_feature_column]

# Join the values with a space. Without a separator adjacent values fuse into
# one ambiguous run (e.g. "1","23" and "12","3" both become "123"), and a
# tokenizer that splits on whitespace cannot recover the individual features.
all_urls = feature_frame.apply(
    lambda row: ' '.join(row.dropna().astype(str)), axis=1
)

# Column vector of shape (n_rows, 1) holding the target values.
labels = url_dataset.iloc[:, last_column_index].to_numpy().reshape(-1, 1)

In [None]:
# Encode each row's text as integer token ids, then pad to a common length
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts=all_urls)  # learn the vocabulary
sequences = tokenizer.texts_to_sequences(texts=all_urls)  # text -> lists of token ids
padded_sequences = pad_sequences(sequences=sequences)  # 2-D array, one row per text

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    padded_sequences,
    labels,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Pick a tf.distribute strategy: TPU if one resolves, otherwise GPU, otherwise the default
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()  # TPU detection
    print('Running on TPU ', tpu.master())
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.TPUStrategy(tpu)
    print("Using TPU.")
except ValueError:
    # A ValueError here is treated as "no TPU available": fall back to GPU, then CPU
    tpu = None
    gpus = tf.config.list_physical_devices('GPU')
    if not gpus:
        # Neither TPU nor GPU: keep whatever strategy is currently the default
        strategy = tf.distribute.get_strategy()
        print("Using CPU.")
    else:
        strategy = tf.distribute.MirroredStrategy()
        print("Using GPU.")

print("REPLICAS: ", strategy.num_replicas_in_sync)

Running on TPU  grpc://10.75.157.2:8470
Using TPU.
REPLICAS:  8


In [None]:
with strategy.scope():
    # Build the neural network model inside the strategy scope so that its
    # variables are created under the selected distribution strategy.
    model = models.Sequential()
    # input_dim is vocabulary size + 1 because Tokenizer word indices start at 1
    # (index 0 is the padding value); input_length is the padded sequence width.
    model.add(layers.Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=16, input_length=padded_sequences.shape[1]))
    model.add(layers.Flatten())
    model.add(layers.Dense(8, activation='relu'))
    model.add(layers.Dense(1, activation='sigmoid'))  # single probability output

    # Use the native Keras optimizer rather than the legacy
    # tf.compat.v1.train.AdamOptimizer, which is a TF1 compatibility shim.
    optimizer = tf.keras.optimizers.Adam()

    # Compile the model for binary classification
    model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

    # Train on the training split using Keras' default fit settings
    model.fit(X_train, y_train)



In [None]:
# Score the test set and threshold the sigmoid outputs at 0.5 to get 0/1 labels
y_pred = (model.predict(X_test) > 0.5).astype(int)



In [None]:
# Compute each classification metric on the held-out predictions
accuracy, precision, recall, f1 = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

In [None]:
# Report the metrics, one per line (accuracy as a percentage, the rest as raw scores)
print(
    f"\nTest Accuracy: {accuracy * 100:.2f}%",
    f"Precision: {precision}",
    f"Recall: {recall}",
    f"F1-Score: {f1}",
    sep="\n",
)


Test Accuracy: 95.54%
Precision: 0.9195148842337376
Recall: 0.954233409610984
F1-Score: 0.9365524985962942
