In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras import layers, models
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

In [None]:
# Read the URL dataset CSV into a DataFrame.
# The path is relative to the directory the notebook kernel runs in.
DATASET_PATH = "../split_urls.csv"
url_dataset = pd.read_csv(DATASET_PATH)

In [None]:
# Fill missing values (NaN) with empty strings -- not whitespace -- so that
# string concatenation over the URL component columns does not yield NaN.
url_dataset = url_dataset.fillna(value="")

In [None]:
# Reassemble one URL string per row from its component columns, left to right.
# NOTE(review): path/query/fragment are appended without '?' or '#' separators --
# presumably the CSV columns already carry them; confirm against the data.
all_urls = url_dataset['protocol'] + "://"
for component in ('domain', 'path', 'query', 'fragment'):
    all_urls = all_urls + url_dataset[component]

# Binary target as a column vector of shape (n_samples, 1):
# 1 where category == 'Malicious', 0 for every other category value.
is_malicious = url_dataset['category'] == 'Malicious'
labels = np.array(is_malicious.astype(int)).reshape(-1, 1)

In [None]:
# Learn a token -> integer-id vocabulary from every URL (default Tokenizer
# settings), encode each URL as a list of ids, then pad the lists into one
# 2-D array. With no maxlen given, pad_sequences pads to the longest sequence.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts=all_urls)
sequences = tokenizer.texts_to_sequences(texts=all_urls)
padded_sequences = pad_sequences(sequences=sequences)

In [None]:
# Split the dataset into training and testing sets: 20% is held out for
# evaluation, and the fixed random_state keeps the split reproducible.
# (train_test_split is imported with the other dependencies in the top cell.)
X_train, X_test, y_train, y_test = train_test_split(
    padded_sequences,
    labels,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Choose a tf.distribute strategy for the best available accelerator:
# TPU first, then GPU(s), otherwise the default (CPU) strategy.
try:
    # TPU detection: the resolver is expected to raise ValueError when no TPU
    # is available, which routes execution to the GPU/CPU fallback below.
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.master())
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    # tf.distribute.TPUStrategy supersedes the deprecated
    # tf.distribute.experimental.TPUStrategy.
    strategy = tf.distribute.TPUStrategy(tpu)
    print("Using TPU.")
except ValueError:
    tpu = None
    # If TPU is not available, check for GPU
    gpus = tf.config.list_physical_devices('GPU')
    if gpus:
        # MirroredStrategy replicates the model across the visible GPUs.
        strategy = tf.distribute.MirroredStrategy()
        print("Using GPU.")
    else:
        # If neither TPU nor GPU is available, use default CPU strategy
        strategy = tf.distribute.get_strategy()
        print("Using CPU.")

# Number of model replicas that will process each global batch in parallel.
print("REPLICAS: ", strategy.num_replicas_in_sync)

In [None]:
# Training hyperparameters. The batch size is defined per replica and scaled by
# the replica count, so every device still receives PER_REPLICA_BATCH_SIZE
# samples per step on multi-GPU/TPU runs; on a single device the global batch
# size is unchanged (2).
EPOCHS = 10
PER_REPLICA_BATCH_SIZE = 2
global_batch_size = PER_REPLICA_BATCH_SIZE * strategy.num_replicas_in_sync

with strategy.scope():
    # Build the neural network model: embedding -> flatten -> dense -> sigmoid.
    model = models.Sequential()
    # input_dim is vocabulary size + 1 because Tokenizer ids start at 1 and
    # id 0 is the value pad_sequences uses for padding.
    model.add(layers.Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=16, input_length=padded_sequences.shape[1]))
    model.add(layers.Flatten())
    model.add(layers.Dense(8, activation='relu'))
    # Single sigmoid unit: the output is the predicted probability of label 1.
    model.add(layers.Dense(1, activation='sigmoid'))

    # Use the native Keras Adam optimizer rather than the TF1
    # tf.compat.v1.train.AdamOptimizer: Keras cannot store a TF1 optimizer's
    # state in the saved model file, and newer Keras releases do not accept
    # TF1 optimizers. The default learning rate (0.001) is the same.
    optimizer = tf.keras.optimizers.Adam()

    # Compile the model
    model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

    # Train on the training split only; the test split stays unseen until evaluation.
    model.fit(X_train, y_train, epochs=EPOCHS, batch_size=global_batch_size)

In [None]:
# Persist the trained model to disk; the .h5 suffix selects the HDF5 format.
MODEL_PATH = "Models/malicious_url_model.h5"
model.save(MODEL_PATH)

In [None]:
# Score the model on the held-out test split. evaluate() returns the loss
# followed by the metrics given to compile() -- here, accuracy.
evaluation = model.evaluate(X_test, y_test)
test_loss, test_accuracy = evaluation
print(f"\nTest Accuracy: {test_accuracy * 100:.2f}%")