# Test lstm3 model for phishing URL detection

## Imports

In [27]:
import numpy as np
import os
import pathlib

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, regularizers
from tensorflow.keras.models import Sequential

import tensorflow_text as tf_text

## Params

In [28]:
# Filesystem locations used by this notebook, all relative to its working
# directory: the URL dataset, the saved models, and the training checkpoints.
dataset_dir, model_dir, checkpoint_dir = (
    pathlib.Path(location)
    for location in ("../URL_dataset/", "../URL_models/", "../URL_checkpoints/")
)

In [29]:
# Identifier of the architecture under test.
model_name = "lstm3"

# Batch size handed to the dataset loaders.
batch_size = 32

# rest_split_ratio: fraction of the dataset held out from the training subset
#                   (passed as `validation_split` to the loaders).
# test_split_ratio: fraction of the held-out batches that become the test set.
rest_split_ratio = test_split_ratio = 0.2
# 1114111 == 0x10FFFF, which is the largest Unicode code point; presumably an
# upper bound on character token ids, but not referenced by any cell visible
# in this notebook — TODO confirm where (or whether) it is used.
unicode_max = 1114111

## Load dataset

In [30]:
# Training subset of the URL dataset. In this notebook it is only used to read
# the class names; the model itself is loaded from disk rather than trained.
train_ds = keras.preprocessing.text_dataset_from_directory(
    dataset_dir,
    label_mode="binary",
    batch_size=batch_size,
    validation_split=rest_split_ratio,
    subset="training",
    seed=132,
)

Found 45343 files belonging to 2 classes.
Using 36275 files for training.


In [31]:
# Held-out ("rest") subset: requested as the "validation" subset with the same
# seed and validation_split as the training subset. It is divided into
# validation and test sets further down.
rest_ds = keras.preprocessing.text_dataset_from_directory(
    dataset_dir,
    label_mode="binary",
    batch_size=batch_size,
    validation_split=rest_split_ratio,
    subset="validation",
    seed=132,
)

Found 45343 files belonging to 2 classes.
Using 9068 files for validation.


In [32]:
# Number of batches (not individual files) in the held-out subset; this count
# is used below to size the test/validation split.
rest_batches = rest_ds.cardinality().numpy()
print(rest_batches)

284


In [33]:
# Divide the held-out batches: the first int(rest_batches * test_split_ratio)
# batches form the test set, everything after them forms the validation set.
# NOTE(review): rest_ds is created with the loader's default shuffle setting;
# if it reshuffles on every iteration, these take/skip splits are not fixed,
# disjoint sets of URLs across iterations — confirm against the training
# notebook.
test_ds = rest_ds.take(int(rest_batches * test_split_ratio))
val_ds = rest_ds.skip(int(rest_batches * test_split_ratio))

In [34]:
# Class names exposed by the training dataset object; indexed below with the
# integer labels and predictions to print human-readable names.
class_names = train_ds.class_names

In [35]:
# Display the class names; a name's list position is the index used for lookups below.
class_names

['benign', 'phishing']

## Load model

In [92]:
# change filename to the model you want to load
model_filename = "2021-09-10_19;13;08.969881_lstm3_valacc0.9964_e15_b32.tf"
model = keras.models.load_model(
    model_dir / model_filename,
    compile=True)

## Explore dataset

In [70]:
# Take a single batch from the test split to inspect and to run through the model.
b = test_ds.take(1)

In [149]:
# `b` yields at most one batch, so this loop just unpacks that batch into its
# URL strings (element 0) and labels (element 1).
for batch in b:
    test_urls, test_labels = batch[0], batch[1]

In [150]:
# Print every URL of the sampled batch next to its ground-truth class name.
for url, label in zip(test_urls.numpy(), test_labels.numpy()):
    class_index = int(label[0])  # the label's first element is used as the class index
    print(url, class_names[class_index])

b'http://depositphotos.com/58703015/stock-illustration-road-infographic-timeline-element-layout.html' benign
b'http://www.itfindia.org/confirmation-account/c85652703f710065c1255aa03a9aefae/fed65c93bcc26a7c2727b52e7ad57a9b/login.php' phishing
b'http://venturebeat.com/2015/04/22/itunes-stops-working-for-windows-xp-users-apple-security-change-likely-to-blame/' benign
b'http://distractify.com/post/related/id/541839304a0c4be1088b6b86/skip/20/limit/10/back/0' benign
b'http://hubpages.com/hub/Automotive-Preventative-Maintenance-Checklist-and-Estimated-Repair-Costs' benign
b'https://myspace.com/article/2015/5/11/watch-the-new-series-trailer-for-orange-is-the-new-black-season-3' benign
b'http://europa.eu/about-eu/agencies/regulatory_agencies_bodies/pol_agencies/eurojust/index_en.htm' benign
b'http://appleid.apple.co.uk.cgi-bin.webobjects.myappleid.woa.verify7.id8-eu.co.uk/identify-customer=1hJmtMByghjwxGNNZnsoQbzKTqZH175F5K1qN5RytnnQU9KkJ0999aDq3z1pncQ7SBcr7bnYr4aiQQ15XBXVmeiitv9kRHGhTS3k/' phi

## Vectorize

The Keras vectorization layers cannot be used because they are not supported in TensorFlow.js (TFJS). Instead, we can use a custom tokenizer, which can then be exported as JSON and loaded in JavaScript.

In [151]:
# Character-level tokenizer from tensorflow_text, used here instead of the
# Keras vectorization layers.
tokenizer = tf_text.UnicodeCharTokenizer()

In [152]:
# Tokenize the sampled batch of URLs. Per the tensorflow_text documentation
# each string becomes a sequence of Unicode code points; rows presumably have
# different lengths (ragged) — TODO confirm.
tokenized_urls = tokenizer.tokenize(test_urls)

## Test the model

In [153]:
# Run the model on the tokenized batch, squash the raw outputs with a sigmoid
# and round them to integer class indices (0 or 1).
# NOTE(review): applying a sigmoid here assumes the model emits logits rather
# than probabilities — confirm against the training notebook.
raw_outputs = model.predict(tokenized_urls)
pred = tf.round(tf.sigmoid(raw_outputs)).numpy().astype(int)

In [154]:
# Map each predicted class index (first element of every row) to its class name.
predictions = [class_names[row[0]] for row in pred]

In [155]:
# Show the model's prediction next to the ground truth for every URL in the batch.
rows = zip(test_urls.numpy(), predictions, test_labels.numpy())
for i, (url, p, label) in enumerate(rows):
    truth = class_names[int(label[0])]
    print(url, "Prediction (model):", p, "Truth:", truth)
    # Debugging aid: uncomment to inspect the token ids of this URL.
    # print(tokenized_urls[i])

b'http://depositphotos.com/58703015/stock-illustration-road-infographic-timeline-element-layout.html' Prediction (model): benign Truth: benign
b'http://www.itfindia.org/confirmation-account/c85652703f710065c1255aa03a9aefae/fed65c93bcc26a7c2727b52e7ad57a9b/login.php' Prediction (model): phishing Truth: phishing
b'http://venturebeat.com/2015/04/22/itunes-stops-working-for-windows-xp-users-apple-security-change-likely-to-blame/' Prediction (model): benign Truth: benign
b'http://distractify.com/post/related/id/541839304a0c4be1088b6b86/skip/20/limit/10/back/0' Prediction (model): benign Truth: benign
b'http://hubpages.com/hub/Automotive-Preventative-Maintenance-Checklist-and-Estimated-Repair-Costs' Prediction (model): benign Truth: benign
b'https://myspace.com/article/2015/5/11/watch-the-new-series-trailer-for-orange-is-the-new-black-season-3' Prediction (model): benign Truth: benign
b'http://europa.eu/about-eu/agencies/regulatory_agencies_bodies/pol_agencies/eurojust/index_en.htm' Predicti

## Evaluate (test)

In [156]:
# Compute loss and accuracy with the loaded model.
# NOTE: `tokenized_urls` / `test_labels` come from the single batch sampled
# with `test_ds.take(1)`, so these numbers describe that one batch only, not
# the whole test split.
test_results = model.evaluate(x=tokenized_urls, y=test_labels)
print("test loss, test acc:", test_results)

test loss, test acc: [0.018451431766152382, 0.96875]
