# Test lstm4 model for phishing URL detection

## Imports

In [17]:
import numpy as np
import os
import pathlib

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, regularizers
from tensorflow.keras.models import Sequential

import tensorflow_text as tf_text

## Params

In [18]:
dataset_dir = pathlib.Path("../URL_dataset/")
model_dir = pathlib.Path('../URL_models/')
checkpoint_dir = pathlib.Path('../URL_checkpoints/')

In [19]:
model_name = "lstm4"
batch_size = 32
rest_split_ratio = 0.2
test_split_ratio = 0.2

unicode_max = 1114111

## Load dataset

In [20]:
train_ds = keras.preprocessing.text_dataset_from_directory(
  dataset_dir,
  validation_split=rest_split_ratio,
  subset="training",
  label_mode = 'binary',
  seed=132,
  batch_size=batch_size)

Found 45343 files belonging to 2 classes.
Using 36275 files for training.


In [21]:
rest_ds = keras.preprocessing.text_dataset_from_directory(
  dataset_dir,
  validation_split=rest_split_ratio,
  subset="validation",
  label_mode='binary',
  seed=132,
  batch_size=batch_size)

Found 45343 files belonging to 2 classes.
Using 9068 files for validation.


In [22]:
rest_batches = rest_ds.cardinality().numpy()
print(rest_batches)

284


In [23]:
val_ds = rest_ds.skip(int(rest_batches * test_split_ratio))
test_ds = rest_ds.take(int(rest_batches * test_split_ratio))

In [24]:
class_names = train_ds.class_names

In [25]:
class_names

['benign', 'phishing']

## Explore dataset

In [54]:
b = test_ds.take(2)

In [55]:
for batch in b:
    test_urls = batch[0]
    test_labels = batch[1]

In [56]:
for url, label in zip(test_urls.numpy(), test_labels.numpy()):
    print(url, class_names[int(label[0])])

b'http://serverfault.com/questions/484707/is-it-possible-to-have-a-100-secure-virtual-private-server' benign
b'http://techcrunch.com/2015/05/08/rewinds-new-app-lets-you-create-photo-timelines-that-disappear-after-a-day/' benign
b'http://ns303913.ovh.net/theme/js/images/ok/c7060c871fcdb4d12ea8f18374f4b441/Login.php' phishing
b'http://codecanyon.net/item/sitecloner-make-clones-or-copies-of-any-website/11172863' benign
b'http://www.cnsatlas.com/imagedrag/aol/index.htm' phishing
b'http://www.preferredcontracts.co.uk/js/ing.be/default.htm' phishing
b'http://web.de/magazine/unterhaltung/lifestyle/blog/jessica-labbadia/sommerhaus-fertigbauweise-30508184' benign
b'http://www.acts17leadership.com/wp-admin/css/gmailen_webmail.htm' phishing
b'http://censor.net.ua/news/335966/na_donbasse_esche_ne_dostigli_togo_chego_hoteli_merkel_o_vypolnenii_minskih_soglasheniyi' benign
b'http://motthegioi.vn/tuong-tinh-bao-chien-luoc-pham-xuan-an/ky-7-hai-su-that-cua-ong-pham-xuan-an-188405.html' benign
b'http:/

## Vectorize

Cannot use the vectorization layers since it is not supported in TFJS; we can rather have a custom tokenizer which can be then exported as json then loaded in JS

In [57]:
tokenizer = tf_text.UnicodeCharTokenizer()

In [58]:
tokenized_urls = tokenizer.tokenize(test_urls)

## Load model

In [59]:
# change filename to the model you want to load
model_filename = "2021-09-11_12;38;14.218039_lstm4_valacc0.9136_e15_b32.tf"
model = keras.models.load_model(
    model_dir / model_filename,
    compile=True)

## Test the model

In [60]:
pred = tf.round(tf.sigmoid(model.predict(tokenized_urls))).numpy().astype(int)

In [61]:
predictions = [class_names[p[0]] for p in pred]

In [62]:
for i, (url, p, label) in enumerate(zip(test_urls.numpy(), predictions, test_labels.numpy())):
    print(url, "Prediction (model):", p, "Truth:", class_names[int(label[0])])
#     print(tokenized_urls[i])

b'http://serverfault.com/questions/484707/is-it-possible-to-have-a-100-secure-virtual-private-server' Prediction (model): benign Truth: benign
b'http://techcrunch.com/2015/05/08/rewinds-new-app-lets-you-create-photo-timelines-that-disappear-after-a-day/' Prediction (model): benign Truth: benign
b'http://ns303913.ovh.net/theme/js/images/ok/c7060c871fcdb4d12ea8f18374f4b441/Login.php' Prediction (model): benign Truth: phishing
b'http://codecanyon.net/item/sitecloner-make-clones-or-copies-of-any-website/11172863' Prediction (model): benign Truth: benign
b'http://www.cnsatlas.com/imagedrag/aol/index.htm' Prediction (model): phishing Truth: phishing
b'http://www.preferredcontracts.co.uk/js/ing.be/default.htm' Prediction (model): phishing Truth: phishing
b'http://web.de/magazine/unterhaltung/lifestyle/blog/jessica-labbadia/sommerhaus-fertigbauweise-30508184' Prediction (model): benign Truth: benign
b'http://www.acts17leadership.com/wp-admin/css/gmailen_webmail.htm' Prediction (model): phishin

## Evaluate (test)

In [63]:
test_results = model.evaluate(tokenized_urls, test_labels)
print("test loss, test acc:", test_results)

test loss, test acc: [0.22960908710956573, 0.96875]
