# Test lstm1 model for phishing URL detection

## Imports

In [1]:
import numpy as np
import os
import pathlib

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, regularizers
from tensorflow.keras.models import Sequential

import tensorflow_text as tf_text

In [2]:
dataset_dir = pathlib.Path("../URL_dataset/")
model_dir = pathlib.Path('../URL_models/')
checkpoint_dir = pathlib.Path('../URL_checkpoints/')

In [3]:
import datetime
date = datetime.datetime.now()
date = '_'.join(str(date).split()).replace(':', ';')
print(date)

2021-09-03_17;12;57.557846


In [4]:
os.listdir(dataset_dir)

['benign', 'phishing']

## Params

In [5]:
model_name = "lstm1"
batch_size = 16
rest_split_ratio = 0.2
test_split_ratio = 0.2

unicode_max = 1114111

## Load dataset

In [6]:
train_ds = keras.preprocessing.text_dataset_from_directory(
  dataset_dir,
  validation_split=rest_split_ratio,
  subset="training",
  label_mode = 'binary',
  seed=132,
  batch_size=batch_size)

Found 45343 files belonging to 2 classes.
Using 36275 files for training.


In [7]:
rest_ds = keras.preprocessing.text_dataset_from_directory(
  dataset_dir,
  validation_split=rest_split_ratio,
  subset="validation",
  label_mode='binary',
  seed=132,
  batch_size=batch_size)

Found 45343 files belonging to 2 classes.
Using 9068 files for validation.


In [8]:
rest_batches = rest_ds.cardinality().numpy()
print(rest_batches)

567


In [9]:
val_ds = rest_ds.skip(int(rest_batches * test_split_ratio // batch_size))
test_ds = rest_ds.take(int(rest_batches * test_split_ratio // batch_size))

In [10]:
class_names = train_ds.class_names

In [11]:
class_names

['benign', 'phishing']

## Explore dataset

In [12]:
b = test_ds.take(1)

In [13]:
for batch in b:
    test_urls = batch[0]
    test_labels = batch[1]

In [14]:
for url, label in zip(test_urls.numpy(), test_labels.numpy()):
    print(url, class_names[int(label[0])])

b'http://perezhilton.com/2015-05-12-prince-harry-baby-fever-adorable-new-zealand-visit/?from=featured' benign
b'http://distractify.com/post/related/id/553568574a0c4b4443bec65b/skip/20/limit/10/back/0' benign
b'http://www.tsekourasp.gr/language/en-GB/msg/ln.php' phishing
b'http://hollywoodlife.com/2015/05/02/ian-somerhalder-engagement-nikki-reed-khloe-kardashian-lamar-odom-wedding-pics/' benign
b'http://www.sz-leteng.com/images/?http://us.battle.net/login/en/?ref=http://bopgkotus.battle.net/d3/en/index&amp;app=com-d3' phishing
b'http://askubuntu.com/questions/106028/is-it-possible-to-run-64-bit-mongo-db-on-my-32-bit-pae-enabled-ubuntu' benign
b'http://serverfault.com/questions/528627/nginx-returning-404-with-python-urllib-but-fine-if-accessed-from-browser' benign
b'http://twitter.com/home?status=%E3%83%8C%E3%81%91%E3%82%8B%EF%BC%81%E3%80%90%E4%BA%BA%E5%A6%BB%E3%83%BB%E7%86%9F%E5%A5%B3%E3%80%91+http%3A%2F%2Fero-video.net%2Ft%2FRVIEW8Nj0X3Idi8o+%E5%B7%A8%E4%B9%B3%E4%BA%BA%E5%A6%BB%E3%81%A

## Vectorize

Cannot use the vectorization layers since it is not supported in TFJS; we can rather have a custom tokenizer which can be then exported as json then loaded in JS

In [15]:
tokenizer = tf_text.UnicodeCharTokenizer()

In [16]:
tokenized_urls = tokenizer.tokenize(test_urls)

## Load model

In [17]:
# change filename to the model you want to load
model_filename = "2021-08-19_20;05;32.036830_lstm1_valacc0.9948_e10_b16.tf"
model = keras.models.load_model(
    model_dir / model_filename,
    compile=True)

## Test the model

In [23]:
pred = tf.round(tf.sigmoid(model.predict(tokenized_urls))).numpy().astype(int)

In [30]:
predictions = [class_names[p[0]] for p in pred]

In [32]:
for url, p, label in zip(test_urls.numpy(), predictions, test_labels.numpy()):
    print(url, "Prediction (model):", p, "Truth:", class_names[int(label[0])])

b'http://perezhilton.com/2015-05-12-prince-harry-baby-fever-adorable-new-zealand-visit/?from=featured' Prediction (model): benign Truth: benign
b'http://distractify.com/post/related/id/553568574a0c4b4443bec65b/skip/20/limit/10/back/0' Prediction (model): benign Truth: benign
b'http://www.tsekourasp.gr/language/en-GB/msg/ln.php' Prediction (model): phishing Truth: phishing
b'http://hollywoodlife.com/2015/05/02/ian-somerhalder-engagement-nikki-reed-khloe-kardashian-lamar-odom-wedding-pics/' Prediction (model): benign Truth: benign
b'http://www.sz-leteng.com/images/?http://us.battle.net/login/en/?ref=http://bopgkotus.battle.net/d3/en/index&amp;app=com-d3' Prediction (model): phishing Truth: phishing
b'http://askubuntu.com/questions/106028/is-it-possible-to-run-64-bit-mongo-db-on-my-32-bit-pae-enabled-ubuntu' Prediction (model): benign Truth: benign
b'http://serverfault.com/questions/528627/nginx-returning-404-with-python-urllib-but-fine-if-accessed-from-browser' Prediction (model): benign

## Evaluate (test)

In [18]:
test_results = model.evaluate(tokenized_urls, test_labels, batch_size=16)
print("test loss, test acc:", test_results)

test loss, test acc: [0.00033692337456159294, 1.0]
