## Imports

In [27]:
import numpy as np
import os
import pathlib

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, regularizers
from tensorflow.keras.models import Sequential

import tensorflow_text as tf_text

## Params

In [28]:
# Locations of the URL corpus, the saved models and the training checkpoints,
# all given relative to the directory the notebook is run from.
dataset_dir, model_dir, checkpoint_dir = map(
    pathlib.Path,
    ("../URL_dataset/", "../URL_models/", "../URL_checkpoints/"),
)

In [29]:
# Short tag for the model variant (the same tag appears in the saved-model filename).
model_name = "lstm3"

# Number of URLs per batch when the datasets are read from disk.
batch_size = 32

# rest_split_ratio: fraction of files held out from training (passed as
#                   `validation_split` to the dataset loaders).
# test_split_ratio: fraction of the held-out *batches* carved off as the test set.
rest_split_ratio, test_split_ratio = 0.2, 0.2

# Highest valid Unicode code point, U+10FFFF (== 1114111).
unicode_max = 0x10FFFF

## Load dataset

In [30]:
# Training portion of the corpus: the files that are NOT held out.
# The seed and validation_split here are the same values used by the
# "validation" subset call, so both calls refer to the same partition of files.
train_ds = keras.preprocessing.text_dataset_from_directory(
    dataset_dir,
    label_mode="binary",
    batch_size=batch_size,
    validation_split=rest_split_ratio,
    subset="training",
    seed=132,
)

Found 45343 files belonging to 2 classes.
Using 36275 files for training.


In [31]:
# Held-out portion of the corpus (Keras calls it the "validation" subset).
# It is later divided into separate validation and test sets.
# The seed and validation_split here are the same values used by the
# "training" subset call, so both calls refer to the same partition of files.
rest_ds = keras.preprocessing.text_dataset_from_directory(
    dataset_dir,
    label_mode="binary",
    batch_size=batch_size,
    validation_split=rest_split_ratio,
    subset="validation",
    seed=132,
)

Found 45343 files belonging to 2 classes.
Using 9068 files for validation.


In [32]:
# Number of batches in the held-out dataset; the test/validation split is
# expressed as a count of whole batches derived from this value.
rest_batches = rest_ds.cardinality().numpy()
print(rest_batches)

284


In [33]:
# Divide the held-out batches: the leading `test_split_ratio` share becomes the
# test set and everything after it becomes the validation set.
num_test_batches = int(rest_batches * test_split_ratio)
test_ds = rest_ds.take(num_test_batches)
val_ds = rest_ds.skip(num_test_batches)
# NOTE(review): rest_ds was created with the loader's default shuffling; if the
# underlying tf.data shuffle reshuffles on every iteration, samples close to the
# take/skip boundary may move between test_ds and val_ds from one pass to the
# next. Confirm against how the split was made when the model was trained.

In [34]:
# Class names exposed by the dataset object returned from the loader; a label
# value is used as an index into this list when samples are printed.
class_names = train_ds.class_names

In [35]:
# Display the class list so the label-index -> name mapping is visible.
class_names

['benign', 'phishing']

## Load model

In [92]:
# Edit the filename below to choose which saved model gets loaded.
model_filename = "2021-09-10_19;13;08.969881_lstm3_valacc0.9964_e15_b32.tf"
model_path = model_dir / model_filename

# compile=True asks Keras to compile the model after loading it.
model = keras.models.load_model(model_path, compile=True)

## Explore dataset

In [70]:
# A dataset restricted to the first batch of the test split, for manual inspection.
b = test_ds.take(1)

In [139]:
# Pull the single batch out of `b` and split it into its URL and label tensors.
# next(iter(...)) is used instead of a for-loop so that an empty dataset raises
# StopIteration immediately, rather than silently leaving test_urls/test_labels
# undefined (or holding stale values from an earlier kernel run).
batch = next(iter(b))
test_urls, test_labels = batch[0], batch[1]

In [140]:
# Print every URL of the inspected batch next to its human-readable class name.
urls_np = test_urls.numpy()
labels_np = test_labels.numpy()
for url, label in zip(urls_np, labels_np):
    # Each label is indexed at position 0 to get the scalar class index.
    class_index = int(label[0])
    print(url, class_names[class_index])

b'http://www.naturesway2health.com/store/Apple@1/7ffcaed16b1d58eef6068d7569454ef5/' phishing
b'http://www.tarahenergy.com/wp-admin/js/login.alibaba.com/login.jsp.php' phishing
b'http://interpark.com/displaycorner/FreeMarket.do?_method=itemCateList&free1=pc&free2=007001009014&free3=item&disp_no=007001009014' benign
b'http://seekingalpha.com/article/3180326-10-top-dividend-aristocrats-aim-at-7-percent-to-12-percent-april-upsides-1-misses' benign
b'http://sourceforge.net/directory/audio-video/add_facet_filter?facet=license&constraint=Public+Domain' benign
b'http://babal.net/games/view/731/%D8%A7%D9%84%D8%B9%D8%A7%D8%A8-%D8%A7%D9%84%D9%82%D8%B7%D8%B7-%D8%A7%D9%84%D8%B4%D9%82%D9%8A%D8%A9' benign
b'http://tinnhanh360.net/tim-thay-ca-tram-thi-the-nguoi-leo-nui-bi-tuyet-vui-o-nepal.html' benign
b'http://emgn.com/entertainment/the-most-amazing-and-beautiful-bridges-you-will-ever-see/' benign
b'http://hollywoodlife.com/2015/05/02/ian-somerhalder-engagement-nikki-reed-khloe-kardashian-lamar-odom-

## Vectorize

The Keras text-vectorization layers cannot be used because they are not supported in TensorFlow.js (TFJS). Instead, we can use a custom tokenizer that can be exported as JSON and then loaded in JavaScript.

In [141]:
# Character-level tokenizer: turns a string into the sequence of its Unicode
# code points (e.g. "http" -> 104, 116, 116, 112).
tokenizer = tf_text.UnicodeCharTokenizer()

In [142]:
# Tokenize the whole inspected batch of URLs in one call.
# NOTE(review): presumably a ragged result, since URLs differ in length — confirm.
tokenized_urls = tokenizer.tokenize(test_urls)

## Test the model

In [108]:
# Hand-written URL used to try the loaded model on a single input.
url = "http://example.com"

In [109]:
# Show the code-point sequence produced for the sample URL.
tokenizer.tokenize(url)

<tf.Tensor: shape=(18,), dtype=int32, numpy=
array([104, 116, 116, 112,  58,  47,  47, 101, 120,  97, 109, 112, 108,
       101,  46,  99, 111, 109])>

In [110]:
# Score the sample URL with the loaded model, one step at a time.
url_codepoints = tokenizer.tokenize(url)        # 1-D sequence of code points
url_batch = tf.expand_dims(url_codepoints, 0)   # add a leading batch axis of size 1
raw_output = model.predict(url_batch)

# NOTE(review): the sigmoid is applied on top of the model output, so the model
# presumably emits logits rather than probabilities — confirm against the model
# definition. With class_names == ['benign', 'phishing'], a value close to 1
# would point at 'phishing' if the output is the probability of label 1; verify
# that this is the expected reading for the sample URL.
tf.sigmoid(raw_output).numpy()

array([[0.99995744]], dtype=float32)