In [1]:

import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
import string
import numpy as np
from tensorflow.keras.utils import to_categorical

def tokenizer(alphabet,url_length=200):
    """Build the character <-> index lookup tables used for one-hot encoding.

    Characters are numbered from 1 in the order they appear in ``alphabet``;
    index 0 is never assigned to a character (``data_npz`` in this file uses
    column 0 to mark positions past the end of a URL).

    Args:
        alphabet: Iterable of characters. Any iterable works (a string, a
            list, a generator); if a character repeats, its last position
            wins in the forward mapping.
        url_length: Not used by this function; accepted only so existing
            callers that pass it keep working.

    Returns:
        A ``(dictionary, reverse_dictionary)`` tuple where ``dictionary``
        maps character -> index and ``reverse_dictionary`` maps
        index -> character.
    """
    dictionary = {}
    reverse_dictionary = {}
    for index, char in enumerate(alphabet, start=1):
        dictionary[char] = index
        reverse_dictionary[index] = char
    return dictionary, reverse_dictionary

def _encode_urls(urls, alphabet_chars, dictionary, url_shape, samples, max_rows):
    """One-hot encode the first ``samples`` compatible URLs among ``max_rows`` rows.

    A URL is compatible when it is a string, is shorter than ``url_shape[0]``
    after lower-casing, and contains only characters from ``alphabet_chars``.
    Every other row is reported on stdout and skipped.

    Returns:
        Array of shape ``(n, *url_shape)`` with ``n <= samples``.
    """
    url_length = url_shape[0]
    encoded = []
    # Positional slicing never runs past the end of the series, unlike a
    # label lookup over a fixed range.
    for line in urls.iloc[:max_rows]:
        if len(encoded) >= samples:
            # Only the first `samples` rows are kept, so stop encoding early.
            break
        if not isinstance(line, str):
            # e.g. NaN from an empty CSV cell
            print("Uncompatible line:",  line)
            continue
        line = line.lower()
        if len(line) >= url_length or not alphabet_chars.issuperset(line):
            print("Uncompatible line:",  line)
            continue

        sample = np.zeros(url_shape)
        # Column 0 flags "no character here"; clear it where the URL has one.
        sample[:, 0] = 1.0
        sample[:len(line), 0] = 0.0
        columns = np.fromiter(
            (dictionary[char] for char in line), dtype=np.intp, count=len(line)
        )
        sample[np.arange(len(line)), columns] = 1.0
        encoded.append(sample)

    if not encoded:
        return np.zeros((0,) + tuple(url_shape))
    return np.array(encoded)


def data_npz(good,bad,alphabet,dictionary,samples=75000,url_length=200,npz_filename='phishing.npz'):
    """One-hot encode good/bad URLs and save a train/test split as a compressed .npz.

    Each URL becomes a ``(url_length, len(alphabet) + 1)`` matrix: row ``p``
    is the one-hot vector of the character at position ``p``, and column 0
    is set for positions past the end of the URL. The first 80% of
    ``samples`` per class go to the training set and the rest to the test
    set; within each set the good rows come first, then the bad rows (no
    shuffling is done here). Good URLs are labelled 1 and bad URLs 0, and
    labels are stored one-hot via ``to_categorical``.

    Args:
        good: DataFrame with a ``'URL'`` column holding legitimate URLs.
        bad: DataFrame with a ``'URL'`` column holding phishing URLs.
        alphabet: Characters allowed in a URL; URLs containing anything else
            are skipped.
        dictionary: Mapping character -> column index (as built by
            ``tokenizer``).
        samples: Maximum number of URLs to keep per class.
        url_length: Number of rows per encoded URL; only URLs strictly
            shorter than this are kept.
        npz_filename: Output path passed to ``np.savez_compressed``.

    Raises:
        ValueError: If either class yields no compatible URL.
        KeyError: If a character of ``alphabet`` is missing from
            ``dictionary``.
    """
    # Derived locally so the function does not depend on a module-level global.
    url_shape = (url_length, len(alphabet) + 1)
    alphabet_chars = set(alphabet)

    # Look at a few thousand extra rows per class to make up for skipped URLs.
    good_data = _encode_urls(good['URL'], alphabet_chars, dictionary, url_shape, samples, samples + 3000)
    bad_data = _encode_urls(bad['URL'], alphabet_chars, dictionary, url_shape, samples, samples + 10000)
    print ("Array Shape:", len(good_data))
    print ("Array Shape:", len(bad_data))

    if len(good_data) == 0 or len(bad_data) == 0:
        raise ValueError("no compatible URLs found for at least one class")

    x_train_len = int(samples* 0.8)
    good_train, good_test = good_data[:x_train_len], good_data[x_train_len:]
    bad_train, bad_test = bad_data[:x_train_len], bad_data[x_train_len:]

    x_train = np.concatenate((good_train, bad_train), axis=0)
    x_test = np.concatenate((good_test, bad_test), axis=0)

    # Size the labels from the rows actually encoded so X and y always line
    # up, even when fewer than `samples` compatible URLs were found.
    y_train = np.concatenate(
        (np.ones((len(good_train), 1)), np.zeros((len(bad_train), 1))), axis=0
    )
    y_test = np.concatenate(
        (np.ones((len(good_test), 1)), np.zeros((len(bad_test), 1))), axis=0
    )
    # Fix the class count so the one-hot width does not depend on which
    # labels happen to be present in a split.
    y_train_cat = to_categorical(y_train, num_classes=2)
    y_test_cat = to_categorical(y_test, num_classes=2)

    np.savez_compressed(npz_filename, X_train=x_train, X_test=x_test, y_train=y_train_cat, y_test=y_test_cat)
    print("文件保存成功")

if __name__ == "__main__":
    # Run configuration: total sample budget, encoded URL length, output path.
    n_samples = 180000
    url_length = 200
    npz_filename = '/content/drive/MyDrive/phishingDetection/phishing.npz'

    # Characters a URL may contain after lower-casing.
    alphabet = string.ascii_lowercase + string.digits + "!#$%&()*+,-./:;<=>?@[\\]^_`{|}~"

    # One extra column on top of the alphabet. These stay module-level names
    # because other code in this file may look them up as globals.
    dictionary_size = len(alphabet) + 1
    url_shape = (url_length, dictionary_size)

    # Split the labelled CSV by class and renumber each subset from 0.
    df = pd.read_csv('phishing_site_urls.csv')
    good = df[df['Label'] == 'good'].reset_index(drop=True)
    bad = df[df['Label'] == 'bad'].reset_index(drop=True)

    # Half of the budget goes to each class.
    each_class_samples = n_samples // 2
    dictionary, reverse_dictionary = tokenizer(alphabet, url_length=url_length)

    data_npz(
        good,
        bad,
        alphabet,
        dictionary,
        samples=each_class_samples,
        url_length=url_length,
        npz_filename=npz_filename,
    )



流式输出内容被截断，只能显示最后 5000 行内容。
Uncompatible line: www.paypal.com.us.webscrlcmdl.login.submit.dispatch.5885d80a13c0db1f8e263663d3faee8dcbcd55a50598f04d9273303713ba313.5885d80a13c0db1f8e263663d3f.aee811d01j111yh0111isl153364wxekhteamindexfo100011s.digicozum.com/limit/limit-id=ih6u80m3ucd333647066husje3l4bygg274eb1njn3gkec45omr/2f48c6eb25f10683d3b6e62b29afe937/
Uncompatible line: www.paypal.com.us.webscrlcmdl.login.submit.dispatch.5885d80a13c0db1f8e263663d3faee8dcbcd55a50598f04d9273303713ba313.5885d80a13c0db1f8e263663d3f.aee811d01j111yh0111isl153364wxekhteamindexfo100011s.digicozum.com/limit/limit-id=ih83cjg3j6snr3l3ebygee4414uudboh0u06mnk3cg3524676m7/72e83612655982b4631dbc6b2b9912d8/
Uncompatible line: www.paypal.com.us.webscrlcmdl.login.submit.dispatch.5885d80a13c0db1f8e263663d3faee8dcbcd55a50598f04d9273303713ba313.5885d80a13c0db1f8e263663d3f.aee811d01j111yh0111isl153364wxekhteamindexfo100011s.digicozum.com/limit/limit-id=ij04136clg46ygbdeu832o756sujenm3brc064g43e3hnmhk7u3/bf8