Notebook used to extract the features for a given URL, in CSV (column) format.

In [1]:
import features_extractors.content_features as ctnfe
import features_extractors.url_features as urlfe
import features_extractors.external_features as trdfe
import features_extractors.feature_extractor as ftext
import sys
import pandas as pd
import torch
import numpy as np
from torch.utils.data import TensorDataset,DataLoader
from sklearn.model_selection import train_test_split
import pickle

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Column labels for one extracted sample: the raw URL first, then 86 feature
# columns, then the "status" label column.
# NOTE(review): the three groups below are only a readability split; they
# presumably mirror the url / content / external extractor modules — confirm.
_url_columns = [
    "length_url", "length_hostname", "ip",
    "nb_dots", "nb_hyphens", "nb_at", "nb_qm", "nb_and", "nb_or", "nb_eq",
    "nb_underscore", "nb_tilde", "nb_percent", "nb_slash", "nb_star",
    "nb_colon", "nb_comma", "nb_semicolumn", "nb_dollar", "nb_space",
    "nb_www", "nb_com", "nb_dslash",
    "http_in_path", "https_token",
    "ratio_digits_url", "ratio_digits_host",
    "punycode", "port", "tld_in_path", "tld_in_subdomain",
    "abnormal_subdomain", "nb_subdomains", "prefix_suffix",
    "shortening_service", "path_extension",
    "nb_redirection", "nb_external_redirection",
    "length_words_raw", "char_repeat",
    "shortest_words_raw", "shortest_word_host", "shortest_word_path",
    "longest_words_raw", "longest_word_host", "longest_word_path",
    "avg_words_raw", "avg_word_host", "avg_word_path",
    "phish_hints", "domain_in_brand", "brand_in_subdomain", "brand_in_path",
    "suspecious_tld", "statistical_report",
]

_content_columns = [
    "nb_hyperlinks", "ratio_intHyperlinks", "ratio_extHyperlinks",
    "ratio_nullHyperlinks", "nb_extCSS",
    "ratio_intRedirection", "ratio_extRedirection",
    "ratio_intErrors", "ratio_extErrors",
    "login_form", "external_favicon", "links_in_tags", "submit_email",
    "ratio_intMedia", "ratio_extMedia",
    "sfh", "iframe", "popup_window", "safe_anchor", "onmouseover",
    "right_clic", "empty_title", "domain_in_title", "domain_with_copyright",
]

_external_columns = [
    "whois_registered_domain", "domain_registration_length", "domain_age",
    "web_traffic", "dns_record", "google_index", "page_rank",
]

# Order matters: these labels are applied positionally to the feature list.
header = ["url", *_url_columns, *_content_columns, *_external_columns, "status"]

In [3]:
# Setup input

# Read the URL to classify from the user. Surrounding whitespace (for example
# from copy/paste) is stripped so it cannot leak into the extracted features.
# Example input:
#   https://technofizi.net/top-best-mp3-downloader-app-for-android-free-music-download/
url = input("Give url:").strip()
if not url:
    # Fail early with a clear message rather than extracting features from "".
    raise ValueError("No URL was provided.")
print("Your url is: ", url)

Your url is:  https://technofizi.net/top-best-mp3-downloader-app-for-android-free-music-download/


In [4]:
# Label value handed to the extractor together with the URL.
# NOTE(review): presumably a placeholder that ends up as the trailing
# "status" entry of the returned list — confirm against extract_features.
status = 0
# Build the feature list for `url` with the project's feature extractor.
features = ftext.extract_features(url, status)
# Print the raw (unscaled) values for inspection.
print(features)

['https://technofizi.net/top-best-mp3-downloader-app-for-android-free-music-download/', 83, 14, 0, 1, 9, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.012048192771084338, 0.0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 11, 2, 3, 10, 3, 10, 10, 10, 5.454545454545454, 10.0, 5.0, 0, 0, 0, 0, 0, 0, 261, 0.7318007662835249, 0.2681992337164751, 0.0, 0, 0.0, 0.02857142857142857, 0.0, 0.2, 0, 0, 100.0, 0, 97.82608695652173, 2.1739130434782608, 0, 0, 0, 39.823008849557525, 0, 0, 0, 1, 1, 1, -1, 172183, 0, 0, 1, 5, 0]


In [5]:
# File name of the serialized network (resolved against the working directory).
savedModelName = 'phishing_v1.pt'
# SECURITY: torch.load unpickles the file, which can execute arbitrary code;
# only load model files that come from a trusted source.
# map_location="cpu" keeps the model on the CPU regardless of the device it
# was saved from, matching the CPU tensors built later with torch.from_numpy.
# NOTE(review): newer PyTorch releases may default to weights_only=True, which
# rejects fully pickled modules — confirm against the installed version.
model = torch.load(savedModelName, map_location="cpu")
# Switch to inference mode; as the cell's last expression this also displays
# the model.
model.eval()

Sequential(
  (0): Linear(in_features=86, out_features=300, bias=True)
  (1): ReLU()
  (2): Dropout(p=0.2, inplace=False)
  (3): Linear(in_features=300, out_features=300, bias=True)
  (4): ReLU()
  (5): Dropout(p=0.1, inplace=False)
  (6): BatchNorm1d(300, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (7): Linear(in_features=300, out_features=1, bias=True)
  (8): Sigmoid()
)

In [6]:
# Wrap the single feature list in a one-row frame labelled with `header`,
# then discard the "url" column so only the remaining columns are kept.
_sample_row = pd.DataFrame([features], columns=header)
df = _sample_row.drop(columns=["url"])
df.head(1)

Unnamed: 0,length_url,length_hostname,ip,nb_dots,nb_hyphens,nb_at,nb_qm,nb_and,nb_or,nb_eq,...,domain_in_title,domain_with_copyright,whois_registered_domain,domain_registration_length,domain_age,web_traffic,dns_record,google_index,page_rank,status
0,83,14,0,1,9,0,0,0,0,0,...,1,1,1,-1,172183,0,0,1,5,0


In [7]:
# Split the one-row frame: every column except the last is a model input,
# the last column ("status") is kept separately as the label.
features = df.iloc[:, :-1]
is_phishing = df.iloc[:, -1:]

# Normalize the data with the scaler stored in scaler.pkl
# (presumably the one fitted at training time — confirm).
# SECURITY: pickle.load can execute arbitrary code; only load trusted files.
# The context manager guarantees the file handle is closed after loading.
with open('scaler.pkl', 'rb') as scaler_file:
    scaler = pickle.load(scaler_file)
feats_scaled = scaler.transform(features.values)
# Re-wrap the scaled array so the column labels are preserved.
features = pd.DataFrame(data=feats_scaled, columns=features.columns)
features.head(1)

Unnamed: 0,length_url,length_hostname,ip,nb_dots,nb_hyphens,nb_at,nb_qm,nb_and,nb_or,nb_eq,...,empty_title,domain_in_title,domain_with_copyright,whois_registered_domain,domain_registration_length,domain_age,web_traffic,dns_record,google_index,page_rank
0,0.043585,0.047619,0.0,0.0,0.209302,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,1.0,1.0,0.0,13.362952,0.0,0.0,1.0,0.5


In [8]:
# Convert the scaled inputs and the label column to float32 tensors.
test_x_t = torch.from_numpy(features.values).float()
# unsqueeze(1) turns the 1-D label vector into a column, one row per sample.
test_y_t = torch.from_numpy(is_phishing['status'].values).float().unsqueeze(1)

testingSet = TensorDataset(test_x_t, test_y_t)
# Inference data is served in its original order: shuffling brings no benefit
# here and would make the order of printed predictions non-deterministic.
testLoader = DataLoader(testingSet, batch_size=1, shuffle=False)

In [9]:
# Run the model over every batch with autograd disabled (inference only).
with torch.no_grad():
    for inputs, labels in testLoader:
        outputs = model(inputs)
        # Round the model output to the nearest integer (0. or 1.).
        predicted = outputs.data.round()
        print(predicted)

tensor([[0.]])


In [10]:
# Translate the rounded prediction into a message: 1 is reported as possible
# phishing, any other value as not phishing.
_verdict = 'The link might be phishing!' if predicted == 1 else 'The link is not phishing!'
print(_verdict)

The link is not phishing!
