## 1- Read Dataset

In [16]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
import matplotlib.pyplot as plt
import seaborn as sns


## Uncomment to read the dataset online

In [3]:

#!pip install ucimlrepo
# from ucimlrepo import fetch_ucirepo 
  
# # fetch dataset 
# phiusiil_phishing_url_website = fetch_ucirepo(id=967) 
  
# # data (as pandas dataframes) 
# X = phiusiil_phishing_url_website.data.features 
# y = phiusiil_phishing_url_website.data.targets 
  
# # metadata 
# print(phiusiil_phishing_url_website.metadata) 
  
# # variable information 
# print(phiusiil_phishing_url_website.variables) 


## 2- Preprocessing & Visualization

In [17]:
# load the dataset
DATASET_PATH = "./dataset/PhiUSIIL_Phishing_URL_Dataset.csv"


In [28]:
# find spark
spark = SparkSession.builder.appName("PhishingURL").getOrCreate()
sc = spark.sparkContext


In [29]:
# read dataset
phishing_rdd = spark.read.csv(DATASET_PATH, header=True, inferSchema=True)

In [31]:
# show rdd header and first 10 rows
phishing_rdd.show(10)

+----------+--------------------+---------+--------------------+------------+----------+---+------------------+--------------------+-----------------+-----------+---------+-------------+--------------+------------------+----------------+----------------+----------------+---------------+---------------+---------------+--------------+------------------+--------------------------+---------------------+-------+----------+-----------------+--------+--------------------+---------------------+------------------+----------+------+------------+---------------+----------------+--------------+---------+----------+---------------------+------------+---------------+---------------+----------------+----+---+------+----------------+---------+-------+------+-----------+------------+---------------+-----+
|  FILENAME|                 URL|URLLength|              Domain|DomainLength|IsDomainIP|TLD|URLSimilarityIndex|CharContinuationRate|TLDLegitimateProb|URLCharProb|TLDLength|NoOfSubDomain|HasObfuscation

In [32]:
# drop filename column since it's not relevant
phishing_rdd = phishing_rdd.drop("filename")


In [33]:
# show rows, columns, and schema
print("Rows: ", phishing_rdd.count())
print("Columns: ", len(phishing_rdd.columns))
print("Schema: ", phishing_rdd.printSchema())

Rows:  235795
Columns:  55
root
 |-- URL: string (nullable = true)
 |-- URLLength: integer (nullable = true)
 |-- Domain: string (nullable = true)
 |-- DomainLength: integer (nullable = true)
 |-- IsDomainIP: integer (nullable = true)
 |-- TLD: string (nullable = true)
 |-- URLSimilarityIndex: double (nullable = true)
 |-- CharContinuationRate: double (nullable = true)
 |-- TLDLegitimateProb: double (nullable = true)
 |-- URLCharProb: double (nullable = true)
 |-- TLDLength: integer (nullable = true)
 |-- NoOfSubDomain: integer (nullable = true)
 |-- HasObfuscation: integer (nullable = true)
 |-- NoOfObfuscatedChar: integer (nullable = true)
 |-- ObfuscationRatio: double (nullable = true)
 |-- NoOfLettersInURL: integer (nullable = true)
 |-- LetterRatioInURL: double (nullable = true)
 |-- NoOfDegitsInURL: integer (nullable = true)
 |-- DegitRatioInURL: double (nullable = true)
 |-- NoOfEqualsInURL: integer (nullable = true)
 |-- NoOfQMarkInURL: integer (nullable = true)
 |-- NoOfAmpers

In [34]:
# check the number of missing values in each column
print("Missing values: ")
for col in phishing_rdd.columns:
    print(col, ":", phishing_rdd.where(phishing_rdd[col].isNull()).count())
    

Missing values: 
URL : 0
URLLength : 0
Domain : 0
DomainLength : 0
IsDomainIP : 0
TLD : 0
URLSimilarityIndex : 0
CharContinuationRate : 0
TLDLegitimateProb : 0
URLCharProb : 0
TLDLength : 0
NoOfSubDomain : 0
HasObfuscation : 0
NoOfObfuscatedChar : 0
ObfuscationRatio : 0
NoOfLettersInURL : 0
LetterRatioInURL : 0
NoOfDegitsInURL : 0
DegitRatioInURL : 0
NoOfEqualsInURL : 0
NoOfQMarkInURL : 0
NoOfAmpersandInURL : 0
NoOfOtherSpecialCharsInURL : 0
SpacialCharRatioInURL : 0
IsHTTPS : 0
LineOfCode : 0
LargestLineLength : 0
HasTitle : 0
Title : 0
DomainTitleMatchScore : 0
URLTitleMatchScore : 0
HasFavicon : 0
Robots : 0
IsResponsive : 0
NoOfURLRedirect : 0
NoOfSelfRedirect : 0
HasDescription : 0
NoOfPopup : 0
NoOfiFrame : 0
HasExternalFormSubmit : 0
HasSocialNet : 0
HasSubmitButton : 0
HasHiddenFields : 0
HasPasswordField : 0
Bank : 0
Pay : 0
Crypto : 0
HasCopyrightInfo : 0
NoOfImage : 0
NoOfCSS : 0
NoOfJS : 0
NoOfSelfRef : 0
NoOfEmptyRef : 0
NoOfExternalRef : 0
label : 0


In [43]:
# keep thee unique values of the url column
unique_urls = phishing_rdd.select("url").distinct()
print("Unique URLs: ", unique_urls.count())
filtered_rdd = phishing_rdd.dropDuplicates(["url"])

Unique URLs:  235370


In [44]:
# generate descriptive statistics
filtered_rdd.describe().show()


+-------+--------------------+------------------+--------------------+------------------+--------------------+------------------+------------------+--------------------+-------------------+--------------------+------------------+------------------+--------------------+--------------------+--------------------+------------------+-------------------+------------------+-------------------+-------------------+-------------------+--------------------+--------------------------+---------------------+------------------+-----------------+------------------+------------------+--------------------+---------------------+------------------+-------------------+------------------+-------------------+-------------------+-------------------+-------------------+-------------------+------------------+---------------------+------------------+-------------------+------------------+-------------------+-------------------+-------------------+-------------------+-------------------+------------------+-------

In [45]:
# split the dataset into the rows that are real and phishing based on the label column
# where 1 is phishing and 0 is real
all_legitimate_rdd = filtered_rdd.where(filtered_rdd["label"] == 0)
all_phishing_rdd = filtered_rdd.where(filtered_rdd["label"] == 1)

In [46]:
# show the number of the legitimate and phishing URLs
print("Legitimate URLs: ", all_legitimate_rdd.count())
print("Phishing URLs: ", all_phishing_rdd.count())


Legitimate URLs:  100520
Phishing URLs:  134850


In [15]:
# # show histograms for integer columns in the dataset to understand the distribution of the data


# # convert the rdd to a pandas dataframe
# phishing_df = phishing_rdd.toPandas()

# # plot histograms for integer columns
# for col in phishing_df.select_dtypes(include=["int"]).columns:
#     sns.histplot(phishing_df[col])
#     plt.title(col)
#     plt.show()
    

KeyboardInterrupt: 

## 3- Model & Training

## 4- Evaluation