# IMPORTS

In [1]:
from utils import *
from sklearn.model_selection import train_test_split
from tabulate import tabulate
import matplotlib.pyplot as plt


## 1- Read Dataset

In [2]:
# Start (or reuse) the Spark session used by the rest of the notebook.
spark = (
    SparkSession.builder
    .appName("PySparkDataFrame")
    .getOrCreate()
)

# Load the phishing-URL CSV: the first row supplies the column names and
# column types are inferred from the data.
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("dataset/PhiUSIIL_Phishing_URL_Dataset.csv")
)


KeyboardInterrupt: 

In [None]:
# Show the DataFrame schema, then list the column names.
df.printSchema()
column_names = df.columns
print(column_names)

## 2- Preprocessing & Visualization

In [None]:
'''
########FEATURES:

FILENAME-> 
URL -> URL of the website
URLLength	-> Length of the URL
Domain	-> Domain of the URL
DomainLength	-> Length of the domain
IsDomainIP	-> Is the domain an IP
TLD	-> Top Level Domain
URLSimilarityIndex	-> URL Similarity Index
CharContinuationRate	-> Character Continuation Rate which is the rate of the same character in the URL
TLDLegitimateProb	-> TLD Legitimate Probability which is the probability of the TLD being legitimate
URLCharProb	-> URL Character Probability 
TLDLength	-> Length of the TLD
NoOfSubDomain	-> Number of subdomains
HasObfuscation	-> Has Obfuscation or not
NoOfObfuscatedChar	-> Number of obfuscated characters in the URL 
ObfuscationRatio	-> Obfuscation Ratio 
NoOfLettersInURL	-> Number of letters in the URL
LetterRatioInURL	-> Letter Ratio in the URL
NoOfDegitsInURL	-> Number of digits in the URL
DegitRatioInURL	-> Digit Ratio in the URL
NoOfEqualsInURL	-> Number of equals in the URL
NoOfQMarkInURL	-> Number of question marks in the URL
NoOfAmpersandInURL	-> Number of ampersands in the URL
NoOfOtherSpecialCharsInURL	-> Number of other special characters in the URL
SpacialCharRatioInURL	-> Special Character Ratio in the URL
IsHTTPS	-> Is the URL HTTPS or not
LineOfCode	-> Line of code in the website 
LargestLineLength	-> Largest Line Length in the website
HasTitle	-> Has Title or not
Title	-> Title of the website
DomainTitleMatchScore	-> Domain Title Match Score
URLTitleMatchScore	-> URL Title Match Score
HasFavicon	-> Has Favicon or not
Robots	-> Robots of the website 
IsResponsive	-> Is the website responsive or not 
NoOfURLRedirect	-> Number of URL Redirect 
NoOfSelfRedirect	-> Number of self redirect 
HasDescription	-> Has Description or not 
NoOfPopup	-> Number of popups 
NoOfiFrame	-> Number of iframes (iframe is an HTML tag) in the website
HasExternalFormSubmit	-> Has External Form Submit or not
HasSocialNet	-> Has Social Network or not 
HasSubmitButton	-> Has Submit Button or not 
HasHiddenFields	-> Has Hidden Fields or not
HasPasswordField	-> Has Password Field or not
Bank	-> Whether the website contains banking-related keywords (0/1 flag)
Pay	-> Whether the website contains payment-related keywords (0/1 flag)
Crypto	-> Whether the website contains cryptocurrency-related keywords (0/1 flag)
HasCopyrightInfo	-> Has Copyright Info or not
NoOfImage	-> Number of images in the website
NoOfCSS	-> Number of CSS in the website
NoOfJS	-> Number of JS in the website
NoOfSelfRef	-> Number of self reference which is the reference to the same website
NoOfEmptyRef	-> Number of empty reference which is the reference to the empty website
NoOfExternalRef	-> Number of external reference which is the reference to the external website

######### OUTPUT:
label -> Class label: 1 = legitimate URL, 0 = phishing URL

'''

In [None]:
# Get possible values for each column for better understanding of data.
# `df` is a Spark DataFrame: a Spark Column has no pandas-style .unique(), so
# the distinct values are computed by Spark and collected to the driver.
# The loop variable is not named `col` so it cannot shadow a `col` helper
# that a star-import may have brought into scope.
# NOTE(review): high-cardinality columns (e.g. URL) will print very large sets.
for column_name in df.columns:
    distinct_rows = df.select(column_name).distinct().collect()
    possibleValues = {row[0] for row in distinct_rows}
    print(f"\nThere are {len(possibleValues)} possible values in column '{column_name}' are: {possibleValues}")

In [None]:
# Get statistics for each column.
# `df` is a Spark DataFrame; pandas-style describe(include='all'), isnull()
# and len() are not available on it, so a pandas copy is materialized first.
# NOTE(review): toPandas() collects the whole dataset onto the driver —
# confirm it fits in memory.
pandas_df = df.toPandas()
stats = pandas_df.describe(include='all')

# Get null instances for each column and append them as extra summary rows
null_count = pandas_df.isnull().sum()
stats.loc['null_count'] = null_count
stats.loc['null_percentage'] = null_count / len(pandas_df) * 100

# Print Statistics' output
print(tabulate(stats, headers='keys', tablefmt='psql'))

In [None]:
#Visualize Data


In [None]:
#Heat map and correlation matrix

In [None]:
# Split labels and Features

In [None]:
# Preprocess Data

In [None]:
# Split Data (80% train / 20% test, fixed seed for reproducibility).
# sklearn's train_test_split needs an indexable, sized container, which a
# Spark DataFrame is not, so the data is converted to pandas before splitting.
# NOTE(review): toPandas() collects the whole dataset onto the driver —
# confirm it fits in memory.
train_df, test_df = train_test_split(df.toPandas(), test_size=0.2, random_state=42)

## 3- Model & Training

## 4- Evaluation