# Course Project: Malicious URL Detection
## Dataset EDA

### Import Libraries

In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix

import tldextract
import re
from urllib.parse import urlparse

import plotly.express as px
import altair as alt
import statsmodels.api as sm

# Notebook-wide display configuration: let wide frames show up to 100 columns
# and wrap printed output at 120 characters.
pd.options.display.max_columns = 100
pd.options.display.width = 120

# Apply seaborn's default theme to all subsequent matplotlib figures.
sns.set_theme()

### Malicious URLs dataset

In [27]:
# Load the malicious-URLs dataset and print a quick structural overview
# (first rows, shape, dtypes, null counts, duplicate count, class balance).
df = pd.read_csv("dataset/malicious_phish.csv")

print(df.head(10))
print("\nShape:", df.shape)
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called bare; wrapping it in print() appended a stray "None" line.
print("\nInfo:")
df.info()
print("\nNulls per column:\n", df.isna().sum())
print("\nDuplicate rows:", df.duplicated().sum())

# Use the "type" column as the target when it exists; otherwise fall back to
# the last column of the frame.
target_col = "type" if "type" in df.columns else df.columns[-1]
print("Target column:", target_col)
# Class balance as proportions; dropna=False keeps missing labels visible.
print(df[target_col].value_counts(dropna=False, normalize=True))

print("Columns:", df.columns.tolist())

                                                 url        type
0                                   br-icloud.com.br    phishing
1                mp3raid.com/music/krizz_kaliko.html      benign
2                    bopsecrets.org/rexroth/cr/1.htm      benign
3  http://www.garage-pirenne.be/index.php?option=...  defacement
4  http://adventure-nicaragua.net/index.php?optio...  defacement
5  http://buzzfil.net/m/show-art/ils-etaient-loin...      benign
6      espn.go.com/nba/player/_/id/3457/brandon-rush      benign
7     yourbittorrent.com/?q=anthony-hamilton-soulife      benign
8       http://www.pashminaonline.com/pure-pashminas  defacement
9      allmusic.com/album/crazy-from-the-heat-r16990      benign

Shape: (651191, 2)

Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 651191 entries, 0 to 651190
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   url     651191 non-null  object
 1   type    651191 non-null  object


### Phishing URL dataset

In [26]:
# Load the phishing-URL dataset and print a quick structural overview
# (first rows, shape, dtypes, null counts, duplicate count, class balance).
df = pd.read_csv("dataset/Phishing_URLs.csv")

print(df.head(10))
print("\nShape:", df.shape)
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called bare; wrapping it in print() appended a stray "None" line.
print("\nInfo:")
df.info()
print("\nNulls per column:\n", df.isna().sum())
print("\nDuplicate rows:", df.duplicated().sum())

# Use the "type" column as the target when it exists; otherwise fall back to
# the last column of the frame.
# NOTE(review): the saved output shows this file's label column as "Type"
# (capitalised), so the exact-match test fails and the positional fallback is
# what selects it here.
target_col = "type" if "type" in df.columns else df.columns[-1]
print("Target column:", target_col)
# Class balance as proportions; dropna=False keeps missing labels visible.
print(df[target_col].value_counts(dropna=False, normalize=True))

print("Columns:", df.columns.tolist())

                                                 url      Type
0  https://docs.google.com/presentation/d/e/2PACX...  Phishing
1    https://btttelecommunniccatiion.weeblysite.com/  Phishing
2                        https://kq0hgp.webwave.dev/  Phishing
3  https://brittishtele1bt-69836.getresponsesite....  Phishing
4         https://bt-internet-105056.weeblysite.com/  Phishing
5                         https://teleej.weebly.com/  Phishing
6          https://maryleyshon.wixsite.com/my-site-1  Phishing
7           https://chamakhman.wixsite.com/my-site-4  Phishing
8                          https://posts-ch.buzz/ch/  Phishing
9                       https://tinyurl.com/bdfpfyur  Phishing

Shape: (54807, 2)

Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54807 entries, 0 to 54806
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   url     54807 non-null  object
 1   Type    54807 non-null  object
dtypes: object(2)
memory usag

In [25]:
# Load URL_dataset.csv and print a quick structural overview
# (first rows, shape, dtypes, null counts, duplicate count, class balance).
df = pd.read_csv("dataset/URL_dataset.csv")

print(df.head(10))
print("\nShape:", df.shape)
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called bare; wrapping it in print() appended a stray "None" line to the
# cell output.
print("\nInfo:")
df.info()
print("\nNulls per column:\n", df.isna().sum())
print("\nDuplicate rows:", df.duplicated().sum())

# Use the "type" column as the target when it exists; otherwise fall back to
# the last column of the frame.
target_col = "type" if "type" in df.columns else df.columns[-1]
print("Target column:", target_col)
# Class balance as proportions; dropna=False keeps missing labels visible.
print(df[target_col].value_counts(dropna=False, normalize=True))

print("Columns:", df.columns.tolist())

                         url        type
0     https://www.google.com  legitimate
1    https://www.youtube.com  legitimate
2   https://www.facebook.com  legitimate
3      https://www.baidu.com  legitimate
4  https://www.wikipedia.org  legitimate
5     https://www.reddit.com  legitimate
6      https://www.yahoo.com  legitimate
7   https://www.google.co.in  legitimate
8         https://www.qq.com  legitimate
9     https://www.amazon.com  legitimate

Shape: (450176, 2)

Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 450176 entries, 0 to 450175
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   url     450176 non-null  object
 1   type    450176 non-null  object
dtypes: object(2)
memory usage: 6.9+ MB
None

Nulls per column:
 url     0
type    0
dtype: int64

Duplicate rows: 0
Target column: type
type
legitimate    0.768006
phishing      0.231994
Name: proportion, dtype: float64
Columns: ['url', 'type']


### Malicious And Benign URLs

In [24]:
# Load the malicious-and-benign URL dataset and print a quick structural
# overview (first rows, shape, dtypes, null counts, duplicate count, class
# balance).
# NOTE(review): the saved output shows an "Unnamed: 0" column whose first rows
# mirror the row index; it looks like a persisted index that could be absorbed
# with index_col=0 -- confirm nothing downstream relies on that column first.
df = pd.read_csv("dataset/urldata.csv")

print(df.head(10))
print("\nShape:", df.shape)
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called bare; wrapping it in print() appended a stray "None" line.
print("\nInfo:")
df.info()
print("\nNulls per column:\n", df.isna().sum())
print("\nDuplicate rows:", df.duplicated().sum())

# Use the "type" column as the target when it exists; otherwise fall back to
# the last column of the frame.
# NOTE(review): per the saved output this file has no "type" column, so the
# fallback picks the last column ("result") rather than the string "label"
# column -- confirm that is the intended target.
target_col = "type" if "type" in df.columns else df.columns[-1]
print("Target column:", target_col)
# Class balance as proportions; dropna=False keeps missing labels visible.
print(df[target_col].value_counts(dropna=False, normalize=True))

print("Columns:", df.columns.tolist())

   Unnamed: 0                        url   label  result
0           0     https://www.google.com  benign       0
1           1    https://www.youtube.com  benign       0
2           2   https://www.facebook.com  benign       0
3           3      https://www.baidu.com  benign       0
4           4  https://www.wikipedia.org  benign       0
5           5     https://www.reddit.com  benign       0
6           6      https://www.yahoo.com  benign       0
7           7   https://www.google.co.in  benign       0
8           8         https://www.qq.com  benign       0
9           9     https://www.amazon.com  benign       0

Shape: (450176, 4)

Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 450176 entries, 0 to 450175
Data columns (total 4 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   Unnamed: 0  450176 non-null  int64 
 1   url         450176 non-null  object
 2   label       450176 non-null  object
 3   result      450176 non-nul