In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Read the CSV file from the working directory into a DataFrame and preview
# its first rows.
csv_path = "verified_online.csv"
df = pd.read_csv(csv_path)
df.head()

In [None]:
# Display the column labels of the loaded DataFrame.
df.columns

In [None]:
# Print the DataFrame summary (columns, non-null counts, dtypes, memory usage).
df.info()

In [None]:
# Display pandas' default descriptive statistics for the DataFrame.
df.describe()

In [None]:
# Parse both timestamp columns with pd.to_datetime so they hold datetime values.
for ts_col in ('submission_time', 'verification_time'):
    df[ts_col] = pd.to_datetime(df[ts_col])

In [None]:
# Convert the label columns to the pandas `category` dtype.
for cat_col in ('verified', 'online', 'target'):
    df[cat_col] = df[cat_col].astype('category')

# Derive the character count of every URL. The vectorized `.str.len()` accessor
# is used instead of `.apply(len)`: it yields NaN for a missing URL, whereas
# `len` raises TypeError when it is handed a float NaN.
df['url_length'] = df['url'].str.len()

# Show the updated dtypes and the newly added column.
df.info()

In [None]:
# After analyzing and observing the data, the `target` column seems more worth exploring than the `verified` and `online` columns.
# Remove the 'Other' value from the `target` column so the named targets can be analyzed better.

In [None]:
# Drop the rows whose target is 'Other'. `.copy()` makes an independent frame so
# the column assignment below does not raise SettingWithCopyWarning.
filtered_df = df[df['target'] != 'Other'].copy()

# A categorical column still lists 'Other' as a category after its rows are
# removed, and seaborn draws one bar slot per listed category. Dropping the
# unused categories keeps an empty 'Other' bar out of the chart.
# `.astype('category')` is a no-op when the column is already categorical.
filtered_df['target'] = (
    filtered_df['target'].astype('category').cat.remove_unused_categories()
)

# Horizontal count plot: one bar per remaining target.
fig, ax = plt.subplots(figsize=(14, 20))
sns.countplot(y='target', data=filtered_df, ax=ax)
ax.set_title('Distribution of Targets (Excluding "Other")')
ax.set_xlabel('Count')
ax.set_ylabel('Target')
plt.show()


In [None]:
# Here we can see that a huge number of companies are targeted.
# Let's see which 10 companies are affected the most.

In [None]:
# Rank the targets by row count and keep the `top_n` most frequent ones.
top_n = 10
target_counts = filtered_df['target'].value_counts()
top_targets = target_counts.nlargest(top_n)

# Keep only the rows that belong to one of those targets.
in_top_targets = filtered_df['target'].isin(top_targets.index)
top_targets_df = filtered_df[in_top_targets]

# Horizontal count plot with the bars ordered from most to least frequent.
fig, ax = plt.subplots(figsize=(14, 7))
sns.countplot(y='target', data=top_targets_df, order=top_targets.index, ax=ax)
ax.set(
    title=f'Top {top_n} Targets (Excluding "Other")',
    xlabel='Count',
    ylabel='Target',
)
plt.show()

In [None]:
# Frequency distribution of URL length over all rows: a histogram with a KDE
# curve overlaid.
fig, ax = plt.subplots(figsize=(15, 15))
sns.histplot(data=df, x='url_length', kde=True, ax=ax)
ax.set(
    title='Distribution of URL Lengths',
    xlabel='URL Length',
    ylabel='Frequency',
)
plt.show()

In [None]:
# The histogram shows that some URLs are very long, which suggests phishers
# use long URLs. Look up the row that holds the longest one.
max_length_index = df['url_length'].idxmax()
row_with_max_length = df.loc[max_length_index, :]
row_with_max_length

In [None]:
# For a more readable plot, keep only URLs whose length is at most
# `max_url_length` characters. The cutoff is named (instead of a bare 500) and
# repeated in the title so this figure is not mistaken for the unfiltered
# histogram, which previously carried the identical title.
max_url_length = 500
new_df = df[df['url_length'] <= max_url_length]

fig, ax = plt.subplots(figsize=(15, 15))
sns.histplot(data=new_df, x='url_length', kde=True, ax=ax)
ax.set_title(f'Distribution of URL Lengths (up to {max_url_length} characters)')
ax.set_xlabel('URL Length')
ax.set_ylabel('Frequency')
plt.show()


In [None]:
# Most URL lengths fall in the 0-100 range, with a sudden rise around 200.
# Next, compare the URL-length distribution across the 10 most attacked
# companies using one box per target.
# Note: the top-target filter is recomputed here because an issue was observed
# when this cell was run again without it.
top_n = 10
top_target_counts = top_targets_df['target'].value_counts().nlargest(top_n)
top_targets = top_target_counts.index.tolist()

# Keep only the rows that belong to the top targets.
is_top_target = top_targets_df['target'].isin(top_targets)
filtered_top_10 = top_targets_df[is_top_target]

fig, ax = plt.subplots(figsize=(14, 7))
sns.boxplot(data=filtered_top_10, x='target', y='url_length', order=top_targets, ax=ax)
ax.set(
    title='URL Lengths by Target (Top Attacked Companies)',
    xlabel='Target',
    ylabel='URL Length',
)
plt.xticks(rotation=90)  # vertical x-axis labels for readability
plt.show()

In [None]:
# From this plot we can say that phishers use almost the same URL length for Apple.
# For Microsoft and Internal Revenue Service, different URL lengths are sometimes used as well; we can see the outliers there.
# For Yahoo there are no outliers, so it might be said that phishing is easier on Yahoo sites.