In [57]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import nltk
nltk.download('stopwords')
nltk.download('punkt')
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
import re
from collections import Counter
from wordcloud import WordCloud
from sklearn.preprocessing import LabelEncoder
# Model Building
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,confusion_matrix,precision_score
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
#--------------------------------------------

#Goal: create the final dataset from the 7 email phishing dataset (the union of all small datasets)
# list of current baby sets: data1, data2, data3, data4, data5, data6, data7
# to do list:
# 1. try to reach a good number of emails, around 100k        [  X  ] 
# 2. figure out what to do with the Nan datasets: data2_nan, data3_nan,data4_nan,data5_nan  [  ]
# 3. deal witht he weird ascii data in all datasets, best to remove them imo, but for later [  ]
# 4. before joining the small datasets, make sure to remove sender from dataset 1, 2 , 3 , 4, 6. and add url (1 if contains, 0 if not) to dataset 5, 7. [ X ] 
# statistics of final_dataset:
#(202917, 4), 94305 is phishing, 108612 is safe

url_pattern = r'(https?://\S+|www\.\S+)'
#--------------------------------------------
data1 = pd.read_csv("../datasets/CEAS_08.csv",encoding='latin1')
#CEAS 8 Dataset, has after removing the unwanted columns (39154, 5),   21842 is phishing, 17312 is safe , dtype of label is int64

#preprocessing so far for it ( no vectorization, just data cleaning...etc)
data1.drop(columns=["sender","receiver", "date"], inplace=True)
data1.rename(columns={'label': 'isPhishing'}, inplace=True) 

#--------------------------------------------

data2 = pd.read_csv("../datasets/TREC_07.csv", encoding='latin1', engine='python', on_bad_lines='skip')
#TREC 7 Dataset, has (53745, 5),   29392 is phishing, 24353   is safe , dtype of label is now int64, used to be object with other out of place labels( None, '   ', 'const')
data2.drop(columns=["sender","receiver", "date"], inplace=True)
data2.rename(columns={'label': 'isPhishing'}, inplace=True) 

#could probably do this step in a ..cleaner more effecient way...removing any value other than 1 or 0
data2_placeholder= data2[data2['isPhishing'].isin(['0', '1'])]
data2 = data2_placeholder
del data2_placeholder

#i removed the rows with label Nan in "isPhishing", will see what to do with that later, saved the Nan rows to data2_nan
data2_nan = data2[data2['isPhishing'].isna()]
data2 = data2.dropna(subset=['isPhishing'])

#--------------------------------------------

data3 = pd.read_csv("../datasets/TREC_06.csv", encoding='latin1',index_col=0, engine='python', on_bad_lines='skip') # i added teh index col because it fixed a problem..god bless stackoverflow

# Trec 6, has (16382, 5), 3989 are phsihing, 12393 are safe, dtype of label is int 64 as well, make sure numbers are updated after cleaning
#NAn problem handled first, decide to kill it or keep it later
data3_nan = data3[data3['label'].isna()]
data3 = data3.dropna(subset=['label'])
data3.drop(columns=["receiver", "date"], inplace=True)
data3.rename(columns={'label': 'isPhishing'}, inplace=True) 

#--------------------------------------------

data4 = pd.read_csv("../datasets/TREC_05.csv", encoding='latin1', engine='python', on_bad_lines='skip')
#Trec 5, has (55210, 5), 22932 are phishing, 22932 are safe
#Nan Problem
data4_nan = data4[data4['label'].isna()]
data4 = data4.dropna(subset=['label'])
#Label mismatch type problem 
data4 = data4[data4['label'].isin(['0', '1'])]
data4['label'] = data4['label'].astype(int)

#dropping 2 columns and renaming one
data4.drop(columns=["sender","receiver", "date"], inplace=True)
data4.rename(columns={'label': 'isPhishing'}, inplace=True) 

#--------------------------------------------
data5 = pd.read_csv("../datasets/Enron.csv", encoding='latin1', engine='python', on_bad_lines='skip')
#features: subject, body, isphishing
#Enron , has (29763, 3), 13976 are phishing, 15787 safe, Enron has weird emails probably better to discard it!!!!!!!!!!!!!!!!!!!!
data5.rename(columns={'label': 'isPhishing'}, inplace=True)
data5_nan = data5[data5['isPhishing'].isna()]
data5 = data5.dropna(subset=['isPhishing'])
data5 = data5[data5['isPhishing'].isin(['0', '1'])]
data5['isPhishing'] = data5['isPhishing'].astype('int64')
#add urls column
data5['urls'] = 0   # creates a new column 'url' and fills it with 0
data5['urls'] = data5['body'].apply(lambda x: 1 if re.search(url_pattern, str(x)) else 0)

#--------------------------------------------
data6 = pd.read_csv("../datasets/Assassin.csv", encoding='latin1', engine='python', on_bad_lines='skip')
#Assassin dataset, (5805, 5), 1716 phishing, 4089 safe, 
data6.drop(columns=["sender","receiver", "date"], inplace=True)
data6.rename(columns={'label': 'isPhishing'}, inplace=True) 
data6_nan = data6[data6['isPhishing'].isna()]
data6 = data6.dropna(subset=['isPhishing'])
data6['isPhishing'] = data6['isPhishing'].round().astype('int64')

#--------------------------------------------
data7 = pd.read_csv("../datasets/Ling.csv", encoding='latin1', engine='python', on_bad_lines='skip')
# Ling, has 3 features subject, body, isPhishing,(2859, 3), 458 phishing emails, 2401 safe emails.
data7.rename(columns={'label': 'isPhishing'}, inplace=True) 
data7_nan = data7[data7['isPhishing'].isna()]
data7 = data7.dropna(subset=['isPhishing'])
#add urls column
data7['urls'] = 0   # creates a new column 'url' and fills it with 0
data7['urls'] = data5['body'].apply(lambda x: 1 if re.search(url_pattern, str(x)) else 0)

#Combine all datasets
final_data = pd.concat([data1, data2, data3, data4, data5, data6, data7], ignore_index=True)
final_data['isPhishing'] = final_data['isPhishing'].astype(int)
final_data['urls'] = final_data['isPhishing'].astype(int)
# 2.Drop exact duplicates
final_data = final_data.drop_duplicates()
final_data = final_data.reset_index(drop=True)
# save it to a csv
final_data.to_csv('../datasets/final_dataset.csv', index=False, encoding='latin1', sep=',')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\omark\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\omark\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# some useful functions!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
#to check a specific row
data1.loc[21609, 'body']
#check a sample
data1.sample(15)
#check features/ columns
data1.columns
#check general overview (rows and number of columns)
data1.shape
#see how many rows are in the dataset
len(data1)

# delete a dataset
del data223344
#drop a column
data4.drop(columns=["c1", "c2"], inplace=True)
# rename a column
data4.rename(columns={'oldname': 'newname'}, inplace=True) 
#check type and number of unique values
print(data1['isPhishing'].dtype)     
print(data1['isPhishing'].unique())   # check the unique values returns an array
# check the number of rows with each value
data1['label'].value_counts()
data3['label'].value_counts(dropna=False)

#convert to another datatype
data1['label'] = data['label'].astype(new data type)
#keep certain values:
# Keep only rows where label is 0 or 1
data3 = data3[data3['label'].isin(['0', '1'])]
#look at one row
data1.loc[123,'column']



In [None]:
#Plotting 
# Set style for better looking plots
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Create a figure with multiple subplots
fig = plt.figure(figsize=(15, 12))

# 1. Basic Distribution of Phishing vs Non-Phishing
plt.subplot(2, 3, 1)
counts = final_data['isPhishing'].value_counts()
labels = ['Non-Phishing (0)', 'Phishing (1)']
colors = ['#2ecc71', '#e74c3c']
plt.pie(counts.values, labels=labels, autopct='%1.1f%%', colors=colors, startangle=90)
plt.title('Distribution of Phishing vs Non-Phishing Emails', fontsize=12, fontweight='bold')

# 2. Bar plot of the same data
plt.subplot(2, 3, 2)
bars = plt.bar(['Non-Phishing', 'Phishing'], counts.values, color=colors)
plt.title('Email Count by Category', fontsize=12, fontweight='bold')
plt.ylabel('Number of Emails')
# Add value labels on bars
for bar in bars:
    height = bar.get_height()
    plt.text(bar.get_x() + bar.get_width()/2., height,
             f'{int(height):,}',
             ha='center', va='bottom', fontweight='bold')

# 3. If you have email length data (create if you don't have it)
if 'email_length' not in final_data.columns:
    # Assuming you have a text column, replace 'email_text' with your actual column name
    # final_data['email_length'] = final_data['email_text'].str.len()
    # For demonstration, let's create synthetic length data
    np.random.seed(42)
    final_data['email_length'] = np.random.normal(500, 200, len(final_data))
    final_data['email_length'] = np.abs(final_data['email_length'])  # Make sure no negative lengths

plt.subplot(2, 3, 3)
phishing_lengths = final_data[final_data['isPhishing'] == 1]['email_length']
non_phishing_lengths = final_data[final_data['isPhishing'] == 0]['email_length']

plt.hist([non_phishing_lengths, phishing_lengths], bins=50, alpha=0.7, 
         label=['Non-Phishing', 'Phishing'], color=colors)
plt.title('Email Length Distribution', fontsize=12, fontweight='bold')
plt.xlabel('Email Length (characters)')
plt.ylabel('Frequency')
plt.legend()

# 4. Box plot comparing email lengths
plt.subplot(2, 3, 4)
data_to_plot = [non_phishing_lengths, phishing_lengths]
box_plot = plt.boxplot(data_to_plot, labels=['Non-Phishing', 'Phishing'], patch_artist=True)
box_plot['boxes'][0].set_facecolor(colors[0])
box_plot['boxes'][1].set_facecolor(colors[1])
plt.title('Email Length Comparison', fontsize=12, fontweight='bold')
plt.ylabel('Email Length (characters)')


# 5. Summary statistics
plt.subplot(2, 3, 6)
plt.axis('off')  # Turn off axis for text display

# Calculate some statistics
total_emails = len(final_data)
phishing_count = (final_data['isPhishing'] == 1).sum()
non_phishing_count = (final_data['isPhishing'] == 0).sum()
phishing_percentage = (phishing_count / total_emails) * 100

stats_text = f"""
Dataset Summary Statistics

Total Emails: {total_emails:,}

Phishing Emails: {phishing_count:,}
Non-Phishing Emails: {non_phishing_count:,}

Phishing Rate: {phishing_percentage:.1f}%

Dataset Balance:
{'Balanced' if abs(phishing_percentage - 50) < 10 else 'Imbalanced'}

Avg Email Length:
Non-Phishing: {non_phishing_lengths.mean():.0f} chars
Phishing: {phishing_lengths.mean():.0f} chars
"""

plt.text(0.1, 0.5, stats_text, fontsize=11, verticalalignment='center',
         bbox=dict(boxstyle="round,pad=0.3", facecolor="lightblue", alpha=0.7))

# Adjust layout and display
plt.tight_layout()
plt.suptitle('Phishing Email Dataset Analysis', fontsize=16, fontweight='bold', y=0.98)
plt.subplots_adjust(top=0.93)
plt.show()

# Optional: Individual plots with more detail
def create_detailed_distribution_plot():
    """Create a more detailed distribution plot"""
    plt.figure(figsize=(10, 6))
    
    # Create a more detailed bar plot
    counts = final_data['isPhishing'].value_counts().sort_index()
    bars = plt.bar(['Non-Phishing (0)', 'Phishing (1)'], counts.values, 
                   color=['#3498db', '#e74c3c'], alpha=0.8, edgecolor='black', linewidth=1.2)
    
    plt.title('Phishing Email Dataset Distribution', fontsize=16, fontweight='bold', pad=20)
    plt.ylabel('Number of Emails', fontsize=12)
    plt.xlabel('Email Category', fontsize=12)
    
    # Add value labels and percentages
    total = counts.sum()
    for i, bar in enumerate(bars):
        height = bar.get_height()
        percentage = (height / total) * 100
        plt.text(bar.get_x() + bar.get_width()/2., height + total*0.01,
                f'{int(height):,}\n({percentage:.1f}%)',
                ha='center', va='bottom', fontweight='bold', fontsize=11)
    
    # Add grid for better readability
    plt.grid(axis='y', alpha=0.3, linestyle='--')
    plt.ylim(0, max(counts.values) * 1.1)
    
    plt.tight_layout()
    plt.show()

# Call the detailed plot function
create_detailed_distribution_plot()

print("Plots generated successfully!")
print(f"Dataset contains {len(final_data):,} emails")
print(f"Phishing emails: {(final_data['isPhishing'] == 1).sum():,}")
print(f"Non-phishing emails: {(final_data['isPhishing'] == 0).sum():,}")

In [58]:
final_data.shape


(202917, 4)

In [49]:
final_data['urls'].value_counts()

urls
0    108612
1     94305
Name: count, dtype: int64