<a href="https://colab.research.google.com/github/atheeqmazarik/machinelearningfinal/blob/main/machine_learning_report.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#Import libraries

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

import time

In [None]:
#Load the phishing email dataset; the path is relative to the working directory
dataset_path = 'Phishing_Email.csv'
df = pd.read_csv(dataset_path)

#Preview the first 5 rows to check the columns that were read
print(df.head(n=5))

   Unnamed: 0                                         Email Text  \
0           0  re : 6 . 1100 , disc : uniformitarianism , re ...   
1           1  the other side of * galicismos * * galicismo *...   
2           2  re : equistar deal tickets are you still avail...   
3           3  unbelievable new homes for the usa ! it ' s a ...   
4           4  software at incredibly low prices ( 86 % lower...   

       Email Type  
0      Safe Email  
1      Safe Email  
2      Safe Email  
3  Phishing Email  
4  Phishing Email  


In [None]:
#df.shape is (rows, columns): rows are observations (emails),
#columns are variables
n_observations, n_variables = df.shape

print("Number of observations:", n_observations)
print("Number of variables:", n_variables)


Number of observations: 18650
Number of variables: 3


In [None]:
#Email Type is mapped to binary values (Safe Email = 0, Phishing Email = 1).
#The identity entries (0 -> 0, 1 -> 1) keep this cell safe to re-run: without
#them a second run would map the already-encoded labels to NaN and the dropna
#below would then discard every row.
label_map = {'Safe Email': 0, 'Phishing Email': 1, 0: 0, 1: 1}
df['Email Type'] = df['Email Type'].map(label_map)

#Missing email bodies are replaced with an empty string (those rows are kept).
df['Email Text'] = df['Email Text'].fillna('')

#Rows whose label is missing or not one of the known classes are NaN after the
#mapping and are dropped. map() produces floats whenever a NaN was present, so
#the labels are cast back to integer class ids afterwards.
df = (
    df.dropna(subset=['Email Type'])
      .astype({'Email Type': int})
)

#Distribution of emails (safe [0] or phishing [1])
print("Class Distribution:")
print(df['Email Type'].value_counts())


Class Distribution:
Email Type
0    11322
1     7328
Name: count, dtype: int64


In [None]:
#Features are the raw email bodies; the target is the Email Type column
X, y = df['Email Text'], df['Email Type']

#Hold out 30% of the emails for testing (70% train). Stratifying on y keeps
#the class ratio the same in both splits; the fixed seed makes the split
#repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

In [None]:
#Convert text data (Email Text) into word-count features for the models.
#English stop words are removed and the vocabulary is capped at 1000 terms.

vectorizer = CountVectorizer(max_features=1000, stop_words='english')

#The vocabulary is learned from the training split only; the test split is
#encoded with that same vocabulary so no test information leaks into training.
X_train_vectorizer = vectorizer.fit_transform(X_train)
X_test_vectorizer = vectorizer.transform(X_test)

In [None]:
#Logistic Regression Model
#Fits on the vectorized training emails, predicts the test emails, and reports
#the evaluation metrics together with the combined fit + predict time.

#perf_counter() is a monotonic high-resolution clock meant for measuring
#elapsed time; time.time() is wall-clock time and can jump if the system
#clock is adjusted while the cell is running.
startTime = time.perf_counter()

model = LogisticRegression(random_state=42, max_iter=1000)

model.fit(X_train_vectorizer, y_train) #Training model

y_pred = model.predict(X_test_vectorizer) #Predictions

endTime = time.perf_counter()
executionTime = endTime - startTime #Seconds spent on training + prediction

#Evaluation Metrics
#Precision, recall and F1 use scikit-learn's default positive class (label 1)
print("Logistic Regression Results:")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.3f}")
print(f"Precision: {precision_score(y_test, y_pred):.3f}")
print(f"Recall: {recall_score(y_test, y_pred):.3f}")
print(f"F1 Score: {f1_score(y_test, y_pred):.3f}")

#Execution time
print(f"Execution time: {(executionTime):.3f}s")

Logistic Regression Results:
Accuracy: 0.949
Precision: 0.916
Recall: 0.957
F1 Score: 0.936
Execution time: 0.194s


In [None]:
#Support Vector Machine (SVM) Model
#Fits a linear-kernel SVM on the vectorized training emails, predicts the test
#emails, and reports the evaluation metrics together with the combined
#fit + predict time.

#perf_counter() is a monotonic high-resolution clock meant for measuring
#elapsed time; time.time() is wall-clock time and can jump if the system
#clock is adjusted while the cell is running.
startTime = time.perf_counter()

svmModel = SVC(kernel='linear', random_state=42)
svmModel.fit(X_train_vectorizer, y_train)

#Predictions
y_pred = svmModel.predict(X_test_vectorizer)

endTime = time.perf_counter()
executionTime = endTime - startTime #Seconds spent on training + prediction

#Evaluation Metrics
#Precision, recall and F1 use scikit-learn's default positive class (label 1)
print("SVM Model Results:")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.3f}")
print(f"Precision: {precision_score(y_test, y_pred):.3f}")
print(f"Recall: {recall_score(y_test, y_pred):.3f}")
print(f"F1 Score: {f1_score(y_test, y_pred):.3f}")

#Execution time
print(f"Execution time: {(executionTime):.3f}s")

SVM Model Results:
Accuracy: 0.941
Precision: 0.903
Recall: 0.954
F1 Score: 0.927
Execution time: 14.225s


In [None]:
#Random Forest Model
#Fits a 100-tree random forest on the vectorized training emails, predicts the
#test emails, and reports the evaluation metrics together with the combined
#fit + predict time.

#perf_counter() is a monotonic high-resolution clock meant for measuring
#elapsed time; time.time() is wall-clock time and can jump if the system
#clock is adjusted while the cell is running.
startTime = time.perf_counter()

randomForest = RandomForestClassifier(n_estimators=100, random_state=42)
randomForest.fit(X_train_vectorizer, y_train)

#Predictions
y_pred = randomForest.predict(X_test_vectorizer)

endTime = time.perf_counter()
executionTime = endTime - startTime #Seconds spent on training + prediction

#Evaluation Metrics
#Precision, recall and F1 use scikit-learn's default positive class (label 1)
print("Random Forest Results:")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.3f}")
print(f"Precision: {precision_score(y_test, y_pred):.3f}")
print(f"Recall: {recall_score(y_test, y_pred):.3f}")
print(f"F1 Score: {f1_score(y_test, y_pred):.3f}")

#Execution time
print(f"Execution time: {(executionTime):.3f}s")

Random Forest Results:
Accuracy: 0.957
Precision: 0.925
Recall: 0.970
F1 Score: 0.947
Execution time: 14.089s
