# Step 1: Data Preparation #

In [2]:
import pandas as pd

# Load the dataset
file_path = 'emails.csv'
emails_df = pd.read_csv(file_path)

# Preview the first few rows. A bare expression is only rendered when it is
# the last statement of a cell, so the preview is shown explicitly with
# display() (available as a builtin inside Jupyter/IPython kernels).
display(emails_df.head())

# Print the index/column/dtype/memory summary. info() writes its report to
# stdout itself, so it is called as a plain statement.
emails_df.info()

# Total number of missing cells across the entire frame (sum per column, then overall)
missing_values = emails_df.isnull().sum().sum()

# Class distribution of the target variable 'Prediction'
target_variable_info = emails_df['Prediction'].value_counts()

# Last expression of the cell: rendered as (missing count, class counts)
missing_values, target_variable_info


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5172 entries, 0 to 5171
Columns: 3002 entries, Email No. to Prediction
dtypes: int64(3001), object(1)
memory usage: 118.5+ MB


(0,
 0    3672
 1    1500
 Name: Prediction, dtype: int64)

# Step 2: Model Selection #

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Separate predictors from the label. 'Email No.' is dropped alongside the
# target (presumably a row identifier rather than a feature - confirm).
non_feature_columns = ['Email No.', 'Prediction']
X = emails_df.drop(columns=non_feature_columns)
y = emails_df['Prediction']

# Hold out 20% of the rows for testing; random_state pins the shuffle so the
# split is reproducible.
# NOTE(review): the split is not stratified although the classes are
# imbalanced (3672 vs 1500) - consider whether stratify=y is wanted.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Learn the scaling statistics from the training rows only, then apply the
# same fitted transformation to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Confirm the shapes of the training and testing sets
tuple(part.shape for part in (X_train_scaled, X_test_scaled, y_train, y_test))


((4137, 3000), (1035, 3000), (4137,), (1035,))

# Step 3: Model Training and Evaluation #

In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Function to evaluate and print model performance
def evaluate_model(model, X_test, y_test):
    """Score a fitted model on a test set.

    Args:
        model: Object exposing ``predict(X)``; it is called once on ``X_test``.
        X_test: Features passed unchanged to ``model.predict``.
        y_test: True labels the predictions are compared against.

    Returns:
        A 4-tuple ``(accuracy, precision, recall, f1)`` computed with
        ``accuracy_score``, ``precision_score``, ``recall_score`` and
        ``f1_score``, each called as ``metric(y_test, predictions)``.
    """
    predictions = model.predict(X_test)
    # Order here fixes the order of the returned tuple.
    metric_functions = (accuracy_score, precision_score, recall_score, f1_score)
    return tuple(metric(y_test, predictions) for metric in metric_functions)

# Initialize models (random_state pinned to 42 on each estimator)
log_reg = LogisticRegression(random_state=42)
decision_tree = DecisionTreeClassifier(random_state=42)
svm = SVC(random_state=42)

# Report label -> estimator. Insertion order is the order in which models are
# trained, evaluated and printed.
models_by_label = {
    "Logistic Regression": log_reg,
    "Decision Tree": decision_tree,
    "SVM": svm,
}

# Train models on the scaled training split
for clf in models_by_label.values():
    clf.fit(X_train_scaled, y_train)

# Evaluate models on the scaled test split; each result is the
# (accuracy, precision, recall, f1) tuple returned by evaluate_model
log_reg_results, decision_tree_results, svm_results = (
    evaluate_model(clf, X_test_scaled, y_test) for clf in models_by_label.values()
)

# One shared template keeps the three report lines consistent
report_template = (
    "{} Results: Accuracy: {:.4f}, Precision: {:.4f}, "
    "Recall: {:.4f}, F1-score: {:.4f}"
)
all_results = (log_reg_results, decision_tree_results, svm_results)
for model_label, model_results in zip(models_by_label, all_results):
    print(report_template.format(model_label, *model_results))


Logistic Regression Results: Accuracy: 0.9652, Precision: 0.9037, Recall: 0.9831, F1-score: 0.9417
Decision Tree Results: Accuracy: 0.9237, Precision: 0.8653, Recall: 0.8682, F1-score: 0.8668
SVM Results: Accuracy: 0.9469, Precision: 0.9959, Recall: 0.8176, F1-score: 0.8980
