# Problem 5

## Part 1:
**Step 1:** Load the Dataset into a Pandas DataFrame

In [1]:
import pandas as pd

# Read the phishing-email feature table from the CSV file
df = pd.read_csv("email_phishing_data.csv")

# Preview the leading five rows to check that the columns parsed as expected
df.head(n=5)

Unnamed: 0,num_words,num_unique_words,num_stopwords,num_links,num_unique_domains,num_email_addresses,num_spelling_errors,num_urgent_keywords,label
0,140,94,52,0,0,0,0,0,0
1,5,5,1,0,0,0,0,0,0
2,34,32,15,0,0,0,0,0,0
3,6,6,2,0,0,0,0,0,0
4,9,9,2,0,0,0,0,0,0


In [2]:
# Summary statistics (count, mean, std, min, quartiles, max) for each column
df.describe()

Unnamed: 0,num_words,num_unique_words,num_stopwords,num_links,num_unique_domains,num_email_addresses,num_spelling_errors,num_urgent_keywords,label
count,524846.0,524846.0,524846.0,524846.0,524846.0,524846.0,524846.0,524846.0,524846.0
mean,276.228,123.012167,80.045465,0.895588,0.347767,2.114897,24.694731,0.245301,0.01324
std,3335.457,201.626478,1023.33038,5.603001,1.774209,13.592682,311.312358,0.55932,0.114301
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,47.0,38.0,12.0,0.0,0.0,0.0,2.0,0.0,0.0
50%,120.0,79.0,34.0,0.0,0.0,0.0,8.0,0.0,0.0
75%,269.0,145.0,79.0,0.0,0.0,1.0,22.0,0.0,0.0
max,2339682.0,51251.0,720411.0,824.0,524.0,1150.0,190104.0,7.0,1.0


**Step 2:** Split Data into 80% Train and 20% Test with Stratified Sampling

In [3]:
from sklearn.model_selection import train_test_split

# Target vector, and the feature matrix made of every other column
y = df["label"]
X = df.drop(columns=["label"])

# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both partitions; the fixed seed makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Confirm that both partitions carry the same class proportions
print("Train class distribution:\n", y_train.value_counts(normalize=True))
print("\nTest class distribution:\n", y_test.value_counts(normalize=True))

Train class distribution:
 label
0    0.98676
1    0.01324
Name: proportion, dtype: float64

Test class distribution:
 label
0    0.986758
1    0.013242
Name: proportion, dtype: float64


## Part 2:
Implementing Logistic Regression on the Phishing Email Dataset

In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Build the classifier and fit it on the training partition in one step.
# max_iter is raised so the solver has room to converge.
logreg = LogisticRegression(random_state=42, max_iter=1000).fit(X_train, y_train)

# Score the held-out partition
y_pred = logreg.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {accuracy:.4f}")

Test Accuracy: 0.9868


## Part 3:
Feature Selection with Sequential Forward Selection (SFS)

In [5]:
from mlxtend.feature_selection import SequentialFeatureSelector as SFS

**Step 1:** Initialize SFS for 2, 3, 4, and 5 Features

In [6]:
# Settings shared by every selector: forward selection, scored by accuracy
# under 2-fold cross-validation, with n_jobs=-1.
sfs_common = dict(forward=True, scoring='accuracy', cv=2, n_jobs=-1)

# One selector per target subset size (2, 3, 4 and 5 features)
sfs_2 = SFS(logreg, k_features=2, **sfs_common)
sfs_3 = SFS(logreg, k_features=3, **sfs_common)
sfs_4 = SFS(logreg, k_features=4, **sfs_common)
sfs_5 = SFS(logreg, k_features=5, **sfs_common)

**Step 2:** Fit SFS and Report Selected Features

In [12]:
# Run forward selection for every subset size on the training split
for selector in (sfs_2, sfs_3, sfs_4, sfs_5):
    selector.fit(X_train, y_train)

# Column labels, used to translate selected column positions into names
feature_names = X_train.columns.tolist()

# Report the columns chosen for each subset size
for label, selector in (
    ("Top 2 Features:", sfs_2),
    ("Top 3 Features:", sfs_3),
    ("Top 4 Features:", sfs_4),
    ("Top 5 Features:", sfs_5),
):
    print(label, [feature_names[idx] for idx in selector.k_feature_idx_])

Top 2 Features: ['num_words', 'num_unique_domains']
Top 3 Features: ['num_words', 'num_unique_domains', 'num_spelling_errors']
Top 4 Features: ['num_words', 'num_unique_words', 'num_unique_domains', 'num_spelling_errors']
Top 5 Features: ['num_words', 'num_unique_words', 'num_unique_domains', 'num_email_addresses', 'num_spelling_errors']


## Part 4:
Implementing Custom Sequential Forward Selection (SFS) from Scratch

In [8]:
import numpy as np
from sklearn.model_selection import cross_val_score


class CustomSFS:
    """Greedy sequential forward feature selection scored by cross-validation.

    Starting from an empty selection, each round evaluates every remaining
    column together with the columns already chosen and keeps the column
    whose subset achieves the highest mean cross-validation score.

    Parameters
    ----------
    estimator : estimator object
        Model handed to ``cross_val_score`` for every candidate subset.
    k_features : int
        Number of features to select.
    scoring : str, callable or None, default=accuracy_score
        Either a scorer name understood by ``cross_val_score`` (for example
        ``'accuracy'``), ``None``, or a metric function with the
        ``metric(y_true, y_pred)`` signature. Metric functions are wrapped
        with ``sklearn.metrics.make_scorer`` before use.
    cv : default=2
        Forwarded to ``cross_val_score`` as its ``cv`` argument.

    Attributes
    ----------
    selected_features : list of int
        Column positions in the order in which they were selected.
    """

    def __init__(self, estimator, k_features, scoring=accuracy_score, cv=2):
        self.estimator = estimator
        self.k_features = k_features
        self.scoring = scoring
        self.cv = cv
        self.selected_features = []

    def fit(self, X, y):
        """Select ``k_features`` columns of ``X`` by greedy forward search.

        Parameters
        ----------
        X : pandas DataFrame or 2-D array-like
            Feature matrix; columns are addressed by position.
        y : array-like
            Target values passed to ``cross_val_score``.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``k_features`` exceeds the number of columns in ``X``, or if
            no candidate yields a comparable (non-NaN) score in some round.
        """
        # Local import so the class does not rely on an import made elsewhere.
        from sklearn.metrics import make_scorer

        n_features = X.shape[1]
        if self.k_features > n_features:
            raise ValueError(
                f"k_features={self.k_features} exceeds the number of "
                f"available features ({n_features})"
            )

        # cross_val_score expects a scorer name or scorer callable, whereas
        # the constructor default is a plain metric function; wrap callables
        # so the configured metric is actually the one being optimised.
        scoring = self.scoring
        if callable(scoring):
            scoring = make_scorer(scoring)

        # DataFrames are sliced positionally with .iloc; anything else is
        # converted to an array once, outside the search loops.
        use_iloc = hasattr(X, "iloc")
        X_values = None if use_iloc else np.asarray(X)

        # An ascending list gives a deterministic candidate order, so ties
        # are always resolved in favour of the lowest column position.
        remaining = list(range(n_features))
        self.selected_features = []

        for _ in range(self.k_features):
            best_score = -np.inf
            best_feature = None

            # Score each candidate added to the current selection
            for feature in remaining:
                columns = self.selected_features + [feature]
                X_subset = X.iloc[:, columns] if use_iloc else X_values[:, columns]

                scores = cross_val_score(
                    self.estimator, X_subset, y,
                    scoring=scoring, cv=self.cv, n_jobs=-1
                )
                mean_score = np.mean(scores)

                if mean_score > best_score:
                    best_score = mean_score
                    best_feature = feature

            # NaN scores never compare greater than -inf, so guard against a
            # round in which nothing could be ranked.
            if best_feature is None:
                raise ValueError(
                    "no candidate feature produced a valid cross-validation score"
                )

            self.selected_features.append(best_feature)
            remaining.remove(best_feature)

        return self

    def get_k_features(self):
        """Return the selected column positions in selection order."""
        return self.selected_features

In [9]:
# Build one CustomSFS per subset size, each with its own LogisticRegression
custom_sfs_2, custom_sfs_3, custom_sfs_4, custom_sfs_5 = (
    CustomSFS(
        estimator=LogisticRegression(max_iter=1000, random_state=42),
        k_features=size,
        cv=2,
    )
    for size in (2, 3, 4, 5)
)
custom_selectors = (custom_sfs_2, custom_sfs_3, custom_sfs_4, custom_sfs_5)

# Run the greedy search on the training split
for selector in custom_selectors:
    selector.fit(X_train, y_train)

# Translate the selected column positions into column names
custom_features_2, custom_features_3, custom_features_4, custom_features_5 = (
    [X_train.columns[pos] for pos in selector.get_k_features()]
    for selector in custom_selectors
)

# Report the selection for each subset size
for label, names in (
    ("CustomSFS Top 2 Features:", custom_features_2),
    ("CustomSFS Top 3 Features:", custom_features_3),
    ("CustomSFS Top 4 Features:", custom_features_4),
    ("CustomSFS Top 5 Features:", custom_features_5),
):
    print(label, names)

CustomSFS Top 2 Features: ['num_words', 'num_unique_domains']
CustomSFS Top 3 Features: ['num_words', 'num_unique_domains', 'num_spelling_errors']
CustomSFS Top 4 Features: ['num_words', 'num_unique_domains', 'num_spelling_errors', 'num_unique_words']
CustomSFS Top 5 Features: ['num_words', 'num_unique_domains', 'num_spelling_errors', 'num_unique_words', 'num_email_addresses']


**Validation That Results Are Equivalent:**

In [15]:
# Names picked by mlxtend for k=5. The comparison is done on sets because
# the two implementations list the same columns in different orders.
mlxtend_features_5 = [feature_names[idx] for idx in sfs_5.k_feature_idx_]
set(mlxtend_features_5) == set(custom_features_5)

True

In [16]:
# Check if both feature sets give same accuracy
from sklearn.metrics import accuracy_score

def _holdout_accuracy(columns):
    """Fit a fresh logistic regression on ``columns`` and return its test accuracy.

    Parameters
    ----------
    columns : list of str
        Column names used to subset ``X_train`` and ``X_test``.

    Returns
    -------
    float
        Accuracy of the fitted model on ``(X_test[columns], y_test)``.
    """
    # A new estimator is built for every call, with the same settings as
    # `logreg`, so the earlier model trained on all features is not
    # overwritten by this comparison.
    model = LogisticRegression(max_iter=1000, random_state=42)
    model.fit(X_train[columns], y_train)
    return accuracy_score(y_test, model.predict(X_test[columns]))


# Accuracy with the CustomSFS selection
custom_acc = _holdout_accuracy(custom_features_5)

# Accuracy with the mlxtend selection (same features, different order)
mlxtend_acc = _holdout_accuracy(mlxtend_features_5)

print(f"Custom Accuracy: {custom_acc:.4f}, mlxtend Accuracy: {mlxtend_acc:.4f}")

Custom Accuracy: 0.9868, mlxtend Accuracy: 0.9868


### **Phishing Email Detection - Feature Selection Report**  

#### **1. Objective**  
Implement **Sequential Forward Selection (SFS)** to identify the most discriminative features for detecting phishing emails using Logistic Regression.  

#### **2. Methodology**  
- **Dataset**: Phishing email data with features like `num_words`, `num_unique_domains`, `num_spelling_errors`, etc.  
- **Approach**:  
  - Used **mlxtend’s SFS** and a **custom SFS implementation** to select top `k` features.  
  - Evaluated feature subsets using **2-fold cross-validation** with Logistic Regression.  
  - Compared results between both implementations.  

#### **3. Key Findings**  
##### **Selected Features (Identical in Both Implementations)**  
| Features Selected | Top 2 | Top 3 | Top 4 | Top 5 |
|------------------|-------|-------|-------|-------|
| **CustomSFS** | `num_words`, `num_unique_domains` | + `num_spelling_errors` | + `num_unique_words` | + `num_email_addresses` |
| **mlxtend SFS** | `num_words`, `num_unique_domains` | + `num_spelling_errors` | + `num_unique_words` | + `num_email_addresses` |

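- **Consistency**: Both methods selected the **same features**. They differ only in **reporting order**: CustomSFS lists features in the order they were selected, while the mlxtend output lists them in dataset column order. This has no impact on model performance.  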
- **Performance**:  
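  - **Accuracy (Top 5 Features)**: **0.9868** (same for both implementations). Note that about 98.68% of the emails belong to the non-phishing class, so this figure matches the accuracy of always predicting "not phishing"; accuracy alone therefore does not show that the model detects phishing emails.  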
  - **Most Important Features**:  
    1. `num_words` (email length is a strong phishing indicator)  
    2. `num_unique_domains` (multiple suspicious domains → phishing)  
    3. `num_spelling_errors` (common in scam emails)  