In [54]:
# Install libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import GridSearchCV
import joblib
import numpy as np

In [55]:
# Load the DataFrame WITH HEADER (Fix #1)
df = pd.read_csv('features_with_clusters.csv')

# Verify columns loaded
print("Columns loaded from CSV:", df.columns.tolist())
print(f"Total rows: {len(df)}")
print("\nFirst few rows:")
print(df.head())

Columns loaded from CSV: ['email', 'spam', 'exclamation_marks', 'question_marks', 'flagged_words', 'cluster']
Total rows: 5728

First few rows:
                                               email  spam  exclamation_marks  \
0  Subject: naturally irresistible your corporate...     1                  0   
1  Subject: the stock trading gunslinger  fanny i...     1                  0   
2  Subject: unbelievable new homes made easy  im ...     1                  0   
3  Subject: 4 color printing special  request add...     1                  2   
4  Subject: do not have money , get software cds ...     1                  1   

   question_marks  flagged_words  cluster  
0               0              0        0  
1               0              0        0  
2               0              1        0  
3               0              0        0  
4               1              0        0  


In [56]:
# Clean the target variable
df['spam'] = pd.to_numeric(df['spam'], errors='coerce')
df.dropna(subset=['spam'], inplace=True)
df = df[df['spam'].isin([0.0, 1.0])]
df['spam'] = df['spam'].astype(int)
df.reset_index(drop=True, inplace=True)

In [57]:
# Convert numeric columns properly
numeric_cols = ['exclamation_marks', 'question_marks', 'flagged_words', 'cluster']
for col in numeric_cols:
    df[col] = pd.to_numeric(df[col], errors='coerce')
df[numeric_cols] = df[numeric_cols].fillna(0).astype(np.int64)

print(f"\nRows after cleaning: {len(df)}")
print(f"Spam distribution:\n{df['spam'].value_counts()}")
print(f"\nCluster distribution:\n{df['cluster'].value_counts()}")


Rows after cleaning: 5728
Spam distribution:
spam
0    4360
1    1368
Name: count, dtype: int64

Cluster distribution:
cluster
1    2954
0    2774
Name: count, dtype: int64


In [58]:
# FIX: Use TF-IDF on email text + ALL the features
print("\n--- Extracting TF-IDF features from email text ---")

# Extract TF-IDF features from email text
tfidf = TfidfVectorizer(
    stop_words='english',
    max_features=1000,  # Limit features for XGBoost performance
    min_df=2,  # Ignore very rare words
    max_df=0.95  # Ignore very common words
)

tfidf_matrix = tfidf.fit_transform(df['email']).toarray()
tfidf_df = pd.DataFrame(
    tfidf_matrix,
    columns=[f'tfidf_{word}' for word in tfidf.get_feature_names_out()]
)

print(f"TF-IDF matrix shape: {tfidf_df.shape}")

# FIX: Use cluster as categorical feature with one-hot encoding
print("\n--- One-hot encoding cluster feature ---")

cluster_dummies = pd.get_dummies(df['cluster'], prefix='cluster')
print(f"Cluster dummy variables:\n{cluster_dummies.columns.tolist()}")


--- Extracting TF-IDF features from email text ---
TF-IDF matrix shape: (5728, 1000)

--- One-hot encoding cluster feature ---
Cluster dummy variables:
['cluster_0', 'cluster_1']


In [59]:
# Combine ALL features
X_basic = df[['exclamation_marks', 'question_marks', 'flagged_words']].reset_index(drop=True)
X_combined = pd.concat([X_basic, cluster_dummies, tfidf_df], axis=1)
y = df['spam'].reset_index(drop=True)

print(f"\nFinal feature matrix shape: {X_combined.shape}")
print(f"Total features: {len(X_combined.columns)}")
print(f"Feature breakdown:")
print(f"  - Basic features: 3")
print(f"  - Cluster features: {len(cluster_dummies.columns)}")
print(f"  - TF-IDF features: {len(tfidf_df.columns)}")


Final feature matrix shape: (5728, 1005)
Total features: 1005
Feature breakdown:
  - Basic features: 3
  - Cluster features: 2
  - TF-IDF features: 1000


In [60]:
# Split the data
X_train, X_test, y_train, y_test = train_test_split(
    X_combined, y, test_size=0.2, random_state=42, stratify=y
)

print(f"\nTraining set size: {X_train.shape[0]}")
print(f"Testing set size: {X_test.shape[0]}")
print(f"Training spam ratio: {y_train.sum() / len(y_train):.3f}")
print(f"Testing spam ratio: {y_test.sum() / len(y_test):.3f}")

# Define hyperparameter grid (smaller for faster search)
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [4, 6, 8],
    'learning_rate': [0.05, 0.1],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9]
}

# Calculate class imbalance ratio
spam_ratio = y_train.sum() / len(y_train)
scale_pos_weight = (1 - spam_ratio) / spam_ratio

print(f"\nCalculated scale_pos_weight: {scale_pos_weight:.2f}")


Training set size: 4582
Testing set size: 1146
Training spam ratio: 0.239
Testing spam ratio: 0.239

Calculated scale_pos_weight: 3.19


In [62]:
# Calculate class imbalance ratio
spam_ratio = y_train.sum() / len(y_train)
scale_pos_weight = (1 - spam_ratio) / spam_ratio

print(f"\nCalculated scale_pos_weight: {scale_pos_weight:.2f}")

# Initialize GridSearchCV
grid_search = GridSearchCV(
    estimator=XGBClassifier(
        objective='binary:logistic',
        eval_metric='logloss',
        use_label_encoder=False,
        random_state=42,
        scale_pos_weight=scale_pos_weight
    ),
    param_grid=param_grid,
    scoring='f1',
    cv=3,
    verbose=1,
    n_jobs=-1
)

print("\n--- Starting Grid Search (this may take a while) ---")
grid_search.fit(X_train, y_train)

print(f"\nBest parameters found: {grid_search.best_params_}")
print(f"Best F1 score (CV): {grid_search.best_score_:.4f}")
best_xgb_model = grid_search.best_estimator_


Calculated scale_pos_weight: 3.19

--- Starting Grid Search (this may take a while) ---
Fitting 3 folds for each of 48 candidates, totalling 144 fits


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.



Best parameters found: {'colsample_bytree': 0.8, 'learning_rate': 0.1, 'max_depth': 8, 'n_estimators': 200, 'subsample': 0.9}
Best F1 score (CV): 0.9566


In [63]:
# Evaluate the model
y_pred = best_xgb_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"\n‚úÖ Model Accuracy on Test Set: {accuracy:.4f}")

print("\nüìä Classification Report:")
print(classification_report(y_test, y_pred, target_names=['Not Spam', 'Spam']))

conf_mat = confusion_matrix(y_test, y_pred)
print("\nüî¢ Confusion Matrix:")
print(conf_mat)


‚úÖ Model Accuracy on Test Set: 0.9738

üìä Classification Report:
              precision    recall  f1-score   support

    Not Spam       1.00      0.97      0.98       872
        Spam       0.91      0.99      0.95       274

    accuracy                           0.97      1146
   macro avg       0.95      0.98      0.97      1146
weighted avg       0.98      0.97      0.97      1146


üî¢ Confusion Matrix:
[[844  28]
 [  2 272]]


In [64]:
# Calculate and display specific metrics
tn, fp, fn, tp = conf_mat.ravel()
spam_recall = tp / (tp + fn)
spam_precision = tp / (tp + fp) if (tp + fp) > 0 else 0

print(f"\nüìà Key Spam Detection Metrics:")
print(f"  - Spam Recall (Sensitivity): {spam_recall:.2%} ({tp}/{tp + fn})")
print(f"  - Spam Precision: {spam_precision:.2%} ({tp}/{tp + fp})")
print(f"  - False Positive Rate: {fp / (fp + tn):.2%}")


üìà Key Spam Detection Metrics:
  - Spam Recall (Sensitivity): 99.27% (272/274)
  - Spam Precision: 90.67% (272/300)
  - False Positive Rate: 3.21%


In [65]:
# Feature importance analysis
feature_importance = pd.DataFrame({
    'feature': X_combined.columns,
    'importance': best_xgb_model.feature_importances_
}).sort_values('importance', ascending=False)

print("\nüéØ Top 20 Most Important Features:")
print(feature_importance.head(20).to_string(index=False))


üéØ Top 20 Most Important Features:
          feature  importance
        cluster_1    0.266194
        cluster_0    0.212883
      tfidf_vince    0.037109
      tfidf_enron    0.036419
      tfidf_model    0.015417
       tfidf_date    0.009091
exclamation_marks    0.007493
    tfidf_meeting    0.007486
        tfidf_pro    0.006942
 tfidf_assistance    0.006634
  tfidf_utilities    0.006407
       tfidf_2000    0.006360
   tfidf_research    0.006057
      tfidf_kevin    0.006036
    tfidf_dollars    0.005935
   tfidf_attached    0.005849
      tfidf_allow    0.005781
     tfidf_energy    0.005522
     tfidf_issues    0.005383
   tfidf_february    0.005302


In [66]:
# Save the model and vectorizer
model_filename = 'optimized_spam_detector_xgb_model.joblib'
vectorizer_filename = 'tfidf_vectorizer.joblib'

try:
    joblib.dump(best_xgb_model, model_filename)
    joblib.dump(tfidf, vectorizer_filename)
    print(f"\n‚úÖ Successfully saved model as: {model_filename}")
    print(f"‚úÖ Successfully saved TF-IDF vectorizer as: {vectorizer_filename}")
except Exception as e:
    print(f"‚ùå Error saving files: {e}")

# Save comprehensive results
results_filename = 'spam_detector_performance_report.txt'
with open(results_filename, 'w') as f:
    f.write("=" * 70 + "\n")
    f.write("XGBoost Spam Detector Performance Report (OPTIMIZED MODEL)\n")
    f.write("=" * 70 + "\n\n")

    f.write(f"Best Parameters: {grid_search.best_params_}\n")
    f.write(f"Best CV F1 Score: {grid_search.best_score_:.4f}\n\n")

    f.write(f"Test Set Accuracy: {accuracy:.4f}\n\n")

    f.write("Classification Report:\n")
    f.write(classification_report(y_test, y_pred, target_names=['Not Spam', 'Spam']))

    f.write("\n\nConfusion Matrix:\n")
    f.write(str(conf_mat))

    f.write("\n\nKey Metrics:\n")
    f.write(f"Spam Recall: {spam_recall:.2%}\n")
    f.write(f"Spam Precision: {spam_precision:.2%}\n")

    f.write("\n\nTop 20 Important Features:\n")
    f.write(feature_importance.head(20).to_string(index=False))

print(f"\n‚úÖ Performance report saved as: {results_filename}")


‚úÖ Successfully saved model as: optimized_spam_detector_xgb_model.joblib
‚úÖ Successfully saved TF-IDF vectorizer as: tfidf_vectorizer.joblib

‚úÖ Performance report saved as: spam_detector_performance_report.txt
