In [70]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report

In [71]:
df = pd.read_csv('final_combined.csv')

In [72]:
df

Unnamed: 0,email,category
0,Congratulations! You've won a $1000 Walmart gi...,spam
1,Join us for a special event this weekend!,social
2,Limited time offer: 50% off your next purchase!,promotional
3,"Hey, just checking in to see how you're doing.",personal
4,Your bank statement is ready for review.,finance
...,...,...
2998,Please confirm your email address to continue ...,important
2999,Your scheduled payment is due in 2 days. Check...,important
3000,Your account requires immediate attention. Ple...,important
3001,Your recent purchase receipt is available. Che...,finance


In [73]:
X = df['email'].astype(str) 
y = df['category']

In [74]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [75]:
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),  # Vectorizes the raw text
    ('classifier', RandomForestClassifier())  # Random Forest classifier
])

In [83]:
param_grid = {
    'tfidf__max_df': [0.7, 0.8, 0.85],
    'tfidf__min_df': [0.01, 0.05],
    'tfidf__ngram_range': [(1, 1), (1, 2)],
    'classifier__n_estimators': [50, 100, 200],  # Reduced upper limit
    'classifier__max_depth': [None, 5, 10, 15],  # Lower depth
    'classifier__min_samples_split': [2, 5, 10], 
    'classifier__max_features': ['auto', 'sqrt'],  # New hyperparameter
    'classifier__class_weight': ['balanced', None]
}

In [84]:
grid_search = GridSearchCV(pipeline, param_grid, cv=5, n_jobs=-1, verbose=2, scoring='accuracy')

In [85]:
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 1728 candidates, totalling 8640 fits


In [86]:
print("Best parameters found: ", grid_search.best_params_)

Best parameters found:  {'classifier__class_weight': None, 'classifier__max_depth': None, 'classifier__max_features': 'sqrt', 'classifier__min_samples_split': 10, 'classifier__n_estimators': 200, 'tfidf__max_df': 0.7, 'tfidf__min_df': 0.01, 'tfidf__ngram_range': (1, 1)}


In [87]:
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

In [88]:
train_accuracy = best_model.score(X_train, y_train)
test_accuracy = accuracy_score(y_test, y_pred)

print(f"Training Accuracy: {train_accuracy:.4f}")
print(f"Test Accuracy: {test_accuracy:.4f}")

Training Accuracy: 0.9600
Test Accuracy: 0.7870


In [89]:
print("Classification Report:")
print(classification_report(y_test, y_pred))

Classification Report:
              precision    recall  f1-score   support

     finance       1.00      0.33      0.50         3
   important       0.00      0.00      0.00         3
    personal       0.00      0.00      0.00         3
 promotional       1.00      0.67      0.80         3
      social       1.00      0.33      0.50         3
        spam       0.00      0.00      0.00         3
     finance       0.82      0.81      0.81       100
   important       0.70      0.84      0.77       115
    personal       0.84      0.85      0.84        73
 promotional       0.79      0.89      0.84       103
      social       0.80      0.63      0.71       103
        spam       0.81      0.81      0.81        89

    accuracy                           0.79       601
   macro avg       0.65      0.51      0.55       601
weighted avg       0.78      0.79      0.78       601

