In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder


In [3]:
import re
data = pd.read_csv('/content/final_combined.csv')

def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)
    return text

data['email'] = data['email'].apply(preprocess_text)

label_encoder = LabelEncoder()
data['category'] = label_encoder.fit_transform(data['category'])

X = data['email']
y = data['category']


In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

vectorizer = TfidfVectorizer(max_df=0.8, min_df=5, ngram_range=(1, 2))
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)


In [5]:
# Hyperparameter tuning with GridSearchCV
param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],
    'gamma': [0.01, 0.1, 1, 10],
    'kernel': ['linear', 'rbf']
}

In [6]:
grid = GridSearchCV(SVC(decision_function_shape='ovr'), param_grid, cv=5)
grid.fit(X_train_tfidf, y_train)

# Best parameters
print(f"Best Parameters: {grid.best_params_}")

Best Parameters: {'C': 1, 'gamma': 1, 'kernel': 'rbf'}


In [7]:
# Train the SVM model with the best parameters
best_svm_model = grid.best_estimator_
best_svm_model.fit(X_train_tfidf, y_train)


In [10]:
# Predict on training data
y_train_pred = best_svm_model.predict(X_train_tfidf)

print(f"Accuracy: {accuracy_score(y_train, y_train_pred)}")

Accuracy: 0.9600333055786844


In [8]:
# Predict and evaluate
y_pred = best_svm_model.predict(X_test_tfidf)

# Evaluation metrics
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))

Accuracy: 0.8302828618968386
              precision    recall  f1-score   support

     finance       1.00      0.33      0.50         3
   important       0.00      0.00      0.00         3
    personal       0.00      0.00      0.00         3
 promotional       0.00      0.00      0.00         3
      social       0.00      0.00      0.00         3
        spam       0.00      0.00      0.00         3
     finance       0.89      0.91      0.90       100
   important       0.79      0.86      0.82       115
    personal       0.87      0.84      0.85        73
 promotional       0.80      0.89      0.84       103
      social       0.75      0.77      0.76       103
        spam       0.92      0.85      0.88        89

    accuracy                           0.83       601
   macro avg       0.50      0.45      0.46       601
weighted avg       0.81      0.83      0.82       601



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
