<a href="https://colab.research.google.com/github/bhavanisankardavuluri10/bhavanisankardavuluri10/blob/main/spam_advance.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
import pandas as pd

# Load the dataset
data = pd.read_csv('/content/SMSSpamCollection', sep='\t', header=None, names=['label', 'message'])

# Display dataset overview
print("🟢 Dataset Preview:")
print(data.head())

print("\n🟠 Dataset Info:")
print(data.info())

print("\n🔵 Value Counts:")
print(data['label'].value_counts())


🟢 Dataset Preview:
  label                                            message
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...

🟠 Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   label    5572 non-null   object
 1   message  5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB
None

🔵 Value Counts:
label
ham     4825
spam     747
Name: count, dtype: int64


In [7]:
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# Text cleaning function
def clean_text(text):
    text = text.lower()  # Lowercase
    text = re.sub(r'\W', ' ', text)  # Remove special characters
    text = re.sub(r'\d+', '', text)  # Remove digits
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces
    return text

# Clean the messages
data['message'] = data['message'].apply(clean_text)

# Encode labels (ham = 0, spam = 1)
data['label'] = data['label'].map({'ham': 0, 'spam': 1})

# Vectorization using TF-IDF
vectorizer = TfidfVectorizer(max_features=3000)
X = vectorizer.fit_transform(data['message']).toarray()
y = data['label']

# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import ComplementNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

# Initialize models with improved settings
log_reg_model = LogisticRegression(max_iter=2000, C=1.0)
naive_bayes_model = ComplementNB()
svm_model = SVC(C=1.0, kernel='linear', probability=True)
random_forest_model = RandomForestClassifier(n_estimators=200, max_depth=30, random_state=42)

# Train each model
log_reg_model.fit(X_train, y_train)
naive_bayes_model.fit(X_train, y_train)
svm_model.fit(X_train, y_train)
random_forest_model.fit(X_train, y_train)


In [9]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

def evaluate_model(model, model_name):
    predictions = model.predict(X_test)
    print(f"\n📊 Evaluation for {model_name}")
    print("Accuracy:", accuracy_score(y_test, predictions))
    print("Precision:", precision_score(y_test, predictions))
    print("Recall:", recall_score(y_test, predictions))
    print("F1 Score:", f1_score(y_test, predictions))
    print("Confusion Matrix:\n", confusion_matrix(y_test, predictions))
    print("-" * 50)

# Evaluate all models
evaluate_model(log_reg_model, "Logistic Regression")
evaluate_model(naive_bayes_model, "Naive Bayes")
evaluate_model(svm_model, "SVM")
evaluate_model(random_forest_model, "Random Forest")



📊 Evaluation for Logistic Regression
Accuracy: 0.972488038277512
Precision: 0.989010989010989
Recall: 0.8035714285714286
F1 Score: 0.8866995073891626
Confusion Matrix:
 [[1446    2]
 [  44  180]]
--------------------------------------------------

📊 Evaluation for Naive Bayes
Accuracy: 0.9581339712918661
Precision: 0.7789855072463768
Recall: 0.9598214285714286
F1 Score: 0.86
Confusion Matrix:
 [[1387   61]
 [   9  215]]
--------------------------------------------------

📊 Evaluation for SVM
Accuracy: 0.9898325358851675
Precision: 0.9952153110047847
Recall: 0.9285714285714286
F1 Score: 0.9607390300230947
Confusion Matrix:
 [[1447    1]
 [  16  208]]
--------------------------------------------------

📊 Evaluation for Random Forest
Accuracy: 0.9742822966507177
Precision: 1.0
Recall: 0.8080357142857143
F1 Score: 0.8938271604938272
Confusion Matrix:
 [[1448    0]
 [  43  181]]
--------------------------------------------------


In [10]:
from sklearn.metrics import precision_recall_curve

def threshold_tuning(model, X_test, y_test):
    probabilities = model.predict_proba(X_test)[:, 1]
    precision, recall, thresholds = precision_recall_curve(y_test, probabilities)

    # Find the best threshold for balanced precision and recall
    best_threshold_index = (precision + recall).argmax()
    best_threshold = thresholds[best_threshold_index]

    print(f"Best Threshold for {model}: {best_threshold:.2f}")
    return best_threshold

# Run threshold tuning for each model
best_threshold_log_reg = threshold_tuning(log_reg_model, X_test, y_test)
best_threshold_nb = threshold_tuning(naive_bayes_model, X_test, y_test)
best_threshold_svm = threshold_tuning(svm_model, X_test, y_test)
best_threshold_rf = threshold_tuning(random_forest_model, X_test, y_test)


Best Threshold for LogisticRegression(max_iter=2000): 0.32
Best Threshold for ComplementNB(): 0.78
Best Threshold for SVC(kernel='linear', probability=True): 0.66
Best Threshold for RandomForestClassifier(max_depth=30, n_estimators=200, random_state=42): 0.25


In [11]:
import joblib

# Save models with their best thresholds
joblib.dump((log_reg_model, best_threshold_log_reg), 'logistic_regression.pkl')
joblib.dump((naive_bayes_model, best_threshold_nb), 'naive_bayes.pkl')
joblib.dump((svm_model, best_threshold_svm), 'svm.pkl')
joblib.dump((random_forest_model, best_threshold_rf), 'random_forest.pkl')

# Save the improved vectorizer
joblib.dump(vectorizer, 'vectorizer.pkl')

print("✅ Models with optimized thresholds saved successfully!")


✅ Models with optimized thresholds saved successfully!


In [14]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# Retrain models with class balancing
log_reg_model = LogisticRegression(max_iter=2000, C=1.0, class_weight='balanced')
random_forest_model = RandomForestClassifier(n_estimators=200,
                                             max_depth=30,
                                             random_state=42,
                                             class_weight='balanced')

log_reg_model.fit(X_train, y_train)
random_forest_model.fit(X_train, y_train)


In [12]:
best_threshold_log_reg = threshold_tuning(log_reg_model, X_test, y_test)
best_threshold_rf = threshold_tuning(random_forest_model, X_test, y_test)


Best Threshold for LogisticRegression(max_iter=2000): 0.32
Best Threshold for RandomForestClassifier(max_depth=30, n_estimators=200, random_state=42): 0.25


In [13]:
import joblib
joblib.dump((log_reg_model, best_threshold_log_reg), 'logistic_regression.pkl')
joblib.dump((random_forest_model, best_threshold_rf), 'random_forest.pkl')
print("✅ Improved models saved successfully!")


✅ Improved models saved successfully!
