# 🔍 Spam Detection: Advanced Feature Visualization

## Exploring Model Parameters and Token Analysis

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

# Seed NumPy's global random number generator so that any code relying on it
# produces the same values on every run.
np.random.seed(42)

## 📊 Dataset Preparation

In [None]:
# Sample email dataset: every entry pairs a raw email text with its class
# label, so a message and its label cannot drift apart when the data is edited.
_labeled_emails = [
    ("Urgent! You've won a free iPhone. Click here now!", 'spam'),
    ("Meeting scheduled for project review next week", 'not_spam'),
    ("Get rich quick with this amazing investment opportunity!", 'spam'),
    ("Quarterly team performance report attached", 'not_spam'),
    ("Limited time offer: Massive discount on luxury watches!", 'spam'),
    ("Client proposal for Q3 marketing strategy", 'not_spam'),
    ("Congratulations! You're selected for a free cruise!", 'spam'),
    ("Weekly team sync-up agenda and discussion points", 'not_spam'),
]

# Parallel lists: emails[i] is labelled labels[i]
emails = [body for body, _ in _labeled_emails]
labels = [tag for _, tag in _labeled_emails]

# Preprocess emails
def preprocess_text(text):
    """Return *text* lowercased with all non-letter, non-whitespace characters removed.

    Digits, punctuation and any other character outside ``a-z`` / whitespace
    are deleted, not replaced by a space, so "sync-up" becomes "syncup" and
    "you've" becomes "youve". Whitespace is left untouched.
    """
    return re.sub(r'[^a-zA-Z\s]', '', text.lower())

# Cleaned copy of every email, in the same order as `emails`
processed_emails = list(map(preprocess_text, emails))

## 🧠 Token and Feature Extraction

In [None]:
# Bag-of-words model: learn the vocabulary of the cleaned emails and count
# how often each token occurs in each email
vectorizer = CountVectorizer()
X_vectorized = vectorizer.fit_transform(processed_emails)

# Vocabulary tokens, in the same order as the columns of the count matrix
feature_names = vectorizer.get_feature_names_out()

# Dense copy of the sparse count matrix
feature_matrix = X_vectorized.toarray()

# Labelled view for display: one row per email, one column per token
row_labels = [f'Email {number}' for number in range(1, len(emails) + 1)]
feature_df = pd.DataFrame(feature_matrix, index=row_labels, columns=feature_names)
print("Feature Matrix:")
print(feature_df)

## 🌈 Token Frequency Visualization

In [None]:
# Total number of occurrences of each token, summed over all emails
token_frequencies = feature_df.sum()

# Bar chart of the totals, most frequent token first
ranked_frequencies = token_frequencies.sort_values(ascending=False)
fig, ax = plt.subplots(figsize=(15, 6))
ranked_frequencies.plot(kind='bar', ax=ax)
ax.set_title('Token Frequencies Across Emails')
ax.set_xlabel('Tokens')
ax.set_ylabel('Frequency')
# Slant the token names so neighbouring labels do not overlap
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

## 🎯 Spam vs Not Spam Token Analysis

In [None]:
# Split the cleaned emails by class label in a single pass
spam_emails = []
not_spam_emails = []
for text, tag in zip(processed_emails, labels):
    if tag == 'spam':
        spam_emails.append(text)
    elif tag == 'not_spam':
        not_spam_emails.append(text)

# Each class gets its own vectorizer, and therefore its own vocabulary
spam_vectorizer, not_spam_vectorizer = CountVectorizer(), CountVectorizer()
spam_matrix = spam_vectorizer.fit_transform(spam_emails)
not_spam_matrix = not_spam_vectorizer.fit_transform(not_spam_emails)

# Vocabulary of each class
spam_features = spam_vectorizer.get_feature_names_out()
not_spam_features = not_spam_vectorizer.get_feature_names_out()

# Per-class token totals (column sums of the count matrices)
spam_token_freq = pd.Series(spam_matrix.toarray().sum(axis=0), index=spam_features)
not_spam_token_freq = pd.Series(not_spam_matrix.toarray().sum(axis=0), index=not_spam_features)

# Side-by-side bar charts of the ten most frequent tokens of each class
plt.figure(figsize=(15, 6))
panels = [
    (spam_token_freq, 'red', 'Top 10 Spam Tokens'),
    (not_spam_token_freq, 'green', 'Top 10 Not Spam Tokens'),
]
for position, (frequencies, bar_color, panel_title) in enumerate(panels, start=1):
    plt.subplot(1, 2, position)
    frequencies.sort_values(ascending=False).head(10).plot(kind='bar', color=bar_color)
    plt.title(panel_title)
    plt.xticks(rotation=45, ha='right')

plt.tight_layout()
plt.show()

## 🔬 Probabilistic Feature Heatmap

In [None]:
# Train a Multinomial Naive Bayes classifier on the bag-of-words counts
classifier = MultinomialNB()
classifier.fit(X_vectorized, labels)

# Per-class log probability of every token: one row per class, one column
# per token. Row i belongs to classifier.classes_[i].
feature_log_prob = classifier.feature_log_prob_

# scikit-learn stores the classes in sorted order ('not_spam' before 'spam'),
# so the row labels are derived from classifier.classes_ instead of being
# hard-coded; a fixed ['Spam', 'Not Spam'] list labels the rows the wrong
# way round.
class_display_names = {'spam': 'Spam', 'not_spam': 'Not Spam'}
class_tick_labels = [
    class_display_names.get(class_name, str(class_name))
    for class_name in classifier.classes_
]

# Create heatmap of feature log probabilities
plt.figure(figsize=(15, 8))
sns.heatmap(
    feature_log_prob,
    annot=True,
    cmap='coolwarm',
    xticklabels=feature_names,
    yticklabels=class_tick_labels,
)
plt.title('Feature Log Probabilities: Spam vs Not Spam')
plt.xlabel('Tokens')
plt.ylabel('Class')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

## 🌟 Key Insights

### Token Analysis Findings
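- Identified the tokens that occur most often in spam and in legitimate emails
- Visualized overall and per-class token frequencies as bar charts
- Explored the per-class token log probabilities learned by a Multinomial Naive Bayes classifier using a heatmap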