### Naive Bayes
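Naive Bayes is a classification algorithm based on Bayes' theorem. It is called "naive" because it assumes that the features are conditionally independent of one another given the class, so the score of a class is simply its prior multiplied by the per-feature likelihoods: $P(c \mid x_1, \dots, x_n) \propto P(c) \prod_i P(x_i \mid c)$. It is widely used in spam detection, sentiment analysis, and document classification due to its simplicity and effectiveness.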

### Binary Classification Example

In [5]:
from collections import Counter, defaultdict

# Minimal Naive Bayes example for binary classification

# Training data: each sample is (list of feature words, label).
# The feature words are 'rainy' / 'sunny'; the labels are 'yes' / 'no'.
data = [
    (['rainy'], 'yes'),
    (['sunny'], 'no'),
    (['rainy'], 'yes'),
    (['sunny'], 'no'),
    (['rainy'], 'yes')
]

# Calculate priors and likelihoods

labels = [label for _, label in data]

# Count every label once and derive the priors from those counts.
# Iterating the Counter (first-seen order) instead of set(labels) keeps the
# key order of `priors` identical from run to run; set order of strings
# depends on hash randomization, which made tie-breaking in predict() vary.
label_counts = Counter(labels)
priors = {label: count / len(labels) for label, count in label_counts.items()}

# word_counts[label][word] -> how often `word` occurs in samples of `label`
word_counts = defaultdict(lambda: defaultdict(int))

for features, label in data:
    for word in features:
        word_counts[label][word] += 1

def predict(features):
    """Return the most probable label for one sample.

    Every label in the module-level ``priors`` is scored as its prior
    multiplied by a smoothed likelihood for each word in ``features``; the
    likelihoods come from the module-level ``word_counts`` and
    ``label_counts``.

    Args:
        features: Iterable of feature words describing the sample.

    Returns:
        The label with the highest score. On a tie, the label that comes
        first in ``priors`` wins.
    """
    probs = {}
    for label in priors:
        # Read-only lookups: word_counts is a nested defaultdict, so indexing
        # it with an unseen label/word would insert zero-count entries into
        # the trained model as a side effect of predicting.
        counts = word_counts.get(label, {})
        prob = priors[label]
        for word in features:
            # Laplace (add-one) smoothing: the numerator is never 0, so an
            # unseen word cannot zero out the whole product.
            # NOTE(review): the 2 presumably stands for the number of distinct
            # feature values ('rainy'/'sunny') - confirm before adding more.
            prob *= (counts.get(word, 0) + 1) / (label_counts[label] + 2)
        probs[label] = prob
    return max(probs, key=probs.get)

# Smoke test: one prediction per weather word.
# Prints yes for 'rainy' and no for 'sunny'.
for sample_features in (['rainy'], ['sunny']):
    print(predict(sample_features))

yes
no


### Email Classifier Example

In [1]:
# Hand-specified model for two classes: spam vs. not spam.
# Each class carries a prior and the per-word probabilities that
# classify_email multiplies in as likelihoods.
classes = dict(
    spam=dict(
        prior=0.3,
        word_probs=dict(free=0.6, money=0.5, urgent=0.4, meeting=0.1),
    ),
    not_spam=dict(
        prior=0.7,
        word_probs=dict(free=0.1, money=0.05, urgent=0.02, meeting=0.3),
    ),
)

In [3]:
def classify_email(words, classes):
    """Classify an email from its words using Naive Bayes.

    For every class the score is the class prior multiplied by the
    probability of each word in ``words`` (the naive independence
    assumption). Words that have no entry in a class's ``word_probs`` are
    skipped for that class. Scores are accumulated as sums of logarithms so
    that long emails do not underflow to 0.0 for every class.

    Args:
        words: Iterable of words in the email.
        classes: Mapping of class name to a dict with the keys ``'prior'``
            (a number) and ``'word_probs'`` (mapping of word to probability).

    Returns:
        Dict mapping each class name (in the order of ``classes``) to its
        normalized posterior probability. The values sum to 1, except when
        every class has a non-positive prior or word probability, in which
        case every value is 0.0.
    """
    import math  # function-scope import keeps this cell runnable on its own

    impossible = float('-inf')
    words = list(words)  # iterated once per class, so materialize iterators

    log_scores = {}
    for class_name, class_info in classes.items():
        word_probs = class_info['word_probs']

        # Prior first, then one likelihood per known word
        factors = [class_info['prior']]
        factors.extend(word_probs[word] for word in words if word in word_probs)

        if all(factor > 0 for factor in factors):
            log_scores[class_name] = sum(math.log(factor) for factor in factors)
        else:
            # A zero factor makes the whole product zero; log(0) is undefined
            log_scores[class_name] = impossible

    best = max(log_scores.values(), default=impossible)
    if best == impossible:
        # No class can explain the email (or there are no classes at all)
        return {class_name: 0.0 for class_name in log_scores}

    # Shift by the best score before exponentiating: the top class gets
    # weight exp(0) == 1, so the normalizer can never underflow to zero.
    weights = {
        class_name: math.exp(score - best)
        for class_name, score in log_scores.items()
    }
    total = sum(weights.values())
    return {class_name: weight / total for class_name, weight in weights.items()}

In [4]:
# Sample emails, each represented as the list of its words
test_emails = [
    ['free', 'money'],           # Likely spam
    ['meeting', 'urgent'],       # Mixed signals
    ['free', 'meeting']          # Could go either way
]

# Report header: title line followed by a 50-character rule
print("Naive Bayes Email Classification:", "=" * 50, sep="\n")

for i, email_words in enumerate(test_emails, start=1):
    probabilities = classify_email(email_words, classes)
    print(f"\nEmail {i}: {email_words}")

    # One line per class: 4-decimal probability and the same value as a percentage
    for class_name in probabilities:
        prob = probabilities[class_name]
        print(f"  P({class_name}) = {prob:.4f} ({prob:.1%})")

    # The predicted class is the one with the largest probability
    prediction = max(probabilities, key=lambda name: probabilities[name])
    print(f"  Prediction: {prediction.upper()}")

Naive Bayes Email Classification:

Email 1: ['free', 'money']
  P(spam) = 0.9626 (96.3%)
  P(not_spam) = 0.0374 (3.7%)
  Prediction: SPAM

Email 2: ['meeting', 'urgent']
  P(spam) = 0.7407 (74.1%)
  P(not_spam) = 0.2593 (25.9%)
  Prediction: SPAM

Email 3: ['free', 'meeting']
  P(spam) = 0.4615 (46.2%)
  P(not_spam) = 0.5385 (53.8%)
  Prediction: NOT_SPAM
