#Naïve bayes_assignment_2

In [None]:
# Difference between Bernoulli Naive Bayes and Multinomial Naive Bayes
# Bernoulli Naive Bayes: Assumes binary features (0 or 1) indicating whether a particular term occurs or not in the document. It is suitable for discrete data and typically used for document classification tasks where the presence or absence of a feature matters.

# Multinomial Naive Bayes: Deals with discrete counts, where features represent occurrences of events (e.g., word counts in text classification).
# It is appropriate when the frequency of occurrences matters rather than just their presence or absence.

In [None]:
# How Bernoulli Naive Bayes handles missing values
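# Bernoulli Naive Bayes has no built-in handling of missing values. scikit-learn's BernoulliNB validates its input and raises a ValueError if the feature matrix contains NaNs, so missing values must be imputed (or the affected rows dropped) before fitting. For presence/absence features, a common choice is to impute a missing value as 0, i.e. to treat it as the absence of the feature in the document.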

In [None]:
# Can Gaussian Naive Bayes be used for multi-class classification?
# Yes, Gaussian Naive Bayes can be used for multi-class classification. It assumes that, within each class, every feature follows a Gaussian distribution; it estimates a mean and variance per feature for each class and uses the Gaussian probability density function to compute the likelihood of a sample under each class, predicting the class with the highest posterior probability. It is suitable when features are continuous and works for problems with more than two classes.

In [4]:
# Q5. Assignment: Implementing Naive Bayes Classifiers
# Steps to Implement:
# Data Preparation:

# Download the "Spambase Data Set" from the UCI Machine Learning Repository.
# Load and preprocess the dataset as needed.
# Implement Naive Bayes Classifiers:

# Implement Bernoulli Naive Bayes, Multinomial Naive Bayes, and Gaussian Naive Bayes using scikit-learn.
# Evaluation:

# Use 10-fold cross-validation to evaluate each classifier's performance.
# Calculate and report the following metrics: Accuracy, Precision, Recall, F1-score.
# Discussion:

# Discuss the results obtained from each classifier.
# Analyze which variant of Naive Bayes performed the best and provide reasons for your observation.
# Identify any limitations or challenges observed with Naive Bayes classifiers.
# Conclusion:

# Summarize the findings from your evaluation.
# Provide suggestions for future work or improvements.

import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.naive_bayes import BernoulliNB, MultinomialNB, GaussianNB

# Load the Spambase dataset. The file is read with header=None, so the column
# names are supplied explicitly: 48 word-frequency columns, 6 character-frequency
# columns, 3 capital-run-length columns, and finally the 'class' column.
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/spambase/spambase.data'
columns = [f'word_freq_{i}' for i in range(48)]
columns.extend([
    'char_freq_;', 'char_freq_(', 'char_freq_[',
    'char_freq_!', 'char_freq_$', 'char_freq_#',
    'capital_run_length_average',
    'capital_run_length_longest',
    'capital_run_length_total',
    'class',
])
data = pd.read_csv(url, header=None, names=columns)

# Prepare data: 'class' is the target, every other column is a feature
y = data['class']
X = data.drop(columns='class')

# Initialize classifiers (all with scikit-learn's default parameters)
gaussian_nb = GaussianNB()
multinomial_nb = MultinomialNB()
bernoulli_nb = BernoulliNB()

# Function to evaluate a classifier using cross-validation
def evaluate_classifier(clf, X, y, cv=10):
    """Cross-validate ``clf`` on ``(X, y)`` and return its mean scores.

    All four metrics are computed in a single ``cross_validate`` run, so the
    classifier is fitted once per fold instead of once per fold *per metric*.

    Parameters
    ----------
    clf : estimator
        Unfitted scikit-learn classifier to evaluate.
    X : array-like
        Feature matrix.
    y : array-like
        Target labels. The 'precision', 'recall' and 'f1' scorers are the
        binary variants, so ``y`` is expected to hold two classes.
    cv : int or cross-validation splitter, default=10
        Forwarded unchanged to ``cross_validate``.

    Returns
    -------
    tuple
        ``(accuracy, precision, recall, f1)``, each the mean over the folds.
    """
    metric_names = ('accuracy', 'precision', 'recall', 'f1')
    scores = cross_validate(clf, X, y, cv=cv, scoring=metric_names)
    # cross_validate reports each requested scorer under the key 'test_<name>'.
    accuracy, precision, recall, f1 = (
        scores[f'test_{metric}'].mean() for metric in metric_names
    )
    return accuracy, precision, recall, f1

# Evaluate each classifier and collect its mean cross-validation scores,
# keyed by the display name used in the report below.
classifiers = {
    'Bernoulli Naive Bayes': bernoulli_nb,
    'Multinomial Naive Bayes': multinomial_nb,
    'Gaussian Naive Bayes': gaussian_nb,
}
metric_labels = ('Accuracy', 'Precision', 'Recall', 'F1-score')
results = {
    name: dict(zip(metric_labels, evaluate_classifier(clf, X, y)))
    for name, clf in classifiers.items()
}

# Print results: one heading per classifier followed by one line per metric
for clf_name, metrics in results.items():
    report = [f'{clf_name}:']
    report.extend(f' - {metric}: {value:.4f}' for metric, value in metrics.items())
    print('\n'.join(report))

# Discussion and Conclusion would follow based on the results obtained


Bernoulli Naive Bayes:
 - Accuracy: 0.8839
 - Precision: 0.8870
 - Recall: 0.8152
 - F1-score: 0.8481
Multinomial Naive Bayes:
 - Accuracy: 0.7863
 - Precision: 0.7393
 - Recall: 0.7215
 - F1-score: 0.7283
Gaussian Naive Bayes:
 - Accuracy: 0.8218
 - Precision: 0.7104
 - Recall: 0.9570
 - F1-score: 0.8131


In [None]:
# Discussion and Conclusion:
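# Discussion: Compare the performance metrics (Accuracy, Precision, Recall, F1-score) across different Naive Bayes classifiers.
# Analyze which variant performed the best and discuss possible reasons (e.g., suitability of data characteristics).
# Observed results (10-fold cross-validation, see the output above):
# - Bernoulli Naive Bayes performed best overall, with the highest accuracy (0.8839), precision (0.8870) and F1-score (0.8481). With its default settings it binarizes each feature at 0, so it only uses whether a word or character occurs at all, which appears to be a strong spam signal in this dataset.
# - Gaussian Naive Bayes had the highest recall (0.9570) but the lowest precision (0.7104): it catches almost all spam at the cost of many false positives. The features are skewed and mostly zero, so the Gaussian assumption is likely a poor fit.
# - Multinomial Naive Bayes had the lowest accuracy (0.7863) and F1-score (0.7283). The features are frequencies and run-length statistics on very different scales rather than raw counts, which likely hurts this variant.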

# Conclusion: Summarize findings from the evaluation, discuss limitations observed (e.g., assumptions of independence in Naive Bayes), and suggest areas for future research or improvement.