# Part-I: Naive Bayes Classifier 

# Task 1: Theory Questions 

Answer in 2–4 sentences: 
1. What is the core assumption of Naive Bayes? 
* The core assumption of Naive Bayes is that each feature contributes independently to the probability of a certain outcome.

2. Differentiate between GaussianNB, MultinomialNB, and BernoulliNB. 
* GaussianNB: Used for continuous data, assumes features follow a normal distribution.
* MultinomialNB: Suitable for discrete data like word counts in text classification.
* BernoulliNB: Used for binary/boolean features where each feature is either present or absent.

3. Why is Naive Bayes considered suitable for high-dimensional data? 
* Because it assumes feature independence and requires fewer parameters to estimate which makes it computationally efficient and less prone to overfitting even suitable for high-dimensional data.

In [152]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix

# Task 2: Spam Detection using MultinomialNB

In [None]:
data = pd.read_csv('emails2.csv')

In [154]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    1000 non-null   object
 1   spam    1000 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 15.8+ KB


In [155]:
data.shape

(1000, 2)

In [156]:
data

Unnamed: 0,text,spam
0,Subject: fwd : update return - path : receiv...,0
1,Subject: re : energy derivatives conference - ...,0
2,Subject: thanks dear mr . kaminski : thank y...,0
3,Subject: re : visual numerics cnl licensing is...,0
4,Subject: promote your business the power of e...,1
...,...,...
995,"Subject: moore medz hello , welcome to medzon...",1
996,Subject: more then 70 great pornstars sex movl...,1
997,Subject: re : possible rtp conference i look ...,0
998,"Subject: get big , ripped & strong ! ! deca , ...",1


In [157]:
X = data['text']
y = data['spam']

In [158]:
tfd = TfidfVectorizer()
Xvector = tfd.fit_transform(X)

In [159]:
X_train, X_test, y_train, y_test = train_test_split(Xvector,y,test_size=0.2,random_state=42)

In [160]:
clf = MultinomialNB()
clf.fit(X_train, y_train)

0,1,2
,alpha,1.0
,force_alpha,True
,fit_prior,True
,class_prior,


In [161]:
y_pred = clf.predict(X_test)

In [162]:
print("Accuracy:        ", accuracy_score(y_test, y_pred))
print("Precision:       ", precision_score(y_test, y_pred))
print("Recall:          ", recall_score(y_test, y_pred))
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))


Accuracy:         0.995
Precision:        1.0
Recall:           0.99
Confusion matrix:
 [[100   0]
 [  1  99]]


# Task 3: GaussianNB with Iris or Wine Dataset 

In [163]:
from sklearn.datasets import load_iris
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

In [164]:
df = load_iris()
X = df.data
y = df.target

In [165]:
df

{'data': array([[5.1, 3.5, 1.4, 0.2],
        [4.9, 3. , 1.4, 0.2],
        [4.7, 3.2, 1.3, 0.2],
        [4.6, 3.1, 1.5, 0.2],
        [5. , 3.6, 1.4, 0.2],
        [5.4, 3.9, 1.7, 0.4],
        [4.6, 3.4, 1.4, 0.3],
        [5. , 3.4, 1.5, 0.2],
        [4.4, 2.9, 1.4, 0.2],
        [4.9, 3.1, 1.5, 0.1],
        [5.4, 3.7, 1.5, 0.2],
        [4.8, 3.4, 1.6, 0.2],
        [4.8, 3. , 1.4, 0.1],
        [4.3, 3. , 1.1, 0.1],
        [5.8, 4. , 1.2, 0.2],
        [5.7, 4.4, 1.5, 0.4],
        [5.4, 3.9, 1.3, 0.4],
        [5.1, 3.5, 1.4, 0.3],
        [5.7, 3.8, 1.7, 0.3],
        [5.1, 3.8, 1.5, 0.3],
        [5.4, 3.4, 1.7, 0.2],
        [5.1, 3.7, 1.5, 0.4],
        [4.6, 3.6, 1. , 0.2],
        [5.1, 3.3, 1.7, 0.5],
        [4.8, 3.4, 1.9, 0.2],
        [5. , 3. , 1.6, 0.2],
        [5. , 3.4, 1.6, 0.4],
        [5.2, 3.5, 1.5, 0.2],
        [5.2, 3.4, 1.4, 0.2],
        [4.7, 3.2, 1.6, 0.2],
        [4.8, 3.1, 1.6, 0.2],
        [5.4, 3.4, 1.5, 0.4],
        [5.2, 4.1, 1.5, 0.1],
  

In [166]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42)

GaussianNB classifier

In [167]:
gnb = GaussianNB()
gnb.fit(X_train, y_train)

0,1,2
,priors,
,var_smoothing,1e-09


In [168]:
y_pred_gnb = gnb.predict(X_test)
print("Accuracy:        ", accuracy_score(y_test, y_pred_gnb))
print("Precision:       ", precision_score(y_test, y_pred_gnb, average='macro'))
print("Recall:          ", recall_score(y_test, y_pred_gnb, average='macro'))
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred_gnb))


Accuracy:         1.0
Precision:        1.0
Recall:           1.0
Confusion matrix:
 [[10  0  0]
 [ 0  9  0]
 [ 0  0 11]]


In [169]:
print(classification_report(y_test, y_pred_gnb))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       1.00      1.00      1.00         9
           2       1.00      1.00      1.00        11

    accuracy                           1.00        30
   macro avg       1.00      1.00      1.00        30
weighted avg       1.00      1.00      1.00        30



Logistic Regression

In [170]:
lr = LogisticRegression(max_iter=100)
lr.fit(X_train, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100


In [171]:
y_pred_lr = lr.predict(X_test)


In [172]:
print("Accuracy:        ", accuracy_score(y_test, y_pred_lr))
print("Precision:       ", precision_score(y_test, y_pred_lr, average='macro'))
print("Recall:          ", recall_score(y_test, y_pred_lr, average='macro'))
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred_lr))


Accuracy:         1.0
Precision:        1.0
Recall:           1.0
Confusion matrix:
 [[10  0  0]
 [ 0  9  0]
 [ 0  0 11]]


In [173]:
print(classification_report(y_test, y_pred_lr))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       1.00      1.00      1.00         9
           2       1.00      1.00      1.00        11

    accuracy                           1.00        30
   macro avg       1.00      1.00      1.00        30
weighted avg       1.00      1.00      1.00        30

