# importing libraries

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
import math

# Naïve Bayes classify function


In [2]:
def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    """Classify one document with a trained two-class Naive Bayes model.

    Each class score is the class log-prior plus the count-weighted sum of
    that class's per-word log-probabilities; the higher score wins.

    Parameters
    ----------
    vec2Classify : array-like of shape (n_words,)
        Word counts of the document to classify.
    p0Vec, p1Vec : array-like of shape (n_words,)
        Per-word log-probabilities for class 0 and class 1.
    pClass1 : float
        Prior probability of class 1, in ``[0, 1]``.

    Returns
    -------
    int
        ``1`` when class 1 scores strictly higher, otherwise ``0``
        (ties go to class 0).

    Raises
    ------
    ValueError
        If ``pClass1`` is not a probability in ``[0, 1]``.
    """
    if not 0.0 <= pClass1 <= 1.0:
        raise ValueError("pClass1 must be a probability in [0, 1]")
    # A degenerate prior leaves only one possible class. Answer directly
    # instead of evaluating log(0), which math.log rejects.
    if pClass1 == 0.0:
        return 0
    if pClass1 == 1.0:
        return 1

    # Coerce to arrays so plain lists work too, then let np.dot do the
    # weighted sum in one vectorised call rather than a Python-level sum().
    counts = np.asarray(vec2Classify, dtype=float)
    p1 = np.dot(counts, np.asarray(p1Vec, dtype=float)) + math.log(pClass1)
    p0 = np.dot(counts, np.asarray(p0Vec, dtype=float)) + math.log(1.0 - pClass1)
    return 1 if p1 > p0 else 0

# Naïve Bayes classifier training function


In [3]:
def trainNB0(trainMatrix, trainCategory):
    """Estimate multinomial Naive Bayes parameters for a two-class problem.

    Parameters
    ----------
    trainMatrix : array-like of shape (n_docs, n_words)
        Dense word-count matrix, one row per training document.
    trainCategory : array-like of shape (n_docs,)
        Class label per document. Rows labelled ``1`` are counted as spam;
        every other row is counted as non-spam.

    Returns
    -------
    p0Vect : numpy.ndarray of shape (n_words,)
        Laplace-smoothed log-probability of each word in non-spam documents.
    p1Vect : numpy.ndarray of shape (n_words,)
        Laplace-smoothed log-probability of each word in spam documents.
    pAbusive : float
        Fraction of training documents labelled ``1`` (the spam prior).

    Raises
    ------
    ValueError
        If ``trainMatrix`` is not a non-empty 2-D array, or the number of
        labels does not match the number of rows.
    """
    counts = np.asarray(trainMatrix)
    labels = np.asarray(trainCategory)
    if counts.ndim != 2 or counts.shape[0] == 0:
        raise ValueError("trainMatrix must be a non-empty 2-D array of word counts")
    numTrainDocs, numWords = counts.shape
    if labels.shape != (numTrainDocs,):
        raise ValueError("trainCategory must hold exactly one label per row of trainMatrix")

    # One boolean mask drives both the prior and the per-class counts, so the
    # two can never disagree (summing the raw labels only equals the spam
    # count when the labels are strictly 0/1).
    isSpam = labels == 1
    pAbusive = float(np.count_nonzero(isSpam)) / numTrainDocs

    # Column sums replace the per-document Python loop.
    totalNum = counts.sum(axis=0)
    p1Num = counts[isSpam].sum(axis=0)   # per-word counts over spam rows
    p0Num = totalNum - p1Num             # remainder belongs to non-spam rows
    p1Denom = p1Num.sum()                # total words seen in spam
    p0Denom = p0Num.sum()                # total words seen in non-spam

    # Laplace (add-one) smoothing, then log space so that classification can
    # add scores instead of multiplying tiny probabilities.
    p1Vect = np.log((p1Num + 1) / (p1Denom + numWords))
    p0Vect = np.log((p0Num + 1) / (p0Denom + numWords))

    return p0Vect, p1Vect, pAbusive


In [4]:
# Read spam.csv from the current working directory into a DataFrame.
df = pd.read_csv('spam.csv')
# Bare last expression: renders the frame as this cell's output.
df

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [5]:
# Inspect the column labels of the loaded frame.
df.columns

Index(['Category', 'Message'], dtype='object')

In [6]:
# Encode the label as an integer flag: 1 where Category is exactly 'spam', else 0.
df['Spam'] = df['Category'].eq('spam').astype('int64')
df.head(5)

Unnamed: 0,Category,Message,Spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


# CountVectorizer: convert the text into a matrix of token counts

In [7]:
# Features are the raw message text; the target is the 0/1 Spam flag.
X = df['Message']
y= df['Spam']
# Learn the vocabulary from the messages and build the document-term count matrix.
# NOTE(review): the vocabulary is fitted on all messages before the train/test
# split, so test-set words influence the feature space — confirm this is intended.
vectorizer = CountVectorizer()
X_vectorized = vectorizer.fit_transform(X)
# Dense view for display only; X_vectorized itself is left unchanged.
X_vectorized.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

# Split the data into train and test sets

In [8]:
# Hold out 25% of the messages for evaluation. A fixed random_state makes the
# shuffle, and therefore every metric computed from this split, reproducible
# across kernel restarts; without it each run produces a different split.
X_train, X_test, y_train, y_test = train_test_split(
    X_vectorized, y, test_size=0.25, random_state=42
)
X_train.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

# Train the model

In [9]:
# Fit the Naive Bayes parameters on the training split. trainNB0 works on a
# dense 2-D count array and a positional label array, hence .toarray() and
# .to_numpy().
p0Vect, p1Vect, pAbusive = trainNB0(X_train.toarray(), y_train.to_numpy())

- p0Vect and p1Vect hold the per-word log-probabilities used to score how likely each word is to appear in spam and non-spam emails.
    - p1Vect is log P(word | spam)
    - p0Vect is log P(word | non-spam)
- "pAbusive" is the prior probability of spam, P(spam)

# Classify new emails

## function to classify new emails

In [10]:
def classify_email(email):
    """Label a single raw message as "Spam" or "Not Spam".

    The text is encoded with the already-fitted module-level ``vectorizer``
    and scored by ``classifyNB`` using the trained ``p0Vect``, ``p1Vect`` and
    ``pAbusive``.
    """
    # transform() expects an iterable of documents; take the single resulting row.
    counts = vectorizer.transform([email]).toarray()[0]
    if classifyNB(counts, p0Vect, p1Vect, pAbusive) == 1:
        return "Spam"
    return "Not Spam"

## Example

In [11]:
# Sample message with promotional wording, run through the trained classifier.
new_email = "Claim your free gift now! Don't miss out!"
print(f'The email: "{new_email}" is classified as: {classify_email(new_email)}')

The email: "Claim your free gift now! Don't miss out!" is classified as: Spam


In [12]:
# Sample conversational message, run through the trained classifier.
new_email = "hi dear friend, how are you?"
print(f'The email: "{new_email}" is classified as: {classify_email(new_email)}')

The email: "hi dear friend, how are you?" is classified as: Not Spam


# Error Rate

## Error rate function

In [13]:
def error_rate(y_true, y_pred):
    """Return the fraction of predictions that differ from the true labels.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels (list, NumPy array, pandas Series, ...).
    y_pred : array-like
        Predicted labels, in the same order and of the same length as
        ``y_true``.

    Returns
    -------
    float
        Number of mismatches divided by the number of predictions.

    Raises
    ------
    ValueError
        If the inputs have different shapes or are empty.
    """
    # Convert both sides to arrays so the comparison is always element-wise.
    # Comparing two plain lists with != yields a single bool, which cannot be
    # summed.
    truth = np.asarray(y_true)
    predicted = np.asarray(y_pred)
    if truth.shape != predicted.shape:
        raise ValueError("y_true and y_pred must have the same length")

    total = truth.size
    if total == 0:
        raise ValueError("cannot compute an error rate for empty input")

    incorrect = int(np.count_nonzero(truth != predicted))
    return incorrect / total




## Example:

In [14]:
# Classify every held-out message, one dense row at a time, with the trained parameters.
y_pred = [classifyNB(email, p0Vect, p1Vect, pAbusive) for email in X_test.toarray()]  # Predicted labels
# Compare against the true test labels and report the misclassification rate as a percentage.
error = error_rate(y_test, y_pred)  # Error rate
print(f'Error Rate: {error * 100:.2f}%')

Error Rate: 2.15%


In [15]:
# Accuracy is the complement of the error rate, shown here as a percentage.
print(f'porcentage to get the correct classification is : {(1-error) *100:2.2f}')

porcentage to get the correct classification is : 97.85
