In [27]:
# Connecting google colab with google drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [40]:
# Importing required packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score

In [29]:
# Reading the data set as data
data = pd.read_csv('/content/drive/MyDrive/Datasets/Email_Spam_Data.csv')
data.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


#### Ham means good email, spam means bad email

In [30]:
# Check some numerical information from the data
data.groupby('Category').describe()

Unnamed: 0_level_0,Message,Message,Message,Message
Unnamed: 0_level_1,count,unique,top,freq
Category,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,4825,4516,"Sorry, I'll call later",30
spam,747,641,Please call our customer service representativ...,4


In [31]:
# Define a function to convert the 'ham' to 0 and 'spam' to 1 in a newly created 'spam' column
def spam_detect(entry):
  if entry == 'ham':
    return 0
  else:
    return 1

# Apply the function to create a new column called spam
data['spam'] = data['Category'].apply(spam_detect)
data = data.drop(['Category'], axis = 1)
data.head()

Unnamed: 0,Message,spam
0,"Go until jurong point, crazy.. Available only ...",0
1,Ok lar... Joking wif u oni...,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,U dun say so early hor... U c already then say...,0
4,"Nah I don't think he goes to usf, he lives aro...",0


In [32]:
# Separating the input(X) and outpuy(y) variables
X = data['Message']
y = data['spam']

In [20]:
# Define CountVectorizer objcet
cntv = CountVectorizer()

In [34]:
# Convering the inpus into matrix format
X_vectorized = cntv.fit_transform(X)

In [35]:
# Train test spliting the data set
X_train, X_test, y_train, y_test = train_test_split(X_vectorized, y, test_size = 0.2, random_state=42)

In [36]:
# Define the model object
model = MultinomialNB()

In [37]:
# Train the model with X_train_count and y_train
model.fit(X_train, y_train)

In [38]:
# Predicting the outcome
y_pred = model.predict(X_test)

In [39]:
# Check the accuracy score of the model
print('Accuracy Score of the model is:', accuracy_score(y_test, y_pred))

Accuracy Score of the model is: 0.9856502242152466


#### The problem with this approach is when a new email arrives, one must convert it into a matrix form then it only can be supplied to the model for predicting. This can be resolved by using a pipeline in ML

In [42]:
# Building the Complete pipeline for this model in a single step
# Name the model as pipeline

# Separate features and target
X = data['Message']
y = data['spam']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a pipeline with CountVectorizer and MultinomialNB
pipeline = Pipeline([
    ('vectorizer', CountVectorizer()),   # Step 1: Convert text to vectors
    ('nb', MultinomialNB())              # Step 2: Train a Naive Bayes model
])

# Train the model using the pipeline
pipeline.fit(X_train, y_train)

# Predict on the test set
y_pred = pipeline.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)

print(f"Accuracy: {accuracy}")

Accuracy: 0.9919282511210762
