<a href="https://colab.research.google.com/github/arunrajanp/ML-Projects/blob/main/Email_Spam_Detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [32]:
import pandas as pd
import numpy as np

In [33]:
path = "/content/drive/MyDrive/Data Science/Used Case/2. Classification /2. Email Spam Detection/spam.csv"

In [34]:
data = pd.read_csv(path)

In [35]:
data.nunique()

Category       2
Message     5157
dtype: int64

In [36]:
data['Category'].unique()

array(['ham', 'spam'], dtype=object)

In [37]:
data.dtypes

Category    object
Message     object
dtype: object

In [38]:
data.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


# Encoding the column

In [39]:
data['Category'] = data['Category'].map({'spam' : 0, 'ham' : 1})


In [46]:
data['Category'].value_counts()

Category
1    4825
0     747
Name: count, dtype: int64

# Traing Data

In [47]:
from sklearn.model_selection import train_test_split
X = data['Message']
y = data['Category']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2 , random_state = 42)

# Feature extraction using TfidfVectorizer

In [51]:
# Feature extraction using TfidfVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(stop_words='english')
X_train_features = vectorizer.fit_transform(X_train)
X_test_features = vectorizer.transform(X_test)

# Training the logistic regression model

In [52]:

from sklearn.linear_model import LogisticRegression
model = LogisticRegression()

In [53]:
model.fit(X_train_features,y_train)

# Evaluating the model

In [55]:

train_predictions = model.predict(X_train_features)
test_predictions = model.predict(X_test_features)

In [57]:
from sklearn.metrics import accuracy_score

train_accuracy = accuracy_score(y_train, train_predictions)
test_accuracy = accuracy_score(y_test, test_predictions)

In [58]:
print(f'Accuracy on training data: {train_accuracy * 100:.2f}%')
print(f'Accuracy on test data: {test_accuracy * 100:.2f}%')

Accuracy on training data: 96.61%
Accuracy on test data: 96.77%


In [60]:
def predict_email_spam(email):
    email_features = vectorizer.transform([email])
    prediction = model.predict(email_features)
    return 'Ham' if prediction[0] == 1 else 'Spam'

In [66]:
new_email = "Congratulations! You've won a free ticket. Call now!"
print(f'The email is classified as: {predict_email_spam(new_email)}')

The email is classified as: Ham
