<a href="https://colab.research.google.com/github/ayshabincy/Email-Spam-or-Harm-Detection-/blob/main/ML_spam_or_harm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Load the dataset
dataset = pd.read_csv('/content/drive/MyDrive/spam_assassin.csv')
# overview
dataset.head()

Unnamed: 0,text,target
0,From ilug-admin@linux.ie Mon Jul 29 11:28:02 2...,0
1,From gort44@excite.com Mon Jun 24 17:54:21 200...,1
2,From fork-admin@xent.com Mon Jul 29 11:39:57 2...,1
3,From dcm123@btamail.net.cn Mon Jun 24 17:49:23...,1
4,From ilug-admin@linux.ie Mon Aug 19 11:02:47 2...,0


In [2]:


# Display basic information about the dataset
dataset.info()

# Display the first few rows, specifically the 'text' and 'label' columns
dataset[['text', 'target']].head()




<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5796 entries, 0 to 5795
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    5796 non-null   object
 1   target  5796 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 90.7+ KB


Unnamed: 0,text,target
0,From ilug-admin@linux.ie Mon Jul 29 11:28:02 2...,0
1,From gort44@excite.com Mon Jun 24 17:54:21 200...,1
2,From fork-admin@xent.com Mon Jul 29 11:39:57 2...,1
3,From dcm123@btamail.net.cn Mon Jun 24 17:49:23...,1
4,From ilug-admin@linux.ie Mon Aug 19 11:02:47 2...,0


In [3]:
#The Naive Bayes model
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(dataset['text'], dataset['target'], test_size=0.2, random_state=42)

# Convert the text data into feature vectors using CountVectorizer
vectorizer = CountVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

# Train a Naive Bayes classifier
classifier = MultinomialNB()
classifier.fit(X_train_vectorized, y_train)
# Make predictions on the test set
predictions = classifier.predict(X_test_vectorized)
# Evaluate the performance of the model
accuracy = accuracy_score(y_test, predictions)
conf_matrix = confusion_matrix(y_test, predictions)
classification_rep = classification_report(y_test, predictions)

print(f'Accuracy: {accuracy}')
print('Confusion Matrix:')
print(conf_matrix)
print('Classification Report:')
print(classification_rep)

Accuracy: 0.975
Confusion Matrix:
[[779   0]
 [ 29 352]]
Classification Report:
              precision    recall  f1-score   support

           0       0.96      1.00      0.98       779
           1       1.00      0.92      0.96       381

    accuracy                           0.97      1160
   macro avg       0.98      0.96      0.97      1160
weighted avg       0.98      0.97      0.97      1160



In [4]:
# Display an example of a spam email
spam_example = dataset[dataset['target'] == 1]['text'].iloc[0]
print("Example of a Spam Email:")
print(spam_example)

# Display an example of a ham email
ham_example = dataset[dataset['target'] == 0]['text'].iloc[0]
print("\nExample of a Ham Email:")
print(ham_example)

Example of a Spam Email:
From gort44@excite.com Mon Jun 24 17:54:21 2002 Return-Path: gort44@excite.com Delivery-Date: Tue Jun 4 05:31:16 2002 Received: from mandark.labs.netnoteinc.com ([213.105.180.140]) by dogma.slashnull.org (8.11.6/8.11.6) with ESMTP id g544VFO20182 for <jm@jmason.org>; Tue, 4 Jun 2002 05:31:15 +0100 Received: from wi-poli.poli.cl ([200.54.149.34]) by mandark.labs.netnoteinc.com (8.11.2/8.11.2) with SMTP id g544VC729935; Tue, 4 Jun 2002 05:31:13 +0100 Received: from 216.77.61.89 (unverified [218.5.180.148]) by wi-poli.poli.cl (EMWAC SMTPRS 0.83) with SMTP id <B0000918901@wi-poli.poli.cl>; Tue, 04 Jun 2002 00:14:29 -0400 Message-Id: <B0000918901@wi-poli.poli.cl> To: <chrbader@telecom.at> From: "irese" <gort44@excite.com> Subject: Cash in on your home equity Date: Tue, 04 Jun 2002 00:18:34 -1600 MIME-Version: 1.0 Content-Type: text/plain; charset="Windows-1252" X-Keywords: Content-Transfer-Encoding: 7bit Mortgage Lenders & Brokers Are Ready to compete for your busin

In [5]:
# Example of a new email text
new_email_text = "Congratulations! You've won a free vacation. Click here to claim your prize."

# Convert the new email text into a feature vector using the same CountVectorizer used during training
new_email_vectorized = vectorizer.transform([new_email_text])

# Use the trained classifier to predict the probability of each class
predicted_probabilities = classifier.predict_proba(new_email_vectorized)

# Check if the spam class exists in the classifier classes
if 1 in classifier.classes_:
    spam_class_index = list(classifier.classes_).index(1)
    spam_probability = predicted_probabilities[0][spam_class_index]
else:
    spam_probability = 0.0  # Assume zero probability if the spam class does not exist

# Set a threshold for spam classification (you may adjust this threshold as needed)
spam_threshold = 0.5

# Print diagnostic information
print("Predicted Probabilities:", predicted_probabilities)
print("Spam Probability:", spam_probability)

# Check if the spam probability exceeds the threshold
if spam_probability > spam_threshold:
    print("The email is classified as spam.")
else:
    print("The email is classified as not spam (ham).")


Predicted Probabilities: [[1.03997607e-04 9.99896002e-01]]
Spam Probability: 0.999896002392689
The email is classified as spam.


In [6]:
 #  Support Vector Classifier
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import SVC  # Import Support Vector Classifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(dataset['text'], dataset['target'], test_size=0.2, random_state=42)

# Convert the text data into feature vectors using CountVectorizer
vectorizer = CountVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

# Train a Support Vector Machine classifier
svm_classifier = SVC(kernel='linear')
svm_classifier.fit(X_train_vectorized, y_train)

# Make predictions on the test set
predictions = svm_classifier.predict(X_test_vectorized)

# Evaluate the performance of the model
accuracy = accuracy_score(y_test, predictions)
conf_matrix = confusion_matrix(y_test, predictions)
classification_rep = classification_report(y_test, predictions)

print(f'Accuracy: {accuracy}')
print('Confusion Matrix:')
print(conf_matrix)
print('Classification Report:')
print(classification_rep)


Accuracy: 0.9948275862068966
Confusion Matrix:
[[779   0]
 [  6 375]]
Classification Report:
              precision    recall  f1-score   support

           0       0.99      1.00      1.00       779
           1       1.00      0.98      0.99       381

    accuracy                           0.99      1160
   macro avg       1.00      0.99      0.99      1160
weighted avg       0.99      0.99      0.99      1160



In [7]:
#  Random Forest Classifier
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier  # Import Random Forest Classifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(dataset['text'], dataset['target'], test_size=0.2, random_state=42)

# Convert the text data into feature vectors using CountVectorizer
vectorizer = CountVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

# Train a Random Forest Classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X_train_vectorized, y_train)

# Make predictions on the test set
predictions = rf_classifier.predict(X_test_vectorized)

# Evaluate the performance of the model
accuracy = accuracy_score(y_test, predictions)
conf_matrix = confusion_matrix(y_test, predictions)
classification_rep = classification_report(y_test, predictions)

print(f'Accuracy: {accuracy}')
print('Confusion Matrix:')
print(conf_matrix)
print('Classification Report:')
print(classification_rep)


Accuracy: 0.9922413793103448
Confusion Matrix:
[[779   0]
 [  9 372]]
Classification Report:
              precision    recall  f1-score   support

           0       0.99      1.00      0.99       779
           1       1.00      0.98      0.99       381

    accuracy                           0.99      1160
   macro avg       0.99      0.99      0.99      1160
weighted avg       0.99      0.99      0.99      1160



In [8]:
# Example of a new email text
new_email_text = "Congratulations! You've won a free vacation. Click here to claim your prize."

# Convert the new email text into a feature vector using the same CountVectorizer used during training
new_email_vectorized = vectorizer.transform([new_email_text])

# Use the trained classifier to predict the probability of each class
predicted_probabilities = classifier.predict_proba(new_email_vectorized)

# Check if the spam class exists in the classifier classes
if 1 in rf_classifier.classes_:
    spam_class_index = list(classifier.classes_).index(1)
    spam_probability = predicted_probabilities[0][spam_class_index]
else:
    spam_probability = 0.0  # Assume zero probability if the spam class does not exist

# Set a threshold for spam classification (you may adjust this threshold as needed)
spam_threshold = 0.5

# Print diagnostic information
print("Predicted Probabilities:", predicted_probabilities)
print("Spam Probability:", spam_probability)

# Check if the spam probability exceeds the threshold
if spam_probability > spam_threshold:
    print("The email is classified as spam.")
else:
    print("The email is classified as not spam (ham).")


Predicted Probabilities: [[1.03997607e-04 9.99896002e-01]]
Spam Probability: 0.999896002392689
The email is classified as spam.
