<a href="https://colab.research.google.com/github/bhandarisatkriti/Email_Spam_Classifier/blob/main/Email_Spam_Classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install pandas numpy scikit-learn nltk




In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re
# Fetch the NLTK 'stopwords' corpus and the 'punkt' tokenizer data.
# The value of the last call is what the cell displays as its result.
# NOTE(review): no visible cell uses `stopwords` or `word_tokenize` --
# confirm these downloads are still needed.
nltk.download('stopwords')
nltk.download('punkt')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:

# Read the raw CSV, keep only the label/message columns under readable names,
# and encode the label as an integer: 0 for ham, 1 for spam.
data = (
    pd.read_csv("spam.csv", encoding="latin-1")
    .loc[:, ['v1', 'v2']]
    .rename(columns={'v1': 'Label', 'v2': 'Text'})
    .assign(Label=lambda frame: frame['Label'].map({'ham': 0, 'spam': 1}))
)

# Preview the first rows of the prepared frame
print(data.head())


   Label                                               Text
0      0  Go until jurong point, crazy.. Available only ...
1      0                      Ok lar... Joking wif u oni...
2      1  Free entry in 2 a wkly comp to win FA Cup fina...
3      0  U dun say so early hor... U c already then say...
4      0  Nah I don't think he goes to usf, he lives aro...


In [None]:
def clean_text(text):
    """Normalize a message for vectorization.

    The text is lower-cased, every non-word character is replaced by a
    space, digits are deleted, and runs of whitespace are collapsed to a
    single space with leading/trailing whitespace stripped.

    Args:
        text: The raw message string.

    Returns:
        The normalized string (possibly empty).
    """
    # Order matters: digits are removed after punctuation has become spaces,
    # and whitespace is collapsed last to tidy up the gaps left behind.
    substitutions = (
        (r'\W', ' '),
        (r'\d', ''),
        (r'\s+', ' '),
    )
    cleaned = text.lower()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

# Normalize every message with clean_text and store the result in a new column
data['Cleaned_Text'] = data['Text'].map(clean_text)
print(data['Cleaned_Text'].head())


0    go until jurong point crazy available only in ...
1                              ok lar joking wif u oni
2    free entry in a wkly comp to win fa cup final ...
3          u dun say so early hor u c already then say
4    nah i don t think he goes to usf he lives arou...
Name: Cleaned_Text, dtype: object


In [None]:
# Sanity check: print the location where NLTK resolves the
# 'tokenizers/punkt' resource.
import nltk
print(nltk.data.find('tokenizers/punkt'))


/root/nltk_data/tokenizers/punkt


In [None]:
# Convert text to numerical form using TF-IDF (vocabulary capped at 5000 terms).
# The result is kept as the sparse matrix returned by fit_transform: a dense
# 5572 x 5000 float array would take roughly 220 MB, and both
# train_test_split and MultinomialNB accept sparse input directly.
# NOTE(review): the vectorizer is fit on the full dataset before the
# train/test split, so the vocabulary and IDF weights have seen the test
# messages; consider fitting on the training split only.
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(data['Cleaned_Text'])
y = data['Label']

print(f"Shape of Features: {X.shape}")


Shape of Features: (5572, 5000)


In [None]:
# Hold out 20% of the samples for testing; the fixed random_state makes
# the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [None]:
# Fit a multinomial Naive Bayes classifier on the training split
# (fit returns the estimator itself, so it can be bound in one step)
model = MultinomialNB().fit(X_train, y_train)

# Predict labels for the held-out test split
y_pred = model.predict(X_test)


In [None]:
# Compute all evaluation metrics on the held-out split first ...
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
matrix = confusion_matrix(y_test, y_pred)

# ... then print them: overall accuracy, per-class precision/recall/F1,
# and the confusion matrix.
print(f"Accuracy: {accuracy * 100:.2f}%")

print("\nClassification Report:")
print(report)

print("\nConfusion Matrix:")
print(matrix)


Accuracy: 96.59%

Classification Report:
              precision    recall  f1-score   support

           0       0.96      1.00      0.98       965
           1       1.00      0.75      0.85       150

    accuracy                           0.97      1115
   macro avg       0.98      0.87      0.92      1115
weighted avg       0.97      0.97      0.96      1115


Confusion Matrix:
[[965   0]
 [ 38 112]]


In [9]:
!pip install pyngrok streamlit




In [10]:
# Upload the pickled model and vectorizer into the Colab working directory.
# NOTE(review): the recorded output shows the files being saved with a
# " (1)" suffix (presumably because files with those names already existed),
# while the app cell loads the un-suffixed names -- confirm the intended
# artifacts are the ones actually being loaded.
from google.colab import files
uploaded = files.upload()


Saving tfidf_vectorizer.pkl to tfidf_vectorizer (1).pkl
Saving spam_classifier_model.pkl to spam_classifier_model (1).pkl


In [16]:
import streamlit as st
import joblib
import os

# Streamlit front end for the spam classifier: loads the pickled model and
# vectorizer, takes free text from the user, and shows the prediction.
# NOTE(review): the launch cell runs `streamlit run app.py`, but no visible
# cell writes app.py -- presumably this code must also be saved to that file
# (e.g. with `%%writefile app.py`); confirm.

# Load the trained model and vectorizer
model_path = 'spam_classifier_model.pkl'
vectorizer_path = 'tfidf_vectorizer.pkl'

# Check if the model and vectorizer files are present
if os.path.exists(model_path) and os.path.exists(vectorizer_path):
    # SECURITY: joblib.load unpickles its input, which can execute arbitrary
    # code. Only load artifacts that come from a trusted source.
    model = joblib.load(model_path)
    vectorizer = joblib.load(vectorizer_path)

    st.title("Spam Email Classifier")
    user_input = st.text_area("Enter your email text here:")

    if st.button("Predict"):
        if not user_input or not user_input.strip():
            # An empty message vectorizes to all zeros, so any "prediction"
            # would carry no information about the input; ask for text.
            st.warning("Please enter some email text before predicting.")
        else:
            # Vectorize the input text
            # NOTE(review): the training cells pass text through clean_text
            # before vectorizing; raw input is vectorized here -- confirm the
            # pickled vectorizer expects raw text.
            input_vectorized = vectorizer.transform([user_input])

            # Debugging: Print the input text and its vectorized form
            st.write(f"Input Text: {user_input}")
            st.write(f"Vectorized Input: {input_vectorized.toarray()}")  # Shows the feature vector

            # Debugging: Check prediction
            prediction = model.predict(input_vectorized)
            st.write(f"Prediction: {prediction}")  # Shows the raw prediction output

            # Optionally check prediction probability if your model supports it
            if hasattr(model, 'predict_proba'):
                prediction_proba = model.predict_proba(input_vectorized)
                st.write(f"Prediction Probability: {prediction_proba}")  # Shows the probability distribution

            # Show prediction result (label 1 is treated as spam)
            if prediction[0] == 1:
                st.error("This is SPAM!")
            else:
                st.success("This is NOT SPAM!")

else:
    st.error("Model or vectorizer files not found. Please upload the correct files.")


2025-01-25 08:49:21.494 
  command:

    streamlit run /usr/local/lib/python3.11/dist-packages/colab_kernel_launcher.py [ARGUMENTS]
2025-01-25 08:49:21.509 Session state does not function when running a script without `streamlit run`


In [17]:
import os
from getpass import getpass

from pyngrok import ngrok

# Set ngrok authtoken.
# The token must never be committed with the notebook: read it from the
# NGROK_AUTHTOKEN environment variable, or prompt for it without echoing.
# SECURITY: the token that was previously hardcoded in this cell has been
# exposed and should be revoked and replaced in the ngrok dashboard.
ngrok_auth_token = os.environ.get("NGROK_AUTHTOKEN") or getpass("ngrok authtoken: ")
ngrok.set_auth_token(ngrok_auth_token)


In [18]:
import os
import subprocess

STREAMLIT_PORT = 8501  # Streamlit's default port; also the tunnel target
APP_SCRIPT = "app.py"

# Fail fast with a clear message instead of announcing a live URL for an
# app that could not start because its script is missing.
if not os.path.exists(APP_SCRIPT):
    raise FileNotFoundError(
        f"{APP_SCRIPT} not found; save the Streamlit app code to this file before launching."
    )

# Start the Streamlit app in the background. The port is pinned so the
# tunnel below always points at this process, and the handle is kept so the
# app can be stopped later with streamlit_process.terminate().
streamlit_process = subprocess.Popen(
    ["streamlit", "run", APP_SCRIPT, "--server.port", str(STREAMLIT_PORT)]
)

# Open a tunnel to the Streamlit app (`public_url` is the tunnel object)
public_url = ngrok.connect(STREAMLIT_PORT)

# Print the public URL where the Streamlit app is running
print(f"Streamlit app is live at {public_url.public_url}")


Streamlit app is live at NgrokTunnel: "https://632d-34-106-139-26.ngrok-free.app" -> "http://localhost:8501"
