In [None]:
from google.colab import drive

# Directory inside the Colab runtime where Google Drive is attached; the
# dataset path used further down lives underneath it.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

In [None]:

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score, recall_score, f1_score
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import nltk

# Fetch the NLTK data used by preprocess_text: tokenizer models for
# word_tokenize and the English stop-word list.
# Recent NLTK releases (3.9 and later) load the tokenizer from the
# "punkt_tab" package instead of the pickled "punkt" one, so both are
# requested to keep the notebook working regardless of the installed version.
nltk.download("punkt")
nltk.download("punkt_tab")
nltk.download("stopwords")


In [None]:

# Translation table that deletes every ASCII punctuation character; built once
# because it never changes between calls.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

# Filled on the first call with a string argument so that the stop-word list
# is read and the stemmer constructed once instead of once per document.
_NLTK_HELPERS = {}


def preprocess_text(text):
    """Normalise one document into a space-separated string of stemmed tokens.

    Steps, in order: lower-case, remove digit runs, remove ASCII punctuation,
    tokenise with ``word_tokenize``, drop English stop words, Porter-stem.

    Parameters
    ----------
    text : object
        The raw document. Anything that is not a ``str`` (``None``, ``NaN``,
        numbers, lists, ...) is treated as missing.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces, or ``""`` for non-string
        input.
    """
    # A plain isinstance check already covers None / NaN / pd.NA, and unlike
    # pd.isna() it cannot raise on list- or array-like values.
    if not isinstance(text, str):
        return ""

    if not _NLTK_HELPERS:
        # Both values are computed before update() runs, so a failed lookup
        # never leaves the cache half-filled.
        _NLTK_HELPERS.update(
            stop_words=frozenset(stopwords.words('english')),
            stemmer=PorterStemmer(),
        )
    stop_words = _NLTK_HELPERS['stop_words']
    stemmer = _NLTK_HELPERS['stemmer']

    text = text.lower()
    text = re.sub(r'\d+', '', text)
    # NOTE(review): punctuation is stripped before stop-word filtering, so
    # apostrophe forms in the stop-word list (e.g. "don't") no longer match
    # their stripped counterparts ("dont"). Kept as-is so features are unchanged.
    text = text.translate(_PUNCTUATION_TABLE)
    tokens = [word for word in word_tokenize(text) if word not in stop_words]
    return ' '.join(stemmer.stem(word) for word in tokens)


In [None]:

# Read the e-mail CSV from the mounted Drive (decoded as Latin-1) and replace
# the raw 'text' column with its preprocessed form.
DATASET_PATH = '/content/drive/MyDrive/Colab Notebooks/spam_Emails_data.csv'

data = pd.read_csv(DATASET_PATH, encoding='latin1')
data = data.assign(text=data['text'].apply(preprocess_text))


In [None]:

X = data['text']
y = data['label']

# Split the cleaned text BEFORE building the vocabulary. Fitting the
# vectorizer on the full corpus lets words that only occur in the test
# documents shape the feature space, which leaks test information into
# training. The row partition depends only on the number of samples and the
# seed, so the same documents land in each split as when the matrix was split.
X_train_text, X_test_text, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Learn the bag-of-words vocabulary from the training documents only, then
# project the held-out documents onto that fixed vocabulary.
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(X_train_text)
X_test = vectorizer.transform(X_test_text)

# Full document-term matrix, kept under its original name for any later cell
# that still refers to it.
X_vectorized = vectorizer.transform(X)


In [None]:

# Hyper-parameters for the forest: 100 trees, fixed seed for repeatable runs.
rf_params = {"n_estimators": 100, "random_state": 42}

rf_model = RandomForestClassifier(**rf_params)
rf_model.fit(X_train, y_train)


In [None]:

# Score the fitted forest on the held-out split.
y_pred = rf_model.predict(X_test)

conf_matrix = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
# Precision, recall and F1 are all averaged across classes weighted by support.
precision, recall, f1 = (
    scorer(y_test, y_pred, average='weighted')
    for scorer in (precision_score, recall_score, f1_score)
)

print("Confusion Matrix:", conf_matrix, sep="\n")
for heading, value in (
    ("\nAccuracy:", accuracy),
    ("Precision:", precision),
    ("Recall:", recall),
    ("F1 Score:", f1),
):
    print(heading, value)


In [None]:

def predict_new_text(new_texts, model):
    """Classify raw e-mail texts and return human-readable labels.

    Each text is cleaned with ``preprocess_text`` and encoded with the
    notebook-level ``vectorizer`` before being passed to ``model``.

    Parameters
    ----------
    new_texts : iterable of str
        Raw, unprocessed e-mail bodies.
    model : estimator
        Fitted classifier exposing ``predict``; this is the model that is
        actually used (previously the argument was ignored in favour of the
        global ``rf_model``).

    Returns
    -------
    list of str
        ``'spam'`` or ``'ham'`` for each input text, in order. The list is
        also printed.
    """
    new_texts_preprocessed = [preprocess_text(text) for text in new_texts]
    new_texts_vectorized = vectorizer.transform(new_texts_preprocessed)
    predictions = model.predict(new_texts_vectorized)
    # The positive class is recognised both as the number 1 and as the string
    # "spam" (any casing), because the encoding of the 'label' column is not
    # visible here — TODO confirm against the dataset.
    predicted_labels = [
        'spam'
        if prediction == 1 or str(prediction).strip().lower() == 'spam'
        else 'ham'
        for prediction in predictions
    ]
    print("Predicted labels:", predicted_labels)
    return predicted_labels

def predict_new_texts(new_texts, rf_model, vectorizer):
    """Return the classifier's raw predictions for a batch of raw texts.

    Every text is run through ``preprocess_text``, the cleaned batch is
    encoded with ``vectorizer.transform`` and the resulting features are
    handed to ``rf_model.predict``, whose output is returned unchanged.
    """
    cleaned_batch = list(map(preprocess_text, new_texts))
    features = vectorizer.transform(cleaned_batch)
    raw_predictions = rf_model.predict(features)
    return raw_predictions

# Two hand-written sample messages used as a quick smoke test of the model.
new_emails = [
    "Please join the team meeting on Monday at 10 AM in the conference room. We'll discuss our progress and next steps.",
    "Book your dream vacation now and save up to 70% on travel costs. Limited time offer!",
]

predictions = predict_new_texts(new_emails, rf_model, vectorizer)

# Show each message next to the label the classifier returned for it.
paired_results = zip(new_emails, predictions)
for email, prediction in paired_results:
    message = f'Email: {email}\nPrediction: {prediction}\n'
    print(message)


In [None]:

import joblib
import pickle

# Persist the fitted forest with joblib.
joblib.dump(rf_model, 'rndf_model.joblib')

# Also pickle the vectorizer and a second copy of the forest.
# NOTE: only load these files back from a trusted location — unpickling
# executes arbitrary code.
pickle_targets = (
    (vectorizer, 'vectorizer.pkl'),
    (rf_model, 'rndf_model.pkl'),
)
for artifact, filename in pickle_targets:
    with open(filename, 'wb') as handle:
        pickle.dump(artifact, handle)
