In [14]:




import pandas as pd
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer  # Use TF-IDF for better weighting
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report 
from nltk.corpus import stopwords
import joblib

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Load dataset
data = pd.read_csv('mail_data.csv', encoding='latin-1')
data = data[['Category', 'Message']]
data.columns = ['Category', 'Message']
data['Category'] = data['Category'].map({'spam': 1, 'ham': 0})

# Preprocess text (consider stemming/lemmatization)
def preprocess_text(text):
    text = text.lower()
    words = text.split()
    words = [word for word in words if word not in stop_words]
    # Consider stemming or lemmatization for better normalization (optional)
    # from nltk.stem import PorterStemmer
    # stemmer = PorterStemmer()
    # words = [stemmer.stem(word) for word in words]
    return ' '.join(words)

data['Message'] = data['Message'].apply(preprocess_text)

# Feature extraction (try TF-IDF)
vectorizer = TfidfVectorizer(max_features=2000)  # Reduce features for efficiency (consider tuning)
X = vectorizer.fit_transform(data['Message'])
y = data['Category']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train model (consider GridSearchCV for hyperparameter tuning)
model = MultinomialNB()
# You can try GridSearchCV to find optimal hyperparameters like alpha:
# from sklearn.model_selection import GridSearchCV
# param_grid = {'alpha': [0.01, 0.1, 1.0, 10.0]}
# grid = GridSearchCV(MultinomialNB(), param_grid=param_grid, cv=5)
# grid.fit(X_train, y_train)
# model = grid.best_estimator_

model.fit(X_train, y_train)

# Evaluate model
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
# class_report = classification_report(y_test, y_pred)   

print(f'Accuracy: {accuracy * 100:.2f}%')
print('Confusion Matrix:\n', conf_matrix)
# print('Classification Report:\n', class_report)

# Test model with a new email
def predict_email(text):
    text = preprocess_text(text)
    text_vector = vectorizer.transform([text])
    prediction = model.predict(text_vector)
    return 'Spam' if prediction[0] == 1 else 'Ham'

print('Testing Model ...')

email = "job oppurtunity in WIPRO"
print(email)

print(predict_email(email))

# Save model and vectorizer
joblib.dump(model, 'spam_model.pkl')
joblib.dump(vectorizer, 'vectorizer.pkl')



# old code 

# import pandas as pd
# import nltk
# from sklearn.feature_extraction.text import CountVectorizer
# from sklearn.model_selection import train_test_split
# from sklearn.naive_bayes import MultinomialNB
# from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
# from nltk.corpus import stopwords
# import joblib

# nltk.download('stopwords')
# stop_words = set(stopwords.words('english'))

# # Load dataset
# data = pd.read_csv('mail_data.csv', encoding='latin-1')
# data = data[['Category', 'Message']]
# data.columns = ['Category', 'Message']
# data['Category'] = data['Category'].map({'spam': 1, 'ham': 0})

# # Preprocess text
# def preprocess_text(text):
#     text = text.lower()
#     words = text.split()
#     words = [word for word in words if word not in stop_words]
#     return ' '.join(words)

# data['Message'] = data['Message'].apply(preprocess_text)

# # Feature extraction
# vectorizer = CountVectorizer()
# X = vectorizer.fit_transform(data['Message'])
# y = data['Message']

# # Train-test split
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# # Train model
# model = MultinomialNB()
# model.fit(X_train, y_train)

# # Evaluate model
# y_pred = model.predict(X_test)
# accuracy = accuracy_score(y_test, y_pred)
# conf_matrix = confusion_matrix(y_test, y_pred)
# class_report = classification_report(y_test, y_pred)

# print(f'Accuracy: {accuracy * 100:.2f}%')
# # print('Confusion Matrix:\n', conf_matrix)
# # print('Classification Report:\n', class_report)

# # Test model with a new email
# def predict_email(text):
#     text = preprocess_text(text)
#     text_vector = vectorizer.transform([text])
#     prediction = model.predict(text_vector)
#     return 'Spam' if prediction[0] == 1 else 'Ham'

# print('Testing Model ...')

# email = "Last Chance! Exclusive Deal Just for You"
# print(email)

# print(predict_email(email))

# # Save model and vectorizer
# joblib.dump(model, 'spam_model.pkl')
# joblib.dump(vectorizer, 'vectorizer.pkl')


Accuracy: 98.30%
Confusion Matrix:
 [[965   1]
 [ 18 131]]
Testing Model ...
job oppurtunity in WIPRO
Ham


[nltk_data] Downloading package stopwords to C:\Users\Mohammed Amjad
[nltk_data]     Ali\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


['vectorizer.pkl']