In [None]:
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
import string

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
import os
from google.colab import files

# Only open the interactive upload widget when the dataset is not already in
# the working directory. Prompting unconditionally blocks "Restart & Run All"
# on manual input even though the later read of "emails.csv" would succeed.
if os.path.exists("emails.csv"):
    # Nothing was uploaded in this run; keep the name bound to an empty mapping.
    uploaded = {}
else:
    uploaded = files.upload()


In [None]:
!ls

emails.csv  sample_data


In [None]:
# Load the e-mail dataset from the working directory. Later cells read the
# 'text' column (raw message) and the 'spam' column (label) from this frame.
df = pd.read_csv("emails.csv")
# Preview the first rows as the cell output.
df.head()

Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1


In [None]:
import re
from nltk.stem import PorterStemmer

In [None]:

# Porter stemmer instance shared by every preprocess_text call
stemmer = PorterStemmer()
# English stop words from NLTK, held in a set for fast membership tests
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """Normalize one message into a space-joined string of stemmed tokens.

    The text is stripped of non-word characters, digits and underscores,
    lowercased, split on whitespace, filtered against the module-level
    ``stop_words`` set and stemmed with the module-level ``stemmer``.

    Parameters
    ----------
    text : str
        Raw message text. ``None`` and NaN (a missing cell in a pandas
        column) are treated as an empty message; any other non-string value
        is converted with ``str`` before processing.

    Returns
    -------
    str
        The surviving stemmed tokens joined by single spaces; ``''`` when no
        token survives.
    """
    # Missing values would otherwise make re.sub raise TypeError.
    # (NaN is the only float that is not equal to itself.)
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    # Remove special characters. `\W` alone does not match "_", so the
    # underscore is listed explicitly; otherwise separator runs such as
    # "_____" would survive as junk tokens.
    text = re.sub(r'[\W\d_]+', ' ', text)

    # lowercase so the comparison against the lowercase stop-word list works
    text = text.lower()

    # Tokenization
    tokens = text.split()

    # Removing stop words and stemming
    filtered_tokens = [stemmer.stem(token) for token in tokens if token not in stop_words]

    return ' '.join(filtered_tokens)

# Apply preprocessing: run preprocess_text on each value of the 'text' column
# and store the cleaned strings in a new 'processed_text' column of df.
df['processed_text'] = df['text'].apply(preprocess_text)


In [None]:
# Preview the frame again to inspect the newly added 'processed_text' column.
df.head()

Unnamed: 0,text,spam,processed_text
0,Subject: naturally irresistible your corporate...,1,subject natur irresist corpor ident lt realli ...
1,Subject: the stock trading gunslinger fanny i...,1,subject stock trade gunsling fanni merril muzo...
2,Subject: unbelievable new homes made easy im ...,1,subject unbeliev new home made easi im want sh...
3,Subject: 4 color printing special request add...,1,subject color print special request addit info...
4,"Subject: do not have money , get software cds ...",1,subject money get softwar cd softwar compat gr...


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectorizer with library-default settings.
tfidf_vectorizer = TfidfVectorizer()

# Learn the vocabulary/IDF weights and build the feature matrix in one step.
# NOTE(review): this fits on every row of df, and the train/test split is only
# made afterwards, so the statistics learned here include rows that later
# become test data. Confirm whether that is intended.
X = tfidf_vectorizer.fit_transform(df['processed_text'])



In [None]:
from sklearn.model_selection import train_test_split

y = df['spam']

# Split the preprocessed *text* before vectorizing. Fitting TF-IDF on the full
# corpus and splitting afterwards leaks test-set vocabulary and IDF statistics
# into the training features, which inflates the reported scores.
# The row assignment depends only on the number of rows and random_state, so
# the same rows land in the test set as when splitting the feature matrix.
text_train, text_test, y_train, y_test = train_test_split(
    df['processed_text'], y, test_size=0.2, random_state=42
)

# Refit the existing vectorizer object on training rows only, so any later use
# of `tfidf_vectorizer` shares the vocabulary the models are trained on.
X_train = tfidf_vectorizer.fit_transform(text_train)
X_test = tfidf_vectorizer.transform(text_test)

# Keep the full-corpus matrix column-compatible with X_train / X_test.
X = tfidf_vectorizer.transform(df['processed_text'])


In [None]:
from sklearn.naive_bayes import MultinomialNB

# Baseline classifier: Multinomial Naive Bayes with default smoothing,
# trained on the training split (fit returns the estimator itself).
model = MultinomialNB().fit(X_train, y_train)

# Predicted labels for the held-out split, evaluated in the next cell.
y_pred = model.predict(X_test)


In [None]:
from sklearn.metrics import accuracy_score, classification_report

# Overall fraction of held-out messages labelled correctly.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {accuracy}")

# Per-class precision / recall / F1 table.
baseline_report = classification_report(y_true=y_test, y_pred=y_pred)
print(baseline_report)


Accuracy: 0.8821989528795812
              precision    recall  f1-score   support

           0       0.86      1.00      0.93       856
           1       1.00      0.53      0.70       290

    accuracy                           0.88      1146
   macro avg       0.93      0.77      0.81      1146
weighted avg       0.90      0.88      0.87      1146



In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV

# Define parameter grid
# alpha is the additive smoothing strength and must be strictly positive.
# alpha=0.0 disables smoothing: depending on the scikit-learn version it is
# either clipped to a tiny value with a warning, or used as-is and produces
# log(0) for words unseen in a class. The grid therefore starts at small
# positive values instead of zero.
param_grid = {'alpha': [0.001, 0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]}

# 5-fold cross-validated search; refit=True retrains the best candidate on the
# whole training split. verbose=1 prints only the summary line rather than one
# line per fold.
# NOTE(review): the TF-IDF features were computed before this search, outside
# the CV folds, so fold scores are slightly optimistic. Wrapping the vectorizer
# and classifier in a Pipeline would avoid that.
grid_search = GridSearchCV(MultinomialNB(), param_grid, refit=True, verbose=1, cv=5)

grid_search.fit(X_train, y_train)

best_alpha = grid_search.best_params_['alpha']
print("Best alpha:", best_alpha)


# Estimator already refit with best_alpha on the full training split.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)



Fitting 5 folds for each of 11 candidates, totalling 55 fits
[CV 1/5] END .........................alpha=0.0;, score=0.969 total time=   0.0s
[CV 2/5] END .........................alpha=0.0;, score=0.975 total time=   0.0s
[CV 3/5] END .........................alpha=0.0;, score=0.980 total time=   0.0s
[CV 4/5] END .........................alpha=0.0;, score=0.975 total time=   0.0s
[CV 5/5] END .........................alpha=0.0;, score=0.980 total time=   0.0s
[CV 1/5] END .........................alpha=0.1;, score=0.989 total time=   0.0s
[CV 2/5] END .........................alpha=0.1;, score=0.984 total time=   0.0s
[CV 3/5] END .........................alpha=0.1;, score=0.993 total time=   0.0s
[CV 4/5] END .........................alpha=0.1;, score=0.991 total time=   0.0s
[CV 5/5] END .........................alpha=0.1;, score=0.984 total time=   0.0s
[CV 1/5] END .........................alpha=0.2;, score=0.978 total time=   0.0s
[CV 2/5] END .........................alpha=0.2;



[CV 2/5] END .........................alpha=0.3;, score=0.959 total time=   0.0s
[CV 3/5] END .........................alpha=0.3;, score=0.969 total time=   0.0s
[CV 4/5] END .........................alpha=0.3;, score=0.959 total time=   0.0s
[CV 5/5] END .........................alpha=0.3;, score=0.961 total time=   0.0s
[CV 1/5] END .........................alpha=0.4;, score=0.951 total time=   0.0s
[CV 2/5] END .........................alpha=0.4;, score=0.944 total time=   0.0s
[CV 3/5] END .........................alpha=0.4;, score=0.955 total time=   0.0s
[CV 4/5] END .........................alpha=0.4;, score=0.950 total time=   0.0s
[CV 5/5] END .........................alpha=0.4;, score=0.941 total time=   0.0s
[CV 1/5] END .........................alpha=0.5;, score=0.929 total time=   0.0s
[CV 2/5] END .........................alpha=0.5;, score=0.927 total time=   0.0s
[CV 3/5] END .........................alpha=0.5;, score=0.939 total time=   0.0s
[CV 4/5] END ...............

In [None]:
# Train the final model with the smoothing value selected by the grid search
# rather than a hard-coded number, so this cell stays consistent with the
# search whenever the data, the features or the grid change.
optimal_model = MultinomialNB(alpha=best_alpha)
optimal_model.fit(X_train, y_train)

# Evaluate on the held-out split.
y_pred_optimal = optimal_model.predict(X_test)

accuracy_optimal = accuracy_score(y_test, y_pred_optimal)
print(f"Optimized Model Accuracy: {accuracy_optimal}")
print(classification_report(y_test, y_pred_optimal))


Optimized Model Accuracy: 0.9851657940663177
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       856
           1       0.99      0.96      0.97       290

    accuracy                           0.99      1146
   macro avg       0.99      0.98      0.98      1146
weighted avg       0.99      0.99      0.99      1146

