In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Load spam.csv, discard the three 'Unnamed: N' columns and give the two
# remaining columns descriptive names.
df = pd.read_csv('spam.csv', encoding='latin-1').drop(
    columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4']
)
df.columns = ['label', 'message']

In [3]:
# Encode the class labels as integers: ham -> 0, spam -> 1.
# The identity entries (0 -> 0, 1 -> 1) make this cell safe to re-run: without
# them a second .map() finds no matching key and turns every label into NaN.
LABEL_CODES = {'ham': 0, 'spam': 1}
df['label'] = df['label'].map({**LABEL_CODES, 0: 0, 1: 1})

In [4]:
# Snapshot taken before the text-cleaning cells below run (labels are already
# encoded), so the "uncleaned" experiments can use the original message text.
df_uncleaned = df.copy(deep=True)

In [5]:
# Text Preprocessing 1: lowercase every message
lowercased_messages = df['message'].str.lower()
df['message'] = lowercased_messages

In [6]:
# Text Preprocessing 2
import string

PUNCT_TO_REMOVE = string.punctuation
# Deletion table built once at definition time instead of once per message.
_PUNCT_TABLE = str.maketrans('', '', PUNCT_TO_REMOVE)


def remove_punctuation(text):
    """Return ``text`` with every character of ``PUNCT_TO_REMOVE`` deleted.

    Parameters
    ----------
    text : str
        The string to clean.

    Returns
    -------
    str
        ``text`` without punctuation characters; all other characters,
        including whitespace, are left as they are.
    """
    return text.translate(_PUNCT_TABLE)

# Strip punctuation from every message, then preview the first rows
df["message"] = df["message"].apply(remove_punctuation)
df.head()

Unnamed: 0,label,message
0,0,go until jurong point crazy available only in ...
1,0,ok lar joking wif u oni
2,1,free entry in 2 a wkly comp to win fa cup fina...
3,0,u dun say so early hor u c already then say
4,0,nah i dont think he goes to usf he lives aroun...


In [7]:
# Text Preprocessing 3
import nltk
from nltk.corpus import stopwords

# The stopword corpus is downloaded separately from the nltk package. If it is
# missing, fetch it once so the notebook also runs on a fresh environment.
try:
    STOPWORDS = set(stopwords.words('english'))
except LookupError:
    nltk.download('stopwords', quiet=True)
    STOPWORDS = set(stopwords.words('english'))
def remove_stopwords(text):
    """Return ``text`` with every token found in ``STOPWORDS`` dropped.

    The input is coerced with ``str()`` and split on whitespace; the tokens
    that are kept are joined back together with single spaces.
    """
    kept_tokens = []
    for token in str(text).split():
        if token in STOPWORDS:
            continue
        kept_tokens.append(token)
    return " ".join(kept_tokens)

# Drop stopwords from every message, then preview the first rows
df["message"] = df["message"].apply(remove_stopwords)
df.head()

Unnamed: 0,label,message
0,0,go jurong point crazy available bugis n great ...
1,0,ok lar joking wif u oni
2,1,free entry 2 wkly comp win fa cup final tkts 2...
3,0,u dun say early hor u c already say
4,0,nah dont think goes usf lives around though


In [8]:
# Using Bag of Words vectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

# 80/20 hold-out split of the cleaned messages; the fixed seed keeps the
# partition repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df['message'],
    df['label'],
    test_size=0.2,
    random_state=42,
)

# Learn the vocabulary from the training messages only, then encode both splits.
count_vectorizer = CountVectorizer()
X_train = count_vectorizer.fit_transform(X_train)
X_test = count_vectorizer.transform(X_test)

In [9]:
# Training on cleaned data
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Fixed seed (same value as the train/test split) so the tree ensembles give
# the same metrics on every run; an unseeded random forest changes between runs.
bayes = MultinomialNB()
forest = RandomForestClassifier(random_state=42)
xgboost = XGBClassifier(random_state=42)

# Display name -> model, in the order the rows appear in the report.
models = {
    'Multinomial Naive Bayes': bayes,
    'Random Forest': forest,
    'XGBoost': xgboost,
}

estimators = list(models)
accuracies = []
precisions = []
recalls = []
f1s = []

# Fit each model on the training split and score it on the held-out split.
# With scikit-learn's default binary setting, precision/recall/F1 are reported
# for the positive class (label 1).
for model in models.values():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    accuracies.append(accuracy_score(y_test, y_pred))
    precisions.append(precision_score(y_test, y_pred))
    recalls.append(recall_score(y_test, y_pred))
    f1s.append(f1_score(y_test, y_pred))

print('For Bag of Words (CountVectorizer) on cleaned data:')
report = pd.DataFrame({'Estimator': estimators, 'Accuracy': accuracies, 'Precision': precisions, 'Recall': recalls, 'F1': f1s})
print(report)

For Bag of Words (CountVectorizer) on cleaned data:
                 Estimator  Accuracy  Precision    Recall        F1
0  Multinomial Naive Bayes  0.980269   0.970588  0.880000  0.923077
1            Random Forest  0.973991   1.000000  0.806667  0.892989
2                  XGBoost  0.970404   0.953488  0.820000  0.881720


In [10]:
# Using Bag of Words vectorizer (uncleaned)
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

# Same 80/20 split parameters as before, applied to the uncleaned snapshot.
X_train, X_test, y_train, y_test = train_test_split(
    df_uncleaned['message'],
    df_uncleaned['label'],
    test_size=0.2,
    random_state=42,
)

# Learn the vocabulary from the training messages only, then encode both splits.
count_vectorizer = CountVectorizer()
X_train = count_vectorizer.fit_transform(X_train)
X_test = count_vectorizer.transform(X_test)

In [11]:
# Training on uncleaned data
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Fixed seed (same value as the train/test split) so the tree ensembles give
# the same metrics on every run; an unseeded random forest changes between runs.
bayes = MultinomialNB()
forest = RandomForestClassifier(random_state=42)
xgboost = XGBClassifier(random_state=42)

# Display name -> model, in the order the rows appear in the report.
models = {
    'Multinomial Naive Bayes': bayes,
    'Random Forest': forest,
    'XGBoost': xgboost,
}

estimators = list(models)
accuracies = []
precisions = []
recalls = []
f1s = []

# Fit each model on the training split and score it on the held-out split.
# With scikit-learn's default binary setting, precision/recall/F1 are reported
# for the positive class (label 1).
for model in models.values():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    accuracies.append(accuracy_score(y_test, y_pred))
    precisions.append(precision_score(y_test, y_pred))
    recalls.append(recall_score(y_test, y_pred))
    f1s.append(f1_score(y_test, y_pred))

print('For Bag of Words (CountVectorizer) on uncleaned data:')
report = pd.DataFrame({'Estimator': estimators, 'Accuracy': accuracies, 'Precision': precisions, 'Recall': recalls, 'F1': f1s})
print(report)

For Bag of Words (CountVectorizer) on uncleaned data:
                 Estimator  Accuracy  Precision    Recall        F1
0  Multinomial Naive Bayes  0.983857   0.985294  0.893333  0.937063
1            Random Forest  0.977578   1.000000  0.833333  0.909091
2                  XGBoost  0.977578   0.962963  0.866667  0.912281


In [12]:
# Using TF-IDF vectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# 80/20 hold-out split of the cleaned messages (train_test_split was imported
# in an earlier cell).
X_train, X_test, y_train, y_test = train_test_split(
    df['message'],
    df['label'],
    test_size=0.2,
    random_state=42,
)

# NOTE: the name `count_vectorizer` is carried over from the Bag-of-Words
# cells, but here it holds a TfidfVectorizer.
count_vectorizer = TfidfVectorizer()
X_train = count_vectorizer.fit_transform(X_train)
X_test = count_vectorizer.transform(X_test)

In [13]:
# Training on cleaned data
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Fixed seed (same value as the train/test split) so the tree ensembles give
# the same metrics on every run; an unseeded random forest changes between runs.
bayes = MultinomialNB()
forest = RandomForestClassifier(random_state=42)
xgboost = XGBClassifier(random_state=42)

# Display name -> model, in the order the rows appear in the report.
models = {
    'Multinomial Naive Bayes': bayes,
    'Random Forest': forest,
    'XGBoost': xgboost,
}

estimators = list(models)
accuracies = []
precisions = []
recalls = []
f1s = []

# Fit each model on the training split and score it on the held-out split.
# With scikit-learn's default binary setting, precision/recall/F1 are reported
# for the positive class (label 1).
for model in models.values():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    accuracies.append(accuracy_score(y_test, y_pred))
    precisions.append(precision_score(y_test, y_pred))
    recalls.append(recall_score(y_test, y_pred))
    f1s.append(f1_score(y_test, y_pred))

print('For TF-IDF Vectorizer on cleaned data:')
report = pd.DataFrame({'Estimator': estimators, 'Accuracy': accuracies, 'Precision': precisions, 'Recall': recalls, 'F1': f1s})
print(report)

For TF-IDF Vectorizer on cleaned data:
                 Estimator  Accuracy  Precision    Recall        F1
0  Multinomial Naive Bayes  0.967713      1.000  0.760000  0.863636
1            Random Forest  0.972197      1.000  0.793333  0.884758
2                  XGBoost  0.963229      0.936  0.780000  0.850909


In [14]:
# Using TF-IDF vectorizer (uncleaned)
from sklearn.feature_extraction.text import TfidfVectorizer

# 80/20 hold-out split of the uncleaned snapshot (train_test_split was
# imported in an earlier cell).
X_train, X_test, y_train, y_test = train_test_split(
    df_uncleaned['message'],
    df_uncleaned['label'],
    test_size=0.2,
    random_state=42,
)

# NOTE: the name `count_vectorizer` is carried over from the Bag-of-Words
# cells, but here it holds a TfidfVectorizer.
count_vectorizer = TfidfVectorizer()
X_train = count_vectorizer.fit_transform(X_train)
X_test = count_vectorizer.transform(X_test)

In [15]:
# Training on uncleaned data
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Fixed seed (same value as the train/test split) so the tree ensembles give
# the same metrics on every run; an unseeded random forest changes between runs.
bayes = MultinomialNB()
forest = RandomForestClassifier(random_state=42)
xgboost = XGBClassifier(random_state=42)

# Display name -> model, in the order the rows appear in the report.
models = {
    'Multinomial Naive Bayes': bayes,
    'Random Forest': forest,
    'XGBoost': xgboost,
}

estimators = list(models)
accuracies = []
precisions = []
recalls = []
f1s = []

# Fit each model on the training split and score it on the held-out split.
# With scikit-learn's default binary setting, precision/recall/F1 are reported
# for the positive class (label 1).
for model in models.values():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    accuracies.append(accuracy_score(y_test, y_pred))
    precisions.append(precision_score(y_test, y_pred))
    recalls.append(recall_score(y_test, y_pred))
    f1s.append(f1_score(y_test, y_pred))

print('For TF-IDF Vectorizer on uncleaned data:')
report = pd.DataFrame({'Estimator': estimators, 'Accuracy': accuracies, 'Precision': precisions, 'Recall': recalls, 'F1': f1s})
print(report)

For TF-IDF Vectorizer on uncleaned data:
                 Estimator  Accuracy  Precision    Recall        F1
0  Multinomial Naive Bayes  0.962332   1.000000  0.720000  0.837209
1            Random Forest  0.974888   1.000000  0.813333  0.897059
2                  XGBoost  0.976682   0.984375  0.840000  0.906475


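- Best single-model performance: Multinomial Naive Bayes on Bag-of-Words features from the uncleaned data (accuracy 0.984, F1 0.937). :o This is less surprising than it looks: by default `CountVectorizer` already lowercases the text and ignores punctuation when tokenizing, so the manual cleaning mainly adds stopword removal.
- Next, let's try soft voting with Multinomial Naive Bayes, Random Forest and XGBoost on the same Bag-of-Words features of the uncleaned data.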

In [18]:
# Using Bag of Words vectorizer (uncleaned)
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

# Rebuild the Bag-of-Words features of the uncleaned snapshot, because the
# TF-IDF cells overwrote X_train / X_test.
X_train, X_test, y_train, y_test = train_test_split(
    df_uncleaned['message'],
    df_uncleaned['label'],
    test_size=0.2,
    random_state=42,
)

# Learn the vocabulary from the training messages only, then encode both splits.
count_vectorizer = CountVectorizer()
X_train = count_vectorizer.fit_transform(X_train)
X_test = count_vectorizer.transform(X_test)

In [19]:
from sklearn.ensemble import VotingClassifier

# The member classifiers and the metric functions were imported in earlier cells.
# The tree ensembles are seeded (same value as the train/test split) so the
# ensemble's metrics are the same on every run.
estimators = [
    ('bayes', MultinomialNB()),
    ('forest', RandomForestClassifier(random_state=42)),
    ('xgboost', XGBClassifier(random_state=42)),
]
# voting='soft' combines the members' predicted class probabilities rather
# than their hard label votes.
ensemble = VotingClassifier(estimators, voting='soft')

ensemble.fit(X_train, y_train)
y_pred = ensemble.predict(X_test)

print('Ensemble of Multinomial Naive Bayes, Random Forest, and XGBoost:')
print('Accuracy:', accuracy_score(y_test, y_pred))
print('Precision:', precision_score(y_test, y_pred))
print('Recall:', recall_score(y_test, y_pred))
print('F1:', f1_score(y_test, y_pred))

Ensemble of Multinomial Naive Bayes, Random Forest, and XGBoost:
Accuracy: 0.9838565022421525
Precision: 1.0
Recall: 0.88
F1: 0.9361702127659575
