In [91]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [92]:
dataset = pd.read_csv('spam.csv',encoding='latin-1')
dataset = dataset.drop(columns=['Unnamed: 2','Unnamed: 3','Unnamed: 4'])
dataset.columns = ['label','email']
dataset

Unnamed: 0,label,email
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [93]:
import re
from nltk.corpus import stopwords

def clean_text(raw_text):
    stop_words = set(stopwords.words('english'))
    text = re.sub('[^A-Za-z]',' ',raw_text).lower()
    words = text.split()
    clean_words = [word for word in words if not word in stop_words]
    clean_email = (' ').join(clean_words) 
    return clean_email

In [94]:
dataset['clean_email'] = dataset['email'].apply(clean_text)
dataset

Unnamed: 0,label,email,clean_email
0,ham,"Go until jurong point, crazy.. Available only ...",go jurong point crazy available bugis n great ...
1,ham,Ok lar... Joking wif u oni...,ok lar joking wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry wkly comp win fa cup final tkts st ...
3,ham,U dun say so early hor... U c already then say...,u dun say early hor u c already say
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah think goes usf lives around though
...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,nd time tried contact u u pound prize claim ea...
5568,ham,Will Ì_ b going to esplanade fr home?,b going esplanade fr home
5569,ham,"Pity, * was in mood for that. So...any other s...",pity mood suggestions
5570,ham,The guy did some bitching but I acted like i'd...,guy bitching acted like interested buying some...


In [95]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(dataset['clean_email']).toarray()

In [96]:
print(X)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [97]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y = le.fit_transform(dataset['label'])
y

array([0, 0, 1, ..., 0, 0, 0])

In [98]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=1)

In [99]:
from sklearn.naive_bayes import MultinomialNB
nb_clf = MultinomialNB()
nb_clf.fit(X_train,y_train)

MultinomialNB()

In [100]:
nb_pred = nb_clf.predict(X_test)

In [101]:
from sklearn.metrics import accuracy_score,precision_score,f1_score,recall_score
print(f'Accuracy Score: {accuracy_score(y_test,nb_pred)}')
print(f'Precision: {precision_score(y_test,nb_pred)}')
print(f'Recall: {recall_score(y_test,nb_pred)}')
print(f'F1 Score: {f1_score(y_test,nb_pred)}')

Accuracy Score: 0.979372197309417
Precision: 1.0
Recall: 0.8345323741007195
F1 Score: 0.9098039215686274


In [102]:
from sklearn.linear_model import PassiveAggressiveClassifier
pass_clf = PassiveAggressiveClassifier()
pass_clf.fit(X_train,y_train)

PassiveAggressiveClassifier()

In [103]:
pass_pred = pass_clf.predict(X_test)

In [104]:
from sklearn.metrics import accuracy_score,precision_score,f1_score,recall_score
print(f'Accuracy Score: {accuracy_score(y_test,pass_pred)}')
print(f'Precision: {precision_score(y_test,pass_pred)}')
print(f'Recall: {recall_score(y_test,pass_pred)}')
print(f'F1 Score: {f1_score(y_test,pass_pred)}')

Accuracy Score: 0.989237668161435
Precision: 1.0
Recall: 0.9136690647482014
F1 Score: 0.9548872180451127


In [105]:
target = 'Congratulations! You have won a free vacation! Claim your prize now!' 
pred = le.inverse_transform(pass_clf.predict(vectorizer.transform([target])))[0]
print(f'The predicted category is {pred}!')

The predicted category is spam!


In [106]:
target = 'I will be coming back to Pakistan next week. You need to pick me up from airport' 
pred = le.inverse_transform(pass_clf.predict(vectorizer.transform([target])))[0]
print(f'The predicted category is {pred}!')

The predicted category is ham!
