In [50]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Apply seaborn's default plot styling.
sns.set()

In [51]:
# Load the e-mail dataset from CSV into a DataFrame and preview the first 10 rows
data = pd.read_csv('email.csv')
data.head(10)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...


In [52]:
# Show each column's dtype and non-null count, plus the total number of rows
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5573 entries, 0 to 5572
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  5573 non-null   object
 1   Message   5573 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [53]:
# Count the rows per label in 'Category' to check class balance and spot unexpected labels
data['Category'].value_counts()

Category
ham               4825
spam               747
{"mode":"full"       1
Name: count, dtype: int64

In [54]:
from sklearn.preprocessing import LabelEncoder

# Fit the encoder on the raw labels, then overwrite the column with the
# resulting integer codes (every distinct raw value gets its own code).
le = LabelEncoder().fit(data['Category'])
data['Category'] = le.transform(data['Category'])

In [55]:
# Preview the first 10 rows after label encoding
data.head(10)

Unnamed: 0,Category,Message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."
5,1,FreeMsg Hey there darling it's been 3 week's n...
6,0,Even my brother is not like to speak with me. ...
7,0,As per your request 'Melle Melle (Oru Minnamin...
8,1,WINNER!! As a valued network customer you have...
9,1,Had your mobile 11 months or more? U R entitle...


In [56]:
# Label distribution after encoding
data['Category'].value_counts()

Category
0    4825
1     747
2       1
Name: count, dtype: int64

In [57]:
# Remove the malformed row whose category is neither 'ham' nor 'spam'.
# The valid integer codes are looked up through the fitted encoder instead of
# hard-coding the junk label's code, which depends on the encoder's class ordering.
# A new, re-indexed frame is built rather than mutating `data` in place, so
# re-running this cell is safe and the index stays contiguous.
valid_codes = le.transform(['ham', 'spam'])
data = data[data['Category'].isin(valid_codes)].reset_index(drop=True)

In [58]:
# Label distribution after removing the invalid row
data['Category'].value_counts()

Category
0    4825
1     747
Name: count, dtype: int64

In [59]:
# Re-check dtypes, non-null counts and row count after cleaning
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  5572 non-null   int64 
 1   Message   5572 non-null   object
dtypes: int64(1), object(1)
memory usage: 87.2+ KB


In [60]:
# Cast every message to str before it is used as text input further down
data['Message'] = data['Message'].astype(str)

# Model Training

In [61]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer

# Feature extractor: TF-IDF with default settings
tfidf = TfidfVectorizer()
# Classifier: Gaussian Naive Bayes with default settings
# NOTE(review): MultinomialNB is the more usual pairing with TF-IDF features
# and would avoid the dense conversion later on — consider evaluating it.
cls = GaussianNB()

In [62]:
# Features are the message texts, target is the encoded category
X, y = data['Message'], data['Category']

In [63]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for testing (75% train); the fixed seed keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [64]:
# Learn the TF-IDF vocabulary and weights from the training messages only,
# then apply that same fitted vectorizer to the test messages.
# Each result is expanded into a dense NumPy array via .toarray().
train_sparse = tfidf.fit_transform(X_train)
X_train_tfidf = train_sparse.toarray()
test_sparse = tfidf.transform(X_test)
X_test_tfidf = test_sparse.toarray()

In [65]:
# Fit the classifier on the dense TF-IDF training matrix and the training labels
cls.fit(X_train_tfidf, y_train)

In [66]:
# Predict labels for the held-out test messages
y_pred = cls.predict(X_test_tfidf)

In [67]:
# Fraction of test messages classified correctly, printed as a percentage
score = accuracy_score(y_true=y_test, y_pred=y_pred)
print(100 * score)

90.38047379755922


In [68]:
# Saving model with pickle
import pickle

# Destination file for the serialized classifier
model_pkl_file = "email_spam.pkl"

# Write the fitted classifier to disk; the file is opened in binary mode
with open(model_pkl_file, mode='wb') as file:
    pickle.dump(cls, file)

In [69]:
# Persist the fitted TF-IDF vectorizer as well
# Destination file for the serialized vectorizer
tfidf_model = "vectorizer.pkl"

with open(tfidf_model, mode='wb') as file:
    pickle.dump(tfidf, file)