# 1. Import library 


In [32]:
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix

# 2. Import dataset

In [34]:
df = pd.read_csv('spam.csv', header=None, names=['label', 'text'], sep='\t', encoding='latin-1')

# 3. Eksplorasi dataset

In [35]:
df

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ã¼ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [36]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   5572 non-null   object
 1   text    5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [37]:
df.describe()

Unnamed: 0,label,text
count,5572,5572
unique,2,5169
top,ham,"Sorry, I'll call later"
freq,4825,30


In [39]:
df.head(10)

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...


In [41]:
df.tail(7)

Unnamed: 0,label,text
5565,ham,Huh y lei...
5566,spam,REMINDER FROM O2: To get 2.50 pounds free call...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ã¼ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...
5571,ham,Rofl. Its true to its name


# 4. Preprocessing data

In [44]:
ps = PorterStemmer()
stop_words = set(stopwords.words('english'))

def clean_text(text):
    text = re.sub('[a-zA-Z]', ' ', text)
    text = text.lower()
    text = text.split()
    text = [ps.stem(word) for word in text if not word in stop_words]
    text = ' '.join(text)
    return text
df['text'] = df['text'].apply(clean_text)

# 5. Ekstraksi fitur dari teks

In [45]:
cv = CountVectorizer()
X = cv.fit_transform(df['text']).toarray()
y = df['label'].values

# 6. Split data 

In [46]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 7. Modeling dan training

In [47]:
lr = LogisticRegression()
lr.fit(X_train, y_train)

y_pred = lr.predict(X_test)

# 8. Evaluasi Model

In [49]:
acc_score = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

print('Akurasi Model : ', acc_score)
print('Confusion Matrix : \n', conf_matrix)

Akurasi Model :  0.9515695067264573
Confusion Matrix : 
 [[965   1]
 [ 53  96]]


# 9. Test Model pada email baru

In [50]:
def predict_email(email):
    cleaned_email = clean_text(email)
    features = cv.transform([cleaned_email]).toarray()
    prediction = lr.predict(features)[0]
    if prediction == 0:
        print('Not Spam')
    else :
        print('Spam')

In [51]:
predict_email('Congrats!! you win a prize from presiden scholarship US$123456')

Spam


In [53]:
predict_email('I have a legit and genuine lucrative business deal to discuss with you. Can i trust you to be a sincere partner to handle the business? For more information reply back.')

Spam
