# Email Classification

In [1]:
import pandas as pd
df = pd.read_csv('./data/email_spam.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2636 entries, 0 to 2635
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Kategori  2636 non-null   object
 1   Pesan     2636 non-null   object
dtypes: object(2)
memory usage: 41.3+ KB


In [None]:
df.sample(5)

In [None]:
df.isnull().sum()

In [None]:
df.groupby('Kategori').count()

## Preprocessing

### Membersihkan data dari yang bukan teks (angka, tanda baca)

In [None]:
import re
import string

clean_text = lambda text: re.sub(f'[{string.punctuation}0-9]', '', text)
df['Pesan'] = df['Pesan'].apply(clean_text)

df.sample(5)

### Mengubah text menjadi lowercase

In [None]:
df['Pesan'] = df['Pesan'].apply(lambda text: text.lower())
df.sample(5)

### Stemming

In [None]:
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

factory = StemmerFactory()
stemmer = factory.create_stemmer()

# df['Pesan'] = df['Pesan'].apply(stemmer.stem)
# df.sample(5)

stemmer.stem('Kedewasaan')

In [None]:
df = pd.read_csv('data/email_stemming.csv')
df.sample(5)

### Tokenisasi

In [None]:
from nltk.tokenize import word_tokenize

df['Pesan'] = df['Pesan'].apply(word_tokenize)
df.sample(5)

### Remove Stopwords

In [None]:
# from nltk.corpus import stopwords

# remove_stopwords = lambda words: [word for word in words if word not in stopwords.words('indonesian')]

# df['Pesan'] = df['Pesan'].apply(remove_stopwords)
# df.sample(5)

In [49]:
df = pd.read_csv('data/email_stopword.csv')
df.sample(5)

Unnamed: 0,Kategori,Pesan
199,spam,"['kirim', 'surat', 'gagal', 'pesan', 'kirim', ..."
2290,ham,"['hrgovcic', 'hrvoje', 'tolong', 'hancur', 'bo..."
1050,spam,"['percaya', 'identitas', 'visual', 'pikir', 'h..."
1174,spam,"['paypal', 'account', 'review', 'r', 'dear', '..."
1922,ham,"['seri', 'seminar', 'rice', 'enron', 'finance'..."


### TF-IDF

In [50]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['Pesan'])

feature_names = vectorizer.get_feature_names_out()
tfidf_df = pd.DataFrame(X.toarray(), columns=feature_names)
tfidf_df


Unnamed: 0,aa,aaa,aal,aaliyah,aall,aaron,aawesome,ab,aba,abacha,...,zwischen,zwzm,zxghlajf,zyban,zyc,zygoma,zymg,zzmacmac,zzn,zzzz
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2631,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2632,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2633,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2634,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Modeling

In [51]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

y = df['Kategori']
X_train, X_test, y_train, y_test = train_test_split(X.toarray(), y, test_size=0.2, random_state=42)

### Naive Bayes Multinomial

In [52]:
from sklearn.naive_bayes import MultinomialNB

mnb = MultinomialNB()
mnb.fit(X_train, y_train)

y_pred = mnb.predict(X_test)
print(f"Accuracy Score: {accuracy_score(y_test, y_pred):.2f}")
print(classification_report(y_test, y_pred))

Accuracy Score: 0.98
              precision    recall  f1-score   support

         ham       1.00      0.97      0.98       263
        spam       0.97      1.00      0.99       265

    accuracy                           0.98       528
   macro avg       0.99      0.98      0.98       528
weighted avg       0.99      0.98      0.98       528



### Decision Tree

In [54]:
from sklearn.tree import DecisionTreeClassifier

dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)

y_pred = dt.predict(X_test)
print(f"Accuracy Score: {accuracy_score(y_test, y_pred):.2f}")
print(classification_report(y_test, y_pred))

Accuracy Score: 0.94
              precision    recall  f1-score   support

         ham       0.94      0.94      0.94       263
        spam       0.94      0.94      0.94       265

    accuracy                           0.94       528
   macro avg       0.94      0.94      0.94       528
weighted avg       0.94      0.94      0.94       528



## Evaluation

In [58]:
from sklearn.model_selection import cross_val_score, cross_validate

# Lakukan cross-validation
scores = cross_val_score(mnb, X_test, y_test, cv=10, scoring='accuracy')  # 'cv=5' artinya 5-fold cross-validation

# Tampilkan hasilnya
print("Accuracy dari tiap fold:", scores)
print("Rata-rata accuracy:", scores.mean())
print("Standard deviation:", scores.std())

Accuracy dari tiap fold: [0.94339623 0.98113208 1.         0.9245283  0.96226415 1.
 0.98113208 1.         0.98076923 0.96153846]
Rata-rata accuracy: 0.9734760522496371
Standard deviation: 0.02418650499673794


In [60]:
from sklearn.metrics import confusion_matrix

confusion_matrix(y_test, y_pred)

array([[247,  16],
       [ 16, 249]])