# Email Classification

In [None]:
import pandas as pd
# Load the email dataset from CSV. Later cells rely on its 'Kategori' column
# (used as the label) and its 'Pesan' column (the message text).
df = pd.read_csv('./data/email_spam.csv')
# Print the column names, dtypes and non-null counts.
df.info()

In [None]:
# Preview five randomly selected rows of the raw data.
df.sample(5)

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Count the non-null entries of every other column within each 'Kategori'
# group (a quick look at the class balance).
df.groupby('Kategori').count()

## Preprocessing

### Membersihkan data dari karakter non-teks (angka dan tanda baca)

In [None]:
import re
import string

# Matches any single ASCII punctuation character or digit.
# string.punctuation contains regex metacharacters (\, ], ^, -), so it must be
# escaped before being placed inside a character class: unescaped, the "\]"
# pair is read as an escaped "]" and backslashes are never matched.
_NON_TEXT_PATTERN = re.compile(f'[{re.escape(string.punctuation)}0-9]')


def clean_text(text):
    """Return *text* with all ASCII punctuation characters and digits removed.

    Whitespace and every other character are left untouched.
    """
    return _NON_TEXT_PATTERN.sub('', text)
# Apply clean_text to every message, overwriting the 'Pesan' column.
df['Pesan'] = df['Pesan'].apply(clean_text)

# Spot-check five random rows after cleaning.
df.sample(5)

### Mengubah teks menjadi huruf kecil (lowercase)

In [None]:
# Lowercase every message with pandas' vectorized string accessor. Unlike a
# per-element text.lower() call, .str.lower() keeps missing values as NaN
# instead of raising on them.
df['Pesan'] = df['Pesan'].str.lower()

# Spot-check five random rows after lowercasing.
df.sample(5)

### Stemming

In [None]:
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

# Build a stemmer from the Sastrawi factory.
factory = StemmerFactory()
stemmer = factory.create_stemmer()

# Stemming the whole column is left disabled here; the next cell loads
# data/email_stemming.csv instead (presumably the saved output of this
# step -- TODO confirm).
# df['Pesan'] = df['Pesan'].apply(stemmer.stem)
# df.sample(5)

# Quick check of the stemmer on a single word.
stemmer.stem('Kedewasaan')

In [None]:
# Replace df with the contents of data/email_stemming.csv; changes made to
# the previous in-memory df are discarded. Presumably this file holds the
# stemmed messages -- TODO confirm.
df = pd.read_csv('data/email_stemming.csv')
# Preview five random rows of the reloaded data.
df.sample(5)

### Tokenisasi

In [None]:
from nltk.tokenize import word_tokenize

# Replace each message with the token list returned by NLTK's word_tokenize.
df['Pesan'] = df['Pesan'].apply(word_tokenize)
# Preview five random rows after tokenization.
df.sample(5)

### Remove Stopwords

In [None]:
# Stop-word removal is disabled in this notebook; the next cell loads
# data/email_stopword.csv instead (presumably the saved result of this
# step -- TODO confirm).
# NOTE(review): as written, stopwords.words('indonesian') is evaluated again
# for every word; if this is re-enabled, build the stop-word set once first.
# from nltk.corpus import stopwords

# remove_stopwords = lambda words: [word for word in words if word not in stopwords.words('indonesian')]

# df['Pesan'] = df['Pesan'].apply(remove_stopwords)
# df.sample(5)

In [None]:
# Replace df with the contents of data/email_stopword.csv; the tokenized
# in-memory df from the earlier cell is discarded. Presumably this file holds
# the messages with stop words removed -- TODO confirm.
df = pd.read_csv('data/email_stopword.csv')
# Preview five random rows of the reloaded data.
df.sample(5)

### TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Learn the vocabulary and IDF weights from the 'Pesan' column and turn every
# message into a TF-IDF feature vector.
# NOTE(review): this fit uses every row of df, including rows that the
# Modeling section later holds out for testing.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['Pesan'])

# Show the features produced by TF-IDF
# feature_names = vectorizer.get_feature_names_out()

# Convert the TF-IDF result to a DataFrame so it is easier to read
# tfidf_df = pd.DataFrame(X.toarray(), columns=feature_names)
vectorizer.get_feature_names_out()


## Modeling

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

y = df['Kategori']

# Split the raw messages BEFORE vectorizing so the held-out rows cannot
# influence the vocabulary or IDF weights (fitting TF-IDF on the full dataset
# leaks test-set information into the features). test_size and random_state
# are unchanged, so the same rows end up on each side of the split.
docs_train, docs_test, y_train, y_test = train_test_split(
    df['Pesan'], y, test_size=0.2, random_state=42
)

# Rebind `vectorizer` to one fitted on the training messages only, so that it
# stays consistent with the feature space the classifier is trained on.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(docs_train)
# The test messages are only transformed, never used for fitting.
X_test = vectorizer.transform(docs_test)

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

# Fit a multinomial Naive Bayes classifier on the training features
# (fit() returns the estimator itself, so the call can be chained).
clf = MultinomialNB().fit(X_train, y_train)

# Predict labels for the held-out messages and score them.
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy Score: {accuracy:.2f}")

print(report)