# Email Classification

In [None]:
import pandas as pd
# Load the raw e-mail dataset and print its column/dtype summary.
raw_csv_path = './data/email_spam.csv'
df = pd.read_csv(raw_csv_path)
df.info()

In [None]:
# Show five randomly chosen rows as a quick sanity check.
df.sample(n=5)

In [None]:
# Count missing values in each column.
df.isna().sum()

In [None]:
# Count the non-null entries of every column for each 'Kategori' value.
df.groupby(by='Kategori').count()

## Preprocessing

### Membersihkan data dari yang bukan teks (angka, tanda baca)

In [None]:
import re
import string

# Matches any single ASCII punctuation character or decimal digit.
# string.punctuation contains regex metacharacters ('\\', ']', '^', '-'), so
# it must be escaped before being placed inside a character class; without
# escaping, the '\]' sequence is read as an escaped ']' and literal
# backslashes are never matched.
_NON_TEXT_PATTERN = re.compile(f'[{re.escape(string.punctuation)}0-9]')


def clean_text(text):
    """Return *text* with all ASCII punctuation and digits 0-9 removed.

    Whitespace and every other character are left untouched.

    Args:
        text: The string to clean.

    Returns:
        A new string without punctuation or digit characters.
    """
    return _NON_TEXT_PATTERN.sub('', text)
# Run clean_text over every message, store the result back, then preview.
cleaned_messages = df['Pesan'].apply(clean_text)
df['Pesan'] = cleaned_messages

df.sample(n=5)

### Mengubah teks menjadi lowercase

In [None]:
# Lower-case every message so later steps treat words case-insensitively.
lowered_messages = df['Pesan'].apply(str.lower)
df['Pesan'] = lowered_messages
df.sample(n=5)

### Stemming

In [None]:
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

# Build a Sastrawi stemmer through its factory.
factory = StemmerFactory()
stemmer = factory.create_stemmer()

# Stem every message, write the result back, then preview a few rows.
stemmed_messages = df['Pesan'].apply(stemmer.stem)
df['Pesan'] = stemmed_messages
df.sample(n=5)

In [None]:
# Replace df with the contents of the stemming CSV.
# NOTE(review): presumably a saved copy of the stemming output; the step that
# writes this file is not shown here - confirm.
stemmed_csv_path = 'data/email_stemming.csv'
df = pd.read_csv(stemmed_csv_path)
df.sample(n=5)

### Tokenisasi

In [None]:
from nltk.tokenize import word_tokenize

# Split every message into a list of tokens with NLTK's word_tokenize.
tokenized_messages = df['Pesan'].apply(word_tokenize)
df['Pesan'] = tokenized_messages
df.sample(n=5)

### Remove Stopwords

In [None]:
from nltk.corpus import stopwords

# Stop-word sets keyed by language name, filled on first use so the NLTK
# corpus is queried once instead of once per token.
_STOPWORD_CACHE = {}


def _stopword_set(language='indonesian'):
    """Return the NLTK stop-word list for *language* as a cached frozenset."""
    if language not in _STOPWORD_CACHE:
        _STOPWORD_CACHE[language] = frozenset(stopwords.words(language))
    return _STOPWORD_CACHE[language]


def remove_stopwords(words):
    """Return the tokens of *words* that are not Indonesian stop words.

    Args:
        words: An iterable of string tokens.

    Returns:
        A new list with the surviving tokens in their original order.
    """
    # Set membership is O(1) per token; the word list itself is only built once.
    blocked = _stopword_set()
    return [word for word in words if word not in blocked]

# Filter the stop words out of every token list, then preview a few rows.
filtered_messages = df['Pesan'].apply(remove_stopwords)
df['Pesan'] = filtered_messages
df.sample(n=5)

In [None]:
# Replace df with the contents of the stop-word CSV.
# NOTE(review): presumably a saved copy of the stop-word-removal output; the
# step that writes this file is not shown here - confirm.
stopword_csv_path = 'data/email_stopword.csv'
df = pd.read_csv(stopword_csv_path)
df.sample(n=5)

### TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Learn the vocabulary and TF-IDF weights from the messages.
vectorizer = TfidfVectorizer()
messages = df['Pesan']
X = vectorizer.fit_transform(messages)

# Build a dense, labelled copy of the matrix purely for inspection.
feature_names = vectorizer.get_feature_names_out()
dense_weights = X.toarray()
tfidf_df = pd.DataFrame(data=dense_weights, columns=feature_names)
tfidf_df

## Modeling

In [None]:
from sklearn.model_selection import train_test_split

# Labels come from the 'Kategori' column; features are the dense TF-IDF matrix.
y = df['Kategori']
features = X.toarray()

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    y,
    test_size=0.2,
    random_state=42,
)

### Naive Bayes Multinomial

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Fit Multinomial Naive Bayes on the training split and predict the test split.
mnb = MultinomialNB().fit(X_train, y_train)
pred_mnb = mnb.predict(X_test)

### Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Fit a decision tree (fixed seed) on the training split and predict the test split.
dt = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
pred_dt = dt.predict(X_test)

### Regresi Logistik Biner

In [None]:
from sklearn.linear_model import LogisticRegression

# Fit logistic regression (fixed seed) on the training split and predict the test split.
lrb = LogisticRegression(random_state=42).fit(X_train, y_train)
pred_lrb = lrb.predict(X_test)

### Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Fit a random forest (fixed seed) on the training split and predict the test split.
rf = RandomForestClassifier(random_state=42).fit(X_train, y_train)
pred_rf = rf.predict(X_test)

## Model Evaluation

### Cross Validation

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix, classification_report

# Run 10-fold cross-validation for every classifier, scored by accuracy.
# cross_val_score refits a fresh copy of the estimator on each fold, so it is
# given the training split; the held-out test split is used only for the
# classification reports and confusion matrices, keeping it unseen by any
# model fitted here.
CV_FOLDS = 10

score_mnb = cross_val_score(mnb, X_train, y_train, cv=CV_FOLDS, scoring='accuracy')
score_dt = cross_val_score(dt, X_train, y_train, cv=CV_FOLDS, scoring='accuracy')
score_lrb = cross_val_score(lrb, X_train, y_train, cv=CV_FOLDS, scoring='accuracy')
score_rf = cross_val_score(rf, X_train, y_train, cv=CV_FOLDS, scoring='accuracy')

### Naive Bayes Multinomial

In [None]:
# Multinomial Naive Bayes: test-split report, per-fold and mean CV accuracy,
# and the test-split confusion matrix.
report_mnb = classification_report(y_test, pred_mnb)
matrix_mnb = confusion_matrix(y_test, pred_mnb)
mean_accuracy_mnb = score_mnb.mean() * 100

print(report_mnb)
print(f"Accuracy dari tiap fold: \n{score_mnb}")
print(f"Accuracy Score: {mean_accuracy_mnb:.2f}%")
print(f"matrix Confusion: \n{matrix_mnb}")

### Decision Tree

In [None]:
# Decision tree: test-split report, per-fold and mean CV accuracy,
# and the test-split confusion matrix.
report_dt = classification_report(y_test, pred_dt)
matrix_dt = confusion_matrix(y_test, pred_dt)
mean_accuracy_dt = score_dt.mean() * 100

print(report_dt)
print(f"Accuracy dari tiap fold: \n{score_dt}")
print(f"Accuracy Score: {mean_accuracy_dt:.2f}%")
print(f"matrix Confusion: \n{matrix_dt}")

### Regresi Logistik Biner

In [None]:
# Logistic regression: test-split report, per-fold and mean CV accuracy,
# and the test-split confusion matrix.
report_lrb = classification_report(y_test, pred_lrb)
matrix_lrb = confusion_matrix(y_test, pred_lrb)
mean_accuracy_lrb = score_lrb.mean() * 100

print(report_lrb)
print(f"Accuracy dari tiap fold: \n{score_lrb}")
print(f"Accuracy Score: {mean_accuracy_lrb:.2f}%")
print(f"matrix Confusion: \n{matrix_lrb}")

### Random Forest

In [None]:
# Random forest: test-split report, per-fold and mean CV accuracy,
# and the test-split confusion matrix.
report_rf = classification_report(y_test, pred_rf)
matrix_rf = confusion_matrix(y_test, pred_rf)
mean_accuracy_rf = score_rf.mean() * 100

print(report_rf)
print(f"Accuracy dari tiap fold: \n{score_rf}")
print(f"Accuracy Score: {mean_accuracy_rf:.2f}%")
print(f"matrix Confusion: \n{matrix_rf}")