## masukkan library

In [None]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt


In [None]:
import nltk
# Fetch the NLTK 'stopwords' corpus; stopwords.words(...) is called on it later in this file.
nltk.download('stopwords')

## load dataset


In [None]:
# Load the two source CSV files.
indo = pd.read_csv('email_spam_indo.csv')
eng = pd.read_csv('spam.csv')

# Give both frames the same column names so they can be stacked.
# NOTE(review): this assignment assumes each CSV has exactly two columns,
# ordered label then text — confirm against the files.
indo.columns = ["label", "text"]
eng.columns = ["label", "text"]

# Stack the two frames into one data set with a fresh index.
data = pd.concat([indo, eng], ignore_index=True)
data.head()

## text preprocessing

#case folding

In [None]:
import re

# Case-folding helper
def casefolding(text):
    """Lower-case *text* and strip URLs, numbers and punctuation.

    Steps run in a fixed order: lower-casing, URL removal, removal of
    (optionally signed) digit runs, removal of every character that is
    neither a word character nor whitespace, and finally trimming of
    leading/trailing whitespace. Inner whitespace is left untouched.
    """
    removal_patterns = (
        r'https?://\S+|www\.\S+',   # URLs
        r'[-+]?[0-9]+',             # numbers, with an optional sign
        r'[^\w\s]',                 # punctuation
    )
    cleaned = text.lower()
    for pattern in removal_patterns:
        cleaned = re.sub(pattern, '', cleaned)
    return cleaned.strip()
    

In [None]:
# Compare one sample before and after case folding
raw_sample = data['text'].iloc[696]  # row at position 696 of the text column
case_folding = casefolding(raw_sample)

print('Before Case Folding : ', raw_sample)
print('After Case Folding  : ', case_folding)

## normalisasi kata

In [None]:
# Normalization table: text_normalize matches words against column 'singkat'
# and takes the replacement from column 'hasil'.
key_norm = pd.read_csv('key_norm.csv')

# Single-entry cache: the table most recently used and the dict built from it.
# Keyed by object identity, so in-place edits to a table are not detected;
# rebinding `key_norm` to a new DataFrame is.
_norm_cache = {"table": None, "lookup": {}}


def _build_norm_lookup(table):
    """Return a dict mapping each 'singkat' value to its first 'hasil' value."""
    lookup = {}
    for short_form, replacement in zip(table['singkat'], table['hasil']):
        # setdefault keeps the first row of a duplicated key, matching the
        # previous `.values[0]` selection.
        lookup.setdefault(short_form, replacement)
    return lookup


def text_normalize(text, norm_table=None):
    """Replace known short forms in *text* and return the result lower-cased.

    The text is split on whitespace; every word equal to a value in the
    table's 'singkat' column is replaced by the matching 'hasil' value,
    other words are kept. Words are re-joined with single spaces.

    Parameters
    ----------
    text : str
        Text to normalize.
    norm_table : pandas.DataFrame, optional
        Table with 'singkat' and 'hasil' columns. Defaults to the
        module-level ``key_norm``.

    Returns
    -------
    str
        The normalized, lower-cased text.
    """
    table = key_norm if norm_table is None else norm_table
    # Build the dict once per table instead of scanning the whole DataFrame
    # twice for every single word.
    if _norm_cache["table"] is not table:
        _norm_cache["lookup"] = _build_norm_lookup(table)
        _norm_cache["table"] = table
    lookup = _norm_cache["lookup"]
    text = ' '.join(lookup.get(word, word) for word in text.split())
    return text.lower()
    

In [None]:
# Before/after comparison for normalization

raw_data = data['text'].iloc[696]
# Normalizes the case-folded sample held in `case_folding`, not the raw text.
word_normal = text_normalize(case_folding)

print('raw data : ', raw_data)
print('after normalisasi : ', word_normal)


## filtering

In [None]:
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords

# NLTK stop-word lists for the 'indonesian' and 'english' corpora
stopwords_ind = stopwords.words('indonesian')
stopwords_eng = stopwords.words('english')

In [None]:
# Number of entries in the Indonesian stop-word list
len(stopwords_ind)


In [None]:
# Number of entries in the English stop-word list
len(stopwords_eng)

In [None]:
# Show the Indonesian stop-word list from NLTK
stopwords_ind

In [None]:
# Show the English stop-word list from NLTK
stopwords_eng

In [None]:
# Stop-word removal setup

# Extra stop words could be added here (currently disabled)
#more_stopword = ['tsel','gb','rb', 'btw']
#stopwords_ind = stopwords
# Combined list: Indonesian entries followed by English entries
all_stopwords = stopwords_ind + stopwords_eng

def remove_stopwords(text, stopword_list=None):
    """Return *text* with every stop word removed.

    The text is split on whitespace, words found in the stop-word
    collection are dropped, and the remaining words are re-joined with
    single spaces.

    Parameters
    ----------
    text : str
        Text to filter.
    stopword_list : iterable of str, optional
        Words to remove. Defaults to the module-level ``all_stopwords``.

    Returns
    -------
    str
        The filtered text; empty if every word was a stop word.
    """
    if stopword_list is None:
        stopword_list = all_stopwords
    # Hash-based membership instead of scanning the whole list for every word.
    blocked = frozenset(stopword_list)
    return ' '.join(word for word in text.split() if word not in blocked)

In [None]:
# Show one sample raw, after case folding, and after stop-word removal
raw_sample = data['text'].iloc[696]
case_folding = casefolding(raw_sample)
stopwords_removal = remove_stopwords(case_folding)

print('raw data : ', raw_sample)
print('case folding : ', case_folding)
print('stopwords removal : ', stopwords_removal)


## stemming

In [None]:
# Reduce words to their base form
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

# Create the stemmer instance once; the stemming function reuses it
factory = StemmerFactory()
stemmer = factory.create_stemmer()

# Stemming helper for Indonesian text
def stemming(text):
    """Return *text* as stemmed by the module-level Sastrawi ``stemmer``."""
    return stemmer.stem(text)

In [None]:
# Show one sample after each step: case folding, stop-word removal, stemming
raw_sample = data['text'].iloc[696]
case_folding = casefolding(raw_sample)
stopwords_removal = remove_stopwords(case_folding)
text_stemming = stemming(stopwords_removal)

print('raw data : ', raw_sample)
print('case folding : ', case_folding)
print('stopwords removal : ', stopwords_removal)
print('text stemming : ', text_stemming)

## text preprocessing pipeline


In [None]:
# Helper that chains every text-preprocessing step

def text_preprocessing(text):
    """Run *text* through the whole cleaning pipeline and return the result.

    The steps are applied in this order: casefolding, text_normalize,
    remove_stopwords, stemming. Each step receives the previous step's output.
    """
    pipeline = (casefolding, text_normalize, remove_stopwords, stemming)
    for step in pipeline:
        text = step(text)
    return text

In [None]:
import swifter

# Run the full preprocessing pipeline on every value of the text column and
# store the result in a new 'clean_text' column.
data['clean_text'] = data['text'].swifter.apply(text_preprocessing)


In [None]:
# Save to a CSV file
# NOTE(review): index=False is not passed, so the row index is written as an
# extra leading column — confirm this is intended.
data.to_csv('dataset_clean.csv')

## feature engineering

In [None]:
# Separate the feature column from the target column
import pandas as pd

clean_data = pd.read_csv('dataset_clean.csv')
# Drop rows where the cleaned text or the label is missing
clean_data = clean_data.dropna(subset=['clean_text', 'label'])
x = clean_data['clean_text']  # feature: preprocessed text
y = clean_data['label']       # target: label column
x.shape, y.shape

In [None]:
# Display the feature series
x

In [None]:
# Display the target series
y

## feature extraction (TF-IDF dan N-Gram)

In [None]:
# Fit the TF-IDF vectorizer and save its vocabulary
import pickle

# TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer

# Unigram features only
vec_TF_IDF = TfidfVectorizer(ngram_range=(1,1))

# fit_transform learns the vocabulary and IDF weights and returns the
# document-term matrix in one pass (same result as fit followed by transform).
x_tf_idf = vec_TF_IDF.fit_transform(x)

# The with-block closes the file handle even if pickling fails; the previous
# bare open(...) call never closed it.
with open("feature_tf-idf.sav", "wb") as vocab_file:
    pickle.dump(vec_TF_IDF.vocabulary_, vocab_file)

In [None]:
# Show the TF-IDF vocabulary (term -> column index)
vec_TF_IDF.vocabulary_

In [None]:
# Show the number of features
print(len(vec_TF_IDF.get_feature_names_out()))

In [None]:
# Show the feature names
print(vec_TF_IDF.get_feature_names_out())

In [None]:
# Convert the TF-IDF matrix to a dense array and wrap it in a DataFrame with
# one column per feature name.
# NOTE(review): .toarray() materializes the full documents-by-vocabulary
# matrix in memory — confirm this fits for the real data size.
x1 = vec_TF_IDF.transform(x).toarray()
data_tabular_tf_idf = pd.DataFrame(data=x1, columns=vec_TF_IDF.get_feature_names_out())
data_tabular_tf_idf 

In [None]:
# Peek at rows 10-19 and columns 60-69 of the table
data_tabular_tf_idf.iloc[10:20,60:70]

## feature selection

In [None]:
# Inputs for feature selection: the whole TF-IDF table and all labels.
# NOTE(review): these names are rebound by the later train_test_split call;
# here they cover every row, so selection is fitted on rows that later end up
# in the test split — confirm this is acceptable.
x_train = np.array(data_tabular_tf_idf)
y_train = np.array(y)

In [None]:

from sklearn.feature_selection import SelectKBest, chi2

# Keep at most 5000 features ranked by chi-square score. k is clamped to the
# number of available columns because a k larger than n_features is invalid
# (it raises or warns, depending on the scikit-learn version).
chi2_features = SelectKBest(chi2, k=min(5000, x_train.shape[1]))
x_kbest_features = chi2_features.fit_transform(x_train, y_train)

# Feature count before and after the reduction
print('Original feature number:', x_train.shape[1])
print('Reduced feature number:', x_kbest_features.shape[1])




In [None]:
# gunakan chi2_features.scores_, bukan x_kbest_features.scores_
data = pd.DataFrame(chi2_features.scores_, columns=['Nilai'])
data

In [None]:
# menampilkan feature beserta nilainya

feature = vec_TF_IDF.get_feature_names_out()
feature

data['Fitur'] = feature
data

In [None]:
#mengurutkan fitur berdasarkan nilai tertinggi
data.sort_values(by=['Nilai'], ascending=False)


In [None]:
# Support mask from the fitted selector; it is zipped with the feature names
# below to pick the selected ones.
mask = chi2_features.get_support()
mask

In [None]:
# Collect the names of the features kept by the chi-square selection
# (the entries whose mask value is true).

# Built in a single expression: the previous loop bound its loop variable to
# the name `bool` at module level (hiding the builtin for every later cell)
# and left selected_features undefined when the loop body never ran.
new_features = [name for name, keep in zip(feature, mask) if keep]
selected_features = new_features
selected_features

In [None]:
# Restrict the TF-IDF vocabulary to the selected feature names.
# A set gives constant-time membership tests; testing against the list made
# this loop quadratic in the vocabulary size.
selected_feature_set = set(selected_features)

# NOTE(review): the values are the column indices of the full vocabulary,
# not positions within the reduced feature matrix — confirm that whatever
# loads this mapping expects that.
new_selected_features = {
    term: index
    for term, index in vec_TF_IDF.vocabulary_.items()
    if term in selected_feature_set
}

new_selected_features

In [None]:
len(new_selected_features)

In [None]:
pickle.dump(new_selected_features,open("new_selected_features_tf-idf.sav","wb"))

In [None]:
# Table of the reduced feature matrix, one column per selected feature name
data_selected_features = pd.DataFrame(x_kbest_features, columns=selected_features)
data_selected_features

## modeling

In [None]:
#import library u/ modeling
import random
from sklearn.model_selection import train_test_split

#import algorithm naive bayes
from sklearn.naive_bayes import MultinomialNB

In [None]:

selected_x = data_selected_features  
selected_y = y   # target label



# Suppose the dataset is already available in the x and y variables
# Hold out 20% of the rows for testing; random_state fixes the shuffle.
x_train, x_test, y_train, y_test = train_test_split(
    selected_x, selected_y, test_size=0.2, random_state=42
)

# Row counts of each split
print('banyaknya x_train  :', len(x_train))
print('banyaknya x_test   :', len(x_test))
print('banyaknya y_train  :', len(y_train))
print('banyaknya y_test   :', len(y_test))


In [None]:

# Train a multinomial Naive Bayes classifier on the training split
nb_model = MultinomialNB()
nb_model.fit(x_train, y_train)

# Predict labels for the test split
y_pred = nb_model.predict(x_test)

## evaluasi model

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import matplotlib.pyplot as plt

# Predict on the test data
y_pred = nb_model.predict(x_test)

# Evaluation: accuracy, confusion matrix and per-class report
print("Akurasi :", accuracy_score(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))



In [None]:
# Save the trained model; the with-block closes the file handle, which the
# previous bare open(...) call never did.
with open("model_email.sav", "wb") as model_file:
    pickle.dump(nb_model, model_file)