# Klasifikasi Data

In [8]:
import pandas as pd
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

In [9]:
# 1. Load Data
# Pastikan file csv berada di folder yang sama dengan script ini
df = pd.read_csv('data_bersih_filter_manual.csv')

print(f"Total data awal: {len(df)}")

Total data awal: 1473


##Filter data

In [10]:
# 2. Filter Data Labeled
# Kita membuang baris yang kolom 'label_manual'-nya kosong (NaN)
df_labeled = df.dropna(subset=['label_manual']).copy()

# Opsional: Mengubah format label dari float (1.0) menjadi integer (1) agar lebih rapi
df_labeled['label_manual'] = df_labeled['label_manual'].astype(int)

print(f"Total data setelah filter (hanya yang berlabel): {len(df_labeled)}")
print("Sebaran label:\n", df_labeled['label_manual'].value_counts())

Total data setelah filter (hanya yang berlabel): 267
Sebaran label:
 label_manual
 1    103
-1    101
 0     63
Name: count, dtype: int64


#Eksperimen 1 & 2

##Stemming

In [11]:
# 3. Proses Stemming dengan Sastrawi
# Membuat stemmer
factory = StemmerFactory()
stemmer = factory.create_stemmer()

# Fungsi untuk stemming satu kalimat
def stemming_text(text):
    if pd.isna(text): # Cek jika text kosong
        return ""
    return stemmer.stem(text)

# Terapkan ke kolom 'text_clean'
# Kita buat kolom baru 'text_stemmed' agar data asli tidak hilang
df_labeled['text_stemmed'] = df_labeled['text_clean'].apply(stemming_text)

In [12]:
# 4. Lihat Hasil
print("\nContoh hasil stemming:")
print(df_labeled[['text_clean', 'text_stemmed']].head())


Contoh hasil stemming:
                                          text_clean  \
1  bubar aja makan bergizi gratis itu bagus di ka...   
2  wajib diapresiasi presiden prabowo konsisten p...   
3                                     astaghfirullah   
6  saya aman makan bergizi gratis gratis wajah te...   
7  sdn randuagung utara dapat membagi banyak yang...   

                                        text_stemmed  
1  bubar aja makan gizi gratis itu bagus di kasih...  
2  wajib apresiasi presiden prabowo konsisten pri...  
3                                     astaghfirullah  
6  saya aman makan gizi gratis gratis wajah senyu...  
7   sdn randuagung utara dapat bagi banyak yang enak  


In [13]:
# Pastikan tidak ada nilai null di kolom text
df_labeled['text_stemmed'] = df_labeled['text_stemmed'].fillna('')

In [14]:
X = df_labeled['text_stemmed']
y = df_labeled['label_manual']

##Split Data

In [15]:
# 2. Split Data (80% Latih, 20% Uji)
# Karena data sedikit, kita pakai random_state agar hasil konsisten
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## TF-IDF

In [16]:
# 3. Ekstrasi Fitur: TF-IDF
tfidf = TfidfVectorizer(max_features=1000) # Ambil 1000 kata terpenting saja
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

## Naive Bayes

In [17]:
# --- Eksperimen 1: Naive Bayes ---
nb = MultinomialNB()
nb.fit(X_train_tfidf, y_train)
y_pred_nb = nb.predict(X_test_tfidf)
print("=== Hasil Naive Bayes ===")
print(f"Akurasi: {accuracy_score(y_test, y_pred_nb):.2f}")
print(classification_report(y_test, y_pred_nb))

=== Hasil Naive Bayes ===
Akurasi: 0.50
              precision    recall  f1-score   support

          -1       0.45      0.78      0.57        18
           0       0.00      0.00      0.00        14
           1       0.57      0.59      0.58        22

    accuracy                           0.50        54
   macro avg       0.34      0.46      0.38        54
weighted avg       0.38      0.50      0.43        54



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## SVM

In [18]:
# --- Eksperimen 2: SVM ---
svm = SVC(kernel='linear') # Kernel linear biasanya bagus untuk teks
svm.fit(X_train_tfidf, y_train)
y_pred_svm = svm.predict(X_test_tfidf)
print("\n=== Hasil SVM ===")
print(f"Akurasi: {accuracy_score(y_test, y_pred_svm):.2f}")
print(classification_report(y_test, y_pred_svm))


=== Hasil SVM ===
Akurasi: 0.50
              precision    recall  f1-score   support

          -1       0.42      0.78      0.55        18
           0       0.25      0.07      0.11        14
           1       0.71      0.55      0.62        22

    accuracy                           0.50        54
   macro avg       0.46      0.46      0.43        54
weighted avg       0.49      0.50      0.46        54

