In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report

# Import beberapa model ML
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

In [4]:
# Load dataset
df = pd.read_csv('./tweets_with_label.csv')
df.head()

Unnamed: 0,username,content,timestamp,clean_text,tokens,filtered_tokens,sentiment
0,@sigittricahyono,"Lebih dari itu, masih banyak program lain yg s...",2025-04-24T06:59:11.000Z,lebih dari itu masih banyak program lain yg se...,"['lebih', 'dari', 'itu', 'masih', 'banyak', 'p...","['program', 'menimbulkan', 'efek', 'domino', '...",negative
1,@hejin234,Rapat Dpr yg membahas anggaran pendidikan dila...,2025-04-24T05:03:32.000Z,rapat dpr yg membahas anggaran pendidikan dila...,"['rapat', 'dpr', 'yg', 'membahas', 'anggaran',...","['rapat', 'dpr', 'membahas', 'anggaran', 'pend...",negative
2,@idthreedots,ini tu salah satu bukti ketololan pendidikan k...,2025-04-24T00:32:49.000Z,ini tu salah satu bukti ketololan pendidikan k...,"['ini', 'tu', 'salah', 'satu', 'bukti', 'ketol...","['salah', 'bukti', 'ketololan', 'pendidikan', ...",negative
3,@Joyparksyoungs,Lagi2 dampak efisiensi anggaran pendidikan. Ba...,2025-04-23T16:00:04.000Z,lagi dampak efisiensi anggaran pendidikan baru...,"['lagi', 'dampak', 'efisiensi', 'anggaran', 'p...","['dampak', 'efisiensi', 'anggaran', 'pendidika...",negative
4,@anxtrousm,"""kesenjangan pendidikan"" dan kebijakan pemerin...",2025-04-23T13:11:29.000Z,kesenjangan pendidikan dan kebijakan pemerinta...,"['kesenjangan', 'pendidikan', 'dan', 'kebijaka...","['kesenjangan', 'pendidikan', 'kebijakan', 'pe...",negative


In [5]:
# Pilih kolom yang dibutuhkan
X_raw = df['filtered_tokens']  # Bisa juga 'filtered_tokens' jika sudah preprocessed
y_raw = df['sentiment']

# Encoding label target
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y_raw)

# TF-IDF vectorization
tfidf = TfidfVectorizer(max_features=5000)
X = tfidf.fit_transform(X_raw)

In [6]:
# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Dictionary model
models = {
    'Logistic Regression': LogisticRegression(max_iter=200),
    'Naive Bayes': MultinomialNB(),
    'SVM': LinearSVC(),
    'Random Forest': RandomForestClassifier(),
    'KNN': KNeighborsClassifier()
}

In [10]:
# Tempat menyimpan semua report
all_reports = []

# Training & evaluasi model
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Ambil report sebagai dictionary
    report = classification_report(y_test, y_pred, target_names=label_encoder.classes_, output_dict=True)

    # Ubah report ke DataFrame dan tambahkan kolom nama model
    report_df = pd.DataFrame(report).transpose()
    report_df['model'] = name  # Tambahkan nama model untuk referensi

    # Simpan untuk nanti
    all_reports.append(report_df.reset_index())

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


              precision    recall  f1-score   support

    negative       0.75      1.00      0.86        30
     neutral       0.00      0.00      0.00        10

    accuracy                           0.75        40
   macro avg       0.38      0.50      0.43        40
weighted avg       0.56      0.75      0.64        40

              precision    recall  f1-score   support

    negative       0.75      1.00      0.86        30
     neutral       0.00      0.00      0.00        10

    accuracy                           0.75        40
   macro avg       0.38      0.50      0.43        40
weighted avg       0.56      0.75      0.64        40

              precision    recall  f1-score   support

    negative       0.79      1.00      0.88        30
     neutral       1.00      0.20      0.33        10

    accuracy                           0.80        40
   macro avg       0.89      0.60      0.61        40
weighted avg       0.84      0.80      0.75        40

              preci

In [9]:
# Gabungkan semua report
final_report = pd.concat(all_reports, ignore_index=True)

# Simpan ke satu file CSV
final_report.to_csv('all_classification_reports.csv', index=False)