In [None]:
# @title Step 1: Setup Awal dan Fungsi Utilitas
import pandas as pd
import re
import string
from nltk.tokenize import word_tokenize # Meskipun diimpor, tidak digunakan langsung di sini. Dapat dihapus jika tidak diperlukan nanti.
from sklearn.feature_extraction.text import CountVectorizer
from nltk.util import ngrams # Meskipun diimpor, tidak digunakan langsung di sini. Dapat dihapus jika tidak diperlukan nanti.
from nltk.probability import FreqDist # Meskipun diimpor, tidak digunakan langsung di sini. Dapat dihapus jika tidak diperlukan nanti.
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import numpy as np
import pickle # Untuk menyimpan/memuat model, meskipun di Colab, lebih jarang digunakan karena sesi bersifat sementara.
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt
from google.colab import drive
import os

print("Step 1: Mengimpor library dan mendefinisikan fungsi utilitas.")

# Mount Google Drive so files under /content/drive are readable from this Colab session.
drive.mount('/content/drive')
# NOTE(review): dataset_path is not read by any cell visible in this notebook;
# the loading cell builds its own path under /content/drive/MyDrive/.
dataset_path = '/content/drive/MyDrive/KULIAH/SEMESTER 6/ML/FP/dataset'
print("Google Drive berhasil di-mount.")

# Count the words in a document
def word_count(s):
    """Return how many whitespace-separated tokens the string ``s`` contains."""
    tokens = s.split()
    return len(tokens)

# Remove HTML tags from text
def striphtml(text):
    """Return ``text`` with every ``<...>`` span removed.

    Matching is non-greedy and does not cross newlines, so each tag on a line
    is removed individually and the text between tags is kept.
    """
    return re.sub('<.*?>', '', text)

print("Fungsi utilitas 'word_count' dan 'striphtml' telah didefinisikan.")

Step 1: Mengimpor library dan mendefinisikan fungsi utilitas.
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Google Drive berhasil di-mount.
Fungsi utilitas 'word_count' dan 'striphtml' telah didefinisikan.


In [None]:
print("Step 2: Memuat data dan pra-pemrosesan.")

import os
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download('stopwords')
nltk.download('wordnet')

# Redefinition of striphtml: it replaces the version from the setup cell and
# removes the same '<...>' spans.
def striphtml(text):
    """Return ``text`` with every non-greedy ``<...>`` match deleted."""
    import re
    tag_pattern = re.compile('<.*?>')
    return tag_pattern.sub('', text)
# CSV location on the mounted Google Drive.
file_path = os.path.join('/content/drive/MyDrive/', 'Twitter Dataset.csv')

# Load the dataset. Only the read itself is guarded, so the handler cannot be
# triggered by the preview code below.
try:
    df = pd.read_csv(file_path)
except FileNotFoundError:
    # Report the path that was actually tried; the previous message named a
    # different file ('twitter_dataset.csv') than the one being opened.
    print(f"ERROR: File '{file_path}' tidak ditemukan.")
    raise
else:
    print(f"Dataset dimuat. Bentuk: {df.shape}")
    print("Melihat 5 baris pertama:")
    display(df.head())

# Text cleaning resources shared by preprocess_text
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    """Normalise one tweet for vectorisation.

    HTML tags are stripped and the text is lower-cased, then split on
    whitespace. A token is kept only when it is purely alphabetic and is not in
    ``stop_words``; kept tokens are lemmatised and re-joined with single spaces.
    Tokens carrying digits or punctuation (hashtags, URLs, "word!") are dropped
    by the ``isalpha`` check.
    """
    lowered = striphtml(text).lower()
    kept_tokens = []
    for token in lowered.split():
        if token.isalpha() and token not in stop_words:
            kept_tokens.append(lemmatizer.lemmatize(token))
    return ' '.join(kept_tokens)

# Clean every tweet. The 'tweet' column is overwritten in place, so re-running
# this cell without reloading df applies preprocess_text to already-cleaned text.
df['tweet'] = df['tweet'].apply(preprocess_text)
print("Pra-pemrosesan teks selesai (menghapus HTML, lowercase, lemmatisasi, stopwords).")

# Check the label distribution
print("Distribusi label:")
print(df['label'].value_counts())

# Split the dataset positionally (no shuffling): first half / second half
midpoint = len(df) // 2
df1 = df.iloc[:midpoint]  # Used to train the oracle
df2 = df.iloc[midpoint:]  # Used for the active-learning experiments
print(f"Dataset dibagi: df1 (Oracle Training) shape={df1.shape}, df2 (AL Experiments) shape={df2.shape}")

# Initialise and fit the CountVectorizer; its vocabulary is learned from df1 only
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(lowercase=True, ngram_range=(1, 2), max_features=5000)  # unigrams + bigrams, capped at 5000 features
cv.fit(df1['tweet'])
print(f"CountVectorizer diinisialisasi dan di-fit. Jumlah fitur: {len(cv.vocabulary_)}")

Step 2: Memuat data dan pra-pemrosesan.
Dataset dimuat. Bentuk: (7920, 3)
Melihat 5 baris pertama:


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Unnamed: 0,id,label,tweet
0,1,0,#fingerprint #Pregnancy Test https://goo.gl/h1...
1,2,0,Finally a transparant silicon case ^^ Thanks t...
2,3,0,We love this! Would you go? #talk #makememorie...
3,4,0,I'm wired I know I'm George I was made that wa...
4,5,1,What amazing service! Apple won't even talk to...


Pra-pemrosesan teks selesai (menghapus HTML, lowercase, lemmatisasi, stopwords).
Distribusi label:
label
0    5894
1    2026
Name: count, dtype: int64
Dataset dibagi: df1 (Oracle Training) shape=(3960, 3), df2 (AL Experiments) shape=(3960, 3)
CountVectorizer diinisialisasi dan di-fit. Jumlah fitur: 5000


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7920 entries, 0 to 7919
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      7920 non-null   int64 
 1   label   7920 non-null   int64 
 2   tweet   7920 non-null   object
dtypes: int64(2), object(1)
memory usage: 185.8+ KB


In [None]:
print("Step 3: Pelatihan Oracle Simulator.")

# NOTE(review): TfidfVectorizer is imported here but not used in this cell.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score

# Oracle data: the first half of the dataset
sampled_data_oracle = df1
y_oracle_full = sampled_data_oracle['label']
encoder_oracle = LabelEncoder()
y_oracle_encoded = encoder_oracle.fit_transform(y_oracle_full)

# Split df1 50/50 into oracle train / test
X_train_oracle_text, X_test_oracle_text, y_train_oracle, y_test_oracle = train_test_split(
    sampled_data_oracle['tweet'], y_oracle_encoded, test_size=0.5, random_state=1
)

# Vectorise with `cv`, the CountVectorizer fitted in Step 2 (not a TfidfVectorizer),
# and densify the result with .toarray().
X_train_bow_oracle = cv.transform(X_train_oracle_text).toarray()
print(f"Data pelatihan Oracle di-vectorize. Bentuk BOW: {X_train_bow_oracle.shape}")

# Hyperparameter tuning: 3-fold grid search over C with an L2 penalty
param_grid = {'C': [0.001, 0.01, 0.1, 1], 'penalty': ['l2']}
grid = GridSearchCV(LogisticRegression(solver='liblinear', max_iter=1000), param_grid, cv=3)
grid.fit(X_train_bow_oracle, y_train_oracle)
oracle = grid.best_estimator_
print(f"Model Oracle (Logistic Regression) dilatih dengan parameter terbaik: {grid.best_params_}")

# Evaluate the oracle on the held-out half of df1
X_test_bow_oracle = cv.transform(X_test_oracle_text).toarray()
y_pred_oracle = oracle.predict(X_test_bow_oracle)
print(f"Akurasi Oracle pada set pengujian: {accuracy_score(y_test_oracle, y_pred_oracle):.4f}")

# Simulated oracle response for one (sub)instance
def answer(oracle_model, x_bow_subinstance):
    """Return the oracle's label for a vectorised subinstance, or -1 for "neutral".

    -1 is returned when the input has no rows, when all of its entries sum to
    zero, or when the oracle's uncertainty (1 minus its highest class
    probability for the first row) exceeds 0.3. Otherwise the oracle's
    predicted label for the first row is returned.
    """
    import numpy as np
    nothing_to_label = x_bow_subinstance.shape[0] == 0 or x_bow_subinstance.sum() == 0
    if nothing_to_label:
        return -1
    top_probability = np.max(oracle_model.predict_proba(x_bow_subinstance)[0])
    # Uncertainty threshold of 0.3: above it the oracle declines to answer.
    if 1 - top_probability > 0.3:
        return -1
    return oracle_model.predict(x_bow_subinstance)[0]

print("Fungsi 'answer' didefinisikan dengan ambang batas 0.3.")

# Cost mapping: key = subinstance size k (number of leading words shown),
# with -1 standing for the full text; value = cost charged for one query.
# NOTE(review): units are presumably seconds (the plots label the axis
# "Cost (seconds)") — confirm. k=75 and k=100 cost more than the full text (-1).
COST_PER_K_WORDS = {10: 5.7, 25: 8.2, 50: 10.9, 75: 15.9, 100: 16.7, -1: 15.0}
print("Pemetaan biaya didefinisikan.")

Step 3: Pelatihan Oracle Simulator.
Data pelatihan Oracle di-vectorize. Bentuk BOW: (1980, 5000)
Model Oracle (Logistic Regression) dilatih dengan parameter terbaik: {'C': 1, 'penalty': 'l2'}
Akurasi Oracle pada set pengujian: 0.8490
Fungsi 'answer' didefinisikan dengan ambang batas 0.3.
Pemetaan biaya didefinisikan.


In [None]:
print("Step 4: Pelatihan Pengklasifikasi Netralitas.")

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score  # imported here so the cell does not rely on Step 3's import
from imblearn.over_sampling import SMOTE
from scipy.sparse import vstack

# TF-IDF vectoriser used only for the neutrality classifier's features.
cv_neutrality_model = TfidfVectorizer(lowercase=True, ngram_range=(1, 2), max_features=5000)

# Split df1 50/50 into neutrality train / test texts.
X_train_neutrality_text, X_test_neutrality_text, y_train_neutrality_raw, y_test_neutrality_raw = train_test_split(
    df1['tweet'], df1['label'], test_size=0.5, random_state=42
)

cv_neutrality_model.fit(X_train_neutrality_text)

# Build the training set: for every training text and every truncation size k,
# ask the simulated oracle whether it would answer (1) or stay neutral (0).
neutrality_data = []
k_options_for_neutrality_training = [10, 25, 50, 75, 100, -1]

for k_size in k_options_for_neutrality_training:
    if k_size == -1:
        # -1 means the full text.
        X_k_text_subset = X_train_neutrality_text
    else:
        X_k_text_subset = X_train_neutrality_text.apply(lambda x: ' '.join(x.split()[:k_size]))

    # Features come from the TF-IDF vectoriser ...
    X_k_bow_subset = cv_neutrality_model.transform(X_k_text_subset)

    for i in range(X_k_bow_subset.shape[0]):
        # ... while the oracle is queried on the CountVectorizer representation
        # it was trained on.
        x_bow_single_instance = cv.transform([X_k_text_subset.iloc[i]])
        oracle_response = answer(oracle, x_bow_single_instance)
        is_non_neutral = 1 if oracle_response != -1 else 0
        neutrality_data.append({'subinstance_bow': X_k_bow_subset[i], 'is_non_neutral': is_non_neutral})

neutrality_df = pd.DataFrame(neutrality_data)

if not neutrality_df.empty:
    X_neutrality = vstack(neutrality_df['subinstance_bow'].tolist())
    y_neutrality = neutrality_df['is_non_neutral'].values

    # Oversample the minority class with SMOTE to handle class imbalance.
    smote = SMOTE(random_state=42)
    X_neutrality_balanced, y_neutrality_balanced = smote.fit_resample(X_neutrality, y_neutrality)

    neutrality_classifier = LogisticRegression(penalty='l2', C=0.01, solver='liblinear', max_iter=1000)
    neutrality_classifier.fit(X_neutrality_balanced, y_neutrality_balanced)
    print("Pengklasifikasi netralitas dilatih dengan SMOTE.")

    # Evaluation on the full-length test texts.
    # The target is whether the oracle answers, computed the same way as the
    # training labels. The previous target compared LabelEncoder output with -1,
    # which the encoder never produces, so every test row was marked
    # non-neutral and the printed accuracy did not measure anything useful.
    X_test_neutrality = cv_neutrality_model.transform(X_test_neutrality_text)
    y_test_neutrality = [
        1 if answer(oracle, cv.transform([text])) != -1 else 0
        for text in X_test_neutrality_text
    ]
    y_pred_neutrality = neutrality_classifier.predict(X_test_neutrality)
    print(f"Akurasi Pengklasifikasi Netralitas: {accuracy_score(y_test_neutrality, y_pred_neutrality):.4f}")

    def get_prob_non_neutral(neutrality_model, x_bow_subinstance):
        """Return P(non-neutral) for the first row of ``x_bow_subinstance``.

        Returns 0.0 when the input has no rows; otherwise column 1 of
        ``neutrality_model.predict_proba`` for the first row.
        """
        if x_bow_subinstance.shape[0] == 0:
            return 0.0
        return neutrality_model.predict_proba(x_bow_subinstance)[0, 1]
    print("Fungsi 'get_prob_non_neutral' didefinisikan.")
else:
    print("Peringatan: neutrality_df kosong.")
    neutrality_classifier = None

Step 4: Pelatihan Pengklasifikasi Netralitas.
Pengklasifikasi netralitas dilatih dengan SMOTE.
Akurasi Pengklasifikasi Netralitas: 0.5192
Fungsi 'get_prob_non_neutral' didefinisikan.


In [None]:
print("Step 5: Mendefinisikan fungsi inti untuk eksperimen Active Learning.")

import numpy as np

def select_subinstance(main_classifier_model, oracle_model_for_neutrality, k_options=None, utility_func='uncertainty', X_test_full_text=None, U_indices=None):
    """Pick the next unlabeled instance (and subinstance size) to query.

    Parameters
    ----------
    main_classifier_model : fitted classifier exposing ``predict_proba``;
        only used when ``utility_func == 'uncertainty'``.
    oracle_model_for_neutrality : classifier exposing ``predict_proba`` whose
        column 1 is P(non-neutral), or ``None`` to treat that probability as 1.
        Only used in the dynamic branch.
    k_options : ``None`` to score full instances only, or an iterable of
        subinstance sizes (first-k words; -1 means the full text).
    utility_func : ``'uncertainty'`` (1 - max class probability) or
        ``'constant'`` (utility 1 for every candidate).
    X_test_full_text : pandas Series of texts, indexed positionally via ``.iloc``.
    U_indices : list of positions into ``X_test_full_text`` that are still unlabeled.

    Returns
    -------
    (position_in_U_indices, k_size)
        ``(-1, -1)`` when there is nothing to select. When ``k_options`` is
        ``None`` the second element is ``k_options`` itself (i.e. ``None``).

    Raises
    ------
    ValueError
        If ``utility_func`` is not one of the two supported names.

    Relies on the notebook globals ``cv``, ``cv_neutrality_model`` and
    ``COST_PER_K_WORDS``.
    """
    if utility_func not in ('uncertainty', 'constant'):
        raise ValueError("utility_func must be 'uncertainty' or 'constant'.")

    if k_options is None:  # Full-instance selection (traditional AL)
        pool_texts = X_test_full_text.iloc[U_indices]
        pool_bow = cv.transform(pool_texts)
        if pool_bow.shape[0] == 0:
            return -1, -1

        if utility_func == 'uncertainty':
            probabilities = main_classifier_model.predict_proba(pool_bow)
            utilities = 1 - np.max(probabilities, axis=1)
        else:
            utilities = np.ones(pool_bow.shape[0])

        return int(np.argmax(utilities)), k_options

    # Dynamic AAL: enumerate every (instance, k) candidate first, then score
    # them all in one batch instead of calling the vectorisers and models once
    # per candidate.
    candidate_texts = []
    candidate_pool_positions = []
    candidate_k_sizes = []

    for pool_position, test_idx in enumerate(U_indices):
        full_text = X_test_full_text.iloc[test_idx]
        words = full_text.split()
        for k_size in k_options:
            sub_text = full_text if k_size == -1 else ' '.join(words[:k_size])
            if not sub_text.strip():
                # Empty subinstances are never offered to the oracle.
                continue
            candidate_texts.append(sub_text)
            candidate_pool_positions.append(pool_position)
            candidate_k_sizes.append(k_size)

    if not candidate_texts:
        return -1, -1

    if utility_func == 'uncertainty':
        candidate_bow = cv.transform(candidate_texts)
        utilities = 1 - np.max(main_classifier_model.predict_proba(candidate_bow), axis=1)
    else:
        utilities = np.ones(len(candidate_texts))

    if oracle_model_for_neutrality is not None:
        neutrality_features = cv_neutrality_model.transform(candidate_texts)
        prob_non_neutral = oracle_model_for_neutrality.predict_proba(neutrality_features)[:, 1]
    else:
        prob_non_neutral = np.ones(len(candidate_texts))

    # Unknown k sizes fall back to the full-text cost, matching the cost lookup
    # in run_active_learning_experiment.
    costs = np.array(
        [COST_PER_K_WORDS.get(k_size, COST_PER_K_WORDS[-1]) for k_size in candidate_k_sizes],
        dtype=float,
    )

    # Utility per unit cost, with P(non-neutral) squared to weight it more heavily.
    scores = (utilities * prob_non_neutral ** 2) / costs

    # argmax returns the first maximum, so ties resolve in enumeration order.
    best = int(np.argmax(scores))
    return candidate_pool_positions[best], candidate_k_sizes[best]

print("Fungsi 'select_subinstance' dioptimalkan dengan bobot prob_non_neutral.")

Step 5: Mendefinisikan fungsi inti untuk eksperimen Active Learning.
Fungsi 'select_subinstance' dioptimalkan dengan bobot prob_non_neutral.


In [None]:
print("Step 6: Menyiapkan data dan menjalankan eksperimen Active Learning.")

# NOTE(review): this cell calls run_active_learning_experiment, which is defined
# in a later cell of this notebook; on a fresh kernel that cell must run first.

from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
import numpy as np
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from scipy.sparse import vstack

# Filter empty texts to prevent iteration issues
df2 = df2[df2['tweet'].str.strip() != '']

# Prepare the active-learning data: a fixed random sample of 2400 rows from df2
sampled_data_al = df2.sample(n=2400, random_state=14)
y_al = sampled_data_al['label']
encoder_al = LabelEncoder()
y_al = encoder_al.fit_transform(y_al)

# Cross-validation
kf = KFold(n_splits=3, shuffle=True, random_state=42)
budget = 3000  # 50 minutes, expressed in seconds
max_iterations = 1000  # NOTE(review): assigned but never passed to the experiment below

# Store results for Step 7: one cost list / AUC list per fold and strategy
results = {'traditional_al': {'costs': [], 'aucs': []}, 'dynamic_aal': {'costs': [], 'aucs': [], 'k_sizes': []}}

for fold, (train_idx, test_idx) in enumerate(kf.split(sampled_data_al)):
    print(f"\n--- Fold {fold + 1} ---")
    X_train_al_text = sampled_data_al['tweet'].iloc[train_idx]
    X_test_al_text = sampled_data_al['tweet'].iloc[test_idx]
    y_train_al = y_al[train_idx]
    y_test_al = y_al[test_idx]

    print(f"Train set size: {len(X_train_al_text)}, Test set size: {len(X_test_al_text)}")
    print(f"Teks kosong di X_test_al_text: {(X_test_al_text == '').sum()}")

    # Vectorize the fold's training texts with the CountVectorizer from Step 2
    X_train_al_bow = cv.transform(X_train_al_text)
    print(f"Data pelatihan awal AL di-vectorize. Bentuk BOW: {X_train_al_bow.shape}")

    # Traditional AL: full instances, uncertainty utility
    print("\n--- Running Traditional AL (Unc) ---")
    cost_traditional_al, auc_traditional_al, _ = run_active_learning_experiment(
        X_train_al_bow, y_train_al, X_test_al_text, y_test_al,
        oracle, neutrality_classifier, budget,
        strategy_type='traditional_al', k_fixed=-1, utility_func='uncertainty'
    )
    results['traditional_al']['costs'].append(cost_traditional_al)
    results['traditional_al']['aucs'].append(auc_traditional_al)

    # Dynamic AAL: subinstance size chosen per query from k_options_dynamic_aal
    print("\n--- Running Dynamic AAL (Unc) ---")
    k_options_dynamic_aal = [10, 25, 50, 75, 100, -1]
    cost_dynamic_unc, auc_dynamic_unc, k_sizes_selected = run_active_learning_experiment(
        X_train_al_bow, y_train_al, X_test_al_text, y_test_al,
        oracle, neutrality_classifier, budget,
        strategy_type='dynamic', k_options_dynamic=k_options_dynamic_aal, utility_func='uncertainty'
    )
    results['dynamic_aal']['costs'].append(cost_dynamic_unc)
    results['dynamic_aal']['aucs'].append(auc_dynamic_unc)
    results['dynamic_aal']['k_sizes'].append(k_sizes_selected)

# After the loop, cost_traditional_al / auc_traditional_al / cost_dynamic_unc /
# auc_dynamic_unc hold the values of the last fold only; `results` keeps all folds.
print("\n--- Eksperimen selesai ---")

Step 6: Menyiapkan data dan menjalankan eksperimen Active Learning.

--- Fold 1 ---
Train set size: 1600, Test set size: 800
Teks kosong di X_test_al_text: 0
Data pelatihan awal AL di-vectorize. Bentuk BOW: (1600, 5000)

--- Running Traditional AL (Unc) ---
Starting traditional_al experiment. Initial Labeled: 1600, Initial AUC: 0.8286
Query 1: Cost=15.00, AUC=0.8284, Labeled=1601, U_remaining=799, k_size=None
Query 2: Cost=30.00, AUC=0.8286, Labeled=1602, U_remaining=798, k_size=None
Query 3: Cost=45.00, AUC=0.8284, Labeled=1603, U_remaining=797, k_size=None
Query 4: Cost=60.00, AUC=0.8278, Labeled=1604, U_remaining=796, k_size=None
Query 5: Cost=75.00, AUC=0.8276, Labeled=1605, U_remaining=795, k_size=None
Query 6: Cost=90.00, AUC=0.8286, Labeled=1606, U_remaining=794, k_size=None
Query 7: Cost=105.00, AUC=0.8280, Labeled=1607, U_remaining=793, k_size=None
Query 8: Cost=120.00, AUC=0.8276, Labeled=1608, U_remaining=792, k_size=None
Query 9: Cost=135.00, AUC=0.8272, Labeled=1609, U_rem

Step 7: Plotting Hasil Perbandingan

In [None]:
print("Step 7: Membuat plot perbandingan dan histogram Dynamic AAL.")

import matplotlib.pyplot as plt
import numpy as np

# Asumsi k_sizes_selected dikumpulkan di run_active_learning_experiment
# Modifikasi run_active_learning_experiment untuk mengumpulkan k_sizes_selected
def run_active_learning_experiment(
    X_train_initial_bow, y_train_initial, X_test_full_text, y_test_true,
    oracle_model, neutrality_model, budget_seconds,
    strategy_type, k_fixed=None, utility_func=None, k_options_dynamic=None
):
    """Run one budgeted active-learning loop and record AUC against spent cost.

    Parameters
    ----------
    X_train_initial_bow, y_train_initial : initial labeled set (sparse
        features from ``cv`` and their labels).
    X_test_full_text, y_test_true : pandas Series of texts and the matching
        label array. This set is used both as the unlabeled pool that queries
        are drawn from and as the set the AUC is computed on.
    oracle_model : accepted for interface compatibility; not used in the body.
    neutrality_model : passed to ``select_subinstance`` for the 'dynamic' strategy.
    budget_seconds : querying stops once the accumulated cost reaches this value.
    strategy_type : ``'dynamic'`` or ``'traditional_al'``.
    k_fixed : accepted for interface compatibility; not used in the body.
    utility_func : forwarded to ``select_subinstance``.
    k_options_dynamic : candidate subinstance sizes for the 'dynamic' strategy.

    Returns
    -------
    (cost_history, auc_history, k_sizes_selected)
        ``cost_history`` and ``auc_history`` have one entry for the initial
        model plus one per query; ``k_sizes_selected`` has one entry per query.

    Raises
    ------
    ValueError
        If ``strategy_type`` is not a supported name.

    NOTE(review): each query adds the *full* instance with its *true* label to
    the labeled set regardless of the selected k; the selected k only affects
    the cost charged. Queried instances also stay in the evaluation set.
    Confirm that both are intended.

    Relies on the notebook globals ``cv``, ``COST_PER_K_WORDS``,
    ``select_subinstance``, ``LogisticRegression``, ``roc_auc_score``,
    ``vstack`` and ``np``.
    """
    from sklearn.multiclass import OneVsRestClassifier

    if strategy_type not in ('dynamic', 'traditional_al'):
        raise ValueError("Invalid strategy_type")

    L_bow = X_train_initial_bow.copy()
    y_L = y_train_initial.copy()
    U_indices = list(range(len(X_test_full_text)))
    current_cost = 0
    auc_history = []
    cost_history = []
    k_sizes_selected = []

    main_classifier = OneVsRestClassifier(LogisticRegression(penalty='l2', C=0.01, solver='liblinear', max_iter=1000))

    # The pool/evaluation texts never change, so vectorise them once instead of
    # on every query. Rows of this matrix are also what gets added to L.
    X_test_bow_eval = cv.transform(X_test_full_text).tocsr()

    def _current_auc():
        # AUC of the current classifier on the whole pool/evaluation set.
        positive_scores = main_classifier.predict_proba(X_test_bow_eval)[:, 1]
        return roc_auc_score(y_test_true, positive_scores)

    if L_bow.shape[0] > 0:
        main_classifier.fit(L_bow, y_L)
        auc_history.append(_current_auc())
    else:
        # No initial labels: record chance-level AUC as the starting point.
        auc_history.append(0.5)
    cost_history.append(current_cost)

    print(f"Starting {strategy_type} experiment. Initial Labeled: {y_L.shape[0]}, Initial AUC: {auc_history[-1]:.4f}")

    while current_cost < budget_seconds and len(U_indices) > 0:
        if strategy_type == 'dynamic':
            neutrality_arg, k_options_arg = neutrality_model, k_options_dynamic
        else:
            # Traditional AL doesn't use neutrality or k_options
            neutrality_arg, k_options_arg = None, None

        selected_idx_in_U_pool_current, selected_k_size = select_subinstance(
            main_classifier, oracle_model_for_neutrality=neutrality_arg, k_options=k_options_arg,
            utility_func=utility_func, X_test_full_text=X_test_full_text, U_indices=U_indices
        )

        if selected_idx_in_U_pool_current == -1:
            print("No instance selected.")
            break

        # Remove the selected instance from U and recover its position in the pool.
        actual_X_test_idx = U_indices.pop(selected_idx_in_U_pool_current)
        true_label = y_test_true[actual_X_test_idx]

        # Charge the cost of the chosen subinstance size; sizes without an
        # entry (e.g. None from traditional AL) are charged as a full instance.
        current_cost += COST_PER_K_WORDS.get(selected_k_size, COST_PER_K_WORDS[-1])

        # Add the selected instance (full text) to L. List indexing keeps the
        # row two-dimensional for vstack.
        L_bow = vstack([L_bow, X_test_bow_eval[[actual_X_test_idx]]])
        y_L = np.append(y_L, true_label)

        # Retrain and re-evaluate after every query.
        main_classifier.fit(L_bow, y_L)
        auc = _current_auc()
        auc_history.append(auc)
        cost_history.append(current_cost)
        k_sizes_selected.append(selected_k_size)  # Store k_size
        print(f"Query {len(cost_history)-1}: Cost={current_cost:.2f}, AUC={auc:.4f}, Labeled={L_bow.shape[0]}, U_remaining={len(U_indices)}, k_size={selected_k_size}")

    return cost_history, auc_history, k_sizes_selected


print("Fungsi 'run_active_learning_experiment' telah didefinisikan lengkap.")

Step 7: Membuat plot perbandingan dan histogram Dynamic AAL.
Fungsi 'run_active_learning_experiment' telah didefinisikan lengkap.


In [None]:
print("Step 6: Menyiapkan data dan menjalankan eksperimen Active Learning.")

# NOTE(review): this cell repeats the experiment code of the earlier Step 6 cell;
# it is placed after the cell that defines run_active_learning_experiment.

from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
import numpy as np

# Filter empty texts to prevent iteration issues
df2 = df2[df2['tweet'].str.strip() != '']

# Prepare the active-learning data: a fixed random sample of 2400 rows from df2
sampled_data_al = df2.sample(n=2400, random_state=14)
y_al = sampled_data_al['label']
encoder_al = LabelEncoder()
y_al = encoder_al.fit_transform(y_al)

# Cross-validation
kf = KFold(n_splits=3, shuffle=True, random_state=42)
budget = 3000  # 50 minutes, expressed in seconds
max_iterations = 1000  # NOTE(review): assigned but never passed to the experiment below

# Store results for Step 7: one cost list / AUC list per fold and strategy
results = {'traditional_al': {'costs': [], 'aucs': []}, 'dynamic_aal': {'costs': [], 'aucs': [], 'k_sizes': []}}

for fold, (train_idx, test_idx) in enumerate(kf.split(sampled_data_al)):
    print(f"\n--- Fold {fold + 1} ---")
    X_train_al_text = sampled_data_al['tweet'].iloc[train_idx]
    X_test_al_text = sampled_data_al['tweet'].iloc[test_idx]
    y_train_al = y_al[train_idx]
    y_test_al = y_al[test_idx]

    print(f"Train set size: {len(X_train_al_text)}, Test set size: {len(X_test_al_text)}")
    print(f"Teks kosong di X_test_al_text: {(X_test_al_text == '').sum()}")

    # Vectorize the fold's training texts with the CountVectorizer from Step 2
    X_train_al_bow = cv.transform(X_train_al_text)
    print(f"Data pelatihan awal AL di-vectorize. Bentuk BOW: {X_train_al_bow.shape}")

    # Traditional AL: full instances, uncertainty utility
    print("\n--- Running Traditional AL (Unc) ---")
    cost_traditional_al, auc_traditional_al, _ = run_active_learning_experiment(
        X_train_al_bow, y_train_al, X_test_al_text, y_test_al,
        oracle, neutrality_classifier, budget,
        strategy_type='traditional_al', k_fixed=-1, utility_func='uncertainty'
    )
    results['traditional_al']['costs'].append(cost_traditional_al)
    results['traditional_al']['aucs'].append(auc_traditional_al)

    # Dynamic AAL: subinstance size chosen per query from k_options_dynamic_aal
    print("\n--- Running Dynamic AAL (Unc) ---")
    k_options_dynamic_aal = [10, 25, 50, 75, 100, -1]
    cost_dynamic_unc, auc_dynamic_unc, k_sizes_selected = run_active_learning_experiment(
        X_train_al_bow, y_train_al, X_test_al_text, y_test_al,
        oracle, neutrality_classifier, budget,
        strategy_type='dynamic', k_options_dynamic=k_options_dynamic_aal, utility_func='uncertainty'
    )
    results['dynamic_aal']['costs'].append(cost_dynamic_unc)
    results['dynamic_aal']['aucs'].append(auc_dynamic_unc)
    results['dynamic_aal']['k_sizes'].append(k_sizes_selected)

# After the loop, cost_traditional_al / auc_traditional_al / cost_dynamic_unc /
# auc_dynamic_unc hold the values of the last fold only; `results` keeps all folds.
print("\n--- Eksperimen selesai ---")

Step 6: Menyiapkan data dan menjalankan eksperimen Active Learning.

--- Fold 1 ---
Train set size: 1600, Test set size: 800
Teks kosong di X_test_al_text: 0
Data pelatihan awal AL di-vectorize. Bentuk BOW: (1600, 5000)

--- Running Traditional AL (Unc) ---
Starting traditional_al experiment. Initial Labeled: 1600, Initial AUC: 0.8286
Query 1: Cost=15.00, AUC=0.8284, Labeled=1601, U_remaining=799, k_size=None
Query 2: Cost=30.00, AUC=0.8286, Labeled=1602, U_remaining=798, k_size=None
Query 3: Cost=45.00, AUC=0.8284, Labeled=1603, U_remaining=797, k_size=None
Query 4: Cost=60.00, AUC=0.8278, Labeled=1604, U_remaining=796, k_size=None
Query 5: Cost=75.00, AUC=0.8276, Labeled=1605, U_remaining=795, k_size=None
Query 6: Cost=90.00, AUC=0.8286, Labeled=1606, U_remaining=794, k_size=None
Query 7: Cost=105.00, AUC=0.8280, Labeled=1607, U_remaining=793, k_size=None
Query 8: Cost=120.00, AUC=0.8276, Labeled=1608, U_remaining=792, k_size=None
Query 9: Cost=135.00, AUC=0.8272, Labeled=1609, U_rem

KeyboardInterrupt: 

In [None]:
# @title Step 7: Plotting Hasil Perbandingan
print("Step 7: Membuat plot perbandingan hasil eksperimen.")

plt.figure(figsize=(14, 9))  # Larger figure to fit many lines

# (cost variable name, AUC variable name, plot style) for every strategy curve.
# Series are looked up by name so that a strategy whose experiment has not been
# run in this session is skipped with a warning instead of raising NameError.
comparison_series = [
    # Static AAL for several values of k (uncertainty utility)
    ('cost_static_k10_unc', 'auc_static_k10_unc',
     dict(label='Static AAL (k=10, Unc)', linestyle='-', color='blue', alpha=0.8)),
    ('cost_static_k25_unc', 'auc_static_k25_unc',
     dict(label='Static AAL (k=25, Unc)', linestyle='-', color='cyan', alpha=0.8)),
    ('cost_static_k100_unc', 'auc_static_k100_unc',
     dict(label='Static AAL (k=100, Unc)', linestyle='-', color='darkblue', alpha=0.8)),
    # Traditional active learning
    ('cost_traditional_al', 'auc_traditional_al',
     dict(label='Traditional AL (Unc)', linestyle='--', color='red', linewidth=2)),
    # Dynamic AAL (uncertainty utility)
    ('cost_dynamic_unc', 'auc_dynamic_unc',
     dict(label='Dynamic AAL (Unc)', linestyle='-', color='green', linewidth=3)),
    # Dynamic AAL (constant utility)
    ('cost_dynamic_const', 'auc_dynamic_const',
     dict(label='Dynamic AAL (Const)', linestyle=':', color='brown', linewidth=3)),
]

skipped_labels = []
for cost_name, auc_name, style in comparison_series:
    if cost_name in globals() and auc_name in globals():
        plt.plot(globals()[cost_name], globals()[auc_name], **style)
    else:
        skipped_labels.append(style['label'])

if skipped_labels:
    print("Peringatan: hasil berikut belum tersedia dan dilewati: " + ", ".join(skipped_labels))

plt.xlabel("Cost (seconds)")
plt.ylabel("AUC Score")
plt.title("Perbandingan Strategi Anytime Active Learning")
plt.legend(loc='lower right')  # Legend in the lower-right corner
plt.grid(True)  # Show the grid
plt.xscale('log')  # Logarithmic cost axis, kept from the original for readability
plt.ylim(0.5, 1.0)  # Limit the Y axis from 0.5 (chance level) to 1.0 (perfect)
plt.show()

print("Plot perbandingan telah ditampilkan.")

In [None]:
# @title Step 7: Plotting Hasil Perbandingan (HANYA DYNAMIC)
print("Step 7: Membuat plot perbandingan hasil eksperimen Dynamic AAL.")

plt.figure(figsize=(10, 7))  # Figure size

# (cost variable name, AUC variable name, plot style) for the Dynamic AAL curves.
# Series are looked up by name so that a variant whose experiment has not been
# run in this session is skipped with a warning instead of raising NameError.
dynamic_series = [
    # Dynamic AAL (uncertainty utility)
    ('cost_dynamic_unc', 'auc_dynamic_unc',
     dict(label='Dynamic AAL (Unc)', linestyle='-', color='green', linewidth=2)),
    # Dynamic AAL (constant utility)
    ('cost_dynamic_const', 'auc_dynamic_const',
     dict(label='Dynamic AAL (Const)', linestyle=':', color='brown', linewidth=2)),
]

skipped_dynamic_labels = []
for cost_name, auc_name, style in dynamic_series:
    if cost_name in globals() and auc_name in globals():
        plt.plot(globals()[cost_name], globals()[auc_name], **style)
    else:
        skipped_dynamic_labels.append(style['label'])

if skipped_dynamic_labels:
    print("Peringatan: hasil berikut belum tersedia dan dilewati: " + ", ".join(skipped_dynamic_labels))

plt.xlabel("Cost (seconds)")
plt.ylabel("AUC Score")
plt.title("Perbandingan Strategi Dynamic Anytime Active Learning")
plt.legend(loc='lower right')
plt.grid(True)
plt.xscale('log')
plt.ylim(0.5, 1.0)
plt.show()

print("Plot perbandingan Dynamic AAL telah ditampilkan.")