In [20]:
import os
import re
import time
import pandas as pd
from tqdm import tqdm
from dotenv import load_dotenv
import google.genai as genai

In [21]:
# Cleaned per-outlet article dumps; all are stacked into one frame with a fresh 0..n-1 index.
file_paths = [
    'df_antara_cleaned.csv',
    'df_merdeka_cleaned.csv',
    'df_okezone_cleaned.csv',
    'df_republika_cleaned.csv',
]
df = pd.concat(map(pd.read_csv, file_paths), ignore_index=True)

In [22]:
# Show column names, dtypes, and non-null counts of the combined frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1615 entries, 0 to 1614
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   url         1615 non-null   object
 1   domain      1615 non-null   object
 2   title       1615 non-null   object
 3   date        1615 non-null   object
 4   clean_text  1615 non-null   object
dtypes: object(5)
memory usage: 63.2+ KB


In [23]:
# Preview the first rows of the combined frame.
df.head()

Unnamed: 0,url,domain,title,date,clean_text
0,https://www.antaranews.com/berita/5182401/teru...,antaranews.com,"Terus meroket, emas di Pegadaian ada yang sent...",2025-10-18,harga emas yang dikutip dari laman resmi sahab...
1,https://www.antaranews.com/berita/5135121/harg...,antaranews.com,Harga tiga produk emas di Pegadaian Jumat ini ...,2025-09-26,harga emas yang dikutip dari laman resmi sahab...
2,https://www.antaranews.com/berita/5014569/harg...,antaranews.com,"Harga terbaru emas Pegadaian, Galeri24-Antam t...",2025-08-05,emas galeri24 turun rp2.000 dari angka rp1.932...
3,https://www.antaranews.com/berita/4857989/harg...,antaranews.com,Harga emas di Pegadaian pada 26 Mei kompak stabil,2025-05-26,harga emas yang dikutip dari laman resmi pegad...
4,https://www.antaranews.com/berita/5123476/harg...,antaranews.com,"Harga emas Antam, Galeri24, UBS di Pegadaian h...",2025-09-21,harga emas yang dikutip dari laman resmi sahab...


In [None]:
# Load all keys from the environment / .env file (values in .env override the environment).
load_dotenv(override=True)

# Keys are read from GEMINI_KEY_1 .. GEMINI_KEY_6. os.getenv returns None for
# unset variables, so unset or empty entries are dropped here; otherwise key
# rotation could land on a missing key in the middle of a long labeling run.
API_KEYS = [
    key
    for key in (os.getenv(f"GEMINI_KEY_{n}") for n in range(1, 7))
    if key
]
if not API_KEYS:
    raise RuntimeError(
        "No Gemini API key found. Set GEMINI_KEY_1 ... GEMINI_KEY_6 "
        "in the environment or the .env file."
    )
MODEL_ID = "models/gemini-2.5-flash"

# Initialize the client with the first available key; key_index tracks which
# entry of API_KEYS the current client was built from.
key_index = 0
client = genai.Client(api_key=API_KEYS[key_index])

# Utility
def switch_api_key():
    """Rotate the module-level ``client`` to the next usable API key.

    Scans ``API_KEYS`` after the current ``key_index`` and rebuilds ``client``
    with the first entry that is actually set. Entries that are ``None`` or
    empty (e.g. an environment variable that was never defined) are skipped
    instead of being handed to ``genai.Client``.

    Returns:
        bool: True if a new key was activated, False if no usable key is left.
        ``key_index`` and ``client`` are only modified when True is returned.
    """
    global key_index, client
    for candidate in range(key_index + 1, len(API_KEYS)):
        new_key = API_KEYS[candidate]
        if not new_key:
            # Unset key slot: nothing to switch to, try the next one.
            continue
        key_index = candidate
        client = genai.Client(api_key=new_key)
        print(f"Switched to API key #{key_index+1}")
        return True

    print("All API keys exhausted.")
    return False

In [None]:
# Batch classify function 
def _parse_batch_response(raw_output, n):
    """Place each numbered ``N. Label | reason`` line into slot N-1 of ``n`` slots.

    Slots the model did not answer keep the default ("Neutral" plus a
    placeholder reason). Item numbers outside 1..n and repeated numbers are
    ignored, and labels outside the three allowed values collapse to "Neutral".
    """
    valid_labels = ("Inflation", "Deflation", "Neutral")
    labels = ["Neutral"] * n
    reasons = ["Tidak ada alasan diberikan."] * n
    filled = [False] * n

    # The optional asterisks tolerate markdown-bold labels such as **Inflation**.
    matches = re.findall(
        r'^\s*(\d+)\.\s*\**([A-Za-z]+)\**\s*\|\s*(.+)$',
        raw_output,
        flags=re.MULTILINE,
    )
    for number, label, reason in matches:
        pos = int(number) - 1
        if not 0 <= pos < n or filled[pos]:
            continue
        label = label.capitalize()
        if label not in valid_labels:
            label = "Neutral"
        labels[pos] = label
        reasons[pos] = reason.strip()
        filled[pos] = True
    return labels, reasons


def batch_classify_inflation(texts, max_retries=3):
    """Classify a batch of news texts as Inflation / Deflation / Neutral.

    All texts (each truncated to 1000 characters) are sent in one numbered
    prompt to the module-level ``client``. The numbered answer lines are mapped
    back to the inputs by their item number, so a skipped or reordered answer
    cannot shift labels onto the wrong article.

    Args:
        texts: Sequence of article texts. Non-string items are converted with
            ``str()`` before truncation.
        max_retries: Number of attempts allowed per API key for non-quota
            errors. The counter restarts after a successful key switch.

    Returns:
        tuple[list[str], list[str]]: ``(labels, reasons)``, both exactly
        ``len(texts)`` long. Items the model did not answer, and whole batches
        whose retries all failed, are returned as "Neutral" -- callers cannot
        tell those apart from genuine Neutral labels by the label alone.

    Raises:
        RuntimeError: When a quota error occurs and ``switch_api_key()``
            reports that no further key is available.
    """
    n = len(texts)
    if n == 0:
        # Nothing to classify; avoid spending a request on an empty prompt.
        return [], []

    joined = "\n\n".join(f"{i+1}. {str(t)[:1000]}" for i, t in enumerate(texts))
    prompt = f"""
    Kamu adalah analis ekonomi makro yang menilai berita tentang inflasi di Indonesia khususnya pada index Indeks Harga Konsumen (CPI) di Indonesia.

    Untuk setiap teks berita berikut, berikan:
    - Label: Inflation, Deflation, atau Neutral
    - Alasan singkat (1 kalimat) mengapa kamu memilih label itu.

    Format jawaban (gunakan tanda | sebagai pemisah label dan alasan):
    1. Inflation | Kenaikan harga minyak mendorong tekanan inflasi.
    2. Deflation | Penurunan permintaan menyebabkan harga turun.
    3. Neutral | Hanya laporan data tanpa indikasi tekanan harga.

    Teks berita:
    {joined}
    """

    attempt = 0
    while attempt < max_retries:
        try:
            response = client.models.generate_content(
                model=MODEL_ID,
                contents=prompt,
            )

            # Prefer the convenience .text attribute, then the first candidate
            # part, and finally the raw string form of the response.
            raw_output = getattr(response, "text", None)
            candidates = getattr(response, "candidates", None)
            if raw_output is None and candidates:
                raw_output = candidates[0].content.parts[0].text
            if raw_output is None:
                raw_output = str(response)

            return _parse_batch_response(raw_output, n)

        except Exception as e:
            err = str(e)
            print(f"Exception in batch_classify_inflation: {type(e).__name__} → {err}")

            if "RESOURCE_EXHAUSTED" in err:
                print(f"API key #{key_index+1} quota exhausted.")
                if not switch_api_key():
                    raise RuntimeError("All keys exhausted – stop job now.") from e
                time.sleep(10)
                # A fresh key gets a fresh retry budget; looping (instead of
                # recursing) keeps the caller's max_retries in effect.
                attempt = 0
                continue

            if "503" in err or "UNAVAILABLE" in err:
                wait = 60 * (attempt + 1)
                print(f"Gemini overloaded, waiting {wait}s before retry...")
                time.sleep(wait)
            else:
                print("Unhandled error, retrying after short delay...")
                time.sleep(5)
            attempt += 1

    print("All retries failed. Defaulting to Neutral.")
    return ["Neutral"] * n, ["Server overloaded or quota limit."] * n

In [26]:
def _save_checkpoint(df, checkpoint_path):
    """Write ``df`` to ``checkpoint_path`` via a temporary file plus ``os.replace``."""
    tmp_path = checkpoint_path + ".tmp"
    df.to_csv(tmp_path, index=False)
    os.replace(tmp_path, checkpoint_path)


def safe_batch_label(df, batch_size=5, checkpoint_path="checkpoint_bisnis.csv",
                     delay_sec=6, retry_wait=60, max_retries=3,
                     checkpoint_every=100,
                     out_path="fidelity_check_model/data/df_labeled.csv"):
    """Label ``df["clean_text"]`` in batches with periodic checkpoints and resume.

    ``df`` is modified in place (columns ``label`` and ``label_reason`` are
    added or filled) and is also returned. Rows are addressed by position
    throughout, so the frame does not need a default RangeIndex.

    Args:
        df: Frame with a ``clean_text`` column.
        batch_size: Number of rows sent per call to ``batch_classify_inflation``.
        checkpoint_path: CSV used to persist progress. If it exists and has the
            same number of rows as ``df``, its label columns are restored and
            labeling resumes at the first unlabeled row.
        delay_sec: Seconds to sleep after every batch.
        retry_wait: Currently unused; kept so existing callers keep working.
        max_retries: Forwarded to ``batch_classify_inflation``.
        checkpoint_every: Write a checkpoint once at least this many rows have
            been labeled since the previous one (and always after the last batch).
        out_path: Destination CSV for the fully labeled frame; missing parent
            directories are created.

    Returns:
        The same ``df`` object. When all API keys are exhausted the function
        saves a checkpoint and returns early without writing ``out_path``.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    for col in ("label", "label_reason"):
        if col not in df.columns:
            df[col] = pd.Series([None] * len(df), index=df.index, dtype=object)

    start_i = 0

    # Resume safely
    if os.path.exists(checkpoint_path):
        print(f"Resuming from checkpoint: {checkpoint_path}")
        df_ckpt = pd.read_csv(checkpoint_path)
        if len(df_ckpt) == len(df):
            for col in ["label", "label_reason"]:
                if col in df_ckpt.columns:
                    # Copy by position and as object dtype: avoids index
                    # alignment surprises and keeps an all-empty column
                    # writable with strings.
                    df[col] = df_ckpt[col].astype(object).to_numpy()
        else:
            print("Checkpoint row count differs from the input frame; ignoring its labels.")
        unlabeled = df["label"].isna().to_numpy()
        if unlabeled.any():
            # Position (not index label) of the first unlabeled row.
            start_i = int(unlabeled.argmax())
        else:
            print("All rows already labeled — nothing to resume.")
            return df
        print(f"Resuming from row {start_i}")

    label_col = df.columns.get_loc("label")
    reason_col = df.columns.get_loc("label_reason")
    total = len(df)
    rows_since_checkpoint = 0

    # Main loop
    for i in tqdm(range(start_i, total, batch_size)):
        batch = df["clean_text"].iloc[i:i+batch_size].tolist()
        n_batch = len(batch)

        try:
            labels, reasons = batch_classify_inflation(batch, max_retries=max_retries)
        except RuntimeError as e:
            print(f"{e}. Saving checkpoint and stopping.")
            _save_checkpoint(df, checkpoint_path)
            print(f"Saved progress before stopping at row {i}.")
            return df

        if not labels:
            labels = ["Neutral"] * n_batch
            reasons = ["Failed request"] * n_batch

        # Pad or truncate so both lists match the batch exactly.
        labels = (list(labels) + ["Neutral"] * n_batch)[:n_batch]
        reasons = (list(reasons) + ["Unknown"] * n_batch)[:n_batch]

        df.iloc[i:i+n_batch, label_col] = labels
        df.iloc[i:i+n_batch, reason_col] = reasons

        # Checkpoint by rows processed rather than by `i % 100`, which never
        # fires when a resume offset is not aligned with the batch size.
        rows_since_checkpoint += n_batch
        is_last_batch = i + batch_size >= total
        if rows_since_checkpoint >= checkpoint_every or is_last_batch:
            _save_checkpoint(df, checkpoint_path)
            rows_since_checkpoint = 0
            print(f"Checkpoint saved safely at row {i}")

        time.sleep(delay_sec)

    # Create the full parent directory of the output file, not just its top level.
    out_dir = os.path.dirname(out_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    df.to_csv(out_path, index=False)
    print(f"Labeling complete → saved to {out_path}")
    return df

In [27]:
# Run the batch labeling with default settings. safe_batch_label fills the
# label columns on `df` in place and returns that same frame; if the
# checkpoint file exists, labeling resumes from the first unlabeled row.
df_labeled = safe_batch_label(df)

Resuming from checkpoint: checkpoint_bisnis.csv
Resuming from row 205


  0%|          | 0/282 [00:00<?, ?it/s]

  7%|▋         | 19/282 [05:00<1:03:03, 14.39s/it]

Checkpoint saved safely at row 300


 14%|█▍        | 39/282 [10:38<1:09:54, 17.26s/it]

Checkpoint saved safely at row 400


 21%|██        | 59/282 [15:54<57:54, 15.58s/it]  

Checkpoint saved safely at row 500


 28%|██▊       | 79/282 [21:12<49:18, 14.57s/it]  

Checkpoint saved safely at row 600


 35%|███▌      | 99/282 [26:36<47:09, 15.46s/it]  

Checkpoint saved safely at row 700


 42%|████▏     | 119/282 [31:47<48:45, 17.95s/it]

Checkpoint saved safely at row 800


 49%|████▉     | 139/282 [37:13<39:21, 16.51s/it]

Checkpoint saved safely at row 900


 56%|█████▋    | 159/282 [42:41<34:25, 16.79s/it]

Checkpoint saved safely at row 1000


 63%|██████▎   | 179/282 [47:23<22:46, 13.27s/it]

Checkpoint saved safely at row 1100


 71%|███████   | 199/282 [52:21<21:11, 15.32s/it]

Checkpoint saved safely at row 1200


 73%|███████▎  | 207/282 [54:40<22:44, 18.19s/it]

Exception in batch_classify_inflation: ClientError → 429 RESOURCE_EXHAUSTED. {'error': {'code': 429, 'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. To monitor your current usage, head to: https://ai.dev/usage?tab=rate-limit. \n* Quota exceeded for metric: generativelanguage.googleapis.com/generate_content_free_tier_requests, limit: 250, model: gemini-2.5-flash\nPlease retry in 1.686629862s.', 'status': 'RESOURCE_EXHAUSTED', 'details': [{'@type': 'type.googleapis.com/google.rpc.Help', 'links': [{'description': 'Learn more about Gemini API quotas', 'url': 'https://ai.google.dev/gemini-api/docs/rate-limits'}]}, {'@type': 'type.googleapis.com/google.rpc.QuotaFailure', 'violations': [{'quotaMetric': 'generativelanguage.googleapis.com/generate_content_free_tier_requests', 'quotaId': 'GenerateRequestsPerDayPerProjectPerModel-FreeTier', 'quotaDimensions': {'

 78%|███████▊  | 219/282 [57:41<15:27, 14.72s/it]

Checkpoint saved safely at row 1300


 85%|████████▍ | 239/282 [1:02:38<10:58, 15.30s/it]

Checkpoint saved safely at row 1400


 92%|█████████▏| 259/282 [1:07:41<05:42, 14.88s/it]

Checkpoint saved safely at row 1500


 99%|█████████▉| 279/282 [1:12:41<00:46, 15.52s/it]

Checkpoint saved safely at row 1600


100%|█████████▉| 281/282 [1:13:15<00:16, 16.35s/it]

Checkpoint saved safely at row 1610


100%|██████████| 282/282 [1:13:31<00:00, 15.64s/it]

Labeling complete → saved to fidelity_check_model/data/df_labeled.csv





In [28]:
# Check the schema and non-null counts of the labeled frame.
df_labeled.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1615 entries, 0 to 1614
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   url           1615 non-null   object
 1   domain        1615 non-null   object
 2   title         1615 non-null   object
 3   date          1615 non-null   object
 4   clean_text    1615 non-null   object
 5   label         1615 non-null   object
 6   label_reason  1615 non-null   object
dtypes: object(7)
memory usage: 88.4+ KB


In [31]:
# Preview text, label, and reason from the returned frame instead of relying
# on `df` having been mutated in place by the labeling call.
df_labeled[['clean_text', 'label', 'label_reason']].head()

Unnamed: 0,clean_text,label,label_reason
0,harga emas yang dikutip dari laman resmi sahab...,Neutral,Pergerakan harga emas adalah indikator pasar a...
1,harga emas yang dikutip dari laman resmi sahab...,Neutral,Perubahan harga emas merupakan fluktuasi pada ...
2,emas galeri24 turun rp2.000 dari angka rp1.932...,Neutral,Harga emas adalah aset investasi yang tidak te...
3,harga emas yang dikutip dari laman resmi pegad...,Neutral,Stabilitas harga emas mencerminkan pasar aset ...
4,harga emas yang dikutip dari laman resmi sahab...,Neutral,"Kenaikan harga emas, sebagai aset investasi, t..."


In [32]:
# Class distribution of the assigned labels, read from the returned frame.
df_labeled['label'].value_counts()

label
Neutral      1139
Inflation     261
Deflation     215
Name: count, dtype: int64

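The resulting labels for the new dataset are highly imbalanced: Neutral accounts for 1,139 of 1,615 articles (about 71%), compared with 261 Inflation and 215 Deflation. Note that Neutral is also the fallback label for unrecognized labels, missing answers, and failed batches, so part of this share may not reflect a genuine model judgment.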