Note: All of the labelling code is the same; we split this task into separate parts because it is time-consuming.

In [None]:
import os
import re
import time
import pandas as pd
from tqdm import tqdm
from dotenv import load_dotenv
import google.genai as genai

# Load environment variables from the .env file, then collect the Gemini API
# keys in the order they will be tried.
load_dotenv(override=True)
API_KEYS = [
    os.getenv(env_name)
    for env_name in (
        "GEMINI_KEY_1",
        "GEMINI_KEY_2",
        "GEMINI_KEY_3",
        "GEMINI_KEY_4",
        "GEMINI_KEY_5",
        "GEMINI_KEY_6",
    )
]
MODEL_ID = "models/gemini-2.5-flash"

# The job starts on the first key; key_index tracks which entry of API_KEYS
# the module-level client is currently using.
key_index = 0
client = genai.Client(api_key=API_KEYS[key_index])

# Utility 
def switch_api_key():
    """Move the module-level client on to the next entry of ``API_KEYS``.

    Returns:
        True when another key was available and ``client`` has been rebuilt
        with it; False when ``key_index`` already points at the last key.
    """
    global key_index, client

    next_index = key_index + 1
    if next_index >= len(API_KEYS):
        # Nothing left to rotate to; the caller decides how to stop.
        print("All API keys exhausted.")
        return False

    key_index = next_index
    client = genai.Client(api_key=API_KEYS[key_index])
    print(f"Switched to API key #{key_index+1}")
    return True

In [None]:
# Batch classify function
def _as_prompt_text(value):
    """Return *value* as a string usable in the prompt ("" for None/NaN)."""
    if isinstance(value, str):
        return value
    if value is None or (isinstance(value, float) and value != value):
        return ""
    return str(value)


def _parse_batch_response(raw_output, n):
    """Turn the model's numbered answer lines into ``n`` labels and reasons.

    Each answer line is expected to look like ``"<k>. <Label> | <reason>"``.
    When the item numbers are unique and within ``1..n`` every answer is
    stored at the position its number names, so a skipped or unparseable
    item cannot shift the answers of the following items. Otherwise the
    answers are taken in order of appearance. Items without an answer get
    the "Neutral" default.
    """
    allowed = ("Inflation", "Deflation", "Neutral")
    matches = re.findall(
        # Optional "**" around the label tolerates markdown-bold answers.
        r'^\s*(\d+)\.\s*\**\s*([A-Za-z]+)\s*\**\s*\|\s*(.+)$',
        raw_output,
        flags=re.MULTILINE,
    )

    parsed = []
    for number, label, reason in matches:
        label = label.capitalize()
        if label not in allowed:
            label = "Neutral"
        parsed.append((int(number), label, reason.strip()))

    labels = ["Neutral"] * n
    reasons = ["Tidak ada alasan diberikan."] * n

    numbers = [number for number, _, _ in parsed]
    numbering_usable = (
        len(set(numbers)) == len(numbers)
        and all(1 <= number <= n for number in numbers)
    )
    if numbering_usable:
        for number, label, reason in parsed:
            labels[number - 1] = label
            reasons[number - 1] = reason
    else:
        # Numbering is duplicated or out of range: fall back to positional
        # order, ignoring any surplus answers.
        for pos, (_, label, reason) in enumerate(parsed[:n]):
            labels[pos] = label
            reasons[pos] = reason
    return labels, reasons


def batch_classify_inflation(texts, max_retries=3):
    """Label a batch of news texts as Inflation, Deflation or Neutral.

    The texts (each truncated to 1000 characters) are sent to the model in a
    single numbered prompt and the numbered answers are parsed back.

    Args:
        texts: Sequence of news texts. None/NaN entries are sent as empty
            text instead of raising.
        max_retries: Number of attempts allowed per API key for errors other
            than quota exhaustion.

    Returns:
        ``(labels, reasons)``: two lists with one entry per input text. If
        every attempt fails, all labels are "Neutral".

    Raises:
        RuntimeError: when a quota error occurs and ``switch_api_key()``
            reports that no further key is available.
    """
    texts = list(texts)
    n = len(texts)
    if n == 0:
        # Nothing to classify; avoid spending a request on an empty prompt.
        return [], []

    joined = "\n\n".join(
        [f"{i+1}. {_as_prompt_text(t)[:1000]}" for i, t in enumerate(texts)]
    )
    prompt = f"""
    Kamu adalah analis ekonomi makro yang menilai berita tentang inflasi di Indonesia khususnya pada index Indeks Harga Konsumen (CPI) di Indonesia.

    Untuk setiap teks berita berikut, berikan:
    - Label: Inflation, Deflation, atau Neutral
    - Alasan singkat (1 kalimat) mengapa kamu memilih label itu.

    Format jawaban (gunakan tanda | sebagai pemisah label dan alasan):
    1. Inflation | Kenaikan harga minyak mendorong tekanan inflasi.
    2. Deflation | Penurunan permintaan menyebabkan harga turun.
    3. Neutral | Hanya laporan data tanpa indikasi tekanan harga.

    Teks berita:
    {joined}
    """

    attempt = 0
    while attempt < max_retries:
        try:
            response = client.models.generate_content(
                model=MODEL_ID,
                contents=prompt,
            )

            raw_output = getattr(response, "text", None)
            if raw_output is None and hasattr(response, "candidates"):
                raw_output = response.candidates[0].content.parts[0].text
            if raw_output is None:
                raw_output = str(response)

            return _parse_batch_response(raw_output, n)

        except Exception as e:
            err = str(e)
            print(f"Exception in batch_classify_inflation: {type(e).__name__} → {err}")

            if "RESOURCE_EXHAUSTED" in err:
                print(f"API key #{key_index+1} quota exhausted.")
                if not switch_api_key():
                    raise RuntimeError("All keys exhausted – stop job now.") from e
                time.sleep(10)
                # A new key gets a fresh retry budget; this loop is bounded
                # because switch_api_key() eventually returns False.
                attempt = 0
                continue

            if "503" in err or "UNAVAILABLE" in err:
                wait = 60 * (attempt + 1)
                print(f"Gemini overloaded, waiting {wait}s before retry...")
                time.sleep(wait)
            else:
                print("Unhandled error, retrying after short delay...")
                time.sleep(5)
            attempt += 1

    print("All retries failed. Defaulting to Neutral.")
    return ["Neutral"] * n, ["Server overloaded or quota limit."] * n

In [6]:
def _write_checkpoint(df, checkpoint_path):
    """Write *df* to *checkpoint_path* via a temp file and ``os.replace``."""
    tmp_path = checkpoint_path + ".tmp"
    df.to_csv(tmp_path, index=False)
    os.replace(tmp_path, checkpoint_path)


def safe_batch_label(df, batch_size=5, checkpoint_path="checkpoint_bisnis.csv",
                     delay_sec=6, retry_wait=60, max_retries=3,
                     out_path="result/df_bisnis_labeled.csv",
                     checkpoint_every=100):
    """Label ``df["clean_text"]`` in batches, checkpointing progress to CSV.

    The frame is modified in place: "label" and "label_reason" columns are
    added if missing and filled batch by batch. If ``checkpoint_path`` exists
    and has the same number of rows, its labels are restored by row position
    and work resumes at the first unlabeled row.

    Args:
        df: DataFrame with a "clean_text" column.
        batch_size: Number of rows sent per request (must be >= 1).
        checkpoint_path: CSV file used to persist and resume progress.
        delay_sec: Pause between consecutive batches, in seconds.
        retry_wait: Not used by this function; kept so existing calls that
            pass it keep working.
        max_retries: Forwarded to ``batch_classify_inflation``.
        out_path: Where the fully labeled frame is written on completion.
        checkpoint_every: Save a checkpoint after at least this many newly
            labeled rows (also after the first and the last batch).

    Returns:
        The same DataFrame, labeled as far as the run got.

    Raises:
        ValueError: if ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    for col in ("label", "label_reason"):
        if col not in df.columns:
            df[col] = [None] * len(df)
        else:
            # Make sure string labels can be stored even if the column was
            # read as all-NaN floats.
            df[col] = df[col].astype(object)

    start_i = 0

    # Resume safely
    if os.path.exists(checkpoint_path):
        print(f"Resuming from checkpoint: {checkpoint_path}")
        df_ckpt = pd.read_csv(checkpoint_path)
        if len(df_ckpt) == len(df):
            for col in ("label", "label_reason"):
                if col in df_ckpt.columns:
                    # Copy by position; the checkpoint is written without an
                    # index, so index alignment would be wrong for any frame
                    # whose index is not 0..n-1.
                    df[col] = df_ckpt[col].astype(object).to_numpy()
        else:
            print(f"Checkpoint has {len(df_ckpt)} rows but input has "
                  f"{len(df)}; ignoring checkpoint labels.")

        unlabeled = df["label"].isna().to_numpy()
        if not unlabeled.any():
            print("All rows already labeled — nothing to resume.")
            return df
        # Position (not index label) of the first unlabeled row.
        start_i = int(unlabeled.argmax())
        print(f"Resuming from row {start_i}")

    total = len(df)
    label_pos = df.columns.get_loc("label")
    reason_pos = df.columns.get_loc("label_reason")
    rows_since_checkpoint = 0

    # Main loop
    for i in tqdm(range(start_i, total, batch_size)):
        stop = min(i + batch_size, total)
        batch = df["clean_text"].iloc[i:stop].tolist()
        n_batch = len(batch)

        try:
            labels, reasons = batch_classify_inflation(batch, max_retries=max_retries)
        except RuntimeError as e:
            print(f"{e}. Saving checkpoint and stopping.")
            _write_checkpoint(df, checkpoint_path)
            print(f"Saved progress before stopping at row {i}.")
            return df

        if not labels:
            labels = ["Neutral"] * n_batch
            reasons = ["Failed request"] * n_batch

        # Pad or trim so both lists match the batch size exactly.
        labels = (list(labels) + ["Neutral"] * n_batch)[:n_batch]
        reasons = (list(reasons) + ["Unknown"] * n_batch)[:n_batch]

        # Positional writes, consistent with the positional read above.
        df.iloc[i:stop, label_pos] = labels
        df.iloc[i:stop, reason_pos] = reasons

        rows_since_checkpoint += n_batch
        is_last = stop >= total
        # Count labeled rows rather than testing i % 100, which never fires
        # after resuming from an offset that is not a multiple of 100. The
        # first batch is saved as well so an unwritable path fails early.
        if i == start_i or is_last or rows_since_checkpoint >= checkpoint_every:
            _write_checkpoint(df, checkpoint_path)
            rows_since_checkpoint = 0
            print(f"Checkpoint saved safely at row {i}")

        if not is_last:
            time.sleep(delay_sec)

    out_dir = os.path.dirname(out_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    df.to_csv(out_path, index=False)
    print(f"Labeling complete → saved to {out_path}")
    return df

In [None]:
# Run the labelling job: read the prepared articles, then label them with the
# default batch and checkpoint settings.
df_bisnis = pd.read_csv(filepath_or_buffer="result/df_bisnis.csv")
df_bisnis_labeled = safe_batch_label(df=df_bisnis)

  0%|          | 0/1103 [00:00<?, ?it/s]

Checkpoint saved safely at row 0


  1%|▏         | 14/1103 [04:43<5:45:27, 19.03s/it]

Exception in batch_classify_inflation: ClientError → 429 RESOURCE_EXHAUSTED. {'error': {'code': 429, 'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. To monitor your current usage, head to: https://ai.dev/usage?tab=rate-limit.\n* Quota exceeded for metric: generativelanguage.googleapis.com/generate_content_free_tier_requests, limit: 250\nPlease retry in 7.251233462s.', 'status': 'RESOURCE_EXHAUSTED', 'details': [{'@type': 'type.googleapis.com/google.rpc.QuotaFailure', 'violations': [{'quotaMetric': 'generativelanguage.googleapis.com/generate_content_free_tier_requests', 'quotaId': 'GenerateRequestsPerDayPerProjectPerModel-FreeTier', 'quotaDimensions': {'location': 'global', 'model': 'gemini-2.5-flash'}, 'quotaValue': '250'}]}, {'@type': 'type.googleapis.com/google.rpc.Help', 'links': [{'description': 'Learn more about Gemini API quotas', 'url': 'https:

  2%|▏         | 20/1103 [06:56<6:54:53, 22.99s/it]

Checkpoint saved safely at row 100


  4%|▎         | 40/1103 [13:29<6:21:12, 21.52s/it]

Checkpoint saved safely at row 200


  5%|▌         | 60/1103 [20:10<5:49:42, 20.12s/it]

Checkpoint saved safely at row 300


  7%|▋         | 80/1103 [26:04<4:47:32, 16.86s/it]

Checkpoint saved safely at row 400


  9%|▉         | 100/1103 [32:26<5:03:23, 18.15s/it]

Checkpoint saved safely at row 500


 11%|█         | 120/1103 [38:23<5:04:19, 18.58s/it]

Checkpoint saved safely at row 600


 13%|█▎        | 140/1103 [44:30<4:52:36, 18.23s/it]

Checkpoint saved safely at row 700


 15%|█▍        | 160/1103 [51:02<5:11:30, 19.82s/it]

Checkpoint saved safely at row 800


 16%|█▋        | 180/1103 [57:08<4:46:31, 18.63s/it]

Checkpoint saved safely at row 900


 18%|█▊        | 200/1103 [1:03:35<4:47:48, 19.12s/it]

Checkpoint saved safely at row 1000


 20%|█▉        | 220/1103 [1:09:58<4:50:34, 19.74s/it]

Checkpoint saved safely at row 1100


 22%|██▏       | 240/1103 [1:16:15<4:51:09, 20.24s/it]

Checkpoint saved safely at row 1200


 24%|██▎       | 260/1103 [1:22:32<4:50:18, 20.66s/it]

Checkpoint saved safely at row 1300


 25%|██▌       | 280/1103 [1:28:43<4:16:33, 18.70s/it]

Checkpoint saved safely at row 1400


 27%|██▋       | 300/1103 [1:35:31<4:43:26, 21.18s/it]

Checkpoint saved safely at row 1500


 29%|██▉       | 320/1103 [1:41:58<3:54:27, 17.97s/it]

Checkpoint saved safely at row 1600


 31%|███       | 340/1103 [1:48:58<4:30:50, 21.30s/it]

Checkpoint saved safely at row 1700


 33%|███▎      | 360/1103 [1:55:03<3:35:36, 17.41s/it]

Checkpoint saved safely at row 1800


 34%|███▍      | 380/1103 [2:01:53<4:00:08, 19.93s/it]

Checkpoint saved safely at row 1900


 36%|███▋      | 400/1103 [2:08:11<3:38:48, 18.67s/it]

Checkpoint saved safely at row 2000


 38%|███▊      | 420/1103 [2:14:43<3:57:12, 20.84s/it]

Checkpoint saved safely at row 2100


 40%|███▉      | 440/1103 [2:21:47<3:49:56, 20.81s/it]

Checkpoint saved safely at row 2200


 42%|████▏     | 460/1103 [2:28:20<3:50:24, 21.50s/it]

Checkpoint saved safely at row 2300


 42%|████▏     | 468/1103 [2:30:55<3:31:12, 19.96s/it]

Exception in batch_classify_inflation: ClientError → 429 RESOURCE_EXHAUSTED. {'error': {'code': 429, 'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. To monitor your current usage, head to: https://ai.dev/usage?tab=rate-limit.\n* Quota exceeded for metric: generativelanguage.googleapis.com/generate_content_free_tier_requests, limit: 250\nPlease retry in 55.222293781s.', 'status': 'RESOURCE_EXHAUSTED', 'details': [{'@type': 'type.googleapis.com/google.rpc.QuotaFailure', 'violations': [{'quotaMetric': 'generativelanguage.googleapis.com/generate_content_free_tier_requests', 'quotaId': 'GenerateRequestsPerDayPerProjectPerModel-FreeTier', 'quotaDimensions': {'location': 'global', 'model': 'gemini-2.5-flash'}, 'quotaValue': '250'}]}, {'@type': 'type.googleapis.com/google.rpc.Help', 'links': [{'description': 'Learn more about Gemini API quotas', 'url': 'https

 44%|████▎     | 480/1103 [2:34:55<3:18:46, 19.14s/it]

Checkpoint saved safely at row 2400


 45%|████▌     | 500/1103 [2:41:27<3:08:20, 18.74s/it]

Checkpoint saved safely at row 2500


 47%|████▋     | 520/1103 [2:48:19<3:17:25, 20.32s/it]

Checkpoint saved safely at row 2600


 49%|████▉     | 540/1103 [2:54:31<2:40:46, 17.13s/it]

Checkpoint saved safely at row 2700


 51%|█████     | 560/1103 [3:00:58<3:03:32, 20.28s/it]

Checkpoint saved safely at row 2800


 53%|█████▎    | 580/1103 [3:07:59<2:55:32, 20.14s/it]

Checkpoint saved safely at row 2900


 54%|█████▍    | 600/1103 [3:14:23<2:46:58, 19.92s/it]

Checkpoint saved safely at row 3000


 56%|█████▌    | 620/1103 [3:21:29<2:41:50, 20.11s/it]

Checkpoint saved safely at row 3100


 58%|█████▊    | 640/1103 [3:28:01<2:43:49, 21.23s/it]

Checkpoint saved safely at row 3200


 60%|█████▉    | 660/1103 [3:35:08<2:58:48, 24.22s/it]

Checkpoint saved safely at row 3300


 62%|██████▏   | 680/1103 [3:41:19<2:20:46, 19.97s/it]

Checkpoint saved safely at row 3400


 63%|██████▎   | 700/1103 [3:48:03<2:14:02, 19.96s/it]

Checkpoint saved safely at row 3500


 65%|██████▌   | 718/1103 [3:53:56<2:05:23, 19.54s/it]

Exception in batch_classify_inflation: ClientError → 429 RESOURCE_EXHAUSTED. {'error': {'code': 429, 'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. To monitor your current usage, head to: https://ai.dev/usage?tab=rate-limit.\n* Quota exceeded for metric: generativelanguage.googleapis.com/generate_content_free_tier_requests, limit: 250\nPlease retry in 54.788010124s.', 'status': 'RESOURCE_EXHAUSTED', 'details': [{'@type': 'type.googleapis.com/google.rpc.QuotaFailure', 'violations': [{'quotaMetric': 'generativelanguage.googleapis.com/generate_content_free_tier_requests', 'quotaId': 'GenerateRequestsPerDayPerProjectPerModel-FreeTier', 'quotaDimensions': {'location': 'global', 'model': 'gemini-2.5-flash'}, 'quotaValue': '250'}]}, {'@type': 'type.googleapis.com/google.rpc.Help', 'links': [{'description': 'Learn more about Gemini API quotas', 'url': 'https

 65%|██████▌   | 720/1103 [3:54:38<2:07:37, 19.99s/it]

Checkpoint saved safely at row 3600


 67%|██████▋   | 740/1103 [4:00:56<1:56:18, 19.22s/it]

Checkpoint saved safely at row 3700


 69%|██████▉   | 760/1103 [4:07:23<1:59:51, 20.97s/it]

Checkpoint saved safely at row 3800


 71%|███████   | 780/1103 [4:13:54<1:48:23, 20.14s/it]

Checkpoint saved safely at row 3900


 73%|███████▎  | 800/1103 [4:20:37<1:49:25, 21.67s/it]

Checkpoint saved safely at row 4000


 74%|███████▍  | 820/1103 [4:27:29<1:39:34, 21.11s/it]

Checkpoint saved safely at row 4100


 76%|███████▌  | 840/1103 [4:34:21<1:26:59, 19.85s/it]

Checkpoint saved safely at row 4200


 78%|███████▊  | 860/1103 [4:40:35<1:05:39, 16.21s/it]

Checkpoint saved safely at row 4300


 80%|███████▉  | 880/1103 [4:47:53<1:30:41, 24.40s/it]

Checkpoint saved safely at row 4400


 82%|████████▏ | 900/1103 [4:54:35<1:06:51, 19.76s/it]

Checkpoint saved safely at row 4500


 83%|████████▎ | 920/1103 [5:01:25<57:13, 18.76s/it]  

Checkpoint saved safely at row 4600


 85%|████████▌ | 940/1103 [5:08:36<1:02:52, 23.14s/it]

Checkpoint saved safely at row 4700


 87%|████████▋ | 960/1103 [5:15:05<47:43, 20.02s/it]  

Checkpoint saved safely at row 4800


 88%|████████▊ | 969/1103 [5:18:01<42:56, 19.23s/it]

Exception in batch_classify_inflation: ClientError → 429 RESOURCE_EXHAUSTED. {'error': {'code': 429, 'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. To monitor your current usage, head to: https://ai.dev/usage?tab=rate-limit.\n* Quota exceeded for metric: generativelanguage.googleapis.com/generate_content_free_tier_requests, limit: 250\nPlease retry in 48.783491572s.', 'status': 'RESOURCE_EXHAUSTED', 'details': [{'@type': 'type.googleapis.com/google.rpc.QuotaFailure', 'violations': [{'quotaMetric': 'generativelanguage.googleapis.com/generate_content_free_tier_requests', 'quotaId': 'GenerateRequestsPerDayPerProjectPerModel-FreeTier', 'quotaDimensions': {'location': 'global', 'model': 'gemini-2.5-flash'}, 'quotaValue': '250'}]}, {'@type': 'type.googleapis.com/google.rpc.Help', 'links': [{'description': 'Learn more about Gemini API quotas', 'url': 'https

 89%|████████▉ | 980/1103 [5:21:45<36:42, 17.91s/it]

Checkpoint saved safely at row 4900


 91%|█████████ | 1000/1103 [5:28:00<37:29, 21.84s/it]

Checkpoint saved safely at row 5000


 92%|█████████▏| 1020/1103 [5:34:55<28:47, 20.81s/it]

Checkpoint saved safely at row 5100


 94%|█████████▍| 1040/1103 [5:41:29<21:38, 20.61s/it]

Checkpoint saved safely at row 5200


 96%|█████████▌| 1060/1103 [5:49:10<15:34, 21.73s/it]

Checkpoint saved safely at row 5300


 98%|█████████▊| 1080/1103 [5:55:28<06:46, 17.69s/it]

Checkpoint saved safely at row 5400


100%|█████████▉| 1100/1103 [6:01:59<01:02, 20.91s/it]

Checkpoint saved safely at row 5500


100%|█████████▉| 1102/1103 [6:02:39<00:20, 20.51s/it]

Checkpoint saved safely at row 5510


100%|██████████| 1103/1103 [6:02:58<00:00, 19.74s/it]


Labeling complete → saved to result/df_bisnis_labeled.csv
