In [2]:
import os
import json
import time
import re
import csv
from typing import Dict, List, Optional
import google.generativeai as genai


def load_output_json(path: str) -> Dict[str, Dict]:
    """
    Read previously saved results from a JSON file.

    Args:
        path: Location of the JSON output file.

    Returns:
        The parsed JSON content, or an empty dict when nothing exists at
        ``path`` yet.
    """
    # Nothing saved so far: start from a clean slate.
    if not os.path.exists(path):
        return {}

    with open(path, 'r', encoding='utf-8') as handle:
        loaded = json.load(handle)
    return loaded


def save_output_json(data: Dict[str, Dict], path: str) -> None:
    """
    Save the output data to a JSON file without risking a truncated file.

    The data is first written to a sibling temporary file (``path`` with a
    ``.tmp`` suffix) and then moved over ``path`` with ``os.replace``. If the
    write is interrupted or serialization fails, the previous content of
    ``path`` is left untouched.

    Args:
        data: Mapping to serialize.
        path: Destination JSON file.

    Raises:
        TypeError: If ``data`` contains values ``json`` cannot serialize.
        OSError: If the temporary file cannot be written or moved.
    """
    tmp_path = f"{path}.tmp"
    try:
        with open(tmp_path, 'w', encoding='utf-8') as f:
            json.dump(data, f, ensure_ascii=False, indent=2)
        # Swap the fully written file into place in a single step.
        os.replace(tmp_path, path)
    except BaseException:
        # Best-effort cleanup of the partial temp file (also on Ctrl-C);
        # the original error is always re-raised.
        try:
            os.remove(tmp_path)
        except OSError:
            pass
        raise

def log_to_csv(log_file: str, filename: str, status: str, count: int):
    """
    Append one result row to the CSV log.

    A header row is written first when ``log_file`` does not exist yet.

    Args:
        log_file: Path of the CSV log file (opened in append mode).
        filename: Name of the processed input file.
        status: Status label recorded for the file.
        count: Number of paraphrases recorded for the file.
    """
    rows = [[filename, status, count]]
    # The existence check happens before opening, since opening in append
    # mode creates the file.
    if not os.path.exists(log_file):
        rows.insert(0, ['filename', 'status', 'paraphrase_count'])

    with open(log_file, 'a', newline='', encoding='utf-8') as out:
        csv.writer(out).writerows(rows)

def paraphrase_text(model: "genai.GenerativeModel", original_text: str, delay: int = 5) -> Optional[List[str]]:
    """
    Ask the Gemini model for 15 paraphrases of ``original_text``.

    The response text is split into lines; blank lines are dropped, an echoed
    ``N. Parafrasa N`` placeholder label (and a leftover leading dot) is
    stripped from the start of each line, and only lines longer than 20
    characters are kept.

    Args:
        model: Object exposing ``generate_content(prompt)``.
        original_text: Text to paraphrase; appended verbatim to the prompt.
        delay: Seconds to sleep before returning, on success and on failure.

    Returns:
        The list of extracted paraphrases (possibly fewer than 15, in which
        case a warning is printed), or ``None`` if the request failed or the
        response contained no candidates.
    """
    prompt = """
        Parafrasakan kalimat berikut sebanyak tepat 15 kali tidak kurang dan tidak lebih untuk menghasilkan variasi deskripsi.
        Pastikan output parafrasa memiliki kalimat yang utuh dan tidak rumpang (ditandai dengan akhiran tanda titik).

        Ikuti teknik parafrasa berikut:
        - Parafrasa Ekuivalen: Ganti kata/frasa dengan sinonim yang tepat dan alami.
        - Parafrasa Kontraksi: Ringkas kalimat hanya jika tidak menghilangkan informasi penting atau mengurangi kejelasan.
        - Parafrasa Amplifikasi: Tambahkan detail hanya jika itu memperjelas satuan data spesifik dan sangat jelas tersirat dalam data asli.

        Pertahankan:
        - Gaya penulisan: Gunakan gaya bahasa deskriptif, naratif, dengan konjungsin dan kata keterangan sesuai kaidah bahasa Indonesia yang baik, serta hasilnya tidak jauh mengubah dengan deskripsi asli.
        - Akurasi: Pastikan semua nilai, angka, informasi faktual tetap sama persis, dan disebutkan semuanya.
        - Format: Jangan gunakan markdown, quote, tanda kurung untuk satuan unit, atau format khusus pada hasil parafrasa.
        - Konteks: Deskripsi ini akan digunakan untuk memberikan gambaran utuh terkait sebuah gambar grafik.

        Teks asli: """ + original_text + """
        Hasil Parafrasa:
        1. Parafrasa 1
        2. Parafrasa 2
        dst.
        """
    try:
        response = model.generate_content(prompt)

        # A response without candidates carries no text; report it as a
        # failure explicitly instead of tripping over an unbound variable.
        candidates = getattr(response, 'candidates', None)
        if not candidates:
            print(
                f"[Error] Paraphrasing failed for text '{original_text[:30]}...': "
                "response contained no candidates"
            )
            return None

        content = candidates[0].content
        parts = getattr(content, 'parts', None)
        # Fall back to the string form of the content when it has no parts.
        text = parts[0].text.strip() if parts else str(content).strip()

        # Post-process the response line by line.
        paraphrases = []
        for raw_line in text.split("\n"):
            line = raw_line.strip()
            if not line:
                continue

            # Drop an echoed "N. Parafrasa N" label, then any leftover leading dot.
            line = re.sub(r'^\s*\[?\d+\.\s*Parafrasa\s*\d+\]?\s*', '', line)
            line = re.sub(r'^\s*\.\s*', '', line)

            # Discard short fragments.
            if len(line) > 20:
                paraphrases.append(line)

        if len(paraphrases) < 15:
            message = f"[Warning] Only extracted {len(paraphrases)} paraphrases."
        else:
            message = f"[Success] Extracted {len(paraphrases)} paraphrases."

        print(message)

        return paraphrases
    except Exception as e:
        # Best-effort boundary: any failure is reported and signalled with
        # None so the caller can retry this text on a later run.
        print(f"[Error] Paraphrasing failed for text '{original_text[:30]}...': {e}")
        return None
    finally:
        # Pause between requests regardless of the outcome.
        time.sleep(delay)

def process_folder(
    input_folder: str,
    output_json: str,
    api_key: str,
    model_name: str = "gemini-2.0-flash",
    delay: int = 5,
    log_file: str = "paraphrase_log_0.2.0.csv"
) -> None:
    """
    Paraphrase every ``.txt`` file in ``input_folder`` and save the results.

    Files are visited in sorted name order. A file whose name (without
    extension) is already a key in the output JSON is skipped. After each
    successfully paraphrased file the whole output JSON is saved again, and
    one row is appended to the CSV log. When paraphrasing fails, the file is
    logged with status "Error" and is not added to the output, so a later run
    will try it again.

    Args:
        input_folder: Directory containing the ``.txt`` source files.
        output_json: Path of the JSON file holding all results.
        api_key: API key passed to ``genai.configure``.
        model_name: Name of the generative model to instantiate.
        delay: Seconds to wait after each paraphrasing request.
        log_file: Path of the CSV log file.
    """
    genai.configure(api_key=api_key)
    model = genai.GenerativeModel(model_name)

    data = load_output_json(output_json)

    # os.listdir returns entries in arbitrary order; sort them so every run
    # walks the files in the same, predictable sequence.
    for fname in sorted(os.listdir(input_folder)):
        if not fname.lower().endswith('.txt'):
            continue
        key = os.path.splitext(fname)[0]
        if key in data:
            print(f"Skip '{fname}', already processed.")
            continue

        file_path = os.path.join(input_folder, fname)
        with open(file_path, 'r', encoding='utf-8') as f:
            original_text = f.read().strip()

        print(f"Processing '{fname}'...")
        paraphrases = paraphrase_text(model, original_text, delay)

        if paraphrases is None:
            # Leave the key out of `data` so the file is retried next run.
            print(f"Skipping save for '{fname}' due to error. Will retry later.")
            log_to_csv(log_file, fname, "Error", 0)
            continue

        count = len(paraphrases)
        status = "Success" if count >= 15 else "Warning"

        data[key] = {
            'original_text': original_text,
            'paraphrases': paraphrases
        }

        # Save after every file so progress survives an interrupted run.
        save_output_json(data, output_json)
        print(f"Saved paraphrases for '{fname}'.")
        log_to_csv(log_file, fname, status, count)

In [None]:
# Take the API key from the GOOGLE_API_KEY environment variable when it is
# set, so the real key does not have to be pasted into the notebook; the
# original placeholder is kept as the fallback.
process_folder(
    input_folder='base_tambahan_reasoning',
    output_json='paraphrases_tambahan_reasoning.json',
    api_key=os.environ.get('GOOGLE_API_KEY', 'API_KEY')
)

Skip 'T0001.txt', already processed.
Skip 'T0002.txt', already processed.
Skip 'T0003.txt', already processed.
Skip 'T0004.txt', already processed.
Skip 'T0005.txt', already processed.
Skip 'T0006.txt', already processed.
Skip 'T0007.txt', already processed.
Skip 'T0008.txt', already processed.
Skip 'T0009.txt', already processed.
Skip 'T0010.txt', already processed.
Skip 'T0011.txt', already processed.
Skip 'T0012.txt', already processed.
Skip 'T0013.txt', already processed.
Skip 'T0014.txt', already processed.
Skip 'T0015.txt', already processed.
Skip 'T0016.txt', already processed.
Skip 'T0017.txt', already processed.
Skip 'T0018.txt', already processed.
Skip 'T0019.txt', already processed.
Skip 'T0020.txt', already processed.
Skip 'T0021.txt', already processed.
Skip 'T0022.txt', already processed.
Skip 'T0023.txt', already processed.
Skip 'T0024.txt', already processed.
Skip 'T0025.txt', already processed.
Skip 'T0026.txt', already processed.
Skip 'T0027.txt', already processed.
S