# In [None]:
"""
Process all CSV files (strict alphabetical order) under ~/Desktop/history of sport.
For each file:
- Keep the original 'filename' column.
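- 'content' column stores the raw text flattened onto one line (line breaks become spaces, repeated whitespace is collapsed).
- 'English translation' column stores the translated text. Long text is translated in chunks; when a request fails, the chunk is split in half recursively, down to single characters, which are kept untranslated.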

Results are saved to ~/Desktop/sports/{original_csv_filename}_translated.csv
"""

import csv
import inspect
import os
import re
import sys
import threading
import time
from pathlib import Path

import pandas as pd
from groq import Groq

# Raise the csv module's per-field size limit as far as the platform allows.
# csv.field_size_limit() raises OverflowError when the value does not fit in a
# C long (e.g. sys.maxsize where C long is 32-bit, as on Windows), so halve the
# request until it is accepted instead of crashing at import time.
_field_limit = sys.maxsize
while True:
    try:
        csv.field_size_limit(_field_limit)
        break
    except OverflowError:
        _field_limit //= 2

# Progress state shared between the worker code and the reporter thread:
#   done  - set to True once processing has finished
#   total - number of rows to translate across all input files
#   count - number of rows handled so far
# Every read and write of `status` is done while holding `_lock`.
status = {"done": False, "total": 0, "count": 0}
_lock = threading.Lock()

def _report():
    """Print a timestamped progress line every five seconds.

    Each pass takes a snapshot of the shared ``status`` dict under ``_lock``.
    The function returns, without printing, as soon as the snapshot shows
    ``status["done"]`` set; otherwise it reports ``count/total`` and sleeps.
    """
    while True:
        with _lock:
            snapshot = dict(status)
        if snapshot["done"]:
            return
        stamp = time.strftime('%H:%M:%S')
        print(f"[{stamp}] Translated rows: {snapshot['count']}/{snapshot['total']}", flush=True)
        time.sleep(5)

# —— Configurations —— #
# The API key is taken from the GROQ_API_KEY environment variable when it is
# set (and non-empty), so the secret does not have to live in the source file.
# The placeholder below is only the fallback and must be replaced otherwise.
GROQ_API_KEY = os.environ.get("GROQ_API_KEY") or "YOUR_API_KEY_HERE"
API_MODEL    = "llama-3.1-8b-instant"                 # chat model used for translation
INPUT_DIR    = Path.home() / "Desktop/history of sport"  # folder scanned for *.csv
OUTPUT_DIR   = Path.home() / "Desktop/sports"            # folder receiving *_translated.csv
# Create the output folder (and any missing parents) up front.
OUTPUT_DIR.mkdir(exist_ok=True, parents=True)

# Single shared API client used by every translation request.
client = Groq(api_key=GROQ_API_KEY)

def flatten(text: str) -> str:
    """Return *text* as a single line of text.

    Runs of line breaks (CR, LF, U+2028, U+2029) become one space, any run of
    two or more whitespace characters collapses to one space, and leading and
    trailing whitespace is removed.

    Missing values yield ``""``: ``None``, other falsy values, and float NaN.
    NaN needs an explicit check because it is truthy, so ``text or ""`` would
    let it through and ``str()`` would turn it into the literal ``"nan"``.
    pandas uses NaN for empty CSV cells.
    """
    # `text != text` is true only for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    s = str(text or "")
    s = re.sub(r'[\r\n\u2028\u2029]+', ' ', s)
    return re.sub(r'\s{2,}', ' ', s).strip()

# Upper bound, in characters, on the size of each piece that translate_text()
# cuts the input into before handing it to safe_translate().
MAX_CHARS = 20_000

def _split_index(chunk: str) -> int:
    """Return the index at which to cut *chunk* (length >= 2) into two non-empty parts.

    The space nearest the midpoint is preferred so a word is not cut in half;
    without a usable space the plain midpoint is returned.
    """
    mid = len(chunk) // 2
    # Search ranges exclude index 0 and the last index so neither part is empty.
    before = chunk.rfind(" ", 1, mid + 1)
    after = chunk.find(" ", mid, len(chunk) - 1)
    spaces = [i for i in (before, after) if i != -1]
    return min(spaces, key=lambda i: abs(i - mid)) if spaces else mid


def safe_translate(chunk: str) -> str:
    """Translate *chunk* to English, splitting it recursively when a request fails.

    Args:
        chunk: Source text to translate; an empty string returns ``""``
            without any request being made.

    Returns:
        The model's reply flattened to one line (line breaks replaced by
        spaces, whitespace runs collapsed, ends stripped). If the request
        raises, the chunk is cut in two and each part is translated
        separately; a single character whose request fails is returned
        untranslated.

    NOTE(review): every exception triggers the split, including ones that
    splitting cannot cure (bad API key, rate limit, network down). In that
    case a chunk of N characters causes up to 2N-1 requests before the
    original text is returned.
    """
    if not chunk:
        return ""
    prompt = "You are a precise multilingual→English translator. Return ONLY the English translation."
    try:
        rsp = client.chat.completions.create(
            model=API_MODEL,
            messages=[
                {"role": "system", "content": prompt},
                {"role": "user",   "content": chunk},
            ],
            temperature=0.0
        )
        content = rsp.choices[0].message.content
        if content is None:
            # Treat a reply without text like any other failure so the
            # fallback below still runs (previously an AttributeError).
            raise ValueError("response contained no text content")
        # Same single-line normalization as flatten() applies to the input.
        out = re.sub(r'[\r\n\u2028\u2029]+', ' ', content)
        return re.sub(r'\s{2,}', ' ', out).strip()
    except Exception as e:
        print(f"[Error] Translation failed (len={len(chunk)}): {e}", file=sys.stderr)
        if len(chunk) == 1:
            return chunk
        cut = _split_index(chunk)
        head = safe_translate(chunk[:cut])
        tail = safe_translate(chunk[cut:])
        # Translated parts come back stripped, so a space that separated them
        # in the source would be lost and the words fused. Re-insert it, but
        # only when the source really had whitespace at the cut and neither
        # part still carries it (untranslated fallback text keeps its spaces).
        seam_had_space = chunk[cut - 1].isspace() or chunk[cut].isspace()
        if seam_had_space and head and tail and not head[-1].isspace() and not tail[0].isspace():
            return f"{head} {tail}"
        return head + tail

def translate_text(text: str) -> str:
    """Translate *text* in pieces of at most ``MAX_CHARS`` characters.

    Each piece is cut at the last space inside its window when there is one,
    so words are not split across requests; text without spaces is cut at the
    hard limit. Pieces are translated in order with ``safe_translate`` (with a
    0.1 s pause after each call) and the results are joined, inserting a single
    space where the source had whitespace at the cut and the translations no
    longer carry it.

    Args:
        text: Source text; an empty string returns ``""`` with no request made.

    Returns:
        The concatenated translation of all pieces.
    """
    # Cut the text into windows, preferring a space as the break point.
    chunks = []
    start, size = 0, len(text)
    while start < size:
        end = min(start + MAX_CHARS, size)
        if end < size:
            # Search from start + 1 so the piece is never empty; the space
            # itself becomes the first character of the next piece.
            space = text.rfind(" ", start + 1, end + 1)
            if space != -1:
                end = space
        chunks.append(text[start:end])
        start = end

    parts = []
    prev_chunk = ""
    gap_had_space = False  # whitespace seen at a cut since the last kept part
    for chunk in chunks:
        if prev_chunk and (prev_chunk[-1].isspace() or chunk[0].isspace()):
            gap_had_space = True
        piece = safe_translate(chunk)
        time.sleep(0.1)  # brief pause between requests
        prev_chunk = chunk
        if not piece:
            continue
        # Translations come back stripped; restore the separating space that
        # plain concatenation would drop, unless one side still has it.
        if parts and gap_had_space and not parts[-1][-1].isspace() and not piece[0].isspace():
            parts.append(" ")
        parts.append(piece)
        gap_had_space = False
    return "".join(parts)

def _newline_kwarg() -> str:
    """Return the keyword ``DataFrame.to_csv`` accepts for the row terminator.

    pandas 1.5 renamed ``line_terminator`` to ``lineterminator`` and pandas 2.0
    removed the old spelling, so use whichever the installed version exposes.
    """
    params = inspect.signature(pd.DataFrame.to_csv).parameters
    return "lineterminator" if "lineterminator" in params else "line_terminator"


def main():
    """Translate every CSV in ``INPUT_DIR`` and write the results to ``OUTPUT_DIR``.

    Files are processed in alphabetical order of their names. For each row the
    'filename' value is kept, 'content' is flattened to one line, and its
    translation is stored under 'English translation'. Identical
    (filename, content) pairs are translated only once. Each input produces
    ``<stem>_translated.csv`` (UTF-8 with BOM, all fields quoted, CRLF rows).

    A daemon thread prints progress while the work runs. The shared ``done``
    flag is always set on the way out, including the "no files" early return
    and any exception, so that thread stops instead of printing forever in a
    long-lived interpreter such as a notebook kernel.
    """
    # Start from a clean progress state so main() can be called again in the
    # same interpreter.
    with _lock:
        status.update(done=False, total=0, count=0)

    threading.Thread(target=_report, daemon=True).start()

    try:
        files = sorted(INPUT_DIR.glob("*.csv"), key=lambda p: p.name)

        if not files:
            print("❌ No CSV files found", file=sys.stderr)
            return

        # First pass only counts rows so the reporter can show count/total.
        total = sum(len(pd.read_csv(f, encoding="utf-8-sig", dtype=str)) for f in files)
        with _lock:
            status["total"] = total

        newline_kwarg = {_newline_kwarg(): "\r\n"}
        cache = {}  # (filename, flattened content) -> translation

        for f in files:
            df = pd.read_csv(f, encoding="utf-8-sig", dtype=str)
            out_rows = []
            for _, row in df.iterrows():
                fname = row.get("filename", "")
                raw = flatten(row.get("content", ""))
                key = (fname, raw)
                if key not in cache:
                    cache[key] = translate_text(raw)
                tr = cache[key]

                with _lock:
                    status["count"] += 1

                out_rows.append({
                    "filename": fname,
                    "content":  raw,
                    "English translation": tr
                })

            out_df = pd.DataFrame(out_rows, columns=["filename", "content", "English translation"])
            target = OUTPUT_DIR / f"{f.stem}_translated.csv"
            out_df.to_csv(
                target,
                index=False,
                quoting=csv.QUOTE_ALL,
                encoding="utf-8-sig",
                **newline_kwarg
            )
            print(f"✅ Saved: {target}")
    finally:
        with _lock:
            status["done"] = True


if __name__ == "__main__":
    main()