In [27]:
import json
from pathlib import Path

# --------------------------
# CONFIG
# --------------------------
# ‚úÖ Save output in the same folder as the input JSON
output_dir = Path(r"C:\RESUME-AI-PROJECT\examples\resumes")
output_dir.mkdir(parents=True, exist_ok=True)

# ‚úÖ Input can be .json or .jsonl
input_file = Path(r"C:\RESUME-AI-PROJECT\data\uploads\resume1.pdf")

# --------------------------
# MAIN
# --------------------------
if not input_file.exists():
    raise FileNotFoundError(f"‚ùå Input file not found: {input_file}")

print(f"üìÑ Reading from: {input_file}")

count, skipped = 0, 0

def process_record(record, index):
    global count, skipped

    # Allow both 'source' or 'file' field names
    file_key = record.get("source") or record.get("file")
    if not file_key:
        print(f"‚ö†Ô∏è Record {index}: Missing 'source' or 'file' key ‚Äî skipped.")
        skipped += 1
        return

    text_content = record.get("text", "").strip()
    if not text_content:
        print(f"‚ö†Ô∏è Record {index}: Missing 'text' content ‚Äî skipped.")
        skipped += 1
        return

    name = Path(file_key).stem + ".txt"
    out_path = output_dir / name
    out_path.write_text(text_content, encoding="utf8")
    count += 1


# --- Handle JSON vs JSONL automatically ---
try:
    text = input_file.read_text(encoding="utf8").strip()

    if text.startswith("["):
        # ‚úÖ Regular JSON array
        data = json.loads(text)
        for i, record in enumerate(data, start=1):
            process_record(record, i)
    else:
        # ‚úÖ JSONL (one object per line)
        with open(input_file, "r", encoding="utf8") as f:
            for i, line in enumerate(f, start=1):
                line = line.strip()
                if not line:
                    continue
                try:
                    record = json.loads(line)
                    process_record(record, i)
                except json.JSONDecodeError as e:
                    print(f"‚ö†Ô∏è Line {i}: Invalid JSON, skipped ({e})")
                    skipped += 1

except Exception as e:
    print(f"‚ùå Failed to process file: {e}")

# --------------------------
# SUMMARY
# --------------------------
print(f"\n‚úÖ Conversion complete!")
print(f"üìÅ {count} text files saved to: {output_dir.resolve()}")
print(f"üö´ {skipped} records skipped due to errors or missing data.")


üìÑ Reading from: C:\RESUME-AI-PROJECT\data\uploads\resume1.pdf
‚ùå Failed to process file: 'utf-8' codec can't decode byte 0xf6 in position 10: invalid start byte

‚úÖ Conversion complete!
üìÅ 0 text files saved to: C:\RESUME-AI-PROJECT\examples\resumes
üö´ 0 records skipped due to errors or missing data.
