In [8]:
import json
import pandas as pd
import re
import unicodedata
from json.decoder import JSONDecodeError
from sklearn.model_selection import train_test_split

def clean_text(text):
    """Clean text from ambiguous characters and normalize formatting"""
    text = re.sub(r"Was thisanswerhelpful\?YesNo[+-]?\d*", "", text)
    text = re.sub(r"\b(Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday).*?(AM|PM)\b", "", text)
    text = unicodedata.normalize('NFKC', text)
    text = re.sub(r'[\x00-\x1F\x7F-\x9F\u200B-\u200F\u2028-\u202E]', '', text)
    text = re.sub(r'[–—•®©™“”´‘’`]', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def load_json_with_validation(file_path):
    """Load JSON file with validation and error reporting"""
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            return json.load(f)
    except JSONDecodeError as e:
        print(f"JSON Syntax Error: {e}")
        print(f"Error location: Line {e.lineno}, Column {e.colno}")
        print("Please check for:")
        print("- Missing commas between items")
        print("- Unclosed brackets or braces")
        print("- Trailing commas in arrays/objects")
        print("- Special characters that need escaping")
        raise

# Load data with validation
try:
    data = load_json_with_validation("car_questions_live.json")
except Exception as e:
    print(f"Failed to load JSON file: {str(e)}")
    exit(1)

processed_data = []

for idx, entry in enumerate(data, 1):
    try:
        if any([
            "ERROR 404" in entry["instruction"],
            "ERROR 404" in entry["output"],
            "No question content available" in entry["instruction"],
            entry["output"].strip() in {"A:", "A: "}
        ]):
            continue

        instruction = clean_text(entry["instruction"].split("\n")[0].replace("Q: ", ""))
        output = clean_text(entry["output"].replace("A: ", ""))
        
        if not instruction or not output:
            continue
            
        prompt = f"{instruction}\n{entry['input']}" if entry["input"].strip() else instruction
        
        processed_data.append({
            "prompt": clean_text(prompt),
            "response": output
        })
        
    except KeyError as e:
        print(f"Missing key {str(e)} in entry {idx}")
    except Exception as e:
        print(f"Error processing entry {idx}: {str(e)}")

# Create DataFrame and split
df = pd.DataFrame(processed_data)
train_df, val_df = train_test_split(df, test_size=0.1, random_state=42)

# Save results
train_df.to_json("car_repair_train.jsonl", orient="records", lines=True)
val_df.to_json("car_repair_val.jsonl", orient="records", lines=True)

print(f"Successfully processed {len(df)} entries")

JSON Syntax Error: Expecting property name enclosed in double quotes: line 2 column 1 (char 2)
Error location: Line 2, Column 1
Please check for:
- Missing commas between items
- Unclosed brackets or braces
- Trailing commas in arrays/objects
- Special characters that need escaping
Failed to load JSON file: Expecting property name enclosed in double quotes: line 2 column 1 (char 2)


NameError: name 'data' is not defined