In [1]:
import json

In [2]:
def is_valid_error_object(obj):
    required_keys = {
        "id", "start", "end", "type", "subType",
        "incorrectText", "correctedText", "explanation"
    }
    return isinstance(obj, dict) and required_keys.issubset(obj.keys())

In [3]:
def clean_and_format_json(raw_json_str):
    try:
        # Try to parse directly
        data = json.loads(raw_json_str)
    except json.JSONDecodeError:
        # Attempt to repair common issues
        import ast
        try:
            data = ast.literal_eval(raw_json_str)
        except Exception as e:
            print("Failed to repair JSON:", e)
            return None

    if "errorList" in data and isinstance(data["errorList"], list):
        cleaned_list = [err for err in data["errorList"] if is_valid_error_object(err)]
        data["errorList"] = cleaned_list

    formatted_json = json.dumps(data, indent=2, ensure_ascii=False)
    return formatted_json

In [32]:
import json
import re

def is_valid_error_object(obj):
    required_keys = {
        "id", "start", "end", "type", "subType",
        "incorrectText", "correctedText", "explanation"
    }
    return isinstance(obj, dict) and required_keys.issubset(obj)

def clean_and_format_json(raw_json_string):
    try:
        # Step 1: Unescape newline and quotes
        cleaned_str = raw_json_string.encode().decode('unicode_escape')

        # Step 2: Remove leading/trailing whitespace and redundant newline at end
        cleaned_str = cleaned_str.strip()

        # Step 3: Try to parse the JSON
        data = json.loads(cleaned_str)

        # Step 4: Clean up invalid entries
        if "errorList" in data and isinstance(data["errorList"], list):
            data["errorList"] = [
                obj for obj in data["errorList"] if is_valid_error_object(obj)
            ]

        # Step 5: Return pretty JSON
        return json.dumps(data, indent=2, ensure_ascii=False)

    except Exception as e:
        print("Error during JSON cleaning/parsing:", e)
        return None

# Example usage:
raw_json = '''{\n  \"errorList\": [\n    {\n      \"id\": \"err-1\",\n      \"start\": 13,\n      \"end\": 15,\n      \"type\": \"Grammar\",\n      \"subType\": \"Article\",\n      \"incorrectText\": \"In\",\n      \"correctedText\": \"It\",\n      \"explanation\": \"The sentence requires a subject, not 'In'.\"\n    },\n    {\n      \"id\": \"err-2\",\n      \"start\": 25,\n      \"end\": 28,\n      \"type\": \"Vocabulary\",\n      \"subType\": \"Word Choice\",\n      \"incorrectText\": \"one of the most important parts\",\n      \"correctedText\": \"a key component\",\n      \"explanation\": \"More formal and precise language is needed for a discussion of political discourse.\"\n    },\n    {\n      \"id\": \"err-3\",\n      \"start\": 35,\n      \"end\": 39,\n      \"type\": \"Grammar\",\n      \"subType\": \"Verb Form\",\n      \"incorrectText\": \"has become\",\n      \"correctedText\": \"has evolved\",\n      \"explanation\": \"‘Evolved’ is a more appropriate term for describing the change in social media’s role.\"\n    },\n    {\n      \"id\": \"err-4\",\n      \"start\": 43,\n      \"end\": 48,\n      \"type\": \"Vocabulary\",\n      \"subType\": \"Precision\",\n      \"incorrectText\": \"accessibility\",\n      \"correctedText\": \"availability\",\n      \"explanation\": \"'Accessibility' is too vague; 'availability' is more precise in this context.\"\n    },\n    {\n      \"id\": \"err-5\",\n      \"start\": 53,\n      \"end\": 59,\n      \"type\": \"Grammar\",\n      \"subType\": \"Verb Form\",\n      \"incorrectText\": \"creates\",\n      \"correctedText\": \"contributes to\",\n      \"explanation\": \"‘Contributes to’ is a more nuanced and accurate description of the process.\"\n    },\n    {\n      \"id\": \"err-6\",\n      \"start\": 63,\n      \"end\": 68,\n      \"type\": \"Vocabulary\",\n      \"subType\": \"Word Choice\",\n      \"incorrectText\": \"echo chambers\",\n      \"correctedText\": \"filter bubbles\",\n      \"explanation\": \"'Echo chambers' is a common term, but 'filter bubbles' is more descriptive of the algorithmic process.\"\n    },\n    {\n      \"id\": \"err-7\",\n      \"start\": 74,\n      \"end\": 79,\n      \"type\": \"Grammar\",\n      \"subType\": \"Verb Form\",\n      \"incorrectText\": \"contributes to\",\n      \"correctedText\": \"leads to\",\n      \"explanation\": \"'Leads to' is a stronger and more direct way to express the cause-and-effect relationship.\"\n    },\n    {\n      \"id\": \"err-8\",\n      \"start\": 84,\n      \"end\": 89,\n      \"type\": \"Vocabulary\",\n      \"subType\": \"Word Choice\",\n      \"incorrectText\": \"misinformation\",\n      \"correctedText\": \"false narratives\",\n      \"explanation\": \"'Misinformation' is a broad term; 'false narratives' is more specific and descriptive.\"\n    },\n    {\n      \"id\": \"err-9\",\n      \"start\": 96,\n      \"end\": 99,\n      \"type\": \"Grammar\",\n      \"subType\": \"Verb Form\",\n      \"incorrectText\": \"share\",\n      \"correctedText\": \"disseminate\",\n      \"explanation\": \"'Disseminate' is a more formal and appropriate term for spreading information.\"\n    },\n    {\n      \"id\": \"err-10\",\n      \"start\": 104,\n      \"end\": 109,\n      \"type\": \"Vocabulary\",\n      \"subType\": \"Word Choice\",\n      \"incorrectText\": \"harassment\",\n      \"correctedText\": \"abuse\",\n      \"explanation\": \"'Harassment' is too broad; 'abuse' is more specific to the context of online interactions.\"\n    },\n    {\n      \"id\": \"err-11\",\n      \"start\": 116,\n      \"end\": 119,\n      \"type\": \"Vocabulary\",\n      \"subType\": \"Word Choice\",\n      \"incorrectText\": \"discourse\",\n      \"correctedText\": \"debate\",\n      \"explanation\": \"'Discourse' is too academic; 'debate' is more accessible and appropriate for this context.\"\n    },\n    {\n      \"id\": \"err-12\",\n      \"start\": 131,\n      \"end\": 133,\n      \"type\": \"Vocabulary\",\n      \"subType\": \"Word Choice\",\n      \"incorrectText\": \"over\",\n      \"correctedText\": \"excessive\",\n      \"explanation\": \"'Over' is too vague; 'excessive' is more descriptive.\"\n    },\n    {\n      \"id\": \"err-13\",\n      \"start\": 146,\n      \"end\": 149,\n      \"type\": \"Vocabulary\",\n      \"subType\": \"Word Choice\",\n      \"incorrectText\": \"discourse\",\n      \"correctedText\": \"discussion\",\n      \"explanation\": \"'Discourse' is too academic; 'discussion' is more accessible and appropriate for this context.\"\n    },\n    {\n      \"id\": \"err-14\",\n      \"start\": 163,\n      \"end\": 167,\n      \"type\": \"Vocabulary\",\n      \"subType\": \"Word Choice\",\n      \"incorrectText\": \"over\",\n      \"correctedText\": \"excessive\",\n      \"explanation\": \"'Over' is too vague; 'excessive' is more descriptive.\"\n    },\n    {\n      \"id\": \"err-15\",\n      \"start\": 173,\n      \"end\": 177,\n      \"type\": \"Vocabulary\",\n      \"subType\": \"Word Choice\",\n      \"incorrectText\": \"discourse\",\n      \"correctedText\": \"discussion\",\n      \"explanation\": \"'Discourse' is too academic; 'discussion' is more accessible and appropriate for this context.\"\n    },\n    {\n      \"id\": \"err-16\",\n      \"start\": 187,\n      \"end\": 189,\n      \"type\": \"Vocabulary\",\n      \"subType\": \"Word Choice\",\n      \"incorrectText\": \"over\",\n      \"correctedText\": \"excessive\",\n      \"explanation\": \"'Over' is too vague; 'excessive' is more descriptive.\"\n    },\n    {\n      \"id\": \"err-17\",\n      \"start\": 197,\n      \"end\": 201,\n      \"type\": \"Vocabulary\",\n      \"subType\": \"Word Choice\",\n      \"incorrectText\": \"discourse\",\n      \"correctedText\": \"discussion\",\n      \"explanation\": \"'Discourse' is too academic; 'discussion' is more accessible and appropriate for this context.\"\n    },\n    {\n      \"id\": \"err-18\",\n      \"start\": 208,\n      \"end\": 212,\n      \"type\": \"Vocabulary\",\n      \"subType\": \"Word Choice\",\n      \"incorrectText\": \"over\",\n      \"correctedText\": \"excessive\",\n      \"explanation\": \"'Over' is too vague; 'excessive' is more descriptive.\"\n    },\n    {\n      \"id\": \"err-19\",\n      \"start\": 218,\n      \"end\": 221,\n      \"type\": \"Vocabulary\",\n      \"subType\": \"Word Choice\",\n      \"incorrectText\": \"discourse\",\n      \"correctedText\": \"discussion\",\n      \"explanation\": \"'Discourse' is too academic; 'discussion' is more accessible and appropriate for this context.\"\n    },\n    {\n      \"id\": \"err-20\",\n      \"start\": 227,\n      \"end\": 231,\n      \"type\": \"Vocabulary\",\n      \"subType\": \"Word Choice\",\n      \"incorrectText\": \"over\",\n      \"correctedText\": \"excessive\",\n      \"explanation\": \"'Over' is too vague; 'excessive' is more descriptive.\"\n    },\n    {\n      \"id\": \"err-21\",\n      \"start\": 233,\n      \"end\": 237,\n      \"type\": \"Vocabulary\",\n      \"subType\": \"Word Choice\",\n      \"incorrectText\": \"discourse\",\n      \"correctedText\": \"discussion\",\n      \"explanation\": \"'Discourse' is too academic; 'discussion' is more accessible and appropriate for this context.\"\n    },\n    {\n      \"id\": \"err-22\",\n      \"start\": 243,\n      \"end\": 247,\n      \"type\": \"Vocabulary\",\n      \"subType\": \"Word Choice\",\n      \"incorrectText\": \"over\",\n      \"correctedText\": \"excessive\",\n      \"explanation\": \"'Over' is too vague; 'excessive' is more descriptive.\"\n    },\n    {\n      \"id\": \"err-23\",\n      \"start\": 253,\n      \"end\": 257,\n      \"type\": \"Vocabulary\",\n      \"subType\": \"Word Choice\",\n      \"incorrectText\": \"discourse\",\n      \"correctedText\": \"discussion\",\n      \"explanation\": \"'Discourse' is too academic; 'discussion' is more accessible and appropriate for this context.\"\n    },\n    {\n      \"id\": \"err-24\",\n      \"start\": 263,\n      \"end\": 267,\n      \"type\": \"Vocabulary\",\n      \"subType\": \"Word Choice\",\n      \"incorrectText\": \"discourse\",\n      \"correctedText\": \"discussion\",\n      \"explanation\": \"'Discourse' is too academic; 'discussion' is more accessible and appropriate for this context.\"\n    },\n    {\n      \"id\": \"err-25\",\n      \"start\": 273,\n      \"end\": 277,\n      \"type\": \"Vocabulary\",\n      \"subType\": \"Word Choice\",\n      \"incorrectText\": \"over\",\n      \"correctedText\": \"excessive\",\n      \"explanation\": \"'Over' is too vague; 'excessive' is more descriptive.\"\n    },\n    {\n      \"id\": \"err-26\",\n      \"start\": 283,\n      \"end\": 287,\n      \"type\": \"Vocabulary\",\n      \"subType\": \"Word Choice\",\n      \"incorrectText\": \"discourse\",\n      \"correctedText\": \"discussion\",\n      \"explanation\": \"'Discourse' is too academic; 'discussion' is more accessible and appropriate for this context.\"\n    },\n    {\n      \"id\": \"err-27\",\n      \"start\": 293,\n      \"end\": 297,\n      \"type\": \"Vocabulary\",\n      \"subType\": \"Word Choice\",\n      \"incorrectText\": \"over\",\n      \"correctedText\": \"excessive\",\n      \"explanation\": \"'Over' is too vague; 'excessive' is more descriptive.\"\n    },\n    {\n      \"id\": \"err-28\",\n      \"start\": 303,\n      \"end\": 307,\n      \"type\": \"Vocabulary\",\n      \"subType\": \"Word Choice\",\n      \"incorrectText\": \"discourse\",\n      \"correctedText\": \"discussion\",\n      \"explanation\": \"'Discourse' is too academic; 'discussion' is more accessible and appropriate for this context.\"\n    },\n    {\n      \"id\": \"err-29\",\n      \"start\": 313,\n      \"end\": 317,\n      \"type\": \"Vocabulary\",\n      \"subType\": \"Word Choice\",\n      \"incorrectText\": \"over\",\n      \"correctedText\": \"excessive\",\n      \"explanation\": \"'Over' is too vague; 'excessive' is more descriptive.\"\n    },\n    {\n      \"id\": \"err-30\",\n      \"start\": 317,\n      \"end\": 321,\n      \"type\": \"Vocabulary\",\n      \"subType\": \"Word Choice\",\n      \"incorrectText\": \"discourse\",\n      \"correctedText\": \"discussion\",\n      \"explanation\": \"'Discourse' is too academic; 'discussion' is more accessible and appropriate for this context.\"\n    }\n  ]\n}\n'''  # replace this with your full string

result = clean_and_format_json(raw_json)
if result:
    print(result)



{
  "errorList": [
    {
      "id": "err-1",
      "start": 13,
      "end": 15,
      "type": "Grammar",
      "subType": "Article",
      "incorrectText": "In",
      "correctedText": "It",
      "explanation": "The sentence requires a subject, not 'In'."
    },
    {
      "id": "err-2",
      "start": 25,
      "end": 28,
      "type": "Vocabulary",
      "subType": "Word Choice",
      "incorrectText": "one of the most important parts",
      "correctedText": "a key component",
      "explanation": "More formal and precise language is needed for a discussion of political discourse."
    },
    {
      "id": "err-3",
      "start": 35,
      "end": 39,
      "type": "Grammar",
      "subType": "Verb Form",
      "incorrectText": "has become",
      "correctedText": "has evolved",
      "explanation": "âEvolvedâ is a more appropriate term for describing the change in social mediaâs role."
    },
    {
      "id": "err-4",
      "start": 43,
      "end": 48,
      "type": "Voc

In [4]:
# Example usage
raw_json = '''
{
  "errorList": [
    {
      "id": "err-1",
      "start": 0,
      "end": 23,
      "type": "Grammar",
      "subType": "Subject-Verb Agreement",
      "incorrectText": "In today's modern society, social media has become one of the most important parts of people's lives.",
      "correctedText": "In today's modern society, social media has become an essential part of people's lives.",
      "explanation": "'Parts' should be 'part' as it refers to a singular concept."
    },
    {
      "id": "err-2",
      "start": 107,
      "end": 135,
      "type": "Grammar",
      "subType": "Tense",
      "incorrectText": "political discourse is no longer limited to leaders or experts—everyone can now participate.",
      "correctedText": "political discourse has expanded beyond leaders and experts, allowing everyone to participate.",
      "explanation": "'Is no longer limited' should be 'has expanded beyond' for consistency with the present perfect tense."
    }
  ]
}
'''

cleaned_json = clean_and_format_json(raw_json)
print(cleaned_json)

{
  "errorList": [
    {
      "id": "err-1",
      "start": 0,
      "end": 23,
      "type": "Grammar",
      "subType": "Subject-Verb Agreement",
      "incorrectText": "In today's modern society, social media has become one of the most important parts of people's lives.",
      "correctedText": "In today's modern society, social media has become an essential part of people's lives.",
      "explanation": "'Parts' should be 'part' as it refers to a singular concept."
    },
    {
      "id": "err-2",
      "start": 107,
      "end": 135,
      "type": "Grammar",
      "subType": "Tense",
      "incorrectText": "political discourse is no longer limited to leaders or experts—everyone can now participate.",
      "correctedText": "political discourse has expanded beyond leaders and experts, allowing everyone to participate.",
      "explanation": "'Is no longer limited' should be 'has expanded beyond' for consistency with the present perfect tense."
    }
  ]
}


In [33]:
## Upto here the code is great.

In [8]:
import json
import re

raw_json = '''{\n  \"errorList\": [\n    {\n      \"id\": \"err-1\",\n      \"start\": 13,\n      \"end\": 15,\n      \"type\": \"Grammar\",\n      \"subType\": \"Article\",\n      \"incorrectText\": \"In\",\n      \"correctedText\": \"It\",\n      \"explanation\": \"The sentence requires a subject, not 'In'.\"\n    },\n    {\n      \"id\": \"err-2\",\n      \"start\": 25,\n      \"end\": 28,\n      \"type\": \"Vocabulary\",\n      \"subType\": \"Word Choice\",\n      \"incorrectText\": \"one of the most important parts\",\n      \"correctedText\": \"a key component\",\n      \"explanation\": \"More formal and precise language is needed for a discussion of political discourse.\"\n    },\n    {\n      \"id\": \"err-3\",\n      \"start\": 35,\n      \"end\": 39,\n      \"type\": \"Grammar\",\n      \"subType\": \"Verb Form\",\n      \"incorrectText\": \"has become\",\n      \"correctedText\": \"has evolved\",\n      \"explanation\": \"‘Evolved’ is a more appropriate term for describing the change in social media’s role.\"\n    },\n    {\n      \"id\": \"err-4\",\n      \"start\": 43,\n      \"end\": 48,\n      \"type\": \"Vocabulary\",\n      \"subType\": \"Precision\",\n      \"incorrectText\": \"accessibility\",\n      \"correctedText\": \"availability\",\n      \"explanation\": \"'Accessibility' is too vague; 'availability' is more precise in this context.\"\n    },\n    {\n      \"id\": \"err-5\",\n      \"start\": 53,\n      \"end\": 59,\n      \"type\": \"Grammar\",\n      \"subType\": \"Verb Form\",\n      \"incorrectText\": \"creates\",\n      \"correctedText\": \"contributes to\",\n      \"explanation\": \"‘Contributes to’ is a more nuanced and accurate description of the process.\"\n    },\n    {\n      \"id\": \"err-6\",\n      \"start\": 63,\n      \"end\": 68,\n      \"type\": \"Vocabulary\",\n      \"subType\": \"Word Choice\",\n      \"incorrectText\": \"echo chambers\",\n      \"correctedText\": \"filter bubbles\",\n      \"explanation\": \"'Echo chambers' is a common term, but 'filter bubbles' is more descriptive of the algorithmic process.\"\n    },\n    {\n      \"id\": \"err-7\",\n      \"start\": 74,\n      \"end\": 79,\n      \"type\": \"Grammar\",\n      \"subType\": \"Verb Form\",\n      \"incorrectText\": \"contributes to\",\n      \"correctedText\": \"leads to\",\n      \"explanation\": \"'Leads to' is a stronger and more direct way to express the cause-and-effect relationship.\"\n    },\n    {\n      \"id\": \"err-8\",\n      \"start\": 84,\n      \"end\": 89,\n      \"type\": \"Vocabulary\",\n      \"subType\": \"Word Choice\",\n      \"incorrectText\": \"misinformation\",\n      \"correctedText\": \"false narratives\",\n      \"explanation\": \"'Misinformation' is a broad term; 'false narratives' is more specific and descriptive.\"\n    },\n    {\n      \"id\": \"err-9\",\n      \"start\": 96,\n      \"end\": 99,\n      \"type\": \"Grammar\",\n      \"subType\": \"Verb Form\",\n      \"incorrectText\": \"share\",\n      \"correctedText\": \"disseminate\",\n      \"explanation\": \"'Disseminate' is a more formal and appropriate term for spreading information.\"\n    },\n    {\n      \"id\": \"err-10\",\n      \"start\": 104,\n      \"end\": 109,\n      \"type\": \"Vocabulary\",\n      \"subType\": \"Word Choice\",\n      \"incorrectText\": \"harassment\",\n      \"correctedText\": \"abuse\",\n      \"explanation\": \"'Harassment' is too broad; 'abuse' is more specific to the context of online interactions.\"\n    },\n    {\n      \"id\": \"err-11\",\n      \"start\": 116,\n      \"end\": 119,\n      \"type\": \"Vocabulary\",\n      \"subType\": \"Word Choice\",\n      \"incorrectText\": \"discourse\",\n      \"correctedText\": \"debate\",\n      \"explanation\": \"'Discourse' is too academic; 'debate' is more accessible and appropriate for this context.\"\n    },\n    {\n      \"id\": \"err-12\",\n      \"start\": 131,\n      \"end\": 133,\n      \"type\": \"Vocabulary\",\n      \"subType\": \"Word Choice\",\n      \"incorrectText\": \"over\",\n      \"correctedText\": \"excessive\",\n      \"explanation\": \"'Over' is too vague; 'excessive' is more descriptive.\"\n    },\n    {\n      \"id\": \"err-13\",\n      \"start\": 146,\n      \"end\": 149,\n      \"type\": \"Vocabulary\",\n      \"subType\": \"Word Choice\",\n      \"incorrectText\": \"discourse\",\n      \"correctedText\": \"discussion\",\n      \"explanation\": \"'Discourse' is too academic; 'discussion' is more accessible and appropriate for this context.\"\n    },\n    {\n      \"id\": \"err-14\",\n      \"start\": 163,\n      \"end\": 167,\n      \"type\": \"Vocabulary\",\n      \"subType\": \"Word Choice\",\n      \"incorrectText\": \"over\",\n      \"correctedText\": \"excessive\",\n      \"explanation\": \"'Over' is too vague; 'excessive' is more descriptive.\"\n    },\n    {\n      \"id\": \"err-15\",\n      \"start\": 173,\n      \"end\": 177,\n      \"type\": \"Vocabulary\",\n      \"subType\": \"Word Choice\",\n      \"incorrectText\": \"discourse\",\n      \"correctedText\": \"discussion\",\n      \"explanation\": \"'Discourse' is too academic; 'discussion' is more accessible and appropriate for this context.\"\n    },\n    {\n      \"id\": \"err-16\",\n      \"start\": 187,\n      \"end\": 189,\n      \"type\": \"Vocabulary\",\n      \"subType\": \"Word Choice\",\n      \"incorrectText\": \"over\",\n      \"correctedText\": \"excessive\",\n      \"explanation\": \"'Over' is too vague; 'excessive' is more descriptive.\"\n    },\n    {\n      \"id\": \"err-17\",\n      \"start\": 197,\n      \"end\": 201,\n      \"type\": \"Vocabulary\",\n      \"subType\": \"Word Choice\",\n      \"incorrectText\": \"discourse\",\n      \"correctedText\": \"discussion\",\n      \"explanation\": \"'Discourse' is too academic; 'discussion' is more accessible and appropriate for this context.\"\n    },\n    {\n      \"id\": \"err-18\",\n      \"start\": 208,\n      \"end\": 212,\n      \"type\": \"Vocabulary\",\n      \"subType\": \"Word Choice\",\n      \"incorrectText\": \"over\",\n      \"correctedText\": \"excessive\",\n      \"explanation\": \"'Over' is too vague; 'excessive' is more descriptive.\"\n    },\n    {\n      \"id\": \"err-19\",\n      \"start\": 218,\n      \"end\": 221,\n      \"type\": \"Vocabulary\",\n      \"subType\": \"Word Choice\",\n      \"incorrectText\": \"discourse\",\n      \"correctedText\": \"discussion\",\n      \"explanation\": \"'Discourse' is too academic; 'discussion' is more accessible and appropriate for this context.\"\n    },\n    {\n      \"id\": \"err-20\",\n      \"start\": 227,\n      \"end\": 231,\n      \"type\": \"Vocabulary\",\n      \"subType\": \"Word Choice\",\n      \"incorrectText\": \"over\",\n      \"correctedText\": \"excessive\",\n      \"explanation\": \"'Over' is too vague; 'excessive' is more descriptive.\"\n    },\n    {\n      \"id\": \"err-21\",\n      \"start\": 233,\n      \"end\": 237,\n      \"type\": \"Vocabulary\",\n      \"subType\": \"Word Choice\",\n      \"incorrectText\": \"discourse\",\n      \"correctedText\": \"discussion\",\n      \"explanation\": \"'Discourse' is too academic; 'discussion' is more accessible and appropriate for this context.\"\n    },\n    {\n      \"id\": \"err-22\",\n      \"start\": 243,\n      \"end\": 247,\n      \"type\": \"Vocabulary\",\n      \"subType\": \"Word Choice\",\n      \"incorrectText\": \"over\",\n      \"correctedText\": \"excessive\",\n      \"explanation\": \"'Over' is too vague; 'excessive' is more descriptive.\"\n    },\n    {\n      \"id\": \"err-23\",\n      \"start\": 253,\n      \"end\": 257,\n      \"type\": \"Vocabulary\",\n      \"subType\": \"Word Choice\",\n      \"incorrectText\": \"discourse\",\n      \"correctedText\": \"discussion\",\n      \"explanation\": \"'Discourse' is too academic; 'discussion' is more accessible and appropriate for this context.\"\n    },\n    {\n      \"id\": \"err-24\",\n      \"start\": 263,\n      \"end\": 267,\n      \"type\": \"Vocabulary\",\n      \"subType\": \"Word Choice\",\n      \"incorrectText\": \"discourse\",\n      \"correctedText\": \"discussion\",\n      \"explanation\": \"'Discourse' is too academic; 'discussion' is more accessible and appropriate for this context.\"\n    },\n    {\n      \"id\": \"err-25\",\n      \"start\": 273,\n      \"end\": 277,\n      \"type\": \"Vocabulary\",\n      \"subType\": \"Word Choice\",\n      \"incorrectText\": \"over\",\n      \"correctedText\": \"excessive\",\n      \"explanation\": \"'Over' is too vague; 'excessive' is more descriptive.\"\n    },\n    {\n      \"id\": \"err-26\",\n      \"start\": 283,\n      \"end\": 287,\n      \"type\": \"Vocabulary\",\n      \"subType\": \"Word Choice\",\n      \"incorrectText\": \"discourse\",\n      \"correctedText\": \"discussion\",\n      \"explanation\": \"'Discourse' is too academic; 'discussion' is more accessible and appropriate for this context.\"\n    },\n    {\n      \"id\": \"err-27\",\n      \"start\": 293,\n      \"end\": 297,\n      \"type\": \"Vocabulary\",\n      \"subType\": \"Word Choice\",\n      \"incorrectText\": \"over\",\n      \"correctedText\": \"excessive\",\n      \"explanation\": \"'Over' is too vague; 'excessive' is more descriptive.\"\n    },\n    {\n      \"id\": \"err-28\",\n      \"start\": 303,\n      \"end\": 307,\n      \"type\": \"Vocabulary\",\n      \"subType\": \"Word Choice\",\n      \"incorrectText\": \"discourse\",\n      \"correctedText\": \"discussion\",\n      \"explanation\": \"'Discourse' is too academic; 'discussion' is more accessible and appropriate for this context.\"\n    },\n    {\n      \"id\": \"err-29\",\n      \"start\": 313,\n      \"end\": 317,\n      \"type\": \"Vocabulary\",\n      \"subType\": \"Word Choice\",\n      \"incorrectText\": \"over\",\n      \"correctedText\": \"excessive\",\n      \"explanation\": \"'Over' is too vague; 'excessive' is more descriptive.\"\n    },\n    {\n      \"id\": \"err-30\",\n      \"start\": 317,\n      \"end\": 321,\n      \"type\": \"Vocabulary\",\n      \"subType\": \"Word Choice\",\n      \"incorrectText\": \"discourse\",\n      \"correctedText\": \"discussion\",\n      \"explanation\": \"'Discourse' is too academic; 'discussion' is more accessible and appropriate for this context.\"\n    }\n  ]\n}\n'''  # Your JSON string (as shown)

# Step 1: Extract all possible JSON-like objects from inside the "errorList" array
object_matches = re.findall(r'\{[^{}]+\}', raw_json)

valid_objects = []
for obj_str in object_matches:
    try:
        # Try parsing each object to ensure it's valid JSON
        parsed = json.loads(obj_str)
        required_keys = {"id", "start", "end", "type", "subType", "incorrectText", "correctedText", "explanation"}
        # Only keep the object if it has all required keys
        if required_keys.issubset(parsed.keys()):
            valid_objects.append(parsed)
    except json.JSONDecodeError:
        continue  # Skip invalid JSON fragments

# Step 2: Reconstruct a clean JSON
clean_json = {"errorList": valid_objects}

# Optional: pretty print
print(json.dumps(clean_json, indent=2,ensure_ascii=False))


{
  "errorList": [
    {
      "id": "err-1",
      "start": 13,
      "end": 15,
      "type": "Grammar",
      "subType": "Article",
      "incorrectText": "In",
      "correctedText": "It",
      "explanation": "The sentence requires a subject, not 'In'."
    },
    {
      "id": "err-2",
      "start": 25,
      "end": 28,
      "type": "Vocabulary",
      "subType": "Word Choice",
      "incorrectText": "one of the most important parts",
      "correctedText": "a key component",
      "explanation": "More formal and precise language is needed for a discussion of political discourse."
    },
    {
      "id": "err-3",
      "start": 35,
      "end": 39,
      "type": "Grammar",
      "subType": "Verb Form",
      "incorrectText": "has become",
      "correctedText": "has evolved",
      "explanation": "‘Evolved’ is a more appropriate term for describing the change in social media’s role."
    },
    {
      "id": "err-4",
      "start": 43,
      "end": 48,
      "type": "Vocabular

In [4]:
JSON_STR='''
In today's modern society, social media has become one of the most important parts of people's lives. Its role has evolved from simply being a communication tool to a platform for sharing opinions, news, and even political beliefs. With the rise of platforms like Twitter, Facebook, and Instagram, political discourse is no longer limited to leaders or experts—everyone can now participate. While this shift has brought some benefits, it has also created several problems in how people discuss and understand politics. First of all, social media has made it very easy for everyone to share their opinions. Before its rise, people relied on newspapers or television to learn about political events. Now, anyone can post an opinion or share a news article without verifying its accuracy. This accessibility has increased political engagement, but it has also raised the risk of misinformation spreading rapidly. Sometimes, a tweet or post goes viral within minutes—even if it’s completely false. This can heavily influence public opinion and decision-making. Another significant issue is the creation of *echo chambers*. This occurs when a person only sees posts, videos, or information that align with their existing beliefs. Most social media platforms use algorithms that show users content based on their past behavior. So, if someone supports a particular political party, they’re more likely to see content that reinforces their views and less of the opposing side. As a result, people may start believing their perspective is the only correct one, which makes political conversations more hostile and less open-minded. The rise of fake news and misinformation is also a serious concern. Political groups or malicious actors can easily create and spread false stories online. Many users don’t verify facts before sharing a post—they believe it because it matches their opinion or was shared by someone they trust. During elections, this becomes especially dangerous, as fake news can influence how people vote or view a candidate. Several recent elections around the world have shown the harmful effects of misinformation. Social media also enables the spread of online harassment. Political leaders, journalists, and even ordinary people often face abusive comments or threats for sharing opinions that others disagree with. This kind of hostile environment can discourage individuals from expressing their views. In a healthy democracy, people should feel safe discussing politics without fear of backlash or intimidation. However, it's not all negative. Social media has also played a major role in raising awareness about important political issues. Movements like #BlackLivesMatter and global climate change protests gained widespread attention and support thanks to social media platforms. People across the world can now connect with and support causes in other countries. It has amplified voices that were previously unheard, and has especially encouraged younger generations to become more politically active and vocal. In some cases, political leaders use social media to communicate directly with the public. This removes the traditional gatekeepers like news channels and allows leaders to explain policies, share updates, or respond to citizens. This can increase transparency and accessibility. However, it also depends on how responsibly leaders use these platforms. If used to spread hate or false information, the impact can be harmful. Another downside is the oversimplification of complex ideas. Political discussions require depth, evidence, and thoughtful reflection. But social media often favors short posts—like a 280-character tweet—which can make debates shallow. Many users focus on "winning" arguments instead of trying to understand other viewpoints. Finally, social media has influenced how news is produced and shared. Many news outlets now prioritize content that gets more likes and shares, sometimes at the expense of accuracy. Sensational headlines and clickbait have become common. This turns political coverage into entertainment and shifts attention away from serious issues. In conclusion, social media has transformed political discourse in significant ways. It has made political conversations more open and participatory, but also more polarized and emotionally charged. Challenges like fake news, echo chambers, online abuse, and shallow debates must be addressed. At the same time, the ability to raise awareness and include diverse voices is a powerful benefit. Moving forward, we must use social media responsibly—through education, critical thinking, fact-checking, and respectful dialogue—to ensure that its impact on politics is more positive than negative.
'''

In [22]:
import json
import re

# JSON data (errorList) and text string
# json_data = json.dumps(clean_json, indent=2,ensure_ascii=False)  # Paste your JSON here
json_data = clean_json
# JSON_STR = '''In today's modern society, social media has become one of the most important parts of people's lives. ...'''  # Your full text here

used_ranges = set()

def is_overlapping(a_start, a_end, b_start, b_end):
    return not (a_end <= b_start or a_start >= b_end)

def find_next_occurrence(text, phrase, used_ranges):
    for match in re.finditer(re.escape(phrase), text):
        start, end = match.start(), match.end()
        if not any(is_overlapping(start, end, us, ue) for us, ue in used_ranges):
            used_ranges.add((start, end))
            return start, end
    return -1, -1  # Not found

# Update error positions
for err in json_data["errorList"]:
    phrase = err["incorrectText"]
    start, end = find_next_occurrence(JSON_STR, phrase, used_ranges)
    if start != -1:
        err["start"] = start
        err["end"] = end
    else:
        err["start"] = -1
        err["end"] = -1
        print(f"'{err['id']}'  Warning: '{phrase}' not found uniquely in text.")

# Print updated JSON
print(json.dumps(json_data, indent=2, ensure_ascii=False))


{
  "errorList": [
    {
      "id": "err-1",
      "start": 1,
      "end": 3,
      "type": "Grammar",
      "subType": "Article",
      "incorrectText": "In",
      "correctedText": "It",
      "explanation": "The sentence requires a subject, not 'In'."
    },
    {
      "id": "err-2",
      "start": 52,
      "end": 83,
      "type": "Vocabulary",
      "subType": "Word Choice",
      "incorrectText": "one of the most important parts",
      "correctedText": "a key component",
      "explanation": "More formal and precise language is needed for a discussion of political discourse."
    },
    {
      "id": "err-3",
      "start": 41,
      "end": 51,
      "type": "Grammar",
      "subType": "Verb Form",
      "incorrectText": "has become",
      "correctedText": "has evolved",
      "explanation": "‘Evolved’ is a more appropriate term for describing the change in social media’s role."
    },
    {
      "id": "err-4",
      "start": 792,
      "end": 805,
      "type": "Vocabular

In [21]:
JSON_STR[253:257]

'tfor'

In [31]:
# Assuming `json_data` is still your dictionary, not a string
for json_d in json_data['errorList']:
    if json_d['start'] != -1 and json_d['end'] != -1:
        start = json_d['start']
        end = json_d['end']
        print(f"{JSON_STR[start:end]} ----- incorrectText= {json_d['incorrectText']} \n")


In ----- incorrectText= In 

one of the most important parts ----- incorrectText= one of the most important parts 

has become ----- incorrectText= has become 

accessibility ----- incorrectText= accessibility 

echo chambers ----- incorrectText= echo chambers 

misinformation ----- incorrectText= misinformation 

share ----- incorrectText= share 

harassment ----- incorrectText= harassment 

discourse ----- incorrectText= discourse 

over ----- incorrectText= over 

discourse ----- incorrectText= discourse 

over ----- incorrectText= over 



In [35]:
import json
import re

json_data = clean_json  # already a parsed JSON dict
used_ranges = set()
last_index = 0

def is_overlapping(start, end, used_ranges):
    return any(not (end <= us or start >= ue) for us, ue in used_ranges)

def find_next_ordered_occurrence(text, phrase, used_ranges, start_index):
    for match in re.finditer(re.escape(phrase), text[start_index:]):
        start = match.start() + start_index
        end = match.end() + start_index
        if not is_overlapping(start, end, used_ranges):
            used_ranges.add((start, end))
            return start, end
    return -1, -1

# === MAIN LOOP ===
for err in json_data["errorList"]:
    phrase = err["incorrectText"]
    start, end = find_next_ordered_occurrence(JSON_STR, phrase, used_ranges, last_index)

    if start != -1:
        err["start"] = start
        err["end"] = end
        last_index = end  # advance pointer to after current match
        print(f"✅ Matched: '{JSON_STR[start:end]}' for incorrectText='{phrase}'")
    else:
        err["start"] = -1
        err["end"] = -1
        print(f"❌ Could not match: '{phrase}'")

# Optional: print updated result
print(json.dumps(json_data, indent=2, ensure_ascii=False))


✅ Matched: 'In' for incorrectText='In'
✅ Matched: 'one of the most important parts' for incorrectText='one of the most important parts'
❌ Could not match: 'has become'
✅ Matched: 'accessibility' for incorrectText='accessibility'
❌ Could not match: 'creates'
✅ Matched: 'echo chambers' for incorrectText='echo chambers'
❌ Could not match: 'contributes to'
✅ Matched: 'misinformation' for incorrectText='misinformation'
✅ Matched: 'share' for incorrectText='share'
✅ Matched: 'harassment' for incorrectText='harassment'
✅ Matched: 'discourse' for incorrectText='discourse'
❌ Could not match: 'over'
❌ Could not match: 'discourse'
❌ Could not match: 'over'
❌ Could not match: 'discourse'
❌ Could not match: 'over'
❌ Could not match: 'discourse'
❌ Could not match: 'over'
❌ Could not match: 'discourse'
❌ Could not match: 'over'
❌ Could not match: 'discourse'
❌ Could not match: 'over'
❌ Could not match: 'discourse'
❌ Could not match: 'discourse'
❌ Could not match: 'over'
❌ Could not match: 'discourse'

In [37]:
import json
import re

# Your essay
essay = """In today's modern society, social media has become one of the most important parts of people's lives. Its role has evolved from simply being a communication tool to a platform for sharing opinions, news, and even political beliefs. With the rise of platforms like Twitter, Facebook, and Instagram, political discourse is no longer limited to leaders or experts—everyone can now participate. While this shift has brought some benefits, it has also created several problems in how people discuss and understand politics. First of all, social media has made it very easy for everyone to share their opinions. Before its rise, people relied on newspapers or television to learn about political events. Now, anyone can post an opinion or share a news article without verifying its accuracy. This accessibility has increased political engagement, but it has also raised the risk of misinformation spreading rapidly. Sometimes, a tweet or post goes viral within minutes—even if it’s completely false. This can heavily influence public opinion and decision-making. Another significant issue is the creation of *echo chambers*. This occurs when a person only sees posts, videos, or information that align with their existing beliefs. Most social media platforms use algorithms that show users content based on their past behavior. So, if someone supports a particular political party, they’re more likely to see content that reinforces their views and less of the opposing side. As a result, people may start believing their perspective is the only correct one, which makes political conversations more hostile and less open-minded. The rise of fake news and misinformation is also a serious concern. Political groups or malicious actors can easily create and spread false stories online. Many users don’t verify facts before sharing a post—they believe it because it matches their opinion or was shared by someone they trust. During elections, this becomes especially dangerous, as fake news can influence how people vote or view a candidate. Several recent elections around the world have shown the harmful effects of misinformation. Social media also enables the spread of online harassment. Political leaders, journalists, and even ordinary people often face abusive comments or threats for sharing opinions that others disagree with. This kind of hostile environment can discourage individuals from expressing their views. In a healthy democracy, people should feel safe discussing politics without fear of backlash or intimidation. However, it's not all negative. Social media has also played a major role in raising awareness about important political issues. Movements like #BlackLivesMatter and global climate change protests gained widespread attention and support thanks to social media platforms. People across the world can now connect with and support causes in other countries. It has amplified voices that were previously unheard, and has especially encouraged younger generations to become more politically active and vocal. In some cases, political leaders use social media to communicate directly with the public. This removes the traditional gatekeepers like news channels and allows leaders to explain policies, share updates, or respond to citizens. This can increase transparency and accessibility. However, it also depends on how responsibly leaders use these platforms. If used to spread hate or false information, the impact can be harmful. Another downside is the oversimplification of complex ideas. Political discussions require depth, evidence, and thoughtful reflection. But social media often favors short posts—like a 280-character tweet—which can make debates shallow. Many users focus on "winning" arguments instead of trying to understand other viewpoints. Finally, social media has influenced how news is produced and shared. Many news outlets now prioritize content that gets more likes and shares, sometimes at the expense of accuracy. Sensational headlines and clickbait have become common. This turns political coverage into entertainment and shifts attention away from serious issues. In conclusion, social media has transformed political discourse in significant ways. It has made political conversations more open and participatory, but also more polarized and emotionally charged. Challenges like fake news, echo chambers, online abuse, and shallow debates must be addressed. At the same time, the ability to raise awareness and include diverse voices is a powerful benefit. Moving forward, we must use social media responsibly—through education, critical thinking, fact-checking, and respectful dialogue—to ensure that its impact on politics is more positive than negative."""

# Tokenize essay into words
words = re.findall(r"\b\w[\w'-]*\b", essay)

# # Load your original error JSON
# with open("errors.json") as f:
#     data = json.load(f)
data=clean_json
# Rebuild the essay as space-joined string of word tokens
joined = " ".join(words)

# Fix error indices
for err in data["errorList"]:
    phrase = err["incorrectText"]
    phrase_tokens = re.findall(r"\b\w[\w'-]*\b", phrase)
    
    # Try to find the phrase in word list
    found = False
    for i in range(len(words) - len(phrase_tokens) + 1):
        if words[i:i+len(phrase_tokens)] == phrase_tokens:
            err["start"] = i
            err["end"] = i + len(phrase_tokens)
            found = True
            break
    if not found:
        err["start"] = -1
        err["end"] = -1
        print(f"Not found: '{phrase}'")

# Output corrected errors
print(json.dumps(data, indent=2))


Not found: 'creates'
Not found: 'contributes to'
Not found: 'over'
Not found: 'over'
Not found: 'over'
Not found: 'over'
Not found: 'over'
Not found: 'over'
Not found: 'over'
Not found: 'over'
Not found: 'over'
{
  "errorList": [
    {
      "id": "err-1",
      "start": 0,
      "end": 1,
      "type": "Grammar",
      "subType": "Article",
      "incorrectText": "In",
      "correctedText": "It",
      "explanation": "The sentence requires a subject, not 'In'."
    },
    {
      "id": "err-2",
      "start": 8,
      "end": 14,
      "type": "Vocabulary",
      "subType": "Word Choice",
      "incorrectText": "one of the most important parts",
      "correctedText": "a key component",
      "explanation": "More formal and precise language is needed for a discussion of political discourse."
    },
    {
      "id": "err-3",
      "start": 6,
      "end": 8,
      "type": "Grammar",
      "subType": "Verb Form",
      "incorrectText": "has become",
      "correctedText": "has evolved"