# In [52]:
import ast
import io
import json
import tokenize
from typing import Tuple, List, Any
import re

def _find_list_end(code: str, start: int) -> int:
    """
    Return the index of the ']' that closes the '[' located at ``code[start]``.

    Brackets that appear inside string literals (single-, double- or
    triple-quoted) or inside ``#`` comments are ignored, and nested lists are
    tracked with a depth counter.

    Returns:
        The index of the matching ']', or -1 if the bracket is never closed
        (including the case of an unterminated string literal).
    """
    depth = 0
    i = start
    n = len(code)
    while i < n:
        ch = code[i]
        if ch == '#':
            # comment: nothing up to the end of the line can close the list
            newline = code.find('\n', i)
            if newline == -1:
                return -1
            i = newline + 1
            continue
        if ch in ('"', "'"):
            # string literal: skip to its closing delimiter
            delim = ch * 3 if code.startswith(ch * 3, i) else ch
            i += len(delim)
            while i < n and not code.startswith(delim, i):
                # a backslash always protects the character that follows it
                i += 2 if code[i] == '\\' else 1
            if i >= n:
                return -1
            i += len(delim)
            continue
        if ch == '[':
            depth += 1
        elif ch == ']':
            depth -= 1
            if depth == 0:
                return i
        i += 1
    return -1


def extract_and_parse_prompt_list(code: str) -> List[Tuple[str, ...]]:
    """
    From a string of Python code, finds the first occurrence of
        = [ ... ]
    and parses that bracketed literal into a List[Tuple[str, ...]].

    The closing bracket is the one that actually matches the opening '[':
    nested lists and ']' characters inside string literals or comments do not
    end the literal early.

    Args:
        code: Python source text containing an assignment of a list literal
            whose items are tuples or lists of strings.

    Returns:
        One tuple of strings per item of the parsed list.

    Raises:
        ValueError if no list literal is found, it is malformed, or it is not
        a list of tuples/lists of strings.
    """
    # 1) locate the first "= [" and the ']' that matches that '['
    m = re.search(r'=\s*\[', code)
    if not m:
        raise ValueError("No list literal found after an '=' in the code")
    list_start = m.end() - 1
    list_end = _find_list_end(code, list_start)
    if list_end == -1:
        raise ValueError("Malformed list literal: no matching ']' found")
    list_str = code[list_start:list_end + 1]

    # 2) safely evaluate it (only literals)
    try:
        data: Any = ast.literal_eval(list_str)
    except (SyntaxError, ValueError, TypeError) as e:
        # TypeError covers literals such as a dict with an unhashable key
        raise ValueError(f"Malformed list literal: {e}") from e

    # 3) validate shape
    if not isinstance(data, list) or not all(
        isinstance(item, (list, tuple)) and all(isinstance(x, str) for x in item)
        for item in data
    ):
        raise ValueError(
            "Parsed object is not a list of tuples/lists of strings"
        )

    # 4) convert to List[Tuple[str, ...]] (every element is already a str)
    return [tuple(item) for item in data]


def _fix_quote_issues(code: str) -> str:
    """
    Fix common malformed string literal issues inside the first list literal
    that follows an '=' in the provided code string.

    Approach:
      - Find the first assignment to a list: locate the '[' that starts the list and its matching ']'.
      - For each tuple inside that list, identify string elements and normalize them:
        - Protect apostrophes inside words (cell's) so they are not misinterpreted.
        - Convert inner single-quoted phrases (e.g. 'ground glass') to double-quoted phrases.
        - Emit a safe Python string literal using json.dumps(...) (double-quoted, properly escaped).
      - Rebuild and return corrected code (only the list region is changed).
    """
    # find the bracketed list following the first '='
    eq_m = re.search(r'=\s*\[', code)
    if not eq_m:
        # nothing to fix
        return code

    list_start = code.find('[', eq_m.start())
    if list_start == -1:
        return code

    # find matching closing ']' for the list (simple bracket counter)
    depth = 0
    list_end = None
    for i in range(list_start, len(code)):
        c = code[i]
        if c == '[':
            depth += 1
        elif c == ']':
            depth -= 1
            if depth == 0:
                list_end = i
                break
    if list_end is None:
        # can't find end bracket: give up and return original
        return code

    list_region = code[list_start:list_end + 1]

    def fix_tuple_region(tuple_region: str) -> str:
        """
        tuple_region contains '(' ... ')' inclusive. We fix each element string inside.
        """
        assert tuple_region.startswith('(') and tuple_region.endswith(')')
        inner = tuple_region[1:-1]
        out_parts: List[str] = ['(']
        pos = 0
        L = len(inner)

        while pos < L:
            # copy leading whitespace
            ws_match = re.match(r'\s*', inner[pos:])
            if ws_match:
                ws = ws_match.group(0)
                out_parts.append(ws)
                pos += len(ws)

            if pos >= L:
                break

            ch = inner[pos]
            if ch in ("'", '"'):
                # heuristically find the closing quote that is followed only by optional whitespace
                # and then comma or end-of-tuple
                quote = ch
                j = pos + 1
                found_closing = False
                while j < L:
                    if inner[j] == '\\':
                        # skip escaped char
                        j += 2
                        continue
                    if inner[j] == quote:
                        # lookahead
                        k = j + 1
                        while k < L and inner[k].isspace():
                            k += 1
                        if k >= L or inner[k] in (',',):
                            found_closing = True
                            break
                    j += 1

                if not found_closing:
                    # fallback: try to find a closing quote before the next top-level comma
                    next_comma = inner.find(',', pos)
                    if next_comma == -1:
                        # take the rest
                        j = L - 1
                    else:
                        # take up to just before the comma
                        j = next_comma - 1
                        # ensure j is within bounds
                        if j < pos:
                            j = pos

                # element raw includes the quotes (pos .. j)
                element_raw = inner[pos:j+1]
                # find if there's trailing whitespace and a comma immediately after j
                k = j + 1
                trailing_ws = ''
                while k < L and inner[k].isspace():
                    trailing_ws += inner[k]
                    k += 1
                trailing_comma = ''
                if k < L and inner[k] == ',':
                    trailing_comma = ','
                    k += 1

                # original inner content (without the outer quotes)
                orig_inner = element_raw[1:-1]

                # --- transform inner text ---
                temp = orig_inner

                # protect word-internal apostrophes so they are not confused with quote-pairs
                temp = re.sub(r"(?<=\w)'(?=\w)", "__APOST__", temp)

                # convert inner single-quoted phrases to double-quoted phrases
                # (only acts on remaining single-quote pairs)
                temp = re.sub(r"'([^']*?)'", r'"\1"', temp)

                # restore protected apostrophes
                temp = temp.replace("__APOST__", "'")

                # emit a safe Python literal using json.dumps (double-quoted and escaped)
                safe_literal = json.dumps(temp)

                out_parts.append(safe_literal)
                if trailing_ws:
                    out_parts.append(trailing_ws)
                if trailing_comma:
                    out_parts.append(trailing_comma)
                pos = k
            else:
                # non-quoted token (e.g., stray tokens) -- copy until next comma
                next_comma = inner.find(',', pos)
                if next_comma == -1:
                    out_parts.append(inner[pos:])
                    pos = L
                else:
                    out_parts.append(inner[pos:next_comma])
                    out_parts.append(',')
                    pos = next_comma + 1

        out_parts.append(')')
        return ''.join(out_parts)

    # Walk the list_region and replace each top-level tuple region with its fixed version.
    fixed_list_builder: List[str] = []
    i = 0
    N = len(list_region)
    while i < N:
        c = list_region[i]
        if c == '(':
            # find matching ')'
            p = i
            depth = 0
            while p < N:
                if list_region[p] == '(':
                    depth += 1
                elif list_region[p] == ')':
                    depth -= 1
                    if depth == 0:
                        break
                p += 1
            if p >= N:
                # can't find matching ) - copy rest and break
                fixed_list_builder.append(list_region[i:])
                break
            tup = list_region[i:p+1]
            fixed_tup = fix_tuple_region(tup)
            fixed_list_builder.append(fixed_tup)
            i = p + 1
        else:
            fixed_list_builder.append(c)
            i += 1

    fixed_list_region = ''.join(fixed_list_builder)

    # Rebuild the full code
    fixed_code = code[:list_start] + fixed_list_region + code[list_end + 1:]
    return fixed_code



# Demo input for the quote fixer: an assignment of a list of tuples whose
# single-quoted strings contain unescaped inner quotes ('ground glass',
# 'smudged', 'frosted glass') and a word-internal apostrophe (cell's), so the
# text between the triple quotes is not valid Python as written.
forced_double_str = _fix_quote_issues("""
prompts = [
    (
        'Basophil granules are large, irregular, and often mask the nucleus, creating a dark blue, almost black, appearance.',
        'Eosinophil granules are consistently bright orange-red and demonstrate a radial arrangement throughout the cytoplasm.',
        'Lymphocyte nucleus is hyperchromatic and occupies at least 80% of the cell volume, with minimal visible cytoplasm.',
        'Monocyte cytoplasm is abundant, gray-blue, and frequently contains fine azurophilic granules and occasional Russell bodies.',
        'Neutrophil granules are pale lilac and exhibit a subtle shift in color towards pink in response to inflammation.'
    ),
    (
        'Basophil cytoplasm is sparse and filled with metachromatic granules that stain purple-black with Wright-Giemsa.',
        'Eosinophil granules are highly refractile, appearing almost crystalline under high magnification.',
        'Lymphocyte cytoplasm is typically a thin, clear rim, sometimes exhibiting a slight perinuclear halo.',
        'Monocyte cytoplasm displays a 'ground glass' appearance due to the fine granularity and dispersed chromatin.',
        'Neutrophil cytoplasm is characterized by numerous fine, neutral granules that create a slightly hazy, 'smudged' effect.'
    ),
    (
        'Basophil granules are often so numerous that they distort the cell's shape, creating an irregular outline.',
        'Eosinophil granules are uniform in size and staining intensity, creating a homogenous orange-red cytoplasmic background.',
        'Lymphocyte chromatin is densely packed, with minimal visible nucleoli.',
        'Monocyte nucleus is often eccentric, positioned towards the periphery of the cell.',
        'Neutrophil segmentation is highly variable, ranging from bilobed to five-lobed forms.'
    ),
    (
        'Basophil granules release histamine and heparin, causing a metachromatic shift in staining.',
        'Eosinophil granules contain major basic protein, contributing to their intense eosinophilic staining.',
        'Lymphocyte cytoplasm lacks significant glycogen stores, resulting in a pale appearance.',
        'Monocyte cytoplasm contains lysosomes and phagosomes, indicating active phagocytosis.',
        'Neutrophil granules contain enzymes like myeloperoxidase, essential for bacterial killing.'
    ),
    (
        'Basophil granules are often found in clusters, creating localized areas of intense staining.',
        'Eosinophil granules are evenly distributed throughout the cytoplasm, creating a consistent orange-red hue.',
        'Lymphocyte nucleus is round and exhibits a smooth, regular contour.',
        'Monocyte cytoplasm is often extended into pseudopods, facilitating cell movement.',
        'Neutrophil granules are released during inflammation, contributing to the inflammatory response.'
    ),
    (
        'Basophil granules are readily degranulated upon stimulation, releasing their contents into the surrounding environment.',
        'Eosinophil granules are resistant to degradation, maintaining their staining intensity even after prolonged storage.',
        'Lymphocyte cytoplasm is often scant and contains few organelles.',
        'Monocyte cytoplasm is rich in ribosomes, reflecting their active protein synthesis.',
        'Neutrophil granules are categorized into primary, secondary, and tertiary granules, each with distinct functions.'
    ),
    (
        'Basophil granules exhibit a metachromatic effect, staining differently depending on the pH of the stain.',
        'Eosinophil granules are involved in the killing of parasites and modulation of allergic responses.',
        'Lymphocyte nucleus is often slightly indented, but generally maintains a round shape.',
        'Monocyte cytoplasm contains numerous vesicles involved in antigen processing and presentation.',
        'Neutrophil granules contain defensins, antimicrobial peptides that kill bacteria and fungi.'
    ),
    (
        'Basophil granules are the largest of all leukocyte granules, often exceeding 1 micrometer in diameter.',
        'Eosinophil granules are characterized by a central core surrounded by a dense matrix.',
        'Lymphocyte cytoplasm is typically devoid of prominent inclusions or vacuoles.',
        'Monocyte cytoplasm is often described as 'frosted glass' due to its fine granularity.',
        'Neutrophil granules are essential for the formation of reactive oxygen species, contributing to oxidative burst.'
    )
]
""")


# Show the rewritten source text, then the list of tuples parsed back from it.
print(forced_double_str)
print(extract_and_parse_prompt_list(forced_double_str))



prompts = [
    (
        "Basophil granules are large, irregular, and often mask the nucleus, creating a dark blue, almost black, appearance.",
        "Eosinophil granules are consistently bright orange-red and demonstrate a radial arrangement throughout the cytoplasm.",
        "Lymphocyte nucleus is hyperchromatic and occupies at least 80% of the cell volume, with minimal visible cytoplasm.",
        "Monocyte cytoplasm is abundant, gray-blue, and frequently contains fine azurophilic granules and occasional Russell bodies.",
        "Neutrophil granules are pale lilac and exhibit a subtle shift in color towards pink in response to inflammation."
    ),
    (
        "Basophil cytoplasm is sparse and filled with metachromatic granules that stain purple-black with Wright-Giemsa.",
        "Eosinophil granules are highly refractile, appearing almost crystalline under high magnification.",
        "Lymphocyte cytoplasm is typically a thin, clear rim, sometimes exhibiting a slight peri