In [2]:
import pandas as pd

# Raw prompt table. Every row carries the same "DeepSeek-R1" value in the label
# column, so only (prompt_level, descriptive_prompt) is spelled out per row and
# the shared label value is inserted while the tuples are built.
data = [
    (prompt_level, "DeepSeek-R1", descriptive_prompt)
    for prompt_level, descriptive_prompt in [
        ("patch", "Clustered large nuclei with prominent nucleoli: Fine, wavy fibrous mesh structures; organized cellular clusters surrounded by supporting cells; thickened nerve bundles adjacent to cellular aggregates; specific immunohistochemical staining patterns"),
        ("patch", "Enlarged, densely packed nerve fibers without clustered nuclei: Increased density of linear, disorganized fibers; absence of organized cellular clusters; thickened muscular layers with sparse cellularity; inflammatory cell aggregates obscuring tissue architecture; submucosal layer thinning or reduced cellularity; excessive connective tissue deposition; lack of immunohistochemical staining in expected regions"),
        ("patch", "Absence of clustered ganglion cells: Uniform smooth muscle layers lacking neural structures; sparse vascular networks without branching complexity; homogeneous mucosal architecture lacking specialized zones; minimal inflammatory infiltrates disrupting tissue organization; fibrosis obscuring normal stromal patterns; thickened muscularis propria without interspersed neural elements; lack of spindle-shaped interstitial cells between muscle fibers; reduced neuronal process density in submucosal and myenteric regions"),
        ("patch", "Ganglion cell clusters (large nuclei, prominent nucleoli): Looser stromal matrix vs. dense muscle bundles; lower cellular density with interspersed nerve fibers; eosinophilic cytoplasm contrast to spindle-shaped myocytes; nodular/network architecture vs. linear muscle alignment; distinct nuclear morphology (round vs. elongated); supporting glial cell presence; vascularity variations; staining contrast in H&E (lighter vs. uniform)"),
        ("patch", "Absence of ganglion cells: No clusters of large, pale-staining cells between smooth muscle layers; hypertrophic nerve fibers: thickened, eosinophilic strands in submucosa and muscularis; increased nerve bundle size: enlarged, irregularly contoured bundles within intermuscular regions; smooth muscle hypertrophy: thickened inner circular and outer longitudinal muscle layers; inflammatory infiltrates: lymphocytes or eosinophils in muscularis or submucosa; fibrosis: collagen deposition in muscularis, appearing as dense, acellular areas; mucosal alterations: crypt atrophy or ulceration secondary to obstruction; transition zone: abrupt histological shift from aganglionic to normal bowel with sudden nerve density reduction"),
        ("slide", "Ganglion clusters with large and pale nuclei: Interspersed nerve fibers (wavy, eosinophilic); supporting glial cells (small, dark nuclei); located between inner circular and outer longitudinal muscle layers; sparse connective tissue matrix; distinct from densely packed smooth muscle surroundings"),
        ("slide", "Uniform and continuous muscular layers with indistinct connective tissue borders: Absence of neural aggregates or ganglionic clusters between smooth muscle fibers; homogeneous stromal composition lacking specialized cellular networks; smooth, uninterrupted interfaces between longitudinal and circular muscle bands; inconspicuous interstitial cell populations; unremarkable vascular patterns without adjacent organized cellular arrangements"),
        ("slide", "Clustered large neuronal cell bodies with prominent nucleoli: Reticular stromal fiber networks; heterogeneous tissue texture with mixed cell densities; focal disruption of smooth muscle continuity; localized presence of glial support cells; distinct basophilic or eosinophilic staining variations; nodular architecture interrupting muscular uniformity"),
        ("slide", "Absence of ganglion cells in submucosa and muscularis propria: Large neuronal cells missing in typical locations; hypertrophic nerve fibers; thickened, eosinophilic nerve bundles within the submucosa or muscular layers; increased density of small nerve bundles; clusters of irregular nerve structures in affected areas; thickened muscularis propria; hyperplastic smooth muscle layers with altered orientation; abrupt transition from aganglionic to ganglionic segments; narrow lumen proximal to the affected zone with dilated normal colon distally; secondary mucosal inflammation; neutrophilic infiltrates or cryptitis near aganglionic regions; mucosal ulceration or erosions due to obstruction"),
    ]
]

# One row per prompt; the bare `df` below renders the frame as the cell output.
df = pd.DataFrame(
    data=data,
    columns=["prompt_level", "label", "descriptive_prompt"],
)
df

Unnamed: 0,prompt_level,label,descriptive_prompt
0,patch,DeepSeek-R1,Clustered large nuclei with prominent nucleoli...
1,patch,DeepSeek-R1,"Enlarged, densely packed nerve fibers without ..."
2,patch,DeepSeek-R1,Absence of clustered ganglion cells: Uniform s...
3,patch,DeepSeek-R1,"Ganglion cell clusters (large nuclei, prominen..."
4,patch,DeepSeek-R1,Absence of ganglion cells: No clusters of larg...
5,slide,DeepSeek-R1,Ganglion clusters with large and pale nuclei: ...
6,slide,DeepSeek-R1,Uniform and continuous muscular layers with in...
7,slide,DeepSeek-R1,Clustered large neuronal cell bodies with prom...
8,slide,DeepSeek-R1,Absence of ganglion cells in submucosa and mus...


In [3]:
# Class label for every row of `df`, in row order. The trailing comment on each
# entry records which question the corresponding prompt answers.
row_labels = (
    "plexus",      # Factors indicating presence of plexus regions
    "no plexus",   # Factors indicating absence of plexus regions
    "no plexus",   # Factors indicating absence of plexus regions
    "plexus",      # Features of plexus regions
    "no plexus",   # Features indicating Hirschsprung’s disease (absence of plexus)
    "plexus",      # Appearance of plexus regions in whole slide images
    "no plexus",   # Regions where plexus structures are not identifiable
    "plexus",      # Differences in whole slide images with vs without plexus regions
    "no plexus",   # Features indicating Hirschsprung’s disease (absence of plexus)
)

# Overwrite the "label" column with the class labels, then persist the frame.
df["label"] = list(row_labels)

df.to_csv("DeepSeekR1-prompts_raw.csv")

In [4]:
# Render the frame as the cell output so the values now held in the "label"
# column can be inspected.
df

Unnamed: 0,prompt_level,label,descriptive_prompt
0,patch,plexus,Clustered large nuclei with prominent nucleoli...
1,patch,no plexus,"Enlarged, densely packed nerve fibers without ..."
2,patch,no plexus,Absence of clustered ganglion cells: Uniform s...
3,patch,plexus,"Ganglion cell clusters (large nuclei, prominen..."
4,patch,no plexus,Absence of ganglion cells: No clusters of larg...
5,slide,plexus,Ganglion clusters with large and pale nuclei: ...
6,slide,no plexus,Uniform and continuous muscular layers with in...
7,slide,plexus,Clustered large neuronal cell bodies with prom...
8,slide,no plexus,Absence of ganglion cells in submucosa and mus...


In [5]:
import numpy as np

# Write the frame's rows to a plain text file: each value is formatted with
# "%s" and the values of a row are joined by commas. parse_prompt() reads this
# file back further down.
np.savetxt(
    "DeepSeekR1-prompts_raw.txt",
    df.to_numpy(),
    fmt="%s",
    delimiter=",",
)

In [6]:
import os
import json
import re

# Upper bound on how many key/value entries parse_prompt() keeps per class
# label when it trims its patch-level dictionary.
NUM_PROTO = 26

def parse_str(raw):
    """Trim hyphens from both ends of `raw`, then trim surrounding whitespace.

    The order matters: whitespace that sits outside a leading or trailing
    hyphen shields that hyphen from removal.
    """
    without_hyphens = raw.strip("-")
    return without_hyphens.strip()

def parse_kv(raw):
    """Split a descriptive prompt into a (key, value) pair at its first colon.

    Args:
        raw: Text of the form "<key>: <value>".

    Returns:
        A tuple ``(key, value)`` with surrounding whitespace stripped from both
        parts. Only the first colon acts as the separator, so any further
        colons are kept verbatim inside the value. When `raw` contains no
        colon, the whole text becomes the key and the value is "".
    """
    # partition() splits once; splitting on every colon and re-joining the tail
    # with "" would silently delete the colons that belong to the value.
    key, _, value = raw.partition(":")
    return key.strip(), value.strip()

def parse_prompt(src_file, output_dir="."):
    """Parse prompts from a text file and save structured data in JSON format.

    Each input line is expected to look like
    ``<prompt_level>,<label>,<description>``. The description may itself
    contain commas; everything after the second comma is treated as the
    description. Lines with fewer than three comma-separated fields are
    reported and skipped.

    * Lines whose level is "patch" are split into a key and a value with
      parse_kv() and stored as ``patch[label][key] = value``. At most
      NUM_PROTO entries are kept per label.
    * Lines with any other level are treated as slide-level prompts; the
      descriptions for one label are joined with ". ".

    Args:
        src_file: Path of the text file to read.
        output_dir: Directory that receives "slide_prompts.json" and
            "patch_prompts.json". Defaults to the current directory.

    Returns:
        None. If the input cannot be read or parsed, a message is printed and
        no JSON file is written.
    """
    patch_ctx = {}
    slide_ctx = {}

    # Read the input file
    try:
        with open(src_file, "r") as src:
            # Iterate the handle directly; there is no need to load every line
            # into a list first.
            for line in src:
                cols = line.strip().split(",")
                if len(cols) < 3:
                    print(f"Skipping invalid line: {line.strip()}")
                    continue

                prompt_level = cols[0].strip().lower()
                label = cols[1].strip().lower()
                # Re-join the tail so commas inside the description survive.
                prompt_desc = ",".join(cols[2:]).strip()

                if prompt_level == "patch":
                    if label not in patch_ctx:
                        patch_ctx[label] = {}

                    key, value = parse_kv(prompt_desc)
                    key = parse_str(key)
                    patch_ctx[label][key] = parse_str(value)
                else:
                    description = parse_str(prompt_desc)
                    # Only insert the ". " separator between descriptions, so
                    # the stored text never starts with a stray ". ".
                    if slide_ctx.get(label):
                        slide_ctx[label] += ". " + description
                    else:
                        slide_ctx[label] = description

    except FileNotFoundError:
        print(f"Error: The file '{src_file}' was not found.")
        return
    except Exception as e:
        # Deliberately broad: any parsing failure is reported and the run is
        # abandoned without writing partial output.
        print(f"An error occurred while parsing: {e}")
        return

    # Filter TOP-N concepts (dicts keep insertion order, so these are the
    # first NUM_PROTO keys encountered for each label).
    for class_name, ctx in patch_ctx.items():
        kv_pairs = list(ctx.items())[:NUM_PROTO]
        patch_ctx[class_name] = dict(kv_pairs)

    # Save parsed data to JSON files
    with open(os.path.join(output_dir, "slide_prompts.json"), "w") as dst:
        json.dump(slide_ctx, dst, indent=2)

    with open(os.path.join(output_dir, "patch_prompts.json"), "w") as dst:
        json.dump(patch_ctx, dst, indent=2)

    print("Parsing complete. JSON files saved.")

def main():
    """Parse 'DeepSeekR1-prompts_raw.txt' and write the JSON files to the current directory."""
    print("Parsing prompts from 'DeepSeekR1-prompts_raw.txt'...")
    parse_prompt("DeepSeekR1-prompts_raw.txt", ".")
    print("Done!")


# Run the conversion only when this code executes as the top-level module.
if __name__ == "__main__":
    main()


Parsing prompts from 'DeepSeekR1-prompts_raw.txt'...
Parsing complete. JSON files saved.
Done!
