In [None]:
import os
import re
import glob
import json
from typing import List, Dict, Any, Optional
from openai import OpenAI

# ============ Configuration ============
# The key is read from the OPENAI_API_KEY environment variable. The
# "your_api_key" fallback is only a placeholder: the main program refuses to
# run while api_key still equals it.
api_key = os.getenv("OPENAI_API_KEY", "your_api_key")  # Prefer exporting OPENAI_API_KEY over editing this default
json_folder = "./informal_data/"  # Folder scanned for "*.json" input files by the main program
output_file = "my_dataset_cleaned.txt"  # TSV triples (subject \t ROLE \t lemma) are appended to this file
integrated_output_file = "integrated_dataset_with_triples.json"  # Input data enriched with the extracted triples

# ============ OpenAI Client ============
# Module-level client shared by every annotate_with_gpt call.
client = OpenAI(api_key=api_key)

# ============ JSON Style Examples (consistent with target output) ============
# Few-shot examples embedded verbatim (as JSON) into the extraction prompt.
# Each example mirrors the exact output shape requested from the model, so the
# "concept" value must equal the "subject" of every triple in that example.
EXAMPLES_JSON = [
    {
        "text": "An infinitely extending one-dimensional figure that has no curvature.",
        "concept": "line",
        "triples": [
            {
                "subject": "line",
                "role": "SUPERTYPE",
                "lemma": "figure",
                "explanation": "Innermost leftmost NP contains NN 'figure'."
            },
            {
                "subject": "line",
                "role": "DIFFERENTIA_QUALITY",
                "lemma": "infinitely extending",
                "explanation": "JJ/VP modifying the supertype; leftover in leftmost NP."
            },
            {
                "subject": "line",
                "role": "DIFFERENTIA_QUALITY",
                "lemma": "one-dimensional",
                "explanation": "JJ indicating a property in the leftmost NP."
            },
            {
                "subject": "line",
                "role": "ASSOCIATED_FACT",
                "lemma": "has no curvature",
                "explanation": "SBAR/VP clause expressing associated fact."
            }
        ]
    },
    {
        "text": "An algebra constructed from a module over a ring R, which is symmetric in the sense that it is generated by elements of the module with relations that make the multiplication commutative.",
        # Was "line" (copy/paste slip); the triples below all describe "symmetric algebra".
        "concept": "symmetric algebra",
        "triples": [
            {
                "subject": "symmetric algebra",
                "role": "DIFFERENTIA_QUALITY",
                "lemma": "symmetric",
                "explanation": "Key term describing the symmetric property."
            },
            {
                "subject": "symmetric algebra",
                "role": "DIFFERENTIA_QUALITY",
                "lemma": "generated",
                "explanation": "Key term describing the symmetric property."
            },
            {
                "subject": "symmetric algebra",
                "role": "DIFFERENTIA_QUALITY",
                "lemma": "elements",
                "explanation": "Key term describing the symmetric property."
            },
            {
                "subject": "symmetric algebra",
                "role": "DIFFERENTIA_QUALITY",
                "lemma": "module",
                "explanation": "Key term describing the symmetric property."
            },
            {
                "subject": "symmetric algebra",
                "role": "DIFFERENTIA_QUALITY",
                "lemma": "relations",
                "explanation": "Key term describing the symmetric property."
            },
            {
                "subject": "symmetric algebra",
                "role": "DIFFERENTIA_QUALITY",
                "lemma": "multiplication",
                "explanation": "Key term describing the symmetric property."
            },
            {
                "subject": "symmetric algebra",
                "role": "DIFFERENTIA_QUALITY",
                "lemma": "commutative",
                "explanation": "Key term describing the symmetric property."
            }
        ]
    }
]

# ============ Prompt ============
# Template filled via str.format with three fields: {concept_name}, {text} and
# {examples_block}. Literal braces of the JSON shape are doubled ({{ }}) so that
# str.format emits them as single braces instead of treating them as fields.
TRIPLE_EXTRACTION_PROMPT = """
You are a semantic role labeling expert specializing in natural language definitions.
Your task: extract triples (subject, role, lemma) from the given informal definition of a mathematical concept.

### Semantic Roles and Syntactic Patterns
| Role | Most common syntactic patterns |
| --- | --- |
| SUPERTYPE | innermost and leftmost NP containing at least one NN |
| DIFFERENTIA_QUALITY | leftovers in the innermost and leftmost NP; PP beginning with "of" |
| DIFFERENTIA_EVENT | SBAR; VP |
| EVENT_LOCATION | PP inside a SBAR or VP, possibly having a location named entity |
| EVENT_TIME | PP inside a SBAR or VP, possibly having a time interval named entity |
| ORIGIN_LOCATION | PP not inside a SBAR or VP, possibly having a location named entity |
| QUALITY_MODIFIER | NN, JJ or RB referring to an element inside a differentia quality |
| PURPOSE | VP beginning with TO; PP beginning with "for" with a VP right after |
| ASSOCIATED_FACT | SBAR; PP not beginning with "for" with a VP right after |
| ACCESSORY_DETERMINER | whole expression before supertype; common accessory expression |
| ACCESSORY_QUALITY | JJ, presence of a differentia quality, common accessory word |

### Requirements
1) Subject MUST be exactly the provided concept name: "{concept_name}".
2) Extract ALL possible triples from the text, adhering to the roles/patterns above.
3) For each triple:
   - Include a short "explanation" of the syntactic cue.
   - Lemma MUST be a **short meaningful mathematical term or keyword**, not a full sentence.
   - Remove function words (e.g., that, it, is, with) from lemmas.
   - If a lemma contains multiple keywords (e.g., "symmetric; generated; elements"), **split them into separate triples**, one keyword per triple.
4) Return ONLY a JSON object with this shape (no extra text):
{{
  "concept": "{concept_name}",
  "triples": [
    {{"subject": "...", "role": "...", "lemma": "...", "explanation": "..."}}
  ]
}}

### Few-shot JSON Examples
{examples_block}

### Input
Concept: {concept_name}
Text: {text}

### Output
Return ONLY the JSON object described above.
"""


def _examples_block(examples: List[Dict[str, Any]]) -> str:
    """Embed the JSON examples into the prompt (as plain JSON snippets) to help the model align the format."""
    return json.dumps(examples, ensure_ascii=False, indent=2)

# ============ Parsing Utilities ============
def clean_json_block(text: str) -> str:
    """Strip Markdown fencing / surrounding chatter and return the most likely JSON text.

    Lookup order:
      1. the object inside a ```json ... ``` fence,
      2. the object inside a plain ``` ... ``` fence,
      3. everything from the first "{" to the last "}",
      4. otherwise the input itself, stripped.
    Empty / falsy input yields "".
    """
    if not text:
        return ""

    # Fenced variants are tried first, the language-tagged fence taking priority.
    fenced_patterns = (
        r"```json\s*(\{[\s\S]*?\})\s*```",
        r"```\s*(\{[\s\S]*?\})\s*```",
    )
    for pattern in fenced_patterns:
        fenced = re.search(pattern, text, re.DOTALL)
        if fenced is not None:
            return fenced.group(1).strip()

    # No fence: take the widest brace-delimited span, if there is one.
    widest = re.search(r"\{[\s\S]*\}", text, re.DOTALL)
    if widest is not None:
        return widest.group(0).strip()

    return text.strip()

def try_parse_json(text: str) -> Optional[Dict[str, Any]]:
    """Parse *text* as JSON, retrying once on a cleaned-up version of it.

    The raw text is tried first; if that fails, the output of
    clean_json_block(text) is tried. Returns whatever json.loads produced, or
    None when the input is empty or neither attempt succeeds. Any exception
    raised while decoding is treated as "could not parse".
    """
    if not text:
        return None

    try:
        return json.loads(text)
    except Exception:
        # json.JSONDecodeError is an Exception subclass, so one clause covers both.
        candidate = clean_json_block(text)

    if not candidate:
        return None

    try:
        return json.loads(candidate)
    except Exception:
        return None

def try_parse_tsv_lines(text: str, concept_name: str) -> Optional[Dict[str, Any]]:
    """
    Fallback parser for tab-separated model output (subject \\t role \\t lemma).

    Blank lines and lines with fewer than three tab-separated fields are
    ignored; any fields beyond the second are re-joined with tabs and used as
    the lemma. Returns {"concept": concept_name, "triples": [...]} when at
    least one line qualifies, otherwise None.
    """
    if not text:
        return None

    rows = []
    for raw_line in text.splitlines():
        line = raw_line.strip()
        if not line:
            continue
        fields = line.split("\t")
        if len(fields) < 3:
            continue
        subject, role, *remainder = fields
        rows.append({
            "subject": subject.strip(),
            "role": role.strip(),
            "lemma": "\t".join(remainder).strip(),  # keep tabs that were part of the lemma
            "explanation": "parsed from TSV fallback"
        })

    if not rows:
        return None
    return {"concept": concept_name, "triples": rows}

# ============ GPT Call ============
def annotate_with_gpt(concept_name: str, text: str, model: str = "gpt-4o") -> Optional[Dict[str, Any]]:
    """
    Ask the chat model to extract triples for one concept definition.

    On success returns a dict of the form
    {
      "concept": concept_name,
      "triples": [{"subject":..., "role":..., "lemma":..., "explanation":...}, ...]
    }
    Returns None (after printing a tagged diagnostic) when the inputs are
    empty, the API call raises, the reply is empty, the reply cannot be parsed,
    or the parsed value lacks a list under "triples".
    """
    if not (concept_name and text):
        print(f"[INPUT ERROR] Empty concept_name or text: '{concept_name}', '{text}'")
        return None

    request_text = TRIPLE_EXTRACTION_PROMPT.format(
        concept_name=concept_name,
        text=text,
        examples_block=_examples_block(EXAMPLES_JSON),
    )

    # Single-turn request; temperature 0.0 is passed on every call.
    try:
        completion = client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": request_text}],
            temperature=0.0,
            max_tokens=2000,
        )
        reply = (completion.choices[0].message.content or "").strip()
    except Exception as e:
        print(f"[API ERROR] '{concept_name}' -> {e}")
        return None

    if not reply:
        print(f"[EMPTY RESPONSE] '{concept_name}' -> got empty response from API")
        return None

    # JSON (raw, then cleaned) is preferred; TSV is the last resort.
    result = try_parse_json(reply)
    if result is None:
        result = try_parse_tsv_lines(reply, concept_name)
    if result is None:
        print(f"[PARSE ERROR] '{concept_name}' -> cannot parse output. Raw output (first 500 chars):\n{reply[:500]}...\n")
        return None

    # Shape checks, reported in the same order they are evaluated.
    if not isinstance(result, dict):
        problem = "parsed result is not a dict."
    elif "triples" not in result:
        problem = "parsed but missing 'triples' field."
    elif not isinstance(result["triples"], list):
        problem = "'triples' is not a list."
    else:
        problem = None

    if problem is not None:
        print(f"[FORMAT ERROR] '{concept_name}' -> {problem}")
        return None

    # Backfill the concept name when the model omitted it or left it blank.
    if result.get("concept") in (None, ""):
        result["concept"] = concept_name

    return result

# ============ Unified Writing to Triple File ============
def normalize_token(s: str, lower=True) -> str:
    """Trim *s*, replace each run of whitespace with one underscore, and lowercase it unless lower is False.

    None is mapped to the empty string.
    """
    if s is None:
        return ""
    joined = re.sub(r"\s+", "_", s.strip())
    if lower:
        return joined.lower()
    return joined

def save_triples_struct(triples_obj: Dict[str, Any], outfile: str):
    """
    Append the triples of {"concept":..., "triples":[{subject, role, lemma, ...}, ...]}
    to *outfile*, one per line, as:
    subject \\t ROLE \\t lemma

    Subject and lemma are normalized to lowercase underscore-joined tokens, the
    role to uppercase. Entries that are not dicts, or whose subject/role/lemma
    is missing, null or empty after normalization, are skipped. Nothing is
    opened or written when the "triples" list is missing or empty.
    """
    triples = triples_obj.get("triples", [])
    if not triples:
        return

    with open(outfile, "a", encoding="utf-8") as f:
        for t in triples:
            if not isinstance(t, dict):
                continue
            raw_subject = t.get("subject")
            raw_role = t.get("role")
            raw_lemma = t.get("lemma")
            # A JSON null must be rejected here: str(None) would otherwise turn
            # into the literal token "none" and be written as if it were data.
            if raw_subject is None or raw_role is None or raw_lemma is None:
                continue
            subj = normalize_token(str(raw_subject), lower=True)
            role = normalize_token(str(raw_role), lower=False).upper()
            lemma = normalize_token(str(raw_lemma), lower=True)
            if subj and role and lemma:
                f.write(f"{subj}\t{role}\t{lemma}\n")

# ============ New: Integrated JSON Functionality ============
def save_integrated_json(all_data: Dict[str, Any], outfile: str):
    """
    Write *all_data* to *outfile* as UTF-8, two-space-indented JSON (non-ASCII kept verbatim).

    The file is overwritten. Failures are reported on stdout rather than raised.
    """
    try:
        with open(outfile, "w", encoding="utf-8") as sink:
            json.dump(all_data, sink, indent=2, ensure_ascii=False)
        print(f"[SUCCESS] Integrated JSON saved to: {outfile}")
    except Exception as e:
        print(f"[ERROR] Cannot save integrated JSON: {e}")

def process_json_with_integration(json_path: str, integrated_data: Dict[str, Any], outfile: str):
    """
    Annotate every concept of one input JSON file and merge the results into *integrated_data*.

    For each module in the file, a copy of the module is stored in
    integrated_data (unless a module of that name is already present). For each
    concept that has both "name" and "informal_definition", annotate_with_gpt
    is called; on success the triples are appended to *outfile* as TSV, and in
    both cases the concept's copy in integrated_data receives
    "extracted_triples" and "extraction_metadata".

    Malformed input (non-object file/module/definition/concept, or null
    "semantic_analysis"/"concepts") is skipped instead of raising, so one bad
    entry cannot abort a batch whose results are only saved at the very end.

    Returns None; progress and problems are reported on stdout.
    """
    try:
        with open(json_path, "r", encoding="utf-8") as f:
            data = json.load(f)
    except Exception as e:
        print(f"[LOAD ERROR] {json_path} -> {e}")
        return

    if not isinstance(data, dict):
        print(f"[LOAD ERROR] {json_path} -> top-level JSON value is not an object")
        return

    processed_count = 0
    success_count = 0

    for module_name, module_content in data.items():
        print(f"  Processing module: {module_name}")

        # Deep copy via a JSON round trip (the data was just loaded from JSON).
        # NOTE(review): when two input files share a module name, only the first
        # one is copied; indices from the later file are then applied to that
        # copy and may not line up. Confirm whether module names are unique.
        if module_name not in integrated_data:
            integrated_data[module_name] = json.loads(json.dumps(module_content))

        if not isinstance(module_content, dict):
            print(f"    [SKIP] Module '{module_name}' is not a JSON object.")
            continue

        definitions = module_content.get("definitions")
        if not isinstance(definitions, list):
            definitions = []

        for def_idx, definition in enumerate(definitions):
            if not isinstance(definition, dict):
                continue
            semantic = definition.get("semantic_analysis")
            if not isinstance(semantic, dict):
                continue
            concepts = semantic.get("concepts")
            if not isinstance(concepts, list):
                continue

            for concept_idx, concept in enumerate(concepts):
                if not isinstance(concept, dict):
                    continue
                concept_name = concept.get("name")
                informal_definition = concept.get("informal_definition")

                if not concept_name or not informal_definition:
                    print(f"    [SKIP] Missing name or informal_definition: name='{concept_name}', definition='{informal_definition}'")
                    continue

                processed_count += 1
                print(f"    Processing concept: {concept_name}")

                triples_obj = annotate_with_gpt(concept_name, informal_definition)
                succeeded = bool(triples_obj)
                triples = triples_obj.get("triples", []) if succeeded else []

                if succeeded:
                    # Save to TSV file (original functionality)
                    save_triples_struct(triples_obj, outfile)

                # Record the outcome on the concept's copy in the integrated structure.
                try:
                    target = integrated_data[module_name]["definitions"][def_idx]["semantic_analysis"]["concepts"][concept_idx]
                    target["extracted_triples"] = triples
                    target["extraction_metadata"] = {
                        "extraction_status": "success" if succeeded else "failed",
                        "triples_count": len(triples),
                        # Matches the default model of annotate_with_gpt, which is what is called above.
                        "extraction_source": "gpt-4o"
                    }
                except (KeyError, IndexError, TypeError) as e:
                    what = "triples" if succeeded else "failure marker"
                    print(f"    [JSON STRUCTURE ERROR] Cannot add {what} to integrated data: {e}")

                if succeeded:
                    success_count += 1
                    print(f"    [SUCCESS] '{concept_name}' -> {len(triples)} triples extracted")
                else:
                    print(f"    [FAILED] '{concept_name}' -> no triples extracted.")

    print(f"  Module summary: {success_count}/{processed_count} concepts successfully processed")

# ============ Retain original processing function (backup) ============
def process_json(json_path: str, outfile: str):
    """
    TSV-only variant of the pipeline (retained as backup).

    Loads one input JSON file and, for every concept that has both "name" and
    "informal_definition", calls annotate_with_gpt and appends the resulting
    triples to *outfile*. Malformed input (non-object file/module/definition/
    concept, or null "semantic_analysis"/"concepts") is skipped instead of
    raising, so one bad entry does not abort the run.

    Returns None; progress and problems are reported on stdout.
    """
    try:
        with open(json_path, "r", encoding="utf-8") as f:
            data = json.load(f)
    except Exception as e:
        print(f"[LOAD ERROR] {json_path} -> {e}")
        return

    if not isinstance(data, dict):
        print(f"[LOAD ERROR] {json_path} -> top-level JSON value is not an object")
        return

    processed_count = 0
    success_count = 0

    for module_name, module_content in data.items():
        print(f"  Processing module: {module_name}")

        if not isinstance(module_content, dict):
            print(f"    [SKIP] Module '{module_name}' is not a JSON object.")
            continue

        definitions = module_content.get("definitions")
        if not isinstance(definitions, list):
            definitions = []

        for definition in definitions:
            if not isinstance(definition, dict):
                continue
            semantic = definition.get("semantic_analysis")
            if not isinstance(semantic, dict):
                continue
            concepts = semantic.get("concepts")
            if not isinstance(concepts, list):
                continue

            for concept in concepts:
                if not isinstance(concept, dict):
                    continue
                concept_name = concept.get("name")
                informal_definition = concept.get("informal_definition")

                if not concept_name or not informal_definition:
                    print(f"    [SKIP] Missing name or informal_definition: name='{concept_name}', definition='{informal_definition}'")
                    continue

                processed_count += 1
                print(f"    Processing concept: {concept_name}")

                triples_obj = annotate_with_gpt(concept_name, informal_definition)
                if triples_obj:
                    save_triples_struct(triples_obj, outfile)
                    success_count += 1
                    print(f"    [SUCCESS] '{concept_name}' -> {len(triples_obj.get('triples', []))} triples extracted")
                else:
                    print(f"    [FAILED] '{concept_name}' -> no triples written.")

    print(f"  Module summary: {success_count}/{processed_count} concepts successfully processed")

In [None]:
# ============ Main Program ============
if __name__ == "__main__":
    # Refuse to run with a missing key or with the placeholder default.
    if not api_key or api_key == "your_api_key":
        print("[ERROR] Please set your OpenAI API key in the environment variable OPENAI_API_KEY or modify the code.")
        # SystemExit is raised directly: the bare exit() helper is injected by
        # the site module for interactive use and is not guaranteed to exist.
        raise SystemExit(1)

    # Collect all JSON files. Sorted so runs are reproducible: glob order is
    # arbitrary, and the first file to define a module name wins the merge.
    json_files = sorted(glob.glob(os.path.join(json_folder, "*.json")))
    if not json_files:
        print(f"[WARN] No JSON files found in: {json_folder}")
        raise SystemExit(1)

    print(f"Found {len(json_files)} JSON files to process.")

    # For storing all integrated data
    integrated_data = {}
    total_processed = 0

    # Process all JSON files, generating both TSV and integrated JSON
    for json_path in json_files:
        print(f"\nProcessing {os.path.basename(json_path)} ...")
        process_json_with_integration(json_path, integrated_data, output_file)
        total_processed += 1

    # Save the integrated JSON file
    integrated_written = bool(integrated_data)
    if integrated_written:
        save_integrated_json(integrated_data, integrated_output_file)
    else:
        print("[WARN] No data was integrated, skipping JSON output.")

    print(f"\nAnnotation completed! Processed {total_processed} JSON files in total.")
    print(f"TSV format triples saved to: {output_file}")
    if integrated_written:
        print(f"Integrated JSON file saved to: {integrated_output_file}")

    # Display line count of output file. The TSV is opened in append mode by
    # save_triples_struct, so this count includes rows from earlier runs.
    try:
        with open(output_file, "r", encoding="utf-8") as f:
            line_count = sum(1 for _ in f)
        print(f"TSV output file contains {line_count} lines of triple data.")
    except Exception as e:
        print(f"Cannot count lines in TSV output file: {e}")

    # Display statistics for the integrated JSON (only when it was written in
    # this run; otherwise a stale file from a previous run could be reported).
    if integrated_written:
        try:
            with open(integrated_output_file, "r", encoding="utf-8") as f:
                integrated_json = json.load(f)

            total_concepts = 0
            total_extracted_triples = 0
            successful_extractions = 0

            for module_name, module_content in integrated_json.items():
                definitions = module_content.get("definitions", [])
                for definition in definitions:
                    semantic = definition.get("semantic_analysis", {})
                    concepts = semantic.get("concepts", [])
                    for concept in concepts:
                        total_concepts += 1
                        # The pipeline stores results under "extracted_triples";
                        # reading "triples" here always reported zero.
                        extracted_triples = concept.get("extracted_triples", [])
                        if extracted_triples:
                            successful_extractions += 1
                            total_extracted_triples += len(extracted_triples)

            print(f"Integrated JSON statistics: {total_concepts} concepts, {successful_extractions} successful extractions, {total_extracted_triples} triples in total.")

        except Exception as e:
            print(f"Cannot generate statistics for integrated JSON file: {e}")