In [None]:
import json
import os
import re
from openai import AzureOpenAI
from tqdm import tqdm
from dotenv import load_dotenv

# Load variables from a local .env file (if one is found) into the process
# environment; Propositionizer1.__init__ reads its Azure settings via os.getenv.
load_dotenv()

# This class is for the Traditional Contemporary PDF

class Propositionizer1():
    """Add LLM-generated propositions to the entries of a sectioned JSON file.

    The input JSON is expected to be an object mapping section keys to lists
    of entries (each entry a dict). For every entry whose first nested
    ``"text"`` value looks like a full sentence, an Azure OpenAI chat
    completion is requested and the parsed list of propositions is stored
    under ``entry["chunks"]``; every other entry gets ``"chunks": None``.

    Connection settings are read from the environment variables
    ``AZURE_OPENAI_ENDPOINT_2``, ``AZURE_OPENAI_KEY_2``,
    ``AZURE_OPENAI_DEPLOYMENT_2`` and ``AZURE_OPENAI_VERSION_2``.
    """

    # Starts with a capital letter or opening quote and ends with sentence
    # punctuation, optionally followed by closing quotes. DOTALL lets the
    # pattern span line breaks inside the text.
    _SENTENCE_RE = re.compile(r'^[A-Z"“‘].+[.!?]["”’]*$', re.DOTALL)

    # A reply wrapped in a Markdown code fence, with an optional "json" tag.
    _CODE_FENCE_RE = re.compile(
        r"^```(?:json)?\s*(.*?)\s*```$", re.DOTALL | re.IGNORECASE
    )

    def __init__(self):
        """Read the Azure settings from the environment and build the client."""
        self.AZURE_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT_2")
        self.AZURE_KEY = os.getenv("AZURE_OPENAI_KEY_2")
        self.AZURE_DEPLOYMENT = os.getenv("AZURE_OPENAI_DEPLOYMENT_2")
        self.AZURE_API_VERSION = os.getenv("AZURE_OPENAI_VERSION_2")

        self.client = AzureOpenAI(
            azure_endpoint=self.AZURE_ENDPOINT,
            api_key=self.AZURE_KEY,
            api_version=self.AZURE_API_VERSION
        )

    def _build_prompt(self, content: str) -> str:
        """Return the instruction prompt with ``content`` embedded as the input."""
        return f"""
You are an assistant tasked with converting meaningful sentences into concise, decontextualized propositions in JSON format.

Before generating propositions, follow these steps:

1. **Ignore Irrelevant Inputs**: Do NOT return propositions for texts that:
   - Are shorter than a full sentence.
   - Are just numbers, section titles, or isolated words (e.g., “Edition”, “15”, “All rights reserved”).
   - Contain no meaningful content or context.

2. **Simplify and Decompose**: For valid inputs, split compound sentences into multiple short, simple statements.
   - Each proposition should stand alone without needing context.
   - Replace pronouns like "it", "they", or "this" with explicit references.

3. **Be Concise and Clear**: Rephrase only when needed. Avoid repeating information. Use neutral tone.

4. **Format**: Return a list of plain strings in a JSON array.

### Now apply this to the following input:
\"\"\"{content}\"\"\"

Output:
"""

    def _find_text(self, obj):
        """Return the first non-empty, stripped ``"text"`` string nested in ``obj``.

        Dicts and lists are searched depth-first in iteration order. Returns
        ``None`` when no non-empty ``"text"`` string is found. A ``"text"``
        value that is empty after stripping ends the search of that dict.
        """
        if isinstance(obj, dict):
            for k, v in obj.items():
                if k == "text" and isinstance(v, str):
                    return v.strip()
                found = self._find_text(v)
                if found:
                    return found
        elif isinstance(obj, list):
            for item in obj:
                found = self._find_text(item)
                if found:
                    return found
        return None

    def _looks_like_sentence(self, text: str) -> bool:
        """Heuristic: true if text starts with a capital and ends with punctuation.

        An opening quote is accepted in place of the capital, and closing
        quotes may follow the final ``.``, ``!`` or ``?``. Line breaks inside
        the text do not prevent a match.
        """
        return bool(self._SENTENCE_RE.match(text.strip()))

    def _safe_parse_json(self, text):
        """Extract and parse the JSON value in an LLM reply.

        A surrounding Markdown code fence is removed first. If the remaining
        text is not valid JSON as a whole, the span from the first ``[`` to
        the last ``]`` is tried, which tolerates prose around the array.

        Returns the parsed value, or ``None`` if nothing could be parsed or
        ``text`` is not a string.
        """
        if not isinstance(text, str):
            return None

        cleaned = text.strip()
        fenced = self._CODE_FENCE_RE.match(cleaned)
        if fenced:
            cleaned = fenced.group(1)

        try:
            return json.loads(cleaned)
        except json.JSONDecodeError:
            pass

        start, end = cleaned.find("["), cleaned.rfind("]")
        if start == -1 or end < start:
            return None
        try:
            return json.loads(cleaned[start:end + 1])
        except json.JSONDecodeError:
            return None

    def _request_propositions(self, text: str):
        """Ask the model for propositions; return a non-empty list or ``None``.

        API and parsing failures are reported on stdout and yield ``None`` so
        that one bad entry does not abort the whole run.
        """
        try:
            response = self.client.chat.completions.create(
                model=self.AZURE_DEPLOYMENT,
                messages=[{"role": "user", "content": self._build_prompt(text)}],
                temperature=1,
            )
            # The message content can be None; treat that as an empty reply.
            proposition_text = (response.choices[0].message.content or "").strip()
        except Exception as e:
            print(f"Error: {text[:40]}... — {e}")
            return None

        parsed = self._safe_parse_json(proposition_text)
        if not isinstance(parsed, list):
            print(f"Could not parse: {proposition_text}")
            return None
        # An empty array is a valid "nothing to extract" answer, not a parse
        # failure; it is stored as None like every other entry without chunks.
        return parsed or None

    def generate_propositions1(self, input_file: str, output_file: str):
        """Annotate every entry of ``input_file`` and write the result.

        Args:
            input_file: Path to a UTF-8 JSON file mapping section keys to
                lists of entry dicts.
            output_file: Path the updated JSON is written to.

        Every entry receives a ``"chunks"`` key: a list of propositions when
        the entry's text passed the sentence heuristic and the model reply
        parsed to a non-empty list, otherwise ``None``.
        """
        with open(input_file, "r", encoding="utf-8") as f:
            data = json.load(f)

        updated_data = {}
        for section_key, entries in data.items():
            print(f"\nProcessing section: {section_key}")
            for entry in tqdm(entries, desc=f"Section {section_key}"):
                text = self._find_text(entry)
                entry["chunks"] = None  # Add the key always

                if not text or not self._looks_like_sentence(text):
                    continue

                entry["chunks"] = self._request_propositions(text)

            updated_data[section_key] = entries

        with open(output_file, "w", encoding="utf-8") as f:
            json.dump(updated_data, f, indent=2, ensure_ascii=False)

        print(f"\nSaved updated JSON with cleaned propositions to: {output_file}")


In [None]:
import json
import os
import re
from openai import AzureOpenAI
from tqdm import tqdm
from dotenv import load_dotenv

# This class is for the silk and objectifying china PDF

# Load variables from a local .env file (if one is found) into the process
# environment; Propositionizer2.__init__ reads its Azure settings via os.getenv.
load_dotenv()

class Propositionizer2():
    """Add LLM-generated propositions to sentence-like entries of a JSON file.

    The input JSON is expected to be an object mapping section keys to lists
    of entries (each entry a dict). Only entries whose first nested
    ``"text"`` value looks like a full sentence are sent to the model and
    receive a ``"chunks"`` key; all other entries are written back untouched.

    Connection settings are read from the environment variables
    ``AZURE_OPENAI_ENDPOINT_2``, ``AZURE_OPENAI_KEY_2``,
    ``AZURE_OPENAI_DEPLOYMENT_2`` and ``AZURE_OPENAI_VERSION_2``.
    """

    # Starts with a capital letter or opening quote and ends with sentence
    # punctuation, optionally followed by closing quotes. DOTALL lets the
    # pattern span line breaks inside the text.
    _SENTENCE_RE = re.compile(r'^[A-Z"“‘].+[.!?]["”’]*$', re.DOTALL)

    # A reply wrapped in a Markdown code fence, with an optional "json" tag.
    _CODE_FENCE_RE = re.compile(
        r"^```(?:json)?\s*(.*?)\s*```$", re.DOTALL | re.IGNORECASE
    )

    def __init__(self):
        """Read the Azure settings from the environment and build the client."""
        self.AZURE_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT_2")
        self.AZURE_KEY = os.getenv("AZURE_OPENAI_KEY_2")
        self.AZURE_DEPLOYMENT = os.getenv("AZURE_OPENAI_DEPLOYMENT_2")
        self.AZURE_API_VERSION = os.getenv("AZURE_OPENAI_VERSION_2")

        self.client = AzureOpenAI(
            azure_endpoint=self.AZURE_ENDPOINT,
            api_key=self.AZURE_KEY,
            api_version=self.AZURE_API_VERSION
        )

    def _build_prompt(self, content: str) -> str:
        """Return the instruction prompt with ``content`` embedded as the input."""
        return f"""
You are an assistant tasked with converting meaningful sentences into concise, decontextualized propositions in JSON format.

Follow these rules:

1. Ignore irrelevant inputs:
   - Do NOT return propositions for texts that are just numbers, symbols, or isolated words (e.g., “Edition”, “15”, “All rights reserved”).
   - Skip text that is a heading or label, rather than a full sentence.

2. Process only meaningful full sentences:
   - If the input is a complete sentence, return it as a one-item JSON array.
   - If the input has multiple clauses, split them into multiple short, standalone propositions.

3. Decompose and clarify:
   - Split compound sentences.
   - Replace vague pronouns like "it", "they", "this" with explicit references.
   - Preserve original phrasing and facts when possible.

4. Output format:
   - Always return a JSON array of plain strings.
   - Do not include markdown or explanations—only return the list.

### Example:

Input: "The distinctive floral spray and pomegranate designs on this plate were probably developed during the Xuande period."
Output: ["The floral spray and pomegranate designs on this plate were probably developed during the Xuande period."]

Input: "Curator Kikki Lam"
Output: []  (do not generate output for non-sentences)

Now apply these rules to the following input:

\"\"\"{content}\"\"\"

Output:
"""

    def _find_text(self, obj):
        """Return the first non-empty, stripped ``"text"`` string nested in ``obj``.

        Dicts and lists are searched depth-first in iteration order. Returns
        ``None`` when no non-empty ``"text"`` string is found. A ``"text"``
        value that is empty after stripping ends the search of that dict.
        """
        if isinstance(obj, dict):
            for k, v in obj.items():
                if k == "text" and isinstance(v, str):
                    return v.strip()
                found = self._find_text(v)
                if found:
                    return found
        elif isinstance(obj, list):
            for item in obj:
                found = self._find_text(item)
                if found:
                    return found
        return None

    def _safe_parse_json(self, text):
        """Extract and parse the JSON value in an LLM reply.

        A surrounding Markdown code fence is removed first. If the remaining
        text is not valid JSON as a whole, the span from the first ``[`` to
        the last ``]`` is tried, which tolerates commentary after the array.
        A bare JSON string is wrapped in a one-item list.

        Returns the parsed value, or ``None`` if nothing could be parsed or
        ``text`` is not a string.
        """
        if not isinstance(text, str):
            return None

        cleaned = text.strip()
        fenced = self._CODE_FENCE_RE.match(cleaned)
        if fenced:
            cleaned = fenced.group(1)

        start, end = cleaned.find("["), cleaned.rfind("]")
        candidates = [cleaned]
        if start != -1 and end > start:
            candidates.append(cleaned[start:end + 1])

        for candidate in candidates:
            try:
                parsed = json.loads(candidate)
            except json.JSONDecodeError:
                continue
            return [parsed] if isinstance(parsed, str) else parsed
        return None

    def _looks_like_full_sentence(self, text):
        """Heuristic: full sentence starts with capital, ends with punctuation.

        An opening quote is accepted in place of the capital, and closing
        quotes may follow the final ``.``, ``!`` or ``?``. Line breaks inside
        the text do not prevent a match.
        """
        return bool(self._SENTENCE_RE.match(text.strip()))

    def generate_propositions2(self, input_file: str, output_file: str):
        """Annotate sentence-like entries of ``input_file`` and write the result.

        Args:
            input_file: Path to a UTF-8 JSON file mapping section keys to
                lists of entry dicts.
            output_file: Path the updated JSON is written to.

        Entries that pass the sentence heuristic get ``"chunks"`` set to the
        parsed non-empty list of propositions, or ``None`` when the request
        fails or the reply is not a non-empty list. Entries that do not pass
        the heuristic are left without a ``"chunks"`` key.
        """
        with open(input_file, "r", encoding="utf-8") as f:
            data = json.load(f)

        updated_data = {}

        for section_key, entries in data.items():
            # Filter first so the progress bar counts only real API calls.
            valid_entries = []
            for entry in entries:
                text = self._find_text(entry)
                if text and self._looks_like_full_sentence(text):
                    valid_entries.append((entry, text))

            for entry, text in tqdm(valid_entries, desc=f"Processing {section_key}"):
                try:
                    response = self.client.chat.completions.create(
                        model=self.AZURE_DEPLOYMENT,
                        messages=[{"role": "user", "content": self._build_prompt(text)}],
                        temperature=1,
                    )
                    # The message content can be None; treat that as empty.
                    proposition_text = (response.choices[0].message.content or "").strip()
                except Exception as e:
                    print(f"\n Exception on entry: {text[:60]}...\nError: {e}")
                    entry["chunks"] = None
                    continue

                parsed = self._safe_parse_json(proposition_text)
                if parsed and isinstance(parsed, list):
                    entry["chunks"] = parsed
                else:
                    entry["chunks"] = None

            updated_data[section_key] = entries

        with open(output_file, "w", encoding="utf-8") as f:
            json.dump(updated_data, f, indent=2, ensure_ascii=False)

        print(f"\n Saved updated JSON with propositions to: {output_file}")
