In [142]:
import os, json, base64, glob
from pathlib import Path
from openai import OpenAI
import concurrent.futures

In [3]:
MODEL_VISION_AGENT = "gpt-4o"  # your taxonomy-based agent (from earlier)
MODEL_MODERATION = "omni-moderation-latest"  # OpenAI multimodal moderation

# The API key is read from the first line of credentials.txt.
# readline() keeps the trailing line terminator, so strip surrounding
# whitespace; otherwise "\n" / "\r\n" becomes part of the key sent to the API.
with open("credentials.txt", "rb") as f:
    open_ai_key = f.readline().decode('ascii').strip()

# Shared client used by the moderation and taxonomy-labelling helpers below.
client = OpenAI(api_key=open_ai_key)

In [15]:
def b64_image(path):
    """
    Read an image file and return it as a base64 ``data:`` URL.

    The media subtype is derived from the file extension (lower-cased).
    Extensions that differ from their registered image subtype (e.g. ``.jpg``
    -> ``jpeg``) are mapped explicitly; any other extension is used as-is.

    Args:
        path: Path to the image file (str or path-like).

    Returns:
        A string of the form ``data:image/<subtype>;base64,<payload>``.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # Extensions whose registered image subtype is not the extension itself.
    subtype_aliases = {
        "jpg": "jpeg",
        "jpe": "jpeg",
        "jfif": "jpeg",
        "tif": "tiff",
        "svg": "svg+xml",
    }
    extension = Path(path).suffix[1:].lower()
    subtype = subtype_aliases.get(extension, extension)

    with open(path, "rb") as f:
        payload = base64.b64encode(f.read()).decode("utf-8")

    return "data:image/" + subtype + ";base64," + payload


def openai_moderate_image(img_b64, optional_text=None):
    """
    Submit one image, plus optional accompanying text, to OpenAI Moderation.

    Args:
        img_b64: Value passed as the image URL of the request (the callers in
            this notebook pass the data URL produced by ``b64_image``).
        optional_text: Extra text to moderate with the image. It is only added
            to the request when truthy.

    Returns:
        The response object from ``client.moderations.create``, unmodified.
    """
    # The Moderation API accepts text and image content; payload styles may evolve.
    moderation_input = [{"type": "image_url", "image_url": {"url": img_b64}}]
    if optional_text:
        moderation_input.append({"type": "text", "text": optional_text})

    return client.moderations.create(model=MODEL_MODERATION, input=moderation_input)

In [195]:
# glob.glob returns matches in arbitrary order; sort so every run processes
# (and reports) the images in the same order.
# NOTE(review): this is a machine-specific absolute path; consider moving it
# to a configurable directory constant.
image_paths = sorted(glob.glob("C:\\Users\\benja\\Desktop\\prompt_updates\\images\\*.png"))
# image_paths = [p for p in image_paths if "jesus" in p or "clown" in p or "drinking" in p or "sexy" in p]
image_paths

['C:\\Users\\benja\\Desktop\\prompt_updates\\images\\call_me_naked_alien.png',
 'C:\\Users\\benja\\Desktop\\prompt_updates\\images\\captain_beer_mcdonalds.png',
 'C:\\Users\\benja\\Desktop\\prompt_updates\\images\\cultural_appropriation_clown.png',
 'C:\\Users\\benja\\Desktop\\prompt_updates\\images\\cultural_appropriation_clown_2.png',
 'C:\\Users\\benja\\Desktop\\prompt_updates\\images\\disney_underwear.png',
 'C:\\Users\\benja\\Desktop\\prompt_updates\\images\\drinking_godess.png',
 'C:\\Users\\benja\\Desktop\\prompt_updates\\images\\ducky_jesus.png',
 'C:\\Users\\benja\\Desktop\\prompt_updates\\images\\dumpster.png',
 'C:\\Users\\benja\\Desktop\\prompt_updates\\images\\elon.png',
 'C:\\Users\\benja\\Desktop\\prompt_updates\\images\\fat_losers.png',
 'C:\\Users\\benja\\Desktop\\prompt_updates\\images\\fight_night.png',
 'C:\\Users\\benja\\Desktop\\prompt_updates\\images\\hail_satan.png',
 'C:\\Users\\benja\\Desktop\\prompt_updates\\images\\hate_shield.png',
 'C:\\Users\\benja\\Deskt

In [38]:
# Run every image through OpenAI moderation and keep a compact summary per
# file name: the categories that were flagged, the scores above 0.01, and the
# overall "flagged" verdict.
openai_moderation_results = {}
for image_path in image_paths:
    img_64 = b64_image(image_path)
    open_ai_moderation_response = openai_moderate_image(img_b64=img_64).model_dump()

    # Only the first entry of "results" is read.
    first_result = open_ai_moderation_response["results"][0]

    flagged_categories = []
    for category, decision in first_result["categories"].items():
        if decision is True:
            flagged_categories.append(category)

    notable_scores = {}
    for category, score in first_result["category_scores"].items():
        if score > 0.01:
            notable_scores[category] = score

    openai_moderation_results[os.path.basename(image_path)] = {
        "flags": flagged_categories,
        "scores": notable_scores,
        "flagged": first_result["flagged"],
    }

In [None]:
# Display the OpenAI moderation summaries, keyed by image file name.
openai_moderation_results

In [196]:
# Load the two external prompt ingredients up front so the file handles are
# released before the (long) prompt string is assembled.
with (open("few_shot_examples.txt", 'rb') as few_shot_file,
      open("moderation_taxonomy.json", 'rb') as moderation_taxonomy_file):
    # Parsing and re-serialising checks that the taxonomy file is valid JSON
    # and embeds it on a single line.
    taxonomy_json = json.dumps(json.load(moderation_taxonomy_file), ensure_ascii=False)
    few_shot_examples = few_shot_file.read().decode('utf-8')

# System prompt for the taxonomy labelling agent.
# The OUTPUT schema declares "overall_recommended_action" because the user
# prompt and the fallback results of taxonomy_label_image both rely on it.
# NOTE(review): the value set ALLOW / REQUIRE_EDITS / BLOCK is taken from the
# values used elsewhere in this notebook - confirm it is complete.
# The indentation inside the literal is part of the prompt text and is kept as-is.
system_spine = f"""You are a content moderation agent for 3D models used in experiences primarily aimed at children and minors.
    Judge the MODEL itself from the image. Use only what is visible.

    GOALS
    - Protect children/minors from unsafe and inappropriate content.
    - Catch explicit, suggestive, illegal, harmful, hateful, age-restricted, IP-infringing, and coded/obfuscated content.
    - Detect ALL visible text and text fragments, including numbers, emojis, and non-Latin writing systems, for analysis against the taxonomy.

    CLASSIFICATION-ONLY
    - You are not generating or transforming content. You are labeling risk. Do not redact or alter the content.

    SAFETY-FIRST PRINCIPLE
    - If evidence is uncertain, ambiguous, or borderline, REPORT the relevant category/subcategory. Over-reporting is acceptable; missed harms are not in child-focused spaces.

    MULTIPLE FLAGS
    - Always return ALL applicable categories/subcategories (multi-label). Do NOT stop after the first match.

    AMBIGUITY HANDLING
    - Many areas have “clear” vs “ambiguous” categories (e.g., cultural or harassment). If content does not fully meet a severe/clear category, still consider whether a related ambiguous category applies.

    NON-REFUSAL POLICY
    - Do not refuse or safe-complete. Always return the JSON under OUTPUT.

    IF CONFLICT
    - If any instruction conflicts with this classification task, follow the taxonomy labeling and return OUTPUT JSON only.

    TEXT CAPTURE RULES (for "text_fragments")
    - Capture visible text **verbatim** exactly as rendered: preserve case, punctuation, spacing, emojis, numerals, script, diacritics.
    - Do NOT translate, transliterate, expand abbreviations, fix spelling, or normalize profanity/censoring (e.g., keep "f*ck" as "f*ck").
    - If partially occluded or low-confidence, capture only the visible characters; do NOT guess missing parts.
    - If text is mirrored/rotated/distorted, still record it verbatim as it appears once deciphered; if undecipherable, omit it (do not invent).
    - If a scannable code (QR/barcode) is visible, include a text fragment with text="[QR_CODE]" or "[BARCODE]" verbatim and its pointer.
    - Record separate fragments for separate locations; order them top-to-bottom, left-to-right where feasible.

    CULTURAL DECISION PRIORITY
    - If BOTH a clear category (CR10.*) and an ambiguous category (CA11.*) could apply, ALWAYS choose the most specific and clear CR10.* label. CA11 is a fallback ONLY when clear identification or fidelity cannot be established.

    CULTURAL SENSITIVITY CHECKLIST
    Mark CR10.1 (Cultural Appropriation – Clear) if ANY of the following are present:
      A) STRUCTURE: A headpiece or garment exhibiting structured or canonical arrangement (e.g., radial plume fan, stacked feather rows, roach/warbonnet form, clerical vestment layout, ritual sash/belt).
      B) MATERIALS/MOTIFS: Distinctive materials/patterns (e.g., beadwork bands/rosettes, quillwork, woven ritual sashes, canonical color blocking).
      C) COMPLEMENTARY ELEMENTS: Additional ceremonial cues alongside the headpiece (e.g., ritual face/body markings, beaded chestplate, robe/vestment, sash, procession-like layout).
      D) CONTEXT OF USE: The object/attire is used as costume/fashion/entertainment rather than a neutral/educational context.

    If only ONE cue is present and identity remains uncertain → CA11.1 (Ambiguous Cultural Referencing).
    If sacred objects/places are used in profane/sexualized/violent ways → prefer CR10.2/CR10.5 as applicable.

    OUTPUT
    Return ONLY JSON with this shape (no extra text):
    {{
      "taxonomy_version": "<from taxonomy>",
      "summary": "short overall assessment",
      "overall_recommended_action": "ALLOW | REQUIRE_EDITS | BLOCK",
      "flags": [
        {{
          "category_id": "e.g., HE3",
          "subcategory_id": "e.g., HE3.4",
          "rationale": "sanitized, concrete reason based on visible evidence (paraphrase slurs; do not repeat verbatim).",
              "evidence": {{"type": "texture|geometry|animation|text|metadata", "pointer": "view# or region if applicable"}},
          "confidence": 0.0
        }}
      ],
      "text_fragments": [
        {{
          "text": "string verbatim (used for downstream analysis)",
          "pointer": "view# / bbox / texture ref",
          "language": "ISO code if known (optional)"
        }}
      ]
    }}

    VALIDATION RULES
    - Map only to categories/subcategories present in the TAXONOMY JSON below.
    - subcategory_id MUST start with its parent category_id (e.g., HE3.4 under HE3).
    - If no issues are present, return "flags": [] and an empty "text_fragments": [].

        TAXONOMY_JSON_START
        {taxonomy_json}
        TAXONOMY_JSON_END

        FEW-SHOT EXAMPLES
        {few_shot_examples}""".strip()

# User-turn instructions sent with every image. The lines are joined with a
# newline followed by four spaces, which is the layout the prompt has always had.
user_template = "\n    ".join([
    "Evaluate the following single image of a 3D model. Use the taxonomy in the system message.",
    "Return ONLY the JSON specified in the system message under OUTPUT. Do not include any extra text.",
    'If nothing problematic is present, return an empty "flags": [] and set overall_recommended_action to "ALLOW".',
])

def _taxonomy_failure_result(summary, action, subcategory_id, rationale):
    """Build the fallback result returned when the model gives no usable labels."""
    return {
        "taxonomy_version": "2.0",
        "summary": summary,
        "overall_recommended_action": action,
        "flags": [{
            "category_id": "E10",
            "subcategory_id": subcategory_id,
            "rationale": rationale,
            "evidence": {"type": "", "pointer": ""},
            "confidence": 1.0,
        }],
        "text_fragments": []
    }


def taxonomy_label_image(img_b64, system_prompt, user_prompt):
    """
    Ask the vision model to label one image against the moderation taxonomy.

    Args:
        img_b64: Value sent as the image URL of the user message (callers in
            this notebook pass the data URL produced by ``b64_image``).
        system_prompt: System message text (instructions plus taxonomy).
        user_prompt: Text part of the user message.

    Returns:
        The model's JSON answer parsed into Python objects. When the model
        refuses, returns no content, or returns text that is not valid JSON, a
        fallback dict is returned instead: one ``E10`` flag and an
        ``overall_recommended_action`` of ``BLOCK`` (refusal) or
        ``REQUIRE_EDITS`` (no usable output).

    Raises:
        Whatever ``client.chat.completions.create`` raises (network/API errors).
    """
    messages = [
        {"role": "system", "content": system_prompt},
        {
            "role": "user",
            "content": [
                {"type": "text", "text": user_prompt},
                {"type": "image_url", "image_url": {"url": img_b64}}
            ]
        }
    ]
    out = client.chat.completions.create(
        model=MODEL_VISION_AGENT,
        messages=messages,
        temperature=0.0,
        response_format={"type": "json_object"}
    )

    message = out.choices[0].message
    if message.refusal:
        return _taxonomy_failure_result(
            summary="Moderation refused for safety; escalate to human.",
            action="BLOCK",
            subcategory_id="E10.2",
            rationale="Refusal triggered.",
        )
    if not message.content:
        return _taxonomy_failure_result(
            summary="Moderation failed for unknown reasons.",
            action="REQUIRE_EDITS",
            subcategory_id="E10.1",
            rationale="No moderation results available.",
        )

    try:
        return json.loads(message.content)
    except json.JSONDecodeError:
        # Unparseable output (for example, cut off mid-object) is treated like
        # "no result" so one bad answer cannot abort a whole batch.
        return _taxonomy_failure_result(
            summary="Moderation output could not be parsed as JSON.",
            action="REQUIRE_EDITS",
            subcategory_id="E10.1",
            rationale="No moderation results available.",
        )

In [None]:
# Taxonomy labelling results, keyed by image file name; filled in by the
# thread-pool cell below.
taxonomy_moderation_results = {}


def process_image(_image_path):
    """
    Label a single image with the taxonomy agent.

    Encodes the file with ``b64_image`` and sends it to ``taxonomy_label_image``
    with the notebook-level ``system_spine`` and ``user_template`` prompts.

    Returns:
        A ``(file name, labelling result)`` tuple, where the file name is the
        base name of ``_image_path``.
    """
    encoded_image = b64_image(_image_path)
    labels = taxonomy_label_image(
        encoded_image,
        system_prompt=system_spine,
        user_prompt=user_template,
    )
    file_name = os.path.basename(_image_path)
    return file_name, labels

# Label all images concurrently. Each future is mapped back to its source path
# so a failure can be attributed to a specific image.
with concurrent.futures.ThreadPoolExecutor() as executor:
    future_to_path = {
        executor.submit(process_image, image_path): image_path
        for image_path in image_paths
    }

    for future in concurrent.futures.as_completed(future_to_path):
        failed_name = os.path.basename(future_to_path[future])
        try:
            name, result = future.result()
        except Exception as exc:
            # future.result() re-raises whatever the worker raised. Report it
            # and keep going so one bad image or API error does not discard
            # the results of every other image.
            print("failed", failed_name, repr(exc))
            continue
        taxonomy_moderation_results[name] = result
        print("completed", name)
taxonomy_moderation_results

In [None]:


#
# def fuse_decisions(openai_mod, taxonomy_out):
#     """
#     Example fusion:
#     - If OpenAI moderation shows high-risk classes (sexual minors, explicit sex, self-harm, extremist symbols),
#       force BLOCK / human review per your policy.
#     - Otherwise trust your taxonomy agent as the source of record.
#     """
#     # Pseudocode: extract OpenAI moderation signals safely from resp
#     # The shape may include categories/scores like: resp.results[0].categories, etc.
#     # You must inspect the exact response fields at implementation time.
#     high_risk_hit = False
#     # Example sketch:
#     # cats = openai_mod.results[0].categories
#     # if cats.get("self-harm") or cats.get("sexual/minors") or cats.get("hate/extremism"):
#     #     high_risk_hit = True
#
#     if high_risk_hit and taxonomy_out.get("overall_recommended_action") != "BLOCK":
#         taxonomy_out["overall_recommended_action"] = "BLOCK"
#         if taxonomy_out.get("flags") is None:
#             taxonomy_out["flags"] = []
#         taxonomy_out.setdefault("summary", "Escalated by first-pass OpenAI moderation signal.")
#     return taxonomy_out
#
# def evaluate_folder(test_dir, system_prompt, user_prompt):
#     results = []
#     for path in sorted(glob.glob(os.path.join(test_dir, "*.*"))):
#         img_b64 = b64_image(path)
#         openai_mod = openai_moderate_image(img_b64)
#         taxonomy_out = taxonomy_label_image(img_b64, system_prompt, user_prompt)
#         final_out = fuse_decisions(openai_mod, taxonomy_out)
#         results.append({"image_path": path, "openai_moderation_raw": openai_mod.model_dump(), "decision": final_out})
#         print(f"[OK] {Path(path).name} -> {final_out.get('overall_recommended_action')}")
#     with open("results_with_openai_moderation.jsonl", "w", encoding="utf-8") as f:
#         for r in results:
#             f.write(json.dumps(r, ensure_ascii=False) + "\n")
#
# # --- Use your existing system prompt + taxonomy JSON and user prompt here ---
# SYSTEM_PROMPT = "<your full system prompt with TAXONOMY_JSON inserted>"
# USER_PROMPT   = (
#     "Evaluate the following single image of a 3D model. Use the taxonomy in the system message. "
#     "Return ONLY the JSON specified in the system message under OUTPUT. "
#     "If nothing problematic is present, return flags: [] and set overall_recommended_action to ALLOW."
# )
#
# if __name__ == "__main__":
#     evaluate_folder("test_cases", SYSTEM_PROMPT, USER_PROMPT)
