In [2]:
import json
from openai import AzureOpenAI
import os
from dotenv import load_dotenv
import re

In [3]:
# Allowed tag vocabularies. create_prompt() embeds the theme, time-period and
# art-medium lists verbatim (JSON-encoded) in the prompt, so every spelling
# here is exactly what the model is asked to return.
THEME_TAGS = [
    "symbolism", "technique", "mythical_creatures", "landscapes", "nature"
]
TIME_PERIOD_TAGS = [
    "tang", "song", "yuan", "ming", "qing"
]
# "ceramics" was previously misspelled as "cermanics"; the typo would have been
# shown to the model and written into the tagged output.
ART_MEDIUM_TAGS = [
    "silk", "ceramics", "paintings"
]

# NOTE(review): not referenced by the code visible in this section of the
# notebook - confirm whether it is used elsewhere before removing.
SECTION_TYPE_TAGS = [
    "artist", "historical_context", "artwork"
]

In [4]:
def create_prompt(text, header):
    """Build the classification prompt for one exhibit chunk.

    Args:
        text: Chunk body text, interpolated after the "**Text**" label.
        header: Chunk header, interpolated after the "**Header**" label.

    Returns:
        str: A prompt that lists the allowed tags (JSON-encoded from
        THEME_TAGS, TIME_PERIOD_TAGS and ART_MEDIUM_TAGS) and asks for a JSON
        object with the list-valued keys "themes", "time_period" and
        "art_medium".
    """
    # Each category is labelled after the response key it feeds ("Art Medium"
    # -> "art_medium"), and the per-category hint repeats the opening rule that
    # zero, one or several tags may be selected, so the prompt does not
    # contradict itself.
    return f"""
You are a museum content classifier. Based on the exhibit description below, assign the most appropriate value for each of the following fields using only the given list of tags, you may select one, more than one, or none for each respective category.

Respond only with a JSON object using the tag values provided.

Use all of this context when making your decision.

---

**Header**: {header}
**Text**: {text}

---


**Themes** (choose any that apply, or none):
{json.dumps(THEME_TAGS)}

**Time Period** (choose any that apply, or none):
{json.dumps(TIME_PERIOD_TAGS)}

**Art Medium** (choose any that apply, or none):
{json.dumps(ART_MEDIUM_TAGS)}

---

Respond with JSON in this format:
{{
  "themes": ["ThemeTagHere"],
  "time_period": ["TimePeriodTagHere"],
  "art_medium": ["ArtMediumTagHere"]
}}
"""

def clean_json_string(s: str) -> str:
    """Strip Markdown code-fence wrapping from a model reply.

    If the reply contains a ``` fence, everything before the opening fence and
    everything from the last closing fence onwards is discarded, so chatty
    replies such as 'Here you go: ```json {...}``` Hope that helps' still
    reduce to the JSON payload. A leading "json" language specifier
    (case-insensitive) is removed. Replies without any fence are only
    whitespace-stripped.

    Args:
        s: Raw text returned by the model.

    Returns:
        str: The text with fence markers, surrounding chatter and the "json"
        specifier removed. The result is not validated as JSON.
    """
    s = s.strip()
    fence_start = s.find("```")
    if fence_start == -1:
        return s

    # Drop any preamble together with the opening fence (3+ backticks).
    body = s[fence_start:].lstrip("`")

    # Cut at the LAST fence so backticks inside the payload are less likely to
    # truncate it; an unterminated fence simply keeps the remainder.
    fence_end = body.rfind("```")
    if fence_end != -1:
        body = body[:fence_end]
    body = body.rstrip("`").strip()

    # Remove language specifier if it exists (e.g. "json").
    return re.sub(r"^json\s*", "", body, flags=re.IGNORECASE)

def tag_chunk(text, header):
    """Ask the Azure OpenAI deployment to tag one chunk and parse its reply.

    Args:
        text: Chunk body text, forwarded to create_prompt.
        header: Chunk header, forwarded to create_prompt.

    Returns:
        dict: The JSON object decoded from the model's reply, or an empty dict
        when the reply is not valid JSON or is valid JSON but not an object.
        In both failure cases the offending text is printed.
    """
    prompt = create_prompt(text, header)
    client = AzureOpenAI(
        api_version=os.environ.get("API_VERSION", "2024-12-01-preview"),
        azure_endpoint=os.environ.get(
            "AZURE_OPENAI_ENDPOINT", "https://openai-ai-museum.openai.azure.com/"
        ),
        api_key=os.environ.get("AZURE_OPENAI_API_KEY"),
    )

    # The historical variable name is misspelled ("DEPLOYEMENT"). It is still
    # honoured first so existing setups behave exactly as before; the correctly
    # spelled name is accepted as a fallback instead of being silently ignored.
    deployment = os.environ.get(
        "AZURE_OPENAI_DEPLOYEMENT",
        os.environ.get("AZURE_OPENAI_DEPLOYMENT", "gpt-4o"),
    )

    response = client.chat.completions.create(
        messages=[{"role": "user", "content": prompt}],
        max_tokens=16384,
        temperature=0,
        model=deployment,
        stream=True,
    )

    # Collect streamed content; chunks without choices or without text content
    # contribute nothing.
    pieces = []
    for chunk in response:
        if chunk.choices:
            content = getattr(chunk.choices[0].delta, "content", None)
            if content is not None:
                pieces.append(content)
    collected_response = "".join(pieces)

    # Clean the response from markdown formatting
    clean_response = clean_json_string(collected_response)

    try:
        parsed = json.loads(clean_response)
    except json.JSONDecodeError:
        print("Could not decode JSON:", clean_response)
        return {}

    # Callers index the result by key, so a JSON list/string/number is as
    # unusable as undecodable text.
    if not isinstance(parsed, dict):
        print("Expected a JSON object but got:", clean_response)
        return {}
    return parsed

def topic_tagger(input_file, output_dir=None):
    """Tag every chunk of a chunked-JSON file via tag_chunk and save the result.

    The input is expected to be a JSON object whose values are per-chunk dicts.
    For each chunk, the 'text' and 'header' fields are sent to tag_chunk and
    the returned 'time_period', 'art_medium' and 'themes' values are stored on
    the chunk.

    Args:
        input_file (str): Path to the chunked JSON file to read.
        output_dir (str, optional): Directory to write the tagged file into,
            using the input file's base name; created if missing. Defaults to
            None, in which case the input file is overwritten.
    """
    # Load chunked JSON (explicit UTF-8 so reading matches how it is written).
    with open(input_file, "r", encoding="utf-8") as f:
        data = json.load(f)

    if output_dir is None:
        output_file = input_file  # Overwrite input file if no output directory specified.
    else:
        os.makedirs(output_dir, exist_ok=True)
        output_file = os.path.join(output_dir, os.path.basename(input_file))

    for item in data.values():
        chunk_text_parts = item.get('text', '')
        # 'text' is normally joined from a list of strings; a plain string is
        # used as-is, since joining it would put a space between every character.
        if isinstance(chunk_text_parts, str):
            chunk_text = chunk_text_parts
        else:
            chunk_text = " ".join(chunk_text_parts)
        header = item.get('header', '')
        llm_tags = tag_chunk(chunk_text, header)

        # Update tags in JSON. tag_chunk returns {} when the reply cannot be
        # parsed, and the model may omit a key; in either case keep the chunk's
        # existing value (or an empty list) instead of raising KeyError and
        # losing the whole run.
        for key in ('time_period', 'art_medium', 'themes'):
            item[key] = llm_tags.get(key, item.get(key, []))

    # Save updated JSON
    with open(output_file, "w", encoding='utf-8') as f:
        json.dump(data, f, indent=2)

In [5]:
# Tag the sample chunk file and write the result into the "tagged" directory.
topic_tagger(
    input_file="output/Objectifying_China/preprocessed/en_contents_chunked_sample2.json",
    output_dir="output/Objectifying_China/tagged",
)