In [1]:
import json

In [2]:
# Path of the JSON file that holds the preprocessed English content chunks.
file_path = "output/Objectifying_China/preprocessed/en_contents_v2.json"

# Read the file as UTF-8 text and parse the whole document into `data`.
with open(file_path, mode="r", encoding="utf-8") as source:
    data = json.loads(source.read())


In [None]:
def section_chunker(file_path, output_path="en_contents_chunked.json", exhibit="objectifying_china"):
    """
    Group the text and image chunks of a JSON file into header-delimited sections.

    The input JSON is expected to be a dict whose values are lists of chunk
    dicts. The lists are flattened in their original order. Every chunk with
    ``type == "text"`` that also carries a ``"text_level"`` key is treated as
    a header and starts a new section; the text and image chunks that follow
    are attached to that section until the next header appears.

    Notes:
        * Chunks that appear before the first header are dropped.
        * Chunks whose type is neither "text" nor "image" are ignored.
        * The header chunk's own ``page_idx`` is not added to the section;
          only the pages of the body text and image chunks are recorded.
        * If two headers share the same id, the later section replaces the
          earlier one in the result.

    Args:
        file_path: Path of the input JSON file (read as UTF-8).
        output_path: Path of the JSON file to write. Defaults to
            "en_contents_chunked.json" in the current working directory.
        exhibit: Value stored under the "exhibit" key of every section.

    Returns:
        None. The grouped sections are written to ``output_path`` as a JSON
        object keyed by the id of each section's header chunk. In each
        section, "text" is a single space-joined string and "page_idx" is a
        sorted list of unique page indices.
    """
    with open(file_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    # Flatten chunks (original order)
    all_chunks = [chunk for chunk_list in data.values() for chunk in chunk_list]

    # Grouped result as a dictionary
    grouped = {}
    current_group = None

    def _store(group):
        # Finalise one section: de-duplicate and sort its pages, collapse the
        # collected text fragments into one string, tag it with the exhibit
        # name and register it under its header id. Using a single helper for
        # both the mid-loop and the trailing flush guarantees that every
        # section (not only the last one) receives the "exhibit" tag.
        group["page_idx"] = sorted(set(group["page_idx"]))
        group["text"] = " ".join(group["text"])
        group["exhibit"] = exhibit
        grouped[group["id"]] = group

    for chunk in all_chunks:
        if chunk["type"] == "text" and "text_level" in chunk:
            # A header closes the section that was being built, if any.
            if current_group is not None:
                _store(current_group)

            # Start new group
            current_group = {
                "id": chunk["id"],
                "type": "section",
                "header": chunk["text"],
                "text": [],
                "page_idx": [],
                "img_path": [],
                "img_caption": [],
                "img_footnote": [],
                # leave blank for topic tagging
                # "section_type": [],
                # "time_period": [],
                # "art_medium": [],
                # "themes": []
            }

        elif current_group is not None:
            if chunk["type"] == "text":
                current_group["text"].append(chunk["text"])
                current_group["page_idx"].append(chunk["page_idx"])
            elif chunk["type"] == "image":
                current_group["img_path"].append(chunk["img_path"])
                current_group["page_idx"].append(chunk["page_idx"])
                # Captions and footnotes are optional on image chunks.
                current_group["img_caption"].extend(chunk.get("img_caption", []))
                current_group["img_footnote"].extend(chunk.get("img_footnote", []))

    # Save the last group if it exists.
    if current_group is not None:
        _store(current_group)

    # Save as JSON
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(grouped, f, indent=2)

In [10]:
# Build the sections from the preprocessed file; the result is written to
# "en_contents_chunked.json" in the current working directory.
section_chunker(file_path)

In [5]:
# NOTE(review): this reads "grouped_output.json", whereas section_chunker
# writes "en_contents_chunked.json" — confirm which file is intended here.
# Decode explicitly as UTF-8 so the result does not depend on the platform's
# default encoding.
with open("grouped_output.json", "r", encoding="utf-8") as f:
    sampled_data = json.load(f)

In [None]:
# Inspect the stored entry for one section. (Calling dict.update() with no
# arguments changes nothing and returns None, so it would display nothing.)
sampled_data['lnwpgxpl']

{'id': 'lnwpgxpl',
 'type': 'section',
 'header': 'Objectifying China',
 'text': ['Ming and Qing Dynasty Ceramics and Their Stylistic Influences Abroad'],
 'page_idx': [3],
 'img_path': [],
 'img_caption': [],
 'img_footnote': [],
 'section_type': [],
 'time_period': [],
 'art_medium': [],
 'themes': []}

In [37]:
# Pull the "text" field of one section out for closer inspection.
text_sample = sampled_data["qwdktbdk"]["text"]

In [38]:
# Display the extracted text.
text_sample 

['The Chinese term ‘ci’ (translated as porcelain in English) refers to all ceramics that are fired at high temperatures, including porcelain and stoneware.',
 'In the West, the term porcelain refers specifically to white ceramics made with a special type of clay called kaolin and fired to a temperature of about $1300^{\\circ}\\mathrm{C}$ , which results in a translucent, glassy material that makes a ringing sound when struck.',
 'Stoneware is used to refer to related ceramics that are similarly hard and dense, but which are made with grey or brown clay, may or may not be white-bodied, do not transmit light, and are fired to a slightly lower temperature of 1000 to $1250^{\\circ}\\mathrm{C}$ .',
 'Ceramics fired below this temperature range are called earthenwares.',
 'The terms ‘protoporcelain’ or ‘porcellaneous’ are sometimes used to describe early ceramics made with some of the same ingredients and physical characteristics of porcelain.']

In [44]:
# Overwrite this section's header in the in-memory dict; nothing in the
# cells shown here writes the change back to disk.
sampled_data["qwdktbdk"]['header'] = "Porcelain"

In [45]:
# Confirm the header now holds the new value.
sampled_data["qwdktbdk"]['header']

'Porcelain'