In [None]:
import sqlite3, random, pathlib
from search import search_library
from dotenv import load_dotenv
from openai import OpenAI, OpenAIError
from pydantic import BaseModel, Field, ValidationError
import os
from openai import OpenAI
from pydantic import BaseModel
import json
import time
import random
from helpers.sentence_splitter import split_sentences
from time import sleep
from tqdm import tqdm  # optional: pip install tqdm

from search import search_library
# Path handed to search_library as its first argument, and the cap on the
# number of search results requested from it.
# NOTE(review): both constants are assigned again, with the same values,
# next to the client setup further down in this file.
DB_FILE = "library_temp.sqlite"
MAX_RESULTS = 100

def ask(sys_msg, usr_msg):
    """Send one system/user message pair to the chat model and return the reply.

    Uses the module-level ``client`` and ``OPENAI_MODEL``.

    Args:
        sys_msg: Content of the system message.
        usr_msg: Content of the user message.

    Returns:
        The ``content`` of the first choice's message.
    """
    conversation = [
        {"role": "system", "content": sys_msg},
        {"role": "user", "content": usr_msg},
    ]
    completion = client.chat.completions.create(
        model=OPENAI_MODEL,
        messages=conversation,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content


def print_obj(obj, i=1, params=("agent", "interaction", "space", "time", "causal link", "physical law")):
    """Print a numbered header followed by one ``key: value`` line per field.

    Args:
        obj: Mapping (or anything indexable by the entries of ``params``).
        i: Number printed in the header line.
        params: Keys to print, in order. The default is a tuple rather than a
            list so the default object cannot be mutated between calls.

    Raises:
        KeyError: If ``obj`` is a dict that lacks one of ``params``.
    """
    print(f"{i} - ")
    for param in params:
        print(f"{param}: {obj[param]}")
    

def parse_func_resp(resp, obj_name="element", params=("name", "text"), print="True"):
    """Extract the list of parsed objects from a tool-call chat completion.

    Args:
        resp: Chat-completion response whose first choice carries a tool call
            with JSON-encoded arguments.
        obj_name: Key in the decoded arguments that holds the list of objects.
        params: Unused; kept so existing positional calls keep working.
        print: Unused; kept so existing positional calls keep working. Callers
            in this file pass the strings "True"/"False", which are both
            truthy, so the flag never acted as a switch.

    Returns:
        A new list with the objects found under ``obj_name``.

    Raises:
        ValueError: If the first choice contains no tool call (possible when
            the request leaves the tool choice up to the model).
        json.JSONDecodeError: If the tool-call arguments are not valid JSON.
        KeyError: If ``obj_name`` is missing from the decoded arguments.
    """
    tool_calls = resp.choices[0].message.tool_calls
    if not tool_calls:
        # Fail with a clear message instead of "NoneType is not subscriptable".
        raise ValueError("model response contains no tool call to parse")

    decoded = json.loads(tool_calls[0].function.arguments)
    return list(decoded[obj_name])

def elements_to_string(dims, label="ELEMENTS"):
    """Render a ``{dimension: [element, ...]}`` mapping as a text block.

    Each dimension becomes its name, a ``"<label>:"`` line, one line per
    element, and a trailing blank line.

    Args:
        dims: Mapping from dimension name to an iterable of element strings.
        label: Heading printed under each dimension name.

    Returns:
        The concatenated text; an empty string for an empty mapping.
    """
    # Collect the pieces and join once instead of growing a string with +=.
    parts = []
    for dim, elems in dims.items():
        parts.append(f"{dim}\n{label}:\n")
        parts.extend(elem + "\n" for elem in elems)
        parts.append("\n")

    return "".join(parts)

def sample_text(text, x):
    """Return a randomly positioned slice of ``text`` that is up to ``x`` long.

    Args:
        text: Sequence to sample from.
        x: Requested excerpt length.

    Returns:
        ``text[start:start + x]`` for a start drawn uniformly from the
        positions that leave room for a full-length excerpt (start is 0 when
        ``text`` is shorter than ``x``).
    """
    last_valid_start = len(text) - x
    if last_valid_start < 0:
        last_valid_start = 0
    begin = random.randint(0, last_valid_start)
    return text[begin:begin + x]

def ask_func(sys_msg, usr_msg, FUNC):
    """Send a system/user message pair with one tool definition attached.

    Uses the module-level ``client`` and ``OPENAI_MODEL``.

    Args:
        sys_msg: Content of the system message.
        usr_msg: Content of the user message.
        FUNC: Tool definition offered to the model as the only tool.

    Returns:
        The raw chat-completion response (not just the message text).
    """
    # NOTE: tool_choice="auto" leaves it to the model whether to call the tool,
    # so the response may come back without any tool call.
    request = {
        "model": OPENAI_MODEL,
        "messages": [
            {"role": "system", "content": sys_msg},
            {"role": "user", "content": usr_msg},
        ],
        "tools": [FUNC],
        "tool_choice": "auto",
        "temperature": 0.5,
    }
    return client.chat.completions.create(**request)

def sample_elems(elements, per_dim, per_elem):
    """Randomly keep a fraction of dimensions and a fraction of each one's elements.

    Args:
        elements: Mapping from dimension name to a sequence of elements.
        per_dim: Fraction of dimensions to keep; the count is
            ``int(per_dim * len(elements))`` capped at the number available.
        per_elem: Fraction of elements to keep within each chosen dimension,
            computed the same way.

    Returns:
        A new dict with the sampled dimensions (in sampled order), each mapped
        to a list of its sampled elements.

    Raises:
        ValueError: From ``random.sample`` if a fraction is negative.
    """
    # Materialize the items once; the original rebuilt the key and value lists
    # on every loop iteration.
    dims = list(elements.items())
    total_dim = len(dims)
    ndims = min(int(per_dim * total_dim), total_dim)

    sampled = {}
    for di in random.sample(range(total_dim), ndims):
        dimension_name, dimension_elems = dims[di]

        total_elems = len(dimension_elems)
        nelems = min(int(per_elem * total_elems), total_elems)

        element_indices = random.sample(range(total_elems), nelems)
        sampled[dimension_name] = [dimension_elems[ei] for ei in element_indices]

    return sampled

def clean_elements(elements_raw):
    """Turn parsed ``{"name", "text"}`` records into ``{name: [lines]}``.

    Args:
        elements_raw: Iterable of dicts with ``"name"`` and ``"text"`` keys.

    Returns:
        A dict mapping each name to the text's lines, split on newlines,
        stripped of surrounding whitespace, with blank lines dropped. A
        repeated name keeps only the last record's lines.
    """
    def _nonblank_lines(block):
        stripped = (line.strip() for line in block.split('\n'))
        return [line for line in stripped if line]

    return {entry["name"]: _nonblank_lines(entry["text"]) for entry in elements_raw}


def remove_story_scaffold_keys(data):
    """Return a copy of ``data`` without keys that contain "Story Scaffold".

    Args:
        data: Mapping whose keys support the ``in`` substring test.

    Returns:
        A new dict with the remaining items in their original order.
    """
    kept = {}
    for key, value in data.items():
        if "Story Scaffold" in key:
            continue
        kept[key] = value
    return kept

    

# Load settings from a .env file (if any) before the client and model name are
# read from the environment.
load_dotenv()
# Shared client used by ask() and ask_func().
# presumably picks up its API key from the environment — TODO confirm
client = OpenAI()
# Model name for every request; falls back to "gpt-4o" when OPENAI_MODEL is unset.
OPENAI_MODEL = os.getenv("OPENAI_MODEL", "gpt-4o")

# NOTE(review): these two repeat the assignments made right after the imports.
DB_FILE = "library_temp.sqlite"
MAX_RESULTS = 100


# System prompt for the first extraction pass: asks the model to pull six
# categories of self-contained story elements out of a diamond text excerpt.
# Sent together with a "TEXT EXCERPT:" user message in the extraction cell.
SYS_PROMPT_ELEMENTS_DIAMONDS = """
My goal is to construct a foundation of story elements relevant to diamonds, including their scientific properties, classification systems, cultural roles, synthetic processes, and economic or social significance. I want each element to be self-explanatory so that it can serve as a modular unit in the construction of narratives or analytical models about diamonds.

The following text is from reference material on diamonds including gemological manuals, technical reports, and buying guides.

Extract a comprehensive and self-contained list of:

Agents: (e.g. individual or collective actors involved with diamonds — people, institutions, tools, forces, material)
Interactions: (e.g. processes, mechanisms, or relationships between agents or diamond properties)
Spaces: (e.g. geological environments, laboratory contexts, economic or retail markets, regulatory spaces)
Time: (e.g. chronological sequences, stages of formation, market dynamics over time)
Causal Links: (e.g. cause-effect chains linking diamond treatments, grading, formation processes, or purchase behavior)
Physical Laws / Constraints: (e.g. physical, chemical, or optical principles governing diamond properties, classification, or manipulation)

Please format each category as a list of bullet points. Each bullet point must be a **self-contained element**: it should define the item clearly, include context or examples if helpful, and be understandable in isolation.

"""


# System prompt asking for the "story dimensions" needed to tell a complete
# story about a set of elements.
# NOTE(review): not referenced by any of the cells visible in this file, and
# its subject (Drosophila hindgut morphogenesis) differs from the diamond
# prompts — presumably carried over from another notebook; confirm before use.
SYS_PROMPT_DIMENSIONS = """
You are helping build a complete narrative of hindgut morphogenesis in Drosophila, with special attention to the role of mechanical forces - driving the rotation and elongation of the hindgut - and the overexpression of myosin 1c (Myo1C) in left-right (LR) inversion.

Your task is not to tell the story, but to define what would be required to tell a **complete and comprehensive story** for this set of story elements.

- Break down these elements into their essential **story dimensions** or **sub-components**.
- Think about biological scale (molecular, cellular, tissue), space, time, dynamics, agents, interactions, functions, and unknowns.
- Consider how these elements might relate to:
  - other parts of the hindgut
  - the mechanical forces shaping the tissue
  - the overexpression of Myo1C and its effects on LR inversion

Output a structured list of **story dimensions** that should be explored in order to fully develop these elements. If helpful, include example questions or variables within each dimension.

Format:

**1. Dimension Name**  
- Description of what this dimension involves.  
- Example questions or aspects to explore.

(Repeat as needed for all relevant dimensions)
"""

# System prompt for the parsing pass: tells the model how to split free-form
# extraction output into {name, text} records for the ELEM_FUNC tool call.
# Adjacent string literals are concatenated, so every piece must end with its
# own separator; the fourth piece previously lacked one, which fused two
# sentences into "...new element.Capture all following...".
SYS_PROMPT_PARSE_ELEMENTS = ("Read the following text and separate each element of the suggested story framework into a object with the name and text.\n"
        "You should return a list of objects containing each story element in a given structure.\n"
        "Do not add or remove text; your job is to parse and deconstruct the text string into the given structure.\n"
        "Treat every line that begins with a number followed by “.” and a space (e.g. “1. ”) or **Element** as the NAME of a new element.\n"
        "Capture all following lines up to—but not including—the next such line as the TEXT for that element.")


# Tool definition passed to ask_func(): the model is expected to return an
# "element" array of objects, each with a required "name" and "text" string.
# parse_func_resp() reads that array back out under the "element" key.
# NOTE: the backslash continuations inside "description" keep the next line's
# leading indentation as part of the string.
ELEM_FUNC = {
    "type": "function",
    "function": {
        "name": "element_function",
        "description": "Read the following text and separate each element of the suggested story framework into a object with the name and text. \
        You should return a list of objects containing each story element in a given structure. \
        Do not add or remove text; your job is to parse and deconstruct the text string into the given structure.",
        "parameters": {
            "type": "object",
            "properties": {
                "element": {
                    "type": "array",
                    "items": {
                        "type": "object",
                        "properties": {
                            "name": { "type": "string" },
                            "text": { "type": "string" }
                        },
                        "required": ["name", "text"]
                    }
                }
            },
            "required": ["element"]
        }
    }
}

# Tool definition with the same outer shape as ELEM_FUNC, but each item of the
# "element" array carries only a required "name" string.
# NOTE(review): not referenced by any of the cells visible in this file.
ADD_ELEM_FUNC = {
    "type": "function",
    "function": {
        "name": "elements_function",
        "description": "Converts a string of elements into a list of elements.",
        "parameters": {
            "type": "object",
            "properties": {
                "element": {
                    "type": "array",
                    "items": {
                        "type": "object",
                        "properties": {
                            "name": { "type": "string" }
                        },
                        "required": ["name"]
                    }
                }
            },
            "required": ["element"]
        }
    }
}


In [11]:
# Search the library for the keyword(s) and index the hits by title, printing
# each title with the length of its text. A repeated title keeps the last text.
KEYWORDS = ["diamond"]
search_results = search_library(DB_FILE, KEYWORDS, MAX_RESULTS, match_all=True)
papers = {}
for hit in search_results:
    hit_title, hit_text = hit['title'], hit['text']
    papers[hit_title] = hit_text
    print(hit_title, len(hit_text))

Diamond Handbook: How To Look At Diamonds & Avoid Ripoffs 319838
The Diamond Book: CIBJO Diamond Commission 2022-1 59207
Diamond: Electronic Properties and Applications 1154803
Diamonds: An Early History of the King of Gems 693764
Legend of the Rock: A Two Thousand Year History of the Diamond 21569
Diamonds 142168


In [12]:
# Preview: print one random 10,000-character excerpt per paper under its title.
# Iterating the dict items directly avoids rebuilding the key and value lists
# for every paper.
for title, text in papers.items():
    excerpt = sample_text(text, 10000)
    print(title)
    print("***"*20)
    print(excerpt)

Diamond Handbook: How To Look At Diamonds & Avoid Ripoffs
************************************************************
 of diamonds by heating diamonds to temperatures 
above 1900°C under extreme pressure. It can turn inexpensive brown 
diamonds colorless or make them green, yellow, blue, red or pink. The 
color is stable. 
The main advantage to buying treated diamonds is price. Dealer 
Paul Reiser, who specializes in selling second-hand diamonds, says that 
laser-drilled stones are typically discounted 25 to 30 % and fracture-filled 
ones often sell at 40% off of the secondary market price of untreated 
stones. 
Diamonds colored by irradiation or HPHT treatment are a fraction of 
the cost of natural fancy color diamonds. For example, a one-carat 
irradiated “fancy” green diamond of VS clarity may retail for $3000 to 
$4000 per carat. If the same diamond were of natural color, it would 
probably sell for over $100,000 per carat because natural green diamonds 
are unusually rare. In Oct

In [16]:
# Two-pass extraction, run on three random excerpts per paper:
#   1. ask() produces free-text story elements for the excerpt;
#   2. ask_func() + parse_func_resp() split that text into {name, text}
#      records, which clean_elements() turns into {category: [lines]}.
# One dict per excerpt is collected in elements_all.
elements_all = []

# Iterate the texts directly instead of indexing into freshly built key/value
# lists on every pass.
for text in papers.values():
    for _ in range(3):
        excerpt = sample_text(text, 10000)

        sys_msg = SYS_PROMPT_ELEMENTS_DIAMONDS
        usr_msg = f"TEXT EXCERPT:\n{excerpt}"
        elements_resp = ask(sys_msg, usr_msg)

        sys_msg = SYS_PROMPT_PARSE_ELEMENTS
        usr_msg = f"STORY FRAMEWORK:\n{elements_resp}"
        elements_func = ask_func(sys_msg, usr_msg, ELEM_FUNC)

        elements_raw = parse_func_resp(elements_func, "element", ["name", "text"], "False")
        # Named excerpt_elements so this dict does not take over the
        # notebook-level name `elements`, which other cells use for the flat
        # list of element strings.
        excerpt_elements = clean_elements(elements_raw)

        elements_all.append(excerpt_elements)
        print(excerpt_elements)

{'Agents': ['- Jewelers: Retailers and specialists in selecting and selling high-quality diamonds.', '- Suppliers: Companies providing well-cut diamonds to jewelers and stores.', '- Manufacturers: Companies patenting and producing specific diamond cuts, such as William Goldberg Corp, Baroka Creations Inc., Lili Diamonds, and Royal Asscher Diamond Co.', '- Distributors: Entities responsible for distributing diamonds in various regions, like Douglas Mays for Wild & Petsch.', '- Customers: Individuals purchasing diamonds, often attracted by unique cuts and quality.', '- Certification Bodies: Organizations such as GCAL, GIA, AGS, and HRD providing certification and grading for diamonds.', '- Designers: High-end designers using specific branded cuts for luxury jewelry creations.', '- Retail Stores: Physical or online stores showcasing and selling diamonds to end consumers.'], 'Interactions': ['- Branding: The strategic naming and marketing of specific diamond cuts to increase appeal and rec

In [18]:
# Show every per-excerpt element dict. The loop variable is deliberately not
# called `elements`: other cells use that name for the flat list of element
# strings, and re-running this cell would otherwise replace it with a dict.
for excerpt_elements in elements_all:
    print(excerpt_elements)

{'Agents': ['- Jewelers: Retailers and specialists in selecting and selling high-quality diamonds.', '- Suppliers: Companies providing well-cut diamonds to jewelers and stores.', '- Manufacturers: Companies patenting and producing specific diamond cuts, such as William Goldberg Corp, Baroka Creations Inc., Lili Diamonds, and Royal Asscher Diamond Co.', '- Distributors: Entities responsible for distributing diamonds in various regions, like Douglas Mays for Wild & Petsch.', '- Customers: Individuals purchasing diamonds, often attracted by unique cuts and quality.', '- Certification Bodies: Organizations such as GCAL, GIA, AGS, and HRD providing certification and grading for diamonds.', '- Designers: High-end designers using specific branded cuts for luxury jewelry creations.', '- Retail Stores: Physical or online stores showcasing and selling diamonds to end consumers.'], 'Interactions': ['- Branding: The strategic naming and marketing of specific diamond cuts to increase appeal and rec

In [19]:
# Flatten all per-excerpt dicts into one list: list values contribute their
# items, any other value is added as a single entry. Then report the size.
elements = []
for excerpt_dict in elements_all:
    for entry in excerpt_dict.values():
        elements += entry if isinstance(entry, list) else [entry]

print(len(elements))

168


In [20]:
# System prompt for the gap-check pass: the model compares a text excerpt with
# a list of already extracted elements and lists what is missing.
SYSTEM_PROMPT_ELEMENTS_MORE_DIAMONDS = """
You are a gemology and materials science expert helping to extract all relevant physical, chemical, commercial, and cultural elements described in a text about diamonds.  
Your task is to read the full text, examine a provided list of previously extracted elements, and evaluate whether it is complete.  
If any relevant elements (e.g., classifications, material properties, synthesis methods, grading criteria, historical facts, cultural meanings, or industrial applications) are mentioned in the text but missing from the list, identify and list them clearly.
Each new element should be self-contained, meaning it includes enough context to be understood without referencing the rest of the list.
"""

# User-message template for the gap-check pass; filled with str.format using
# the `excerpt` and `elements_string` fields.
USER_PROMPT_ELEMENTS_MORE_DIAMONDS = """
Here is the excerpt from a text on diamonds:

{excerpt}

Below is the current list of identified elements:

{elements_string}

Is this list of elements comprehensive based on the text? If not, what elements are missing? Please provide the missing elements in a clean list.
"""

In [21]:

# Gap check: for each paper, show the model a fresh random excerpt next to a
# random sample of up to 100 known elements and collect its free-text list of
# missing elements.
additional_elements = []
# Iterate the texts directly instead of indexing into freshly built key/value
# lists on every pass.
for text in papers.values():
    excerpt = sample_text(text, 10000)
    sampled_elements = random.sample(elements, min(100, len(elements)))
    elements_string = "\n".join(f"- {el}" for el in sampled_elements)
    sys_msg = SYSTEM_PROMPT_ELEMENTS_MORE_DIAMONDS
    usr_msg = USER_PROMPT_ELEMENTS_MORE_DIAMONDS.format(excerpt=excerpt, elements_string=elements_string)
    add_elem = ask(sys_msg, usr_msg)
    print(add_elem)
    additional_elements.append(add_elem)

The list you've provided is extensive, but it does not cover several aspects specifically detailed in the diamond text excerpt you shared. Here are missing elements from the excerpt:

1. **Clarity Grading System**: The categorization of diamonds based on inclusions and blemishes, including:
   - VVS (Very, Very Slightly Included)
   - VS (Very Slightly Included)
   - SI (Slightly Included)
   - I (Imperfect), including distinctions of I1, I2, and I3 in different regions (e.g., European grading P1, P2, P3).

2. **Types of Inclusions**:
   - **Crystals**: Inclusion of over 24 types of minerals, most commonly other diamonds; distinctions such as pinpoints and collector-worthy crystals.
   - **Cracks**: Including cleavages and fractures, where significant cracks are referred to as feathers.
   - **Clouds**: Characterized as hazy areas detrimental to transparency and aesthetics if substantial.
   - **Growth or Grain Lines**: Also known as twinning lines that affect transparency and appearan

In [26]:
# Run each free-text "missing elements" answer through the parsing tool call
# and gather the resulting {name, text} records in one list.
to_add_elems = []
for missing_text in additional_elements:
    parse_request = f"ELEMENTS:\n{missing_text}"
    tool_response = ask_func(SYS_PROMPT_PARSE_ELEMENTS, parse_request, ELEM_FUNC)
    to_add_elems.extend(
        parse_func_resp(tool_response, "element", ["name", "text"], "False")
    )
    

In [28]:
# Append every additional record to the flat element list as "name\ntext".
# Running this cell again appends the same records a second time.
elements.extend(
    f"{record['name']}\n{record['text']}" for record in to_add_elems
)

In [29]:
# Size of the flat element list after the additional records were appended.
print(len(elements))

230


In [30]:
# Settings for the summarization run: each batch draws `sample_size` elements
# and its summary is appended to `output_file`.
num_batches = 5        # how many different samples to take
sample_size = 50        # number of elements per sample
output_file = "diamond_summaries.txt"
# System prompt for the summarization pass: asks for the element types, the
# dimensions they span, and the narrative forms they support.
SYSTEM_PROMPT_SUMMARIZE_DIAMONDS = """
You are a gemology and materials science expert with a deep understanding of how scientific, commercial, and cultural narratives are constructed from elemental knowledge.  
Given a list of extracted elements related to diamonds—including physical properties, chemical classifications, synthesis methods, grading criteria, commercial roles, and cultural meanings—your task is to analyze and summarize the structure of this list.

Identify:
- The **types of elements** present (e.g., properties, agents, processes, classifications, technologies, beliefs)
- The **dimensions** they span (e.g., material, temporal, spatial, economic, symbolic)
- The **narrative forms** they support (e.g., scientific explanations, commercial decision-making, cultural metaphors)

Reflect on the **diversity, depth, and potential integration** of these elements into coherent **causal or mechanistic frameworks**, as well as their ability to support **multi-layered narratives** about the development, transformation, and perception of diamonds.
Your output should help guide the construction of rich, multidimensional stories using these elements as foundational components.
"""

# Create (or truncate) the output file and write its heading. UTF-8 is set
# explicitly so model output with non-ASCII characters is written the same way
# regardless of the platform's default encoding.
with open(output_file, "w", encoding="utf-8") as out:
    out.write("# Summary of Random Samples from Element List\n\n")

sys_msg = SYSTEM_PROMPT_SUMMARIZE_DIAMONDS
# For each batch: draw a random sample of elements, ask the model to summarize
# it, and append the summary to the output file.
for i in tqdm(range(num_batches), desc="Sampling and summarizing"):
    # Cap the sample at the number of available elements; random.sample raises
    # ValueError when asked for more items than the list holds.
    sampled = random.sample(elements, min(sample_size, len(elements)))
    formatted_elements = "\n".join(f"- {el}" for el in sampled)

    # NOTE(review): the message states ">500 elements" regardless of the actual
    # size of `elements` — confirm that this is intended.
    usr_msg = f"""Here is a list of elements sampled from a larger library (>500 elements) involved in diamonds:

    {formatted_elements}

    Please:
    1. Summarize what these elements are and what they represent.
    2. Describe the dimensions of narratives they help capture (e.g., material, temporal, spatial, economic, symbolic).
    3. Comment on the **diversity, depth, and potential integration** these elements allow us to tell (e.g. stories of scientific explanations, commercial decision-making, cultural metaphors).
    """
    # Best effort: a failed batch is reported and skipped so the remaining
    # batches still run.
    try:
        summary = ask(sys_msg, usr_msg)
        with open(output_file, "a", encoding="utf-8") as out:
            out.write(f"\n\n## Batch {i+1}\n\n")
            out.write(summary)

        sleep(1.5)  # respectful pause
    except Exception as e:
        print(f"Error on batch {i+1}: {e}")

Sampling and summarizing: 100%|██████████| 5/5 [01:04<00:00, 12.92s/it]
