In [49]:
import sqlite3, random, pathlib
from search import search_library
from dotenv import load_dotenv
from openai import OpenAI, OpenAIError
from pydantic import BaseModel, Field, ValidationError
import os
from openai import OpenAI
from pydantic import BaseModel
import json
import time
import random
from helpers.sentence_splitter import split_sentences
from time import sleep
from tqdm import tqdm  # optional: pip install tqdm


In [16]:
def ask(sys_msg, usr_msg):
    resp = client.chat.completions.create(
        model=OPENAI_MODEL,
        messages=[
            {"role": "system", "content": sys_msg},
            {"role": "user", "content": usr_msg}
        ]
    )
    return resp.choices[0].message.content

def print_obj(obj, i=1, params=["agent", "interaction", "space", "time", "causal link", "physical law"]):
    print(f"{i} - ")
    for param in params:
        print(f"{param}: {obj[param]}")
    

def parse_func_resp(resp, obj_name="element", params=["name", "text"], print="True"):
    args_raw = resp.choices[0].message.tool_calls[0].function.arguments
    raws = json.loads(args_raw)[obj_name]

    obj_lst = []
    for i, obj in enumerate(raws, 1):
        if print:
            pass
            #print_obj(obj, i, params)
        obj_lst.append(obj)
    
    return obj_lst

def elements_to_string(dims, label="ELEMENTS"):
    s = ""
    for dim, elems in dims.items():
        s += f"{dim}\n{label}:\n"
        for elem in elems:
            s += elem + "\n"
        s += "\n"

    return s

def sample_text(text, x):
    max_start = max(0, len(text) - x)
    random_start = random.randint(0, max_start)
    excerpt = text[random_start:random_start + x]
    return excerpt

def ask_func(sys_msg, usr_msg, FUNC):
    resp = client.chat.completions.create(
            model=OPENAI_MODEL,
            messages=[{"role":"system","content":sys_msg}, {"role":"user","content":usr_msg}],
            tools=[FUNC], tool_choice="auto", temperature=0.5
            )
    return resp

def sample_elems(elements, per_dim, per_elem):
    total_dim = len(elements)
    ndims = min(int(per_dim * total_dim), total_dim)

    # Sample dimension indices
    dimension_indices = random.sample(range(total_dim), ndims)

    sample_elems = {}

    for di in dimension_indices:
        dimension_name = list(elements.keys())[di]
        dimension_elems = list(elements.values())[di]

        total_elems = len(dimension_elems)
        nelems = min(int(per_elem * total_elems), total_elems)

        # Sample element indices from this dimension
        element_indices = random.sample(range(total_elems), nelems)
        sample_elems[dimension_name] = [dimension_elems[ei] for ei in element_indices]

    return sample_elems

def clean_elements(elements_raw):
    elements = {}
    for element_raw in elements_raw:
        k = element_raw["name"]
        v = element_raw["text"]
        element = (k, [item.strip() for item in v.split('\n') if item.strip()])
        elements[element[0]] = element[1]
    return elements


def remove_story_scaffold_keys(data):
    return {k: v for k, v in data.items() if "Story Scaffold" not in k}

    



load_dotenv()
client = OpenAI()
OPENAI_MODEL = os.getenv("OPENAI_MODEL", "gpt-4o")

DB_FILE = "library.sqlite"
MAX_RESULTS = 100

In [39]:

SYS_PROMPT_ELEMENTS = """
My goal is a description of the mechanical forces driving the development of the hindgut and an understanding of how the overexpression of myosin 1c drives the inversion of LR asymmetry 
during development. I want to write a complete story of hindgut development incorporating mechanical force as central players in this story, 
then create a complete story of hindgut development under Myo1C OE, the generation of LR inversion from the role of Myo1C to the hindgut rotation. 
The following text is from a paper on hindgut morphogenesis.
Extract a list of agents, interactions, spatial compartments, developmental stages, causal chains, and physical constraints involved in hindgut morphogenesis and the effect of myosin 1c overexpression on LR inversion.
Structure the output as a story scaffold:
Agents: ...
Interactions: ...
Spaces: ...
Time: ...
Causal links: ...
Physical laws/constraints: ...
"""


SYS_PROMPT_DIMENSIONS = """
You are helping build a complete narrative of hindgut morphogenesis in Drosophila, with special attention to the role of mechanical forces - driving the rotation and elongation of the hindgut - and the overexpression of myosin 1c (Myo1C) in left-right (LR) inversion.

Your task is not to tell the story, but to define what would be required to tell a **complete and comprehensive story** for this set of story elements.

- Break down these elements into their essential **story dimensions** or **sub-components**.
- Think about biological scale (molecular, cellular, tissue), space, time, dynamics, agents, interactions, functions, and unknowns.
- Consider how these elements might relate to:
  - other parts of the hindgut
  - the mechanical forces shaping the tissue
  - the overexpression of Myo1C and its effects on LR inversion

Output a structured list of **story dimensions** that should be explored in order to fully develop these elements. If helpful, include example questions or variables within each dimension.

Format:

**1. Dimension Name**  
- Description of what this dimension involves.  
- Example questions or aspects to explore.

(Repeat as needed for all relevant dimensions)
"""

SYS_PROMPT_PARSE_ELEMENTS = ("Read the following text and separate each element of the suggested story framework into a object with the name and text.\n"
        "You should return a list of objects containing each story element in a given structure.\n"
        "Do not add or remove text; your job is to parse and deconstruct the text string into the given structure.\n"
        "Treat every line that begins with a number followed by “.” and a space (e.g. “1. ”) or **Element** as the NAME of a new element."
        "Capture all following lines up to—but not including—the next such line as the TEXT for that element.")


ELEM_FUNC = {
    "type": "function",
    "function": {
        "name": "element_function",
        "description": "Read the following text and separate each element of the suggested story framework into a object with the name and text. \
        You should return a list of objects containing each story element in a given structure. \
        Do not add or remove text; your job is to parse and deconstruct the text string into the given structure.",
        "parameters": {
            "type": "object",
            "properties": {
                "element": {
                    "type": "array",
                    "items": {
                        "type": "object",
                        "properties": {
                            "name": { "type": "string" },
                            "text": { "type": "string" }
                        },
                        "required": ["name", "text"]
                    }
                }
            },
            "required": ["element"]
        }
    }
}

ADD_ELEM_FUNC = {
    "type": "function",
    "function": {
        "name": "elements_function",
        "description": "Converts a string of elements into a list of elements.",
        "parameters": {
            "type": "object",
            "properties": {
                "element": {
                    "type": "array",
                    "items": {
                        "type": "object",
                        "properties": {
                            "name": { "type": "string" }
                        },
                        "required": ["name"]
                    }
                }
            },
            "required": ["element"]
        }
    }
}

In [9]:
KEYWORDS = ["Drosophila", "hindgut"]# ["active tension network", "epithelial"]  # list of words
search_results = search_library(DB_FILE, KEYWORDS, MAX_RESULTS, match_all=True)
# papers = {}
for r in search_results:
    title, text = r['title'], r['text']
    papers[title] = text

KEYWORDS = ["active tension network", "epithelial"]  # list of words
search_results = search_library(DB_FILE, KEYWORDS, MAX_RESULTS, match_all=True)
# papers = {}
for r in search_results:
    title, text = r['title'], r['text']
    papers[title] = text


rem = [None, "Fundamental Neuroscience (3rd edition)","Harrison's Principles of Internal Medicine 15th Edition", "Developmental Biology, 7th Edition"]
for r in rem:
    if r in papers:
        del papers[r]


for title in list(papers.keys()):
    print(title)

Active Tension Network Model in Epithelial Tissues
The Geometric Basis of Epithelial Convergent Extension
Generating Active T1 Transitions through Mechanochemical Feedback
The Drosophila actin nucleator DAAM is essential for left-right asymmetry
Myosin1D is an evolutionarily conserved regulator of animal left–right asymmetry
An unconventional myosin in Drosophila reverses the default handedness in visceral organs
Chiral cell sliding drives left-right asymmetric organ twisting
Cell Chirality Drives Left-Right Asymmetric Morphogenesis
Chirality in Planar Cell Shape Contributes to Left-Right Asymmetric Epithelial Morphogenesis
Vertebrate myosin 1d regulates left–right organizer morphogenesis and laterality
Distinct Cellular and Junctional Dynamics Independently Regulate the Rotation and Elongation of the Embryonic Gut in Drosophila
Genital disc growth in Drosophila
Understanding Laterality Disorders and the Left-Right Organizer: Insights from Zebrafish
Left–right Myosin-Is, Myosin1C, and 

In [14]:
stories = {}
elements_all = []

for n in range(len(papers)):
    title, text = list(papers.keys())[n], list(papers.values())[n]
    excerpt = sample_text(text, 10000)

    sys_msg = SYS_PROMPT_ELEMENTS
    user_msg = f"TEXT EXCERPT:\n{excerpt}"
    elements_resp = ask(sys_msg, user_msg)

    sys_msg = SYS_PROMPT_PARSE_ELEMENTS
    usr_msg = f"STORY FRAMEWORK:\n{elements_resp}"
    elements_func = ask_func(sys_msg, usr_msg, ELEM_FUNC)

    elements_raw = parse_func_resp(elements_func, "element", ["name", "text"], "False")
    elements = clean_elements(elements_raw)

    elements_all.append(elements)
    print(elements)

{'Story Scaffold': ['Hindgut Morphogenesis and Myosin 1c Overexpression'], 'Agents': ['- Cells (constituting the 2D array, each represented as polygons)', '- Actomyosin bundles (mechanically coupled to each cell edge)', '- Adherens junctions (connecting actomyosin bundles)', '- Vertices (representing nodes formed by cell edges)', '- Myosin motors (influencing contraction of actomyosin bundles)', '- Cadherin dimers (crosslinking actomyosin cables at interfaces)'], 'Interactions': ['- Tension (Tij) changes with edge length (r_ij) and actomyosin state', '- Apical pressure (pα) changes with cortical area (dAα)', '- Mechanical coupling of actomyosin bundles via cadherin', '- Myosin recruitment influenced by internal strain rate', '- Feedback mechanism affecting myosin distribution in response to mechanical load', '- Balance of mechanical forces to achieve equilibrium'], 'Spaces': ['- 2D plane (representing the array of cells)', '- Tension plane (a conceptual triangulation to illustrate tens

In [18]:
elements_all_clean = [remove_story_scaffold_keys(d) for d in elements_all]
for elem in elements_all_clean:
    print(elem)

{'Agents': ['- Cells (constituting the 2D array, each represented as polygons)', '- Actomyosin bundles (mechanically coupled to each cell edge)', '- Adherens junctions (connecting actomyosin bundles)', '- Vertices (representing nodes formed by cell edges)', '- Myosin motors (influencing contraction of actomyosin bundles)', '- Cadherin dimers (crosslinking actomyosin cables at interfaces)'], 'Interactions': ['- Tension (Tij) changes with edge length (r_ij) and actomyosin state', '- Apical pressure (pα) changes with cortical area (dAα)', '- Mechanical coupling of actomyosin bundles via cadherin', '- Myosin recruitment influenced by internal strain rate', '- Feedback mechanism affecting myosin distribution in response to mechanical load', '- Balance of mechanical forces to achieve equilibrium'], 'Spaces': ['- 2D plane (representing the array of cells)', '- Tension plane (a conceptual triangulation to illustrate tension vectors)', '- Cell interfaces (where actomyosin bundles exert force)',

In [21]:
# Assuming `list_of_dicts` is your list of dictionaries

elements = []
for d in elements_all:
    for value in d.values():
        if isinstance(value, list):
            elements.extend(value)
        else:
            elements.append(value)


In [22]:
print(len(elements))

400


In [24]:
print(elements)

['- Cells (constituting the 2D array, each represented as polygons)', '- Actomyosin bundles (mechanically coupled to each cell edge)', '- Adherens junctions (connecting actomyosin bundles)', '- Vertices (representing nodes formed by cell edges)', '- Myosin motors (influencing contraction of actomyosin bundles)', '- Cadherin dimers (crosslinking actomyosin cables at interfaces)', '- Tension (Tij) changes with edge length (r_ij) and actomyosin state', '- Apical pressure (pα) changes with cortical area (dAα)', '- Mechanical coupling of actomyosin bundles via cadherin', '- Myosin recruitment influenced by internal strain rate', '- Feedback mechanism affecting myosin distribution in response to mechanical load', '- Balance of mechanical forces to achieve equilibrium', '- 2D plane (representing the array of cells)', '- Tension plane (a conceptual triangulation to illustrate tension vectors)', '- Cell interfaces (where actomyosin bundles exert force)', '- Vertex as nodes in the cellular meshw

In [25]:
unique_items = list(set(elements))
print(len(unique_items))

394


In [33]:
from collections import Counter
from difflib import SequenceMatcher

# Example input: long list (abbreviated for runtime demonstration)
long_list = elements

# Option 1: Remove exact duplicates, preserving order
seen = set()
unique_ordered = []
for item in long_list:
    if item not in seen:
        seen.add(item)
        unique_ordered.append(item)

# Option 2: Group near-duplicates using string similarity
def group_similar_items(items, threshold=0.90):
    groups = []
    used = set()
    for i, item in enumerate(items):
        if i in used:
            continue
        group = [item]
        used.add(i)
        for j in range(i + 1, len(items)):
            if j in used:
                continue
            similarity = SequenceMatcher(None, item.lower(), items[j].lower()).ratio()
            if similarity >= threshold:
                group.append(items[j])
                used.add(j)
        groups.append(group)
    return groups

near_duplicate_groups = group_similar_items(unique_ordered)

#import ace_tools as tools; tools.display_dataframe_to_user(name="Near-Duplicate Groups", dataframe={"Group": near_duplicate_groups})

#print(near_duplicate_groups)

# Merge duplicates by taking the first element in each group and replacing the others
merged_values = []
replacements = {}

for group in near_duplicate_groups:
    if len(group) > 1:
        canonical = group[0]
        for duplicate in group[1:]:
            replacements[duplicate] = canonical

# Apply replacements to the original list
merged_list = []
for item in elements:
    merged_list.append(replacements.get(item, item))

# Remove exact duplicates after merging
elements = list(set(merged_list))


[['- Cells (constituting the 2D array, each represented as polygons)'], ['- Actomyosin bundles (mechanically coupled to each cell edge)'], ['- Adherens junctions (connecting actomyosin bundles)'], ['- Vertices (representing nodes formed by cell edges)'], ['- Myosin motors (influencing contraction of actomyosin bundles)'], ['- Cadherin dimers (crosslinking actomyosin cables at interfaces)'], ['- Tension (Tij) changes with edge length (r_ij) and actomyosin state'], ['- Apical pressure (pα) changes with cortical area (dAα)'], ['- Mechanical coupling of actomyosin bundles via cadherin'], ['- Myosin recruitment influenced by internal strain rate'], ['- Feedback mechanism affecting myosin distribution in response to mechanical load'], ['- Balance of mechanical forces to achieve equilibrium'], ['- 2D plane (representing the array of cells)'], ['- Tension plane (a conceptual triangulation to illustrate tension vectors)'], ['- Cell interfaces (where actomyosin bundles exert force)'], ['- Vertex

In [31]:


# Print or use the sampled elements
#for i, item in enumerate(sampled_elements, 1):
#    print(f"{i}. {item}")

381


In [37]:
SYSTEM_PROMPT_ELEMENTS_COMPLETE = """
You are a developmental biology expert helping to extract all relevant mechanistic, molecular, and morphological elements described in a text about Drosophila hindgut morphogenesis. 
Your task is to read the full text, examine a provided list of extracted elements, and evaluate whether it is complete. 
If any relevant elements (e.g., proteins, structures, interactions, cellular events, mechanisms, stages, forces) are mentioned in the text but missing from the list, identify and list them clearly.
"""

USER_PROMPT_ELEMENTS_COMPLETE = """
Here is the text describing the biological process:

{excerpt}

Below is the current list of identified elements:

{elements_string}

Is this list of elements comprehensive based on the text? If not, what elements are missing? Please provide the missing elements in a clean list.
"""

In [38]:
additional_elements = []
for n in range(len(papers)):
    title, text = list(papers.keys())[n], list(papers.values())[n]
    excerpt = sample_text(text, 10000)
    sampled_elements = random.sample(elements, min(100, len(elements)))
    elements_string = "\n".join(f"- {el}" for el in sampled_elements)
    sys_msg = SYSTEM_PROMPT_ELEMENTS_COMPLETE
    usr_msg = USER_PROMPT_ELEMENTS_COMPLETE.format(excerpt=excerpt, elements_string=elements_string)
    add_elem = ask(sys_msg, usr_msg)
    print(add_elem)
    additional_elements.append(add_elem)
    
    
    

Upon reviewing the text, there are several additional elements mentioned that are not included in the current list. Below is the list of missing elements:

1. Cortical tension
2. Adherens junctions
3. Friction (ν) as a factor in the timescale of mechanical relaxation
4. Triangulation of tension plane
5. Actomyosin bundles
6. Force balance at vertices
7. Tension vectors (Tij, Tik, Til)
8. Myosin recruitment based on mechanical feedback
9. Intrinsic variables representing actomyosin and cadherin interface state
10. Elastic form of tension, Tij = K(rij − ℓij)
11. Walking kernel W(x) responsible for myosin activity
12. Stall force Ts and its impact on the actomyosin bundle
13. Load per myosin and its physiological effects
14. Isogonal modes and deformations
15. Conformational changes such as 'isogonal breathing mode'
16. Compatibility constraint χα = 1 for cell geometry and tension balance
17. Displacement (δri) dependent on equilibrium state parameters
18. Geometry implications from dual 

The provided text primarily discusses elements related to cell mechanics, tension dynamics, and geometric transformations during tissue morphogenesis, particularly in Drosophila or similar developmental contexts. The text focuses on concepts like isogonal shear, Delaunay triangulations, and the behavior of tension triangles rather than any specific proteins or traditional molecular pathways.

Here are some missing or additional relevant elements extracted from the text:

1. Isogonal shear
2. Cell-level shear modulus
3. Tissue-level shear modulus
4. Cortical tensions
5. Isogonal deformation
6. Pure shear
7. Tension vectors and their arrangement
8. Isogonal displacement field
9. Linear gradient in the tension space
10. Quadratic isogonal potential
11. Force balance equations in tissue
12. Symmetric lattice of tension triangles
13. Mechanosensitive feedback loop for tension dynamics
14. Passive tension at vertex junctions
15. Elastic energy based on cell shape tensor
16. Cortical area cha

In [40]:
SYSTEM_PROMPT_PARSE_ADD_ELEMENTS = """
You are a helpful assistant that extracts scientific elements from technical biological or biophysical texts. 
Your job is to read the text and convert any listed or described biological or mechanical elements into a structured list of objects. 
Each object represents a single element with its name (include any associated description inside the name parameter).
"""

USER_PROMPT_PARSE_ADD_ELEMENTS = """
Read the following scientific text and extract all elements it lists or implies. Convert them into a structured list using the provided function. Each element should appear once with its name in a clean format.

The provided text primarily discusses elements related to cell mechanics, tension dynamics, and geometric transformations during tissue morphogenesis, particularly in Drosophila or similar developmental contexts.

{elems}
"""
elems_all = []
for elems in additional_elements:
    # convert to a list
    elements_func = ask_func(sys_msg, usr_msg, ELEM_FUNC)

    sys_msg = SYSTEM_PROMPT_PARSE_ADD_ELEMENTS
    usr_msg = USER_PROMPT_PARSE_ADD_ELEMENTS.format(elems=elems)
    elems_func = ask_func(sys_msg, usr_msg, ADD_ELEM_FUNC)

    elems_raw = parse_func_resp(elems_func, "element", ["name"])
    elems_all.extend(elems_raw)
    #elements = clean_elements(elements_raw)


In [47]:
print(len(elems_all))
print(len(elements))

217
400


In [48]:
elements.extend(elems_all)
print(len(elements))

617


In [52]:
num_batches = 10        # how many different samples to take
sample_size = 50        # number of elements per sample
output_file = "element_summaries.txt"
sys_prompt = """
You are a scientific summarizer and storyteller. 
Given a list of biological and mechanical elements involved in Drosophila hindgut morphogenesis, analyze and summarize the types of elements, 
the morphogenetic dimensions they span, and the kinds of developmental narratives they enable. 
Focus on their diversity, depth, and potential integration into causal and mechanistic explanations of development.
"""

# Create output file
with open(output_file, "w") as out:
    out.write("# Summary of Random Samples from Element List\n\n")

# Run the loop
for i in tqdm(range(num_batches), desc="Sampling and summarizing"):
    sampled = random.sample(elements, sample_size)
    formatted_elements = "\n".join(f"- {el}" for el in sampled)

    usr_prompt = f"""Here is a list of elements sampled from a larger library (>500 elements) involved in hindgut morphogenesis:

    {formatted_elements}

    Please:
    1. Summarize what these elements are and what they represent.
    2. Describe the dimensions of hindgut morphogenesis they help capture (e.g. mechanical forces, gene regulation, spatial organization, feedback loops).
    3. Comment on the types of developmental stories or causal narratives these elements allow us to tell (e.g. stories of tissue twisting, cellular rearrangement, molecular feedback).
    """
    try: 
        summary = ask(sys_prompt, usr_prompt)
        with open(output_file, "a") as out:
                out.write(f"\n\n## Batch {i+1}\n\n")
                out.write(summary)

        sleep(1.5)  # respectful pause
    except Exception as e:
        print(f"Error on batch {i+1}: {e}")
        continue





Sampling and summarizing: 100%|██████████| 10/10 [03:03<00:00, 18.40s/it]
