In [39]:
# Add the root of the project (where `src/` lives) to sys.path
import sys
from pyprojroot import here
sys.path.append(str(here()))
import dotenv
import os
# Load environment variables from .env file
dotenv.load_dotenv()

# Set the API keys
azure_openai_api_key = os.getenv("AZURE_OPENAI_API_KEY")
print(f"AZURE_OPENAI_API_KEY: {azure_openai_api_key[:4]}...")
if not azure_openai_api_key:
    raise ValueError("AZURE_OPENAI_API_KEY environment variable not set.")
# Set the OpenAI API key
openai_api_key = os.getenv("OPENAI_API_KEY")
print(f"OPENAI_API_KEY: {openai_api_key[:4]}...")
if not openai_api_key:
    raise ValueError("OPENAI_API_KEY environment variable not set.")
                           
azure_deployment=os.getenv("AZURE_MODEL_DEPLOYMENT_NAME")
azue_api_version=os.getenv("AZURE_OPENAI_API_VERSION")
azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT")



AZURE_OPENAI_API_KEY: GEQy...
OPENAI_API_KEY: sk-p...


In [40]:
from src.utils.extract_synthesis_and_challenges import extract_synthesis, extract_challenges

In [41]:
from src.utils.pymupdf_loader import PyMuPDFLoader
data_dir = here("data/papers/Na-Mn-O")
pdf_loader = PyMuPDFLoader(data_dir)
file_name = "000690060.pdf"
pdf_text = pdf_loader.load_a_pdf(file_name)

synthesis_text = '''{text}'''.format(text="\n".join(pdf_text))

# load synthesis output schema
import json
path_to_schema = here("data/papers/Na-Mn-O/synthesis_schema.json")
# Load the JSON file
with open(path_to_schema, "r", encoding="utf-8") as f:
    synthesis_schemas = json.load(f)
# Extract the last optimized schema
synthesis_output_schema = synthesis_schemas[-1]["optimized_schema"]
#print(type(synthesis_output_schema))

print(f"starting synthesis extraction======================")
# extract synthesis
synthesis_info = extract_synthesis(
                                   synthesis_text=synthesis_text,
                                   synthesis_output_schema= synthesis_output_schema,
                                   api_key=azure_openai_api_key,
                                   azure=True,
                                   model_name=azure_deployment,
                                   temp=0)

print(f"end======================")
#print(f"Extracted synthesis info: {synthesis_info}")
print (f"Starting challenges extraction======================")
# extract challenges

# load challenges output schema
import json
path_to_schema = here("data/papers/Na-Mn-O/challenges_schema.json")
# Load the JSON file
with open(path_to_schema, "r", encoding="utf-8") as f:
    challenges_schemas = json.load(f)

challenges_output_schema = challenges_schemas[-1]["optimized_schema"]

challenges_info = extract_challenges(
                                     synthesis_text=synthesis_text,
                                     challenges_output_schema=challenges_output_schema,
                                     api_key=azure_openai_api_key,
                                     azure=True,
                                     model_name=azure_deployment,
                                     temp=0)
print(f"end======================")
print(f"Extracted challenges info: {challenges_info}")

# cancat the the extracted synthesis and challenges info with the file name and save it into a json file
import json
import os
output_dir = here("data/papers/Na-Mn-O")
output_file = os.path.join(output_dir, "synthesis_and_challenges.json")
with open(output_file, "w", encoding="utf-8") as f:
    json.dump({"file_name": file_name.replace(".pdf", ""),
        "synthesis_info": synthesis_info,
        "challenges_info": challenges_info
    }, f, indent=4)

print(f"Number tokens of PDFs loaded: {len(pdf_text)}")
print(type(pdf_text))
print(pdf_text[:100])

synthesis_text = '''{text}'''.format(text=pdf_text)
print(pdf_text[:100])



Extracted challenges info: [{'material': 'P2−NaxMnO2 flakes', 'stage': 'application', 'challenge': {'description': 'Partial phase transition to an orthorhombic crystal system upon Na+ insertion.', 'type': 'structural', 'impact': 'Repeated phase transitions can lead to structural inhomogeneity, reducing cycling stability.'}, 'solution': {'description': 'Co-doping on the order of 10% suppresses structural transformations.', 'methodology': 'Incorporation of cobalt into the material structure.', 'effectiveness': 'Effective in suppressing phase transitions and improving cycling stability.'}, 'evidence': {'source': 'Results and Discussion section', 'data': 'Structural studies showed suppression of phase transitions in Co-doped materials.'}, 'context': {'material_properties': 'Layered P2-type structure with hexagonal unit cell.', 'experimental_conditions': 'Cycling tests with Na+ insertion.', 'application_relevance': 'Improved cycling stability for sodium-ion battery cathodes.'}}, {'material'

In [4]:
type(synthesis_info)

list

In [5]:
# Load the json file and print the content
with open(output_file, "r") as f:
    data = json.load(f)
    print("File Name:", data["file_name"])
    print("Synthesis Info:", data["synthesis_info"])
    print("Challenges Info:", data["challenges_info"])

File Name: 000690060
Synthesis Info: [{'material': {'name': 'Na0.6MnO2+z', 'phase': 'P2', 'dopants': [], 'morphology': 'flake', 'purity': None}, 'synthesis_steps': [{'step': 1, 'label': 'Precursor Mixing', 'details': {'reagents': [{'name': 'NaNO3', 'stoichiometry': None}, {'name': 'Mn(CH3COO)2', 'stoichiometry': None}], 'solvent': 'deionized water', 'precipitant': None, 'temperature': None, 'pressure': None, 'duration': None, 'atmosphere': None, 'equipment': None}}, {'step': 2, 'label': 'Combustion', 'details': {'reagents': [{'name': 'HNO3', 'stoichiometry': None}, {'name': 'gelatin', 'stoichiometry': None}], 'solvent': None, 'precipitant': None, 'temperature': None, 'pressure': None, 'duration': None, 'atmosphere': None, 'equipment': None}}], 'post_processing': [{'step': 1, 'label': 'Annealing', 'details': {'temperature': '800°C', 'duration': '4 h', 'atmosphere': None, 'equipment': None}}, {'step': 2, 'label': 'Quenching', 'details': {'temperature': '610°C', 'duration': '9 h', 'atmosp

In [6]:
# Looping trough files in the data_dir and extracting the synthesis and challenges info
import tqdm.auto as tqdm
file_names = os.listdir(data_dir)
# filter the file names to only include pdf files
file_names = [file_name for file_name in file_names if file_name.endswith(".pdf")]

# load synthesis optimized schema for synthesis and challenges
# load synthesis output schema
import json
path_to_schema = here("data/papers/Na-Mn-O/synthesis_schema.json")
# Load the JSON file
with open(path_to_schema, "r", encoding="utf-8") as f:
    synthesis_schemas = json.load(f)
# Extract the last optimized schema
synthesis_output_schema = synthesis_schemas[-1]["optimized_schema"]
#print(type(synthesis_output_schema))
# load challenges output schema
path_to_schema = here("data/papers/Na-Mn-O/challenges_schema.json")
# Load the JSON file
with open(path_to_schema, "r", encoding="utf-8") as f:
    challenges_schemas = json.load(f)
challenges_output_schema = challenges_schemas[-1]["optimized_schema"]

# Looping through files in the data_dir and extracting the synthesis and challenges info

for file_name in tqdm.tqdm(file_names, desc=f"Extracting synthesis and challenges from {file_name}"):
    pdf_text = pdf_loader.load_a_pdf(file_name)
    synthesis_text = '''{text}'''.format(text="\n".join(pdf_text))
    synthesis_info = extract_synthesis(
                                    synthesis_text=synthesis_text,
                                    synthesis_output_schema=synthesis_output_schema,
                                    api_key=azure_openai_api_key,
                                    azure=True,
                                    model_name=azure_deployment,
                                    temp=0)

    challenges_info = extract_challenges(
                                        synthesis_text=synthesis_text,
                                        challenges_output_schema=challenges_output_schema,
                                        api_key=azure_openai_api_key,
                                        azure=True,
                                        model_name=azure_deployment,
                                        temp=0)
    # concat the the extracted synthesis and challenges info with the file name and save it into a json file
    output_file = os.path.join(output_dir, file_name.replace(".pdf", "_synthesis_and_challenges.json"))
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump({"file_name": file_name.replace(".pdf", ""),
            "synthesis_info": synthesis_info,
            "challenges_info": challenges_info
        }, f, indent=4)

Extracting synthesis and challenges from 000690060.pdf:   0%|          | 0/5 [00:00<?, ?it/s]

# Extracting info from 040201870.pdf

In [None]:
from src.utils.pymupdf_loader import PyMuPDFLoader
data_dir = here("data/papers/Na-Mn-O")
pdf_loader = PyMuPDFLoader(data_dir)
file_name = "040201870.pdf"
pdf_text = pdf_loader.load_a_pdf(file_name)
from pprint import pprint

print(f"Number tokens of PDFs loaded: {len(pdf_text)}")
pprint(pdf_text[:10000])

# synthesis_text = '''{text}'''.format(text="\n".join(pdf_text))

# synthesis_info = extract_synthesis(
#                                    synthesis_text=synthesis_text,
#                                    api_key=azure_openai_api_key,
#                                    azure=True,
#                                    model_name=azure_deployment,
#                                    temp=0)

# challenges_info = extract_challenges(
#                                      synthesis_text=synthesis_text,
#                                      api_key=azure_openai_api_key,
#                                      azure=True,
#                                      model_name=azure_deployment,
#                                      temp=0)

# # cancat the the extracted synthesis and challenges info with the file name and save it into a json file
# import json
# import os
# output_dir = here("data/papers/Na-Mn-O")
# output_file = os.path.join(output_dir, "synthesis_and_challenges.json")
# with open(output_file, "w", encoding="utf-8") as f:
#     json.dump({"file_name": file_name.replace(".pdf", ""),
#         "synthesis_info": synthesis_info,
#         "challenges_info": challenges_info
#     }, f, indent=4)

# print(f"Number tokens of PDFs loaded: {len(pdf_text)}")
# print(type(pdf_text))
# print(pdf_text[:100])

# synthesis_text = '''{text}'''.format(text=pdf_text)
# print(pdf_text[:100])


Number tokens of PDFs loaded: 46213
('Solid State Ionics 126 (1999) 3–24 www.elsevier.com/locate/ssi Studies of '
 'the layered manganese bronzes, Na [Mn M ]O with 2 / 3 12x x 2 M 5 Co, Ni, '
 'Li, and Li [Mn M ]O prepared by ion-exchange 2 / 3 12x x 2 a b ,* J.M. '
 'Paulsen , J.R. Dahn aDepartment of Physics, Dalhousie University, Halifax, '
 'Nova Scotia, Canada B3H 3J5 bDepartments of Physics and Chemistry, Dalhousie '
 'University, Halifax, Nova Scotia, Canada B3H 3J5 Received 7 May 1999; '
 'accepted 24 May 1999 Abstract Layered sodium manganese bronzes, Na MO (M 5 '
 'Mn A , A 5 Co, Li, Ni) with the P2-structure were investigated. 2 / 3 2 12x '
 'x A phase diagram (composition–structure–temperature diagram) for materials '
 'synthesized in air is presented. Substitution extends the stability region '
 'of P2 phases toward lower temperatures. Na MnO exhibits a monoclinic '
 'distortion. Weakly 2 / 3 2 substituted samples exhibit an orthorhombic '
 'distortion of the ideal P2-stru

In [18]:
# Load the json file and print the content
with open(output_file, "r") as f:
    data = json.load(f)
    print("File Name:", data["file_name"])
    print("Synthesis Info:", data["synthesis_info"])
    print("Challenges Info:", data["challenges_info"])

File Name: 040201870
Synthesis Info: ```json
[]
```
Challenges Info: ```json
[]
```


In [28]:
from src.utils.pymupdf_loader import PyMuPDFLoader
data_dir = here("data/papers/Na-Mn-O")
pdf_loader = PyMuPDFLoader(data_dir)
file_name = "000690060.pdf"
output_file = os.path.join(output_dir, file_name.replace(".pdf", "_synthesis_and_challenges.json"))

# Load the json file and print the content
with open(output_file, "r") as f:
    data = json.load(f)
    print("File Name:", data["file_name"])
    print("Synthesis Info:", data["synthesis_info"])
    print("Challenges Info:", data["challenges_info"])

File Name: 000690060
Synthesis Info: ```json
[
  {
    "material": "Na0.6MnO2+z",
    "synthesis": {
      "steps": [
        {
          "step": 1,
          "label": "Precursor Mixing",
          "details": {
            "reagents": ["NaNO3", "Mn(CH3COO)2"],
            "temperature": null,
            "duration": null
          }
        },
        {
          "step": 2,
          "label": "Solution Preparation",
          "details": {
            "reagents": ["deionized water"],
            "temperature": null,
            "duration": null
          }
        },
        {
          "step": 3,
          "label": "Combustion",
          "details": {
            "reagents": ["HNO3", "gelatin"],
            "temperature": null,
            "duration": null
          }
        },
        {
          "step": 4,
          "label": "Annealing",
          "details": {
            "reagents": [],
            "temperature": "800°C",
            "duration": "4 h"
          }
        },
       

In [33]:


# Extract materials from synthesis_info
materials = []
if "synthesis_info" in data:
    synthesis_info = json.loads(data["synthesis_info"].strip("```json\n"))
    materials.extend([entry["material"] for entry in synthesis_info])

print(materials)

# # Remove duplicates and print the list of materials
# unique_materials = list(set(materials))
# print("Extracted Materials:")
# for material in unique_materials:
#     print(material)

['Na0.6MnO2+z', 'Na0.6Co0.1Mn0.9O2+z', 'Na0.7MnO2+z', 'Na0.6Co0.1Mn0.9O2+z (spheres)']


In [34]:
# Extract materials from challenges_info (if applicable)
materials = []
if "challenges_info" in data:
    challenges_info = json.loads(data["challenges_info"].strip("```json\n"))
    materials.extend([entry["material"] for entry in challenges_info])

print(materials)

['P2−NaxMnO2 flakes', 'P2−NaxMnO2 spheres', 'P2−NaxCo0.1Mn0.9O2 flakes', 'P2−NaxCo0.1Mn0.9O2 spheres']


In [10]:
from langchain_openai import AzureChatOpenAI
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
import json

# Add the root of the project (where `src/` lives) to sys.path
import sys
from pyprojroot import here
sys.path.append(str(here()))
import dotenv
import os
# Load environment variables from .env file
dotenv.load_dotenv()

# Set the API keys
azure_openai_api_key = os.getenv("AZURE_OPENAI_API_KEY")
print(f"AZURE_OPENAI_API_KEY: {azure_openai_api_key[:4]}...")
if not azure_openai_api_key:
    raise ValueError("AZURE_OPENAI_API_KEY environment variable not set.")
                           
azure_deployment=os.getenv("AZURE_MODEL_DEPLOYMENT_NAME")
azure_api_version=os.getenv("AZURE_OPENAI_API_VERSION")
azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT")

# Create the AzureChatOpenAI client
llm = AzureChatOpenAI(
            azure_deployment=azure_deployment,
            api_version=azure_api_version,
            temperature=0,
            openai_api_key=azure_openai_api_key,
            azure_endpoint=azure_endpoint)

# Prompt template
prompt = PromptTemplate.from_template("""
You are a lab assistant helping visualize synthesis procedures.

Given this list of synthesis steps for the material "{material_name}", create a Mermaid.js flowchart.
Use relevant emojis for each step to represent their function (e.g., 🧪 for mixing, 🔥 for heating, 💧 for washing).
Show arrows between the steps to represent process flow.

Steps:
{steps_text}

Output only the Mermaid diagram.
""")

chain = LLMChain(llm=llm, prompt=prompt)

# Example JSON loading (just load one material for demo)
file_path = str(here("data/papers/Na-Mn-O/000690060_synthesis_and_challenges.json"))
with open(file_path, "r") as f:
    synthesis_data = json.load(f)

material = synthesis_data["synthesis_info"][0]
material_name = material["material"]["name"]
steps = material["synthesis_steps"] + material["post_processing"]
steps_text = "\n".join([
    f"- Step {s['step']}: {s['label']}" for s in steps
])

# Run the chain
diagram = chain.run(material_name=material_name, steps_text=steps_text)

print(diagram)


AZURE_OPENAI_API_KEY: GEQy...
```mermaid
graph TD
    A[🧪 Precursor Mixing] --> B[🔥 Combustion]
    B --> C[🔥 Annealing]
    C --> D[💧 Quenching]
```


In [29]:
from langchain_openai import AzureChatOpenAI
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
import json
import os
import dotenv
from pyprojroot import here

# Load environment variables from .env file
dotenv.load_dotenv()

# Set the API keys
azure_openai_api_key = os.getenv("AZURE_OPENAI_API_KEY")
if not azure_openai_api_key:
    raise ValueError("AZURE_OPENAI_API_KEY environment variable not set.")
print(f"AZURE_OPENAI_API_KEY: {azure_openai_api_key[:4]}...")

# Azure OpenAI credentials
azure_deployment = os.getenv("AZURE_MODEL_DEPLOYMENT_NAME")
azure_api_version = os.getenv("AZURE_OPENAI_API_VERSION")
azure_endpoint = os.getenv("AZURE_OPENAI_ENDPOINT")

# Create AzureChatOpenAI instance
llm = AzureChatOpenAI(
    azure_deployment=azure_deployment,
    api_version=azure_api_version,
    temperature=0,
    openai_api_key=azure_openai_api_key,
    azure_endpoint=azure_endpoint
)

# Prompt template
prompt = PromptTemplate.from_template("""
You are a scientific assistant tasked with visualizing a synthesis procedure.

You will output:
1. A **Mermaid.js flowchart** showing each synthesis step.
2. A **summary line** describing the material using icons to represent metadata.

---

Use the following emoji icons to represent each key:


### Use the following emoji to represent each schema key:

#### 🧪 = Material name  
🧬 = Composition  
🌀 = Morphology  
🧱 = Phase  
🧩 = Dopant element  
📊 = Dopant percentage  
⚡ = Valence  
🛡️ = Dopant role  
🌐 = Dopant distribution  

🔬 = Precursor name  
🏢 = Supplier  
💎 = Purity  
🧊 = Physical form  
⚖️ = Stoichiometry  
🛠️ = Preparation method / Equipment  
🌡️ = Temperature  
⏱️ = Duration  
🌫️ = Environment  
📉 = Cooling rate  
❄️ = Cooling method  
⚗️ = pH  
🧯 = Pressure  
🌀 = Mixing method   

---

### Step Instructions:
- In the **summary line**, show exactly one value for each of the above keys from the material metadata, each with its emoji.
- In the **Mermaid diagram**, label each node as:
  `"Step <number>: <emoji> <step label> — <one key detail like T, t, pH, or equipment>"`.
- Use arrows to connect steps in order.
- Output must be in valid Markdown: one summary line followed by one code block containing the diagram.

---

### Example format:
🧪 Na₀.₆MnO₂+z | 🧱 P2 | 🌀 flake | 🧩 Co 10% | 🔬 NaNO₃  
```mermaid
graph TD  
  A[Step 1: 🧪 Precursor Mixing — T=RT] --> B[Step 2: 🔥 Combustion — T=500°C] 

Given this list of synthesis steps for the material "{material_name}", create a Mermaid.js flowchart.
Show arrows between the steps to represent process flow. Include the value of precursors, reagents, and solvents etc. when available in list in the graph.

Steps:
{steps_text}

Output only the Mermaid diagram.
""")

chain = LLMChain(llm=llm, prompt=prompt)

# Load JSON file
file_path = str(here("data/papers/Na-Mn-O/000690060_synthesis_and_challenges.json"))
with open(file_path, "r") as f:
    synthesis_data = json.load(f)

# Format step description with optional temperature/duration
def format_step(step):
    label = step["label"]
    step_num = step["step"]
    details = step.get("details", {})
    
    # Try to include reagents or solvent
    reagent_names = [r["name"] for r in details.get("reagents", []) if "name" in r]
    solvent = details.get("solvent")
    extra = []
    if reagent_names:
        extra.append(" + ".join(reagent_names))
    if solvent:
        extra.append(f"solvent: {solvent}")
    
    return f"- Step {step_num}: {label} ({'; '.join(extra)})"



# Loop through materials and generate diagrams
for mat in synthesis_data["synthesis_info"]:
    name = mat["material"]["name"]
    steps = mat["synthesis_steps"] + mat["post_processing"]
    steps_text = "\n".join([format_step(s) for s in steps])
    diagram = chain.run(material_name=name, steps_text=steps_text)
    print(f"### {name}\n```mermaid\n{diagram}\n```")


AZURE_OPENAI_API_KEY: GEQy...
### Na0.6MnO2+z
```mermaid
🧪 Na₀.₆MnO₂+z | 🧱 P2 | 🌀 flake | 🧩 Co 10% | 🔬 NaNO₃  
```mermaid
graph TD  
  A[Step 1: 🔬 Precursor Mixing — Solvent=DI Water] --> B[Step 2: 🔥 Combustion — Reagent=HNO₃]  
  B --> C[Step 3: 🌡️ Annealing — T=High] --> D[Step 4: ❄️ Quenching — Cooling=Rapid]  
```
```
### Na0.6Co0.1Mn0.9O2+z
```mermaid
🧪 Na₀.₆Co₀.₁Mn₀.₉O₂+z | 🧱 P2 | 🌀 flake | 🧩 Co 10% | 🔬 NaNO₃  
```mermaid
graph TD  
  A[Step 1: 🌀 Precursor Mixing — Solvent=DI Water] --> B[Step 2: 🔥 Combustion — Reagent=Gelatin]  
  B --> C[Step 3: 🌡️ Annealing — T=High] --> D[Step 4: ❄️ Quenching — Cooling=Rapid]  
```
```
### Na0.7MnO2+z
```mermaid
🧪 Na₀.₇MnO₂+z | 🧱 P2 | 🌀 flake | 🧩 Co 10% | 🔬 NH₄HCO₃  
```mermaid
graph TD  
  A[Step 1: 🌀 Precursor Mixing — Solvent=DI Water] --> B[Step 2: 🌫️ Filtration and Washing — Environment=Neutral]  
  B --> C[Step 3: 🌡️ Annealing — T=800°C]  
  C --> D[Step 4: ❄️ Quenching — Cooling=Rapid]  
```
```
### Na0.6Co0.1Mn0.9O2+z
```mermaid
🧪 Na₀

In [35]:
from IPython.display import HTML, display

from IPython.display import HTML, display

def render_mermaid_diagram(mermaid_code, material_name=""):
    html = f"""
    <div class="mermaid">
    {mermaid_code}
    </div>
    <script type="module">
      import mermaid from 'https://cdn.jsdelivr.net/npm/mermaid@10/dist/mermaid.esm.min.mjs';
      mermaid.initialize({{ startOnLoad: true }});
    </script>
    <h4>{material_name}</h4>
    """
    display(HTML(html))



material_name = "Na0.6MnO2+z"
mermaid_code = """
graph TD  
  A[Step 1: 🔬 Precursor Mixing — Solvent=DI Water] --> B[Step 2: 🔥 Combustion — Reagent=HNO₃]  
  B --> C[Step 3: 🌡️ Annealing — T=High] --> D[Step 4: ❄️ Quenching — Cooling=Rapid]  
"""
render_mermaid_diagram(mermaid_code, material_name)

    


In [34]:
!jupyter labextension install @jupyterlab/mermaid


[33m(Deprecated) Installing extensions with the jupyter labextension install command is now deprecated and will be removed in a future major version of JupyterLab.

Users should manage prebuilt extensions with package managers like pip and conda, and extension authors are encouraged to distribute their extensions as prebuilt packages [0m

Conflicting Dependencies:
JupyterLab                        Extension        Package
>=4.4.4 <4.5.0                    >=4.5.2 <5.0.0   @jupyterlab/apputils
>=6.3.4 <6.4.0                    >=6.4.2 <7.0.0   @jupyterlab/coreutils
>=3.11.4 <3.12.0                  >=3.12.2 <4.0.0  @jupyterlab/rendermime-interfaces


In [36]:
from langchain_openai import AzureChatOpenAI
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
import json
import os
import dotenv
from pyprojroot import here
from graphviz import Digraph

# Load environment variables from .env file
dotenv.load_dotenv()

# Azure OpenAI credentials
azure_openai_api_key = os.getenv("AZURE_OPENAI_API_KEY")
azure_deployment = os.getenv("AZURE_MODEL_DEPLOYMENT_NAME")
azure_api_version = os.getenv("AZURE_OPENAI_API_VERSION")
azure_endpoint = os.getenv("AZURE_OPENAI_ENDPOINT")

if not azure_openai_api_key:
    raise ValueError("AZURE_OPENAI_API_KEY environment variable not set.")
print(f"AZURE_OPENAI_API_KEY: {azure_openai_api_key[:4]}...")

# Initialize LLM
llm = AzureChatOpenAI(
    azure_deployment=azure_deployment,
    api_version=azure_api_version,
    temperature=0,
    openai_api_key=azure_openai_api_key,
    azure_endpoint=azure_endpoint
)

# Prompt to generate Graphviz DOT format
prompt = PromptTemplate.from_template("""
You are a scientific assistant tasked with visualizing a synthesis procedure.

Convert the synthesis steps below into a Graphviz DOT format directed graph.
Each node should represent a step labeled with:
- step number
- step name
- important reagents, solvents, or conditions

Use arrows to represent step order.
Return only a complete Graphviz DOT code block.

Steps:
{steps_text}
""")

chain = LLMChain(llm=llm, prompt=prompt)

# Load JSON file
file_path = str(here("data/papers/Na-Mn-O/000690060_synthesis_and_challenges.json"))
with open(file_path, "r") as f:
    synthesis_data = json.load(f)

# Format step description with reagents and solvent
def format_step(step):
    label = step["label"]
    step_num = step["step"]
    details = step.get("details", {})

    # Collect info
    info_lines = []

    reagents = [r["name"] for r in details.get("reagents", []) if r.get("name")]
    if reagents:
        info_lines.append("Reagents: " + ", ".join(reagents))

    if details.get("solvent"):
        info_lines.append(f"Solvent: {details['solvent']}")

    # Optional condition fields
    temp = details.get("temperature", None)
    duration = details.get("duration", None)
    pressure = details.get("pressure", None)
    atmosphere = details.get("atmosphere", None)
    equipment = details.get("equipment", None)

    conds = []
    if temp: conds.append(f"T={temp}")
    if duration: conds.append(f"t={duration}")
    if pressure: conds.append(f"p={pressure}")
    if atmosphere: conds.append(f"env={atmosphere}")
    if equipment: conds.append(f"eq={equipment}")

    if conds:
        info_lines.append("Conditions: " + ", ".join(conds))

    if not info_lines:
        info_lines.append("Conditions: None Specified")

    # Join into graph label
    return f"Step {step_num}: {label}\\n" + "\\n".join(info_lines)


# Generate for the first material only (for simplicity)
material = synthesis_data["synthesis_info"][0]
steps = material["synthesis_steps"] + material["post_processing"]
steps_text = "\n".join([format_step(s) for s in steps])

# Get Graphviz DOT code from the LLM
dot_code = chain.run(steps_text=steps_text)
print("Generated Graphviz DOT code:\n")
print(dot_code)

# Render it with Graphviz (optional visualization)
graph = Digraph(comment="Synthesis Steps")

for i, mat in enumerate(synthesis_data["synthesis_info"]):
    name = mat["material"]["name"]
    steps = mat["synthesis_steps"] + mat["post_processing"]
    steps_text = "\n".join([format_step(s) for s in steps])

    dot_code = chain.run(steps_text=steps_text)

    graph = Digraph(comment=name)
    graph.attr("node", shape="box")  # ← consistent shape for all nodes
    graph.node("title", f"Material: {name}", shape="plaintext")

    first_node = None
    for line in dot_code.splitlines():
        line = line.strip()
        if line.startswith("node ") or "node [shape=" in line:
            # skip global node attribute declaration
            continue
        if "->" in line or "[" in line:
            graph.body.append(line)

    
    if first_node:
        graph.edge("title", first_node)

    graph.render(f"synthesis_graph_{i+1}", format="png", cleanup=True)
    print(f"Saved synthesis_graph_{i+1}.png for {name}")



AZURE_OPENAI_API_KEY: GEQy...
Generated Graphviz DOT code:

```dot
digraph synthesis {
    node [shape=box];

    step1 [label="Step 1: Precursor Mixing\nReagents: NaNO3, Mn(CH3COO)2\nSolvent: deionized water"];
    step2 [label="Step 2: Combustion\nReagents: HNO3, gelatin"];
    step3 [label="Step 3: Annealing\nConditions: T=800°C, t=4 h"];
    step4 [label="Step 4: Quenching\nConditions: T=610°C, t=9 h"];

    step1 -> step2;
    step2 -> step3;
    step3 -> step4;
}
```
Saved synthesis_graph_1.png for Na0.6MnO2+z
Saved synthesis_graph_2.png for Na0.6Co0.1Mn0.9O2+z
Saved synthesis_graph_3.png for Na0.7MnO2+z
Saved synthesis_graph_4.png for Na0.6Co0.1Mn0.9O2+z


In [38]:
from langchain_openai import AzureChatOpenAI
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
import json
import os
import dotenv
from pyprojroot import here
from graphviz import Digraph

# Load environment variables from .env file
dotenv.load_dotenv()

# Azure OpenAI credentials
azure_openai_api_key = os.getenv("AZURE_OPENAI_API_KEY")
azure_deployment = os.getenv("AZURE_MODEL_DEPLOYMENT_NAME")
azure_api_version = os.getenv("AZURE_OPENAI_API_VERSION")
azure_endpoint = os.getenv("AZURE_OPENAI_ENDPOINT")

if not azure_openai_api_key:
    raise ValueError("AZURE_OPENAI_API_KEY environment variable not set.")
print(f"AZURE_OPENAI_API_KEY: {azure_openai_api_key[:4]}...")

# Initialize LLM
llm = AzureChatOpenAI(
    azure_deployment=azure_deployment,
    api_version=azure_api_version,
    temperature=0,
    openai_api_key=azure_openai_api_key,
    azure_endpoint=azure_endpoint
)

# Prompt for challenge-to-solution graph
prompt = PromptTemplate.from_template("""
You are a scientific assistant tasked with visualizing material performance challenges.

Given a challenge description, show the logical relationship as a Graphviz DOT graph:
- nodes: Challenge → Impact → Solution → Evidence
- edges: direct the logical flow
- include brief summary text in node labels

Input:
{challenge_info}

Output only the DOT graph.
""")

chain = LLMChain(llm=llm, prompt=prompt)

# Load JSON file
file_path = str(here("data/papers/Na-Mn-O/000690060_synthesis_and_challenges.json"))
with open(file_path, "r") as f:
    synthesis_data = json.load(f)

challenges = synthesis_data.get("challenges_info", [])

for i, ch in enumerate(challenges):
    material = ch.get("material")
    challenge = ch.get("challenge", {})
    solution = ch.get("solution", {})
    evidence = ch.get("evidence", {})

    challenge_info = f"""
Material: {material}
Challenge: {challenge.get('description')} (type: {challenge.get('type')})
Impact: {challenge.get('impact')}
Solution: {solution.get('description')} ({solution.get('methodology')})
Effectiveness: {solution.get('effectiveness')}
Evidence: {evidence.get('data')}
"""

    dot_code = chain.run(challenge_info=challenge_info)
    print(f"Challenge Graph {i+1} for {material}\n")
    print(dot_code)

    # Optional: Render the DOT graph
    graph = Digraph(comment=f"Challenge {i+1}")
    for line in dot_code.splitlines():
        line = line.strip()
        if line.startswith("node ["):
            continue  # skip style declarations
        if "->" in line or "[" in line:
            graph.body.append(line)

    graph.render(f"challenge_graph_{i+1}", format="png", cleanup=True)
    print(f"Saved challenge_graph_{i+1}.png for {material}\n")

AZURE_OPENAI_API_KEY: GEQy...
Challenge Graph 1 for P2−NaxMnO2 flakes

```dot
digraph MaterialPerformance {
    rankdir=LR;
    node [shape=box];

    Challenge [label="Challenge: Partial phase transition to orthorhombic crystal system upon Na+ insertion"];
    Impact [label="Impact: Structural inhomogeneity reduces cycling stability"];
    Solution [label="Solution: Co-doping (~10%) suppresses structural transformations"];
    Evidence [label="Evidence: Structural studies show suppression of phase transitions in Co-doped materials"];

    Challenge -> Impact;
    Impact -> Solution;
    Solution -> Evidence;
}
```
Saved challenge_graph_1.png for P2−NaxMnO2 flakes

Challenge Graph 2 for P2−NaxMnO2 spheres

```dot
digraph MaterialPerformance {
    rankdir=LR;
    node [shape=box];

    Challenge [label="Challenge: Higher strain within the structure due to reduced volume expansion and contraction. (Structural)"];
    Impact [label="Impact: Strain accumulation may lead to structural fatig

In [None]:
#### We can also use https://todiagram.com/editor to visualize the graph


In [37]:
chain

LLMChain(verbose=False, prompt=PromptTemplate(input_variables=['steps_text'], input_types={}, partial_variables={}, template='\nYou are a scientific assistant tasked with visualizing a synthesis procedure.\n\nConvert the synthesis steps below into a Graphviz DOT format directed graph.\nEach node should represent a step labeled with:\n- step number\n- step name\n- important reagents, solvents, or conditions\n\nUse arrows to represent step order.\nReturn only a complete Graphviz DOT code block.\n\nSteps:\n{steps_text}\n'), llm=AzureChatOpenAI(client=<openai.resources.chat.completions.completions.Completions object at 0x1498c6270>, async_client=<openai.resources.chat.completions.completions.AsyncCompletions object at 0x149952270>, root_client=<openai.lib.azure.AzureOpenAI object at 0x149655760>, root_async_client=<openai.lib.azure.AsyncAzureOpenAI object at 0x1498c7e90>, temperature=0.0, model_kwargs={}, openai_api_key=SecretStr('**********'), disabled_params={'parallel_tool_calls': None}