In [35]:
from openai import OpenAI
import pprint

# Build the API client. No key is written in the notebook: when `api_key` is
# omitted the SDK reads it from the OPENAI_API_KEY environment variable, which
# keeps the secret out of the source and out of any saved/shared copy.
client = OpenAI()

# Create the assistant with the `file_search` tool enabled.
assistant = client.beta.assistants.create(
  name="SBIR Assistant",
  instructions="You are an expert on government grants and funding. You are able to answer questions about the grants and funding available to small businesses.",
  model="gpt-4o-mini",
  tools=[{"type": "file_search"}],
)

In [23]:
from contextlib import ExitStack

# Create a vector store called "SBIR Grants"
vector_store = client.beta.vector_stores.create(name="SBIR Grants")

# Files to upload to OpenAI
file_paths = ["./DoD_grants.pdf", "./2025_NASA_SBIR_PhaseI.pdf"]

# Open every file inside an ExitStack so that all handles are closed as soon as
# the upload finishes (or raises), instead of staying open for the life of the
# kernel.
with ExitStack() as open_files:
    file_streams = [open_files.enter_context(open(path, "rb")) for path in file_paths]

    # Use the upload and poll SDK helper to upload the files, add them to the
    # vector store, and poll the status of the file batch for completion.
    file_batch = client.beta.vector_stores.file_batches.upload_and_poll(
      vector_store_id=vector_store.id, files=file_streams
    )

# Print the status and the file counts of the batch to see the result of this operation.
print(file_batch.status)
print(file_batch.file_counts)

completed
FileCounts(cancelled=0, completed=2, failed=0, in_progress=0, total=2)


In [24]:
# Point the assistant's file_search tool at the vector store, then keep the
# updated assistant object returned by the API.
file_search_resources = {"file_search": {"vector_store_ids": [vector_store.id]}}
assistant = client.beta.assistants.update(
  assistant_id=assistant.id,
  tool_resources=file_search_resources,
)

In [25]:
# Create an empty thread and post the user's question to it. Nothing is
# attached to the thread or to the message here; the vector store is attached
# to the assistant instead.
thread = client.beta.threads.create()

message = client.beta.threads.messages.create(
    thread_id=thread.id,
    role="user",
    content="Use the PDF file Dod_grants that is in your vector store. Only use that to answer this prompt: Give me all grants related to supply chain vulnerabilities."
)

# Show the thread's own file_search resources. Because no vector store was
# attached to this thread, `None` is the expected output. `tool_resources`
# may itself be missing, so guard the attribute access instead of assuming it.
thread_resources = thread.tool_resources
print(thread_resources.file_search if thread_resources is not None else None)

None


In [26]:
# Use the create and poll SDK helper to create a run and poll the status of
# the run until it's in a terminal state.
run = client.beta.threads.runs.create_and_poll(
    thread_id=thread.id, assistant_id=assistant.id
)

# A terminal state is not necessarily a successful one. Fail with the run's
# own status/error instead of an opaque IndexError on `messages[0]` below.
if run.status != "completed":
    raise RuntimeError(f"Run {run.id} ended with status {run.status!r}: {run.last_error}")

messages = list(client.beta.threads.messages.list(thread_id=thread.id, run_id=run.id))
if not messages:
    raise RuntimeError(f"Run {run.id} completed without producing a message")

message_content = messages[0].content[0].text
annotations = message_content.annotations
citations = []
cited_filenames = {}  # file_id -> filename, so each cited file is retrieved only once
for index, annotation in enumerate(annotations):
    # Replace the raw annotation marker in the answer with a short "[n]" reference.
    message_content.value = message_content.value.replace(annotation.text, f"[{index}]")
    if file_citation := getattr(annotation, "file_citation", None):
        file_id = file_citation.file_id
        if file_id not in cited_filenames:
            cited_filenames[file_id] = client.files.retrieve(file_id).filename
        citations.append(f"[{index}] {cited_filenames[file_id]}")

print(message_content.value)
print("\n".join(citations))

Here are the grants related to supply chain vulnerabilities from the provided document:

1. **Resilience against Supply Chain Cyber Vulnerabilities**
   - **Objective:** Develop a technology that ensures computing hardware integrated into future combat systems is trustworthy and secure against cyber threats.
   - **Description:** Focuses on enhancing the security and robustness of shipboard computing infrastructure, which includes the integration of advanced multi-die systems. This technology is crucial for adapting to and responding to stresses within the IT environment, with an emphasis on cyber security[0].

2. **Smart Contracts for Supply Chain Risk Management (SCRM)**
   - **Objective:** Develop an acquisition and sustainment contracting framework that utilizes blockchain technology to manage supply chain risks.
   - **Description:** The project aims to improve visibility and responsiveness within the global supply chain, particularly concerning sub-tier vendors. The implementatio

In [11]:
# Testing a follow-up question: post it on the same thread, so the run sees the
# earlier question and answer as conversation history.
follow_up = client.beta.threads.messages.create(
  thread_id=thread.id,
  role="user",
  content="Tell me more about the first grant"
)

follow_up_run = client.beta.threads.runs.create_and_poll(
    thread_id=thread.id, assistant_id=assistant.id
)

# create_and_poll returns on any terminal state, so confirm the run actually
# completed before reading its reply.
if follow_up_run.status != "completed":
    raise RuntimeError(
        f"Run {follow_up_run.id} ended with status {follow_up_run.status!r}: {follow_up_run.last_error}"
    )

messages = list(client.beta.threads.messages.list(thread_id=thread.id, run_id=follow_up_run.id))
if not messages:
    raise RuntimeError(f"Run {follow_up_run.id} completed without producing a message")

message_content = messages[0].content[0].text
print(message_content.value)

Here are the details regarding the grant titled **Resilience against Supply Chain Cyber Vulnerabilities**:

- **Objective**: The primary aim of this grant is to develop technology that ensures that computing hardware technologies integrated into future combat systems are trustworthy and cyber secure. This is critical for maintaining the integrity and functionality of defense systems within potentially hostile environments.

- **Description**: The shipboard computing infrastructure currently comprises over 3,000 Central Processing Unit (CPU) cores distributed across various military-grade cabinets situated in multiple spaces within a ship. This distributed setup is designed to enhance operational survivability in the event of damage to specific components. However, this inherent complexity brings significant challenges in maintaining security, robustness, trustworthiness, and performance of the computing infrastructure. 

The proposed technology focuses on infrastructure resilience, whi

In [33]:
from pydantic import BaseModel
import pdfplumber
import tiktoken  # OpenAI's tokenizer
import time

# Schema for one structured record extracted from the NASA SBIR solicitation
# text. It is passed as `response_format` to the chat-completions `parse()`
# call, so field names, order and types define what the model is asked to fill.
class NASASBIRExtraction(BaseModel):
    # presumably the focus-area letter of the subtopic (e.g. 'A'); the sample
    # output also shows full ids such as 'A1.08' landing here — TODO confirm
    # the intended format
    subtopic_letter: str
    # presumably the numeric part of the subtopic id (e.g. '1.02'); the sample
    # output also shows full ids such as 'A1.08' — TODO confirm
    subtopic_number: str
    # Subtopic title as it appears in the document
    title: str
    # Title of a scope within the subtopic
    scope_title: str
    # Free-text TRL range; sample output shows values like '2 to 5'
    expected_TRL_at_completion: str
    # Keywords describing the subtopic
    keywords: list[str]

# Function to extract text from PDF
def extract_pdf_text(pdf_path):
    """Return the text of every page of a PDF, joined by newlines.

    Args:
        pdf_path: Path of the PDF file; passed straight to ``pdfplumber.open``.

    Returns:
        One string containing the text of each page separated by a newline.
        Pages whose ``extract_text()`` result is empty or ``None`` are skipped.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # Call extract_text() exactly once per page and reuse the result for
        # both the emptiness check and the output.
        page_texts = [text for page in pdf.pages if (text := page.extract_text())]
    return "\n".join(page_texts)

# Function to split text into chunks within the token limit
def chunk_text(text, max_tokens=10_000, model="gpt-4o-mini"):
    """Split text into consecutive pieces of at most ``max_tokens`` tokens.

    The text is tokenized with the tiktoken encoding for ``model``, the token
    list is cut into fixed-size windows, and each window is decoded back to a
    string. Cuts are made purely by token count, so a chunk boundary can fall
    in the middle of a sentence.

    Args:
        text: The text to split.
        max_tokens: Maximum number of tokens per chunk; must be positive.
        model: Model name used to select the tiktoken encoding.

    Returns:
        A list of text chunks in document order; empty when ``text`` encodes
        to no tokens.

    Raises:
        ValueError: If ``max_tokens`` is not positive. Without this check a
            negative value would yield an empty list and silently drop all text.
    """
    if max_tokens <= 0:
        raise ValueError(f"max_tokens must be a positive integer, got {max_tokens!r}")

    enc = tiktoken.encoding_for_model(model)
    tokens = enc.encode(text)

    # Slice the token list into windows and decode each window back into text.
    return [
        enc.decode(tokens[start:start + max_tokens])
        for start in range(0, len(tokens), max_tokens)
    ]

# Step 1: Extract and chunk text
pdf_text = extract_pdf_text("2025_NASA_SBIR_PhaseI.pdf")
chunks = chunk_text(pdf_text)

grant_results = []

# Step 2: Send each chunk separately
# NOTE(review): the response format is a single NASASBIRExtraction, so at most
# one record is collected per chunk even if a chunk covers several subtopics —
# confirm whether that is intended.
for chunk in chunks:
    try:
        completion = client.beta.chat.completions.parse(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": "You are an expert at structured data extraction for grant proposals. Extract structured data from the provided text."},
                {"role": "user", "content": chunk},
            ],
            response_format=NASASBIRExtraction,  # Using OpenAI's `parse()`
        )
        reply = completion.choices[0].message
        if reply.parsed is None:
            # The model can refuse, in which case there is no parsed object;
            # report it instead of storing None in the results.
            print(f"Skipping chunk with no parsed result: {reply.refusal}")
        else:
            grant_results.append(reply.parsed)

        time.sleep(1)  # Prevent rate limit issues
    except Exception as e:
        # Best effort: report the failing chunk and continue with the rest.
        print(f"Error processing chunk: {e}")

# Print results
print(grant_results)

[NASASBIRExtraction(subtopic_letter='A', subtopic_number='1.02', title='Quiet Performance - Airframe Noise (SBIR)', scope_title='Airframe Noise Analysis and Characterization', expected_TRL_at_completion='2 to 5', keywords=['airframe noise', 'aeroacoustic analysis', 'computational fluid dynamics', 'propulsion efficiency', 'aircraft design']), NASASBIRExtraction(subtopic_letter='A1.08', subtopic_number='A1.08', title='Aeronautics Ground Test and Measurement Technologies: Diagnostic Systems for High-Speed Flows and Icing (SBIR)', scope_title='Miniaturized Flow Diagnostics for High-Speed Flows', expected_TRL_at_completion='4 to 7', keywords=['high-speed flows', 'diagnostic systems', 'miniaturized flow diagnostics', 'aeronautics', 'wind tunnel measurements']), NASASBIRExtraction(subtopic_letter='A2.04', subtopic_number='A2.04', title='Aviation Cybersecurity', scope_title='On-Board-Multicast-Network Systems Monitoring and Anomaly Detection with Reporting', expected_TRL_at_completion='4 to 5'

In [37]:
# Pretty-print the extracted records, one per line, using the default printer settings.
pprint.PrettyPrinter().pprint(grant_results)

[NASASBIRExtraction(subtopic_letter='A', subtopic_number='1.02', title='Quiet Performance - Airframe Noise (SBIR)', scope_title='Airframe Noise Analysis and Characterization', expected_TRL_at_completion='2 to 5', keywords=['airframe noise', 'aeroacoustic analysis', 'computational fluid dynamics', 'propulsion efficiency', 'aircraft design']),
 NASASBIRExtraction(subtopic_letter='A1.08', subtopic_number='A1.08', title='Aeronautics Ground Test and Measurement Technologies: Diagnostic Systems for High-Speed Flows and Icing (SBIR)', scope_title='Miniaturized Flow Diagnostics for High-Speed Flows', expected_TRL_at_completion='4 to 7', keywords=['high-speed flows', 'diagnostic systems', 'miniaturized flow diagnostics', 'aeronautics', 'wind tunnel measurements']),
 NASASBIRExtraction(subtopic_letter='A2.04', subtopic_number='A2.04', title='Aviation Cybersecurity', scope_title='On-Board-Multicast-Network Systems Monitoring and Anomaly Detection with Reporting', expected_TRL_at_completion='4 to 