<a href="https://colab.research.google.com/github/austin-hua/2015-ui/blob/master/Untitled4%20-%20Seth.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# @title Run this cell => Restart the session => Start executing the below cells **(DO NOT EXECUTE THIS CELL AGAIN)**
# Core LangChain and AI ecosystem packages
!pip install -q \
    langchain==0.3.21 \
    huggingface_hub==0.29.3 \
    openai==1.68.2 \
    chromadb==0.6.3 \
    langchain_openai==0.3.10 \
    langchain-community==0.3.20 \
    lark==1.2.2 \
    rank_bm25==0.2.2 \
    numpy==2.1.0 \
    scipy==1.15.2 \
    scikit-learn==1.6.1 \
    transformers==4.50.0 \
    pypdf==5.4.0 \
    tiktoken==0.9.0 \
    sentence_transformers==4.0.0
# Install locally or in notebook:
!pip install langchain-community pypdf
!pip install -q langchain-community pypdf

# Then in Python:
from langchain_community.document_loaders import PyPDFLoader

# PyTorch with CUDA 12.4 support
!pip install torch==2.6.0+cu124 torchvision torchaudio --index-url https://download.pytorch.org/whl/cu124

import warnings
warnings.filterwarnings('ignore')

# @title Loading the `config.json` file
import json, os

# Load the JSON file and extract values
file_name = 'config.json'
with open(file_name, 'r') as file:
    config = json.load(file)
    os.environ['OPENAI_API_KEY']  = config.get("API_KEY") # Loading the API Key
    os.environ["OPENAI_BASE_URL"] = config.get("OPENAI_API_BASE") # Loading the API Base Url

# @title Defining the LLM Model - Use `gpt-4o` Model
from langchain_openai import ChatOpenAI
llm = ChatOpenAI(model="gpt-4o", temperature=0)

# from langchain.document_loaders import PyPDFLoader




# Load NOFO and extract topic
from langchain.document_loaders import PyPDFLoader

pdf_file = "/content/NOFO_pdf.pdf"
loader = PyPDFLoader("/content/NOFO_pdf.pdf", mode="single")
NOFO_pdf = loader.load() # Changed from pdf_loader.load()
doc_text = "\n\n".join([doc.page_content for doc in NOFO_pdf])
doc_text = doc_text[:12000]  # Token-safe truncation


# Prompt to extract topic
topic_extraction_prompt = f"""
You are a grant analyst. Carefully read the following NOFO text and identify the **topic** for which funding is being provided.

Only return the topic name. Do not include any explanation, context, or extra text.

Text:
\"\"\"
{doc_text}
\"\"\"
"""

topic_extraction = llm.invoke(topic_extraction_prompt)
topic = topic_extraction.content.strip
print("Extracted Topic:", topic)

import zipfile
import os

# Unzip papers
with zipfile.ZipFile("/content/ResearchPapers.zip", 'r') as zip_ref:
    zip_ref.extractall("/content/")

# Set research paper folder path
papers_path = "/content/Papers/"


# Reload NOFO for use in prompt
nofo_loader = PyPDFLoader("/content/NOFO_pdf.pdf")
nofo_docs = nofo_loader.load()
NOFO_text = "\n\n".join([doc.page_content for doc in nofo_docs])
NOFO_text = NOFO_text[:12000]

relevance_prompt = f"""
You are an expert grant reviewer and research analyst.

Analyze the relevance of the following research paper in relation to the topic, goals, objectives, and funding criteria outlined in the NOFO document.

Your task is to:
1. Determine whether the research aligns with the funding opportunity goals, objectives, and evaluation priorities.
2. Assess whether the research could reasonably be used to support or inspire a viable project proposal under this NOFO.
3. Evaluate the relevance based on domain, methodology, or application.

If the research paper does not relate to the topic by domain, methodology, or intended application, return:
**"Paper not related to topic"**

### NOFO Topic:
{topic}



---

NOFO Document:
\"\"\"
{NOFO_text}
\"\"\"

Research Paper:
\"\"\"
"""  # This ends the prompt template before appending paper text

import tiktoken
import requests

from tiktoken import get_encoding
encoding = get_encoding("cl100k_base")
MAX_TOKENS = 120000

documents = []
relevant_papers_count = 0
irrelevant_papers_count = 0
total_files = len([f for f in os.listdir(papers_path) if f.endswith(".pdf")])
progress_cnt = 1

for filename in os.listdir(papers_path):
    if filename.endswith(".pdf"):
        file_path = os.path.join(papers_path, filename)

        try:
            # Load research paper
            paper_loader = PyPDFLoader(file_path,mode="single")
            paper_docs = paper_loader.load()
            paper_text = paper_docs[0].page_content

            # Truncate paper text to fit token limit
            base_tokens = len(encoding.encode(relevance_prompt_template))
            available_tokens = MAX_TOKENS - base_tokens
            truncated_pages = encoding.decode(encoding.encode(paper_text)[:available_tokens])

            # Complete the full prompt
            full_prompt = relevance_prompt + truncated_pages + '\n\"\"\"\n\nReturn your answer in the following format:\n- Relevance Summary:\n- Alignment with NOFO Goals:\n- Potential for Use in Proposal Development:'

            # Invoke LLM
            response = llm.invoke(full_prompt)
            print(response.content)
            print(f"Successfully processed: {progress_cnt}/{total_files}")
            progress_cnt += 1

			# if "PAPER RELEVANT TO TOPIC" in response.content:
			#	relevant_papers_count +1


            if "PAPER NOT RELATED TO TOPIC" in response.content:
                irrelevant_papers_count += 1
                continue

            documents.append({
                'title': filename,
                'llm_response': response.content,
                'file_path': file_path
            })
            relevant_papers_count += 1

        except Exception as e:
            print(f"!!! Error processing {filename}: {str(e)}")

# Summary
print("="*50)
docs = loader.load()
print(docs[0].page_content[:500])  # First 500 characters
print(f"Relevant Papers: {relevant_papers_count}/{total_files}")
print(f"Irrelevant Papers: {irrelevant_papers_count}/{total_files}")

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.9/60.9 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.3/16.3 MB[0m [31m60.6 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tensorflow 2.18.0 requires numpy<2.1.0,>=1.26.0, but you have numpy 2.1.0 which is incompatible.
numba 0.60.0 requires numpy<2.1,>=1.22, but you have numpy 2.1.0 which is incompatible.[0m[31m
Looking in indexes: https://download.pytorch.org/whl/cu124




RateLimitError: Error code: 429 - {'reason': {'error': 'You exceeded your current quota!!'}}

In [None]:
# @title Run this cell => Restart the session => Start executing the below cells **(DO NOT EXECUTE THIS CELL AGAIN)**
# Core LangChain and AI ecosystem packages
!pip install -q \
    langchain==0.3.21 \
    huggingface_hub==0.29.3 \
    openai==1.68.2 \
    chromadb==0.6.3 \
    langchain_openai==0.3.10 \
    langchain-community==0.3.20 \
    lark==1.2.2 \
    rank_bm25==0.2.2 \
    numpy==2.1.0 \
    scipy==1.15.2 \
    scikit-learn==1.6.1 \
    transformers==4.50.0 \
    pypdf==5.4.0 \
    tiktoken==0.9.0 \
    sentence_transformers==4.0.0
# Install locally or in notebook:
!pip install langchain-community pypdf
!pip install -q langchain-community pypdf
!pip install tenacity

# Then in Python:
from langchain_community.document_loaders import PyPDFLoader

# PyTorch with CUDA 12.4 support
!pip install torch==2.6.0+cu124 torchvision torchaudio --index-url https://download.pytorch.org/whl/cu124

import warnings
warnings.filterwarnings('ignore')

# @title Loading the `config.json` file
import json, os

# Load the JSON file and extract values
file_name = 'config.json'
with open(file_name, 'r') as file:
    config = json.load(file)
    os.environ['OPENAI_API_KEY']  = config.get("API_KEY") # Loading the API Key
    os.environ["OPENAI_BASE_URL"] = config.get("OPENAI_API_BASE") # Loading the API Base Url

# @title Defining the LLM Model - Use `gpt-4o` Model
from langchain_openai import ChatOpenAI
llm = ChatOpenAI(model="gpt-4o", temperature=0)

# from langchain.document_loaders import PyPDFLoader




# Load NOFO and extract topic
from langchain.document_loaders import PyPDFLoader

pdf_file = "/content/NOFO_pdf.pdf"
loader = PyPDFLoader("/content/NOFO_pdf.pdf", mode="single")
NOFO_pdf = loader.load() # Changed from pdf_loader.load()
doc_text = "\n\n".join([doc.page_content for doc in NOFO_pdf])
doc_text = doc_text[:12000]  # Token-safe truncation


# Prompt to extract topic
topic_extraction_prompt = f"""
You are a grant analyst. Carefully read the following NOFO text and identify the **topic** for which funding is being provided.

Only return the topic name. Do not include any explanation, context, or extra text.

Text:
\"\"\"
{doc_text}
\"\"\"
"""

# Implement retry mechanism for the LLM call
@tenacity.retry(
    wait=tenacity.wait_exponential(multiplier=1, min=4, max=60), # Wait exponentially between retries
    stop=tenacity.stop_after_attempt(5), # Stop after 5 attempts
    before_sleep=tenacity.before_sleep_log(print, logging.INFO), # Log before sleeping
    retry=tenacity.retry_if_exception_type(RateLimitError) # Retry specifically on RateLimitError
)
def invoke_llm_with_retry(prompt):
    """Helper function to invoke LLM with retry logic."""
    return llm.invoke(prompt)

# Call the LLM using the retry function
try:
    topic_extraction = invoke_llm_with_retry(topic_extraction_prompt)
    topic = topic_extraction.content.strip() # Added parentheses to call the strip method
    print("Extracted Topic:", topic)
except RateLimitError as e:
    print(f"Failed to extract topic after multiple retries due to rate limit: {e}")
except Exception as e:
    print(f"An unexpected error occurred during topic extraction: {e}")

import zipfile
import os

# Unzip papers
with zipfile.ZipFile("/content/ResearchPapers.zip", 'r') as zip_ref:
    zip_ref.extractall("/content/")

# Set research paper folder path
papers_path = "/content/Papers/"


# Reload NOFO for use in prompt
nofo_loader = PyPDFLoader("/content/NOFO_pdf.pdf")
nofo_docs = nofo_loader.load()
NOFO_text = "\n\n".join([doc.page_content for doc in nofo_docs])
NOFO_text = NOFO_text[:12000]

relevance_prompt = f"""
You are an expert grant reviewer and research analyst.

Analyze the relevance of the following research paper in relation to the topic, goals, objectives, and funding criteria outlined in the NOFO document.

Your task is to:
1. Determine whether the research aligns with the funding opportunity goals, objectives, and evaluation priorities.
2. Assess whether the research could reasonably be used to support or inspire a viable project proposal under this NOFO.
3. Evaluate the relevance based on domain, methodology, or application.

If the research paper does not relate to the topic by domain, methodology, or intended application, return:
**"Paper not related to topic"**

### NOFO Topic:
{topic}



---

NOFO Document:
\"\"\"
{NOFO_text}
\"\"\"

Research Paper:
\"\"\"
"""  # This ends the prompt template before appending paper text

import tiktoken
import requests
import logging # Import logging for tenacity

from tiktoken import get_encoding
encoding = get_encoding("cl100k_base")
MAX_TOKENS = 120000

documents = []
relevant_papers_count = 0
irrelevant_papers_count = 0
total_files = len([f for f in os.listdir(papers_path) if f.endswith(".pdf")])
progress_cnt = 1

for filename in os.listdir(papers_path):
    if filename.endswith(".pdf"):
        file_path = os.path.join(papers_path, filename)

        try:
            # Load research paper
            paper_loader = PyPDFLoader(file_path,mode="single")
            paper_docs = paper_loader.load()
            paper_text = paper_docs[0].page_content

            # Truncate paper text to fit token limit
            base_tokens = len(encoding.encode(relevance_prompt_template))
            available_tokens = MAX_TOKENS - base_tokens
            truncated_pages = encoding.decode(encoding.encode(paper_text)[:available_tokens])

            # Complete the full prompt
            full_prompt = relevance_prompt + truncated_pages + '\n\"\"\"\n\nReturn your answer in the following format:\n- Relevance Summary:\n- Alignment with NOFO Goals:\n- Potential for Use in Proposal Development:'

            # Invoke LLM with Tenacity
			@tenacity.retry(
                wait=tenacity.wait_exponential(multiplier=1, min=4, max=60),
                stop=tenacity.stop_after_attempt(5),
                before_sleep=tenacity.before_sleep_log(print, logging.INFO),
                retry=tenacity.retry_if_exception_type(RateLimitError)
            )
            def invoke_llm_for_relevance(prompt):
                """Helper function to invoke LLM with retry logic for relevance check."""
                return llm.invoke(prompt)

			response = llm.invoke(full_prompt)
			print(response.content)

            print(f"Successfully processed: {progress_cnt}/{total_files}")
            progress_cnt += 1

			# if "PAPER RELEVANT TO TOPIC" in response.content:
			#	relevant_papers_count +1


            if "PAPER NOT RELATED TO TOPIC" in response.content:
                irrelevant_papers_count += 1
                continue

            documents.append({
                'title': filename,
                'llm_response': response.content,
                'file_path': file_path
            })
            relevant_papers_count += 1

        except Exception as e:
            print(f"!!! Error processing {filename}: {str(e)}")

# Summary
print("="*50)
# Make sure 'loader' is still the NOFO loader here
docs = loader.load()
print(docs[0].page_content[:500])  # First 500 characters
print(f"Relevant Papers: {relevant_papers_count}/{total_files}")
print(f"Irrelevant Papers: {irrelevant_papers_count}/{total_files}")

TabError: inconsistent use of tabs and spaces in indentation (ipython-input-10-733272735.py, line 159)

In [None]:
# @title Run this cell => Restart the session => Start executing the below cells **(DO NOT EXECUTE THIS CELL AGAIN)**
# Core LangChain and AI ecosystem packages
!pip install -q \
    langchain==0.3.21 \
    huggingface_hub==0.29.3 \
    openai==1.68.2 \
    chromadb==0.6.3 \
    langchain_openai==0.3.10 \
    langchain-community==0.3.20 \
    lark==1.2.2 \
    rank_bm25==0.2.2 \
    numpy==2.1.0 \
    scipy==1.15.2 \
    scikit-learn==1.6.1 \
    transformers==4.50.0 \
    pypdf==5.4.0 \
    tiktoken==0.9.0 \
    sentence_transformers==4.0.0
# Install locally or in notebook:
!pip install langchain-community pypdf
!pip install -q langchain-community pypdf
!pip install tenacity

# Then in Python:
from langchain_community.document_loaders import PyPDFLoader

# PyTorch with CUDA 12.4 support
!pip install torch==2.6.0+cu124 torchvision torchaudio --index-url https://download.pytorch.org/whl/cu124

import warnings
warnings.filterwarnings('ignore')

# @title Loading the `config.json` file
import json, os

# Load the JSON file and extract values
file_name = 'config.json'
with open(file_name, 'r') as file:
    config = json.load(file)
    os.environ['OPENAI_API_KEY']  = config.get("API_KEY") # Loading the API Key
    os.environ["OPENAI_BASE_URL"] = config.get("OPENAI_API_BASE") # Loading the API Base Url

# @title Defining the LLM Model - Use `gpt-4o` Model
from langchain_openai import ChatOpenAI
llm = ChatOpenAI(model="gpt-4o", temperature=0)

# from langchain.document_loaders import PyPDFLoader




# Load NOFO and extract topic
from langchain.document_loaders import PyPDFLoader

pdf_file = "/content/NOFO_pdf.pdf"
loader = PyPDFLoader("/content/NOFO_pdf.pdf", mode="single")
NOFO_pdf = loader.load() # Changed from pdf_loader.load()
doc_text = "\n\n".join([doc.page_content for doc in NOFO_pdf])
doc_text = doc_text[:12000]  # Token-safe truncation


# Prompt to extract topic
topic_extraction_prompt = f"""
You are a grant analyst. Carefully read the following NOFO text and identify the **topic** for which funding is being provided.

Only return the topic name. Do not include any explanation, context, or extra text.

Text:
\"\"\"
{doc_text}
\"\"\"
"""

# Initialize topic to a default value before the try block
topic = ""

# Implement retry mechanism for the LLM call
import tenacity
import logging
from openai import RateLimitError # Import RateLimitError for retry logic

@tenacity.retry(
    wait=tenacity.wait_exponential(multiplier=1, min=4, max=60), # Wait exponentially between retries
    stop=tenacity.stop_after_attempt(5), # Stop after 5 attempts
    before_sleep=tenacity.before_sleep_log(logging.getLogger(__name__), logging.INFO), # Log before sleeping
    retry=tenacity.retry_if_exception_type(RateLimitError) # Retry specifically on RateLimitError
)
def invoke_llm_with_retry(prompt):
    """Helper function to invoke LLM with retry logic."""
    return llm.invoke(prompt)

# Call the LLM using the retry function
try:
    topic_extraction = invoke_llm_with_retry(topic_extraction_prompt)
    topic = topic_extraction.content.strip() # Added parentheses to call the strip method
    print("Extracted Topic:", topic)
except RateLimitError as e:
    print(f"Failed to extract topic after multiple retries due to rate limit: {e}")
except Exception as e:
    print(f"An unexpected error occurred during topic extraction: {e}")

import zipfile
import os

# Unzip papers
with zipfile.ZipFile("/content/ResearchPapers.zip", 'r') as zip_ref:
    zip_ref.extractall("/content/")

# Set research paper folder path
papers_path = "/content/Papers/"


# Reload NOFO for use in prompt
nofo_loader = PyPDFLoader("/content/NOFO_pdf.pdf")
nofo_docs = nofo_loader.load()
NOFO_text = "\n\n".join([doc.page_content for doc in nofo_docs])
NOFO_text = NOFO_text[:12000]

relevance_prompt_template = f"""
You are an expert grant reviewer and research analyst.

Analyze the relevance of the following research paper in relation to the topic, goals, objectives, and funding criteria outlined in the NOFO document.

Your task is to:
1. Determine whether the research aligns with the funding opportunity goals, objectives, and evaluation priorities.
2. Assess whether the research could reasonably be used to support or inspire a viable project proposal under this NOFO.
3. Evaluate the relevance based on domain, methodology, or application.

If the research paper does not relate to the topic by domain, methodology, or intended application, return:
**"Paper not related to topic"**

### NOFO Topic:
{topic}



---

NOFO Document:
\"\"\"
{NOFO_text}
\"\"\"

Research Paper:
\"\"\"
"""  # This ends the prompt template before appending paper text

# Define the helper function for invoking LLM for relevance check outside the loop
@tenacity.retry(
    wait=tenacity.wait_exponential(multiplier=1, min=4, max=60),
    stop=tenacity.stop_after_attempt(5),
    before_sleep=tenacity.before_sleep_log(logging.getLogger(__name__), logging.INFO),
    retry=tenacity.retry_if_exception_type(RateLimitError)
)
def invoke_llm_for_relevance(prompt):
    """Helper function to invoke LLM with retry logic for relevance check."""
    return llm.invoke(prompt)

import tiktoken
import requests
import logging # Import logging for tenacity

from tiktoken import get_encoding
encoding = get_encoding("cl100k_base")
MAX_TOKENS = 120000

documents = []
relevant_papers_count = 0
irrelevant_papers_count = 0
total_files = len([f for f in os.listdir(papers_path) if f.endswith(".pdf")])
progress_cnt = 1

for filename in os.listdir(papers_path):
    if filename.endswith(".pdf"):
        file_path = os.path.join(papers_path, filename)

        try:
            # Load research paper
            paper_loader = PyPDFLoader(file_path,mode="single")
            paper_docs = paper_loader.load()
            paper_text = paper_docs[0].page_content

            # Truncate paper text to fit token limit
            base_tokens = len(encoding.encode(relevance_prompt_template))
            available_tokens = MAX_TOKENS - base_tokens
            truncated_pages = encoding.decode(encoding.encode(paper_text)[:available_tokens])

            # Complete the full prompt
            full_prompt = relevance_prompt_template + truncated_pages + '\n\"\"\"\n\nReturn your answer in the following format:\n- Relevance Summary:\n- Alignment with NOFO Goals:\n- Potential for Use in Proposal Development:'

            # Invoke LLM using the helper function
            response = invoke_llm_for_relevance(full_prompt)
            print(response.content)

            print(f"Successfully processed: {progress_cnt}/{total_files}")
            progress_cnt += 1

            # if "PAPER RELEVANT TO TOPIC" in response.content:
            #   relevant_papers_count +1


            if "PAPER NOT RELATED TO TOPIC" in response.content:
                irrelevant_papers_count += 1
                continue

            documents.append({
                'title': filename,
                'llm_response': response.content,
                'file_path': file_path
            })
            relevant_papers_count += 1

        except RateLimitError as e:
             print(f"!!! Rate limit error processing {filename}: {str(e)}. Skipping for now.")
             irrelevant_papers_count += 1 # Or handle skipped files differently
        except Exception as e:
            print(f"!!! Error processing {filename}: {str(e)}")
            irrelevant_papers_count += 1 # Consider papers with processing errors as irrelevant for the count

# Summary
print("="*50)
# Make sure 'loader' is still the NOFO loader here
docs = loader.load()
print(docs[0].page_content[:500])  # First 500 characters
print(f"Relevant Papers: {relevant_papers_count}/{total_files}")
print(f"Irrelevant Papers: {irrelevant_papers_count}/{total_files}")

Looking in indexes: https://download.pytorch.org/whl/cu124




An unexpected error occurred during topic extraction: RetryError[<Future at 0x7a9717595910 state=finished raised RateLimitError>]




!!! Error processing Vol33Iss1_INSNApdf.pdf: RetryError[<Future at 0x7a971b9f1b10 state=finished raised RateLimitError>]
!!! Error processing RES2D.pdf: RetryError[<Future at 0x7a97286369d0 state=finished raised RateLimitError>]
!!! Error processing Sailer McCulloh Soc Net and Spatial Config.pdf: RetryError[<Future at 0x7a971b9d6910 state=finished raised RateLimitError>]
!!! Error processing McCullohCarleyJOSS.pdf: RetryError[<Future at 0x7a971b9ddfd0 state=finished raised RateLimitError>]
!!! Error processing Cross_Platform_Information_Spread_During_the_January_6th_Capitol_Riots.pdf: RetryError[<Future at 0x7a971de07650 state=finished raised RateLimitError>]
!!! Error processing jfq-110_46-53_Cruickshank.pdf: RetryError[<Future at 0x7a9728635dd0 state=finished raised RateLimitError>]
!!! Error processing Multi_Agent_Systems_for_Frame_Detection.pdf: RetryError[<Future at 0x7a971b93d7d0 state=finished raised RateLimitError>]
!!! Error processing BotBuster___AAAI.pdf: RetryError[<Future 



!!! Error processing ONA-in-R.pdf: RetryError[<Future at 0x7a97221eb950 state=finished raised RateLimitError>]
!!! Error processing ONA-using-igraph.pdf: RetryError[<Future at 0x7a9723240b90 state=finished raised RateLimitError>]
!!! Error processing Quantifying_Information_Advantage.pdf: RetryError[<Future at 0x7a970e50eb90 state=finished raised RateLimitError>]
!!! Error processing EmergencyResponseAI.pdf: RetryError[<Future at 0x7a97233340d0 state=finished raised RateLimitError>]
!!! Error processing Leveraging_AI_to_Improve_Viral_Information_Detection_in_Online_Discourse.pdf: RetryError[<Future at 0x7a972106db10 state=finished raised RateLimitError>]
!!! Error processing The ABCs of AI-Enabled Intelligence Analysis - War on the Rocks.pdf: RetryError[<Future at 0x7a9723357e90 state=finished raised RateLimitError>]
!!! Error processing Arrow White Paper DExTra.pdf: RetryError[<Future at 0x7a97284cb650 state=finished raised RateLimitError>]
!!! Error processing ALL18.pdf: RetryError[<



!!! Error processing LLM_UQ.pdf: RetryError[<Future at 0x7a971b9d60d0 state=finished raised RateLimitError>]




!!! Error processing cycon-final-draft.pdf: RetryError[<Future at 0x7a9722155590 state=finished raised RateLimitError>]
!!! Error processing HIV.pdf: RetryError[<Future at 0x7a972241ccd0 state=finished raised RateLimitError>]
!!! Error processing YouTube-COVID.pdf: RetryError[<Future at 0x7a972234e810 state=finished raised RateLimitError>]
!!! Error processing Tweets-to-touchdowns.pdf: RetryError[<Future at 0x7a9723331f50 state=finished raised RateLimitError>]
!!! Error processing Helene_and_Milton_ACM.pdf: RetryError[<Future at 0x7a971b9dcd50 state=finished raised RateLimitError>]
!!! Error processing Kent2022_Chapter_MicroscopicMarkovChainApproach.pdf: RetryError[<Future at 0x7a97286702d0 state=finished raised RateLimitError>]
!!! Error processing Leadership of Data Annotation 20180304v2.pdf: RetryError[<Future at 0x7a9723307350 state=finished raised RateLimitError>]
!!! Error processing IkeNet.pdf: RetryError[<Future at 0x7a971ba0a710 state=finished raised RateLimitError>]
!!! Error



!!! Error processing Planning for AI Sustainment A Methodology for Maintenance and Cost Management_V5.pdf: RetryError[<Future at 0x7a971ab88d10 state=finished raised RateLimitError>]
!!! Error processing NAP Behavioral Sci Intel.pdf: RetryError[<Future at 0x7a97286f8310 state=finished raised RateLimitError>]




!!! Error processing Encyclopedia of SNA - R Packages.pdf: RetryError[<Future at 0x7a97119a4c10 state=finished raised RateLimitError>]




!!! Error processing FSS-19_paper_137.pdf: RetryError[<Future at 0x7a97221eb8d0 state=finished raised RateLimitError>]




!!! Error processing k-truss.pdf: RetryError[<Future at 0x7a971cc65850 state=finished raised RateLimitError>]
!!! Error processing Food Addiction 20231222 v3.pdf: RetryError[<Future at 0x7a97286b25d0 state=finished raised RateLimitError>]
!!! Error processing LLM_Confidence_Metrics.pdf: RetryError[<Future at 0x7a97230339d0 state=finished raised RateLimitError>]
!!! Error processing Kidney_Behavioral.pdf: RetryError[<Future at 0x7a972867e350 state=finished raised RateLimitError>]
!!! Error processing improving-decision-support-for-organ-transplant.pdf: RetryError[<Future at 0x7a9728665b10 state=finished raised RateLimitError>]




!!! Error processing Parler_Disinformation_Challenge___CMOT_Extended.pdf: RetryError[<Future at 0x7a970c8c5c10 state=finished raised RateLimitError>]
!!! Error processing Political Party Cohesion.pdf: RetryError[<Future at 0x7a97284b45d0 state=finished raised RateLimitError>]
!!! Error processing WEIRD.pdf: RetryError[<Future at 0x7a972867dc50 state=finished raised RateLimitError>]
!!! Error processing A_Complex_Network_Approach_to_Find_Latent_Terorrist_Communities.pdf: RetryError[<Future at 0x7a9710f4a190 state=finished raised RateLimitError>]
!!! Error processing TrainingSetSize.pdf: RetryError[<Future at 0x7a9722375510 state=finished raised RateLimitError>]
!!! Error processing Multi_view_Clustering_for_Social_Based_Data.pdf: RetryError[<Future at 0x7a9722f45890 state=finished raised RateLimitError>]


KeyboardInterrupt: 

In [None]:
# @title Run this cell => Restart the session => Start executing the below cells **(DO NOT EXECUTE THIS CELL AGAIN)**
# Core LangChain and AI ecosystem packages
!pip install -q \
    langchain==0.3.21 \
    huggingface_hub==0.29.3 \
    openai==1.68.2 \
    chromadb==0.6.3 \
    langchain_openai==0.3.10 \
    langchain-community==0.3.20 \
    lark==1.2.2 \
    rank_bm25==0.2.2 \
    numpy==2.1.0 \
    numpy==2.1 \
    scipy==1.15.2 \
    scikit-learn==1.6.1 \
    transformers==4.50.0 \
    pypdf==5.4.0 \
    tiktoken==0.9.0 \
    sentence_transformers==4.0.0
# Install locally or in notebook:
!pip install langchain-community pypdf
!pip install -q langchain-community pypdf
!pip install tenacity

# Then in Python:
from langchain_community.document_loaders import PyPDFLoader

# PyTorch with CUDA 12.4 support
!pip install torch==2.6.0+cu124 torchvision torchaudio --index-url https://download.pytorch.org/whl/cu124

import warnings
warnings.filterwarnings('ignore')

# @title Loading the `config.json` file
import json, os

# Load the JSON file and extract values
file_name = 'config.json'
with open(file_name, 'r') as file:
    config = json.load(file)
    os.environ['OPENAI_API_KEY']  = config.get("API_KEY") # Loading the API Key
    os.environ["OPENAI_BASE_URL"] = config.get("OPENAI_API_BASE") # Loading the API Base Url

# @title Defining the LLM Model - Use `gpt-4o` Model
from langchain_openai import ChatOpenAI
llm = ChatOpenAI(model="gpt-4o", temperature=0)

# from langchain.document_loaders import PyPDFLoader




# Load NOFO and extract topic
from langchain.document_loaders import PyPDFLoader

pdf_file = "/content/NOFO_pdf.pdf"
loader = PyPDFLoader("/content/NOFO_pdf.pdf", mode="single")
NOFO_pdf = loader.load() # Changed from pdf_loader.load()
doc_text = "\n\n".join([doc.page_content for doc in NOFO_pdf])
doc_text = doc_text[:12000]  # Token-safe truncation

# Prompt to extract topic
topic_extraction_prompt = f"""
You are a grant analyst. Carefully read the following NOFO text and identify the **topic** for which funding is being provided.

Only return the topic name. Do not include any explanation, context, or extra text.

Text:
\"\"\"
{doc_text}
\"\"\"
"""

# Initialize topic to a default value before the try block
topic = ""

# Implement retry mechanism for the LLM call
import tenacity
import logging
from openai import RateLimitError # Import RateLimitError for retry logic

@tenacity.retry(
    wait=tenacity.wait_exponential(multiplier=1, min=4, max=60), # Wait exponentially between retries
    stop=tenacity.stop_after_attempt(5), # Stop after 5 attempts
    before_sleep=tenacity.before_sleep_log(logging.getLogger(__name__), logging.INFO), # Log before sleeping
    retry=tenacity.retry_if_exception_type(RateLimitError) # Retry specifically on RateLimitError
)
def invoke_llm_with_retry(prompt):
    """Helper function to invoke LLM with retry logic."""
    return llm.invoke(prompt)

# Call the LLM using the retry function
try:
    topic_extraction = invoke_llm_with_retry(topic_extraction_prompt)
    topic = topic_extraction.content.strip() # Added parentheses to call the strip method
    print("Extracted Topic:", topic)
except RateLimitError as e:
    print(f"Failed to extract topic after multiple retries due to rate limit: {e}")
    # Consider assigning a placeholder or handling this case explicitly
    topic = "Unknown Topic (Rate Limit Error)" # Assign a default or error message
except Exception as e:
    print(f"An unexpected error occurred during topic extraction: {e}")
    # Consider assigning a placeholder or handling this case explicitly
    topic = "Unknown Topic (Processing Error)" # Assign a default or error message

import zipfile
import os

# Unzip papers
with zipfile.ZipFile("/content/ResearchPapers.zip", 'r') as zip_ref:
    zip_ref.extractall("/content/")

# Set research paper folder path
papers_path = "/content/Papers/"


# Reload NOFO for use in prompt
nofo_loader = PyPDFLoader("/content/NOFO_pdf.pdf")
nofo_docs = nofo_loader.load()
NOFO_text = "\n\n".join([doc.page_content for doc in nofo_docs])
NOFO_text = NOFO_text[:12000]

relevance_prompt_template = f"""
You are an expert grant reviewer and research analyst.

Analyze the relevance of the following research paper in relation to the topic, goals, objectives, and funding criteria outlined in the NOFO document.

Your task is to:
1. Determine whether the research aligns with the funding opportunity goals, objectives, and evaluation priorities.
2. Assess whether the research could reasonably be used to support or inspire a viable project proposal under this NOFO.
3. Evaluate the relevance based on domain, methodology, or application.

If the research paper does not relate to the topic by domain, methodology, or intended application, return:
**"Paper not related to topic"**

### NOFO Topic:
{topic}



---

NOFO Document:
\"\"\"
{NOFO_text}
\"\"\"

Research Paper:
\"\"\"
"""  # This ends the prompt template before appending paper text

# Define the helper function for invoking LLM for relevance check outside the loop
@tenacity.retry(
    wait=tenacity.wait_exponential(multiplier=1, min=4, max=60),
    stop=tenacity.stop_after_attempt(5),
    before_sleep=tenacity.before_sleep_log(logging.getLogger(__name__), logging.INFO),
    retry=tenacity.retry_if_exception_type(RateLimitError)
)
def invoke_llm_for_relevance(prompt):
    """Helper function to invoke LLM with retry logic for relevance check."""
    return llm.invoke(prompt)

import tiktoken
import requests
import logging # Import logging for tenacity

from tiktoken import get_encoding
encoding = get_encoding("cl100k_base")
MAX_TOKENS = 120000

documents = []
relevant_papers_count = 0
irrelevant_papers_count = 0
total_files = len([f for f in os.listdir(papers_path) if f.endswith(".pdf")])
progress_cnt = 1

for filename in os.listdir(papers_path):
    if filename.endswith(".pdf"):
        file_path = os.path.join(papers_path, filename)

        try:
            # Load research paper
            paper_loader = PyPDFLoader(file_path,mode="single")
            paper_docs = paper_loader.load()
            paper_text = paper_docs[0].page_content

            # Truncate paper text to fit token limit
            base_tokens = len(encoding.encode(relevance_prompt_template))
            available_tokens = MAX_TOKENS - base_tokens
            truncated_pages = encoding.decode(encoding.encode(paper_text)[:available_tokens])

            # Complete the full prompt
            full_prompt = relevance_prompt_template + truncated_pages + '\n\"\"\"\n\nReturn your answer in the following format:\n- Relevance Summary:\n- Alignment with NOFO Goals:\n- Potential for Use in Proposal Development:'

            # Invoke LLM using the helper function
            response = invoke_llm_for_relevance(full_prompt)
            # print(response.content)

            print(f"Successfully processed: {progress_cnt}/{total_files}")
            progress_cnt += 1

            # if "PAPER RELEVANT TO TOPIC" in response.content:
            #   relevant_papers_count +1


            if "PAPER NOT RELATED TO TOPIC" in response.content:
                irrelevant_papers_count += 1
                continue

            documents.append({
                'title': filename,
                'llm_response': response.content,
                'file_path': file_path
            })
            relevant_papers_count += 1

        except RateLimitError as e:
             print(f"!!! Rate limit error processing {filename}: {str(e)}. Skipping for now.")
             irrelevant_papers_count += 1 # Or handle skipped files differently
        except Exception as e:
            print(f"!!! Error processing {filename}: {str(e)}")
            irrelevant_papers_count += 1 # Consider papers with processing errors as irrelevant for the count

# Summary
print("="*50)
# Make sure 'loader' is still the NOFO loader here
docs = loader.load()
print(docs[0].page_content[:500])  # First 500 characters
print(f"Relevant Papers: {relevant_papers_count}/{total_files}")
print(f"Irrelevant Papers: {irrelevant_papers_count}/{total_files}")

Looking in indexes: https://download.pytorch.org/whl/cu124




An unexpected error occurred during topic extraction: RetryError[<Future at 0x7b5bac128d90 state=finished raised RateLimitError>]




!!! Error processing Multi_view_Clustering_for_Social_Based_Data.pdf: RetryError[<Future at 0x7b5bac047b90 state=finished raised RateLimitError>]
!!! Error processing jfq-110_46-53_Cruickshank.pdf: RetryError[<Future at 0x7b5bad653f10 state=finished raised RateLimitError>]




!!! Error processing BotBuster___AAAI.pdf: RetryError[<Future at 0x7b5bac9ac050 state=finished raised RateLimitError>]
!!! Error processing Political Party Cohesion.pdf: RetryError[<Future at 0x7b5bad354cd0 state=finished raised RateLimitError>]




!!! Error processing Encyclopedia of SNA - R Packages.pdf: RetryError[<Future at 0x7b5baa683990 state=finished raised RateLimitError>]
!!! Error processing Extreme Cohesion Darknet 20190815.pdf: RetryError[<Future at 0x7b5bad39f590 state=finished raised RateLimitError>]
!!! Error processing Sim of Decon.pdf: RetryError[<Future at 0x7b5bac314b50 state=finished raised RateLimitError>]
!!! Error processing DIVERSE_LLM_Dataset___IEEE_Big_Data.pdf: RetryError[<Future at 0x7b5bad9a0f50 state=finished raised RateLimitError>]
!!! Error processing Cohort_Optimization_Methods_SNAMS_2021_working_draft (4).pdf: RetryError[<Future at 0x7b5babfccb50 state=finished raised RateLimitError>]
!!! Error processing Political_Networks_Conference.pdf: RetryError[<Future at 0x7b5bac3fa990 state=finished raised RateLimitError>]
!!! Error processing ICWSM_2025_Political_Bias.pdf: RetryError[<Future at 0x7b5bae00f090 state=finished raised RateLimitError>]




!!! Error processing Take_boards.pdf: RetryError[<Future at 0x7b5bacd3aa10 state=finished raised RateLimitError>]




!!! Error processing HIV.pdf: RetryError[<Future at 0x7b5baca4b650 state=finished raised RateLimitError>]




!!! Error processing AAAI IAA CV.pdf: RetryError[<Future at 0x7b5bad9a22d0 state=finished raised RateLimitError>]
!!! Error processing Knowing the Terrain.pdf: RetryError[<Future at 0x7b5bad6daa90 state=finished raised RateLimitError>]
!!! Error processing NBA Performance.pdf: RetryError[<Future at 0x7b5bacd39e10 state=finished raised RateLimitError>]




!!! Error processing WEIRD.pdf: RetryError[<Future at 0x7b5bac005550 state=finished raised RateLimitError>]
!!! Error processing MOOC 20190828.pdf: RetryError[<Future at 0x7b5bacec6c90 state=finished raised RateLimitError>]
!!! Error processing The ABCs of AI-Enabled Intelligence Analysis - War on the Rocks.pdf: RetryError[<Future at 0x7b5bac1288d0 state=finished raised RateLimitError>]
!!! Error processing Leveraging_AI_to_Improve_Viral_Information_Detection_in_Online_Discourse.pdf: RetryError[<Future at 0x7b5bac047090 state=finished raised RateLimitError>]
!!! Error processing Benson_MA491_NLP.pdf: RetryError[<Future at 0x7b5bad305410 state=finished raised RateLimitError>]
!!! Error processing ONA-in-R.pdf: RetryError[<Future at 0x7b5bac3f8790 state=finished raised RateLimitError>]




!!! Error processing Social Media Mental Health Final.pdf: RetryError[<Future at 0x7b5bac1f05d0 state=finished raised RateLimitError>]
!!! Error processing Dormant Bots 20190814.pdf: RetryError[<Future at 0x7b5bad39f6d0 state=finished raised RateLimitError>]
!!! Error processing CausalOrgInorgContent.pdf: RetryError[<Future at 0x7b5babfcef90 state=finished raised RateLimitError>]
!!! Error processing Dissertation.pdf: RetryError[<Future at 0x7b5bad0bb8d0 state=finished raised RateLimitError>]




!!! Error processing RES2D.pdf: RetryError[<Future at 0x7b5bad3f93d0 state=finished raised RateLimitError>]
!!! Error processing k-truss.pdf: RetryError[<Future at 0x7b5bac124e90 state=finished raised RateLimitError>]
!!! Error processing Multi_Agent_Systems_for_Frame_Detection.pdf: RetryError[<Future at 0x7b5bace38b10 state=finished raised RateLimitError>]
!!! Error processing McCullohCarleyJOSS.pdf: RetryError[<Future at 0x7b5bacb78850 state=finished raised RateLimitError>]
!!! Error processing Limit Velocity.pdf: RetryError[<Future at 0x7b5bac1a3b90 state=finished raised RateLimitError>]
!!! Error processing Hashtag_Revival.pdf: RetryError[<Future at 0x7b5bad3cd010 state=finished raised RateLimitError>]
!!! Error processing Kidney_Behavioral.pdf: RetryError[<Future at 0x7b5bad350dd0 state=finished raised RateLimitError>]
!!! Error processing SocNetChgDet.pdf: RetryError[<Future at 0x7b5bacb36d90 state=finished raised RateLimitError>]




!!! Error processing 2021_EPJ_MVMCInfoOps.pdf: RetryError[<Future at 0x7b5bac374b50 state=finished raised RateLimitError>]
!!! Error processing Review of R Packages_20161026.pdf: RetryError[<Future at 0x7b5bac94ced0 state=finished raised RateLimitError>]
!!! Error processing TrainingSetSize.pdf: RetryError[<Future at 0x7b5bac3744d0 state=finished raised RateLimitError>]
!!! Error processing 23-US-DHS-001.pdf: RetryError[<Future at 0x7b5bac9ada90 state=finished raised RateLimitError>]
!!! Error processing On the Science of Networks.pdf: RetryError[<Future at 0x7b5bac33ef90 state=finished raised RateLimitError>]
!!! Error processing NeuroSynchrony.pdf: RetryError[<Future at 0x7b5bac155410 state=finished raised RateLimitError>]
!!! Error processing Clustering_Analysis_of_Website_Usage_on_Twitter_during_the_COVID_19_Pandemic.pdf: RetryError[<Future at 0x7b5bacb76f90 state=finished raised RateLimitError>]
!!! Error processing Parler_Disinformation_Challenge___CMOT_Extended.pdf: RetryError[<



!!! Error processing Acquiring Maintainable AI_Enable Systems_Final.pdf: RetryError[<Future at 0x7b5bac658090 state=finished raised RateLimitError>]
!!! Error processing SM Customer Feedback_FAB_2019_rev3.pdf: RetryError[<Future at 0x7b5bad37f450 state=finished raised RateLimitError>]




!!! Error processing Sailer McCulloh Soc Net and Spatial Config.pdf: RetryError[<Future at 0x7b5bacb53c90 state=finished raised RateLimitError>]




!!! Error processing NeuroCogInfluence.pdf: RetryError[<Future at 0x7b5bac3e8c50 state=finished raised RateLimitError>]
!!! Error processing COVID Bayesian Data Aug.pdf: RetryError[<Future at 0x7b5bac94d810 state=finished raised RateLimitError>]
!!! Error processing Utility Seeking in Complex Social Systems.pdf: RetryError[<Future at 0x7b5bacd38710 state=finished raised RateLimitError>]
!!! Error processing Characterizing_Communities_of_Hashtag_Usage_on_Twitter_During_the_2020_COVID_19_Pandemic.pdf: RetryError[<Future at 0x7b5baa7e23d0 state=finished raised RateLimitError>]
!!! Error processing Quantifying_Information_Advantage.pdf: RetryError[<Future at 0x7b5bacb01d50 state=finished raised RateLimitError>]
!!! Error processing Confidence_Chaining.pdf: RetryError[<Future at 0x7b5bac33e110 state=finished raised RateLimitError>]
!!! Error processing Text Analysis Using Automated Language Translators.pdf: RetryError[<Future at 0x7b5bad30fc90 state=finished raised RateLimitError>]
!!! Erro



!!! Error processing Kent2022_Chapter_MicroscopicMarkovChainApproach.pdf: RetryError[<Future at 0x7b5bac1aefd0 state=finished raised RateLimitError>]




!!! Error processing Symbolic Generative AI 20231012.pdf: RetryError[<Future at 0x7b5bacec41d0 state=finished raised RateLimitError>]
!!! Error processing cycon-final-draft.pdf: RetryError[<Future at 0x7b5bac65b790 state=finished raised RateLimitError>]
!!! Error processing Helene_and_Milton_ACM.pdf: RetryError[<Future at 0x7b5bad3e34d0 state=finished raised RateLimitError>]
!!! Error processing LSA email.pdf: RetryError[<Future at 0x7b5bad338e90 state=finished raised RateLimitError>]




!!! Error processing Lessons from Advising in Afghanistan.pdf: RetryError[<Future at 0x7b5bac1a08d0 state=finished raised RateLimitError>]
!!! Error processing Food Addiction 20231222 v3.pdf: RetryError[<Future at 0x7b5bac94e9d0 state=finished raised RateLimitError>]
!!! Error processing Social_Det_COVID_Mortality.pdf: RetryError[<Future at 0x7b5bacb19650 state=finished raised RateLimitError>]
!!! Error processing Chat GPT Bias final w copyright.pdf: RetryError[<Future at 0x7b5bac121a10 state=finished raised RateLimitError>]




!!! Error processing IkekNet1.pdf: RetryError[<Future at 0x7b5bac39ab10 state=finished raised RateLimitError>]
!!! Error processing Designed Networks.pdf: RetryError[<Future at 0x7b5bad307410 state=finished raised RateLimitError>]




!!! Error processing LongNetViewerORA.pdf: RetryError[<Future at 0x7b5bad32aa10 state=finished raised RateLimitError>]
!!! Error processing MIPB-CDA.pdf: RetryError[<Future at 0x7b5bac33fa90 state=finished raised RateLimitError>]
!!! Error processing Leadership of Data Annotation 20180304v2.pdf: RetryError[<Future at 0x7b5bad39ef90 state=finished raised RateLimitError>]
!!! Error processing Tweets-to-touchdowns.pdf: RetryError[<Future at 0x7b5bace40450 state=finished raised RateLimitError>]
!!! Error processing Spectral Analysis SNA.pdf: RetryError[<Future at 0x7b5baa882290 state=finished raised RateLimitError>]
!!! Error processing Network Simulation Models.pdf: RetryError[<Future at 0x7b5bacb6b210 state=finished raised RateLimitError>]
!!! Error processing EmergencyResponseAI.pdf: RetryError[<Future at 0x7b5bad37d6d0 state=finished raised RateLimitError>]




!!! Error processing LLM_UQ.pdf: RetryError[<Future at 0x7b5bad304490 state=finished raised RateLimitError>]
!!! Error processing NAP Behavioral Sci Intel.pdf: RetryError[<Future at 0x7b5bad3f8250 state=finished raised RateLimitError>]
!!! Error processing IkeNet.pdf: RetryError[<Future at 0x7b5bad388510 state=finished raised RateLimitError>]
!!! Error processing improving-decision-support-for-organ-transplant.pdf: RetryError[<Future at 0x7b5ba654b010 state=finished raised RateLimitError>]
!!! Error processing Planning for AI Sustainment A Methodology for Maintenance and Cost Management_V5.pdf: RetryError[<Future at 0x7b5bac65a3d0 state=finished raised RateLimitError>]




!!! Error processing docnet.pdf: RetryError[<Future at 0x7b5bac1f2a10 state=finished raised RateLimitError>]
!!! Error processing Simmelian-Gamma-LDA.pdf: RetryError[<Future at 0x7b5bad354b50 state=finished raised RateLimitError>]
!!! Error processing SocNetAlQaeda.pdf: RetryError[<Future at 0x7b5bad651610 state=finished raised RateLimitError>]
!!! Error processing Social_Network_Probability_Mechanics.pdf: RetryError[<Future at 0x7b5bad3105d0 state=finished raised RateLimitError>]
!!! Error processing RatingsVRankings.pdf: RetryError[<Future at 0x7b5bad305810 state=finished raised RateLimitError>]
!!! Error processing MLTEing_Models_for_NIER_at_ICSE_2023.pdf: RetryError[<Future at 0x7b5bac0afa10 state=finished raised RateLimitError>]
!!! Error processing Supply Chain Excellence.pdf: RetryError[<Future at 0x7b5bad31d3d0 state=finished raised RateLimitError>]
!!! Error processing ALL18.pdf: RetryError[<Future at 0x7b5bad31e790 state=finished raised RateLimitError>]
!!! Error processing V



!!! Error processing Lead-Azide.pdf: RetryError[<Future at 0x7b5bac007d50 state=finished raised RateLimitError>]
!!! Error processing ONA-using-igraph.pdf: RetryError[<Future at 0x7b5bac65a990 state=finished raised RateLimitError>]




!!! Error processing LLM_Confidence_Metrics.pdf: RetryError[<Future at 0x7b5bacd5a6d0 state=finished raised RateLimitError>]




!!! Error processing FBI_Recruit_Hire_Final.pdf: RetryError[<Future at 0x7b5bad9abf50 state=finished raised RateLimitError>]
!!! Error processing SecurityPrivAIML.pdf: RetryError[<Future at 0x7b5bad3e1790 state=finished raised RateLimitError>]
!!! Error processing CUSUM Parameterization.pdf: RetryError[<Future at 0x7b5bac197710 state=finished raised RateLimitError>]
!!! Error processing Evolution_of_Terrorism_PNAS.pdf: RetryError[<Future at 0x7b5bad6da310 state=finished raised RateLimitError>]
!!! Error processing Course Info Security.pdf: RetryError[<Future at 0x7b5bad305e10 state=finished raised RateLimitError>]
!!! Error processing Analysis_of_Malware_Communities_Using_Multi_Modal_Features.pdf: RetryError[<Future at 0x7b5baaa9a6d0 state=finished raised RateLimitError>]
!!! Error processing A_Complex_Network_Approach_to_Find_Latent_Terorrist_Communities.pdf: RetryError[<Future at 0x7b5bac3caa10 state=finished raised RateLimitError>]
!!! Error processing Arrow White Paper DExTra.pdf: 



!!! Error processing Genetic_Algorithms_for_Prompt_Optimization.pdf: RetryError[<Future at 0x7b5bad353350 state=finished raised RateLimitError>]




!!! Error processing FSS-19_paper_137.pdf: RetryError[<Future at 0x7b5bac0466d0 state=finished raised RateLimitError>]
!!! Error processing ClassifiersCrowdSource.pdf: RetryError[<Future at 0x7b5bace9ab50 state=finished raised RateLimitError>]
!!! Error processing 2024_ICWSM_Data_Challenge__Post_API_Data_Collection.pdf: RetryError[<Future at 0x7b5bad335550 state=finished raised RateLimitError>]
!!! Error processing Misinformation_Simulation.pdf: RetryError[<Future at 0x7b5bac1586d0 state=finished raised RateLimitError>]
!!! Error processing Organizational risk using network analysis.pdf: RetryError[<Future at 0x7b5bac65aad0 state=finished raised RateLimitError>]




!!! Error processing Overcoming_Social_Media_API_Restrictions__Building_an_Effective_Web_Scraper.pdf: RetryError[<Future at 0x7b5ba7805dd0 state=finished raised RateLimitError>]
Department of Health and Human Services
Part 1. Overview Information
Key Dates
The following table includes NIH standard due dates (https://grants.nih.gov/grants/how-to-apply-application-guide/due-dates-and-submission-policies/due-dates.htm) marked with anasterisk.
Application Due Dates Review and Award Cycles
New
Renewal /Resubmission /Revision (asallowed)
AIDS -New/Renewal/Resubmission/Revision,as allowed Scientiﬁc MeritReview Advisory CouncilReviewEarliest Start Date
February 05, 2025 * March
Relevant Papers: 0/112
Irrelevant Papers: 112/112


In [None]:
from langchain.prompts import PromptTemplate

proposal_prompt = PromptTemplate(
    input_variables=["nofo_summary", "research_chunks"],
    template="""
You are a research strategy assistant tasked with generating 5 actionable and innovative research proposal ideas based on the content of scientific research papers and the requirements outlined in the NOFO.

Using the context provided below, produce exactly **5 distinct project ideas** in the following structured format:

---
1. **Idea {{i}}:** [Concise Title of the Project Idea]
2. **Description:** [Brief and targeted description summarizing the objectives, innovative elements, scientific rationale, and anticipated impact.]
3. **Citation:** [Author(s), Year or Paper Title]
4. **NOFO Alignment:** [List two or more specific NOFO requirements that this idea directly addresses]
5. **File Path of the Research Paper:** [Exact file path, ending in .pdf]
---

**NOFO Summary:**
{nofo_summary}

**Research Paper Chunks (with file paths):**
{research_chunks}

Respond in plain text. Be specific, and make sure file paths and citations align.
"""
)

from langchain.schema.runnable import RunnableParallel, RunnableLambda
from langchain.chains import LLMChain
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import PyPDFLoader
import os

# LLM setup
llm = ChatOpenAI(temperature=0.5)
llm_chain = LLMChain(llm=llm, prompt=proposal_prompt)

# Load and parse PDF files into text chunks + file path
def load_and_combine_docs(file_paths: list):
    docs_with_paths = []
    for path in file_paths:
        loader = PyPDFLoader(path)
        docs = loader.load()
        full_text = "\n".join([doc.page_content for doc in docs])
        docs_with_paths.append(f"Paper: {path}\nContent: {full_text[:1000]}...")  # limit to 1000 chars per paper
    return "\n\n".join(docs_with_paths)

# Lambda wrapper for integration
doc_loader_runnable = RunnableLambda(lambda file_paths: {"research_chunks": load_and_combine_docs(file_paths)})

# Static NOFO summary (can also be dynamic)
nofo_summary_runnable = RunnableLambda(lambda _: {"nofo_summary": nofo_summary_text})

# Compose the pipeline
pipeline = RunnableParallel({
    "nofo_summary": nofo_summary_runnable,
    "research_chunks": doc_loader_runnable
}) | llm_chain

file_list = [
    "/content/research_papers/brain_ai_2023.pdf",
    "/content/research_papers/equity_neuro_biomarkers.pdf"
]

response = pipeline.invoke(file_list)
print(response)

NameError: name 'nofo_summary_text' is not defined

In [None]:
from langchain.prompts import PromptTemplate
from langchain.schema.runnable import RunnableParallel, RunnableLambda
from langchain.chains import LLMChain
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import PyPDFLoader
import os

proposal_prompt = PromptTemplate(
    input_variables=["nofo_summary", "research_chunks"],
    template="""
You are a research strategy assistant tasked with generating 5 actionable and innovative research proposal ideas based on the content of scientific research papers and the requirements outlined in the NOFO.

Using the context provided below, produce exactly **5 distinct project ideas** in the following structured format:

---
1. **Idea {{i}}:** [Concise Title of the Project Idea]
2. **Description:** [Brief and targeted description summarizing the objectives, innovative elements, scientific rationale, and anticipated impact.]
3. **Citation:** [Author(s), Year or Paper Title]
4. **NOFO Alignment:** [List two or more specific NOFO requirements that this idea directly addresses]
5. **File Path of the Research Paper:** [Exact file path, ending in .pdf]
---

**NOFO Summary:**
{nofo_summary}

**Research Paper Chunks (with file paths):**
{research_chunks}

Respond in plain text. Be specific, and make sure file paths and citations align.
"""
)

from langchain.schema.runnable import RunnableParallel, RunnableLambda
from langchain.chains import LLMChain
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import PyPDFLoader
import os

# LLM setup
llm = ChatOpenAI(temperature=0.5)
llm_chain = LLMChain(llm=llm, prompt=proposal_prompt)

# Load and parse PDF files into text chunks + file path
def load_and_combine_docs(file_paths: list):
    docs_with_paths = []
    for path in file_paths:
        loader = PyPDFLoader(path)
        docs = loader.load()
        full_text = "\n".join([doc.page_content for doc in docs])
        docs_with_paths.append(f"Paper: {path}\nContent: {full_text[:1000]}...")  # limit to 1000 chars per paper
    return "\n\n".join(docs_with_paths)

# Lambda wrapper for integration
doc_loader_runnable = RunnableLambda(lambda file_paths: {"research_chunks": load_and_combine_docs(file_paths)})

# Define nofo_summary_text using the previously loaded NOFO_text
# Make sure NOFO_text is accessible in this cell or pass it appropriately
# For simplicity, assuming NOFO_text is available from a previous cell's execution
# If not, you would need to load or pass it here.
try:
    # Attempt to use the NOFO_text from the previous cell
    # If running as separate cells, this variable must be defined in a preceding cell
    nofo_summary_text = NOFO_text
except NameError:
    # Handle the case where NOFO_text is not defined (e.g., if cells are run out of order)
    print("Warning: NOFO_text not found. Loading NOFO again for summary.")
    # Reload NOFO for use in prompt if NOFO_text is not available
    nofo_loader = PyPDFLoader("/content/NOFO_pdf.pdf")
    nofo_docs = nofo_loader.load()
    nofo_summary_text = "\n\n".join([doc.page_content for doc in nofo_docs])
    nofo_summary_text = nofo_summary_text[:12000] # Apply truncation again


# Static NOFO summary (can also be dynamic)
nofo_summary_runnable = RunnableLambda(lambda _: {"nofo_summary": nofo_summary_text})

# Compose the pipeline
pipeline = RunnableParallel({
    "nofo_summary": nofo_summary_runnable,
    "research_chunks": doc_loader_runnable
}) | llm_chain

file_list = [
    "/content/research_papers/brain_ai_2023.pdf",
    "/content/research_papers/equity_neuro_biomarkers.pdf"
]

response = pipeline.invoke(file_list)
print(response)

ValueError: File path /content/research_papers/brain_ai_2023.pdf is not a valid file or url

In [None]:
from langchain.prompts import PromptTemplate
from langchain.schema.runnable import RunnableParallel, RunnableLambda
from langchain.chains import LLMChain
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import PyPDFLoader
import os

proposal_prompt = PromptTemplate(
    input_variables=["nofo_summary", "research_chunks"],
    template="""
You are a research strategy assistant tasked with generating 5 actionable and innovative research proposal ideas based on the content of scientific research papers and the requirements outlined in the NOFO.

Using the context provided below, produce exactly **5 distinct project ideas** in the following structured format:

---
1. **Idea {{i}}:** [Concise Title of the Project Idea]
2. **Description:** [Brief and targeted description summarizing the objectives, innovative elements, scientific rationale, and anticipated impact.]
3. **Citation:** [Author(s), Year or Paper Title]
4. **NOFO Alignment:** [List two or more specific NOFO requirements that this idea directly addresses]
5. **File Path of the Research Paper:** [Exact file path, ending in .pdf]
---

**NOFO Summary:**
{nofo_summary}

**Research Paper Chunks (with file paths):**
{research_chunks}

Respond in plain text. Be specific, and make sure file paths and citations align.
"""
)

from langchain.schema.runnable import RunnableParallel, RunnableLambda
from langchain.chains import LLMChain
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import PyPDFLoader
import os

# LLM setup
llm = ChatOpenAI(temperature=0.5)
llm_chain = LLMChain(llm=llm, prompt=proposal_prompt)

# Load and parse PDF files into text chunks + file path
def load_and_combine_docs(file_paths: list):
    docs_with_paths = []
    for path in file_paths:
        loader = PyPDFLoader(path)
        docs = loader.load()
        full_text = "\n".join([doc.page_content for doc in docs])
        docs_with_paths.append(f"Paper: {path}\nContent: {full_text[:1000]}...")  # limit to 1000 chars per paper
    return "\n\n".join(docs_with_paths)

# Lambda wrapper for integration
doc_loader_runnable = RunnableLambda(lambda file_paths: {"research_chunks": load_and_combine_docs(file_paths)})

# Define nofo_summary_text using the previously loaded NOFO_text
# Make sure NOFO_text is accessible in this cell or pass it appropriately
# For simplicity, assuming NOFO_text is available from a previous cell's execution
# If not, you would need to load or pass it here.
try:
    # Attempt to use the NOFO_text from the previous cell
    # If running as separate cells, this variable must be defined in a preceding cell
    nofo_summary_text = NOFO_text
except NameError:
    # Handle the case where NOFO_text is not defined (e.g., if cells are run out of order)
    print("Warning: NOFO_text not found. Loading NOFO again for summary.")
    # Reload NOFO for use in prompt if NOFO_text is not available
    nofo_loader = PyPDFLoader("/content/NOFO_pdf.pdf")
    nofo_docs = nofo_loader.load()
    nofo_summary_text = "\n\n".join([doc.page_content for doc in nofo_docs])
    nofo_summary_text = nofo_summary_text[:12000] # Apply truncation again


# Static NOFO summary (can also be dynamic)
nofo_summary_runnable = RunnableLambda(lambda _: {"nofo_summary": nofo_summary_text})

# Compose the pipeline
pipeline = RunnableParallel({
    "nofo_summary": nofo_summary_runnable,
    "research_chunks": doc_loader_runnable
}) | llm_chain

# Dynamically build the file_list from the correct directory
# Ensure papers_path is accessible (it should be from the preceding cells)
try:
    papers_path
except NameError:
    print("Error: papers_path is not defined. Please ensure the previous cells were run.")
    # You might need to define papers_path here if the previous cell wasn't run.
    # For example: papers_path = "/content/Papers/" # Or the correct path

# Build the list of PDF files from the correct directory
file_list = [os.path.join(papers_path, f) for f in os.listdir(papers_path) if f.endswith(".pdf")]


# file_list = [
#   "/content/papers/brain_ai_2023.pdf",
#   "/content/papers/equity_neuro_biomarkers.pdf"
# ]

response = pipeline.invoke(file_list)
print(response)



RateLimitError: Error code: 429 - {'reason': {'error': 'You exceeded your current quota!!'}}

In [None]:
from langchain.prompts import PromptTemplate
from langchain.schema.runnable import RunnableParallel, RunnableLambda
from langchain.chains import LLMChain
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import PyPDFLoader
import os

import tenacity
import logging
from openai import RateLimitError # Import RateLimitError for retry logic
import tiktoken

# Ensure logging is configured for tenacity
logging.basicConfig(level=logging.INFO)

proposal_prompt = PromptTemplate(
    input_variables=["nofo_summary", "research_chunks"],
    template="""
You are a research strategy assistant tasked with generating 5 actionable and innovative research proposal ideas based on the content of scientific research papers and the requirements outlined in the NOFO.

Using the context provided below, produce exactly **5 distinct project ideas** in the following structured format:

---
1. **Idea {{i}}:** [Concise Title of the Project Idea]
2. **Description:** [Brief and targeted description summarizing the objectives, innovative elements, scientific rationale, and anticipated impact.]
3. **Citation:** [Author(s), Year or Paper Title]
4. **NOFO Alignment:** [List two or more specific NOFO requirements that this idea directly addresses]
5. **File Path of the Research Paper:** [Exact file path, ending in .pdf]
---

**NOFO Summary:**
{nofo_summary}

**Research Paper Chunks (with file paths):**
{research_chunks}

Respond in plain text. Be specific, and make sure file paths and citations align.
"""
)

from langchain.schema.runnable import RunnableParallel, RunnableLambda
from langchain.chains import LLMChain
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import PyPDFLoader
import os

# LLM setup
llm = ChatOpenAI(temperature=0.5)
llm_chain = LLMChain(llm=llm, prompt=proposal_prompt)

# Load and parse PDF files into text chunks + file path
def load_and_combine_docs(file_paths: list):
    docs_with_paths = []
    for path in file_paths:
		try:
            loader = PyPDFLoader(path)
            docs = loader.load()
            full_text = "\n".join([doc.page_content for doc in docs])
            docs_with_paths.append(f"Paper: {path}\nContent: {full_text[:2000]}...") # Increased chunk size for potential benefit
        except Exception as e:
            print(f"Error loading or processing {path}: {e}")
            # Optionally skip this file or add a placeholder
            docs_with_paths.append(f"Paper: {path}\nContent: Error loading file.")
    return "\n\n".join(docs_with_paths)


    #     loader = PyPDFLoader(path)
    #   docs = loader.load()
    #   full_text = "\n".join([doc.page_content for doc in docs])
    #   docs_with_paths.append(f"Paper: {path}\nContent: {full_text[:1000]}...")  # limit to 1000 chars per paper
    # return "\n\n".join(docs_with_paths)

# Lambda wrapper for integration
doc_loader_runnable = RunnableLambda(lambda file_paths: {"research_chunks": load_and_combine_docs(file_paths)})

# Define nofo_summary_text using the previously loaded NOFO_text
# Make sure NOFO_text is accessible in this cell or pass it appropriately
# For simplicity, assuming NOFO_text is available from a previous cell's execution
# If not, you would need to load or pass it here.
try:
    # Attempt to use the NOFO_text from the previous cell
    # If running as separate cells, this variable must be defined in a preceding cell
    nofo_summary_text = NOFO_text
except NameError:
    # Handle the case where NOFO_text is not defined (e.g., if cells are run out of order)
    print("Warning: NOFO_text not found. Loading NOFO again for summary.")
    # Reload NOFO for use in prompt if NOFO_text is not available
    nofo_loader = PyPDFLoader("/content/NOFO_pdf.pdf")
    nofo_docs = nofo_loader.load()
    nofo_summary_text = "\n\n".join([doc.page_content for doc in nofo_docs])
    nofo_summary_text = nofo_summary_text[:12000] # Apply truncation again


# Static NOFO summary (can also be dynamic)
nofo_summary_runnable = RunnableLambda(lambda _: {"nofo_summary": nofo_summary_text})

# Compose the pipeline
pipeline = RunnableParallel({
    "nofo_summary": nofo_summary_runnable,
    "research_chunks": doc_loader_runnable
}) | llm_chain

# Dynamically build the file_list from the correct directory
# Ensure papers_path is accessible (it should be from the preceding cells)
try:
    papers_path
except NameError:
    print("Error: papers_path is not defined. Please ensure the previous cells were run.")
    # You might need to define papers_path here if the previous cell wasn't run.
    # For example: papers_path = "/content/Papers/" # Or the correct path

# Build the list of PDF files from the correct directory
file_list = [os.path.join(papers_path, f) for f in os.listdir(papers_path) if f.endswith(".pdf")]

# Implement retry mechanism for the main pipeline execution
@tenacity.retry(
    wait=tenacity.wait_exponential(multiplier=1, min=4, max=60), # Wait exponentially between retries
    stop=tenacity.stop_after_attempt(5), # Stop after 5 attempts
    before_sleep=tenacity.before_sleep_log(logging.getLogger(__name__), logging.INFO), # Log before sleeping
    retry=tenacity.retry_if_exception_type(RateLimitError) # Retry specifically on RateLimitError
)
def invoke_pipeline_with_retry(file_list):
    """Helper function to invoke the pipeline with retry logic."""
    return pipeline.invoke(file_list)

# Call the pipeline using the retry function
try:
    response = invoke_pipeline_with_retry(file_list)
    print(response)
except RateLimitError as e:
    print(f"Failed to generate proposal ideas after multiple retries due to rate limit: {e}")
except Exception as e:
    print(f"An unexpected error occurred during pipeline execution: {e}")


# file_list = [
#   "/content/papers/brain_ai_2023.pdf",
#   "/content/papers/equity_neuro_biomarkers.pdf"
# ]

# response = pipeline.invoke(file_list)
# print(response)

IndentationError: unindent does not match any outer indentation level (<tokenize>, line 56)

In [None]:
from langchain.prompts import PromptTemplate
from langchain.schema.runnable import RunnableParallel, RunnableLambda
from langchain.chains import LLMChain
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import PyPDFLoader
import os

import tenacity
import logging
from openai import RateLimitError # Import RateLimitError for retry logic
import tiktoken

# Ensure logging is configured for tenacity
logging.basicConfig(level=logging.INFO)

proposal_prompt = PromptTemplate(
    input_variables=["nofo_summary", "research_chunks"],
    template="""
You are a research strategy assistant tasked with generating 5 actionable and innovative research proposal ideas based on the content of scientific research papers and the requirements outlined in the NOFO.

Using the context provided below, produce exactly **5 distinct project ideas** in the following structured format:

---
1. **Idea {{i}}:** [Concise Title of the Project Idea]
2. **Description:** [Brief and targeted description summarizing the objectives, innovative elements, scientific rationale, and anticipated impact.]
3. **Citation:** [Author(s), Year or Paper Title]
4. **NOFO Alignment:** [List two or more specific NOFO requirements that this idea directly addresses]
5. **File Path of the Research Paper:** [Exact file path, ending in .pdf]
---

**NOFO Summary:**
{nofo_summary}

**Research Paper Chunks (with file paths):**
{research_chunks}

Respond in plain text. Be specific, and make sure file paths and citations align.
"""
)

from langchain.schema.runnable import RunnableParallel, RunnableLambda
from langchain.chains import LLMChain
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import PyPDFLoader
import os

# LLM setup
llm = ChatOpenAI(temperature=0.5)
llm_chain = LLMChain(llm=llm, prompt=proposal_prompt)

# Load and parse PDF files into text chunks + file path
def load_and_combine_docs(file_paths: list):
    docs_with_paths = []
    for path in file_paths:
		try:
            loader = PyPDFLoader(path)
            docs = loader.load()
            full_text = "\n".join([doc.page_content for doc in docs])
            docs_with_paths.append(f"Paper: {path}\nContent: {full_text[:2000]}...") # Increased chunk size for potential benefit
        except Exception as e:
            print(f"Error loading or processing {path}: {e}")
            # Optionally skip this file or add a placeholder
            docs_with_paths.append(f"Paper: {path}\nContent: Error loading file.")
    return "\n\n".join(docs_with_paths)


    #     loader = PyPDFLoader(path)
    #   docs = loader.load()
    #   full_text = "\n".join([doc.page_content for doc in docs])
    #   docs_with_paths.append(f"Paper: {path}\nContent: {full_text[:1000]}...")  # limit to 1000 chars per paper
    # return "\n\n".join(docs_with_paths)

# Lambda wrapper for integration
doc_loader_runnable = RunnableLambda(lambda file_paths: {"research_chunks": load_and_combine_docs(file_paths)})

# Define nofo_summary_text using the previously loaded NOFO_text
# Make sure NOFO_text is accessible in this cell or pass it appropriately
# For simplicity, assuming NOFO_text is available from a previous cell's execution
# If not, you would need to load or pass it here.
try:
    # Attempt to use the NOFO_text from the previous cell
    # If running as separate cells, this variable must be defined in a preceding cell
    nofo_summary_text = NOFO_text
except NameError:
    # Handle the case where NOFO_text is not defined (e.g., if cells are run out of order)
    print("Warning: NOFO_text not found. Loading NOFO again for summary.")
    # Reload NOFO for use in prompt if NOFO_text is not available
    nofo_loader = PyPDFLoader("/content/NOFO_pdf.pdf")
    nofo_docs = nofo_loader.load()
    nofo_summary_text = "\n\n".join([doc.page_content for doc in nofo_docs])
    nofo_summary_text = nofo_summary_text[:12000] # Apply truncation again


# Static NOFO summary (can also be dynamic)
nofo_summary_runnable = RunnableLambda(lambda _: {"nofo_summary": nofo_summary_text})

# Compose the pipeline
pipeline = RunnableParallel({
    "nofo_summary": nofo_summary_runnable,
    "research_chunks": doc_loader_runnable
}) | llm_chain

# Dynamically build the file_list from the correct directory
# Ensure papers_path is accessible (it should be from the preceding cells)
try:
    papers_path
except NameError:
    print("Error: papers_path is not defined. Please ensure the previous cells were run.")
    # You might need to define papers_path here if the previous cell wasn't run.
    # For example: papers_path = "/content/Papers/" # Or the correct path

# Build the list of PDF files from the correct directory
file_list = [os.path.join(papers_path, f) for f in os.listdir(papers_path) if f.endswith(".pdf")]

# Implement retry mechanism for the main pipeline execution
@tenacity.retry(
    wait=tenacity.wait_exponential(multiplier=1, min=4, max=60), # Wait exponentially between retries
    stop=tenacity.stop_after_attempt(5), # Stop after 5 attempts
    before_sleep=tenacity.before_sleep_log(logging.getLogger(__name__), logging.INFO), # Log before sleeping
    retry=tenacity.retry_if_exception_type(RateLimitError) # Retry specifically on RateLimitError
)
def invoke_pipeline_with_retry(file_list):
    """Helper function to invoke the pipeline with retry logic."""
    return pipeline.invoke(file_list)

# Call the pipeline using the retry function
try:
    response = invoke_pipeline_with_retry(file_list)
    print(response)
except RateLimitError as e:
    print(f"Failed to generate proposal ideas after multiple retries due to rate limit: {e}")
except Exception as e:
    print(f"An unexpected error occurred during pipeline execution: {e}")


# file_list = [
#   "/content/papers/brain_ai_2023.pdf",
#   "/content/papers/equity_neuro_biomarkers.pdf"
# ]

# response = pipeline.invoke(file_list)
# print(response)

IndentationError: unindent does not match any outer indentation level (<tokenize>, line 56)

In [None]:
from langchain.prompts import PromptTemplate
from langchain.schema.runnable import RunnableParallel, RunnableLambda
from langchain.chains import LLMChain
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import PyPDFLoader
import os

import tenacity
import logging
from openai import RateLimitError # Import RateLimitError for retry logic
import tiktoken

# Ensure logging is configured for tenacity
logging.basicConfig(level=logging.INFO)

proposal_prompt = PromptTemplate(
    input_variables=["nofo_summary", "research_chunks"],
    template="""
You are a research strategy assistant tasked with generating 5 actionable and innovative research proposal ideas based on the content of scientific research papers and the requirements outlined in the NOFO.

Using the context provided below, produce exactly **5 distinct project ideas** in the following structured format:

---
1. **Idea {{i}}:** [Concise Title of the Project Idea]
2. **Description:** [Brief and targeted description summarizing the objectives, innovative elements, scientific rationale, and anticipated impact.]
3. **Citation:** [Author(s), Year or Paper Title]
4. **NOFO Alignment:** [List two or more specific NOFO requirements that this idea directly addresses]
5. **File Path of the Research Paper:** [Exact file path, ending in .pdf]
---

**NOFO Summary:**
{nofo_summary}

**Research Paper Chunks (with file paths):**
{research_chunks}

Respond in plain text. Be specific, and make sure file paths and citations align.
"""
)

from langchain.schema.runnable import RunnableParallel, RunnableLambda
from langchain.chains import LLMChain
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import PyPDFLoader
import os

# LLM setup
llm = ChatOpenAI(temperature=0.5)
llm_chain = LLMChain(llm=llm, prompt=proposal_prompt)

# Load and parse PDF files into text chunks + file path
def load_and_combine_docs(file_paths: list):
    docs_with_paths = []
    for path in file_paths:
		try:
              loader = PyPDFLoader(path)
            docs = loader.load()
            full_text = "\n".join([doc.page_content for doc in docs])
            docs_with_paths.append(f"Paper: {path}\nContent: {full_text[:2000]}...") # Increased chunk size for potential benefit
        except Exception as e:
            print(f"Error loading or processing {path}: {e}")
            # Optionally skip this file or add a placeholder
            docs_with_paths.append(f"Paper: {path}\nContent: Error loading file.")
    return "\n\n".join(docs_with_paths)


    #     loader = PyPDFLoader(path)
    #   docs = loader.load()
    #   full_text = "\n".join([doc.page_content for doc in docs])
    #   docs_with_paths.append(f"Paper: {path}\nContent: {full_text[:1000]}...")  # limit to 1000 chars per paper
    # return "\n\n".join(docs_with_paths)

# Lambda wrapper for integration
doc_loader_runnable = RunnableLambda(lambda file_paths: {"research_chunks": load_and_combine_docs(file_paths)})

# Define nofo_summary_text using the previously loaded NOFO_text
# Make sure NOFO_text is accessible in this cell or pass it appropriately
# For simplicity, assuming NOFO_text is available from a previous cell's execution
# If not, you would need to load or pass it here.
try:
    # Attempt to use the NOFO_text from the previous cell
    # If running as separate cells, this variable must be defined in a preceding cell
    nofo_summary_text = NOFO_text
except NameError:
    # Handle the case where NOFO_text is not defined (e.g., if cells are run out of order)
    print("Warning: NOFO_text not found. Loading NOFO again for summary.")
    # Reload NOFO for use in prompt if NOFO_text is not available
    nofo_loader = PyPDFLoader("/content/NOFO_pdf.pdf")
    nofo_docs = nofo_loader.load()
    nofo_summary_text = "\n\n".join([doc.page_content for doc in nofo_docs])
    nofo_summary_text = nofo_summary_text[:12000] # Apply truncation again


# Static NOFO summary (can also be dynamic)
nofo_summary_runnable = RunnableLambda(lambda _: {"nofo_summary": nofo_summary_text})

# Compose the pipeline
pipeline = RunnableParallel({
    "nofo_summary": nofo_summary_runnable,
    "research_chunks": doc_loader_runnable
}) | llm_chain

# Dynamically build the file_list from the correct directory
# Ensure papers_path is accessible (it should be from the preceding cells)
try:
    papers_path
except NameError:
    print("Error: papers_path is not defined. Please ensure the previous cells were run.")
    # You might need to define papers_path here if the previous cell wasn't run.
    # For example: papers_path = "/content/Papers/" # Or the correct path

# Build the list of PDF files from the correct directory
file_list = [os.path.join(papers_path, f) for f in os.listdir(papers_path) if f.endswith(".pdf")]

# Implement retry mechanism for the main pipeline execution
@tenacity.retry(
    wait=tenacity.wait_exponential(multiplier=1, min=4, max=60), # Wait exponentially between retries
    stop=tenacity.stop_after_attempt(5), # Stop after 5 attempts
    before_sleep=tenacity.before_sleep_log(logging.getLogger(__name__), logging.INFO), # Log before sleeping
    retry=tenacity.retry_if_exception_type(RateLimitError) # Retry specifically on RateLimitError
)
def invoke_pipeline_with_retry(file_list):
    """Helper function to invoke the pipeline with retry logic."""
    return pipeline.invoke(file_list)

# Call the pipeline using the retry function
try:
    response = invoke_pipeline_with_retry(file_list)
    print(response)
except RateLimitError as e:
    print(f"Failed to generate proposal ideas after multiple retries due to rate limit: {e}")
except Exception as e:
    print(f"An unexpected error occurred during pipeline execution: {e}")


# file_list = [
#   "/content/papers/brain_ai_2023.pdf",
#   "/content/papers/equity_neuro_biomarkers.pdf"
# ]

# response = pipeline.invoke(file_list)

IndentationError: unindent does not match any outer indentation level (<tokenize>, line 56)

In [None]:
from langchain.prompts import PromptTemplate
from langchain.schema.runnable import RunnableParallel, RunnableLambda
from langchain.chains import LLMChain
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import PyPDFLoader
import os

import tenacity
import logging
from openai import RateLimitError # Import RateLimitError for retry logic
import tiktoken

# Ensure logging is configured for tenacity
logging.basicConfig(level=logging.INFO)

proposal_prompt = PromptTemplate(
    input_variables=["nofo_summary", "research_chunks"],
    template="""
You are a research strategy assistant tasked with generating 5 actionable and innovative research proposal ideas based on the content of scientific research papers and the requirements outlined in the NOFO.

Using the context provided below, produce exactly **5 distinct project ideas** in the following structured format:

---
1. **Idea {{i}}:** [Concise Title of the Project Idea]
2. **Description:** [Brief and targeted description summarizing the objectives, innovative elements, scientific rationale, and anticipated impact.]
3. **Citation:** [Author(s), Year or Paper Title]
4. **NOFO Alignment:** [List two or more specific NOFO requirements that this idea directly addresses]
5. **File Path of the Research Paper:** [Exact file path, ending in .pdf]
---

**NOFO Summary:**
{nofo_summary}

**Research Paper Chunks (with file paths):**
{research_chunks}

Respond in plain text. Be specific, and make sure file paths and citations align.
"""
)

from langchain.schema.runnable import RunnableParallel, RunnableLambda
from langchain.chains import LLMChain
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import PyPDFLoader
import os

# LLM setup
llm = ChatOpenAI(temperature=0.5)
llm_chain = LLMChain(llm=llm, prompt=proposal_prompt)

# Load and parse PDF files into text chunks + file path
def load_and_combine_docs(file_paths: list):
    docs_with_paths = []
    for path in file_paths:
		try:
        loader = PyPDFLoader(path)
        docs = loader.load()
        full_text = "\n".join([doc.page_content for doc in docs])
        docs_with_paths.append(f"Paper: {path}\nContent: {full_text[:2000]}...") # Increased chunk size for potential benefit
        # except Exception as e:
            # print(f"Error loading or processing {path}: {e}")
            # Optionally skip this file or add a placeholder
            # docs_with_paths.append(f"Paper: {path}\nContent: Error loading file.")
    return "\n\n".join(docs_with_paths)


    #     loader = PyPDFLoader(path)
    #   docs = loader.load()
    #   full_text = "\n".join([doc.page_content for doc in docs])
    #   docs_with_paths.append(f"Paper: {path}\nContent: {full_text[:1000]}...")  # limit to 1000 chars per paper
    # return "\n\n".join(docs_with_paths)

# Lambda wrapper for integration
doc_loader_runnable = RunnableLambda(lambda file_paths: {"research_chunks": load_and_combine_docs(file_paths)})

# Define nofo_summary_text using the previously loaded NOFO_text
# Make sure NOFO_text is accessible in this cell or pass it appropriately
# For simplicity, assuming NOFO_text is available from a previous cell's execution
# If not, you would need to load or pass it here.
try:
    # Attempt to use the NOFO_text from the previous cell
    # If running as separate cells, this variable must be defined in a preceding cell
    nofo_summary_text = NOFO_text
except NameError:
    # Handle the case where NOFO_text is not defined (e.g., if cells are run out of order)
    print("Warning: NOFO_text not found. Loading NOFO again for summary.")
    # Reload NOFO for use in prompt if NOFO_text is not available
    nofo_loader = PyPDFLoader("/content/NOFO_pdf.pdf")
    nofo_docs = nofo_loader.load()
    nofo_summary_text = "\n\n".join([doc.page_content for doc in nofo_docs])
    nofo_summary_text = nofo_summary_text[:12000] # Apply truncation again


# Static NOFO summary (can also be dynamic)
nofo_summary_runnable = RunnableLambda(lambda _: {"nofo_summary": nofo_summary_text})

# Compose the pipeline
pipeline = RunnableParallel({
    "nofo_summary": nofo_summary_runnable,
    "research_chunks": doc_loader_runnable
}) | llm_chain

# Dynamically build the file_list from the correct directory
# Ensure papers_path is accessible (it should be from the preceding cells)
try:
    papers_path
except NameError:
    print("Error: papers_path is not defined. Please ensure the previous cells were run.")
    # You might need to define papers_path here if the previous cell wasn't run.
    # For example: papers_path = "/content/Papers/" # Or the correct path

# Build the list of PDF files from the correct directory
file_list = [os.path.join(papers_path, f) for f in os.listdir(papers_path) if f.endswith(".pdf")]

# Implement retry mechanism for the main pipeline execution
@tenacity.retry(
    wait=tenacity.wait_exponential(multiplier=1, min=4, max=60), # Wait exponentially between retries
    stop=tenacity.stop_after_attempt(5), # Stop after 5 attempts
    before_sleep=tenacity.before_sleep_log(logging.getLogger(__name__), logging.INFO), # Log before sleeping
    retry=tenacity.retry_if_exception_type(RateLimitError) # Retry specifically on RateLimitError
)
def invoke_pipeline_with_retry(file_list):
    """Helper function to invoke the pipeline with retry logic."""
    return pipeline.invoke(file_list)

# Call the pipeline using the retry function
try:
    response = invoke_pipeline_with_retry(file_list)
    print(response)
except RateLimitError as e:
    print(f"Failed to generate proposal ideas after multiple retries due to rate limit: {e}")
except Exception as e:
    print(f"An unexpected error occurred during pipeline execution: {e}")


# file_list = [
#   "/content/papers/brain_ai_2023.pdf",
#   "/content/papers/equity_neuro_biomarkers.pdf"
# ]

# response = pipeline.invoke(file_list)
# print(response)

IndentationError: unindent does not match any outer indentation level (<tokenize>, line 56)

In [None]:
from langchain.prompts import PromptTemplate
from langchain.schema.runnable import RunnableParallel, RunnableLambda
from langchain.chains import LLMChain
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import PyPDFLoader
import os

import tenacity
import logging
from openai import RateLimitError # Import RateLimitError for retry logic
import tiktoken

# Ensure logging is configured for tenacity
logging.basicConfig(level=logging.INFO)

proposal_prompt = PromptTemplate(
    input_variables=["nofo_summary", "research_chunks"],
    template="""
You are a research strategy assistant tasked with generating 5 actionable and innovative research proposal ideas based on the content of scientific research papers and the requirements outlined in the NOFO.

Using the context provided below, produce exactly **5 distinct project ideas** in the following structured format:

---
1. **Idea {{i}}:** [Concise Title of the Project Idea]
2. **Description:** [Brief and targeted description summarizing the objectives, innovative elements, scientific rationale, and anticipated impact.]
3. **Citation:** [Author(s), Year or Paper Title]
4. **NOFO Alignment:** [List two or more specific NOFO requirements that this idea directly addresses]
5. **File Path of the Research Paper:** [Exact file path, ending in .pdf]
---

**NOFO Summary:**
{nofo_summary}

**Research Paper Chunks (with file paths):**
{research_chunks}

Respond in plain text. Be specific, and make sure file paths and citations align.
"""
)

from langchain.schema.runnable import RunnableParallel, RunnableLambda
from langchain.chains import LLMChain
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import PyPDFLoader
import os

# LLM setup
llm = ChatOpenAI(temperature=0.5)
llm_chain = LLMChain(llm=llm, prompt=proposal_prompt)

# Load and parse PDF files into text chunks + file path
def load_and_combine_docs(file_paths: list):
    docs_with_paths = []
    for path in file_paths:
		try:
        loader = PyPDFLoader(path)
        docs = loader.load()
        full_text = "\n".join([doc.page_content for doc in docs])
        docs_with_paths.append(f"Paper: {path}\nContent: {full_text[:2000]}...") # Increased chunk size for potential benefit
        # except Exception as e:
            # print(f"Error loading or processing {path}: {e}")
            # Optionally skip this file or add a placeholder
            # docs_with_paths.append(f"Paper: {path}\nContent: Error loading file.")
    return "\n\n".join(docs_with_paths)


    #     loader = PyPDFLoader(path)
    #   docs = loader.load()
    #   full_text = "\n".join([doc.page_content for doc in docs])
    #   docs_with_paths.append(f"Paper: {path}\nContent: {full_text[:1000]}...")  # limit to 1000 chars per paper
    # return "\n\n".join(docs_with_paths)

# Lambda wrapper for integration
doc_loader_runnable = RunnableLambda(lambda file_paths: {"research_chunks": load_and_combine_docs(file_paths)})

# Define nofo_summary_text using the previously loaded NOFO_text
# Make sure NOFO_text is accessible in this cell or pass it appropriately
# For simplicity, assuming NOFO_text is available from a previous cell's execution
# If not, you would need to load or pass it here.
try:
    # Attempt to use the NOFO_text from the previous cell
    # If running as separate cells, this variable must be defined in a preceding cell
    nofo_summary_text = NOFO_text
except NameError:
    # Handle the case where NOFO_text is not defined (e.g., if cells are run out of order)
    print("Warning: NOFO_text not found. Loading NOFO again for summary.")
    # Reload NOFO for use in prompt if NOFO_text is not available
    nofo_loader = PyPDFLoader("/content/NOFO_pdf.pdf")
    nofo_docs = nofo_loader.load()
    nofo_summary_text = "\n\n".join([doc.page_content for doc in nofo_docs])
    nofo_summary_text = nofo_summary_text[:12000] # Apply truncation again


# Static NOFO summary (can also be dynamic)
nofo_summary_runnable = RunnableLambda(lambda _: {"nofo_summary": nofo_summary_text})

# Compose the pipeline
pipeline = RunnableParallel({
    "nofo_summary": nofo_summary_runnable,
    "research_chunks": doc_loader_runnable
}) | llm_chain

# Dynamically build the file_list from the correct directory
# Ensure papers_path is accessible (it should be from the preceding cells)
try:
    papers_path
except NameError:
    print("Error: papers_path is not defined. Please ensure the previous cells were run.")
    # You might need to define papers_path here if the previous cell wasn't run.
    # For example: papers_path = "/content/Papers/" # Or the correct path

# Build the list of PDF files from the correct directory
file_list = [os.path.join(papers_path, f) for f in os.listdir(papers_path) if f.endswith(".pdf")]

# Implement retry mechanism for the main pipeline execution
@tenacity.retry(
    wait=tenacity.wait_exponential(multiplier=1, min=4, max=60), # Wait exponentially between retries
    stop=tenacity.stop_after_attempt(5), # Stop after 5 attempts
    before_sleep=tenacity.before_sleep_log(logging.getLogger(__name__), logging.INFO), # Log before sleeping
    retry=tenacity.retry_if_exception_type(RateLimitError) # Retry specifically on RateLimitError
)
def invoke_pipeline_with_retry(file_list):
    """Helper function to invoke the pipeline with retry logic."""
    return pipeline.invoke(file_list)

# Call the pipeline using the retry function
try:
    response = invoke_pipeline_with_retry(file_list)
    print(response)
except RateLimitError as e:
    print(f"Failed to generate proposal ideas after multiple retries due to rate limit: {e}")
except Exception as e:
    print(f"An unexpected error occurred during pipeline execution: {e}")


# file_list = [
#   "/content/papers/brain_ai_2023.pdf",
#   "/content/papers/equity_neuro_biomarkers.pdf"
# ]

# response = pipeline.invoke(file_list)
# print(response)

In [None]:
from langchain.prompts import PromptTemplate
from langchain.schema.runnable import RunnableParallel, RunnableLambda
from langchain.chains import LLMChain
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import PyPDFLoader
import os

import tenacity
import logging
from openai import RateLimitError # Import RateLimitError for retry logic
import tiktoken

# Ensure logging is configured for tenacity
logging.basicConfig(level=logging.INFO)

proposal_prompt = PromptTemplate(
    input_variables=["nofo_summary", "research_chunks"],
    template="""
You are a research strategy assistant tasked with generating 5 actionable and innovative research proposal ideas based on the content of scientific research papers and the requirements outlined in the NOFO.

Using the context provided below, produce exactly **5 distinct project ideas** in the following structured format:

---
1. **Idea {{i}}:** [Concise Title of the Project Idea]
2. **Description:** [Brief and targeted description summarizing the objectives, innovative elements, scientific rationale, and anticipated impact.]
3. **Citation:** [Author(s), Year or Paper Title]
4. **NOFO Alignment:** [List two or more specific NOFO requirements that this idea directly addresses]
5. **File Path of the Research Paper:** [Exact file path, ending in .pdf]
---

**NOFO Summary:**
{nofo_summary}

**Research Paper Chunks (with file paths):**
{research_chunks}

Respond in plain text. Be specific, and make sure file paths and citations align.
"""
)

from langchain.schema.runnable import RunnableParallel, RunnableLambda
from langchain.chains import LLMChain
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import PyPDFLoader
import os

# LLM setup
llm = ChatOpenAI(temperature=0.5)
llm_chain = LLMChain(llm=llm, prompt=proposal_prompt)

# Load and parse PDF files into text chunks + file path
def load_and_combine_docs(file_paths: list):
    docs_with_paths = []
    for path in file_paths:
		try:
        loader = PyPDFLoader(path)
        docs = loader.load()
        full_text = "\n".join([doc.page_content for doc in docs])
        docs_with_paths.append(f"Paper: {path}\nContent: {full_text[:2000]}...") # Increased chunk size for potential benefit
        # except Exception as e:
            # print(f"Error loading or processing {path}: {e}")
            # Optionally skip this file or add a placeholder
            # docs_with_paths.append(f"Paper: {path}\nContent: Error loading file.")
    return "\n\n".join(docs_with_paths)


    #     loader = PyPDFLoader(path)
    #   docs = loader.load()
    #   full_text = "\n".join([doc.page_content for doc in docs])
    #   docs_with_paths.append(f"Paper: {path}\nContent: {full_text[:1000]}...")  # limit to 1000 chars per paper
    # return "\n\n".join(docs_with_paths)

# Lambda wrapper for integration
doc_loader_runnable = RunnableLambda(lambda file_paths: {"research_chunks": load_and_combine_docs(file_paths)})

# Define nofo_summary_text using the previously loaded NOFO_text
# Make sure NOFO_text is accessible in this cell or pass it appropriately
# For simplicity, assuming NOFO_text is available from a previous cell's execution
# If not, you would need to load or pass it here.
try:
    # Attempt to use the NOFO_text from the previous cell
    # If running as separate cells, this variable must be defined in a preceding cell
    nofo_summary_text = NOFO_text
except NameError:
    # Handle the case where NOFO_text is not defined (e.g., if cells are run out of order)
    print("Warning: NOFO_text not found. Loading NOFO again for summary.")
    # Reload NOFO for use in prompt if NOFO_text is not available
    nofo_loader = PyPDFLoader("/content/NOFO_pdf.pdf")
    nofo_docs = nofo_loader.load()
    nofo_summary_text = "\n\n".join([doc.page_content for doc in nofo_docs])
    nofo_summary_text = nofo_summary_text[:12000] # Apply truncation again


# Static NOFO summary (can also be dynamic)
nofo_summary_runnable = RunnableLambda(lambda _: {"nofo_summary": nofo_summary_text})

# Compose the pipeline
pipeline = RunnableParallel({
    "nofo_summary": nofo_summary_runnable,
    "research_chunks": doc_loader_runnable
}) | llm_chain

# Dynamically build the file_list from the correct directory
# Ensure papers_path is accessible (it should be from the preceding cells)
try:
    papers_path
except NameError:
    print("Error: papers_path is not defined. Please ensure the previous cells were run.")
    # You might need to define papers_path here if the previous cell wasn't run.
    # For example: papers_path = "/content/Papers/" # Or the correct path

# Build the list of PDF files from the correct directory
file_list = [os.path.join(papers_path, f) for f in os.listdir(papers_path) if f.endswith(".pdf")]

# Implement retry mechanism for the main pipeline execution
@tenacity.retry(
    wait=tenacity.wait_exponential(multiplier=1, min=4, max=60), # Wait exponentially between retries
    stop=tenacity.stop_after_attempt(5), # Stop after 5 attempts
    before_sleep=tenacity.before_sleep_log(logging.getLogger(__name__), logging.INFO), # Log before sleeping
    retry=tenacity.retry_if_exception_type(RateLimitError) # Retry specifically on RateLimitError
)
def invoke_pipeline_with_retry(file_list):
    """Helper function to invoke the pipeline with retry logic."""
    return pipeline.invoke(file_list)

# Call the pipeline using the retry function
try:
    response = invoke_pipeline_with_retry(file_list)
    print(response)
except RateLimitError as e:
    print(f"Failed to generate proposal ideas after multiple retries due to rate limit: {e}")
except Exception as e:
    print(f"An unexpected error occurred during pipeline execution: {e}")


# file_list = [
#   "/content/papers/brain_ai_2023.pdf",
#   "/content/papers/equity_neuro_biomarkers.pdf"
# ]

# response = pipeline.invoke(file_list)
# print(response)

In [None]:
from langchain.prompts import PromptTemplate
from langchain.schema.runnable import RunnableParallel, RunnableLambda
from langchain.chains import LLMChain
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import PyPDFLoader
import os

import tenacity
import logging
from openai import RateLimitError # Import RateLimitError for retry logic
import tiktoken

# Ensure logging is configured for tenacity
logging.basicConfig(level=logging.INFO)

proposal_prompt = PromptTemplate(
    input_variables=["nofo_summary", "research_chunks"],
    template="""
You are a research strategy assistant tasked with generating 5 actionable and innovative research proposal ideas based on the content of scientific research papers and the requirements outlined in the NOFO.

Using the context provided below, produce exactly **5 distinct project ideas** in the following structured format:

---
1. **Idea {{i}}:** [Concise Title of the Project Idea]
2. **Description:** [Brief and targeted description summarizing the objectives, innovative elements, scientific rationale, and anticipated impact.]
3. **Citation:** [Author(s), Year or Paper Title]
4. **NOFO Alignment:** [List two or more specific NOFO requirements that this idea directly addresses]
5. **File Path of the Research Paper:** [Exact file path, ending in .pdf]
---

**NOFO Summary:**
{nofo_summary}

**Research Paper Chunks (with file paths):**
{research_chunks}

Respond in plain text. Be specific, and make sure file paths and citations align.
"""
)

from langchain.schema.runnable import RunnableParallel, RunnableLambda
from langchain.chains import LLMChain
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import PyPDFLoader
import os

# LLM setup
llm = ChatOpenAI(temperature=0.5)
llm_chain = LLMChain(llm=llm, prompt=proposal_prompt)

# Load and parse PDF files into text chunks + file path
def load_and_combine_docs(file_paths: list):
    docs_with_paths = []
    for path in file_paths:
		loader = PyPDFLoader(path)
        docs = loader.load()
        full_text = "\n".join([doc.page_content for doc in docs])
        docs_with_paths.append(f"Paper: {path}\nContent: {full_text[:2000]}...") # Increased chunk size for potential benefit
        # except Exception as e:
            # print(f"Error loading or processing {path}: {e}")
            # Optionally skip this file or add a placeholder
            # docs_with_paths.append(f"Paper: {path}\nContent: Error loading file.")
    return "\n\n".join(docs_with_paths)


    #     loader = PyPDFLoader(path)
    #   docs = loader.load()
    #   full_text = "\n".join([doc.page_content for doc in docs])
    #   docs_with_paths.append(f"Paper: {path}\nContent: {full_text[:1000]}...")  # limit to 1000 chars per paper
    # return "\n\n".join(docs_with_paths)

# Lambda wrapper for integration
doc_loader_runnable = RunnableLambda(lambda file_paths: {"research_chunks": load_and_combine_docs(file_paths)})

# Define nofo_summary_text using the previously loaded NOFO_text
# Make sure NOFO_text is accessible in this cell or pass it appropriately
# For simplicity, assuming NOFO_text is available from a previous cell's execution
# If not, you would need to load or pass it here.
try:
    # Attempt to use the NOFO_text from the previous cell
    # If running as separate cells, this variable must be defined in a preceding cell
    nofo_summary_text = NOFO_text
except NameError:
    # Handle the case where NOFO_text is not defined (e.g., if cells are run out of order)
    print("Warning: NOFO_text not found. Loading NOFO again for summary.")
    # Reload NOFO for use in prompt if NOFO_text is not available
    nofo_loader = PyPDFLoader("/content/NOFO_pdf.pdf")
    nofo_docs = nofo_loader.load()
    nofo_summary_text = "\n\n".join([doc.page_content for doc in nofo_docs])
    nofo_summary_text = nofo_summary_text[:12000] # Apply truncation again


# Static NOFO summary (can also be dynamic)
nofo_summary_runnable = RunnableLambda(lambda _: {"nofo_summary": nofo_summary_text})

# Compose the pipeline
pipeline = RunnableParallel({
    "nofo_summary": nofo_summary_runnable,
    "research_chunks": doc_loader_runnable
}) | llm_chain

# Dynamically build the file_list from the correct directory
# Ensure papers_path is accessible (it should be from the preceding cells)
try:
    papers_path
except NameError:
    print("Error: papers_path is not defined. Please ensure the previous cells were run.")
    # You might need to define papers_path here if the previous cell wasn't run.
    # For example: papers_path = "/content/Papers/" # Or the correct path

# Build the list of PDF files from the correct directory
file_list = [os.path.join(papers_path, f) for f in os.listdir(papers_path) if f.endswith(".pdf")]

# Implement retry mechanism for the main pipeline execution
@tenacity.retry(
    wait=tenacity.wait_exponential(multiplier=1, min=4, max=60), # Wait exponentially between retries
    stop=tenacity.stop_after_attempt(5), # Stop after 5 attempts
    before_sleep=tenacity.before_sleep_log(logging.getLogger(__name__), logging.INFO), # Log before sleeping
    retry=tenacity.retry_if_exception_type(RateLimitError) # Retry specifically on RateLimitError
)
def invoke_pipeline_with_retry(file_list):
    """Helper function to invoke the pipeline with retry logic."""
    return pipeline.invoke(file_list)

# Call the pipeline using the retry function
try:
    response = invoke_pipeline_with_retry(file_list)
    print(response)
except RateLimitError as e:
    print(f"Failed to generate proposal ideas after multiple retries due to rate limit: {e}")
except Exception as e:
    print(f"An unexpected error occurred during pipeline execution: {e}")


# file_list = [
#   "/content/papers/brain_ai_2023.pdf",
#   "/content/papers/equity_neuro_biomarkers.pdf"
# ]

# response = pipeline.invoke(file_list)
# print(response)

IndentationError: unindent does not match any outer indentation level (<tokenize>, line 56)

In [None]:
from langchain.prompts import PromptTemplate
from langchain.schema.runnable import RunnableParallel, RunnableLambda
from langchain.chains import LLMChain
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import PyPDFLoader
import os

import tenacity
import logging
from openai import RateLimitError # Import RateLimitError for retry logic
import tiktoken

# Ensure logging is configured for tenacity
logging.basicConfig(level=logging.INFO)

proposal_prompt = PromptTemplate(
    input_variables=["nofo_summary", "research_chunks"],
    template="""
You are a research strategy assistant tasked with generating 5 actionable and innovative research proposal ideas based on the content of scientific research papers and the requirements outlined in the NOFO.

Using the context provided below, produce exactly **5 distinct project ideas** in the following structured format:

---
1. **Idea {{i}}:** [Concise Title of the Project Idea]
2. **Description:** [Brief and targeted description summarizing the objectives, innovative elements, scientific rationale, and anticipated impact.]
3. **Citation:** [Author(s), Year or Paper Title]
4. **NOFO Alignment:** [List two or more specific NOFO requirements that this idea directly addresses]
5. **File Path of the Research Paper:** [Exact file path, ending in .pdf]
---

**NOFO Summary:**
{nofo_summary}

**Research Paper Chunks (with file paths):**
{research_chunks}

Respond in plain text. Be specific, and make sure file paths and citations align.
"""
)

from langchain.schema.runnable import RunnableParallel, RunnableLambda
from langchain.chains import LLMChain
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import PyPDFLoader
import os

# LLM setup
llm = ChatOpenAI(temperature=0.5)
llm_chain = LLMChain(llm=llm, prompt=proposal_prompt)

# Load and parse PDF files into text chunks + file path
def load_and_combine_docs(file_paths: list):
    docs_with_paths = []
    for path in file_paths:
		loader = PyPDFLoader(path)
        docs = loader.load()
        full_text = "\n".join([doc.page_content for doc in docs])
        docs_with_paths.append(f"Paper: {path}\nContent: {full_text[:2000]}...") # Increased chunk size for potential benefit
        # except Exception as e:
            # print(f"Error loading or processing {path}: {e}")
            # Optionally skip this file or add a placeholder
            # docs_with_paths.append(f"Paper: {path}\nContent: Error loading file.")
    return "\n\n".join(docs_with_paths)


    #     loader = PyPDFLoader(path)
    #   docs = loader.load()
    #   full_text = "\n".join([doc.page_content for doc in docs])
    #   docs_with_paths.append(f"Paper: {path}\nContent: {full_text[:1000]}...")  # limit to 1000 chars per paper
    # return "\n\n".join(docs_with_paths)

# Lambda wrapper for integration
doc_loader_runnable = RunnableLambda(lambda file_paths: {"research_chunks": load_and_combine_docs(file_paths)})

# Define nofo_summary_text using the previously loaded NOFO_text
# Make sure NOFO_text is accessible in this cell or pass it appropriately
# For simplicity, assuming NOFO_text is available from a previous cell's execution
# If not, you would need to load or pass it here.
try:
    # Attempt to use the NOFO_text from the previous cell
    # If running as separate cells, this variable must be defined in a preceding cell
    nofo_summary_text = NOFO_text
except NameError:
    # Handle the case where NOFO_text is not defined (e.g., if cells are run out of order)
    print("Warning: NOFO_text not found. Loading NOFO again for summary.")
    # Reload NOFO for use in prompt if NOFO_text is not available
    nofo_loader = PyPDFLoader("/content/NOFO_pdf.pdf")
    nofo_docs = nofo_loader.load()
    nofo_summary_text = "\n\n".join([doc.page_content for doc in nofo_docs])
    nofo_summary_text = nofo_summary_text[:12000] # Apply truncation again


# Static NOFO summary (can also be dynamic)
nofo_summary_runnable = RunnableLambda(lambda _: {"nofo_summary": nofo_summary_text})

# Compose the pipeline
pipeline = RunnableParallel({
    "nofo_summary": nofo_summary_runnable,
    "research_chunks": doc_loader_runnable
}) | llm_chain

# Dynamically build the file_list from the correct directory
# Ensure papers_path is accessible (it should be from the preceding cells)
try:
    papers_path
except NameError:
    print("Error: papers_path is not defined. Please ensure the previous cells were run.")
    # You might need to define papers_path here if the previous cell wasn't run.
    # For example: papers_path = "/content/Papers/" # Or the correct path

# Build the list of PDF files from the correct directory
file_list = [os.path.join(papers_path, f) for f in os.listdir(papers_path) if f.endswith(".pdf")]

# Implement retry mechanism for the main pipeline execution
@tenacity.retry(
    wait=tenacity.wait_exponential(multiplier=1, min=4, max=60), # Wait exponentially between retries
    stop=tenacity.stop_after_attempt(5), # Stop after 5 attempts
    before_sleep=tenacity.before_sleep_log(logging.getLogger(__name__), logging.INFO), # Log before sleeping
    retry=tenacity.retry_if_exception_type(RateLimitError) # Retry specifically on RateLimitError
)
def invoke_pipeline_with_retry(file_list):
    """Helper function to invoke the pipeline with retry logic."""
    return pipeline.invoke(file_list)

# Call the pipeline using the retry function
try:
    response = invoke_pipeline_with_retry(file_list)
    print(response)
except RateLimitError as e:
    print(f"Failed to generate proposal ideas after multiple retries due to rate limit: {e}")
except Exception as e:
    print(f"An unexpected error occurred during pipeline execution: {e}")


# file_list = [
#   "/content/papers/brain_ai_2023.pdf",
#   "/content/papers/equity_neuro_biomarkers.pdf"
# ]

# response = pipeline.invoke(file_list)
# print(response)

IndentationError: unindent does not match any outer indentation level (<tokenize>, line 56)

In [None]:
from langchain.prompts import PromptTemplate
from langchain.schema.runnable import RunnableParallel, RunnableLambda
from langchain.chains import LLMChain
# Correct the import path for ChatOpenAI

from langchain_community.chat_models import ChatOpenAI
# Correct the import path for PyPDFLoader
from langchain_community.document_loaders import PyPDFLoader
import os

import tenacity
import logging
# Ensure openai is imported correctly if RateLimitError is used
try:
    from openai import RateLimitError
except ImportError:
    # Handle cases where RateLimitError might be in a different location or version
    print("Warning: openai.RateLimitError not found. Retry might not handle rate limits correctly.")
    # Define a dummy exception class if not found, or use a more general retry condition
    class RateLimitError(Exception):
        pass

import tiktoken

# Ensure logging is configured for tenacity
logging.basicConfig(level=logging.INFO)

proposal_prompt = PromptTemplate(
    input_variables=["nofo_summary", "research_chunks"],
    template="""
You are a research strategy assistant tasked with generating 5 actionable and innovative research proposal ideas based on the content of scientific research papers and the requirements outlined in the NOFO.

Using the context provided below, produce exactly **5 distinct project ideas** in the following structured format:

---
1. **Idea {{i}}:** [Concise Title of the Project Idea]
2. **Description:** [Brief and targeted description summarizing the objectives, innovative elements, scientific rationale, and anticipated impact.]
3. **Citation:** [Author(s), Year or Paper Title]
4. **NOFO Alignment:** [List two or more specific NOFO requirements that this idea directly addresses]
5. **File Path of the Research Paper:** [Exact file path, ending in .pdf]
---

**NOFO Summary:**
{nofo_summary}

**Research Paper Chunks (with file paths):**
{research_chunks}

Respond in plain text. Be specific, and make sure file paths and citations align.
"""
)

from langchain.schema.runnable import RunnableParallel, RunnableLambda
from langchain.chains import LLMChain
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import PyPDFLoader
!pip install langchain-community
import os

# LLM setup
llm = ChatOpenAI(temperature=0.5)
llm_chain = LLMChain(llm=llm, prompt=proposal_prompt)

# Load and parse PDF files into text chunks + file path
# Load and parse PDF files into text chunks + file path
def load_and_combine_docs(file_paths: list):
    docs_with_paths = []
    for path in file_paths:
        try:
            loader = PyPDFLoader(path)
            docs = loader.load()
            full_text = "\n".join([doc.page_content for doc in docs])
            # Increased chunk size for potential benefit
            docs_with_paths.append(f"Paper: {path}\nContent: {full_text[:2000]}...")
        except Exception as e:
            print(f"Error loading or processing {path}: {e}")
            # Optionally skip this file or add a placeholder
            docs_with_paths.append(f"Paper: {path}\nContent: Error loading file.")
    return "\n\n".join(docs_with_paths)

    #     loader = PyPDFLoader(path)
    #   docs = loader.load()
    #   full_text = "\n".join([doc.page_content for doc in docs])
    #   docs_with_paths.append(f"Paper: {path}\nContent: {full_text[:1000]}...")  # limit to 1000 chars per paper
    # return "\n\n".join(docs_with_paths)

# Lambda wrapper for integration
doc_loader_runnable = RunnableLambda(lambda file_paths: {"research_chunks": load_and_combine_docs(file_paths)})

# Define nofo_summary_text using the previously loaded NOFO_text
# Make sure NOFO_text is accessible in this cell or pass it appropriately
# For simplicity, assuming NOFO_text is available from a previous cell's execution
# If not, you would need to load or pass it here.
try:
    # Attempt to use the NOFO_text from the previous cell
    # If running as separate cells, this variable must be defined in a preceding cell
    nofo_summary_text = NOFO_text
except NameError:
    # Handle the case where NOFO_text is not defined (e.g., if cells are run out of order)
    print("Warning: NOFO_text not found. Loading NOFO again for summary.")
    # Reload NOFO for use in prompt if NOFO_text is not available
    nofo_loader = PyPDFLoader("/content/NOFO_pdf.pdf")
    nofo_docs = nofo_loader.load()
    nofo_summary_text = "\n\n".join([doc.page_content for doc in nofo_docs])
    nofo_summary_text = nofo_summary_text[:12000] # Apply truncation again


# Static NOFO summary (can also be dynamic)
nofo_summary_runnable = RunnableLambda(lambda _: {"nofo_summary": nofo_summary_text})

# Compose the pipeline
pipeline = RunnableParallel({
    "nofo_summary": nofo_summary_runnable,
    "research_chunks": doc_loader_runnable
}) | llm_chain

# Dynamically build the file_list from the correct directory
# Ensure papers_path is accessible (it should be from the preceding cells)
try:
    papers_path
except NameError:
    print("Error: papers_path is not defined. Please ensure the previous cells were run.")
    # You might need to define papers_path here if the previous cell wasn't run.
    # For example: papers_path = "/content/Papers/" # Or the correct path

# Build the list of PDF files from the correct directory
file_list = [os.path.join(papers_path, f) for f in os.listdir(papers_path) if f.endswith(".pdf")]

# Implement retry mechanism for the main pipeline execution
@tenacity.retry(
    wait=tenacity.wait_exponential(multiplier=1, min=4, max=60), # Wait exponentially between retries
    stop=tenacity.stop_after_attempt(5), # Stop after 5 attempts
    before_sleep=tenacity.before_sleep_log(logging.getLogger(__name__), logging.INFO), # Log before sleeping
    retry=tenacity.retry_if_exception_type(RateLimitError) # Retry specifically on RateLimitError
)
def invoke_pipeline_with_retry(file_list):
    """Helper function to invoke the pipeline with retry logic."""
    return pipeline.invoke(file_list)

# Call the pipeline using the retry function
try:
    response = invoke_pipeline_with_retry(file_list)
    print(response)
except RateLimitError as e:
    print(f"Failed to generate proposal ideas after multiple retries due to rate limit: {e}")
except Exception as e:
    print(f"An unexpected error occurred during pipeline execution: {e}")


# file_list = [
#   "/content/papers/brain_ai_2023.pdf",
#   "/content/papers/equity_neuro_biomarkers.pdf"
# ]

# response = pipeline.invoke(file_list)
# print(response)

ModuleNotFoundError: No module named 'langchain_community'

In [None]:
from langchain.prompts import PromptTemplate
from langchain.schema.runnable import RunnableParallel, RunnableLambda
from langchain.chains import LLMChain
# Correct the import path for ChatOpenAI
!pip install langchain-community
from langchain_community.chat_models import ChatOpenAI
# Correct the import path for PyPDFLoader
from langchain_community.document_loaders import PyPDFLoader
import os

import tenacity
import logging
# Ensure openai is imported correctly if RateLimitError is used
try:
    from openai import RateLimitError
except ImportError:
    # Handle cases where RateLimitError might be in a different location or version
    print("Warning: openai.RateLimitError not found. Retry might not handle rate limits correctly.")
    # Define a dummy exception class if not found, or use a more general retry condition
    class RateLimitError(Exception):
        pass

import tiktoken

# Ensure logging is configured for tenacity
logging.basicConfig(level=logging.INFO)

proposal_prompt = PromptTemplate(
    input_variables=["nofo_summary", "research_chunks"],
    template="""
You are a research strategy assistant tasked with generating 5 actionable and innovative research proposal ideas based on the content of scientific research papers and the requirements outlined in the NOFO.

Using the context provided below, produce exactly **5 distinct project ideas** in the following structured format:

---
1. **Idea {{i}}:** [Concise Title of the Project Idea]
2. **Description:** [Brief and targeted description summarizing the objectives, innovative elements, scientific rationale, and anticipated impact.]
3. **Citation:** [Author(s), Year or Paper Title]
4. **NOFO Alignment:** [List two or more specific NOFO requirements that this idea directly addresses]
5. **File Path of the Research Paper:** [Exact file path, ending in .pdf]
---

**NOFO Summary:**
{nofo_summary}

**Research Paper Chunks (with file paths):**
{research_chunks}

Respond in plain text. Be specific, and make sure file paths and citations align.
"""
)

from langchain.schema.runnable import RunnableParallel, RunnableLambda
from langchain.chains import LLMChain
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import PyPDFLoader

# LLM setup
llm = ChatOpenAI(temperature=0.5)
llm_chain = LLMChain(llm=llm, prompt=proposal_prompt)

# Load and parse PDF files into text chunks + file path
# Load and parse PDF files into text chunks + file path
def load_and_combine_docs(file_paths: list):
    docs_with_paths = []
    for path in file_paths:
        try:
            loader = PyPDFLoader(path)
            docs = loader.load()
            full_text = "\n".join([doc.page_content for doc in docs])
            # Increased chunk size for potential benefit
            docs_with_paths.append(f"Paper: {path}\nContent: {full_text[:2000]}...")
        except Exception as e:
            print(f"Error loading or processing {path}: {e}")
            # Optionally skip this file or add a placeholder
            docs_with_paths.append(f"Paper: {path}\nContent: Error loading file.")
    return "\n\n".join(docs_with_paths)

    #     loader = PyPDFLoader(path)
    #   docs = loader.load()
    #   full_text = "\n".join([doc.page_content for doc in docs])
    #   docs_with_paths.append(f"Paper: {path}\nContent: {full_text[:1000]}...")  # limit to 1000 chars per paper
    # return "\n\n".join(docs_with_paths)

# Lambda wrapper for integration
doc_loader_runnable = RunnableLambda(lambda file_paths: {"research_chunks": load_and_combine_docs(file_paths)})

# Define nofo_summary_text using the previously loaded NOFO_text
# Make sure NOFO_text is accessible in this cell or pass it appropriately
# For simplicity, assuming NOFO_text is available from a previous cell's execution
# If not, you would need to load or pass it here.
try:
    # Attempt to use the NOFO_text from the previous cell
    # If running as separate cells, this variable must be defined in a preceding cell
    nofo_summary_text = NOFO_text
except NameError:
    # Handle the case where NOFO_text is not defined (e.g., if cells are run out of order)
    print("Warning: NOFO_text not found. Loading NOFO again for summary.")
    # Reload NOFO for use in prompt if NOFO_text is not available
    nofo_loader = PyPDFLoader("/content/NOFO_pdf.pdf")
    nofo_docs = nofo_loader.load()
    nofo_summary_text = "\n\n".join([doc.page_content for doc in nofo_docs])
    nofo_summary_text = nofo_summary_text[:12000] # Apply truncation again


# Static NOFO summary (can also be dynamic)
nofo_summary_runnable = RunnableLambda(lambda _: {"nofo_summary": nofo_summary_text})

# Compose the pipeline
pipeline = RunnableParallel({
    "nofo_summary": nofo_summary_runnable,
    "research_chunks": doc_loader_runnable
}) | llm_chain

# Dynamically build the file_list from the correct directory
# Ensure papers_path is accessible (it should be from the preceding cells)
try:
    papers_path
except NameError:
    print("Error: papers_path is not defined. Please ensure the previous cells were run.")
    # You might need to define papers_path here if the previous cell wasn't run.
    # For example: papers_path = "/content/Papers/" # Or the correct path

# Build the list of PDF files from the correct directory
file_list = [os.path.join(papers_path, f) for f in os.listdir(papers_path) if f.endswith(".pdf")]

# Implement retry mechanism for the main pipeline execution
@tenacity.retry(
    wait=tenacity.wait_exponential(multiplier=1, min=4, max=60), # Wait exponentially between retries
    stop=tenacity.stop_after_attempt(5), # Stop after 5 attempts
    before_sleep=tenacity.before_sleep_log(logging.getLogger(__name__), logging.INFO), # Log before sleeping
    retry=tenacity.retry_if_exception_type(RateLimitError) # Retry specifically on RateLimitError
)
def invoke_pipeline_with_retry(file_list):
    """Helper function to invoke the pipeline with retry logic."""
    return pipeline.invoke(file_list)

# Call the pipeline using the retry function
try:
    response = invoke_pipeline_with_retry(file_list)
    print(response)
except RateLimitError as e:
    print(f"Failed to generate proposal ideas after multiple retries due to rate limit: {e}")
except Exception as e:
    print(f"An unexpected error occurred during pipeline execution: {e}")


# file_list = [
#   "/content/papers/brain_ai_2023.pdf",
#   "/content/papers/equity_neuro_biomarkers.pdf"
# ]

# response = pipeline.invoke(file_list)
# print(response)



ValidationError: 1 validation error for ChatOpenAI
  Value error, Did not find openai_api_key, please add an environment variable `OPENAI_API_KEY` which contains it, or pass `openai_api_key` as a named parameter. [type=value_error, input_value={'temperature': 0.5, 'mod...ne, 'http_client': None}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.11/v/value_error