In [7]:
# !pip install openai
# !pip install requests
# !pip install bioRxiv_retriever

- Using gpt-3.5-turbo for most calls because it's the cheapest; query routing uses gpt-4 because gpt-3.5-turbo was unreliable at that task.
- https://info.arxiv.org/help/api/user-manual.html#arxiv-api-users-manual

To Do:
- The current approach is basic. Some prompt engineering could be helpful. Use the OpenAI Playground's Compare mode to do this.
    - suggest_research_directions() needs to be split up into a few functions, and should probably make its own arXiv calls based on user_feedback and research_interests.
    - Incorporate RAG (retrieval-augmented generation).
- Have a more stable way of inserting the user's API key

In [16]:
import os
import re
import xml.etree.ElementTree as ET

import requests
from biorxiv_retriever import BiorxivRetriever
from openai import OpenAI

In [17]:
# Read the OpenAI API key from the environment rather than hardcoding it in the
# notebook, so the key cannot leak through version control or shared outputs.
# Set it before starting Jupyter, e.g. `export OPENAI_API_KEY=...`.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "")
if not OPENAI_API_KEY:
    print("Warning: OPENAI_API_KEY is not set; OpenAI requests will fail until it is.")

# Kept so that any code still reading the key from this class attribute keeps working.
OpenAI.api_key = OPENAI_API_KEY

# Shared client used by every function below that talks to the OpenAI API.
client = OpenAI(api_key=OPENAI_API_KEY)

In [21]:
def analyze_and_route_query(original_query):
    """
    Ask the language model which pre-print server best fits the user's query.

    The model is told to answer with a single word, but its reply is free
    text. The reply is therefore normalized before it is returned: casing,
    surrounding punctuation or quotes, and extra words are ignored as long as
    a supported server name appears in the reply.

    Parameters:
    - original_query (str): The user's query in natural language.

    Returns:
    - str: 'arXiv' or 'bioRxiv' when the reply names one of them (the first
      one mentioned wins); otherwise the stripped raw reply, so that callers
      can detect and reject an unsupported answer.
    """
    prompt = f"""
    You are a routing system. You are able to route requests to the following 2 pre-print servers:
    1) arXiv is a preprint server for physics, math, computer sciences, quantitative biology and statistics.
    2) bioRxiv is a preprint server for biology.
    
    Given the following user query, your task is to determine what pre-print server is best suited to answer the user's query:
    Query: "{original_query}"

    Given this query, which pre-print server is most appropriate to address this question? You are only allowed to answer with 1 word: bioRxiv or arXiv.
    """

    response = client.chat.completions.create(
        model="gpt-4", # 3.5 is shockingly bad at this, had to switch to 4
        messages=[
            {
                "role": "user",
                "content": prompt,
            }
        ],
    )
    # The message content may be None, so fall back to an empty string.
    raw_reply = (response.choices[0].message.content or "").strip()

    # Callers compare the result with '==' against these exact spellings, so
    # map replies such as "bioRxiv." or "ARXIV" onto the canonical names.
    canonical_names = {"arxiv": "arXiv", "biorxiv": "bioRxiv"}
    match = re.search(r"\b(arxiv|biorxiv)\b", raw_reply, flags=re.IGNORECASE)
    if match:
        return canonical_names[match.group(1).lower()]
    return raw_reply

def enhance_query_for_server(server, original_query):
    """
    Dispatch query enhancement to the helper that matches the chosen server.

    Parameters:
    - server (str): Target pre-print server; 'arXiv' and 'bioRxiv' are handled.
    - original_query (str): The query exactly as the user entered it.

    Returns:
    - str: Whatever the server-specific enhancer returns for the query.

    Raises:
    - ValueError: If `server` is neither 'arXiv' nor 'bioRxiv'.
    """
    # Each supported server returns straight away; anything else falls through.
    if server == "arXiv":
        return enhance_arxiv_query(original_query)
    if server == "bioRxiv":
        return enhance_bioRxiv_query(original_query)
    raise ValueError("Unsupported server specified.")

def enhance_arxiv_query(original_query):
    """
    Convert a natural language query into a structured arXiv API search query.

    The model is shown a few example conversions and asked to produce the
    structured query for `original_query`. Its reply is cleaned before being
    returned because the result is sent to the arXiv API as-is: an echoed
    "Structured arXiv API Query:" label and wrapping backticks are removed.

    Parameters:
    - original_query (str): The user's query in natural language.

    Returns:
    - str: The structured query text (empty if the model returned no content).
    """
    # NOTE(review): the date example below ('submittedDate:[2015 TO *]') may not
    # match the date-range format the arXiv API expects — confirm against the
    # arXiv API user manual before relying on date filters.
    prompt = f"""
    Given a natural language query, convert it into a structured search query for the arXiv API. The arXiv API query format uses field prefixes like 'au' for author, 'ti' for title, 'cat' for category, and logical operators like 'AND', 'OR'. Below are examples of converting natural language queries into structured arXiv API queries:

    Here are a few examples:
    
    Natural Language Query: Papers by Albert Einstein about relativity
    Structured arXiv API Query: au:Albert Einstein AND all:relativity

    Natural Language Query: Quantum computing research after 2015
    Structured arXiv API Query: all:quantum computing AND submittedDate:[2015 TO *]

    Natural Language Query: Machine learning applications in finance
    Structured arXiv API Query: all:machine learning AND all:finance

    Now, convert the following natural language query into a structured arXiv API query:
    '{original_query}'
    Structured arXiv API Query:
    """

    response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {
                "role": "user",
                "content": prompt,
            }
        ],
    )

    # The message content may be None, so fall back to an empty string.
    raw_reply = (response.choices[0].message.content or "").strip()

    # Drop code-style backticks, then the label the prompt ends with (the model
    # sometimes repeats it), then any backticks that were inside the label.
    cleaned = raw_reply.strip("`").strip()
    cleaned = re.sub(
        r"^structured\s+arxiv\s+api\s+query\s*:\s*",
        "",
        cleaned,
        count=1,
        flags=re.IGNORECASE,
    )
    arxiv_query = cleaned.strip("`").strip()
    return arxiv_query

def enhance_bioRxiv_query(original_query):
    """
    Ask the language model to rewrite the user's query as a more detailed and
    specific one for searching bioRxiv articles.

    Parameters:
    - original_query (str): The user's query in natural language.

    Returns:
    - str: The model's rewritten query with surrounding whitespace removed.
    """
    instruction = (
        f"Given a research query: '{original_query}', "
        "think critically and formulate a more detailed and specific query "
        "that could yield more relevant and comprehensive results in a scientific articles database search."
    )
    messages = [{"role": "user", "content": instruction}]

    completion = client.chat.completions.create(model="gpt-3.5-turbo", messages=messages)
    return completion.choices[0].message.content.strip()

def search_server(server, enhanced_query):
    """
    Run the enhanced query against the chosen pre-print server.

    Parameters:
    - server (str): Target pre-print server; 'arXiv' and 'bioRxiv' are handled.
    - enhanced_query (str): The query already prepared for that server.

    Returns:
    - The result of search_arxiv() for 'arXiv' or of search_bioRxiv() for
      'bioRxiv', passed through unchanged.

    Raises:
    - ValueError: If `server` is neither 'arXiv' nor 'bioRxiv'.
    """
    # Each supported server returns straight away; anything else falls through.
    if server == "arXiv":
        return search_arxiv(enhanced_query)
    if server == "bioRxiv":
        return search_bioRxiv(enhanced_query)
    raise ValueError("Unsupported server specified.")

def search_arxiv(query, max_results=5, timeout=30):
    """
    Query the arXiv API and return the raw response body.

    Parameters:
    - query (str): Structured arXiv search query (e.g. 'all:electron').
    - max_results (int): Maximum number of entries to request. Defaults to 5.
    - timeout (float): Seconds to wait for the server before giving up.
      Defaults to 30.

    Returns:
    - str: The response body as text.

    Raises:
    - requests.HTTPError: If the server answers with an error status.
    - requests.RequestException: On connection problems or timeouts.
    """
    # Pass the query through `params` so requests URL-encodes it; generated
    # queries contain spaces, colons and brackets that are not valid in a raw URL.
    response = requests.get(
        "http://export.arxiv.org/api/query",
        params={"search_query": query, "start": 0, "max_results": max_results},
        timeout=timeout,
    )
    # Fail loudly on HTTP errors instead of handing an error page to the parser.
    response.raise_for_status()
    return response.text

def search_bioRxiv(enhanced_query):
    """
    Look up papers on bioRxiv through BiorxivRetriever.

    Parameters:
    - enhanced_query (str): The query text handed to the retriever.

    Returns:
    - Whatever `BiorxivRetriever.query` returns when called with metadata and
      full text both enabled (presumably a list of per-paper dictionaries —
      verify against the biorxiv_retriever package).
    """
    # A fresh retriever is created per call and queried in a single expression.
    return BiorxivRetriever().query(enhanced_query, metadata=True, full_text=True)

def _xml_local_name(tag):
    """Return an XML tag name without its '{namespace}' prefix."""
    return tag.rsplit("}", 1)[-1]


def extract_titles_and_summaries(xml_response):
    """
    Extract the title and summary of every <entry> in an arXiv Atom feed.

    Each entry is read on its own, so a title is always paired with the
    summary from the same entry. The feed-level <title> is never included
    because it is not inside an <entry>.

    Parameters:
    - xml_response (str): The XML text returned by the arXiv API.

    Returns:
    - list: One {"title": ..., "summary": ...} dictionary per entry, in feed
      order. Titles have runs of whitespace (including line breaks) collapsed
      to single spaces; summaries have surrounding whitespace stripped. An
      empty list is returned when the text is not well-formed XML or contains
      no entries.
    """
    try:
        feed = ET.fromstring(xml_response)
    except ET.ParseError:
        # Not XML (e.g. an empty body or an HTML error page): nothing to extract.
        return []

    papers_info = []
    for element in feed.iter():
        # Compare local names so the Atom namespace prefix does not matter.
        if _xml_local_name(element.tag) != "entry":
            continue
        fields = {_xml_local_name(child.tag): (child.text or "") for child in element}
        papers_info.append(
            {
                "title": " ".join(fields.get("title", "").split()),
                "summary": fields.get("summary", "").strip(),
            }
        )

    return papers_info

def summarize(initial_query, papers_info):
    """
    Ask the language model to answer the user's question from a set of papers.

    Parameters:
    - initial_query (str): The question the user originally asked.
    - papers_info (list): Dictionaries describing papers. Each should have a
      'title' and a 'summary'; when 'summary' is missing or empty, 'abstract'
      is used instead (presumably the key bioRxiv results carry — verify
      against the biorxiv_retriever package).

    Returns:
    - str: The model's explanation with surrounding whitespace removed, or a
      fixed notice when `papers_info` is empty (the model is not called then).
    """
    if not papers_info:
        # With no source material the model could only invent an answer.
        return "No papers were found for this query, so there is nothing to summarize."

    paper_sections = []
    for paper in papers_info:
        title = paper.get("title", "")
        summary = paper.get("summary") or paper.get("abstract") or ""
        paper_sections.append(f"Title: {title}\nSummary: {summary}\n\n")

    prompt = (
        f"The user asked: '{initial_query}'. Based on the following titles and summaries from academic papers, provide a detailed and accessible explanation of the topic:\n\n"
        + "".join(paper_sections)
        + "Please review the titles and summaries to provide a thoughtful response to the user's question."
    )

    response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {
                "role": "system",
                "content": prompt,
            }
        ],
    )

    thoughtful_response = response.choices[0].message.content.strip()
    return thoughtful_response

def suggest_research_directions(initial_query, thoughtful_response):
    """
    Generate research directions from the summary shown to the user, their
    feedback on it, and their stated interests.

    The function is interactive: it prints a header and reads two answers from
    the user with input() before calling the language model.

    Parameters:
    - initial_query: The original query posed by the user.
    - thoughtful_response: The summary previously shown to the user. It is
      included in the prompt so the model can see what the user's feedback
      refers to.

    Returns:
    - A string containing suggestions for research trends, gaps, next steps, or future directions.
    """

    print("\n--- Research Direction Suggestion ---")
    user_feedback = input("What are your thoughts on the provided summary? Any specific areas of interest or questions that arise? ")

    research_interests = input("Could you specify any particular research interests or areas where you're seeking innovation? ")

    # The summary text is embedded here; the prompt refers to "the provided
    # summary", which the model cannot use unless it is actually included.
    prompt = f"""
    Based on the initial inquiry about '{initial_query}' and the provided summary, the researcher shared their thoughts: '{user_feedback}'. They expressed a particular interest in '{research_interests}'.

    Summary provided to the researcher:
    {thoughtful_response}

    Considering the current state of research and potential future developments, identify emerging trends, and gaps in the literature, and suggest novel research directions or next steps that could significantly advance the field. Emphasize novelty and innovation in your suggestions.
    """

    response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {
                "role": "system",
                "content": prompt,
            }
        ],
    )

    research_suggestions = response.choices[0].message.content.strip()
    return research_suggestions

In [22]:
def main():
    """
    Run one interactive search session.

    Steps: read the user's query, let the model pick a pre-print server,
    enhance the query for that server, fetch papers, summarize them, and
    finally suggest research directions. Progress and results are printed.

    Returns:
    - None. If the routed server is not 'arXiv' or 'bioRxiv', a message is
      printed and the session ends early.
    """
    print("Welcome. This is a semantic search program for arXiv and bioRxiv.\n")
    user_query = input("Enter your search query: ")
    # Determine the appropriate server for the query
    server = analyze_and_route_query(user_query)

    print(f"Based on your query, I think the {server} pre-print server is most appropriate.\n")

    # Check support before enhancing the query: enhance_query_for_server raises
    # ValueError for any other server, which would skip this friendly exit.
    if server not in ("arXiv", "bioRxiv"):
        print("Unsupported server. Exiting...")
        return

    if server == "bioRxiv":
        print("Please note that the API for connecting to bioRxiv is very slow. This request may take some time.\n")

    # Enhance the query based on the server (this helps build more informative searches, and also accounts for typos and poorly-specified queries)
    enhanced_query = enhance_query_for_server(server, user_query)
    print(f"Here is the enhanced query I am using: {enhanced_query}\n\nFetching papers from {server}...\n")

    # Search the server
    if server == "arXiv":
        search_results = search_arxiv(enhanced_query)
        extracted_information = extract_titles_and_summaries(search_results)
    else:
        # bioRxiv results are passed to summarize() as returned by the retriever.
        extracted_information = search_bioRxiv(enhanced_query)

    # Report only how many papers came back; the full records can be very large.
    print(f"Retrieved {len(extracted_information)} paper(s) from {server}.\n")

    # Summarize the results
    thoughtful_response = summarize(user_query, extracted_information)
    print("\nHere's what I found:\n", thoughtful_response)

    # Suggest research directions
    research_suggestions = suggest_research_directions(user_query, thoughtful_response)
    print("\nHere are a few research ideas to inspire your work:\n", research_suggestions)

if __name__ == "__main__":
    main()


Welcome. This is a semantic search program for arXiv and bioRxiv.



Enter your search query:  What are the latest advancements in oncology?


Based on your query, I think the bioRxiv pre-print server is most appropriate.

Please note that the API for connecting to bioRxiv is very slow. This request may take some time.

Here is the enhanced query I am using: What are the current trends in targeted therapies for specific types of cancer in the field of oncology and how do they compare to traditional chemotherapy treatments in terms of efficacy and side effects?
Fetching papers from bioRxiv...


 17%|█████████████▎                                                               | 380/2190 [19:28<1:32:45,  3.07s/it]


KeyboardInterrupt: 

There is no obvious way to do any form of text search using the bioRxiv API. The same issue holds for medRxiv. These APIs are only for metadata.

We found the biorxiv-retriever project which looked like it would solve our issues: https://pypi.org/project/biorxiv-retriever/ 

However, it is extremely slow (1 query was estimated to take ~1.5 hours to return a result).