In [11]:
!pip install -q openai langchain playwright beautifulsoup4 playwright install chroma chromadb tiktoken langchainhub crewai

In [1]:
from getpass import getpass
import os
os.environ['OPENAI_API_KEY'] = getpass()

 ········


In [3]:
#Extract text from CDC website
from langchain_community.document_loaders.recursive_url_loader import RecursiveUrlLoader
from bs4 import BeautifulSoup as Soup

import bs4
from langchain import hub
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import WebBaseLoader
from langchain_openai import OpenAIEmbeddings
from langchain.schema import StrOutputParser
from langchain.schema.runnable import RunnablePassthrough
from openai import OpenAI
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
import time

#Send to GPT4 for cleanup
def clean_text_with_gpt4(text):
    """
    This function takes a string of text and uses GPT-4 to clean it up using the OpenAI ChatCompletion API.
    It handles large texts by breaking them into smaller chunks.
    :param text: String containing the text to be cleaned.
    :return: Cleaned text as a string.
    """
    cleaned_texts = []
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=0)
    chunks = text_splitter.split_documents(text)

    for chunk in chunks:
        try:
            response = client.chat.completions.create(
                model="gpt-4-0314",  # Assuming using the latest GPT-4 model
                messages=[{"role": "system", "content": "You are a helpful assistant."},
                          {"role": "user", "content": f"Please clean up the following text:\n\n{chunk}"}]
            )
            #print(response)
            cleaned_texts.append(response.choices[0].message.content.strip())
            #print(cleaned_texts)
            time.sleep(1)  # Delay to respect rate limits
        except Exception as e:
            print(f"An error occurred: {e}")
            return None
    return ' '.join(cleaned_texts)

file_name = "cleaned_texts.txt"
if os.path.exists(file_name):
    print(f"The file {file_name} already exists.")
    try:
        with open(file_name, 'r', encoding='utf-8') as file:
            content = file.read()
            #print("File content:\n")
            #print(content)
    except FileNotFoundError:
        print(f"The file {file_name} was not found.")
    except Exception as e:
        print(f"An error occurred: {e}")
else:
    url = "https://www.cdc.gov"
    loader = RecursiveUrlLoader(
        url=url, max_depth=2, extractor=lambda x: Soup(x, "html.parser").text
    )
    docs = loader.load()
    #client = OpenAI()
    #cleaned_text = clean_text_with_gpt4(docs)
    #with open(file_name, 'w', encoding='utf-8') as file:
    #    for text in cleaned_texts:
    #        file.write(text + "\n\n")  # Adding two newlines as a separator between texts
    #print(f"Cleaned texts saved to {file_name}")
    cleaned_text = docs

In [4]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma

text_splitter2 = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
splits = text_splitter2.split_documents(cleaned_text)
vectorstore = Chroma.from_documents(documents=splits, embedding=OpenAIEmbeddings())
CDC_retriever = vectorstore.as_retriever()

In [11]:
import zipfile
from langchain_community.document_loaders import DirectoryLoader

loader2 = DirectoryLoader('./EpiHiper-Schema-master', glob="**/*.py", show_progress=True)
abm_codes = loader2.load()

#code_splits = split_by_length_with_overlap(abm_codes)
vectorstore = Chroma.from_documents(documents=abm_codes, embedding=OpenAIEmbeddings())
ABM_retriever = vectorstore.as_retriever()






100%|██████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 46.05it/s][A[A[A[A[A


In [15]:
from langchain_openai import ChatOpenAI
from crewai import Agent, Task, Crew, Process
import textwrap

# llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=.2)
llm = ChatOpenAI(model="gpt-4-1106-preview", temperature=0.0) #"gpt-4-1106-preview", "gpt-4-0314"

query_planner = Agent(
    role="Simulation Planner",
    goal="Plan the steps needed to parameterize an agent-based simulation from existing knowledge",
    backstory=textwrap.dedent("""
        You are an expert at identifying modeling parameters from code base that implements 
        an agent-based model and listing the model choices, parameters, and json config files 
        whose values need to be determined. You will break down each model choice, parameter, 
        and json config file into sub-questions such that the answer to each sub-question will 
        inform the value to be used in the agent-based simulation.
        Accept the user-question and determine if it requires sub-questions to either the
        CDC website which provides an official source of recent infectious disease outbreaks
        or Wikipedia for information about a geographical location, country, infectious agent
        characteristics, transmission dynamics, infection states, or other epidemiological
        modeling efforts.
        Your final answer MUST be a description of sub-questions that explain the best model
        choices, model parameters, and config files for an agent-based modeling code base.
    """),
    verbose=True,
    allow_delegation=False,
    tools=[],  ###
    llm=llm,
)


In [28]:
#from langchain import hub
#prompt = hub.pull("hwchase17/self-ask-with-search")
#print(f'{prompt.format(agent_scratchpad = "AGENTSCRATCHPAD", input = "INPUT")}')

from langchain.tools.retriever import create_retriever_tool
from langchain.tools import DuckDuckGoSearchRun

CDC_retriever_tool = create_retriever_tool(
    CDC_retriever,
    "Intermediate Answer",
    """As an AI assistant you provide answers based on the given context, ensuring accuracy and briefness. 

        You always follow these guidelines:

        -If the answer isn't available within the context, state that fact
        -Otherwise, answer to your best capability, refering to source of documents provided
        -Only use examples if explicitly requested
        -Do not introduce examples outside of the context
        -Do not answer if context is absent
        -Limit responses to three or four sentences for clarity and conciseness
        
        Search for data related to outbreaks. For questions about outbreaks, use this tool to return 
        relevant data for answering questions about outbreaks""",
)

ABM_retriever_tool = create_retriever_tool(
    ABM_retriever,
    "Intermediate Answer",
    """As an AI assistant you provide answers based on the given context, ensuring accuracy and briefness. 

        You always follow these guidelines:

        -If the answer isn't available within the context, state that fact
        -Otherwise, answer to your best capability, refering to source of documents provided
        -Only use examples if explicitly requested
        -Do not introduce examples outside of the context
        -Do not answer if context is absent
        -Limit responses to three or four sentences for clarity and conciseness
        
        Search for data related to outbreaks. For questions about outbreaks, use this tool to return 
        relevant data for answering questions about outbreaks""",
)

web_search = DuckDuckGoSearchRun()

query_executor = Agent(
    role="""
        Agent Role: Information Searcher

        Primary Objectives:
        1. Utilize the CDC_retriever_tool to gather current outbreak information. This includes statistics, affected regions, and latest guidelines related to the outbreak.

        2. Employ the ABM_retriever_tool to access the codebase for agent-based modeling simulations. Extract relevant parameters and settings that are crucial for understanding the dynamics of the disease spread in the simulations.

        3. Conduct thorough internet searches using the web_search tool. Focus on disease-specific information such as modes of transmission, vectors involved, the role of asymptomatic infectious carriers, and insights from past modeling efforts.

        Key Responsibilities:
        - Ensure accurate and up-to-date information is retrieved from each tool.
        - Synthesize information from diverse sources to provide a comprehensive understanding of the disease and its impact.
        - Adhere to the principles of clarity and conciseness in reporting findings.
        """,
    goal="Information Searcher",
    backstory=textwrap.dedent("""
        Accept list of sub-questions from the query_planner agent and perform
        the necessary searches to answer the questions.
        Perform the tasks in the order given and report the result out.
        Your final answer MUST be a correct response to the original user-query.
    """),
    verbose=True,
    llm=llm,
    # tools=[SqlTools.do_sql_query, RagTools.do_rag_query],
    tools=[CDC_retriever_tool, ABM_retriever_tool, web_search],
    allow_delegation=True,
)

In [None]:
user_query = "What are the latest statistics on the current flavivirus outbreak from the CDC?"

task1 = Task(
    description=textwrap.dedent(f"""
        Your task is to retrieve the latest information on the current outbreak from the CDC database. 
        This includes key statistics such as number of cases, affected regions, and any new guidelines issued. 
        Prepare the necessary sub-queries or search queries to handle this user-query:
            {user_query}
        You should use the CDC_retriever_tool to access the relevant data.
    """),
    agent=query_executor
)

task2 = Task(
    description=textwrap.dedent(f"""
        You will receive a set of queries from the previous task. 
        Use the ABM_retriever_tool to query the agent-based modeling codebase for parameters 
        and settings related to disease spread simulation. 
        Your final answer must be a correct response to the original user-query:
            {user_query}
    """),
    agent=query_executor
)

task3 = Task(
    description=textwrap.dedent(f"""
        Conduct a comprehensive web search to gather detailed information on the infectious agent. 
        Include data on transmission methods, known vectors, the role of asymptomatic carriers, 
        and historical modeling efforts. Use the web_search tool to perform this task.
        Your findings should directly support the original user-query:
            {user_query}
    """),
    agent=query_executor
)

crew = Crew(
    agents=[query_planner, query_executor],
    tasks=[task1, task2, task3],
    verbose=2,  # print what tasks are being worked on, can set it to 1 or 2
    process=Process.sequential,
)

result = crew.kickoff()

print("######################")
print(result)

Working Agent: 
        Agent Role: Information Searcher

        Primary Objectives:
        1. Utilize the CDC_retriever_tool to gather current outbreak information. This includes statistics, affected regions, and latest guidelines related to the outbreak.

        2. Employ the ABM_retriever_tool to access the codebase for agent-based modeling simulations. Extract relevant parameters and settings that are crucial for understanding the dynamics of the disease spread in the simulations.

        3. Conduct thorough internet searches using the web_search tool. Focus on disease-specific information such as modes of transmission, vectors involved, the role of asymptomatic infectious carriers, and insights from past modeling efforts.

        Key Responsibilities:
        - Ensure accurate and up-to-date information is retrieved from each tool.
        - Synthesize information from diverse sources to provide a comprehensive understanding of the disease and its impact.
        - Adhere to 