From 3ba175450085b03d69ac1c68156c41d3ae843204 Mon Sep 17 00:00:00 2001 From: Zhiyu Wang <121875294+zhiyu-01@users.noreply.github.com> Date: Thu, 5 Oct 2023 20:00:52 +0800 Subject: [PATCH] feat: add web search (#274) Co-authored-by: Guohao Li --- .github/workflows/pytest_apps.yml | 4 + .github/workflows/pytest_package.yml | 6 + camel/functions/search_functions.py | 259 +++++++++++++++++++++++- camel/societies/role_playing.py | 1 - test/functions/test_search_functions.py | 30 ++- 5 files changed, 295 insertions(+), 5 deletions(-) diff --git a/.github/workflows/pytest_apps.yml b/.github/workflows/pytest_apps.yml index ffac4c78f..bf6c0fb57 100644 --- a/.github/workflows/pytest_apps.yml +++ b/.github/workflows/pytest_apps.yml @@ -26,6 +26,8 @@ jobs: - name: Run pytest env: OPENAI_API_KEY: "${{ secrets.OPENAI_API_KEY }}" + GOOGLE_API_KEY: "${{ secrets.GOOGLE_API_KEY }}" + SEARCH_ENGINE_ID: "${{ secrets.SEARCH_ENGINE_ID }}" run: poetry run pytest -v apps/ pytest_examples: @@ -41,4 +43,6 @@ jobs: - name: Run pytest env: OPENAI_API_KEY: "${{ secrets.OPENAI_API_KEY }}" + GOOGLE_API_KEY: "${{ secrets.GOOGLE_API_KEY }}" + SEARCH_ENGINE_ID: "${{ secrets.SEARCH_ENGINE_ID }}" run: poetry run pytest -v examples/ diff --git a/.github/workflows/pytest_package.yml b/.github/workflows/pytest_package.yml index 4ce22b9b2..883fe859c 100644 --- a/.github/workflows/pytest_package.yml +++ b/.github/workflows/pytest_package.yml @@ -25,6 +25,8 @@ jobs: - name: Run pytest env: OPENAI_API_KEY: "${{ secrets.OPENAI_API_KEY }}" + GOOGLE_API_KEY: "${{ secrets.GOOGLE_API_KEY }}" + SEARCH_ENGINE_ID: "${{ secrets.SEARCH_ENGINE_ID }}" run: poetry run pytest --fast-test-mode test/ pytest_package_llm_test: @@ -38,6 +40,8 @@ jobs: - name: Run pytest env: OPENAI_API_KEY: "${{ secrets.OPENAI_API_KEY }}" + GOOGLE_API_KEY: "${{ secrets.GOOGLE_API_KEY }}" + SEARCH_ENGINE_ID: "${{ secrets.SEARCH_ENGINE_ID }}" run: poetry run pytest --llm-test-only test/ pytest_package_very_slow_test: @@ -51,4 +55,6 @@ jobs: - 
import os
from typing import Any, Dict, List

# Upper bound, in seconds, for every outbound HTTP request made below, so a
# stalled server cannot block the caller indefinitely.
_REQUEST_TIMEOUT_SECONDS = 30


def _parse_google_items(
        search_items: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    r"""Convert raw Custom Search ``items`` into result dictionaries.

    Args:
        search_items (List[Dict[str, Any]]): The ``items`` array of a
            Custom Search JSON response.

    Returns:
        List[Dict[str, Any]]: One dictionary per item with the keys
            'result_id', 'title', 'description', 'long_description'
            and 'url'.
    """
    responses = []
    for i, search_item in enumerate(search_items, start=1):
        # "pagemap" / "metatags" may be missing from an item; fall back to
        # "N/A" instead of raising KeyError.
        pagemap = search_item.get("pagemap") or {}
        metatags = pagemap.get("metatags") or [{}]
        long_description = metatags[0].get("og:description", "N/A")
        responses.append({
            "result_id": i,
            "title": search_item.get("title"),
            "description": search_item.get("snippet"),
            "long_description": long_description,
            "url": search_item.get("link"),
        })
    return responses


def search_google(query: str) -> List[Dict[str, Any]]:
    r"""Use google search engine to search information for the given query.

    The API key and the search engine id are read from the environment
    variables ``GOOGLE_API_KEY`` and ``SEARCH_ENGINE_ID``.

    Args:
        query (string): The query to be searched.

    Returns:
        List[Dict[str, Any]]: A list of dictionaries where each dictionary
            represents a website.
            Each dictionary contains the following keys:
            - 'result_id': A number in order.
            - 'title': The title of the website.
            - 'description': A brief description of the website.
            - 'long_description': More detail of the website.
            - 'url': The URL of the website.

            Example:
            {
                'result_id': 1,
                'title': 'OpenAI',
                'description': 'An organization focused on ensuring that
                artificial general intelligence benefits all of humanity.',
                'long_description': 'OpenAI is a non-profit artificial
                intelligence research company. Our goal is to advance
                digital intelligence in the way that is most likely to
                benefit humanity as a whole',
                'url': 'https://www.openai.com'
            }

            If the request fails, or the response contains no "items",
            the list holds the single dictionary
            ``{"error": "google search failed."}``.
    """
    import requests

    # https://developers.google.com/custom-search/v1/overview
    GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
    # https://cse.google.com/cse/all
    SEARCH_ENGINE_ID = os.getenv("SEARCH_ENGINE_ID")

    # Doc: https://developers.google.com/custom-search/v1/using_rest
    # Passing the values through ``params`` lets requests URL-encode them,
    # so queries containing "&", "#", "+" or spaces are sent intact.
    params = {
        "key": GOOGLE_API_KEY,
        "cx": SEARCH_ENGINE_ID,
        "q": query,
        # Using the first page
        "start": 1,
        # Different language may get different result
        # NOTE(review): the API reference lists ``lr`` values in the form
        # "lang_xx"; confirm that "en" is honoured.
        "lr": "en",
        # How many results to return
        "num": 10,
    }
    failure = [{"error": "google search failed."}]

    try:
        result = requests.get(
            "https://www.googleapis.com/customsearch/v1",
            params=params,
            timeout=_REQUEST_TIMEOUT_SECONDS,
        )
        data = result.json()
    except (requests.RequestException, ValueError):
        # ValueError covers a non-JSON body on requests versions whose
        # JSON decode error is not a RequestException.
        return failure

    if not isinstance(data, dict) or "items" not in data:
        return failure
    return _parse_google_items(data["items"] or [])


def text_extract_from_web(url: str) -> str:
    r"""Get the text information from given url.

    Args:
        url (string): The web site you want to search.

    Returns:
        string: All texts extracted from the web page, with the non-empty
            fragments joined by ".". If the page cannot be requested, the
            string ``"can't access {url}"`` is returned instead.
    """
    import requests
    from bs4 import BeautifulSoup

    try:
        # Request the target page
        response_text = requests.get(
            url, timeout=_REQUEST_TIMEOUT_SECONDS).text
    except requests.RequestException:
        return f"can't access {url}"

    # Parse the obtained page
    soup = BeautifulSoup(response_text, features="html.parser")

    # Drop script and style elements, their content is not readable text
    for script in soup(["script", "style"]):
        script.extract()

    text = soup.get_text()
    # Strip every line, then break lines on runs of two spaces (separate
    # headlines sharing one line). Splitting on single spaces would glue
    # every word together with ".".
    lines = (line.strip() for line in text.splitlines())
    chunks = (phrase.strip() for line in lines
              for phrase in line.split("  "))
    return ".".join(chunk for chunk in chunks if chunk)


# Split a text into smaller chunks of size n
def create_chunks(text: str, n: int) -> List[str]:
    r"""Returns successive chunks of roughly n characters from the text.

    Each chunk preferably ends on a full stop or a newline found between
    0.8 * n and 1.2 * n characters after its start; when no such boundary
    exists the chunk is cut after exactly n characters. Concatenating the
    returned chunks gives back the original text.

    Args:
        text (string): The text to be split.
        n (int): The target length of a single chunk, must be positive.

    Returns:
        List[str]: A list of split texts.

    Raises:
        ValueError: If n is not positive (the loop could not advance).
    """
    if n <= 0:
        raise ValueError(f"n must be a positive integer, got {n}")

    longest = int(1.2 * n)
    shortest = int(0.8 * n)
    chunks = []
    i = 0
    while i < len(text):
        # Start from the longest allowed end and walk backwards to the
        # nearest end of sentence
        j = min(i + longest, len(text))
        while j > i + shortest:
            # Only the last character matters, no need to slice the chunk
            if text[j - 1] in ".\n":
                break
            j -= 1
        # If no end of sentence found, use n characters as the chunk size
        if j == i + shortest:
            j = min(i + n, len(text))
        chunks.append(text[i:j])
        i = j
    return chunks


def prompt_single_step_agent(prompt: str) -> str:
    r"""Prompt a single-step agent to summarize texts or answer a question.

    Args:
        prompt (string): The full prompt sent as the user message.

    Returns:
        string: The content of the agent's reply, or an empty string when
            the response carries no message.
    """
    # Imported at call time, like requests/bs4 above, so that importing
    # this module does not require importing the agent stack.
    import camel.agents
    from camel.messages import BaseMessage

    assistant_sys_msg = BaseMessage.make_assistant_message(
        role_name="Assistant",
        content="You are a helpful assistant.",
    )
    agent = camel.agents.ChatAgent(assistant_sys_msg)
    agent.reset()

    user_msg = BaseMessage.make_user_message(
        role_name="User",
        content=prompt,
    )
    assistant_response = agent.step(user_msg)
    # Truthiness covers both ``None`` and an empty message list
    if assistant_response.msgs:
        return assistant_response.msg.content
    return ""


def summarize_text(text: str, query: str) -> str:
    r"""Summarize the information from the text, based on the query.

    The text is split into chunks, each chunk is summarized by a separate
    agent call, and a final agent call answers the query from the
    collected summaries.

    Args:
        text (string): Text to summarise.
        query (string): What information you want.

    Returns:
        string: Strings with information.
    """
    from camel.prompts import TextPrompt

    summary_prompt = TextPrompt(
        "Gather information from this text that relative to the question, "
        "but do not directly answer the question.\n"
        "question: {query}\ntext ")
    summary_prompt = summary_prompt.format(query=query)
    # Max length of each chunk
    max_len = 3000
    # Summarize every chunk on its own
    summaries = [
        prompt_single_step_agent(summary_prompt + str(i) + ": " + chunk)
        for i, chunk in enumerate(create_chunks(text, max_len), start=1)
    ]
    results = "".join(summary + "\n" for summary in summaries)

    # Final summarise
    final_prompt = TextPrompt(
        "Here are some summarized texts which split from one text, Using "
        "the information to answer the question: {query}.\n\nText: ")
    final_prompt = final_prompt.format(query=query)

    return prompt_single_step_agent(final_prompt + results)


def search_google_and_summarize(query: str) -> str:
    r"""Search webs for information. Given a query, this function will use
    the google search engine to search for related information from the
    internet, and then return a summarized answer.

    Args:
        query (string): Question you want to be answered.

    Returns:
        string: Summarized information from webs, or
            "Failed to find the answer from google search." when no
            visited page yields an accepted answer.
    """
    from camel.prompts import TextPrompt

    # Google search will return a list of urls
    responses = search_google(query)
    for item in responses:
        url = item.get("url")
        # Skip error entries and results without a link instead of
        # trying to fetch the literal string "None"
        if not url:
            continue
        # Extract text
        text = text_extract_from_web(str(url))
        # Using chatgpt summarise text
        answer = summarize_text(text, query)

        # Let chatgpt decide whether to continue search or not
        prompt = TextPrompt(
            "Do you think the answer: {answer} can answer the query: "
            "{query}. Use only 'yes' or 'no' to answer.")
        prompt = prompt.format(answer=answer, query=query)
        reply = prompt_single_step_agent(prompt)
        if "yes" in str(reply).lower():
            return answer

    return "Failed to find the answer from google search."
# ---- camel/functions/search_functions.py (end of module) ----
from camel.functions import OpenAIFunction

# Search helpers exposed to the model as OpenAI functions.
SEARCH_FUNCS: List[OpenAIFunction] = [
    OpenAIFunction(func)
    for func in [search_wiki, search_google_and_summarize]
]

# ---- camel/societies/role_playing.py ----
# NOTE(review): the hunk in init_chat appears to only remove one blank line
# before the `user_msg = BaseMessage.make_user_message(...)` statement;
# confirm against the upstream commit.

# ---- test/functions/test_search_functions.py ----
import os

import requests
import wikipedia

from camel.functions.search_functions import (
    search_google_and_summarize,
    search_wiki,
)


def test_google_api():
    r"""Check that the Google Custom Search API accepts the credentials
    configured through GOOGLE_API_KEY and SEARCH_ENGINE_ID."""
    params = {
        # https://developers.google.com/custom-search/v1/overview
        "key": os.getenv("GOOGLE_API_KEY"),
        # https://cse.google.com/cse/all
        "cx": os.getenv("SEARCH_ENGINE_ID"),
        "q": "any",
    }
    # The timeout keeps a stalled connection from hanging the test run
    result = requests.get(
        "https://www.googleapis.com/customsearch/v1",
        params=params,
        timeout=30,
    )

    assert result.status_code == 200


def test_web_search():
    r"""Run the whole search-and-summarize pipeline on a real query."""
    query = "What big things are happening in 2023?"
    answer = search_google_and_summarize(query)

    # The function is annotated ``-> str``, so ``is not None`` alone could
    # never fail; require an actual non-blank string.
    assert isinstance(answer, str)
    assert answer.strip()