基于 LLM 的爬虫数据抽取

In [1]:
# 基本配置
from langchain_openai import ChatOpenAI
import os
from dotenv import load_dotenv
from langchain_community.embeddings.cloudflare_workersai import CloudflareWorkersAIEmbeddings
from supabase.client import Client, create_client

# Pull settings from a local .env file; override=True lets the file's values
# replace variables that are already present in the process environment.
load_dotenv(override=True)

# Chat model accessed through the OpenAI-compatible client. The endpoint and
# the key are taken from the DASHSCOPE_API_BASE / DASHSCOPE_API_KEY variables.
qw_llm_openai = ChatOpenAI(
    model_name="qwen2-1.5b-instruct",
    openai_api_base=os.environ.get('DASHSCOPE_API_BASE'),
    openai_api_key=os.environ.get('DASHSCOPE_API_KEY'),
    streaming=True,
    temperature=0.7,
)
# embeddings = CloudflareWorkersAIEmbeddings(
#     account_id=os.getenv('CF_ACCOUNT_ID'),
#     api_token=os.getenv('CF_API_TOKEN'),
#     model_name="@cf/baai/bge-large-en-v1.5",
# )

# supabase_url = os.environ.get("SUPABASE_URL")
# supabase_key = os.environ.get("SUPABASE_SERVICE_KEY")
# 
# supabase: Client = create_client(supabase_url, supabase_key)

In [2]:
from langchain.prompts import PromptTemplate

# Prompt for the RSS example. `{query}` and `{requests_result}` are the two
# placeholders substituted at run time; doubled braces ({{ }}) are literal
# braces after formatting.
# The format example must itself be valid JSON (double-quoted strings, no
# trailing comma): the model tends to copy the example's syntax verbatim, and
# single-quoted output cannot be parsed with json.loads.
template = """Between >>> and <<< are the raw search result text from provided RSS URL.
Extract the answer to the question '{query}' or say "not found" if the information is not contained, and summarize all the information.
>>> {requests_result} <<<
Use the following JSON format to include all the titles:
{{
  "titles": [
    "aaa",
    "bbb"
  ]
}}
Extracted:"""

# Prompt object built from the template string above; the declared variables
# match the two placeholders that appear in it.
PROMPT = PromptTemplate(template=template, input_variables=["query", "requests_result"])

In [8]:
from langchain.chains.llm import LLMChain
from langchain_community.chains.llm_requests import LLMRequestsChain

# Pair the chat model with the prompt in an LLMChain, then wrap that in an
# LLMRequestsChain (the chain that is later invoked with a "url" input).
chain = LLMRequestsChain(
    llm_chain=LLMChain(
        prompt=PROMPT,
        llm=qw_llm_openai,
    ),
)

In [9]:
# The question put to the model, and the chain inputs: the question fills the
# prompt's `query` slot and `url` names the RSS feed to read.
question = "What are all the titles in this RSS feed?"
inputs = dict(
    query=question,
    url="https://rss.nytimes.com/services/xml/rss/nyt/US.xml",
)

In [11]:
# Run the chain on the prepared inputs and show the text stored under the
# "output" key of the result.
response = chain.invoke(inputs)
print(response["output"])

  k = self.parse_starttag(i)
2024-07-04 11:42:51,626:INFO - HTTP Request: POST https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions "HTTP/1.1 200 OK"


{
  "titles": [
    'Some Californians Found Dream Homes Inland. But It Sure Is Hot There.',
    'Biden’s Debate Rattles Even the Most Faithful Democrats',
    'A Plummeting Murder Rate Stuns Boston. But Can It Survive the Summer?',
    'Schools Got a Record $190 Billion in Pandemic Aid. Did It Work?',
    'Ex-Engineer Charged With Obstructing Inquiry Into Military Crash That Killed 16'
  ]
}


In [ ]:
# 第二个例子

In [12]:
# Prompt for the second example (quarterly shareholder table from Sina
# Finance). `{requests_result}` is the only placeholder; doubled braces
# ({{ }}) are literal braces after formatting.
# Both JSON examples must be valid JSON (no trailing comma after the last
# array element), otherwise the model is being shown an unparsable schema.
template = """在 >>> 和 <<< 之间是网页的返回的HTML内容。

网页是新浪财经A股上市公司的每季度股东信息表格。

请抽取参数请求的信息。每个截至日期作为JSON返回数据的date_of_quarter。因此，当表格中有多个截止日期时，返回数据应当包括所有的日期作为key。

>>> {requests_result} <<<
请使用如下的JSON格式返回数据
{{
  "date_of_quarter": [
    {{
      "holder_name": "a",
      "percentage": "50"
    }},
    {{
      "holder_name": "b",
      "percentage": "30"
    }}
  ]
}}

例如，截至日期为2023-03-31，JSON数据应该是如下形式:

{{
  "2023-03-31": [
    {{
      "holder_name": "a",
      "percentage": "50"
    }},
    {{
      "holder_name": "b",
      "percentage": "30"
    }}
  ]
}}
Extracted:"""

# Prompt object for the shareholder example; the template has a single
# placeholder, `requests_result`.
PROMPT = PromptTemplate(template=template, input_variables=["requests_result"])

In [13]:
# Rebuild the chain so that it uses the PROMPT currently bound in the notebook
# together with the same chat model.
chain = LLMRequestsChain(
    llm_chain=LLMChain(
        prompt=PROMPT,
        llm=qw_llm_openai,
    ),
)

In [14]:
# Chain inputs for the shareholder example: only the page URL is supplied
# (stock id 600519 appears in the path).
inputs = dict(
    url="https://vip.stock.finance.sina.com.cn/corp/go.php/vCI_StockHolder/stockid/600519/displaytype/30.phtml",
)

In [15]:
# Run the chain on the prepared inputs and show the text stored under the
# "output" key of the result.
response = chain.invoke(inputs)
print(response["output"])

2024-07-04 11:43:50,625:INFO - HTTP Request: POST https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions "HTTP/1.1 200 OK"


{
  "2024-03-31": [
    {
      "holder_name": "c",
      "percentage": "10"
    },
    {
      "holder_name": "d",
      "percentage": "5"
    }
  ],
  "2023-12-31": [
    {
      "holder_name": "e",
      "percentage": "20"
    },
    {
      "holder_name": "f",
      "percentage": "15"
    }
  ],
  "2023-09-30": [
    {
      "holder_name": "g",
      "percentage": "15"
    },
    {
      "holder_name": "h",
      "percentage": "10"
    }
  ]
}
