In [1]:
import os
from langchain.document_loaders import UnstructuredURLLoader, UnstructuredPowerPointLoader, ReadTheDocsLoader, \
    PyPDFLoader
from langchain.llms import OpenAI
from langchain.chains.summarize import load_summarize_chain
from langchain.callbacks import get_openai_callback
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [2]:
from dotenv import load_dotenv
import os
from langchain_community.llms.cloudflare_workersai import CloudflareWorkersAI
from langchain_community.llms.tongyi import Tongyi
from langchain_openai import ChatOpenAI

# Load credentials from the local .env file. override=True is passed so that
# .env values take precedence over variables already set in the environment.
load_dotenv(override=True)


def _secret_status(name, value):
    """Return a log-safe line saying whether ``name`` is configured.

    Only the presence of ``value`` is reported, never the value itself,
    because cell output is stored inside the notebook file.
    """
    return f"{name}: {'set' if value else 'MISSING'}"


account_id = os.getenv('CF_ACCOUNT_ID')
api_token = os.getenv('CF_API_TOKEN')
# Do not print raw credentials: they would be saved with the notebook output.
print(_secret_status('CF_ACCOUNT_ID', account_id))
print(_secret_status('CF_API_TOKEN', api_token))

# Cloudflare Workers AI
model = '@cf/meta/llama-3-8b-instruct'
cf_llm = CloudflareWorkersAI(
    account_id=account_id,
    api_token=api_token,
    model=model
)

DASHSCOPE_API_KEY = os.getenv('DASHSCOPE_API_KEY')
print(_secret_status('DASHSCOPE_API_KEY', DASHSCOPE_API_KEY))

# Qwen via the Tongyi wrapper.
# NOTE(review): no key is passed here; presumably Tongyi reads
# DASHSCOPE_API_KEY from the environment - confirm.
qw_llm = Tongyi(
    model='qwen2-1.5b-instruct'
)

# Qwen through DashScope's OpenAI-compatible endpoint
qw_llm_openai = ChatOpenAI(
    openai_api_base='https://dashscope.aliyuncs.com/compatible-mode/v1',
    openai_api_key=DASHSCOPE_API_KEY,
    model_name="qwen2-1.5b-instruct",
    temperature=0,
    streaming=True,
    verbose=True,
)

api_key = os.getenv('OPENAI_API_KEY')
base_url = os.getenv('OPENAI_API_BASE')
print(_secret_status('OPENAI_API_KEY', api_key))
# The base URL is an endpoint address, not a credential, so it is shown as-is.
print(base_url)

# OpenAI-compatible client pointed at the configured base URL (Moonshot model)
ms_llm = ChatOpenAI(
    openai_api_base=base_url,
    openai_api_key=api_key,
    model_name="moonshot-v1-8k",
    temperature=0.7,
)

<redacted: CF_ACCOUNT_ID>
<redacted: CF_API_TOKEN>
<redacted: DASHSCOPE_API_KEY>
<redacted: OPENAI_API_KEY>
https://api.moonshot.cn/v1/


In [7]:
def summarize_docs(docs, doc_url, llm=None, chunk_size=1000, chunk_overlap=0):
    """Summarize documents with a map-reduce chain and print usage statistics.

    Args:
        docs: Non-empty list of loaded documents; each item must expose
            ``page_content`` and be accepted by the text splitter.
        doc_url: Label of the source, used only in progress/error messages.
        llm: Model handed to ``load_summarize_chain``. When omitted, the
            notebook-level ``qw_llm_openai`` is used (the original behavior).
        chunk_size: ``chunk_size`` passed to ``RecursiveCharacterTextSplitter``.
        chunk_overlap: ``chunk_overlap`` passed to the same splitter.

    Returns:
        The value returned by ``chain.invoke`` on the split documents.

    Raises:
        ValueError: If ``docs`` is empty, raised before any model call is made.
    """
    print(f'You have {len(docs)} document(s) in your {doc_url} data')
    if not docs:
        # Fail early with a readable message instead of an IndexError on docs[0].
        raise ValueError(f'No documents were loaded from {doc_url}; nothing to summarize')
    # Only the first document's length is reported, as before.
    print(f'There are {len(docs[0].page_content)} characters in your document')

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    split_docs = text_splitter.split_documents(docs)

    print(f'You have {len(split_docs)} split document(s)')

    if llm is None:
        # Resolved at call time so the cell that defines the model may run later.
        llm = qw_llm_openai
    chain = load_summarize_chain(llm, chain_type="map_reduce", verbose=False)

    # NOTE(review): the recorded run shows every counter at 0; presumably the
    # callback receives no usage data because the default model is built with
    # streaming=True - confirm before relying on these numbers.
    with get_openai_callback() as cb:
        response = chain.invoke(split_docs)
        print(f"Total Tokens: {cb.total_tokens}")
        print(f"Prompt Tokens: {cb.prompt_tokens}")
        print(f"Completion Tokens: {cb.completion_tokens}")
        print(f"Successful Requests: {cb.successful_requests}")
        print(f"Total Cost (USD): ${cb.total_cost}")

    return response

In [6]:
# Article to summarize, fetched through the proxy host.
url = "https://www.crab233.cloudns.biz/proxy/edition.cnn.com/2023/04/13/business/delta-earnings/index.html"
url_loader = UnstructuredURLLoader(urls=[url])
# Despite its name, `loader` holds the result of .load(), not the loader object.
loader = url_loader.load()

In [8]:
# Bare call as the last expression so the notebook displays the returned value.
summarize_docs(docs=loader, doc_url=url)

You have 1 document(s) in your https://www.crab233.cloudns.biz/proxy/edition.cnn.com/2023/04/13/business/delta-earnings/index.html data
There are 5159 characters in your document
You have 6 split document(s)
Total Tokens: 0
Prompt Tokens: 0
Completion Tokens: 0
Successful Requests: 0
Total Cost (USD): $0.0


{'input_documents': [Document(page_content="Video Ad Feedback\n\nAirlines warn they may have to cut flights unless this problem is solved\n\n02:26\n\nSource:\n                CNN\n\nTop business news\n\n16 videos\n\nVideo Ad Feedback\n\nAirlines warn they may have to cut flights unless this problem is solved\n\n02:26\n\nNow playing\n\nSource:\n                CNN\n\nVideo Ad Feedback\n\nHow to craft the perfect maternity leave out of office message\n\n01:33\n\nNow playing\n\nSource:\n                CNN\n\nVideo Ad Feedback\n\nCNN writer explains how Microsoft's new AI model works\n\n02:20\n\nNow playing\n\nSource:\n                CNN\n\nVideo Ad Feedback\n\nToyota sells only two electric vehicles. Executive insists they're not holding back\n\n02:21\n\nNow playing\n\nSource:\n                CNN\n\nVideo Ad Feedback\n\nAn implant in his brain lets him do incredible tasks with his thoughts\n\n05:37\n\nNow playing\n\nSource:\n                CNN\n\nVideo Ad Feedback\n\n'We're cooking ou