### 文件加载（Document loaders）


In [15]:
import os
os.environ["OPENAI_API_KEY"] = "sk-xxx"
os.environ["OPENAI_API_BASE"] = "https://api.chatanywhere.tech/v1"
os.environ["OPENAI_API_MODEL"] = "gpt-4-turbo"

#### 加载 markdown

In [1]:
from langchain_community.document_loaders import TextLoader

loader = TextLoader("./index.md")
loader.load()

[Document(page_content='# This is index markdown\n## Test the Document Loader\n### The paragraph text', metadata={'source': './index.md'})]

#### 加载 pdf


In [8]:
! pip install pypdf
! pip install langchain_community
! pip install -U langchain-text-splitters

Collecting langchain-text-splitters
  Using cached langchain_text_splitters-0.0.1-py3-none-any.whl.metadata (2.0 kB)
Using cached langchain_text_splitters-0.0.1-py3-none-any.whl (21 kB)
Installing collected packages: langchain-text-splitters
Successfully installed langchain-text-splitters-0.0.1


In [13]:
from langchain_community.document_loaders import PyPDFLoader

loader = PyPDFLoader("test.pdf")
pages = loader.load_and_split()

print(len(pages))
print(pages)

32
[Document(page_content='易速鲜花集团 \n \n \n \n \n \n \n \n \n \n \n易速鲜花 服务中心  \n易速鲜花 股份有限公司  易\n速\n鲜\n花\n员\n工\n手\n册', metadata={'source': 'test.pdf', 'page': 0}), Document(page_content='易速鲜花集团 \n 1 董事长致辞  \n亲爱的同事：  \n您好！欢迎您加入 易速鲜花 旅游文化股份有限公司 ! \n我代表易速鲜花 对您的到来表示热烈的欢迎！并为公司拥有您\n这样优秀的员工而感到自豪和骄傲。 易速鲜花 将会因您的努力工作\n而稳步健康发展，将会因您的贡献而更加精彩。在此，我为您即将\n为易速鲜花 而付出的辛勤汗水表示诚挚的感谢！  \n易速鲜花 这个大家庭需要您、我、他每一位员工积极发扬“ 团\n结向上，完美无缺 ”的企业精神，以高度的主人翁责任感、使命感，\n与易速鲜花 同呼吸、共命 运，在各自的岗位上，勤奋敬业，尽职尽\n责，奋力拼搏。作为大家庭的一员，同仁之间应默契配合，相互接\n纳，取长补短，共同奋进，同舟共济。这样，我们就可以成为一个\n坚强的战斗堡垒， 在创 易速鲜花 发展的道路上披荆斩棘， 乘风破浪，\n无往不胜，所向披靡。最终以一流的服务、一流的管理、一流的信\n誉让我们的 易速鲜花 成为全国的一颗明珠。  \n希望各位以本手册为指南，共创 易速鲜花 美好明天 ! \n最后，诚挚地祝愿大家在公司工作愉快，前程似锦 ! \n \n \n                    签名：', metadata={'source': 'test.pdf', 'page': 1}), Document(page_content='易速鲜花集团 \n 2 易速鲜花 晨会宣言  \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n  \n今天 \n我开始新的生活  \n我要用全身心的爱迎接今天  \n我会用我全部的热情  \n关爱我的同事  \n热爱我们的公司  \n服务于我们的客户  \n忠于我们的工作  \n在努力工作中充实、完善，并超越自我  \n每天进步一点点  \n今天 \n我就付诸于行动！', metadata={'sou

### 文档分割


**为了使文档存储和解析更加方便和语义化，需要对文档进行分割**


工作原理：

- 把文本分成小块，语义上有意义的块(通常是句子)。

- 开始将这些小块组合成一个更大的块，直到达到一定的大小(通过某个函数衡量)。

- 一旦达到这个大小，就将该块作为自己的文本块，然后开始创建具有一些重叠的新文本块(以保持块之间的上下文)。


#### 按字符递归分割 RecursiveCharacterTextSplitter


In [14]:
! pip install -qU langchain-text-splitters

In [45]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# This is a long document we can split up.
with open("index.txt") as f:
    state_of_the_union = f.read()

text_splitter = RecursiveCharacterTextSplitter(
    separators=[
        "\n\n",
        "\n",
    ],
    chunk_size=10,
    chunk_overlap=2,
    length_function=len,
    is_separator_regex=False,
)

texts = text_splitter.create_documents([state_of_the_union])
print(len(texts))
print(texts)

11
[Document(page_content='这是长文本块'), Document(page_content='\n这里是: 第一段第一段第一段第一段第一段第一段第一段第一段第一段第一段第一段'), Document(page_content='\n这里是: 第二段第二段第二段第二段第二段第二段第二段第二段第二段第二段第二段'), Document(page_content='\n这里是: 第三段第三段第三段第三段第三段第三段第三段第三段第三段第三段第三段'), Document(page_content='\n这里是: 第四段第四段第四段第四段第四段第四段第四段第四段第四段第四段第四段'), Document(page_content='\n这里是: 第五段第五段第五段第五段第五段第五段第五段第五段第五段第五段第五段'), Document(page_content='\n这里是: 第六段第六段第六段第六段第六段第六段第六段第六段第六段第六段第六段'), Document(page_content='\n这里是: 第七段第七段第七段第七段第七段第七段第七段第七段第七段第七段第七段'), Document(page_content='\n这里是: 第八段第八段第八段第八段第八段第八段第八段第八段第八段第八段第八段'), Document(page_content='\n这里是: 第九段第九段第九段第九段第九段第九段第九段第九段第九段第九段第九段'), Document(page_content='\n这里是: 第十段第十段第十段第十段第十段第十段第十段第十段第十段第十段第十段')]


#### 按字符串分割

In [52]:
from langchain.text_splitter import CharacterTextSplitter

#加载要切分的文档
with open("test.txt") as f:
    zuizhonghuanxiang = f.read()

#初始化切分器
text_splitter = CharacterTextSplitter(
    separator="。",#切割的标志字符，默认是\n\n
    chunk_size=100,#切分的文本块大小，一般通过长度函数计算
    chunk_overlap=20,#切分的文本块重叠大小，一般通过长度函数计算
    length_function=len,#长度函数,也可以传递tokenize函数
    add_start_index=True,#是否添加起始索引
    is_separator_regex=False,#是否是正则表达式
)
text = text_splitter.create_documents([zuizhonghuanxiang])

print(len(text))
print(text[0])

Created a chunk of size 125, which is longer than the specified 100
Created a chunk of size 105, which is longer than the specified 100


38
page_content='蒂法介绍\n蒂法·洛克哈特(日语:ティファ・ロックハート，Tifa Rokkuhāto，英语:Tifa Lockhart)为电子游戏《最终幻想VII》及《最终幻想VII补完计划》相关作品中的虚构⻆ 色，由\U0010fc00村哲也创作和设计，此后也在多个游戏中客串登场' metadata={'start_index': 1}


#### 按语义块分割 Semantic Chunking


In [40]:
! pip install --quiet langchain_experimental langchain_openai

In [None]:
from langchain_experimental.text_splitter import SemanticChunker
# This is a long document we can split up.
with open("mit.txt") as f:
    state_of_the_union = f.read()
    
from langchain_openai.embeddings import OpenAIEmbeddings

text_splitter = SemanticChunker(OpenAIEmbeddings())
docs = text_splitter.create_documents([state_of_the_union])

print(docs[0].page_content)

#### 文档的简练、清晰

In [2]:
! pip install doctran
! pip install dotenv

Collecting dotenv
  Using cached dotenv-0.0.5.tar.gz (2.4 kB)
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Installing backend dependencies ... [?25lerror
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mpip subprocess to install backend dependencies[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m [31m[31 lines of output][0m
  [31m   [0m Collecting distribute
  [31m   [0m   Using cached distribute-0.7.3.zip (145 kB)
  [31m   [0m   Installing build dependencies: started
  [31m   [0m   Installing build dependencies: finished with status 'done'
  [31m   [0m   Getting requirements to build wheel: started
  [31m   [0m   Getting requirements to build wheel: finished with status 'done'
  [31m   [0m   Installing backend dependencies: started
  [31m   [0m   Installing backend dependencies: finished with status 'done'
  [31m   [0m   Preparing met

In [4]:
content = """[Generated with ChatGPT]

Confidential Document - For Internal Use Only

Date: July 1, 2023

Subject: Updates and Discussions on Various Topics

Dear Team,

I hope this email finds you well. In this document, I would like to provide you with some important updates and discuss various topics that require our attention. Please treat the information contained herein as highly confidential.

Security and Privacy Measures
As part of our ongoing commitment to ensure the security and privacy of our customers' data, we have implemented robust measures across all our systems. We would like to commend John Doe (email: john.doe@example.com) from the IT department for his diligent work in enhancing our network security. Moving forward, we kindly remind everyone to strictly adhere to our data protection policies and guidelines. Additionally, if you come across any potential security risks or incidents, please report them immediately to our dedicated team at security@example.com.

HR Updates and Employee Benefits
Recently, we welcomed several new team members who have made significant contributions to their respective departments. I would like to recognize Jane Smith (SSN: 049-45-5928) for her outstanding performance in customer service. Jane has consistently received positive feedback from our clients. Furthermore, please remember that the open enrollment period for our employee benefits program is fast approaching. Should you have any questions or require assistance, please contact our HR representative, Michael Johnson (phone: 418-492-3850, email: michael.johnson@example.com).

Marketing Initiatives and Campaigns
Our marketing team has been actively working on developing new strategies to increase brand awareness and drive customer engagement. We would like to thank Sarah Thompson (phone: 415-555-1234) for her exceptional efforts in managing our social media platforms. Sarah has successfully increased our follower base by 20% in the past month alone. Moreover, please mark your calendars for the upcoming product launch event on July 15th. We encourage all team members to attend and support this exciting milestone for our company.

Research and Development Projects
In our pursuit of innovation, our research and development department has been working tirelessly on various projects. I would like to acknowledge the exceptional work of David Rodriguez (email: david.rodriguez@example.com) in his role as project lead. David's contributions to the development of our cutting-edge technology have been instrumental. Furthermore, we would like to remind everyone to share their ideas and suggestions for potential new projects during our monthly R&D brainstorming session, scheduled for July 10th.

Please treat the information in this document with utmost confidentiality and ensure that it is not shared with unauthorized individuals. If you have any questions or concerns regarding the topics discussed, please do not hesitate to reach out to me directly.

Thank you for your attention, and let's continue to work together to achieve our goals.

Best regards,

Jason Fan
Cofounder & CEO
Psychic
jason@psychic.dev
"""

In [13]:
import os

OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
OPENAI_API_BASE = os.environ.get("OPENAI_API_BASE")
OPENAI_MODEL = "gpt-4-turbo"
OPENAI_TOKEN_LIMIT = 8000

from doctran import Doctran
doctrans = Doctran(
    openai_api_key=OPENAI_API_KEY,
    openai_model=OPENAI_MODEL,
    openai_token_limit=OPENAI_TOKEN_LIMIT,
)
documents = doctrans.parse(content=content)


In [8]:
#总结文档
summary = documents.summarize(token_limit=100).execute()
print(summary.transformed_content)

This internal document dated July 1, 2023, from Psychic's CEO Jason Fan, discusses updates on security measures, HR, marketing initiatives, and R&D projects. It highlights contributions from specific employees and reminds the team of upcoming events and policies. The document stresses the importance of confidentiality and encourages team collaboration to achieve company goals.


In [9]:
#翻译一下文档
translation = documents.translate(language="chinese").execute()
print(translation.transformed_content)

保密文件 - 仅供内部使用

日期：2023年7月1日

主题：关于各种话题的更新和讨论

亲爱的团队，

希望这封邮件找到你们时一切安好。在这份文件中，我想向你们提供一些重要的更新，并讨论一些需要我们关注的话题。请将此文件中包含的信息视为高度机密。

安全与隐私措施
作为我们持续承诺确保客户数据的安全与隐私，我们已在所有系统中实施了强有力的措施。我们想表扬IT部门的John Doe（电子邮件：john.doe@example.com）为提升我们网络安全所做的勤奋工作。展望未来，我们恳请大家严格遵守我们的数据保护政策和指南。此外，如果你发现任何潜在的安全风险或事件，请立即报告给我们的专门团队security@example.com。

人力资源更新和员工福利
最近，我们欢迎了几位新团队成员，他们已对各自的部门做出了重大贡献。我想表扬客户服务部门的Jane Smith（社会安全号码：049-45-5928）的杰出表现。Jane一直收到客户的正面反馈。此外，请记住，我们员工福利计划的开放注册期即将到来。如果你有任何问题或需要帮助，请联系我们的人力资源代表Michael Johnson（电话：418-492-3850，电子邮件：michael.johnson@example.com）。

市场营销倡议和活动
我们的市场团队一直在积极开发新策略，以提高品牌知名度和推动客户参与。我们想感谢Sarah Thompson（电话：415-555-1234）在管理我们社交媒体平台方面的卓越努力。Sarah在过去一个月内成功地将我们的粉丝基础增加了20%。此外，请在你的日历上标记即将到来的产品发布活动，日期为7月15日。我们鼓励所有团队成员参加并支持这一公司的激动人心的里程碑。

研发项目
在我们追求创新的过程中，我们的研发部门一直在不懈地进行各种项目的工作。我想表扬David Rodriguez（电子邮件：david.rodriguez@example.com）在担任项目负责人的角色中所做的杰出工作。David对开发我们尖端技术的贡献至关重要。此外，我们想提醒大家在我们每月一次的研发头脑风暴会议上分享你们的想法和建议，会议定于7月10日。

请将本文件中的信息视为最高机密，并确保不与未经授权的个人分享。如果你对讨论的话题有任何疑问或担忧，请随时直接联系我。

感谢你的关注，让我们继续

In [17]:

import json
from langchain_community.document_transformers import DoctranPropertyExtractor
from langchain_core.documents import Document
documents = [Document(page_content=content)]
properties = [
    {
        "name": "category",
        "description": "What type of email this is.",
        "type": "string",
        "enum": ["update", "action_item", "customer_feedback", "announcement", "other"],
        "required": True,
    },
    {
        "name": "mentions",
        "description": "A list of all people mentioned in this email.",
        "type": "array",
        "items": {
            "name": "full_name",
            "description": "The full name of the person mentioned.",
            "type": "string",
        },
        "required": True,
    },
    {
        "name": "eli5",
        "description": "Explain this email to me like I'm 5 years old.",
        "type": "string",
        "required": True,
    },
]
property_extractor = DoctranPropertyExtractor(properties=properties)
extracted_document = property_extractor.transform_documents(
    documents, properties=properties
)
print(json.dumps(extracted_document[0].metadata, indent=2))

{
  "extracted_properties": {
    "category": "update",
    "mentions": [
      "John Doe",
      "Jane Smith",
      "Michael Johnson",
      "Sarah Thompson",
      "David Rodriguez",
      "Jason Fan"
    ]
  }
}
