Data Ingestion - Document Loader

In [2]:
from langchain_community.document_loaders import TextLoader

# Read the plain-text file into LangChain documents. The encoding is passed
# explicitly so decoding does not depend on the platform's default locale
# encoding.
sample_text = TextLoader('lorem.txt', encoding='utf-8')
text_documents = sample_text.load()
text_documents


[Document(metadata={'source': 'lorem.txt'}, page_content='Lorem ipsum is typically a corrupted version of De finibus bonorum et malorum, \na 1st-century BC text by the Roman statesman and philosopher Cicero, with words altered, added, \nand removed to make it nonsensical and improper Latin. The first two words themselves are a truncation of dolorem ipsum ("pain itself").')]

In [8]:
from langchain_community.document_loaders import PyPDFLoader
# Load the PDF through PyPDFLoader and show the type of the first loaded item.
sample_pdf = PyPDFLoader('resume.pdf')
doc = sample_pdf.load()
first_loaded_item = doc[0]
type(first_loaded_item)

langchain_core.documents.base.Document

Web-based loader

In [10]:
from langchain_community.document_loaders import WebBaseLoader
import bs4

# Parser options are named up front so the loader call stays short.
# `parse_only` is given a SoupStrainer selecting the "page-wrapper" class;
# the text-extraction options join fragments with " | " and strip whitespace.
page_wrapper_strainer = bs4.SoupStrainer(class_="page-wrapper")
soup_options = {"parse_only": page_wrapper_strainer}
text_options = {"separator": " | ", "strip": True}

loader = WebBaseLoader(
    web_paths=['https://www.assistents.ai/'],
    bs_kwargs=soup_options,
    bs_get_text_kwargs=text_options,
)

loader.load()

[Document(metadata={'source': 'https://www.assistents.ai/'}, page_content="Home | Products | Use cases | Sales | Customer Prospecting | AI-powered outreach for discovering opportunities and building connections. | Lead Qualification | Instant AI analysis for enriching and engaging high-potential leads. | Sales Performance Optimization | AI-driven insights for strategy, prediction, and sales coaching. | Marketing | Targeted Account Campaigns | AI-precision ABM for hyper-relevant, high-converting campaigns. | Rapid Content Production | AI-assisted content creation across channels in minutes. | Global Market Expansion | Real-time AI translations for effortless market expansion. | Company | Customer Insight Analysis | 360-degree customer views with AI-powered predictive analytics. | Data Quality Management | Continuous AI data cleansing and enrichment across systems. | Business Process Integration | AI-orchestrated tech stack integration for optimized operations. | Get started | Setup 101 

In [None]:
from langchain_community.document_loaders import ArxivLoader

# Search arXiv for "reasoning" with load_max_docs=2. The optional
# `doc_content_chars_max` and `load_all_available_meta` arguments are
# intentionally not passed, so the loader's defaults apply.
arxiv_options = {"query": "reasoning", "load_max_docs": 2}
loader = ArxivLoader(**arxiv_options)
loader.load()

 Document(metadata={'Published': '2024-05-09', 'Title': 'Hypothesis Testing Prompting Improves Deductive Reasoning in Large Language Models', 'Authors': 'Yitian Li, Jidong Tian, Hao He, Yaohui Jin', 'Summary': 'Combining different forms of prompts with pre-trained large language models\nhas yielded remarkable results on reasoning tasks (e.g. Chain-of-Thought\nprompting). However, along with testing on more complex reasoning, these\nmethods also expose problems such as invalid reasoning and fictional reasoning\npaths. In this paper, we develop \\textit{Hypothesis Testing Prompting}, which\nadds conclusion assumptions, backward reasoning, and fact verification during\nintermediate reasoning steps. \\textit{Hypothesis Testing prompting} involves\nmultiple assumptions and reverses validation of conclusions leading to its\nunique correct answer. Experiments on two challenging deductive reasoning\ndatasets ProofWriter and RuleTaker show that hypothesis testing prompting not\nonly significant

In [16]:
from langchain_community.document_loaders import WikipediaLoader

# Query Wikipedia with load_max_docs=2 and keep the loaded documents.
docs = WikipediaLoader(query="HUNTER X HUNTER", load_max_docs=2).load()

# A bare `len(docs)` in the middle of a cell is evaluated and then discarded
# (only the last expression is displayed), so print the count explicitly.
print(len(docs))
docs

[Document(metadata={'title': 'Hunter × Hunter', 'summary': 'Hunter × Hunter (pronounced "hunter hunter") is a Japanese manga series written and illustrated by Yoshihiro Togashi. It has been serialized in Shueisha\'s shōnen manga magazine Weekly Shōnen Jump since March 1998, although the manga has frequently gone on extended hiatuses since 2006. Its chapters have been collected in 38 tankōbon volumes as of September 2024. The story focuses on a young boy named Gon Freecss who discovers that his father, who left him at a young age, is actually a world-renowned Hunter, a licensed professional who specializes in fantastical pursuits such as locating rare or unidentified animal species, treasure hunting, surveying unexplored enclaves, or hunting down lawless individuals. Gon departs on a journey to become a Hunter and eventually find his father. Along the way, Gon meets various other Hunters and encounters the paranormal.\nHunter × Hunter was adapted into a 62-episode anime television serie

##### How to recursively split text by characters

In [47]:
# Sample text used as input for the character-splitting demo in the next cells.
# NOTE(review): the last three lines of this string (`texts = ...` and the two
# `print(...)` calls) look like code pasted in by accident rather than prose.
# Confirm before removing them: the recorded chunk count and chunk listing in
# this notebook were produced with those lines included.
txt = """If you have tried doing any form of important work that requires text analysis, natural language processing, and machine learning, you will soon find that text splitting is either going to make your analysis very effective or worse than even if you had never gone down that road at all.

There are many different applications and use cases for this task but a more common hurdle you’ll run into is how to do this process of text splitting, most libraries have the chunk size and chunk overlap parameters to aid in this process, which is the subject of this article.
texts = text_splitter.create_documents([state_of_the_union])
print(texts[0])
print(texts[1])"""

In [48]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Splitter settings are named so they are easy to find and tune.
CHUNK_SIZE = 50
CHUNK_OVERLAP = 5

sample = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)
sample

<langchain_text_splitters.character.RecursiveCharacterTextSplitter at 0x11b07c6a0>

In [49]:
# Split the sample text into documents and report how many were produced.
texts = sample.create_documents([txt])
chunk_count = len(texts)
print(chunk_count)

18


In [51]:
# Show each chunk's length next to its text. Uses the built-in len() rather
# than calling the __len__ dunder directly; the printed output is the same.
for chunk in texts:
    print(len(chunk.page_content), chunk.page_content)

50 If you have tried doing any form of important work
41 work that requires text analysis, natural
46 language processing, and machine learning, you
48 you will soon find that text splitting is either
45 going to make your analysis very effective or
45 or worse than even if you had never gone down
22 down that road at all.
45 There are many different applications and use
48 use cases for this task but a more common hurdle
49 you’ll run into is how to do this process of text
45 text splitting, most libraries have the chunk
48 size and chunk overlap parameters to aid in this
42 this process, which is the subject of this
13 this article.
7 texts =
49 text_splitter.create_documents([state_of_the_unio
8 _union])
31 print(texts[0])
print(texts[1])


In [61]:
import json
import requests

from langchain_text_splitters import RecursiveJsonSplitter

# Download the LangSmith OpenAPI schema. The timeout keeps the cell from
# hanging on a stalled connection, and raise_for_status() surfaces an HTTP
# error directly instead of attempting to parse an error response as JSON.
response = requests.get("https://api.smith.langchain.com/openapi.json", timeout=30)
response.raise_for_status()
json_data = response.json()

# Split the nested schema into smaller dict chunks (max_chunk_size=300).
splitter = RecursiveJsonSplitter(max_chunk_size=300)
json_chunks = splitter.split_json(json_data)

# Preview the first three dict chunks.
for json_chunk in json_chunks[:3]:
    print(json_chunk)

# The same splitter can also output documents instead of dicts.
docs = splitter.create_documents(texts=[json_data])
for json_document in docs[:3]:
    print(json_document)


{'openapi': '3.1.0', 'info': {'title': 'LangSmith', 'version': '0.1.0'}, 'paths': {'/api/v1/sessions/{session_id}': {'get': {'tags': ['tracer-sessions'], 'summary': 'Read Tracer Session', 'description': 'Get a specific session.'}}}}
{'paths': {'/api/v1/sessions/{session_id}': {'get': {'operationId': 'read_tracer_session_api_v1_sessions__session_id__get', 'security': [{'API Key': []}, {'Tenant ID': []}, {'Bearer Auth': []}]}}}}
{'paths': {'/api/v1/sessions/{session_id}': {'get': {'parameters': [{'name': 'session_id', 'in': 'path', 'required': True, 'schema': {'type': 'string', 'format': 'uuid', 'title': 'Session Id'}}, {'name': 'include_stats', 'in': 'query', 'required': False, 'schema': {'type': 'boolean', 'default': False, 'title': 'Include Stats'}}, {'name': 'accept', 'in': 'header', 'required': False, 'schema': {'anyOf': [{'type': 'string'}, {'type': 'null'}], 'title': 'Accept'}}]}}}}
page_content='{"openapi": "3.1.0", "info": {"title": "LangSmith", "version": "0.1.0"}, "paths": {"/