In [1]:
from langchain_community.document_loaders import TextLoader

loader = TextLoader("./test.txt")
loader.load()

[Document(metadata={'source': './test.txt'}, page_content='Converting Your Documents into Text\nAs mentioned at the beginning of the chapter, the first step in preprocessing your document is to convert it to text. In order to achieve this, you would need to build logic to parse and extract the document with minimal loss of quality. Fortunately, LangChain provides document loaders that handle the parsing logic and enable you to “load” data from various sources into a Document class that consists of text and associated metadata.\n\nFor example, consider a simple .txt file. You can simply import a LangChain TextLoader class to extract the text, like this:\n')]

In [2]:
from langchain_community.document_loaders import WebBaseLoader

loader = WebBaseLoader("https://www.langchain.com/")
loader.load()

USER_AGENT environment variable not set, consider setting it to identify your requests.


[Document(metadata={'source': 'https://www.langchain.com/', 'title': 'LangChain', 'description': 'LangChain’s suite of products supports developers along each step of their development journey.', 'language': 'en'}, page_content="LangChain\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nProducts\n\nFrameworksLangGraphLangChainPlatformsLangSmithLangGraph PlatformResources\n\nResources HubBlogCustomer StoriesLangChain AcademyCommunityExpertsChangelogDocs\n\nPythonLangGraphLangSmithLangChainJavaScriptLangGraphLangSmithLangChainCompany\n\nAboutCareersPricing\n\nLangSmithLangGraph PlatformGet a demoSign up\n\n\n\n\n\n\n\n\n\n\n\n\nProducts\n\nFrameworksLangGraphLangChainPlatformsLangSmithLangGraph PlatformResources\n\nResources HubBlogCustomer StoriesLangChain AcademyCommunityExpertsChangelogDocs\n\nPythonLangGraphLangSmithLangChainJavaScriptLangGraphLangSmithLangChainCompany\n\nAboutCareersPricing\n\nLangSmithLangGraph PlatformGet a demoSign upThe platform for reliable agents. Tools for every step

In [3]:
from langchain_community.document_loaders import PyPDFLoader

loader = PyPDFLoader("./test.pdf")
pages = loader.load()


In [4]:
if pages:
    print(pages[0].metadata)

{'producer': 'Qt 5.15.2', 'creator': 'wkhtmltopdf 0.12.6', 'creationdate': '2023-01-31T11:10:39+00:00', 'title': '', 'source': './test.pdf', 'total_pages': 251, 'page': 0, 'page_label': '1'}


In [5]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)
splitted_docs = splitter.split_documents(pages)


In [6]:
print(splitted_docs[0])

page_content='id
UNITED	STATES
SECURITIES	AND	EXCHANGE	COMMISSION
Washington,	D.C.	20549
FORM	
10-K
(Mark	One)
☒
ANNUAL	REPORT	PURSUANT	TO	SECTION	13	OR	15(d)	OF	THE	SECURITIES	EXCHANGE	ACT	OF	1934
For	the	fiscal	year	ended	
December	31,	
2022
OR
☐
TRANSITION	REPORT	PURSUANT	TO	SECTION	13	OR	15(d)	OF	THE	SECURITIES	EXCHANGE	ACT	OF	1934
For	the	transition	period	from	_________	to	_________
Commission	File	Number:	
001-34756
Tesla,	Inc.
(Exact	name	of	registrant	as	specified	in	its	charter)
		
Delaware
	
91-2197729
(State	or	other	jurisdiction	of
incorporation	or	organization)
	
(I.R.S.	Employer
Identification	No.)
		
1	Tesla	Road
Austin
,	
Texas
	
	
78725
(Address	of	principal	executive	offices)
	
(Zip	Code)
(
512
)	
516-8177
(Registrant’s	telephone	number,	including	area	code)
Securities	registered	pursuant	to	Section	12(b)	of	the	Act:
	
Title	of	each	class
Trading	Symbol(s)
Name	of	each	exchange	on	which	registered
Common	stock
TSLA
The	Nasdaq	Global	Select	Market' metadata={'producer

In [7]:
print(splitted_docs[1])

page_content='Securities	registered	pursuant	to	Section	12(b)	of	the	Act:
	
Title	of	each	class
Trading	Symbol(s)
Name	of	each	exchange	on	which	registered
Common	stock
TSLA
The	Nasdaq	Global	Select	Market
	
Securities	registered	pursuant	to	Section	12(g)	of	the	Act:
None
Indicate	by	check	mark	whether	the	registrant	is	a	well-known	seasoned	issuer,	as	defined	in	Rule	405	of	the	Securities	Act.				
Yes
		
☒
				No		
☐
Indicate	by	check	mark	if	the	registrant	is	not	required	to	file	reports	pursuant	to	Section	13	or	15(d)	of	the	Act.				Yes		
☐
				
No
		
☒
Indicate	by	check	mark	whether	the	registrant	(1)	has	filed	all	reports	required	to	be	filed	by	Section	13	or	15(d)	of	the	Securities	Exchange	Act	of	1934	(“Exchange	Act”)
	
during	the	preceding	12	months	(or	for	such	shorter	period	that	the	registrant	was	required	to	file	such	reports),	and	(2)	has	been	subject	to	such	filing	requirements	for	the	past
	
90	days.				
Yes
		
☒
				No		
☐' metadata={'producer': 'Qt 5.15.2', 'creator': 

In [8]:
from langchain_ollama import OllamaEmbeddings
from langchain_core.documents import Document

embeddings_model = OllamaEmbeddings(model="nomic-embed-text")

embeddings = embeddings_model.embed_documents(
    [doc.page_content for doc in splitted_docs]
)

In [9]:
print (embeddings[0])

[-0.03872489, 0.023606237, -0.19438979, -0.03949202, 0.04539431, -0.00613678, 0.020779695, 0.07084683, 0.045518365, 0.0068996055, -0.02086316, 0.030896533, 0.064901434, -0.037871256, 0.009500222, -0.0302396, 0.028646145, -0.060046207, -0.021261252, 0.04803567, 0.016015876, -0.050683144, -0.051463425, -0.045333687, 0.12429878, 0.039487038, 0.011174643, -0.007039442, 0.029215865, -0.064363785, 0.00209143, 0.06566816, 0.014631624, 0.0015876871, -0.050272636, -0.020159226, 0.0131184785, 0.02331312, -0.035152283, -0.0573146, 0.07348488, 0.01648492, -0.022805637, -0.0069020195, 0.0057785125, 0.008063853, 0.04251616, -0.011403681, 0.07715545, -0.007819214, 0.03231097, 0.056330338, 0.03022923, 0.051684797, -0.03431308, -0.06478576, -0.01819964, 0.036908746, -0.0059801596, -0.009867489, 0.089598, 0.03381385, 0.041707966, -0.010945764, 0.061015774, -0.01347239, 0.013622403, 0.07687324, 0.016041297, -0.06611349, 0.06919098, 0.021729985, 0.059943125, 0.034874223, -0.012591499, -0.039678495, -0.021

In [11]:
from langchain_postgres.vectorstores import PGVector
connection = 'postgresql+psycopg://langchain:langchain@localhost:6024/langchain'
db = PGVector.from_documents(splitted_docs, embeddings_model, connection=connection)

In [12]:
db.similarity_search("financial balance", k=4)

[Document(id='68493dbd-1da8-4c60-9c63-cf71bbbef139', metadata={'page': 86, 'title': '', 'source': './test.pdf', 'creator': 'wkhtmltopdf 0.12.6', 'producer': 'Qt 5.15.2', 'page_label': '87', 'total_pages': 251, 'creationdate': '2023-01-31T11:10:39+00:00'}, page_content='funds\thave\tbeen\tpledged\tas\tcollateral\tfor\ttheir\tobligations.\nThe\taggregate\tcarrying\tvalues\tof\tthe\tVIEs’\tassets\tand\tliabilities,\tafter\telimination\tof\tany\tintercompany\ttransactions\tand\tbalances,\tin\tthe\n\t\nconsolidated\tbalance\tsheets\twere\tas\tfollows\t(in\tmillions):\n\t\n\t\n\t\nDecember\t31,\n\t\n\t\nDecember\t31,\n\t\n\t\n\t\n2022\n\t\n\t\n2021\n\t\nAssets\n\t\n\t\n\t\n\t\n\t\n\t\nCurrent\tassets\n\t\n\t\n\t\n\t\n\t\n\t\nCash\tand\tcash\tequivalents\n\t\n$\n68\n\t\n\t\n$\n87\n\t\nAccounts\treceivable,\tnet\n\t\n\t\n22\n\t\n\t\n\t\n24\n\t\nPrepaid\texpenses\tand\tother\tcurrent\tassets\n\t\n\t\n274\n\t\n\t\n\t\n152\n\t\nTotal\tcurrent\tassets\n\t\n\t\n364\n\t\n\t\n\t\n263\n\t\nSolar\tener