### Chroma 

In [2]:
from langchain_chroma import Chroma
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.embeddings import OllamaEmbeddings

In [3]:
# Load the raw text file; the cell output shows the result is a list of
# Document objects carrying the file path in metadata['source'].
loader = TextLoader("speech.txt")
doc = loader.load()
doc

[Document(metadata={'source': 'speech.txt'}, page_content='Scene text information extraction plays an important role in many computer vision applications. Most features in existing text extraction algorithms are only applicable to one text extraction stage (text detection or recognition), which significantly weakens the consistency in an end-to-end system, especially for complex Chinese texts.\n\nTo tackle this challenging problem, we propose a novel text structure feature extractor based on a Text Structure Component Detector (TSCD) layer and residual network for Chinese texts.\n\nInspired by the three-layer Chinese text cognition model of the human brain, we combine the TSCD layer and the residual network to extract features suitable for both text extraction stages:\n\nThe TSCD layer specializes in modeling Chinese character structures and simulates the key structure component cognition layer in the psychological model.\n\nThe residual mechanism in the residual network simulates the 

In [4]:
# Break the loaded document into overlapping chunks before embedding.
# chunk_size / chunk_overlap are measured by the splitter's length function
# (presumably characters by default — confirm in the splitter docs).
split = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=50,
)
text_split = split.split_documents(doc)
text_split

[Document(metadata={'source': 'speech.txt'}, page_content='Scene text information extraction plays an important role in many computer vision applications. Most features in existing text extraction algorithms are only applicable to one text extraction stage (text detection or recognition), which significantly weakens the consistency in an end-to-end system, especially for complex Chinese texts.'),
 Document(metadata={'source': 'speech.txt'}, page_content='To tackle this challenging problem, we propose a novel text structure feature extractor based on a Text Structure Component Detector (TSCD) layer and residual network for Chinese texts.\n\nInspired by the three-layer Chinese text cognition model of the human brain, we combine the TSCD layer and the residual network to extract features suitable for both text extraction stages:'),
 Document(metadata={'source': 'speech.txt'}, page_content='The TSCD layer specializes in modeling Chinese character structures and simulates the key structure 

In [5]:
# Embedding model object; it is handed to Chroma in later cells both when
# indexing the chunks and when reopening the persisted store.
# NOTE(review): presumably needs a running Ollama server with the
# "mxbai-embed-large" model available — confirm.
# NOTE(review): the cell output echoes this line, which looks like the tail
# of a warning (possibly a deprecation of the langchain_community class in
# favour of langchain_ollama) — confirm.
embedding = OllamaEmbeddings(model="mxbai-embed-large")

  embedding = OllamaEmbeddings(model="mxbai-embed-large")


In [9]:
# Embed every chunk and index it in a Chroma store. No persist_directory is
# passed here, so no explicit on-disk location is configured for this store.
vectordb = Chroma.from_documents(documents=text_split, embedding=embedding)
vectordb

<langchain_chroma.vectorstores.Chroma at 0x1ed8e3ec400>

In [11]:
# Query text reused by the search cells that follow.
query = (
    "Scene text information extraction plays an important role "
    "in many computer vision applications"
)
# NOTE: this rebinds `doc`, which previously held the loaded source
# documents, to the list of search hits.
doc = vectordb.similarity_search(query)
doc[0]

Document(id='ee542bf9-9201-4385-bf9a-13f246ddca94', metadata={'source': 'speech.txt'}, page_content='Scene text information extraction plays an important role in many computer vision applications. Most features in existing text extraction algorithms are only applicable to one text extraction stage (text detection or recognition), which significantly weakens the consistency in an end-to-end system, especially for complex Chinese texts.')

In [12]:
# Rebuild the index, this time writing it under ./chroma_db so it can be
# reopened later without re-embedding.
# NOTE(review): re-running this cell presumably adds the same chunks to the
# existing directory again (duplicate hits) — confirm, and clear the
# directory or supply stable ids if so.
vectordb = Chroma.from_documents(
    documents=text_split,
    embedding=embedding,
    persist_directory="./chroma_db",
)

In [13]:
# Reopen the persisted store from disk; the same embedding object must be
# supplied so the query is embedded the same way as the indexed chunks.
db2 = Chroma(persist_directory="./chroma_db", embedding_function=embedding)
docs = db2.similarity_search(query)
# Show the top hit from THIS search (`docs`). The original displayed
# `doc[0]`, a stale result from the earlier search against `vectordb`, so
# the reloaded store was never actually inspected.
docs[0].page_content

'Scene text information extraction plays an important role in many computer vision applications. Most features in existing text extraction algorithms are only applicable to one text extraction stage (text detection or recognition), which significantly weakens the consistency in an end-to-end system, especially for complex Chinese texts.'

In [14]:
# Same query, but each hit is returned as a (Document, score) tuple.
# In the output below the chunk that contains the query text has the
# smallest score and later hits score higher, so lower appears to mean a
# closer match (a distance rather than a similarity) — confirm in the docs.
# NOTE: rebinds `docs`, which held plain Document hits in the previous cell.
docs = vectordb.similarity_search_with_score(query)
docs

[(Document(id='f5c5e88c-0224-49b5-af38-7276d0b03f6a', metadata={'source': 'speech.txt'}, page_content='Scene text information extraction plays an important role in many computer vision applications. Most features in existing text extraction algorithms are only applicable to one text extraction stage (text detection or recognition), which significantly weakens the consistency in an end-to-end system, especially for complex Chinese texts.'),
  100.67315673828125),
 (Document(id='6230fa74-07eb-45bc-8083-7336d2112ad8', metadata={'source': 'speech.txt'}, page_content='Through the organic combination of the TSCD layer and residual network, the extracted features become applicable to both text detection and recognition, mirroring the human process of understanding written Chinese.'),
  167.1912841796875),
 (Document(id='77776e60-eb1d-423a-bb2a-6551815a6ab5', metadata={'source': 'speech.txt'}, page_content='To tackle this challenging problem, we propose a novel text structure feature extractor

In [17]:
# Wrap the vector store in a retriever (as_retriever called with no
# arguments) and display the first document it returns for the query.
retriever = vectordb.as_retriever()
retriever.invoke(query)[0]

Document(id='f5c5e88c-0224-49b5-af38-7276d0b03f6a', metadata={'source': 'speech.txt'}, page_content='Scene text information extraction plays an important role in many computer vision applications. Most features in existing text extraction algorithms are only applicable to one text extraction stage (text detection or recognition), which significantly weakens the consistency in an end-to-end system, especially for complex Chinese texts.')