### faiss

- used for similarity serach and clustering of dense vectors. 
- can search in any size of set of vectores.
- evalution and parameter tuning

In [15]:
from langchain_community.document_loaders import TextLoader
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import OllamaEmbeddings
from langchain_text_splitters import CharacterTextSplitter

In [16]:
loader = TextLoader('speech.txt')
doc = loader.load()


In [17]:
splitt = CharacterTextSplitter(chunk_size=500, chunk_overlap=50)
spilted_text = splitt.split_documents(doc)
spilted_text

[Document(metadata={'source': 'speech.txt'}, page_content='Scene text information extraction plays an important role in many computer vision applications. Most features in existing text extraction algorithms are only applicable to one text extraction stage (text detection or recognition), which significantly weakens the consistency in an end-to-end system, especially for complex Chinese texts.'),
 Document(metadata={'source': 'speech.txt'}, page_content='To tackle this challenging problem, we propose a novel text structure feature extractor based on a Text Structure Component Detector (TSCD) layer and residual network for Chinese texts.\n\nInspired by the three-layer Chinese text cognition model of the human brain, we combine the TSCD layer and the residual network to extract features suitable for both text extraction stages:'),
 Document(metadata={'source': 'speech.txt'}, page_content='The TSCD layer specializes in modeling Chinese character structures and simulates the key structure 

In [18]:
embedding = OllamaEmbeddings(model="mxbai-embed-large")
db = FAISS.from_documents(spilted_text, embedding)
db

<langchain_community.vectorstores.faiss.FAISS at 0x1dddfe0ee90>

In [19]:
query = "Scene text information extraction plays an important role in many computer vision applications"
docs = db.similarity_search(query)
docs[0].page_content

'Scene text information extraction plays an important role in many computer vision applications. Most features in existing text extraction algorithms are only applicable to one text extraction stage (text detection or recognition), which significantly weakens the consistency in an end-to-end system, especially for complex Chinese texts.'

### As a retriever
- it will give same result but this allow us to easily use it in other langchain models

In [21]:
retriever = db.as_retriever()
doc = retriever.invoke(query)
doc[0].page_content

'Scene text information extraction plays an important role in many computer vision applications. Most features in existing text extraction algorithms are only applicable to one text extraction stage (text detection or recognition), which significantly weakens the consistency in an end-to-end system, especially for complex Chinese texts.'

### similarity search with score 

In [22]:
docs_with_score = db.similarity_search_with_score(query)
docs_with_score

[(Document(id='2f0bdcc6-9130-4112-b916-ec2846963e00', metadata={'source': 'speech.txt'}, page_content='Scene text information extraction plays an important role in many computer vision applications. Most features in existing text extraction algorithms are only applicable to one text extraction stage (text detection or recognition), which significantly weakens the consistency in an end-to-end system, especially for complex Chinese texts.'),
  np.float32(100.673164)),
 (Document(id='8b9ad776-25f9-4b7f-b3b9-4f9f917f0a5b', metadata={'source': 'speech.txt'}, page_content='Through the organic combination of the TSCD layer and residual network, the extracted features become applicable to both text detection and recognition, mirroring the human process of understanding written Chinese.'),
  np.float32(167.19125)),
 (Document(id='49596f12-4dd3-4aab-b06e-dbd0fdec6873', metadata={'source': 'speech.txt'}, page_content='To tackle this challenging problem, we propose a novel text structure feature e

In [23]:
embedding_vector = embedding.embed_query(query)
embedding_vector

[0.28076326847076416,
 -0.24275606870651245,
 0.001868419349193573,
 0.13699528574943542,
 0.302728533744812,
 0.1295846402645111,
 0.05562996119260788,
 -0.21711832284927368,
 0.6893871426582336,
 0.511511504650116,
 -0.21789324283599854,
 -0.15510010719299316,
 -0.13105428218841553,
 -0.8235628008842468,
 0.15347716212272644,
 -0.06356548517942429,
 0.2520473003387451,
 -0.13716191053390503,
 -0.009776942431926727,
 -0.1484605073928833,
 -0.20431703329086304,
 0.4694657623767853,
 -1.621290922164917,
 -0.6881332993507385,
 0.1487150937318802,
 0.3690752387046814,
 0.954238772392273,
 -0.18353277444839478,
 0.5916241407394409,
 0.7886762022972107,
 0.08883614093065262,
 -0.03459890931844711,
 0.06785620748996735,
 0.15212392807006836,
 0.14421845972537994,
 -0.001962020993232727,
 0.643288791179657,
 -0.8585948944091797,
 0.1605362892150879,
 -0.3036390542984009,
 0.10884937644004822,
 -0.0066560134291648865,
 0.534733772277832,
 -1.030842900276184,
 -0.7856374979019165,
 0.0363753587

In [24]:
docs_score = db.similarity_search_by_vector(embedding_vector)
docs_score

[Document(id='2f0bdcc6-9130-4112-b916-ec2846963e00', metadata={'source': 'speech.txt'}, page_content='Scene text information extraction plays an important role in many computer vision applications. Most features in existing text extraction algorithms are only applicable to one text extraction stage (text detection or recognition), which significantly weakens the consistency in an end-to-end system, especially for complex Chinese texts.'),
 Document(id='8b9ad776-25f9-4b7f-b3b9-4f9f917f0a5b', metadata={'source': 'speech.txt'}, page_content='Through the organic combination of the TSCD layer and residual network, the extracted features become applicable to both text detection and recognition, mirroring the human process of understanding written Chinese.'),
 Document(id='49596f12-4dd3-4aab-b06e-dbd0fdec6873', metadata={'source': 'speech.txt'}, page_content='To tackle this challenging problem, we propose a novel text structure feature extractor based on a Text Structure Component Detector (T

In [26]:
db.save_local("FAISS")

In [27]:
newdb = FAISS.load_local("FAISS", embedding,allow_dangerous_deserialization=True)
doc = newdb.similarity_search(query)
doc

[Document(id='2f0bdcc6-9130-4112-b916-ec2846963e00', metadata={'source': 'speech.txt'}, page_content='Scene text information extraction plays an important role in many computer vision applications. Most features in existing text extraction algorithms are only applicable to one text extraction stage (text detection or recognition), which significantly weakens the consistency in an end-to-end system, especially for complex Chinese texts.'),
 Document(id='8b9ad776-25f9-4b7f-b3b9-4f9f917f0a5b', metadata={'source': 'speech.txt'}, page_content='Through the organic combination of the TSCD layer and residual network, the extracted features become applicable to both text detection and recognition, mirroring the human process of understanding written Chinese.'),
 Document(id='49596f12-4dd3-4aab-b06e-dbd0fdec6873', metadata={'source': 'speech.txt'}, page_content='To tackle this challenging problem, we propose a novel text structure feature extractor based on a Text Structure Component Detector (T