### Embedding Techniques
- convert text into vectors

In [1]:
import os
from dotenv import load_dotenv
import google.generativeai as genai

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
load_dotenv()

True

In [3]:
genai.configure(api_key=os.getenv("GEMINI_API_KEY"))

In [4]:
text = "This is sample text"

In [5]:
embedding = genai.embed_content(model="models/embedding-001", content=text, task_type="retrieval_document")


In [6]:
query_result = embedding["embedding"]
query_result

[0.050625753,
 -0.0024339894,
 -0.04872835,
 -0.010206068,
 0.06986839,
 0.038418252,
 0.009699713,
 -0.04497615,
 -0.00078940415,
 0.047394693,
 -0.025186023,
 0.021722592,
 -0.009819174,
 0.028439509,
 0.0038702094,
 -0.035920914,
 0.03151065,
 0.026032941,
 0.026091015,
 0.0017677632,
 0.01962293,
 0.024820734,
 -0.009300542,
 -0.016026067,
 0.02939369,
 -0.025137022,
 0.01096918,
 -0.053478535,
 -0.029003384,
 0.016148437,
 -0.039077062,
 0.020666763,
 -0.049324207,
 0.026817624,
 0.009205957,
 -0.037060764,
 -0.020062027,
 0.019268973,
 -0.016012175,
 -0.011289068,
 0.010853951,
 -0.011013122,
 -0.037337612,
 0.013172986,
 0.0052494104,
 0.011981663,
 -0.033013094,
 0.03326897,
 -0.001523896,
 -0.046574924,
 0.022282138,
 0.0033049088,
 0.04643832,
 -0.028201422,
 0.008139837,
 -0.041947536,
 0.060411073,
 -0.006315455,
 -0.019235875,
 0.00013669045,
 -0.018410916,
 0.022093149,
 0.034599893,
 0.009327793,
 -0.057329632,
 -0.07554439,
 -0.057555504,
 0.023352398,
 0.04373242,
 0.0

In [7]:
len(query_result)

768

In [8]:
from langchain_community.document_loaders import TextLoader

In [9]:
loader = TextLoader('speech.txt')
data = loader.load()
data

[Document(metadata={'source': 'speech.txt'}, page_content=' Scene text information extraction plays an important role in many computer vision applications.\n Most features in existing text extraction algorithms are only applicable to one text extraction stage (text\n detection or recognition), which signi cantly weakens the consistency in an end-to-end system, especially\n for the complex Chinese texts. \n To tackle this challenging problem, we propose a novel text structure feature\n extractor based on a text structure component detector (TSCD) layer and residual network for Chinese texts.\n Inspired by the three-layer Chinese text cognition model of a human, we combine the TSCD layer and\n the residual network to extract features suitable for both text extraction stages. The specialized modeling\n for Chinese characters in the TSCD layer simulates the key structure component cognition layer in the\n psychological model. And the residual mechanism in the residual network simulates the

In [10]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

txt_split = RecursiveCharacterTextSplitter(chunk_size= 300, chunk_overlap=50)
splited_text = txt_split.split_documents(data)
splited_text

[Document(metadata={'source': 'speech.txt'}, page_content='Scene text information extraction plays an important role in many computer vision applications.\n Most features in existing text extraction algorithms are only applicable to one text extraction stage (text'),
 Document(metadata={'source': 'speech.txt'}, page_content='detection or recognition), which signi cantly weakens the consistency in an end-to-end system, especially\n for the complex Chinese texts. \n To tackle this challenging problem, we propose a novel text structure feature'),
 Document(metadata={'source': 'speech.txt'}, page_content='extractor based on a text structure component detector (TSCD) layer and residual network for Chinese texts.\n Inspired by the three-layer Chinese text cognition model of a human, we combine the TSCD layer and'),
 Document(metadata={'source': 'speech.txt'}, page_content='the residual network to extract features suitable for both text extraction stages. The specialized modeling\n for Chines

In [11]:
from langchain_community.vectorstores import Chroma
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.schema import Document

In [12]:
import os
from dotenv import load_dotenv

load_dotenv()

True

In [13]:
os.environ['GOOGLE_API_KEY'] = os.getenv('GOOGLE_API_KEY')

In [14]:
from langchain_google_genai import GoogleGenerativeAIEmbeddings

embedding = GoogleGenerativeAIEmbeddings(model="models/embedding-001")


In [15]:
db = Chroma.from_documents(splited_text,embedding)

In [16]:
query = "Why did America decide to join the war despite being peaceful?"
retrived_result = db.similarity_search(query)
retrived_result

[Document(metadata={'source': 'speech.txt'}, page_content='There are, it may be, many months of fiery trial and sacrifice ahead of us. It is a fearful thing to lead this great peaceful people into war, into the most terrible and disastrous of all wars, civilization itself seeming to be in the balance.'),
 Document(metadata={'source': 'speech.txt'}, page_content='But the right is more precious than peace, and we shall fight for the things which we have always carried nearest our hearts—for democracy, for the right of those who submit to authority to have a voice in their own governments, for the rights and liberties of small nations, for a universal'),
 Document(metadata={'source': 'speech.txt'}, page_content='and liberties of small nations, for a universal dominion of right by such a concert of free peoples as shall bring peace and safety to all nations and make the world itself at last free.'),
 Document(metadata={'source': 'speech.txt'}, page_content='To such a task we can dedicate o

In [17]:
query1 = "Scene text information extraction plays an important role in many computer vision applications"
r1 = db.similarity_search(query1)
r1

[Document(metadata={'source': 'speech.txt'}, page_content='Scene text information extraction plays an important role in many computer vision applications.\n Most features in existing text extraction algorithms are only applicable to one text extraction stage (text'),
 Document(metadata={'source': 'speech.txt'}, page_content='layer and the residual network, the extracted features are applicable to both text detection and recognition,\n as humans do. \n In evaluation, both text detection and recognition models based on our proposed text structure'),
 Document(metadata={'source': 'speech.txt'}, page_content='detection or recognition), which signi cantly weakens the consistency in an end-to-end system, especially\n for the complex Chinese texts. \n To tackle this challenging problem, we propose a novel text structure feature'),
 Document(metadata={'source': 'speech.txt'}, page_content='the residual network to extract features suitable for both text extraction stages. The specialized modeli