In [35]:
# Import Libraries

import spacy            
import torch     
import transformers           

# langchain

from langchain_chroma import Chroma # vector store
from langchain_community.document_loaders import TextLoader # load text /pdf is different
from langchain_text_splitters import RecursiveCharacterTextSplitter # text splitters are used to create text split for embeddings
from langchain_huggingface.embeddings import HuggingFaceEmbeddings # create embeddings


In [None]:
# Load the corpus
# Plain-text source file, resolved relative to the working directory.
text_data = "mosquitoes.txt"

# Build a UTF-8 text loader for that file, then read it. load() hands back
# a list of Document objects (a single one for this file).
langchain_object = TextLoader(text_data, encoding="utf-8")
data = langchain_object.load()

print(data)

[Document(metadata={'source': 'mosquitoes.txt'}, page_content="Mosquitoes, the Culicidae, are a family of small flies consisting of 3,600 species. The word mosquito is Spanish and Portuguese for little fly. Mosquitoes have a slender segmented body, one pair of wings, three pairs of long hair-like legs, and specialized, highly elongated, piercing-sucking mouthparts. All mosquitoes drink nectar from flowers; females of many species have adapted to also drink blood. The group diversified during the Cretaceous period. Evolutionary biologists view mosquitoes as micropredators, small animals that parasitise larger ones by drinking their blood without immediately killing them. Medical parasitologists view mosquitoes as vectors of disease, carrying protozoan parasites or bacterial or viral pathogens from one host to another.\n\nThe mosquito life cycle consists of four stages: egg, larva, pupa, and adult. Eggs are laid on the water surface; they hatch into motile larvae that feed on aquatic alg

In [15]:
# Print every loaded entry; the items in `data` are whole Documents, not tokens.
for document in data:
    print(document)

page_content='Mosquitoes, the Culicidae, are a family of small flies consisting of 3,600 species. The word mosquito is Spanish and Portuguese for little fly. Mosquitoes have a slender segmented body, one pair of wings, three pairs of long hair-like legs, and specialized, highly elongated, piercing-sucking mouthparts. All mosquitoes drink nectar from flowers; females of many species have adapted to also drink blood. The group diversified during the Cretaceous period. Evolutionary biologists view mosquitoes as micropredators, small animals that parasitise larger ones by drinking their blood without immediately killing them. Medical parasitologists view mosquitoes as vectors of disease, carrying protozoan parasites or bacterial or viral pathogens from one host to another.

The mosquito life cycle consists of four stages: egg, larva, pupa, and adult. Eggs are laid on the water surface; they hatch into motile larvae that feed on aquatic algae and organic material. These larvae are important

In [None]:
# Sentence segmentation with spaCy

# Load the small English pipeline.
nlp = spacy.load("en_core_web_sm")

# Read the raw corpus text straight from disk as UTF-8.
with open("mosquitoes.txt", mode="r", encoding="utf-8") as corpus_file:
    content = corpus_file.read()

# Run the pipeline over the full text.
doc = nlp(content)

# Print the length of each detected sentence, one number per line.
# NOTE(review): len() of a spaCy sentence span counts tokens, not characters.
for sent in doc.sents:
    sentence_length = len(sent)
    print(sentence_length)

16
11
32
18
8
22
24
18
22
23
35
11
20
20
24
16


In [26]:
# Same per-sentence lengths, now labelled with each sentence's position.
for index, sent in enumerate(doc.sents):
    token_count = len(sent)
    print(f"Index - {index}: length of sentence: {token_count}")

Index - 0: length of sentence: 16
Index - 1: length of sentence: 11
Index - 2: length of sentence: 32
Index - 3: length of sentence: 18
Index - 4: length of sentence: 8
Index - 5: length of sentence: 22
Index - 6: length of sentence: 24
Index - 7: length of sentence: 18
Index - 8: length of sentence: 22
Index - 9: length of sentence: 23
Index - 10: length of sentence: 35
Index - 11: length of sentence: 11
Index - 12: length of sentence: 20
Index - 13: length of sentence: 20
Index - 14: length of sentence: 24
Index - 15: length of sentence: 16


In [18]:
# Show the text of every sentence detected in the parsed document.
for sent in doc.sents:
    print(sent)

Mosquitoes, the Culicidae, are a family of small flies consisting of 3,600 species.
The word mosquito is Spanish and Portuguese for little fly.
Mosquitoes have a slender segmented body, one pair of wings, three pairs of long hair-like legs, and specialized, highly elongated, piercing-sucking mouthparts.
All mosquitoes drink nectar from flowers; females of many species have adapted to also drink blood.
The group diversified during the Cretaceous period.
Evolutionary biologists view mosquitoes as micropredators, small animals that parasitise larger ones by drinking their blood without immediately killing them.
Medical parasitologists view mosquitoes as vectors of disease, carrying protozoan parasites or bacterial or viral pathogens from one host to another.


The mosquito life cycle consists of four stages: egg, larva, pupa, and adult.
Eggs are laid on the water surface; they hatch into motile larvae that feed on aquatic algae and organic material.
These larvae are important food sources

In [None]:
# Recursive character text splitter

chunk_size = 300      # upper bound on the size of each chunk
chunk_percent = 0.2   # fraction of chunk_size shared by neighbouring chunks

# chunk_overlap is an integer count, but chunk_size * chunk_percent is a
# float (60.0). Round it to a whole number so the splitter receives the type
# it documents; round() also avoids truncating products such as 28.999...
chunk_overlap = round(chunk_size * chunk_percent)

# Splitter that cuts the loaded documents into overlapping chunks
splitters = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
)

# One Document per chunk, produced from the documents loaded into `data`.
split_doc = splitters.split_documents(data)

In [38]:
# List every chunk with its position and the length of its text.
for index, chunk in enumerate(split_doc):
    chunk_text = chunk.page_content
    print(f"{index}: {len(chunk_text)} :{chunk_text}")
    
    

0: 292 :Mosquitoes, the Culicidae, are a family of small flies consisting of 3,600 species. The word mosquito is Spanish and Portuguese for little fly. Mosquitoes have a slender segmented body, one pair of wings, three pairs of long hair-like legs, and specialized, highly elongated, piercing-sucking
1: 299 :legs, and specialized, highly elongated, piercing-sucking mouthparts. All mosquitoes drink nectar from flowers; females of many species have adapted to also drink blood. The group diversified during the Cretaceous period. Evolutionary biologists view mosquitoes as micropredators, small animals that
2: 284 :view mosquitoes as micropredators, small animals that parasitise larger ones by drinking their blood without immediately killing them. Medical parasitologists view mosquitoes as vectors of disease, carrying protozoan parasites or bacterial or viral pathogens from one host to another.
3: 298 :The mosquito life cycle consists of four stages: egg, larva, pupa, and adult. Eggs are lai

In [39]:
# Embedding model, referenced by its full Hugging Face Hub identifier.
embedding = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
)
                                 

'(ProtocolError('Connection aborted.', RemoteDisconnected('Remote end closed connection without response')), '(Request ID: 4c024680-f89f-4498-aa33-9af4ce8e7f90)')' thrown while requesting HEAD https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2/resolve/main/./modules.json
Retrying in 1s [Retry 1/5].


In [None]:
# Two short phrases whose embeddings will be compared.
myText = "mosquitoes are little insects that fly and buzz"
testText = "little insects"

# Separate embedding object used only for this comparison.
embedding1 = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

# embed_query turns one string into its embedding vector.
embd_num = embedding1.embed_query(myText)
embd_num1 = embedding1.embed_query(testText)


In [34]:
from sklearn.metrics.pairwise import cosine_similarity

# Each embedding is wrapped in a list so it is passed as a one-row matrix;
# the score of interest is the single entry at [0][0] of the result.
pairwise_scores = cosine_similarity([embd_num], [embd_num1])
similarity = pairwise_scores[0][0]

print("Cosine similarity:", similarity)


Cosine similarity: 0.6514923534687006


In [None]:
# Create the vector store
# Chroma is already imported in the notebook's first cell, so it is not
# re-imported here.

# Index the split chunks in Chroma using the embedding model.
vectorStore = Chroma.from_documents(
    documents=split_doc,
    embedding=embedding,
)

# The retriever option is spelled `search_kwargs`. The previous spelling,
# `search_kargs`, is not that parameter, so the requested k=3 was never
# applied to searches.
retriever = vectorStore.as_retriever(search_kwargs={"k": 3})