### **Pinecone Vector DB**

In [37]:
from pinecone import Pinecone

In [None]:
# No API key is written into the notebook; the client is built without an explicit key.
# NOTE(review): presumably the SDK then reads the key from the PINECONE_API_KEY
# environment variable — confirm before running.
pc = Pinecone()

In [39]:
# Name of the Pinecone index that the cells below check for and create
index_name = "furr-bot"

In [40]:
# Create the index with an integrated embedding model on the first run;
# on later runs the existing index is left untouched.
if pc.has_index(index_name):
    print("Index already exists")
else:
    embed_config = {
        "model": "llama-text-embed-v2",
        # NOTE(review): presumably maps the model's "text" input to the records'
        # "chunk_text" field — confirm against the Pinecone docs.
        "field_map": {"text": "chunk_text"},
    }
    pc.create_index_for_model(
        name=index_name,
        cloud="aws",
        region="us-east-1",
        embed=embed_config,
    )
    print("Index created")

Index created


In [41]:
pc.list_indexes()

[
    {
        "name": "hotelbookings",
        "metric": "cosine",
        "host": "hotelbookings-tqz6la2.svc.aped-4627-b74a.pinecone.io",
        "spec": {
            "serverless": {
                "cloud": "aws",
                "region": "us-east-1"
            }
        },
        "status": {
            "ready": true,
            "state": "Ready"
        },
        "vector_type": "dense",
        "dimension": 384,
        "deletion_protection": "disabled",
        "tags": null
    },
    {
        "name": "furr-bot",
        "metric": "cosine",
        "host": "furr-bot-tqz6la2.svc.aped-4627-b74a.pinecone.io",
        "spec": {
            "serverless": {
                "cloud": "aws",
                "region": "us-east-1"
            }
        },
        "status": {
            "ready": true,
            "state": "Ready"
        },
        "vector_type": "dense",
        "dimension": 1024,
        "deletion_protection": "disabled",
        "tags": null,
        "embed": {
  

Let's Prepare the Docs to upload

In [42]:
# Directory that is scanned for the .json data files
data_path = "vector_database/text_data"

In [43]:
import os
# Collect the files ending with .json.
# os.listdir() returns names in arbitrary order, so the result is sorted to keep
# the file order (and everything derived from it) the same on every run.
file_names = sorted(file for file in os.listdir(data_path) if file.endswith(".json"))

In [44]:
file_names[:4]

['Bees.json', 'Birds.json', 'Cats.json', 'Chicken and Ducks.json']

In [45]:
import json
# Load the first data file so that its structure can be inspected

sample_path = os.path.join(data_path, file_names[0])
with open(sample_path, "r") as sample_file:
    sample = json.load(sample_file)

In [46]:
sample[0]

{'text': 'beekeeping 101 | petmd petmd.com /farm-animal/beekeeping-101 there are 17,000 species of bees in the world and around 80 million beehives. there are 115-125,000 beekeepers in the united states and beekeeping has been around since 15,000 bc. honeybees are integral to agriculture and crops. a third of the diet in the u.s. comes from insect-pollinated plants alone and 80% of those crops are pollinated by honeybees. these crops include blueberries, cranberries, apples, pumpkins, and peaches. 90% of all wild plants depend on animal pollination. without bees, these crops would not thrive. there are other pollinators, like insects, birds, and bats, but honeybees can be moved wherever they need to pollinate. in addition, without honeybees, it is costly to produce certain fruits, legumes, vegetables, nuts, and seeds.',
 'topic': 'Beekeeping 101 | PetMD',
 'blog_name': 'Beekeeping 101 _ PetMD',
 'pet': 'Bees',
 'word_count': 125,
 'chunk_size': 162}

In [47]:
# Shape of one record as it is upserted to Pinecone (illustrative values only)
sample_format = [
    dict(_id="rec1", chunk_text="This is a sample text", category="cat1"),
]

In [48]:
# Create namespaces from the file names
# -- These will be the actual namespaces in the index
# -- Drop only the trailing file extension (the files were selected by their
#    ".json" suffix), so a ".json" occurring elsewhere in a name is kept
# -- Lower case and replace spaces with underscores

namespaces = [os.path.splitext(file_name)[0].lower().replace(" ", "_") for file_name in file_names]
namespaces[:4]

['bees', 'birds', 'cats', 'chicken_and_ducks']

Create data to upsert

In [49]:
# Build the upsert payload: one list of records per namespace.
data_dict = {}
for namespace, file_name in zip(namespaces, file_names):
    # Load every chunk stored in this namespace's JSON file.
    # A dedicated name is used so the `sample` inspected earlier is not overwritten.
    with open(os.path.join(data_path, file_name), "r") as f:
        chunks = json.load(f)

    # One record per chunk. "_id" is the chunk's position within its own file,
    # so ids restart at rec_0 for every namespace. The chunk's "text" is stored
    # under "chunk_text"; the remaining fields are copied over unchanged.
    data_dict[namespace] = [
        {
            "_id": f"rec_{position}",
            "chunk_text": chunk["text"],
            "pet": chunk["pet"],
            "topic": chunk["topic"],
            "blog_name": chunk["blog_name"],
            "word_count": chunk["word_count"],
            "chunk_size": chunk["chunk_size"]
        }
        for position, chunk in enumerate(chunks)
    ]

In [50]:
# Report how many records each namespace holds
for namespace_key in data_dict:
    print(f"|| {namespace_key} || --> {len(data_dict[namespace_key])} entries")

|| bees || --> 8 entries
|| birds || --> 884 entries
|| cats || --> 966 entries
|| chicken_and_ducks || --> 107 entries
|| chinchillas || --> 268 entries
|| dogs || --> 4949 entries
|| ferrets || --> 495 entries
|| fish || --> 666 entries
|| gerbils || --> 137 entries
|| guinea_pigs || --> 353 entries
|| hamsters || --> 339 entries
|| horses || --> 714 entries
|| pigs || --> 13 entries
|| prairie_dogs || --> 79 entries
|| rabbits || --> 842 entries
|| rats || --> 256 entries
|| reptiles || --> 119 entries
|| sugar_gliders || --> 31 entries


### Upsert to Index

In [51]:
pc

<pinecone.control.pinecone.Pinecone at 0x27d869df850>

In [52]:
# Open a handle to the "furr-bot" index by addressing it through its host URL
index = pc.Index(host="https://furr-bot-tqz6la2.svc.aped-4627-b74a.pinecone.io")

In [53]:
index

<pinecone.data.index.Index at 0x27d88ea3550>

In [None]:
from tqdm import tqdm
from itertools import islice

def batch_records(records, batch_size):
    """Yield successive batches from the records list.

    Args:
        records: Any iterable of records; it is consumed once, in order.
        batch_size: Maximum number of records per batch. Must be at least 1.

    Yields:
        Lists holding up to ``batch_size`` records each. Only the last list can
        be shorter. Nothing is yielded when ``records`` is empty.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1. Because this is a
            generator, the error is raised when iteration starts.
    """
    if batch_size < 1:
        # islice(it, 0) always produces an empty batch, which would end the loop
        # below immediately and silently drop every record.
        raise ValueError(f"batch_size must be at least 1, got {batch_size}")
    it = iter(records)
    while True:
        batch = list(islice(it, batch_size))
        if not batch:
            break
        yield batch

# Walk every namespace and send its records to the index, at most 96 records per request
for namespace_name, namespace_records in tqdm(data_dict.items()):
    for records_chunk in batch_records(namespace_records, 96):
        index.upsert_records(namespace_name, records_chunk)
    print(f"Up-serted {len(namespace_records)} records to namespace {namespace_name}")

In [54]:
print(namespaces)

['bees', 'birds', 'cats', 'chicken_and_ducks', 'chinchillas', 'dogs', 'ferrets', 'fish', 'gerbils', 'guinea_pigs', 'hamsters', 'horses', 'pigs', 'prairie_dogs', 'rabbits', 'rats', 'reptiles', 'sugar_gliders']


In [55]:
# Upload Bees
# The confirmation is printed once, after all batches have been sent
# (not once per batch).
for batch in batch_records(data_dict["bees"], 96):
    index.upsert_records(
        "bees",
        batch
    )
print("Up-serted Bees records")

Up-serted Bees records


In [57]:
# Send the "birds" records to their namespace, at most 96 records per request
for records_chunk in batch_records(data_dict["birds"], 96):
    index.upsert_records("birds", records_chunk)
print("Up-serted Birds records")

Up-serted Birds records


In [58]:
# Send the "cats" records to their namespace, at most 96 records per request
for records_chunk in batch_records(data_dict["cats"], 96):
    index.upsert_records("cats", records_chunk)
print("Up-serted Cats records")

Up-serted Cats records


In [60]:
# Send the "chicken_and_ducks" records to their namespace, at most 96 records per request
for records_chunk in batch_records(data_dict["chicken_and_ducks"], 96):
    index.upsert_records("chicken_and_ducks", records_chunk)
print("Up-serted Chickens and Ducks records")

Up-serted Chickens and Ducks records


In [61]:
# Send the "chinchillas" records to their namespace, at most 96 records per request
for records_chunk in batch_records(data_dict["chinchillas"], 96):
    index.upsert_records("chinchillas", records_chunk)
print("Up-serted Chinchillas records")

Up-serted Chinchillas records


In [62]:
# Send the "ferrets" records to their namespace, at most 96 records per request
for records_chunk in batch_records(data_dict["ferrets"], 96):
    index.upsert_records("ferrets", records_chunk)
print("Up-serted Ferrets records")

Up-serted Ferrets records


In [63]:
# Send the "fish" records to their namespace, at most 96 records per request
for records_chunk in batch_records(data_dict["fish"], 96):
    index.upsert_records("fish", records_chunk)
print("Up-serted Fish records")

Up-serted Fish records


In [68]:
# Upload Gerbils
# Report (once) whether any gerbil record carries an empty chunk_text
if any(record["chunk_text"] == "" for record in data_dict["gerbils"]):
    print("Empty chunk_text found in gerbils")

# Keep only the records whose chunk_text is not empty
data_dict["gerbils"] = [record for record in data_dict["gerbils"] if record["chunk_text"] != ""]

Empty chunk_text found in gerbils


In [69]:
# Send the "gerbils" records to their namespace, at most 96 records per request
for records_chunk in batch_records(data_dict["gerbils"], 96):
    index.upsert_records("gerbils", records_chunk)
print("Up-serted Gerbils records")

Up-serted Gerbils records


In [70]:
# Send the "guinea_pigs" records to their namespace, at most 96 records per request
for records_chunk in batch_records(data_dict["guinea_pigs"], 96):
    index.upsert_records("guinea_pigs", records_chunk)
print("Up-serted Guinea Pigs records")

Up-serted Guinea Pigs records


In [71]:
# Send the "hamsters" records to their namespace, at most 96 records per request
for records_chunk in batch_records(data_dict["hamsters"], 96):
    index.upsert_records("hamsters", records_chunk)
print("Up-serted Hamsters records")

Up-serted Hamsters records


In [72]:
# Send the "horses" records to their namespace, at most 96 records per request
for records_chunk in batch_records(data_dict["horses"], 96):
    index.upsert_records("horses", records_chunk)
print("Up-serted Horses records")

Up-serted Horses records


In [73]:
# Send the "pigs" records to their namespace, at most 96 records per request
for records_chunk in batch_records(data_dict["pigs"], 96):
    index.upsert_records("pigs", records_chunk)
print("Up-serted Pigs records")

Up-serted Pigs records


In [74]:
# Send the "prairie_dogs" records to their namespace, at most 96 records per request
for records_chunk in batch_records(data_dict["prairie_dogs"], 96):
    index.upsert_records("prairie_dogs", records_chunk)
print("Up-serted Prairie Dogs records")

Up-serted Prairie Dogs records


In [75]:
# Send the "rabbits" records to their namespace, at most 96 records per request
for records_chunk in batch_records(data_dict["rabbits"], 96):
    index.upsert_records("rabbits", records_chunk)
print("Up-serted Rabbits records")

Up-serted Rabbits records


In [97]:
# Send the "rats" records to their namespace, at most 96 records per request
for records_chunk in batch_records(data_dict["rats"], 96):
    index.upsert_records("rats", records_chunk)
print("Up-serted Rats records")

Up-serted Rats records


In [76]:
# Send the "reptiles" records to their namespace, at most 96 records per request
for records_chunk in batch_records(data_dict["reptiles"], 96):
    index.upsert_records("reptiles", records_chunk)
print("Up-serted Reptiles records")

Up-serted Reptiles records


In [77]:
# Send the "sugar_gliders" records to their namespace, at most 96 records per request
for records_chunk in batch_records(data_dict["sugar_gliders"], 96):
    index.upsert_records("sugar_gliders", records_chunk)
print("Up-serted Sugar Gliders records")

Up-serted Sugar Gliders records


Upload Dogs Data

Since Pinecone limits how much can be upserted per minute, we will divide the dogs data into slices of 1000 records and upload them one slice at a time.

In [79]:
len(data_dict.get('dogs'))

4949

In [81]:
# Divide the dogs records into consecutive slices of at most DOGS_SLICE_SIZE
# records; each slice is uploaded separately. Only the last slice can be shorter.
DOGS_SLICE_SIZE = 1000
# Index directly so that a missing "dogs" key fails with a clear KeyError
dogs = data_dict['dogs']
dogs_batches = [dogs[i:i + DOGS_SLICE_SIZE] for i in range(0, len(dogs), DOGS_SLICE_SIZE)]

In [85]:
len(dogs_batches[0])

1000

In [86]:
# First slice of the dogs records -> "dogs" namespace, at most 96 records per request
for records_chunk in batch_records(dogs_batches[0], 96):
    index.upsert_records("dogs", records_chunk)
print("Up-serted Dogs records batch 1")

Up-serted Dogs records batch 1


In [87]:
# Second slice of the dogs records -> "dogs" namespace, at most 96 records per request
for records_chunk in batch_records(dogs_batches[1], 96):
    index.upsert_records("dogs", records_chunk)
print("Up-serted Dogs records batch 2")

Up-serted Dogs records batch 2


In [88]:
# Third slice of the dogs records -> "dogs" namespace, at most 96 records per request
for records_chunk in batch_records(dogs_batches[2], 96):
    index.upsert_records("dogs", records_chunk)
print("Up-serted Dogs records batch 3")

Up-serted Dogs records batch 3


In [89]:
# Fourth slice of the dogs records -> "dogs" namespace, at most 96 records per request
for records_chunk in batch_records(dogs_batches[3], 96):
    index.upsert_records("dogs", records_chunk)
print("Up-serted Dogs records batch 4")

Up-serted Dogs records batch 4


In [94]:
# Fifth slice of the dogs records -> "dogs" namespace, at most 96 records per request
for records_chunk in batch_records(dogs_batches[4], 96):
    index.upsert_records("dogs", records_chunk)
print("Up-serted Dogs records batch 5")

Up-serted Dogs records batch 5


In [98]:
# Add up the number of data points held across all namespaces in data_dict
total = sum(len(points) for points in data_dict.values())
print(f"Total number of data points uploaded: {total}")

Total number of data points uploaded: 11225


**Data upsert successful**

---
### **Querying**

In [None]:
# Reuse the existing index handle; create one only when this session has none
# (for example, when the upsert section was skipped after a kernel restart).
# The lookup goes through globals() so that an undefined `index` does not raise NameError.
if globals().get("index") is None:
    index = pc.Index(host="https://furr-bot-tqz6la2.svc.aped-4627-b74a.pinecone.io")

index 

<pinecone.data.index.Index at 0x27d88ea3550>

In [125]:
import time 

# perf_counter() is the clock intended for measuring elapsed time; time.time()
# follows the wall clock and can jump if the system time is adjusted.
s = time.perf_counter()
# Search the "dogs" namespace with a text query: fetch the top 10 matches, then
# rerank on "chunk_text" and keep the top 5. Only "chunk_text" is requested back.
results = index.search(
    namespace="dogs", 
    query={
        "inputs": {"text": "How do i train my shihtzu to stop barking?"}, 
        "top_k": 10
    },
    rerank = {
        "model": "bge-reranker-v2-m3", 
        "top_n": 5, 
        "rank_fields": ["chunk_text"]
    },
    fields=["chunk_text"]
)

print(f"Took {time.perf_counter() - s} seconds")

Took 0.5260226726531982 seconds


In [135]:
# results.result.hits

In [133]:
# Show only the chunk text of each returned hit
for hit in results.result.hits:
    hit_fields = hit.get('fields')
    print(hit_fields.get('chunk_text'))

caring for a shih tzu shih tzu are pleasant and energetic companions. they are generally happy and sociable with people—and other animals—of all ages and prefer to not spend their time alone. like all dogs, shih tzu puppies should be socialized at a young age to avoid any anxiety. otherwise, they might display bad behaviors such as barking excessively or digging in your back yard. all dogs, including the shih tzu, require exercise, but this breed is often satisfied with a 30-minute walk and at-home playtime. shih tzu love to be spoiled with attention and treats; they love to please their pet parents and are usually quick to learn new tricks to get the treats and praise they crave. most of a shih tzu dog’s upkeep deals with their long, luxurious double haircoat that grows continuously. they don’t shed much and are considered a “hypoallergenic” dog breed, though no dog breed is truly hypoallergenic. their coat is also prone to tangling and matting. , this coat is often shaved short into 

In [106]:
type(results)

pinecone.core.openapi.db_data.model.search_records_response.SearchRecordsResponse

In [117]:
type(results.result.hits)

list

In [118]:
results.result.hits[0]

{'_id': 'rec_4274',
 '_score': 0.5192688703536987,
 'fields': {'chunk_text': 'siberian husky training training should start when '
                          'your siberian husky is a puppy to correct unwanted '
                          'behaviors before they become adults. huskies need '
                          'to be leash trained so they don’t run off to '
                          'explore on their own. they also need to be trained '
                          'to avoid excessive barking and howling. huskies can '
                          'be independent, which can make them more difficult '
                          'to train than some other breeds. training your '
                          'husky needs to start as soon as you bring them home '
                          'and be consistent. training with treats before '
                          'mealtimes can motivate your pup to work extra hard. '
                          'as with most puppies, training sessions should be '
  

In [119]:
results

{'result': {'hits': [{'_id': 'rec_4274',
                      '_score': 0.5192688703536987,
                      'fields': {'chunk_text': 'siberian husky training '
                                               'training should start when '
                                               'your siberian husky is a puppy '
                                               'to correct unwanted behaviors '
                                               'before they become adults. '
                                               'huskies need to be leash '
                                               'trained so they don’t run off '
                                               'to explore on their own. they '
                                               'also need to be trained to '
                                               'avoid excessive barking and '
                                               'howling. huskies can be '
                                               'indepe

In [120]:
data_dict.get('dogs')[0]

{'_id': 'rec_0',
 'chunk_text': 'affenpinscher dog breed health and care petmd.com /dog/breeds/affenpinscher small-dog breed known for having a spirited, terrier-like personality and an almost monkey-like face that gives them the nickname the monkey dog. originating in 17th-century germany, the breed was initially developed to hunt rats in homes, stables, and shops, according to the affenpinscher club of america (aca). but over time, the affenpinscher transitioned from a working dog to a beloved companion. physically, affenpinschers are compact dogs that stand 9–12 inches tall and weigh 7–10 pounds. they have a scruffy coat that can be black, gray, silver, or red. affenpinscher dogs are best known for their expressive face, with a short muzzle and dark, round eyes that give them an appearance not unlike a monkey. the affenpinscher personality is just as quirky as their looks—the dogs are bold, confident, and animated. matthew mullin, director of the affenpinscher club of america, descr

In [122]:
# Search the "dogs" namespace again, this time with a metadata filter on "pet"
# and without reranking; three fields are requested back per hit.
dog_disease_query = {
    "inputs": {"text": "Disease prevention"},
    "top_k": 3,
    "filter": {"pet": "Dogs"},
}
filtered_results = index.search(
    namespace="dogs",
    query=dog_disease_query,
    fields=["chunk_text", "word_count", "topic"],
)

print(filtered_results)

{'result': {'hits': [{'_id': 'rec_1418',
                      '_score': 0.2878890335559845,
                      'fields': {'chunk_text': 'dental care the biggest part '
                                               'of their upkeep will be their '
                                               'teeth, which need to be '
                                               'brushed daily to prevent '
                                               'dental disease.',
                                 'topic': 'Dental Care',
                                 'word_count': 22.0}},
                     {'_id': 'rec_659',
                      '_score': 0.254560649394989,
                      'fields': {'chunk_text': 'dental disease as with all '
                                               'small dogs, the bichon frise '
                                               'may develop dental disease. '
                                               'prevention is the best weapon '
                