In [13]:
from sentence_transformers import SentenceTransformer
from pinecone import Pinecone, ServerlessSpec
import os
from dotenv import load_dotenv
import re

In [2]:
# Checkpoint name kept in one place; `model` is the shared encoder used by every
# embedding helper in this notebook.
EMBEDDING_MODEL_NAME = 'all-MiniLM-L6-v2'
model = SentenceTransformer(EMBEDDING_MODEL_NAME)

In [3]:
# Read the resume markdown into memory. The encoding is pinned to UTF-8 because,
# without it, open() falls back to the platform's locale encoding, which can
# mis-decode (or fail on) non-ASCII characters on some systems.
with open('Aneesh_Patne-2.md', 'r', encoding='utf-8') as file:
    markdown_content = file.read()

In [4]:
def segment_markdown(markdown_content):
    """Split markdown text into segments separated by blank lines.

    A blank line is any line that is empty or holds only whitespace, so
    paragraphs separated by lines of stray spaces/tabs, or by Windows (CRLF)
    line endings, are split exactly like ones separated by a bare double
    newline.

    Args:
        markdown_content: The whole markdown document as one string.

    Returns:
        A list of non-empty segments in document order, each stripped of
        leading and trailing whitespace. Empty or whitespace-only input
        yields an empty list.
    """
    # newline + optional whitespace (which may include more newlines or "\r")
    # + newline == one or more blank lines between two paragraphs.
    blocks = re.split(r'\n\s*\n', markdown_content.strip())
    stripped_blocks = (block.strip() for block in blocks)
    return [block for block in stripped_blocks if block]

In [5]:
def generate_embeddings(segments):
    """Encode every segment with the notebook-level ``model`` and pair it with its text.

    Args:
        segments: Sequence of text segments to embed.

    Returns:
        A list with one dict per segment, in input order. Each dict has the
        keys ``"text"`` (the segment itself) and ``"embedding"`` (the matching
        entry of the tensor returned by ``model.encode``).
    """
    # A single batched encode call; convert_to_tensor=True makes the result a tensor.
    encoded = model.encode(segments, convert_to_tensor=True)

    records = []
    for text, vector in zip(segments, encoded):
        records.append({"text": text, "embedding": vector})
    return records

In [6]:
# Break the loaded resume into blank-line-separated segments.
segments = segment_markdown(markdown_content=markdown_content)

In [7]:
# One {"text", "embedding"} record per segment, used for the preview printed below.
embeddings = generate_embeddings(segments=segments)

In [8]:
# Sanity-check preview: show each segment next to the first few values of its vector.
separator = "-" * 80
for record in embeddings:
    segment_text = record["text"]
    embedding_head = record["embedding"][:5]  # only the first 5 values, for brevity
    print("Segment:", segment_text)
    print("Embedding:", embedding_head, "...")
    print(separator)

Segment: ## Aneesh Patne
Embedding: tensor([-0.0851, -0.0109,  0.0758, -0.0146, -0.0183], device='cuda:0') ...
--------------------------------------------------------------------------------
Segment: +91 0000000000 - xxxxxxxx@gmail.com - aneeshpatne.com - Linkedin - LeetCode - GitHub
Embedding: tensor([-0.1158,  0.0272,  0.0062,  0.0147,  0.0203], device='cuda:0') ...
--------------------------------------------------------------------------------
Segment: EDUCATION
Embedding: tensor([ 0.0309,  0.0999, -0.0206,  0.0770, -0.0156], device='cuda:0') ...
--------------------------------------------------------------------------------
Segment: ## Veermata Jijabai Technological Institute
Embedding: tensor([-0.1242,  0.0387,  0.0512, -0.0556, -0.0129], device='cuda:0') ...
--------------------------------------------------------------------------------
Segment: Mumbai, Maharashtra
Embedding: tensor([ 0.0101, -0.0051, -0.0412,  0.0403, -0.0042], device='cuda:0') ...
--------------------------

In [11]:
# Load environment variables via python-dotenv before the Pinecone client is built;
# the client cell reads PINECONE_KEY with os.getenv.
# NOTE(review): presumably the key lives in a local .env file — confirm it is not committed.
load_dotenv()

True

In [12]:
# Pinecone client; the API key comes from the PINECONE_KEY environment variable
# (None is passed through if the variable is unset).
pc = Pinecone(api_key=os.environ.get("PINECONE_KEY"))

In [14]:
# Index settings.
index_name = "rag-resume-data-1"
# Must equal the length of the vectors produced by `model`
# (384 is assumed to match 'all-MiniLM-L6-v2' — TODO confirm if the model changes).
embedding_dim = 384
metric = "cosine"

# Create the serverless index only when no index with this name exists yet,
# so re-running the cell does not attempt a second creation.
existing_index_names = pc.list_indexes().names()
if index_name not in existing_index_names:
    serverless_spec = ServerlessSpec(
        cloud="aws",  # cloud provider hosting the index
        region="us-east-1"  # region within that provider
    )
    pc.create_index(
        name=index_name,
        dimension=embedding_dim,
        metric=metric,
        spec=serverless_spec,
    )
print(f"Index '{index_name}' is ready.")


Index 'rag-resume-data-1' is ready.


In [20]:
# Handle to the index. Reuses `index_name` from the index-creation cell instead of
# repeating the literal, so the handle cannot drift from the index that cell
# checked or created.
index = pc.Index(index_name)

In [23]:
def save_embeddings_to_pinecone(segments, batch_size=100):
    """Embed ``segments`` and upsert them into the notebook-level Pinecone ``index``.

    Each segment is stored under the id ``segment-<position>`` with its
    embedding as the vector values and the original text under the ``"text"``
    metadata key. Because ids are positional, re-running with the same
    segment list overwrites the same records.

    Args:
        segments: List of text segments to embed and store.
        batch_size: Maximum number of vectors sent per upsert request. Sending
            everything in one request works for small documents but can exceed
            Pinecone's per-request limits for larger ones.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # Nothing to embed or send: skip the model and the network call entirely.
    if len(segments) == 0:
        print("Upserted 0 segments to Pinecone.")
        return

    embeddings = model.encode(segments, convert_to_tensor=False)
    upserts = [
        {"id": f"segment-{i}", "values": vector.tolist(), "metadata": {"text": text}}
        for i, (text, vector) in enumerate(zip(segments, embeddings))
    ]

    # Send the records in fixed-size slices so no single request grows unbounded.
    for start in range(0, len(upserts), batch_size):
        index.upsert(vectors=upserts[start:start + batch_size])
    print(f"Upserted {len(upserts)} segments to Pinecone.")

In [24]:
# Embed all resume segments and write them to the Pinecone index.
save_embeddings_to_pinecone(segments=segments)

Upserted 71 segments to Pinecone.


In [25]:
def embeded_input(text):
    """Embed a single piece of text with the notebook-level ``model``.

    Args:
        text: The string to embed (for example, a user query).

    Returns:
        The embedding converted to a plain Python list via ``.tolist()``.
    """
    # Encode as a one-element batch, then take that single result.
    batch = [text]
    encoded_batch = model.encode(batch)
    first_vector = encoded_batch[0]
    return first_vector.tolist()

In [28]:
# Example question to look up in the indexed resume segments.
query = "Does Aneesh Know React?"
# Embed the question with the same model that produced the stored vectors.
query_embedding = embeded_input(text=query)
print(query_embedding)

[-0.013240753673017025, 0.00530521385371685, 0.017008282244205475, 0.023842858150601387, 0.011986448429524899, 0.03599923849105835, 0.12794603407382965, -0.045647088438272476, 0.002487019170075655, 0.037020452320575714, -0.011012700386345387, -0.08413926512002945, -0.011481170542538166, 0.03138140216469765, -0.024311399087309837, 0.0415307953953743, 0.024881917983293533, -0.08573972433805466, -0.039645858108997345, -0.05428287014365196, -0.06969363987445831, 0.012839214876294136, 0.06143064796924591, -0.01825057715177536, -0.07224048674106598, 0.031244225800037384, 0.017644992098212242, 2.598902938188985e-05, 0.010947935283184052, -0.0429825484752655, 0.026075579226017, 0.034756459295749664, 0.0017463566036894917, 0.022537557408213615, -0.03317252919077873, 0.12354178726673126, 0.054611340165138245, 0.03680451959371567, -0.07695917785167694, 0.040380269289016724, 0.010807269252836704, -0.05370583012700081, 0.018135225400328636, -0.04715479910373688, -0.02445712685585022, 0.044794183224

In [29]:
# Nearest-neighbour lookup: ask for the 5 closest stored vectors and include
# each match's metadata so the segment text comes back with the score.
results = index.query(
    top_k=5,
    include_metadata=True,
    vector=query_embedding,
)
results

{'matches': [{'id': 'segment-0',
              'metadata': {'text': '## Aneesh Patne'},
              'score': 0.326395512,
              'values': []},
             {'id': 'segment-37',
              'metadata': {'text': 'EDUCATION'},
              'score': 0.160454094,
              'values': []},
             {'id': 'segment-2',
              'metadata': {'text': 'EDUCATION'},
              'score': 0.160454094,
              'values': []},
             {'id': 'segment-14',
              'metadata': {'text': '( LLM, Fine Tuning ,Huggingface)'},
              'score': 0.155857936,
              'values': []},
             {'id': 'segment-49',
              'metadata': {'text': '( LLM, Fine Tuning ,Huggingface)'},
              'score': 0.155857936,
              'values': []}],
 'namespace': '',
 'usage': {'read_units': 6}}