In [1]:
from pinecone import Pinecone, ServerlessSpec
from sentence_transformers import SentenceTransformer
import json
from dotenv import load_dotenv
import os


  from tqdm.autonotebook import tqdm


In [None]:
# Load environment variables via python-dotenv before any os.getenv() lookups;
# presumably this is where PINECONE_KEY comes from — confirm against the local .env file.
load_dotenv()

In [3]:
# Create the Pinecone client; the API key is read from the PINECONE_KEY environment variable.
pc = Pinecone(api_key=os.getenv("PINECONE_KEY"))

In [4]:
# Load the source JSON document that is flattened and embedded further down.
# The encoding is pinned to UTF-8 so non-ASCII text decodes the same way on every
# platform instead of depending on the locale's default encoding.
with open("data.json", "r", encoding="utf-8") as file:
    data = json.load(file)


In [5]:
def collect_data(data, key_filter=None, prefix=""):
    """Flatten nested JSON-like data into a list of ``(path, text)`` tuples.

    Dicts and lists are walked depth-first in their natural order. Only string
    values are collected; numbers, booleans and ``None`` are ignored.

    Paths are built with ``.`` between dict keys and ``[i]`` for list positions,
    e.g. ``projects[0].tags[1]``.

    Args:
        data: A dict, list or string (typically the result of ``json.load``).
        key_filter: Optional container of dict keys. When given, a string is
            kept only if the dict key it sits under (directly, or through one
            or more nested lists) is in ``key_filter``. Dicts are always
            descended into, so matching keys are found at any depth. ``None``
            keeps every string.
        prefix: Path prefix prepended to every collected path.

    Returns:
        A list of ``(path, text)`` tuples in traversal order.
    """
    collected = []

    def _walk(node, path, key_allowed):
        # key_allowed says whether the nearest enclosing dict key passed the filter.
        # Lists inherit it from their owning key, so strings inside a list are
        # filtered exactly like a string stored directly under that key.
        if isinstance(node, str):
            if key_allowed:
                collected.append((path, node))
        elif isinstance(node, dict):
            for key, value in node.items():
                child_path = f"{path}.{key}" if path else key
                _walk(value, child_path, key_filter is None or key in key_filter)
        elif isinstance(node, list):
            for position, item in enumerate(node):
                _walk(item, f"{path}[{position}]", key_allowed)

    # Values with no enclosing dict key (a top-level string or list) are always kept.
    _walk(data, prefix, True)
    return collected
# Flatten the whole document; no key_filter is passed, so every string value is kept.
collected_data = collect_data(data)


In [6]:
# Load the sentence-transformers model used to embed both the document texts and the queries.
model = SentenceTransformer('all-MiniLM-L6-v2')

In [29]:
# Keep only the text half of each (path, text) pair; these are the strings to embed.
texts = [text for _path, text in collected_data]
texts

['A dedicated Electronics and Telecommunication professional with a strong background in machine learning, web development, and cloud platforms. Proven ability to lead projects, solve complex problems, and continuously learn new technologies to drive innovation and efficiency.',
 'Aneesh Patne',
 'aneeshpatne12@gmail.com',
 'https://www.linkedin.com/in/aneeshpatne',
 'https://github.com/aneeshpatne',
 'https://leetcode.com/aneeshpatne',
 'M.Tech in Electronics and Telecommunication',
 'Veermata Jijabai Technological Institute',
 'Mumbai, Maharashtra',
 '2023 - 2025',
 'Specialized in Machine Learning and Signal Processing. Relevant coursework includes Advanced Algorithms, Neural Networks, and Communication Systems.',
 'B.Tech in Electronics and Telecommunication',
 'Thakur College of Engineering and Technology',
 'Mumbai, Maharashtra',
 '2019 - 2023',
 'Graduated with First Class Honors. Engaged in multiple projects focusing on embedded systems and wireless communication.',
 'Python',


In [30]:
# Encode every text in one call; the result is consumed as one embedding per entry of `texts`.
embeddings = model.encode(texts)

In [31]:
# Pair each flattened path and text with its embedding, converted to a plain Python list.
embedded_data = [
    {"prefix": path_and_text[0], "text": text, "embedding": vector.tolist()}
    for path_and_text, text, vector in zip(collected_data, texts, embeddings)
]

Save to Pinecone

In [34]:
# Re-load the environment file; redundant when the load_dotenv() cell near the top has already run.
load_dotenv()

True

In [37]:
# Re-create the Pinecone client from the PINECONE_KEY environment variable
# (same construction as the client cell near the top of the notebook).
pc = Pinecone(
    api_key=os.getenv("PINECONE_KEY")
)

In [38]:
# Target index settings. NOTE(review): 384 is assumed to be the output size of the
# embedding model used in this notebook — confirm if the model changes.
index_name = "rag-resume-data"
embedding_dim = 384
metric = "cosine"

# Create the index only when no index with this name exists yet.
existing_index_names = pc.list_indexes().names()
if index_name not in existing_index_names:
    serverless_spec = ServerlessSpec(cloud="aws", region="us-east-1")
    pc.create_index(
        name=index_name,
        dimension=embedding_dim,
        metric=metric,
        spec=serverless_spec,
    )
print(f"Index '{index_name}' is ready.")


Index 'rag-resume-data' is ready.


In [7]:
# Display the indexes visible to this client as the cell output.
pc.list_indexes()

{'indexes': [{'deletion_protection': 'disabled',
              'dimension': 384,
              'host': 'rag-resume-data-icsq7e1.svc.aped-4627-b74a.pinecone.io',
              'metric': 'cosine',
              'name': 'rag-resume-data',
              'spec': {'serverless': {'cloud': 'aws', 'region': 'us-east-1'}},
              'status': {'ready': True, 'state': 'Ready'}}]}

In [44]:
# Turn each embedded record into an upsert record: a positional id ("doc-<i>"),
# the embedding values, and the original path/text as metadata.
data_to_upsert = [
    dict(
        id=f"doc-{position}",
        values=record["embedding"],
        metadata=dict(prefix=record["prefix"], text=record["text"]),
    )
    for position, record in enumerate(embedded_data)
]

In [11]:
# Fetch the index listing again and print it to confirm the target index exists.
indexes = pc.list_indexes()
print(indexes)

{'indexes': [{'deletion_protection': 'disabled',
              'dimension': 384,
              'host': 'rag-resume-data-icsq7e1.svc.aped-4627-b74a.pinecone.io',
              'metric': 'cosine',
              'name': 'rag-resume-data',
              'spec': {'serverless': {'cloud': 'aws', 'region': 'us-east-1'}},
              'status': {'ready': True, 'state': 'Ready'}}]}


In [13]:
# Get a handle to the index; the literal name matches `index_name` from the index-creation cell.
index = pc.Index("rag-resume-data")

In [None]:
# Write all records to the index in a single upsert call. Ids are positional ("doc-<i>"),
# so re-running this cell reuses the same ids.
index.upsert(data_to_upsert)

In [15]:
# Print the index statistics to check the stored vector count after the upsert.
stats = index.describe_index_stats()
print(stats)

{'dimension': 384,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 69}},
 'total_vector_count': 69}


In [17]:
def embeded_input(text):
    """Embed a single string and return the vector as a plain Python list.

    The text is encoded as a one-element batch with the notebook-level ``model``;
    the only row of the result is then converted with ``.tolist()``.
    """
    batch = model.encode([text])
    first_row = batch[0]
    return first_row.tolist()

In [26]:
# Embed a sample question with the same model used for the document texts,
# then print the raw vector for inspection.
query = "Does Aneesh Know dance?"
query_embedding = embeded_input(query)
print(query_embedding)

[-0.010107128880918026, -0.04968219995498657, -0.039474911987781525, -0.017603961750864983, -0.1250545084476471, 0.015618768520653248, 0.1096700131893158, -0.14793065190315247, -0.018482867628335953, 0.03703707456588745, -0.02054527960717678, -0.003957622684538364, 0.013477451168000698, -0.04549214616417885, -0.006243489682674408, 0.036690741777420044, -0.010237528942525387, 0.010079842992126942, 0.003939739894121885, -0.054095834493637085, -0.11581249535083771, -0.04581153020262718, 0.0470491386950016, 0.017357056960463524, -0.012152349576354027, 0.007284059189260006, 0.029166018590331078, -0.009236005134880543, -0.007039881311357021, -0.04266420379281044, 0.0022258078679442406, 0.07909633964300156, 0.010137906298041344, 0.05628261715173721, -0.011744050309062004, 0.07038737088441849, 0.07474038004875183, 0.010937669314444065, -0.10540402680635452, 0.10816764086484909, -0.010990804992616177, -0.002360441256314516, 0.02849445678293705, -0.021888459101319313, 0.013698265887796879, 0.045

In [27]:
# Ask the index for the 5 closest matches to the query embedding, including stored metadata.
results = index.query(vector=query_embedding, top_k=5, include_metadata=True)
results

{'matches': [{'id': 'doc-1',
              'metadata': {'prefix': 'contact.name', 'text': 'Aneesh Patne'},
              'score': 0.428705513,
              'values': []},
             {'id': 'doc-5',
              'metadata': {'prefix': 'contact.leetcode',
                           'text': 'https://leetcode.com/aneeshpatne'},
              'score': 0.378607,
              'values': []},
             {'id': 'doc-4',
              'metadata': {'prefix': 'contact.github',
                           'text': 'https://github.com/aneeshpatne'},
              'score': 0.338567346,
              'values': []},
             {'id': 'doc-2',
              'metadata': {'prefix': 'contact.email',
                           'text': 'aneeshpatne12@gmail.com'},
              'score': 0.265222788,
              'values': []},
             {'id': 'doc-3',
              'metadata': {'prefix': 'contact.linkedin',
                           'text': 'https://www.linkedin.com/in/aneeshpatne'},
             