In [1]:
import chromadb
import json
from sentence_transformers import SentenceTransformer
from chromadb.utils import embedding_functions
import os

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Directory (relative to the notebook's working directory) where the
# Chroma database files are persisted between runs.
persist_directory = "chroma_db"

In [3]:
# Create the persistence directory if it is missing. `exist_ok=True` makes the
# cell idempotent and avoids the check-then-create race of a separate
# `os.path.exists` test.
os.makedirs(persist_directory, exist_ok=True)


In [4]:
# Open an on-disk Chroma client rooted at `persist_directory`.
chroma_client = chromadb.PersistentClient(path=persist_directory)

In [5]:
# Chroma's built-in default embedding function; used both for the collection
# below and for the ad-hoc embedding call later in the notebook.
default_ef = embedding_functions.DefaultEmbeddingFunction()

In [6]:
# Fetch the "resume_data" collection, creating it on first run, and bind it to
# the default embedding function so documents and queries are embedded alike.
collection = chroma_client.get_or_create_collection(name="resume_data", embedding_function=default_ef)


In [7]:
# Load the resume from JSON. The encoding is pinned to UTF-8 so that non-ASCII
# characters decode the same way on every platform instead of depending on the
# locale's default encoding.
with open("data.json", "r", encoding="utf-8") as f:
    resume_data = json.load(f)

In [9]:
def chunkMaker(data, ongoing_marker="2025"):
    """Turn a parsed resume dictionary into a list of text chunks for embedding.

    Every section is read from ``data`` (the argument), never from a
    notebook-level global, so the function can be reused on any resume dict
    and works on a fresh kernel.

    Args:
        data: Resume dictionary. The keys read are ``summary``, ``contact``
            (``name``, ``github``, ``email``, ``linkedin``, ``leetcode``),
            ``education`` (list of dicts with ``institution``, ``degree``,
            ``location``, ``duration``, ``details``), ``skills``
            (``technical_skills`` and ``soft_skills`` lists), ``projects``
            (list of dicts with ``title``, ``duration``, ``description``,
            ``skills``, ``links``) and ``certifications`` (list of dicts with
            ``title``, ``issuing_organization``, ``date_awarded``).
        ongoing_marker: Substring that, when present in an education entry's
            ``duration``, makes the sentence read "am pursuing" instead of
            "completed". Defaults to ``"2025"``, the previously hard-coded
            value.

    Returns:
        A list of dicts, each with ``id``, ``content`` and ``metadata`` keys,
        in the order: summary, contact, education entries, technical skills,
        soft skills, projects, certifications.

    Raises:
        KeyError: If any of the keys listed above is missing.

    Note:
        Education, project and certification ids are derived from the
        institution/title (lower-cased, spaces replaced by underscores), so
        two entries with the same name produce the same id.
    """
    contact = data["contact"]

    chunks = [
        {
            "id": "summary",
            "content": data["summary"],
            "metadata": {
                "section": "summary",
                "title": "Summary",
            },
        },
        {
            "id": "contact",
            # Adjacent f-strings are concatenated; the leading space of the
            # original sentence is kept on purpose.
            "content": (
                f" I am {contact['name']}, my github id is {contact['github']}"
                f" and my email is {contact['email']}, my linkedin is {contact['linkedin']}"
                f" my leetcode where i practice my Data structures and algorithms"
                f" is {contact['leetcode']}"
            ),
            "metadata": {
                "section": "contact",
            },
        },
    ]

    for edu in data["education"]:
        key = edu["institution"].lower().replace(" ", "_")
        status = "am pursuing" if ongoing_marker in edu["duration"] else "completed"
        chunks.append({
            "id": f"education_{key}",
            "content": (
                f"I {status} {edu['degree']} from {edu['institution']}"
                f" in {edu['location']} ({edu['duration']}). {edu['details']}"
            ),
            "metadata": {
                "section": "education",
                "title": edu["institution"],
            },
        })

    # Joins are done outside the f-strings: reusing the enclosing quote
    # character inside an f-string expression is a SyntaxError before
    # Python 3.12.
    skills = data["skills"]
    technical = ", ".join(skills["technical_skills"])
    chunks.append({
        "id": "skills_technical",
        "content": f"my technical skills include {technical}",
        "metadata": {
            "section": "skills",
            "title": "Technical Skills",
        },
    })
    soft = ", ".join(skills["soft_skills"])
    chunks.append({
        "id": "skills_soft",
        "content": f"my soft skills include {soft}",
        "metadata": {
            "section": "skills",
            "title": "soft Skills",
        },
    })

    for project in data["projects"]:
        key = project["title"].lower().replace(" ", "_")
        description = ", ".join(project["description"])
        project_skills = ", ".join(project["skills"])
        links = ", ".join(project["links"])
        chunks.append({
            "id": f"project_{key}",
            "content": (
                f"Project {project['title']} ({project['duration']})."
                f" {description} skills: {project_skills} links: {links}"
            ),
            "metadata": {
                "section": "projects",
                "title": project["title"],
            },
        })

    for cert in data["certifications"]:
        cert_key = cert["title"].lower().replace(" ", "_")
        chunks.append({
            "id": f"certification_{cert_key}",  # e.g., "certification_machine_learning_specialist"
            "content": (
                f"I earned {cert['title']} certification from"
                f" {cert['issuing_organization']} in {cert['date_awarded']}"
            ),
            "metadata": {
                "section": "certifications",
                "title": cert["title"],
            },
        })

    return chunks
        

In [12]:
# Build the list of {"id", "content", "metadata"} chunk dicts from the loaded resume.
chunks = chunkMaker(resume_data)

In [10]:
# Embed a single throwaway string with the default embedding function.
# NOTE(review): `val` is not used in any visible cell — this looks like a
# leftover smoke test; confirm before keeping it in the final notebook.
val = default_ef(["foo"])

In [14]:
# Split the chunk dicts into three parallel lists, all in the same order as
# `chunks`: one of ids, one of document texts, one of metadata dicts.
ids = [record["id"] for record in chunks]
documents = [record["content"] for record in chunks]
metadatas = [record["metadata"] for record in chunks]

In [15]:
# Write the chunks into the collection with `upsert`, passing the three
# parallel lists built above.
collection.upsert(ids=ids, documents=documents, metadatas=metadatas)

In [16]:
# Retrieval sanity check: ask for the 4 nearest chunks to an education question.
collection.query(query_texts="What is Aneesh Patne Education", n_results=4)

{'ids': [['summary',
   'contact',
   'certification_full_stack_web_development',
   'project_t.rex:_electronics_circuits_game']],
 'embeddings': None,
 'documents': [['A dedicated Electronics and Telecommunication professional with a strong background in machine learning, web development, and cloud platforms. Proven ability to lead projects, solve complex problems, and continuously learn new technologies to drive innovation and efficiency.',
   ' I am Aneesh Patne, my github id is https://github.com/aneeshpatne and my email is aneeshpatne12@gmail.com, my linkedin is https://www.linkedin.com/in/aneeshpatne my leetcode where i practice my Data structures and algorithms is https://leetcode.com/aneeshpatne',
   'I earned Full Stack Web Development certification from Udemy in April 2022',
   "Project T.rex: Electronics Circuits Game (Nov 2022 - Dec 2022). - A basic HTML/JS game intended to test the basic knowledge of the player regarding the basics of FET., - During the teaching-learning p