In [1]:
from chromadb import Client, Settings as ChromaSettings
from llama_index.core import Document, Settings
from llama_index.core import VectorStoreIndex, StorageContext
from llama_index.core.node_parser import JSONNodeParser
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.vector_stores.chroma import ChromaVectorStore


# from chromadb import Client, Settings as ChromaSettings
# from llama_index.core import Document, Settings
# from llama_index.core import VectorStoreIndex, StorageContext
# from llama_index.core.node_parser import JSONNodeParser
# from llama_index.embeddings.openai import OpenAIEmbedding
# from llama_index.vector_stores.chroma import ChromaVectorStore
# from typing import List, Dict
# import json
# import pandas as pd

from typing import List, Dict
import json
import pandas as pd

from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.readers.json import JSONReader
from typing import List, Dict
import json
import numpy as np
import pandas as pd


In [2]:
# READ: load one evaluated resume (a JSON file) through JSONReader.
RESUME_JSON_PATH = "../resumes/processed/evaluations_JSON/Antonio Quinonez - Resume.json"
reader = JSONReader()
document = reader.load_data(input_file=RESUME_JSON_PATH)

# PARSE (currently disabled): turn the loaded document(s) into nodes.
# parser = JSONNodeParser.from_defaults()
# nodes = parser.get_nodes_from_documents(document)

In [3]:

class EntitySkillsProcessor:
    """Index per-entity skill records in Chroma and search them by embedding similarity.

    Every skill of an entity is stored as its own ``Document`` whose text and
    metadata both carry the entity name, so search hits can be traced back to,
    and filtered by, the entity they belong to.
    """

    # Column order of the frame returned by search_skills. Passing it to
    # pd.DataFrame keeps the schema intact even when no hit survives the cutoff,
    # so callers can always select these columns.
    RESULT_COLUMNS = [
        'entity_name',
        'skill_name',
        'type',
        'source_details',
        'labels',
        'similarity_score',
        'content',
    ]

    # Number of nearest neighbours fetched before the similarity cutoff is applied.
    SEARCH_TOP_K = 10

    def __init__(self, persist_dir: str = "./entity_skills_db"):
        """Connect to Chroma, open the ``entity_skills`` collection and set the embed model.

        Args:
            persist_dir: Directory handed to Chroma as ``persist_directory``.

        Side effects:
            Replaces the global ``Settings.embed_model`` with a new ``OpenAIEmbedding``.
        """
        # NOTE(review): recent chromadb releases appear to keep Client(Settings(...))
        # in memory unless is_persistent=True is also set (or PersistentClient is
        # used), in which case persist_dir has no effect - confirm against the
        # installed chromadb before relying on data surviving a kernel restart.
        chroma_settings = ChromaSettings(
            persist_directory=persist_dir,
            anonymized_telemetry=False
        )
        self.chroma_client = Client(chroma_settings)
        self.skills_collection = self._get_or_create_collection("entity_skills")
        self.vector_store = ChromaVectorStore(chroma_collection=self.skills_collection)

        # Initialize LlamaIndex settings with OpenAI embeddings
        self.embed_model = OpenAIEmbedding()
        Settings.embed_model = self.embed_model

    def _get_or_create_collection(self, name: str):
        """Return the Chroma collection called ``name``, creating it when absent."""
        # One call instead of create-then-catch: the previous fallback only worked
        # if "already exists" surfaced as ValueError, which this code should not
        # have to depend on.
        return self.chroma_client.get_or_create_collection(
            name=name,
            metadata={"hnsw:space": "cosine"},  # Specify distance metric
        )

    def _build_skill_documents(self, entity_json: Dict) -> list:
        """Convert one entity's skills JSON into a list of ``Document`` objects.

        Missing or ``None`` values for ``skills_df``, its ``value`` list, or a
        skill's ``labels`` are treated as empty rather than raising.
        """
        entity_name = entity_json.get('entity_name', 'Unknown Entity')
        skills = (entity_json.get('skills_df') or {}).get('value') or []

        documents = []
        for idx, skill_info in enumerate(skills):
            skill_name = skill_info.get('skill', '')
            skill_type = skill_info.get('type', '')
            source_details = skill_info.get('source_details', '')
            labels = ', '.join(map(str, skill_info.get('labels') or []))

            # Joined line by line so the embedded text does not carry the source
            # file's indentation (an indented triple-quoted literal would).
            content = "\n".join([
                f"Entity: {entity_name}",
                f"Skill: {skill_name}",
                f"Type: {skill_type}",
                f"Evaluation: {skill_info.get('eval', '')}",
                f"Source Details: {source_details}",
                f"Labels: {labels}",
            ]).strip()

            documents.append(Document(
                text=content,
                metadata={
                    'entity_name': entity_name,
                    'skill_name': skill_name,
                    'type': skill_type,
                    'source_details': source_details,
                    'labels': labels,
                    'doc_id': f"{entity_name}-{idx}"  # position-based identifier within the entity
                }
            ))
        return documents

    def process_entity_skills(self, entity_json: Dict):
        """Process skills JSON maintaining entity relationship.

        Args:
            entity_json: Mapping with an optional ``entity_name`` and a
                ``skills_df`` mapping whose ``value`` is a list of skill dicts
                (keys read: ``skill``, ``type``, ``eval``, ``source_details``,
                ``labels``).

        Returns:
            The ``VectorStoreIndex`` built from this entity's documents on top of
            the shared Chroma vector store.
        """
        enhanced_docs = self._build_skill_documents(entity_json)

        # Create index from enhanced documents
        storage_context = StorageContext.from_defaults(vector_store=self.vector_store)
        index = VectorStoreIndex.from_documents(
            enhanced_docs,
            storage_context=storage_context,
            show_progress=True
        )

        return index

    def search_skills(self, 
                     query_text: str, 
                     entity_name: str = None,
                     similarity_cutoff: float = 0.7):
        """Search for skills with optional entity filter.

        Args:
            query_text: Free-text query that is embedded and matched against the
                stored skill documents.
            entity_name: When given, only documents whose ``entity_name``
                metadata equals this value are considered.
            similarity_cutoff: Hits scoring below this value are dropped.

        Returns:
            ``pd.DataFrame`` with the columns listed in ``RESULT_COLUMNS``; it has
            zero rows (but the same columns) when nothing passes the cutoff.
        """
        # top-k and filters are retriever settings; tucked inside
        # vector_store_kwargs they were never applied as such.
        retriever_kwargs = {"similarity_top_k": self.SEARCH_TOP_K}

        if entity_name:
            # Imported here so the block is self-contained; only the filtered
            # path needs these names.
            from llama_index.core.vector_stores import MetadataFilter, MetadataFilters

            retriever_kwargs["filters"] = MetadataFilters(
                filters=[MetadataFilter(key="entity_name", value=entity_name)]
            )

        # A retriever is enough: only the scored nodes are used, so there is no
        # reason to build a query engine and pay for LLM answer synthesis.
        # NOTE(review): the "Expected where to have exactly one operator, got {}"
        # error seen when running this notebook looks like an empty `where`
        # reaching chromadb for unfiltered queries - if it persists, check the
        # installed llama-index Chroma integration / chromadb versions.
        index = VectorStoreIndex.from_vector_store(self.vector_store)
        retriever = index.as_retriever(**retriever_kwargs)
        hits = retriever.retrieve(query_text)

        rows = [
            {
                'entity_name': hit.metadata.get('entity_name', ''),
                'skill_name': hit.metadata.get('skill_name', ''),
                'type': hit.metadata.get('type', ''),
                'source_details': hit.metadata.get('source_details', ''),
                'labels': hit.metadata.get('labels', ''),
                'similarity_score': hit.score,
                'content': hit.text,
            }
            for hit in hits
            # A hit without a score cannot be compared against the cutoff.
            if hit.score is not None and hit.score >= similarity_cutoff
        ]

        return pd.DataFrame(rows, columns=self.RESULT_COLUMNS)

    def find_entities_with_skill(self, skill_query: str, similarity_cutoff: float = 0.7):
        """Find all entities that have a particular skill.

        Returns:
            ``pd.DataFrame`` with ``entity_name``, ``skill_name`` and
            ``similarity_score``, best match first (empty when nothing matches).
        """
        matches = self.search_skills(skill_query, similarity_cutoff=similarity_cutoff)
        ranked = matches[['entity_name', 'skill_name', 'similarity_score']]
        return ranked.sort_values('similarity_score', ascending=False)

In [4]:
# Import required libraries
import json
from typing import Dict, List

# Sample data for multiple entities.
# Each record has the shape EntitySkillsProcessor.process_entity_skills reads:
# {"entity_name": ..., "skills_df": {"value": [<skill dict>, ...]}}
_SKILL_FIELDS = ("skill", "type", "eval", "source_details", "labels")

# entity name -> one tuple per skill, ordered like _SKILL_FIELDS
_SAMPLE_SKILL_ROWS = {
    "Alice Chen": [
        ("Python", "Programming",
         "Expert level with 8 years experience in data science and ML applications",
         "Multiple projects and GitHub repositories",
         ["verified", "expert"]),
        ("TensorFlow", "Machine Learning",
         "Led multiple deep learning projects, created custom architectures",
         "AI research papers and production deployments",
         ["verified", "advanced"]),
        ("AWS", "Cloud",
         "Certified Solutions Architect, extensive experience with ML deployments",
         "AWS certification and production experience",
         ["certified", "advanced"]),
    ],
    "Bob Martinez": [
        ("Python", "Programming",
         "Intermediate level, mainly web development focus",
         "Web application projects",
         ["verified", "intermediate"]),
        ("React", "Frontend",
         "Expert level, built multiple production applications",
         "Portfolio of web applications",
         ["verified", "expert"]),
        ("AWS", "Cloud",
         "Basic experience with EC2 and S3",
         "Personal projects",
         ["beginner"]),
    ],
    "Carol Wong": [
        ("Machine Learning", "AI",
         "PhD research focused on reinforcement learning",
         "Published papers and GitHub projects",
         ["verified", "expert", "research"]),
        ("Python", "Programming",
         "Advanced level, focus on scientific computing and ML",
         "Research implementations and teaching",
         ["verified", "advanced"]),
        ("PyTorch", "Machine Learning",
         "Core contributor, deep expertise in custom implementations",
         "Open source contributions",
         ["verified", "expert"]),
    ],
}

entities_data = [
    {
        "entity_name": name,
        "skills_df": {
            "value": [dict(zip(_SKILL_FIELDS, row)) for row in skill_rows]
        },
    }
    for name, skill_rows in _SAMPLE_SKILL_ROWS.items()
]

def _print_matches(matches, columns=None):
    """Print a result frame (optionally a column subset), or a placeholder when it has no rows."""
    # Checked first: selecting columns on a frame without rows may raise
    # KeyError when the frame was built from an empty list and has no columns.
    if matches.empty:
        print("(no matches above the similarity cutoff)")
        return
    shown = matches if columns is None else matches[columns]
    print(shown.to_string())


def demonstrate_skills_analysis(processor=None, entities=None):
    """Ingest sample entities and print five example skill searches.

    Args:
        processor: Object exposing ``process_entity_skills``, ``search_skills``
            and ``find_entities_with_skill``. Defaults to a new
            ``EntitySkillsProcessor()``.
        entities: Iterable of entity records to ingest. Defaults to the
            module-level ``entities_data``.
    """
    # Initialize the processor
    if processor is None:
        processor = EntitySkillsProcessor()
    if entities is None:
        entities = entities_data

    # Process each entity's skills
    for entity_data in entities:
        processor.process_entity_skills(entity_data)

    print("Demonstration of skill searching and analysis:")

    # Example 1: Search for Python skills across all entities
    print("\n1. Finding all Python developers:")
    python_skills = processor.search_skills("Python", similarity_cutoff=0.7)
    _print_matches(python_skills, ['entity_name', 'skill_name', 'similarity_score', 'content'])

    # Example 2: Search for machine learning experts
    print("\n2. Finding machine learning experts:")
    ml_experts = processor.search_skills(
        "machine learning artificial intelligence deep learning", similarity_cutoff=0.7
    )
    _print_matches(ml_experts, ['entity_name', 'skill_name', 'similarity_score'])

    # Example 3: Search for cloud computing skills
    print("\n3. Finding cloud computing expertise:")
    cloud_experts = processor.find_entities_with_skill("AWS cloud computing")
    _print_matches(cloud_experts)

    # Example 4: Search for frontend development skills
    print("\n4. Finding frontend developers:")
    frontend_devs = processor.search_skills("frontend development React", similarity_cutoff=0.7)
    _print_matches(frontend_devs, ['entity_name', 'skill_name', 'similarity_score'])

    # Example 5: Search for specific entity's skills
    print("\n5. Finding Carol's machine learning skills specifically:")
    carol_ml = processor.search_skills("machine learning", entity_name="Carol Wong")
    _print_matches(carol_ml, ['skill_name', 'similarity_score', 'content'])



In [5]:

# Run the demo end to end: ingest the sample entities, then print the five example searches.
# NOTE(review): EntitySkillsProcessor builds an OpenAIEmbedding, so this presumably
# needs OpenAI credentials configured in the environment - confirm before running.
demonstrate_skills_analysis()

Demonstration of skill searching and analysis:

1. Finding all Python developers:


ValueError: Expected where to have exactly one operator, got {} in query.