In [1]:
import pandas as pd

def calculate_scores(df):
    """
    Convert priority flag columns to numeric scores and add a per-row total.

    A column is treated as a priority column when its label is a string that
    starts with one of the priority prefixes. Every other column (including
    columns whose labels are not strings) is copied through unchanged.

    Rules:
    - 'Critical' prefix & True value = 20
    - 'Required' prefix & True value = 10
    - 'Preferred' prefix & True value = 7
    - 'Optional' prefix & True value = 3
    - Any value that is not equal to True (False, None, NaN, pd.NA, ...) = 0
    - All other columns preserved as-is
    - 'total_priority_score' column added with sum of priority scores

    Parameters:
    df (pandas.DataFrame): Input DataFrame with boolean columns. The input is
        not modified.

    Returns:
    pandas.DataFrame: Copy of ``df`` in which each priority column holds
        integer scores, plus an integer 'total_priority_score' column (all
        zeros when ``df`` has no priority columns).
    """
    # Score awarded for a True flag, keyed by column-name prefix.
    prefix_scores = {'Critical': 20, 'Required': 10, 'Preferred': 7, 'Optional': 3}

    # Work on a copy so the caller's frame is left untouched.
    result_df = df.copy()

    # Columns that were scored; only these contribute to the total.
    priority_columns = []

    for col in df.columns:
        # Non-string labels (e.g. integer column names) cannot carry a prefix,
        # and calling .startswith on them would raise.
        if not isinstance(col, str):
            continue

        score = next(
            (points for prefix, points in prefix_scores.items() if col.startswith(prefix)),
            None,
        )
        if score is None:
            continue

        # Build a strict boolean mask: missing values compare unequal to True
        # (or become NA for nullable dtypes, which fillna then turns to False),
        # so they score 0 instead of breaking boolean indexing.
        is_true = df[col].eq(True).fillna(False).astype(bool)

        result_df[col] = is_true.astype('int64') * score
        priority_columns.append(col)

    # Summing zero columns yields floats; cast so the total is always integer.
    result_df['total_priority_score'] = (
        result_df[priority_columns].sum(axis=1).astype('int64')
    )

    return result_df

In [None]:
# %%
import chromadb
from chromadb.utils.embedding_functions import OpenAIEmbeddingFunction
import os
from dotenv import load_dotenv
import pandas as pd
import json

def create_skills_matrix(skills_dict, distance_threshold=.8, n_results=1000):
    """
    Create a DataFrame showing which entities possess which skills based on ChromaDB similarity search.
    Skills are categorized by priority level and reflected in column names.

    Each distinct skill text is queried once against the persistent
    "entity_skills" collection stored at "../entity_skills_db". An entity is
    marked True for a skill when at least one of its returned records has a
    distance strictly below ``distance_threshold``. A skill listed under more
    than one priority level fills one column per level.

    Args:
        skills_dict (dict): Dictionary of skills categorized by priority level
                           Format: {
                               'Critical': ['skill1', 'skill2'],
                               'Required': ['skill3', 'skill4'],
                               'Preferred': ['skill5', 'skill6'],
                               'Optional': ['skill7', 'skill8']
                           }
        distance_threshold (float): Maximum distance to consider a skill match (default: .8)
        n_results (int): Maximum number of records requested per skill query
                         (default: 1000). Records beyond this limit are not
                         seen, so raise it for larger collections.

    Returns:
        pandas.DataFrame: One row per entity seen in any query result, with an
        'entity_id' column followed by one boolean '<priority>_<skill>' column
        per requested skill, in the order given by ``skills_dict``. When no
        entity is returned the frame is empty but still has all columns.

    Raises:
        Exception: Any error raised while opening or querying the collection
        is printed and then re-raised unchanged.
    """
    # Load environment variables (OPENAI_API_KEY is read below)
    load_dotenv()

    # Initialize ChromaDB client with persistence
    client = chromadb.PersistentClient(path="../entity_skills_db")

    # Initialize the OpenAI embedding function
    embedding_function = OpenAIEmbeddingFunction(
        api_key=os.getenv("OPENAI_API_KEY"),
        model_name="text-embedding-3-large"
    )

    # Map each distinct skill text to every prefixed column it feeds. Keeping a
    # list per skill (instead of a single skill -> priority mapping) means a
    # skill listed under several priority levels updates all of its columns.
    columns_by_skill = {}
    for priority, skills in skills_dict.items():
        for skill in skills:
            column = f"{priority}_{skill}"
            targets = columns_by_skill.setdefault(skill, [])
            if column not in targets:
                targets.append(column)

    # Output column order follows skills_dict; dict.fromkeys drops repeats.
    skill_columns = list(dict.fromkeys(
        f"{priority}_{skill}"
        for priority, skills in skills_dict.items()
        for skill in skills
    ))

    try:
        # Get existing collection
        collection = client.get_collection(
            name="entity_skills",
            embedding_function=embedding_function
        )

        # entity name -> {column: bool}; insertion order becomes row order
        entity_flags = {}

        # Query each distinct skill once
        for skill, columns in columns_by_skill.items():
            results = collection.query(
                query_texts=[skill],
                n_results=n_results,
                include=["metadatas", "distances"]
            )

            # A single query text was sent, so its hits are at position 0
            for entity_metadata, distance in zip(
                results['metadatas'][0],
                results['distances'][0]
            ):
                # NOTE(review): assumes a record may come back without
                # metadata; such records are grouped under a None entity name.
                entity_id = (entity_metadata or {}).get('entity_name')

                # First sighting of an entity: start with every skill False
                if entity_id not in entity_flags:
                    entity_flags[entity_id] = dict.fromkeys(skill_columns, False)

                # Mark skill as True if distance is below threshold
                if distance < distance_threshold:
                    for column in columns:
                        entity_flags[entity_id][column] = True

        # Passing the column list explicitly keeps the schema stable even when
        # no entity was returned at all.
        records = [
            {'entity_id': entity_id, **flags}
            for entity_id, flags in entity_flags.items()
        ]
        return pd.DataFrame(records, columns=['entity_id'] + skill_columns)

    except Exception as e:
        print(f"Error accessing collection: {str(e)}")
        raise

# Example usage: build the boolean skills matrix and report match rates
if __name__ == "__main__":
    # Skills to look up, grouped by priority level
    skills_to_query = dict(
        Critical=['Python', 'Data Analysis'],
        Required=['Amazon Web Services', 'Machine Learning', 'PyTorch'],
        Preferred=['Docker', 'SQL', 'SQL Server', 'PostgreSQL'],
        Optional=['Kubernetes', 'React', 'GCP'],
    )

    # One row per entity, one boolean column per "<priority>_<skill>"
    skills_df = create_skills_matrix(skills_to_query)

    # For every skill, show how many entities matched and their share of all rows
    print("\nSkill Distribution:")
    for priority in skills_to_query:
        print(f"\n{priority} Skills:")
        for skill in skills_to_query[priority]:
            total = len(skills_df)
            # Summing a boolean column counts its True entries
            count = skills_df[f"{priority}_{skill}"].sum()
            percentage = (count / total) * 100
            print(f"{skill}: {count} entities ({percentage:.1f}%)")



In [None]:
# Show the boolean skills matrix built by the example cell above as this cell's output.
skills_df

In [None]:
# Turn the boolean skills matrix into weighted scores. calculate_scores works on
# a copy, so skills_df keeps its original True/False values.
scored_df = calculate_scores(skills_df)
# Display the scored frame (includes the 'total_priority_score' column).
scored_df

In [None]:
import chromadb
from chromadb.utils.embedding_functions import OpenAIEmbeddingFunction
import os
from dotenv import load_dotenv
import pandas as pd
import json

def create_skills_matrix_with_distances(skills_dict, n_results=1000):
    """
    Create a DataFrame showing the best (smallest) distance scores between entities and skills
    based on ChromaDB similarity search.

    Each distinct skill text is queried once against the persistent
    "entity_skills" collection stored at "../entity_skills_db". For every
    entity the smallest returned distance per skill is kept, rounded to two
    decimals. A skill listed under more than one priority level fills one
    column per level.

    Args:
        skills_dict (dict): Dictionary of skills categorized by priority level
                           Format: {
                               'Critical': ['skill1', 'skill2'],
                               'Required': ['skill3', 'skill4'],
                               'Preferred': ['skill5', 'skill6'],
                               'Optional': ['skill7', 'skill8']
                           }
        n_results (int): Maximum number of records requested per skill query
                         (default: 1000). Records beyond this limit are not
                         seen, so raise it for larger collections.

    Returns:
        pandas.DataFrame: One row per entity seen in any query result, with an
        'entity_id' column followed by one float '<priority>_<skill>' column
        per requested skill, in the order given by ``skills_dict``. A cell is
        NaN when the entity was not returned for that skill; a skill with no
        hits at all still gets an all-NaN column.
    """
    # Load environment variables (OPENAI_API_KEY is read below)
    load_dotenv()

    # Initialize ChromaDB client with persistence
    client = chromadb.PersistentClient(path="../entity_skills_db")

    # Initialize the OpenAI embedding function
    embedding_function = OpenAIEmbeddingFunction(
        api_key=os.getenv("OPENAI_API_KEY"),
        model_name="text-embedding-3-large"
    )

    # Get existing collection
    collection = client.get_collection(
        name="entity_skills",
        embedding_function=embedding_function
    )

    # Map each distinct skill text to every prefixed column it feeds. Keeping a
    # list per skill (instead of a single skill -> priority mapping) means a
    # skill listed under several priority levels fills all of its columns.
    columns_by_skill = {}
    for priority, skills in skills_dict.items():
        for skill in skills:
            column = f"{priority}_{skill}"
            targets = columns_by_skill.setdefault(skill, [])
            if column not in targets:
                targets.append(column)

    # Output column order follows skills_dict; dict.fromkeys drops repeats.
    skill_columns = list(dict.fromkeys(
        f"{priority}_{skill}"
        for priority, skills in skills_dict.items()
        for skill in skills
    ))

    # entity name -> {column: best rounded distance}; insertion order becomes row order
    best_distances = {}

    # Query each distinct skill once
    for skill, columns in columns_by_skill.items():
        results = collection.query(
            query_texts=[skill],
            n_results=n_results,
            include=["metadatas", "distances"]
        )

        # A single query text was sent, so its hits are at position 0
        for entity_metadata, distance in zip(
            results['metadatas'][0],
            results['distances'][0]
        ):
            # NOTE(review): assumes a record may come back without metadata;
            # such records are grouped under a None entity name.
            entity_id = (entity_metadata or {}).get('entity_name')
            entity_row = best_distances.setdefault(entity_id, {})

            # Rounding is monotonic, so the minimum of the rounded values
            # equals the rounded minimum of the raw distances.
            rounded = round(distance, 2)
            for column in columns:
                current = entity_row.get(column)
                if current is None or rounded < current:
                    entity_row[column] = rounded

    # Passing the column list explicitly guarantees a column for every
    # requested skill, even one that produced no hits.
    records = [
        {'entity_id': entity_id, **distances}
        for entity_id, distances in best_distances.items()
    ]
    df = pd.DataFrame(records, columns=['entity_id'] + skill_columns)

    # Keep distance columns numeric even when they are empty or all-NaN, so
    # numeric operations such as mean() and nsmallest() work on them.
    if skill_columns:
        df = df.astype({column: float for column in skill_columns})

    return df

# Example usage: build the distance matrix and summarise it per skill
if __name__ == "__main__":
    # Skills to look up, grouped by priority level
    skills_to_query = dict(
        Critical=['Python Programming', 'Data Analysis'],
        Required=['Amazon Web Services', 'Machine Learning', 'PyTorch'],
        Preferred=['Docker', 'SQL', 'SQL Server', 'PostgreSQL'],
        Optional=['Kubernetes', 'React', 'GCP', 'Google Cloud', 'Google Cloud Platform'],
    )

    # One row per entity, one best-distance column per "<priority>_<skill>"
    skills_distances_df = create_skills_matrix_with_distances(skills_to_query)

    # Per skill: average distance and how many entities fall under the 0.8 cut-off
    print("\nSkill Distance Statistics:")
    for priority in skills_to_query:
        print(f"\n{priority} Skills:")
        for skill in skills_to_query[priority]:
            skill_distances = skills_distances_df[f"{priority}_{skill}"]
            total = len(skills_distances_df)
            mean_distance = skill_distances.mean()
            close_matches = (skill_distances < 0.8).sum()
            percentage = (close_matches / total) * 100
            print(f"{skill}:")
            print(f"  Mean distance: {mean_distance:.2f}")
            print(f"  Close matches (<0.8): {close_matches} entities ({percentage:.1f}%)")

    # Per skill: the five entities with the smallest distance
    print("\nTop 5 Closest Matches by Skill:")
    for priority in skills_to_query:
        print(f"\n{priority} Skills:")
        for skill in skills_to_query[priority]:
            column_name = f"{priority}_{skill}"
            print(f"\n{skill}:")
            # Narrow to the two displayed columns first, then rank by distance
            pair = skills_distances_df[['entity_id', column_name]]
            top_5 = pair.nsmallest(5, column_name)
            print(top_5.to_string(index=False))

In [None]:
# Show the entity-by-skill distance matrix built by the example cell above as this cell's output.
skills_distances_df