In [1]:
import re

SYNONYMS = {
        # Backend
        "backend dev": "Developer, back-end",
        "backend developer": "Developer, back-end",
        "back-end dev": "Developer, back-end",
        "back-end developer": "Developer, back-end",
        "backend engineer": "Developer, back-end",
        "be developer": "Developer, back-end",
        
        # Frontend
        "frontend dev": "Developer, front-end",
        "frontend developer": "Developer, front-end",
        "front-end dev": "Developer, front-end",
        "front-end developer": "Developer, front-end",
        "frontend engineer": "Developer, front-end",
        "fe developer": "Developer, front-end",
        
        # Full-stack
        "fullstack": "Developer, full-stack",
        "full stack": "Developer, full-stack",
        "fullstack developer": "Developer, full-stack",
        "full-stack dev": "Developer, full-stack",
        "full stack developer": "Developer, full-stack",
        "fs developer": "Developer, full-stack",
        
        # Data roles
        "data analyst": "Data or business analyst",
        "business analyst": "Data or business analyst",
        "analyst": "Data or business analyst",
        "ba": "Data or business analyst",
        "data scientist": "Data scientist",
        "ds": "Data scientist",
        "scientist": "Data scientist",
        "data engineer": "Data engineer",
        "de": "Data engineer",
        
        # AI/ML roles (UPDATED)
        "ml engineer": "AI/ML engineer",
        "machine learning engineer": "AI/ML engineer",
        "ai engineer": "AI/ML engineer",
        "artificial intelligence engineer": "AI/ML engineer",
        "ai developer": "Developer, AI apps or physical AI",
        "ai app developer": "Developer, AI apps or physical AI",
        "physical ai developer": "Developer, AI apps or physical AI",
        "applied scientist": "Applied scientist",
        
        # Cloud/Infrastructure (UPDATED)
        "cloud engineer": "Cloud infrastructure engineer",
        "cloud infrastructure": "Cloud infrastructure engineer",
        "infrastructure engineer": "Cloud infrastructure engineer",
        "sysadmin": "System administrator",
        "sys admin": "System administrator",
        "system admin": "System administrator",
        "devops": "DevOps engineer or professional",
        "devops engineer": "DevOps engineer or professional",
        "devops professional": "DevOps engineer or professional",
        
        # Database (UPDATED)
        "database admin": "Database administrator or engineer",
        "dba": "Database administrator or engineer",
        "db admin": "Database administrator or engineer",
        "database administrator": "Database administrator or engineer",
        "database engineer": "Database administrator or engineer",
        
        # QA/Testing
        "qa": "Developer, QA or test",
        "qa engineer": "Developer, QA or test",
        "tester": "Developer, QA or test",
        "test engineer": "Developer, QA or test",
        "quality assurance": "Developer, QA or test",
        "qa developer": "Developer, QA or test",
        
        # Management
        "project manager": "Project manager",
        "pm": "Product manager",  # Most common PM abbreviation
        "product manager": "Product manager",
        "engineering manager": "Engineering manager",
        "eng manager": "Engineering manager",
        "em": "Engineering manager",
        
        # Security (NEW)
        "security": "Cybersecurity or InfoSec professional",
        "cybersecurity": "Cybersecurity or InfoSec professional",
        "infosec": "Cybersecurity or InfoSec professional",
        "security engineer": "Cybersecurity or InfoSec professional",
        "security professional": "Cybersecurity or InfoSec professional",
        
        # Support (NEW)
        "support engineer": "Support engineer or analyst",
        "support analyst": "Support engineer or analyst",
        "customer support": "Support engineer or analyst",
        
        # Design (NEW)
        "ux": "UX, Research Ops or UI design professional",
        "ui": "UX, Research Ops or UI design professional",
        "ux designer": "UX, Research Ops or UI design professional",
        "ui designer": "UX, Research Ops or UI design professional",
        "designer": "UX, Research Ops or UI design professional",
        "ux researcher": "UX, Research Ops or UI design professional",
        
        # Executive/Leadership (NEW)
        "cto": "Senior executive (C-suite, VP, etc.)",
        "ceo": "Senior executive (C-suite, VP, etc.)",
        "vp": "Senior executive (C-suite, VP, etc.)",
        "executive": "Senior executive (C-suite, VP, etc.)",
        "c-suite": "Senior executive (C-suite, VP, etc.)",
        "founder": "Founder, technology or otherwise",
        "co-founder": "Founder, technology or otherwise",
        
        # Specialized developers
        "researcher": "Academic researcher",
        "academic": "Academic researcher",
        "academic researcher": "Academic researcher",
        "mobile dev": "Developer, mobile",
        "mobile developer": "Developer, mobile",
        "mobile engineer": "Developer, mobile",
        "ios developer": "Developer, mobile",
        "android developer": "Developer, mobile",
        "game dev": "Developer, game or graphics",
        "game developer": "Developer, game or graphics",
        "graphics developer": "Developer, game or graphics",
        "desktop dev": "Developer, desktop or enterprise applications",
        "desktop developer": "Developer, desktop or enterprise applications",
        "enterprise developer": "Developer, desktop or enterprise applications",
        "embedded developer": "Developer, embedded applications or devices",
        "embedded engineer": "Developer, embedded applications or devices",
        "iot developer": "Developer, embedded applications or devices",
        
        # Architecture (NEW)
        "architect": "Architect, software or solutions",
        "software architect": "Architect, software or solutions",
        "solutions architect": "Architect, software or solutions",
        "solution architect": "Architect, software or solutions",
        
        # Finance (NEW)
        "financial analyst": "Financial analyst or engineer",
        "financial engineer": "Financial analyst or engineer",
        "quant": "Financial analyst or engineer",
    }


In [2]:
def normalize_job_title_in_query(query: str) -> str:
    """
    Normalize the job title in the user query based on the SYNONYMS dictionary.
    Handles overlapping matches by prioritizing longer job titles.
    """
    # Find all matches with their positions
    matches = []
    
    for title, normalized_title in SYNONYMS.items():
        # Find all occurrences of this title in the query
        pattern = r'\b' + re.escape(title) + r'\b'
        for match in re.finditer(pattern, query, flags=re.IGNORECASE):
            matches.append({
                'start': match.start(),
                'end': match.end(),
                'original': match.group(),
                'normalized': normalized_title,
                'length': len(title)
            })
    
    # Sort by start position, then by length (longer first for overlaps)
    matches.sort(key=lambda x: (x['start'], -x['length']))
    
    # Remove overlapping matches, keeping only the longest one
    filtered_matches = []
    for match in matches:
        # Check if this match overlaps with any already selected match
        overlaps = False
        for selected in filtered_matches:
            if not (match['end'] <= selected['start'] or match['start'] >= selected['end']):
                # There's an overlap - skip this match
                overlaps = True
                break
        
        if not overlaps:
            filtered_matches.append(match)
    
    # Sort by start position (descending) to replace from end to start
    # This way positions don't shift as we replace
    filtered_matches.sort(key=lambda x: x['start'], reverse=True)
    
    # Apply replacements from end to start
    result = query
    for match in filtered_matches:
        result = result[:match['start']] + match['normalized'] + result[match['end']:]
    
    return result

In [3]:
test_queries = [
    "what is the skills needed to be a data Engineer?",
    "Can you tell me about the backend developer and frontend dev roles?",
    "I want to be a scientist",  # Should only match "scientist", not "data scientist"
    "ml engineer vs data scientist",
    "backend dev skills",
    "AI/ML Engineer"
]

for query in test_queries:
    normalized = normalize_job_title_in_query(query)
    print(f"Original:   {query}")
    print(f"Normalized: {normalized}")
    print()

Original:   what is the skills needed to be a data Engineer?
Normalized: what is the skills needed to be a Data engineer?

Original:   Can you tell me about the backend developer and frontend dev roles?
Normalized: Can you tell me about the Developer, back-end and Developer, front-end roles?

Original:   I want to be a scientist
Normalized: I want to be a Data scientist

Original:   ml engineer vs data scientist
Normalized: AI/ML engineer vs Data scientist

Original:   backend dev skills
Normalized: Developer, back-end skills

Original:   AI/ML Engineer
Normalized: AI/AI/ML engineer

