In [13]:
import re
from typing import Mapping, Optional

# Lookup table from informal job-title spellings (lowercase) to the canonical
# title they stand for. It is written as (canonical, aliases) groups and
# flattened into an alias -> canonical dict; the groups are listed in the same
# order the aliases should appear in the resulting dict.
SYNONYMS = {
    alias: canonical
    for canonical, aliases in (
        # Backend
        ("Developer, back-end", [
            "backend dev",
            "backend developer",
            "back-end dev",
            "back-end developer",
            "backend engineer",
            "be developer",
        ]),
        # Frontend
        ("Developer, front-end", [
            "frontend dev",
            "frontend developer",
            "front-end dev",
            "front-end developer",
            "frontend engineer",
            "fe developer",
        ]),
        # Full-stack
        ("Developer, full-stack", [
            "fullstack",
            "full stack",
            "fullstack developer",
            "full-stack dev",
            "full stack developer",
            "fs developer",
        ]),
        # Data roles
        ("Data or business analyst", [
            "data analyst",
            "business analyst",
            "analyst",
            "ba",
        ]),
        ("Data scientist", [
            "data scientist",
            "ds",
            "scientist",
        ]),
        ("Data engineer", [
            "data engineer",
            "de",
        ]),
        # AI/ML roles
        ("AI/ML engineer", [
            "ml engineer",
            "machine learning engineer",
            "ai engineer",
            "artificial intelligence engineer",
        ]),
        ("Developer, AI apps or physical AI", [
            "ai developer",
            "ai app developer",
            "physical ai developer",
        ]),
        ("Applied scientist", [
            "applied scientist",
        ]),
        # Cloud / infrastructure
        ("Cloud infrastructure engineer", [
            "cloud engineer",
            "cloud infrastructure",
            "infrastructure engineer",
        ]),
        ("System administrator", [
            "sysadmin",
            "sys admin",
            "system admin",
        ]),
        ("DevOps engineer or professional", [
            "devops",
            "devops engineer",
            "devops professional",
        ]),
        # Database
        ("Database administrator or engineer", [
            "database admin",
            "dba",
            "db admin",
            "database administrator",
            "database engineer",
        ]),
        # QA / testing
        ("Developer, QA or test", [
            "qa",
            "qa engineer",
            "tester",
            "test engineer",
            "quality assurance",
            "qa developer",
        ]),
        # Management
        ("Project manager", [
            "project manager",
        ]),
        ("Product manager", [
            "pm",  # read as *product* manager, the most common expansion
            "product manager",
        ]),
        ("Engineering manager", [
            "engineering manager",
            "eng manager",
            "em",
        ]),
        # Security
        ("Cybersecurity or InfoSec professional", [
            "security",
            "cybersecurity",
            "infosec",
            "security engineer",
            "security professional",
        ]),
        # Support
        ("Support engineer or analyst", [
            "support engineer",
            "support analyst",
            "customer support",
        ]),
        # Design
        ("UX, Research Ops or UI design professional", [
            "ux",
            "ui",
            "ux designer",
            "ui designer",
            "designer",
            "ux researcher",
        ]),
        # Executive / leadership
        ("Senior executive (C-suite, VP, etc.)", [
            "cto",
            "ceo",
            "vp",
            "executive",
            "c-suite",
        ]),
        ("Founder, technology or otherwise", [
            "founder",
            "co-founder",
        ]),
        # Research and specialized developers
        ("Academic researcher", [
            "researcher",
            "academic",
            "academic researcher",
        ]),
        ("Developer, mobile", [
            "mobile dev",
            "mobile developer",
            "mobile engineer",
            "ios developer",
            "android developer",
        ]),
        ("Developer, game or graphics", [
            "game dev",
            "game developer",
            "graphics developer",
        ]),
        ("Developer, desktop or enterprise applications", [
            "desktop dev",
            "desktop developer",
            "enterprise developer",
        ]),
        ("Developer, embedded applications or devices", [
            "embedded developer",
            "embedded engineer",
            "iot developer",
        ]),
        # Architecture
        ("Architect, software or solutions", [
            "architect",
            "software architect",
            "solutions architect",
            "solution architect",
        ]),
        # Finance
        ("Financial analyst or engineer", [
            "financial analyst",
            "financial engineer",
            "quant",
        ]),
    )
    for alias in aliases
}


In [14]:
def normalize_job_title_in_query(
    query: str, synonyms: Optional[Mapping[str, str]] = None
) -> str:
    """Rewrite informal job titles in *query* to their canonical form.

    Every key of the synonym table is searched for as a whole word, ignoring
    case. Where candidate matches overlap, the leftmost one wins; among
    candidates that start at the same position, the longest wins.

    Text that already spells a canonical title exactly (case-sensitive) is
    left alone, so a canonical title such as "Developer, QA or test" is not
    expanded a second time through the "QA" it contains.

    Args:
        query: Free-form user text.
        synonyms: Mapping of informal title to canonical title. Defaults to
            the module-level ``SYNONYMS`` table.

    Returns:
        The query with every selected match replaced by its canonical title.
    """
    if synonyms is None:
        synonyms = SYNONYMS

    # Lookarounds instead of ``\b``: they behave the same for titles that
    # start and end with a word character, and also work for canonical titles
    # that end in punctuation such as ")".
    left, right = r'(?<!\w)', r'(?!\w)'

    # Spans of the query that are already in canonical form.
    protected = [
        found.span()
        for canonical in set(synonyms.values())
        if canonical
        for found in re.finditer(left + re.escape(canonical) + right, query)
    ]

    candidates = []
    for title, normalized_title in synonyms.items():
        if not title:
            continue  # an empty key would match at every position
        pattern = left + re.escape(title) + right
        for found in re.finditer(pattern, query, flags=re.IGNORECASE):
            start, end = found.span()
            inside_canonical = any(
                start < p_end and p_start < end for p_start, p_end in protected
            )
            if not inside_canonical:
                candidates.append((start, end, normalized_title))

    # Leftmost first; for equal starts the longer match comes first.
    candidates.sort(key=lambda c: (c[0], c[0] - c[1]))

    # Single left-to-right sweep: because candidates are ordered by start, a
    # candidate overlaps an accepted one exactly when it begins before the
    # end of the most recently accepted match.
    pieces = []
    cursor = 0
    for start, end, replacement in candidates:
        if start < cursor:
            continue
        pieces.append(query[cursor:start])
        pieces.append(replacement)
        cursor = end
    pieces.append(query[cursor:])

    return "".join(pieces)

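# v2: second version of normalize_job_title_in_query (defined in the next cell);
# it adds a guard so text already in normalized form is not normalized again.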

In [15]:
import re

def normalize_job_title_in_query(
    query: str, synonyms: Optional[Mapping[str, str]] = None
) -> str:
    """Rewrite informal job titles in *query* to their canonical form.

    Synonyms are matched case-insensitively as standalone terms (not flanked
    by ASCII letters or digits). Longer synonyms are tried first, and a match
    is dropped if it overlaps one that was already accepted.

    Text that is already normalized is never rewritten:

    * if the whole query (ignoring surrounding whitespace) is a canonical
      title, the stripped query is returned as is;
    * any exact, case-sensitive occurrence of a canonical title inside the
      query is protected, so synonyms embedded in it (e.g. the "QA" in
      "Developer, QA or test") are skipped. Only matches that overlap such an
      occurrence are skipped; the same synonym elsewhere in the query is
      still normalized.

    Args:
        query: Free-form user text.
        synonyms: Mapping of informal title to canonical title. Defaults to
            the module-level ``SYNONYMS`` table.

    Returns:
        The query with every accepted match replaced by its canonical title.
    """
    if synonyms is None:
        synonyms = SYNONYMS

    canonical_titles = set(synonyms.values())

    # Fast path: the query is exactly one canonical title.
    stripped = query.strip()
    if stripped in canonical_titles:
        return stripped

    # A term must not be glued to neighbouring ASCII letters or digits.
    left, right = r'(?<![a-zA-Z0-9])', r'(?![a-zA-Z0-9])'

    # Spans that may no longer be touched. Seeded with every exact occurrence
    # of a canonical title, then extended with each accepted replacement.
    occupied = [
        found.span()
        for canonical in canonical_titles
        if canonical
        for found in re.finditer(left + re.escape(canonical) + right, query)
    ]

    replacements = []
    longest_first = sorted(
        synonyms.items(), key=lambda item: len(item[0]), reverse=True
    )
    for title, normalized_title in longest_first:
        # Nothing to do for identity mappings; an empty key would match at
        # every position.
        if not title or title == normalized_title:
            continue

        pattern = left + re.escape(title) + right
        for found in re.finditer(pattern, query, flags=re.IGNORECASE):
            start, end = found.span()
            clashes = any(
                start < taken_end and taken_start < end
                for taken_start, taken_end in occupied
            )
            if clashes:
                continue
            occupied.append((start, end))
            replacements.append((start, end, normalized_title))

    # Accepted replacements never overlap, so stitch the result together in
    # one left-to-right pass.
    replacements.sort(key=lambda item: item[0])
    pieces = []
    cursor = 0
    for start, end, normalized_title in replacements:
        pieces.append(query[cursor:start])
        pieces.append(normalized_title)
        cursor = end
    pieces.append(query[cursor:])

    return "".join(pieces)


In [16]:
# Part 1: free-form user queries that contain informal job titles.
test_queries = [
    "I want to be a backend developer",
    "What does a ml engineer do?",
    "How to become a data scientist?",
    "Skills for fullstack developer",
    "frontend dev vs backend dev comparison",
    "I'm looking for architect jobs",
    "cloud engineer salary",
    "How to become a security professional",
    "qa engineer interview questions",
    "devops vs developer",
    "financial analyst career path",
    "founder looking for cto",
    "support engineer responsibilities",
    "ux designer portfolio tips",
]

print("Testing with actual user queries:")
for test_query in test_queries:
    normalized = normalize_job_title_in_query(test_query)
    if normalized == test_query:
        continue  # nothing was rewritten, so there is nothing to report
    print(f"Original: {test_query}", f"Normalized: {normalized}\n", sep="\n")

# Part 2: a query that is already a canonical title must come back untouched.
print("\nTesting that normalized values stay unchanged:")
normalized_titles = [
    'Academic researcher',
    'AI/ML engineer',
    'Applied scientist',
    'Architect, software or solutions',
    'Cloud infrastructure engineer',
    'Cybersecurity or InfoSec professional',
    'Data engineer',
    'Data or business analyst',
    'Data scientist',
    'Database administrator or engineer',
    'Developer, AI apps or physical AI',
    'Developer, back-end',
    'Developer, desktop or enterprise applications',
    'Developer, embedded applications or devices',
    'Developer, front-end',
    'Developer, full-stack',
    'Developer, game or graphics',
    'Developer, mobile',
    'Developer, QA or test',
    'DevOps engineer or professional',
    'Engineering manager',
    'Financial analyst or engineer',
    'Founder, technology or otherwise',
    'Product manager',
    'Project manager',
    'Senior executive (C-suite, VP, etc.)',
    'Support engineer or analyst',
    'System administrator',
    'UX, Research Ops or UI design professional',
]

for title in normalized_titles:
    result = normalize_job_title_in_query(title)
    print(
        f"✓ '{title}' correctly unchanged"
        if title == result
        else f"✗ '{title}' incorrectly changed to '{result}'"
    )

Testing with actual user queries:
Original: I want to be a backend developer
Normalized: I want to be a Developer, back-end

Original: What does a ml engineer do?
Normalized: What does a AI/ML engineer do?

Original: How to become a data scientist?
Normalized: How to become a Data scientist?

Original: Skills for fullstack developer
Normalized: Skills for Developer, full-stack

Original: frontend dev vs backend dev comparison
Normalized: Developer, front-end vs Developer, back-end comparison

Original: I'm looking for architect jobs
Normalized: I'm looking for Architect, software or solutions jobs

Original: cloud engineer salary
Normalized: Cloud infrastructure engineer salary

Original: How to become a security professional
Normalized: How to become a Cybersecurity or InfoSec professional

Original: qa engineer interview questions
Normalized: Developer, QA or test interview questions

Original: devops vs developer
Normalized: DevOps engineer or professional vs developer

Original: fi