In [None]:
!pip install -e ..

## Data Loading

In [2]:
from instaffo_matching.data.loader import load_data, get_matching_dataframes
from instaffo_matching.data.preprocessor import standardize_data

import pandas as pd

data = load_data("../data/data.json")
talent_df, job_df, labels_df = get_matching_dataframes(data=data)

  from pandas.core import (


## Pre-Process Data and Deeper Anlysis

In [3]:
# Encodes categorical variables (degree, seniority, languages) into numerical values.
# This will be akin to ordinal encoding, preserves the relationships among categories
# Normalizes the data for machine learning model compatibility.
talent_df, job_df = standardize_data(talent_df, job_df)

### Explore hypotesis that I can use some criteria as filters to nerrow down search

In [4]:
# Join the dataframes on index
df = pd.concat([talent_df, job_df, labels_df], axis=1)

# CHECK IF FILTERED DATAFRAME HAS ALL MUST_HAVE LANGUAGES
def check_language_requirements(talent_df, job_df, labels_df):
    """ This is to validate hypothesis that all candidates have all must_have languages 
    and that it can be a filter to select the best candidates"""
    def rating_to_level(rating):
        levels = {'A1': 1, 'A2': 2, 'B1': 3, 'B2': 4, 'C1': 5, 'C2': 6}
        return levels.get(rating, 0)
    
    def has_required_languages(candidate_languages, job_languages):
        candidate_dict = {lang['title']: rating_to_level(lang['rating']) for lang in candidate_languages}
        for job_lang in job_languages:
            if job_lang.get('must_have', False):
                job_title = job_lang['title']
                job_rating = rating_to_level(job_lang['rating'])
                if candidate_dict.get(job_title, 0) < job_rating:
                    return False
        return True
    
    for index in labels_df[labels_df['label'] == True].index:
        candidate_languages = talent_df.loc[index, 'languages']
        job_languages = job_df.loc[index, 'languages']
        if not has_required_languages(candidate_languages, job_languages):
            return False
    return True

# Tun the check language requirement
result = check_language_requirements(talent_df, job_df, labels_df)
print("Hypotesis that all rows with label=True have all must_have criteria fulfilled:", result)

def check_compliance(talent_df, job_df, labels_df):
    non_compliant_rows = []

    for index, label in labels_df[labels_df['label'] == True].iterrows():
        if not label['label']:
            continue  # Skip if the label is not True

        candidate = talent_df.loc[index]
        job = job_df.loc[index]

        # Check Salary
        if candidate['salary_expectation'] > job['max_salary']:
            non_compliant_rows.append({'index': index, 'reason': 'Salary expectation too high'})

        # Check Job Roles
        if not any(role in job['job_roles'] for role in candidate['job_roles']):
            non_compliant_rows.append({'index': index, 'reason': 'No matching job roles'})

        # Check Degree
        if candidate['degree'] < job['min_degree']:
            non_compliant_rows.append({'index': index, 'reason': 'Degree not sufficient'})

        # Check Seniority - compare against all listed seniorities in the job description
        candidate_seniority = candidate['seniority']
        job_seniorities = job['seniorities']
        if all(candidate_seniority < seniority for seniority in job_seniorities):
            non_compliant_rows.append({'index': index, 'reason': 'Seniority not sufficient'})

    return pd.DataFrame(non_compliant_rows)

# Example usage of the check_compliance function
non_compliant_df = check_compliance(talent_df, job_df, labels_df)
print("\nNon-compliant reasons:")
print(non_compliant_df.reason.unique())

# Key findings for candidate filtering:
# 1. Must-have languages, minimum degree, and minimum seniority are confirmed as effective filters.
# 2. Salary expectations > max_salary occur in positive matches, suggesting it's not a strict disqualifier.

Hypotesis that all rows with label=True have all must_have criteria fulfilled: True

Non-compliant reasons:
['Salary expectation too high']


**Conclusion from above**

1. Analysis indicates that salary expectations (salary expectations > max_salary) cannot disqualify a candidate and will not be used as filter to nerrow down search.
2. However, criteria such as must-have languages, minimum degree requirements, and minimal seniority levels match prove effective for filtering candidates. These factors will directly narrow down candidate's list for a role.

In [8]:
# src/search/search.py

import asyncio
from typing import List, Dict
from instaffo_matching.models.retriver import CandidateFilter 
from instaffo_matching.models.ranker import TalentJobRanker
from instaffo_matching.data.loader import load_data, get_matching_dataframes
from instaffo_matching.data.preprocessor import standardize_data

from instaffo_matching.utils.metrics import timing_decorator
#from instaffo_matching.search.cache import SearchCache
from instaffo_matching.data.loader import load_data, get_matching_dataframes


import logging

logger = logging.getLogger(__name__)



class Search:
    def __init__(self, model_path: str = '../models_artifacts/model_03_08_2024.joblib'):
        self.ranker = TalentJobRanker(model_path)
        self.filter = CandidateFilter()
        # self.cache = SearchCache()

    @timing_decorator
    def match(self, talent: Dict, job: Dict) -> Dict:
        # in notebooks/02_exploratory_filter analysis was found that the following 
        # filters can be applied to reduce the number of candidates
        if not self.filter.meets_language_requirements(talent['languages'], job['languages']) or \
           not self.filter.degree_sufficient(talent['degree'], job['min_degree']) or \
           not self.filter.seniority_sufficient(talent['seniority'], job['seniorities']):
            return {
                "talent": talent,
                "job": job,
                "label": False,
                "score": 0.0,
            }

        # If the talent passes the initial filter, proceed with the full matching process
        job_df = pd.DataFrame([job])
        talent_df = pd.DataFrame([talent])
        job_df, talent_df = standardize_data(talent_df, job_df)
        label, ranking_score = self.ranker.predict(job=job_df, talent=talent_df)

        return {
            "talent": talent,
            "job": job,
            "label": bool(label),
            "score": float(ranking_score),
        }

    @timing_decorator
    async def match_bulk(self, talents: List[Dict], jobs: List[Dict]) -> List[Dict]:
        async def match_combination(talent, job):
            return self.match(talent, job)

        all_results = []
        for job in jobs:
            filtered_talents = self.filter.filter_candidates(job, talents)
            tasks = [match_combination(talent, job) for talent in filtered_talents]
            job_results = await asyncio.gather(*tasks)
            all_results.extend(job_results)

        return sorted(all_results, key=lambda x: x['score'], reverse=True)

    def warm_up_cache(self, talents: List[Dict], jobs: List[Dict]):
        """Pre-compute and cache results for common queries"""
        for talent in talents[:100]:  # Limit to top 100 talents for example
            for job in jobs[:100]:  # Limit to top 100 jobs
                self.match(talent, job)

    def update_model(self, new_model_path: str):
        """Hot-swap the model without downtime"""
        new_ranker = TalentJobRanker(new_model_path)
        self.ranker = new_ranker
        logger.info(f"Model updated to {new_model_path}")
        self.cache.clear()  # Clear cache when model is updated
        logger.info(f"Model updated to {new_model_path}")
        
        
# lets test this 

data = load_data("../data/data.json")
talent_df, job_df, labels_df = get_matching_dataframes(data=data)

search = Search('../models_artifacts/model_03_08_2024.joblib')

# test the match function
result = search.match(talent=talent_df.iloc[0].to_dict(), job=job_df.iloc[0].to_dict())
print(result)

# test the match_bulk function
results = asyncio.run(search.match_bulk(talent_df.head(100).to_dict(orient='records'), 
                                        job_df.head(100).to_dict(orient='records')))
    


Error during prediction: 'salary_expectation'


KeyError: 'salary_expectation'