In [None]:
!pip install -e ..

In [1]:
import json

from instaffo_matching.utils.logging import setup_logger
from instaffo_matching.data.loader import load_data, get_matching_dataframes
from instaffo_matching.data.preprocessor import standardize_data
from instaffo_matching.search.search import Search


# Create the notebook's logger with the project's setup_logger helper (default arguments).
logger = setup_logger()

  from pandas.core import (


In [2]:
# Read the raw dataset from the JSON file under ../data.
data = load_data("../data/data.json")
# Unpack the loader's result into the talent, job and label frames used by the cells below.
talent_df, job_df, labels_df = get_matching_dataframes(data=data)

In [3]:
# Build the Search object from the saved model artifact at this path.
search = Search('../models_artifacts/model_03_08_2024.joblib')

# Smoke-test the single-pair API: match the first talent row against the first job row,
# passing each row as a plain dict.
result = search.match(talent=talent_df.iloc[0].to_dict(), job=job_df.iloc[0].to_dict())
# Pretty-print the returned structure as indented JSON.
print(json.dumps(result, indent=4))

2024-08-03 20:09:29 INFO [ranker.py:106]: Loaded model and feature engineer from ../models_artifacts/model_03_08_2024.joblib
2024-08-03 20:09:29 INFO [ranker.py:91]: Initialized GradientBoostingStrategy model and FeatureEngineer.
{
    "talent": {
        "degree": "bachelor",
        "job_roles": [
            "frontend-developer",
            "backend-developer",
            "full-stack-developer",
            "java-developer",
            "mobile-developer"
        ],
        "languages": [
            {
                "rating": "C2",
                "title": "German"
            },
            {
                "rating": "C2",
                "title": "English"
            },
            {
                "rating": "B2",
                "title": "French"
            },
            {
                "rating": "A2",
                "title": "Turkish"
            }
        ],
        "salary_expectation": 48000,
        "seniority": "junior"
    },
    "job": {
        "job_roles": [

In [7]:
import time 
# mesure the time it takes to run the match_bulk and match_bulk2 functions
t1 = time.time()
# test the match_bulk function
results = await search.match_bulk(talent_df.head(100).to_dict(orient='records'), 
                                job_df.head(100).to_dict(orient='records'))
t2 = time.time()
print(f"match_bulk took {t2-t1} seconds")

2024-08-03 20:04:49 INFO [metrics.py:12]: Function match_bulk took 0.0000 seconds to execute.
2024-08-03 20:04:49 INFO [ranker.py:227]: Prediction made successfully.


2024-08-03 20:04:49 INFO [metrics.py:12]: Function match took 0.0170 seconds to execute.
2024-08-03 20:04:49 INFO [ranker.py:227]: Prediction made successfully.
2024-08-03 20:04:49 INFO [metrics.py:12]: Function match took 0.0170 seconds to execute.
2024-08-03 20:04:49 INFO [ranker.py:227]: Prediction made successfully.
2024-08-03 20:04:49 INFO [metrics.py:12]: Function match took 0.0200 seconds to execute.
2024-08-03 20:04:49 INFO [ranker.py:227]: Prediction made successfully.
2024-08-03 20:04:49 INFO [metrics.py:12]: Function match took 0.0180 seconds to execute.
2024-08-03 20:04:49 INFO [ranker.py:227]: Prediction made successfully.
2024-08-03 20:04:49 INFO [metrics.py:12]: Function match took 0.0210 seconds to execute.
2024-08-03 20:04:49 INFO [ranker.py:227]: Prediction made successfully.
2024-08-03 20:04:49 INFO [metrics.py:12]: Function match took 0.0160 seconds to execute.
2024-08-03 20:04:49 INFO [ranker.py:227]: Prediction made successfully.
2024-08-03 20:04:49 INFO [metrics.

In [None]:
# Preview only the first few entries; the full bulk result list is far too large to
# dump into the cell output.
results[:5]

In [5]:
import time 
t1 = time.time()
# test the match_bulk function
results = await search.match_bulk2(talent_df.head(100).to_dict(orient='records'), 
                                job_df.head(100).to_dict(orient='records'))
t2 = time.time()
print(f"match_bulk took {t2-t1} seconds")

match_bulk took 26.848660707473755 seconds


In [8]:
def matches_criteria(results):
    """
    Checks if the given results list matches the specified criteria.

    The function performs the following checks:
    1. Ensures `results` is a list.
    2. Ensures each item in the list is a dictionary.
    3. Ensures each dictionary contains the required keys: 'talent', 'job', 'label', and 'score'.
    4. Ensures 'talent' is a dictionary containing the keys: 'degree', 'job_roles', 'languages',
       'salary_expectation', and 'seniority'.
    5. Ensures 'job' is a dictionary containing the keys: 'job_roles', 'languages', 'max_salary',
       'min_degree', and 'seniorities'.
    6. Ensures the values of these keys have the expected types, that 'label' is a bool and
       that 'score' is a float.
    7. Ensures the list is sorted in descending order by 'score' (equal scores are allowed).

    Malformed input never raises: any violation, including a 'talent' or 'job' value that is
    not a dictionary, yields False. An empty list is considered valid.

    Args:
        results (list): The list of results to be checked.

    Returns:
        bool: True if the results match the criteria, False otherwise.
    """
    # Required fields of each nested record, mapped to the type their value must have.
    talent_schema = {
        'degree': str,
        'job_roles': list,
        'languages': list,
        'salary_expectation': int,
        'seniority': str,
    }
    job_schema = {
        'job_roles': list,
        'languages': list,
        'max_salary': int,
        'min_degree': str,
        'seniorities': list,
    }
    required_keys = {'talent', 'job', 'label', 'score'}

    if not isinstance(results, list):
        return False

    # Start above every possible score so the first item always passes the ordering check.
    previous_score = float('inf')

    for item in results:
        if not isinstance(item, dict):
            return False
        if not required_keys.issubset(item.keys()):
            return False

        # Validate the two nested records against their schemas.
        for section, schema in (('talent', talent_schema), ('job', job_schema)):
            record = item[section]
            # Guard first: calling .keys() on a non-dict value (None, list, ...) would raise.
            if not isinstance(record, dict):
                return False
            if not set(schema).issubset(record.keys()):
                return False
            for field, expected_type in schema.items():
                if not isinstance(record[field], expected_type):
                    return False

        if not isinstance(item['label'], bool):
            return False
        if not isinstance(item['score'], float):
            return False

        # Scores must never increase from one item to the next.
        if item['score'] > previous_score:
            return False
        previous_score = item['score']

    return True

# Validate the bulk-match output currently held in `results` and print the verdict.
print(matches_criteria(results))

True


In [16]:
# Number of entries currently held in `results`.
len(results)

6813