In [None]:
!pip install -e ..[dev]

## Data Loading

In [None]:
from instaffo_matching.data.loader import load_data, get_matching_dataframes
from instaffo_matching.data.preprocessor import standardize_data

import pandas as pd

# Location of the raw input, relative to the notebook's working directory.
DATA_PATH = "../data/data.json"

# load_data / get_matching_dataframes are project helpers; the second one yields the
# three objects that the rest of the notebook uses (talent rows, job rows, labels).
data = load_data(DATA_PATH)
talent_df, job_df, labels_df = get_matching_dataframes(data=data)

## Pre-Process Data and Deeper Analysis

In [2]:
# standardize_data is a project helper (instaffo_matching.data.preprocessor). Per the
# original notes for this cell it:
#   - encodes the categorical fields (degree, seniority, languages) as numerical values,
#     akin to ordinal encoding, so the ordering among categories is preserved;
#   - normalizes the data for machine learning model compatibility.
# NOTE(review): the inputs are rebound to the outputs, so re-running this cell passes
# already-standardized frames back in -- presumably it should run once per fresh
# kernel; confirm that standardize_data is safe to apply twice.
talent_df, job_df = standardize_data(talent_df, job_df)

### Explore the hypothesis that some criteria can be used as filters to narrow down the search

In [3]:
# Put the talent, job and label frames side by side, aligned on their index.
# Note: talent_df and job_df both carry 'languages' and 'job_roles' columns, and
# pd.concat keeps both copies, so the combined frame has repeated column labels.
frames = [talent_df, job_df, labels_df]
df = pd.concat(frames, axis="columns")

# Check whether every positively labelled pair satisfies the job's must-have languages
def check_language_requirements(talent_df, job_df, labels_df):
    """Tell whether all label=True pairs meet the job's must-have language levels.

    Used to validate the hypothesis that must-have languages can serve as a hard
    filter when selecting candidates.

    Args:
        talent_df: frame whose 'languages' cell is an iterable of dicts with
            'title' and 'rating' keys.
        job_df: frame indexed like talent_df; its 'languages' dicts may also carry
            a 'must_have' flag (treated as False when absent).
        labels_df: frame with a 'label' column; only rows equal to True are checked.

    Returns:
        True if no checked pair misses a must-have language (also when nothing is
        checked), False as soon as one pair falls short.
    """
    # Ordered CEFR levels; any rating not listed here is treated as level 0.
    # NOTE(review): if the ratings were already converted to numbers upstream, every
    # lookup yields 0 and the check passes vacuously -- confirm they are CEFR strings.
    cefr_levels = {'A1': 1, 'A2': 2, 'B1': 3, 'B2': 4, 'C1': 5, 'C2': 6}

    def level_of(rating):
        return cefr_levels.get(rating, 0)

    def meets_must_haves(candidate_languages, job_languages):
        # Candidate's level per language title; a language they lack counts as 0.
        known = {entry['title']: level_of(entry['rating']) for entry in candidate_languages}
        return not any(
            known.get(required['title'], 0) < level_of(required['rating'])
            for required in job_languages
            if required.get('must_have', False)
        )

    positive_index = labels_df[labels_df['label'] == True].index
    return all(
        meets_must_haves(talent_df.loc[idx, 'languages'], job_df.loc[idx, 'languages'])
        for idx in positive_index
    )

# Run the must-have language check over the positively labelled pairs
result = check_language_requirements(
    talent_df=talent_df,
    job_df=job_df,
    labels_df=labels_df,
)
print("Hypotesis that all rows with label=True have all must_have criteria fulfilled:", result)

def check_compliance(talent_df, job_df, labels_df):
    """Collect the criteria violated by positively labelled candidate/job pairs.

    For every index whose 'label' equals True, the candidate row of ``talent_df`` is
    compared with the job row of ``job_df`` at the same index on four criteria:
    salary, job roles, degree and seniority. Each violated criterion adds one record,
    so a single pair can contribute several rows.

    Args:
        talent_df: frame with 'salary_expectation', 'job_roles', 'degree' and
            'seniority' columns.
        job_df: frame indexed like talent_df with 'max_salary', 'job_roles',
            'min_degree' and 'seniorities' columns.
        labels_df: frame with a 'label' column; rows not equal to True are ignored.

    Returns:
        pd.DataFrame with columns 'index' and 'reason'. The columns are present even
        when no violation is found, so callers can always access ``.reason``.
    """
    non_compliant_rows = []

    # The frame is filtered up front, so no per-row label re-check is needed.
    for index in labels_df[labels_df['label'] == True].index:
        candidate = talent_df.loc[index]
        job = job_df.loc[index]

        # Check Salary
        if candidate['salary_expectation'] > job['max_salary']:
            non_compliant_rows.append({'index': index, 'reason': 'Salary expectation too high'})

        # Check Job Roles - at least one of the candidate's roles must be listed by the job
        if not any(role in job['job_roles'] for role in candidate['job_roles']):
            non_compliant_rows.append({'index': index, 'reason': 'No matching job roles'})

        # Check Degree
        if candidate['degree'] < job['min_degree']:
            non_compliant_rows.append({'index': index, 'reason': 'Degree not sufficient'})

        # Check Seniority - insufficient only if the candidate is below every listed
        # seniority, i.e. below the lowest one the job accepts.
        # NOTE(review): all() over an empty list is True, so a job that lists no
        # seniorities is reported as insufficient -- confirm that case cannot occur.
        candidate_seniority = candidate['seniority']
        job_seniorities = job['seniorities']
        if all(candidate_seniority < seniority for seniority in job_seniorities):
            non_compliant_rows.append({'index': index, 'reason': 'Seniority not sufficient'})

    # Explicit columns keep the schema stable for an empty result.
    return pd.DataFrame(non_compliant_rows, columns=['index', 'reason'])

# Gather every violated criterion among the positively labelled pairs
non_compliant_df = check_compliance(
    talent_df=talent_df,
    job_df=job_df,
    labels_df=labels_df,
)
print("\nNon-compliant reasons:")
distinct_reasons = non_compliant_df.reason.unique()
print(distinct_reasons)

# In the recorded run the only reason reported is 'Salary expectation too high':
# positive matches do exceed max_salary, so salary is not a must-have and cannot be
# used as a hard filter. Must-have languages, degree and seniority were never
# violated by a positive match, so those can be used as filters.

Hypotesis that all rows with label=True have all must_have criteria fulfilled: True

Non-compliant reasons:
['Salary expectation too high']


**Conclusion from above**

1. The analysis shows that a salary expectation above the job's `max_salary` does not disqualify a candidate (some positively labelled pairs exceed it), so salary will not be used as a filter to narrow down the search.
2. However, must-have languages, the minimum degree requirement, and the minimum seniority level are met by every positively labelled pair, so they prove effective for filtering candidates. These criteria will be used directly to narrow down the list of candidates for a role.