In [1]:
from catboost import CatBoostRanker, Pool, MetricVisualizer
from copy import deepcopy
from data.heuristic_extractor import HeuristicExtractor
from data.llm_extractor import LlmExtractor
from data.resume_extractor import ResumeExtractor
import json
import numpy as np
import pandas as pd

class CatBoostVacancyRanker:           
            
    def form_train_sequence(self, vacancy_dataset, user_dataset, index, is_train=True):
        count_skipped = 0
        user_dict = user_dataset.iloc[index]
        vacancy_uuid = user_dict['vacancy_uuid']
        vacancy_dict = vacancy_dataset[vacancy_dataset['vacancy_uuid'] == vacancy_uuid].iloc[0]

        result_dict = {}

        # Formulate features

        for grade_type in ['junior', 'middle', 'senior']:
            result_dict[f'{grade_type}_match'] = (user_dict[f'is_{grade_type}'] == vacancy_dict[f'is_{grade_type}'])

        user_hard_skills = set(user_dict['key_skills'])
        user_extra_skills = set(user_dict['extra_skills'])
        vacancy_hard_skills = set(vacancy_dict['hard_skills'])
        vacancy_extra_skills = set(vacancy_dict['extra_skills'])

        result_dict['hard_skills_intersection'] = len(vacancy_hard_skills) / max(len(vacancy_hard_skills.union(user_hard_skills)), 1)
        # result_dict['soft_skills_intersection'] = len(vacancy_extra_skills) / max(len(vacancy_extra_skills.union(user_extra_skills)), 1)
        result_dict['working_months'] = int(user_dict['working_months'])
        if vacancy_dict['monthly_wage'] is not None and vacancy_dict['monthly_wage'] != 'null':
            result_dict['monthly_wage'] = int(vacancy_dict['monthly_wage']) / 1000
        else:
            count_skipped += 1
            result_dict['monthly_wage'] = None
        try:
            result_dict['experience'] = int(vacancy_dict['experience'])
        except ValueError:
            count_skipped += 1
            result_dict['experience'] = None
        result_dict['age'] = user_dict['age']
        result_dict['benefits'] = vacancy_dict['benefits']
        if is_train:
            result_dict['relevance'] = user_dict['relevance']
            result_dict['group'] = self.vacancies_compress[user_dict['vacancy_uuid']]
            result_dict['weights'] = np.float64(3 if result_dict['monthly_wage'] is not None and result_dict['monthly_wage'] >= 200 else 1)

        return result_dict
    
    def __init__(self):
        pass

    def fit(self, vacancies):
        heuristic_extractor = HeuristicExtractor(vacancies)
        resume_extractor = ResumeExtractor(vacancies)
        llm_extractor = LlmExtractor(vacancies)

        heuristic_dataset = heuristic_extractor.form_dataset()
        llm_dataset = llm_extractor.form_dataset()
        resume_dataset = resume_extractor.form_dataset()

        self.vacancies_compress = {}
        for index, item in enumerate(vacancies):
            self.vacancies_compress[item['vacancy']['uuid']] = index

        cumulative_data = [self.form_train_sequence(llm_dataset, resume_dataset, index) for index in range(len(resume_dataset))]
        preprocessed_train = pd.DataFrame.from_dict(cumulative_data).sort_values(by=['group'])
        
        indexes = np.arange(len(self.vacancies_compress))
        np.random.shuffle(indexes)
        group_tr, group_te = indexes[:25], indexes[25:]
        train_indexes, test_indexes = [], []
        for index, row in preprocessed_train.iterrows():
            if index in group_tr:
                train_indexes.append(index)
            else:
                test_indexes.append(index)

        train_indexes = np.array(train_indexes)
        test_indexes = np.array(test_indexes)

        X_train, X_test = preprocessed_train.iloc[train_indexes], preprocessed_train.iloc[test_indexes]

        y_train = X_train['relevance']
        train_groups = X_train['group']
        X_train.drop(columns=['relevance', 'group', 'weights'], inplace=True)
    
        y_test = X_test['relevance']
        test_groups = X_test['group']
        X_test.drop(columns=['relevance', 'group', 'weights'], inplace=True)

        train = Pool(
            data=X_train,
            label=y_train,
            group_id=train_groups,
        )
    
        test = Pool(
            data=X_test,
            label=y_test,
            group_id=test_groups,
        )

        default_parameters = {
            'iterations': 400,
            'custom_metric': ['NDCG', 'PFound', 'AverageGain:top=10'],
            'verbose': False,
            'random_seed': 0,
            'max_depth': 6,
            'learning_rate': 1 * 10 ** -4
        }

        loss_function = 'PairLogit'
        additional_params = {'custom_metric': ['PrecisionAt:top=5', 'RecallAt:top=5', 'MAP:top=5', 'NDCG:top=10']}

        parameters = deepcopy(default_parameters)
        parameters['loss_function'] = loss_function
        parameters['train_dir'] = loss_function

        if additional_params is not None:
            parameters.update(additional_params)

        model = CatBoostRanker(**parameters)
        model.fit(train, eval_set=test, plot=False)

        self.model = model

    def predict(self, vacancy, resumes_id):
        new_json = {}
        new_json['vacancy'] = vacancy['vacancy']
        new_json['resumes'] = []
        for i in vacancy['resumes']:
            if i['uuid'] in resumes_id:
                new_json['resumes'].append(i)

        vacancy = new_json
        resumes_extractor = ResumeExtractor([vacancy])
        resumes_dset = resumes_extractor.form_dataset()
        vacancy_dset = LlmExtractor([vacancy], test=True).form_dataset()
        description = [self.form_train_sequence(vacancy_dset, resumes_dset, i, is_train=False) for i in range(len(resumes_dset))]
        predictions = self.model.predict(pd.DataFrame(description))
        indexes = np.argsort(predictions)[::-1]
        return np.array(resumes_id)[indexes]

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/aaderevyagin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /home/aaderevyagin/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
def general_pipeline():
    train_json = open('./data/case_2_data_for_members.json')
    train_vacancies = json.load(train_json)
    model = CatBoostVacancyRanker()
    model.fit(train_vacancies)

    # Here we get our predictions from extractors

    test_json = open('./data/case_2_reference_without_resume_sorted.json')
    test_vacancies = json.load(test_json)

    resumes = test_vacancies['resumes']
    resumes_id = [resume['uuid'] for resume in resumes]

    return model.predict(test_vacancies, resumes_id)

general_pipeline()

get_grade data extractor fullness: 14/29
get_work_type data extractor fullness: 19/29
get_experience data extractor fullness: 12/29
get_experience data extractor fullness: 29/29


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train.drop(columns=['relevance', 'group', 'weights'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test.drop(columns=['relevance', 'group', 'weights'], inplace=True)


array(['f4f84f9b-6c7a-366d-b8b4-84b0e6f337c7',
       '70547288-56fc-3cab-9bef-f752f2e1b21f',
       'fd9c4130-177f-3546-8974-189a52fcc496',
       '1ed1795c-43b2-304e-8386-bf53d0ebec18',
       '6abae195-2cbf-3bbe-8dfb-8d2dcc3e88b6',
       '6561771c-7ef3-3e50-ab3a-ba8547201480',
       'f111eaab-2416-33f5-8a95-8ec11e8485c4',
       'e3a42770-927c-3d59-a761-bcc3549d1054',
       '62092df7-1166-3602-b509-a4df3999ffe5',
       '6eaade34-b13c-3d17-a573-dfa786cbb0d5',
       'c70de373-9f3a-3647-ab66-f25e98c29409',
       '9cef8747-af3a-3833-9098-3bfabf1e40eb',
       'bd53a1ea-ae17-3506-97b4-287aa8e9cc21',
       '3c4020e7-172f-3ed5-adc9-fe9a62097b26',
       '5785c202-6744-3e1b-994a-d5bffc6aad14',
       'b1357f8e-3c98-3142-81e7-fd729ec50ca0',
       '3e3a379f-226e-305e-b7d8-cf341e00cbd7',
       'feedc674-8547-35d0-9f63-fd6a3d3fedcd',
       'f56395de-af47-3f65-90e9-37d44ea07610',
       'd9fffe2b-cba9-3ff2-bd47-b8bfc48cbe89',
       '8e193a9a-d338-36b5-800b-45de7a809157',
       '156ce