In [1]:
from catboost import CatBoostRanker, Pool, MetricVisualizer
from copy import deepcopy
from data.heuristic_extractor import HeuristicExtractor
from data.llm_extractor import LlmExtractor
from data.resume_extractor import ResumeExtractor
import json
import numpy as np
import pandas as pd
from models.als_model import ALSModel

class CatBoostVacancyRanker:           
            
    def form_train_sequence(self, vacancy_dataset, user_dataset, index, is_train=True):
        count_skipped = 0
        user_dict = user_dataset.iloc[index]
        vacancy_uuid = user_dict['vacancy_uuid']
        vacancy_dict = vacancy_dataset[vacancy_dataset['vacancy_uuid'] == vacancy_uuid].iloc[0]

        result_dict = {}

        # Formulate features

        for grade_type in ['junior', 'middle', 'senior']:
            result_dict[f'{grade_type}_match'] = (user_dict[f'is_{grade_type}'] == vacancy_dict[f'is_{grade_type}'])

        user_hard_skills = set(user_dict['key_skills'])
        user_extra_skills = set(user_dict['extra_skills'])
        vacancy_hard_skills = set(vacancy_dict['hard_skills'])
        vacancy_extra_skills = set(vacancy_dict['extra_skills'])

        result_dict['hard_skills_intersection'] = len(vacancy_hard_skills) / max(len(vacancy_hard_skills.union(user_hard_skills)), 1)
        # result_dict['soft_skills_intersection'] = len(vacancy_extra_skills) / max(len(vacancy_extra_skills.union(user_extra_skills)), 1)
        result_dict['working_months'] = int(user_dict['working_months'])
        if vacancy_dict['monthly_wage'] is not None and vacancy_dict['monthly_wage'] != 'null':
            result_dict['monthly_wage'] = int(vacancy_dict['monthly_wage']) / 1000
        else:
            count_skipped += 1
            result_dict['monthly_wage'] = None
        try:
            result_dict['experience'] = int(vacancy_dict['experience'])
        except ValueError:
            count_skipped += 1
            result_dict['experience'] = None
        result_dict['age'] = user_dict['age']
        result_dict['benefits'] = vacancy_dict['benefits']
        if is_train:
            result_dict['relevance'] = user_dict['relevance']
            result_dict['group'] = self.vacancies_compress[user_dict['vacancy_uuid']]
            result_dict['weights'] = np.float64(3 if result_dict['monthly_wage'] is not None and result_dict['monthly_wage'] >= 200 else 1)

        return result_dict
    
    def __init__(self):
        pass

    def fit(self, vacancies):
        heuristic_extractor = HeuristicExtractor(vacancies)
        resume_extractor = ResumeExtractor(vacancies)
        llm_extractor = LlmExtractor(vacancies)

        heuristic_dataset = heuristic_extractor.form_dataset()
        llm_dataset = llm_extractor.form_dataset()
        resume_dataset = resume_extractor.form_dataset()

        self.vacancies_compress = {}
        for index, item in enumerate(vacancies):
            self.vacancies_compress[item['vacancy']['uuid']] = index

        cumulative_data = [self.form_train_sequence(llm_dataset, resume_dataset, index) for index in range(len(resume_dataset))]
        preprocessed_train = pd.DataFrame.from_dict(cumulative_data).sort_values(by=['group'])
        
        indexes = np.arange(len(self.vacancies_compress))
        np.random.shuffle(indexes)
        group_tr, group_te = indexes[:25], indexes[25:]
        train_indexes, test_indexes = [], []
        for index, row in preprocessed_train.iterrows():
            if index in group_tr:
                train_indexes.append(index)
            else:
                test_indexes.append(index)

        train_indexes = np.array(train_indexes)
        test_indexes = np.array(test_indexes)

        X_train, X_test = preprocessed_train.iloc[train_indexes], preprocessed_train.iloc[test_indexes]

        y_train = X_train['relevance']
        train_groups = X_train['group']
        X_train.drop(columns=['relevance', 'group', 'weights'], inplace=True)
    
        y_test = X_test['relevance']
        test_groups = X_test['group']
        X_test.drop(columns=['relevance', 'group', 'weights'], inplace=True)

        train = Pool(
            data=X_train,
            label=y_train,
            group_id=train_groups,
        )
    
        test = Pool(
            data=X_test,
            label=y_test,
            group_id=test_groups,
        )

        default_parameters = {
            'iterations': 400,
            'custom_metric': ['NDCG', 'PFound', 'AverageGain:top=10'],
            'verbose': False,
            'random_seed': 0,
            'max_depth': 6,
            'learning_rate': 1 * 10 ** -4
        }

        loss_function = 'PairLogit'
        additional_params = {'custom_metric': ['PrecisionAt:top=5', 'RecallAt:top=5', 'MAP:top=5', 'NDCG:top=10']}

        parameters = deepcopy(default_parameters)
        parameters['loss_function'] = loss_function
        parameters['train_dir'] = loss_function

        if additional_params is not None:
            parameters.update(additional_params)

        model = CatBoostRanker(**parameters)
        model.fit(train, eval_set=test, plot=False)

        self.model = model

    def predict(self, vacancy, resumes_id):
        new_json = {}
        new_json['vacancy'] = vacancy['vacancy']
        new_json['resumes'] = []
        for i in vacancy['resumes']:
            if i['uuid'] in resumes_id:
                new_json['resumes'].append(i)

        vacancy = new_json
        resumes_extractor = ResumeExtractor([vacancy])
        resumes_dset = resumes_extractor.form_dataset()
        vacancy_dset = LlmExtractor([vacancy], test=True).form_dataset()
        description = [self.form_train_sequence(vacancy_dset, resumes_dset, i, is_train=False) for i in range(len(resumes_dset))]
        predictions = self.model.predict(pd.DataFrame(description))
        indexes = np.argsort(predictions)[::-1]
        return np.array(resumes_id)[indexes]

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/aaderevyagin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /home/aaderevyagin/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [13]:
def general_pipeline():
    train_json = open('./data/case_2_data_for_members.json')
    train_vacancies = json.load(train_json)

    # Get result from selectors

    llm_control = pd.read_csv('./data/llm_control_dataset.csv')
    resume_control = pd.read_csv('./data/resume_control_dataset.csv')
    llm_dataset = pd.read_csv('./data/llm_dataset.csv')
    resume_dataset = pd.read_csv('./data/resume_dataset.csv')

    test_json = open('./data/case_2_reference_without_resume_sorted.json')
    test_vacancies = json.load(test_json)

    resumes = test_vacancies['resumes']

    als = ALSModel(llm_dataset, resume_dataset, llm_control, resume_control)
    als.fit(train_vacancies)
    users_list = als.predict(k=20)

    # Sample resumes

    uuid = [resume['uuid'] for resume in resumes]
    np.random.shuffle(uuid)
    uuid = uuid[:30]

    # Init catboost

    model = CatBoostVacancyRanker()
    model.fit(train_vacancies)

    # Here we get our predictions from extractor

    users_list = np.union1d(uuid, users_list)

    predictions = model.predict(test_vacancies, users_list)[:10]

    for resume in test_vacancies['resumes']:
        if resume['uuid'] in predictions:
            print(resume)
    
    return predictions

general_pipeline()

  0%|          | 0/100 [00:00<?, ?it/s]

8b9c8d16-c7f0-38a2-b80c-d94030c15a6f
get_grade data extractor fullness: 14/29
get_work_type data extractor fullness: 19/29
get_experience data extractor fullness: 12/29
get_experience data extractor fullness: 29/29


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train.drop(columns=['relevance', 'group', 'weights'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test.drop(columns=['relevance', 'group', 'weights'], inplace=True)


{'uuid': '6cf3cee0-c911-30f6-b254-416029f8af3b', 'first_name': 'Лада', 'last_name': 'Фёдорова', 'birth_date': '1992-01-01', 'country': 'Россия', 'city': 'Москва', 'about': None, 'key_skills': 'Git, Высоконагруженные системы, PostgreSQL, Docker, Java, Hibernate, Базы данных, Проектирование архитектуры приложений, Oracle, Java Spring Framework', 'experienceItem': [{'starts': '2021-06-01', 'ends': None, 'employer': 'ОАО Реч', 'city': 'Серпухов', 'position': 'Senior java developer', 'description': ' Проект - платформа вычисления состояния/эффективности проектов/команд/сотрудников. Стэк: java 8-17, kotlin, Spring(data, boot, cloud, batch, websocket, MVC, webflux), kafka, NoSQL(Redis, MongoDB, ElasticSearch), Postresql, Prometheus, Grafana, Docker, K8s. Нотации: C4, UML Обязанности: - Создание сервисов с нуля(микросервисная архитектура) - Доработка существующих сервисов - Оптимизация производительности и выявление узких мест производительности - Код-ревью - Bug-fixing - Участие в проектирова

array(['6065ee07-4c42-3cf4-8e45-9d053b9041f7',
       '0b1a7596-d9dc-37de-a154-90f3bfdc8ce6',
       'e59d1c07-489b-3299-803f-5dea7da43b56',
       'e3a42770-927c-3d59-a761-bcc3549d1054',
       'fd0ccbd0-3a58-3818-8691-98f31de17527',
       '989b9e34-1ee0-3fa0-8ac8-b4617d7444b7',
       '6cf3cee0-c911-30f6-b254-416029f8af3b',
       '19c83c9c-d35b-365d-959b-7efe82a306ea',
       '87093480-2ef4-3d14-a30e-0c4cf41b93da',
       '64d2b2a9-98df-37a6-a326-a223db297718'], dtype='<U36')