In [None]:
%pip install pandas matplotlib seaborn wordcloud missingno scikit-learn xgboost catboost fasttext-wheel nltk optuna -q

# EDA

In [None]:
import pandas as pd

# Load the raw dataset for exploratory analysis.
df = pd.read_csv("../dataset.csv")

# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() only appended a stray "None" line to the output.
df.info()

# Missing values per column.
print(df.isnull().sum())

# Summary statistics of the numeric columns.
print(df.describe())

# How often each rating value occurs.
df["Оценка (для проекта)"].value_counts().plot(
    kind="bar", title="Distribution of Ratings"
)

In [None]:
# Whitespace-delimited token counts for the resume and the vacancy text.
# str() is applied first, so non-string cells are counted via their string form.
df["resume_word_count"] = df["Текст резюме Rollup (from Кандидат)"].map(
    lambda text: len(str(text).split())
)
df["job_desc_word_count"] = df["Текст вакансии от компании (from Вакансия)"].map(
    lambda text: len(str(text).split())
)

# Side-by-side histograms of the two length distributions.
df[["resume_word_count", "job_desc_word_count"]].hist(bins=20, figsize=(10, 5))

In [None]:
# Hand-written groups of values of the "Статус" column.
# NOTE(review): presumably negative vs. positive pipeline outcomes; neither
# list is referenced again in the visible notebook — confirm they are still needed.
bad_status = [
    "F. Отказали мы",
    "I.4a. Оценили резюме (не рекомендуем)",
    "F. Отказал клиент",
    "I.5a. Передумали питчить",
]
good_status = [
    "I.4b. Оценили резюме (рекомендуем)",
    "I.6. Написали",
    "II.11. Запомнить",
    "I.5. Согласуем питч",
    "II.6. Этап 2",
]

In [None]:
# Distinct values present in the status column.
print(df["Статус"].unique())

In [None]:
# Row count per status, sorted ascending so the most frequent status is drawn last.
df["Статус"].value_counts().sort_values().plot(kind="barh", title="Status Distribution")

In [None]:
# Mean rating per status, sorted ascending for the horizontal bar chart.
# NOTE(review): the rating column is later passed through convert_score, which
# handles string values such as "a/b"; if the raw column holds such strings,
# .mean() here will not work on it — confirm the column dtype.
status_ratings = df.groupby("Статус")["Оценка (для проекта)"].mean().sort_values()
status_ratings.plot(kind="barh", title="Average Rating by Status")

# Init

In [24]:
import pandas as pd
import re
from pymorphy2 import MorphAnalyzer
from nltk.corpus import stopwords
import numpy as np

# Reload the dataset and keep only rows that have both texts and a rating.
df = pd.read_csv("../dataset.csv").dropna(
    subset=[
        "Текст вакансии от компании (from Вакансия)",
        "Текст резюме Rollup (from Кандидат)",
        "Оценка (для проекта)",
    ]
)


def convert_score(value):
    """Convert a raw rating cell to a number.

    * ``int`` / ``float`` values are returned unchanged.
    * A string of the form ``"a/b"`` becomes the mean of the two numbers,
      rounded to one decimal place.
    * Any other value is passed to ``float()``.

    Returns ``None`` when the value cannot be converted. This now also covers
    inputs for which ``float()`` raises ``TypeError`` (for example ``None``),
    which previously escaped the ``except ValueError`` handler and aborted
    the whole ``.apply`` call.
    """
    if isinstance(value, (int, float)):
        return value

    try:
        if isinstance(value, str) and "/" in value:
            # More than one "/" makes the unpacking raise ValueError -> None.
            num, denom = value.split("/")
            return round((float(num) + float(denom)) / 2, 1)

        return float(value)

    except (TypeError, ValueError):
        return None


def custom_metric(y_true, y_pred):
    """Return two shares of rows, based on the absolute prediction error.

    Returns:
        (share with error >= 1, share with error <= 0.5), both as fractions
        of ``len(y_true)``.
    """
    abs_error = np.abs(y_true - y_pred)
    n_rows = len(y_true)

    share_far = np.sum(abs_error >= 1) / n_rows
    share_close = np.sum(abs_error <= 0.5) / n_rows

    return share_far, share_close

# Linreg

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer

# One vectorizer per text column, each capped at 500 features.
tfidf_vacancy = TfidfVectorizer(max_features=500)
tfidf_resume = TfidfVectorizer(max_features=500)

# NOTE(review): both vectorizers are fitted on every row of `df`; the
# train/test split only happens in the next cell, so the test rows take part
# in fitting the features.
vacancy_tfidf = tfidf_vacancy.fit_transform(
    df["Текст вакансии от компании (from Вакансия)"]
)
resume_tfidf = tfidf_resume.fit_transform(df["Текст резюме Rollup (from Кандидат)"])

import numpy as np

# Densify both matrices and place them side by side as one feature matrix.
X = np.hstack((vacancy_tfidf.toarray(), resume_tfidf.toarray()))

In [12]:
from sklearn.model_selection import train_test_split

# Numeric target; convert_score yields None for values it cannot parse.
y = df["Оценка (для проекта)"].apply(convert_score)

# 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Baseline: ordinary least squares on the TF-IDF features.
model = LinearRegression()
model.fit(X_train, y_train)

# Predictions for the hold-out rows; `y_pred` is reused by the next cell.
y_pred = model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")

In [None]:
from sklearn.metrics import r2_score

# R² of the `y_pred` produced by the previous cell.
r2 = r2_score(y_test, y_pred)
print(f"R-squared: {r2}")

# Optuna

In [None]:
import optuna
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import time

# From here on X holds the two raw text columns (not the TF-IDF matrix built
# in the linear-regression section).
X = df[
    [
        "Текст вакансии от компании (from Вакансия)",
        "Текст резюме Rollup (from Кандидат)",
    ]
]
y = df["Оценка (для проекта)"].apply(convert_score)

# 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


def custom_metric(y_true, y_pred):
    """Return two shares of rows, based on the absolute prediction error.

    Returns:
        (share with error >= 1, share with error strictly between 0.5 and 1),
        both as fractions of ``len(y_true)``.
    """
    abs_error = np.abs(y_true - y_pred)
    n_rows = len(y_true)

    far_mask = abs_error >= 1
    mid_mask = (abs_error > 0.5) & (abs_error < 1.0)

    return np.sum(far_mask) / n_rows, np.sum(mid_mask) / n_rows


def objective(trial):
    """Optuna objective: train CatBoost on the text columns and score the trial.

    The returned value is (share of errors >= 1 minus 0.05) plus
    (share of errors between 0.5 and 1 minus 0.2), as computed by the
    module-level ``custom_metric``; the study minimises it.

    NOTE(review): the same ``X_test`` / ``y_test`` serve as the early-stopping
    eval set and as the data the trial is scored on.
    """
    text_columns = [
        "Текст вакансии от компании (from Вакансия)",
        "Текст резюме Rollup (from Кандидат)",
    ]

    # A dict literal is evaluated top to bottom, so the suggest_* calls run in
    # the same sequence as before.
    sampled = {
        "iterations": trial.suggest_int("iterations", 500, 1500),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.1),
        "depth": trial.suggest_int("depth", 4, 10),
        "l2_leaf_reg": trial.suggest_int("l2_leaf_reg", 1, 10),
        "bagging_temperature": trial.suggest_float("bagging_temperature", 0, 3),
    }

    regressor = CatBoostRegressor(
        **sampled,
        eval_metric="RMSE",
        text_features=text_columns,
        verbose=0,
        early_stopping_rounds=100,
    )
    regressor.fit(X_train, y_train, eval_set=(X_test, y_test), verbose=False)

    share_ge_1, share_mid = custom_metric(y_test, regressor.predict(X_test))

    # Distance of each share from its target (5% and 20% respectively).
    excess_ge_1 = share_ge_1 - 0.05
    excess_mid = share_mid - 0.2

    return excess_ge_1 + excess_mid


# Run 20 trials, minimising the combined penalty returned by `objective`.
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=20)

# Report the best trial and the parameters it sampled.
best_trial = study.best_trial
print(f"Best trial: {best_trial.number} with value: {best_trial.value}")
print("Best parameters:")
for key, value in best_trial.params.items():
    print(f"  {key}: {value}")

In [15]:
# Hard-coded CatBoost parameters.
# NOTE(review): presumably copied from the best trial of an earlier Optuna
# run; they are not read from `study` — confirm they are still current.
best_params = {
    "iterations": 848,
    "learning_rate": 0.02461841312190255,
    "depth": 5,
    "l2_leaf_reg": 9,
    "bagging_temperature": 2.6931515237347994,
    "eval_metric": "RMSE",
    "loss_function": "RMSE",
    "text_features": [
        "Текст вакансии от компании (from Вакансия)",
        "Текст резюме Rollup (from Кандидат)",
    ],
    "verbose": 100,
}

In [None]:
# Fit a model with `best_params` before scoring it. No cell between the
# parameter definition and this one produced predictions, so on a fresh
# top-to-bottom run `y_pred` still held the linear-regression output.
tuned_model = CatBoostRegressor(**best_params)
tuned_model.fit(X_train, y_train)
y_pred = tuned_model.predict(X_test)

# custom_metric in this section returns (share >= 1, share between 0.5 and 1);
# the second label used to read "0.5 or less".
metric_5, metric_20 = custom_metric(y_test, y_pred)
print(
    f"Custom Metric - deviation of 1 or more: {metric_5}, deviation between 0.5 and 1: {metric_20}"
)

# Simple Catboost with cross validation

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from catboost import CatBoostRegressor, Pool

# Raw text columns as features; the rating converted to a number as target.
X = df[
    [
        "Текст вакансии от компании (from Вакансия)",
        "Текст резюме Rollup (from Кандидат)",
    ]
]
y = df["Оценка (для проекта)"].apply(convert_score)


def custom_cv(X, y, model_params, n_splits=5):
    """Shuffled K-fold evaluation of a CatBoost regressor on the two text columns.

    Args:
        X: DataFrame containing both text columns (rows are selected with ``.iloc``).
        y: Series of targets aligned with ``X`` by position.
        model_params: keyword arguments for ``CatBoostRegressor``.
        n_splits: number of folds.

    Returns:
        Two lists with one entry per fold: the first and the second value
        returned by the module-level ``custom_metric`` for that fold.
    """
    text_columns = [
        "Текст вакансии от компании (from Вакансия)",
        "Текст резюме Rollup (from Кандидат)",
    ]
    splitter = KFold(n_splits=n_splits, shuffle=True, random_state=42)

    shares_ge_1, shares_mid = [], []

    for fit_rows, holdout_rows in splitter.split(X):
        y_holdout = y.iloc[holdout_rows]

        fit_pool = Pool(
            X.iloc[fit_rows],
            y.iloc[fit_rows],
            text_features=text_columns,
        )
        holdout_pool = Pool(
            X.iloc[holdout_rows],
            y_holdout,
            text_features=text_columns,
        )

        # The held-out fold is also passed as eval_set during fitting.
        regressor = CatBoostRegressor(**model_params)
        regressor.fit(fit_pool, eval_set=holdout_pool)

        fold_ge_1, fold_mid = custom_metric(y_holdout, regressor.predict(holdout_pool))
        shares_ge_1.append(fold_ge_1)
        shares_mid.append(fold_mid)

    return shares_ge_1, shares_mid


# Parameters for every fold. text_features is not listed here because
# custom_cv sets it on the Pool objects it builds.
model_params = {
    "iterations": 848,
    "learning_rate": 0.02461841312190255,
    "depth": 5,
    "l2_leaf_reg": 9,
    "bagging_temperature": 2.6931515237347994,
    "early_stopping_rounds": 100,
    "verbose": 0,
}

# Despite the name, this is a pair of per-fold lists:
# index 0 -> share of errors >= 1, index 1 -> share between 0.5 and 1.
average_penalty = custom_cv(X, y, model_params, n_splits=5)

mean_penalty_1_or_more = np.mean(average_penalty[0])
std_penalty_1_or_more = np.std(average_penalty[0])

mean_penalty_0_5_to_1 = np.mean(average_penalty[1])
std_penalty_0_5_to_1 = np.std(average_penalty[1])

print(
    f"Mean deviation >= 1: {mean_penalty_1_or_more:.4f}, Std: {std_penalty_1_or_more:.4f}"
)
print(
    f"Mean deviation between 0.5 and 1: {mean_penalty_0_5_to_1:.4f}, Std: {std_penalty_0_5_to_1:.4f}"
)

In [None]:
import numpy as np
from sklearn.model_selection import KFold
from catboost import CatBoostRegressor, Pool


def custom_metric(y_true, y_pred):
    """Return two percentages of rows, based on the absolute prediction error.

    Returns:
        (percent with error >= 1, percent with error strictly between 0.5
        and 1), each on a 0-100 scale relative to ``len(y_true)``.
    """
    abs_error = np.abs(y_true - y_pred)
    n_rows = len(y_true)

    def as_percent(mask):
        # Count the rows selected by `mask` and express them as a percentage.
        return (np.sum(mask) / n_rows) * 100

    return (
        as_percent(abs_error >= 1),
        as_percent((abs_error > 0.5) & (abs_error < 1.0)),
    )


# CatBoost parameters used for every fold.
best_params = {
    "iterations": 848,
    "learning_rate": 0.02461841312190255,
    "depth": 5,
    "l2_leaf_reg": 9,
    "bagging_temperature": 2.6931515237347994,
    "loss_function": "RMSE",
    "eval_metric": "RMSE",
    "text_features": [
        "Текст вакансии от компании (from Вакансия)",
        "Текст резюме Rollup (from Кандидат)",
    ],
}

kf = KFold(n_splits=5, shuffle=True, random_state=42)

# custom_metric in this cell reports percentages (0-100), so the targets are
# expressed in percent as well (previously 0.05 / 0.2, i.e. fractions).
target_deviation_1_or_more = 5.0
target_deviation_0_5_to_1 = 20.0

custom_penalties = []

# Split the same frame that is indexed below. The loop used to split X_train
# while indexing X, and used X[...] / y[...], which select DataFrame columns
# and Series labels instead of row positions.
for train_index, test_index in kf.split(X):
    X_train_fold, X_test_fold = X.iloc[train_index], X.iloc[test_index]
    y_train_fold, y_test_fold = y.iloc[train_index], y.iloc[test_index]

    train_pool = Pool(
        data=X_train_fold,
        label=y_train_fold,
        text_features=best_params["text_features"],
    )

    model = CatBoostRegressor(**best_params)
    model.fit(train_pool, eval_set=(X_test_fold, y_test_fold), verbose=0)

    y_pred = model.predict(X_test_fold)

    percentage_deviation_1_or_more, percentage_deviation_0_5_to_1 = custom_metric(
        y_test_fold, y_pred
    )

    # Percentage points by which each share misses its target.
    penalty_1_or_more = percentage_deviation_1_or_more - target_deviation_1_or_more
    penalty_0_5_to_1 = percentage_deviation_0_5_to_1 - target_deviation_0_5_to_1

    combined_penalty = penalty_1_or_more + penalty_0_5_to_1
    custom_penalties.append(combined_penalty)

print("Custom penalties for each fold:", custom_penalties)

In [None]:
def custom_metric(y_true, y_pred):
    """Percentage of rows by absolute-error band.

    Returns:
        (percent with error >= 1, percent with error strictly between 0.5
        and 1), each on a 0-100 scale relative to ``len(y_true)``.
    """
    abs_error = np.abs(y_true - y_pred)
    n_rows = len(y_true)

    n_far = np.sum(abs_error >= 1)
    n_mid = np.sum((abs_error > 0.5) & (abs_error < 1.0))

    return (n_far / n_rows) * 100, (n_mid / n_rows) * 100


# Score a model on the hold-out split. Before this fix `y_pred` was whatever
# the last cross-validation fold left behind, which does not correspond to
# the rows of `y_test`.
final_model = CatBoostRegressor(**best_params)
final_model.fit(X_train, y_train, verbose=0)
y_pred = final_model.predict(X_test)

# custom_metric here returns percentages: (error >= 1, error between 0.5 and 1);
# the second label used to read "0.5 or less".
metric_5, metric_20 = custom_metric(y_test, y_pred)
print(
    f"Custom Metric - deviation of 1 or more: {metric_5}, deviation between 0.5 and 1: {metric_20}"
)

# Catboost with custom objective

In [12]:
class CustomDeviationObjective:
    """Squared-error style CatBoost objective that up-weights large residuals.

    Each object gets a factor based on its current absolute residual:
    10 when it is >= 1, 2 when it is >= 0.5, otherwise 1.
    """

    def calc_ders_range(self, approxes, targets, weights):
        """Return one ``(der1, der2)`` pair per object.

        For a residual ``r = target - approx`` and factor ``k`` the pair is
        ``(k * r, -k)``, multiplied by the object weight when weights are
        given. Previously only ``der1`` was multiplied by ``k`` while ``der2``
        stayed at ``-1``, so the two derivatives described different losses.
        """
        assert len(approxes) == len(targets)
        if weights is not None:
            assert len(weights) == len(approxes)

        result = []
        for index in range(len(targets)):
            residual = targets[index] - approxes[index]

            if abs(residual) >= 1:
                factor = 10
            elif abs(residual) >= 0.5:
                factor = 2
            else:
                factor = 1

            # Scale both derivatives by the same factor so they stay consistent.
            der1 = residual * factor
            der2 = -1 * factor

            if weights is not None:
                der1 *= weights[index]
                der2 *= weights[index]

            result.append((der1, der2))
        return result


class CustomDeviationMetric:
    """CatBoost-style custom metric that scores each object by its error band."""

    def get_final_error(self, error, weight):
        """Return the accumulated error unchanged (it is not divided by ``weight``)."""
        return error

    def is_max_optimal(self):
        """Lower values are better."""
        return False

    def evaluate(self, approxes, target, weight):
        """Accumulate a weighted per-object score and the total weight.

        An object scores 10 when its absolute error is >= 1, 2 when it is
        >= 0.5 and 1 otherwise. Objects have weight 1.0 when ``weight`` is None.

        Returns:
            ``(error_sum, weight_sum)``.
        """
        assert len(approxes) == 1
        assert len(target) == len(approxes[0])

        predictions = approxes[0]
        error_sum = 0.0
        weight_sum = 0.0

        for i, prediction in enumerate(predictions):
            w = weight[i] if weight is not None else 1.0
            weight_sum += w

            gap = np.abs(prediction - target[i])
            if gap >= 1:
                score = 10
            elif gap >= 0.5:
                score = 2
            else:
                score = 1
            error_sum += w * score

        return error_sum, weight_sum

In [None]:
import numpy as np
from catboost import CatBoostRegressor, Pool
import fasttext


# Load the fastText model from a local file; `model` is read as a global by
# get_fasttext_vectors below.
model = fasttext.load_model("./cc.ru.300.bin/cc.ru.300.bin")

# X and y are not defined in this cell; they come from an earlier cell.
# NOTE(review): train_test_split is not imported in this cell either.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


def get_fasttext_vectors(texts):
    """Embed every text with the module-level fastText ``model``.

    ``get_word_vector`` treats its argument as one single token, which is not
    meaningful for whole documents, so ``get_sentence_vector`` is used instead.
    That call rejects newline characters, so all whitespace runs are collapsed
    to single spaces first.

    Returns:
        ``np.array`` built from one vector per input text.
    """
    return np.array(
        [model.get_sentence_vector(" ".join(str(text).split())) for text in texts]
    )


# Embed the vacancy and resume texts separately for the train and test rows.
X_train_vacancy_fasttext = get_fasttext_vectors(
    X_train["Текст вакансии от компании (from Вакансия)"].values
)
X_train_resume_fasttext = get_fasttext_vectors(
    X_train["Текст резюме Rollup (from Кандидат)"].values
)

X_test_vacancy_fasttext = get_fasttext_vectors(
    X_test["Текст вакансии от компании (from Вакансия)"].values
)
X_test_resume_fasttext = get_fasttext_vectors(
    X_test["Текст резюме Rollup (from Кандидат)"].values
)

# Vacancy vector followed by resume vector for every row.
X_train_combined = np.hstack((X_train_vacancy_fasttext, X_train_resume_fasttext))
X_test_combined = np.hstack((X_test_vacancy_fasttext, X_test_resume_fasttext))

# NOTE: this rebinds `model` from the fastText model to the regressor, so
# get_fasttext_vectors cannot be called again until fastText is reloaded.
model = CatBoostRegressor(
    iterations=1000,
    learning_rate=0.1,
    depth=6,
    loss_function=CustomDeviationObjective(),
    eval_metric=CustomDeviationMetric(),
    verbose=100,
)

model.fit(X_train_combined, y_train, eval_set=(X_test_combined, y_test))

y_pred = model.predict(X_test_combined)

# mean_squared_error and r2_score are not imported in this cell; they come
# from earlier cells.
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"CatBoost with Russian FastText - Mean Squared Error: {mse}, R-squared: {r2}")


def custom_metric(y_true, y_pred):
    """Return the shares of rows with absolute error >= 1 and >= 0.5.

    ``np.abs`` is used instead of the pandas-only ``.abs()`` method, so plain
    NumPy arrays are accepted as well as Series.

    Returns:
        (share with error >= 1, share with error >= 0.5), as fractions of
        ``len(y_true)``.
    """
    deviation = np.abs(y_true - y_pred)
    n_rows = len(y_true)

    deviation_1 = (deviation >= 1).sum() / n_rows
    deviation_05 = (deviation >= 0.5).sum() / n_rows

    return deviation_1, deviation_05


# The second value returned by custom_metric in this cell is the share of
# errors >= 0.5, so the label says "or more" (it used to read "or less").
metric_5, metric_20 = custom_metric(y_test, y_pred)
print(
    f"Custom Metric - deviation of 1 or more: {metric_5}, deviation of 0.5 or more: {metric_20}"
)

# Catboost with Fasttext

In [None]:
import re
import fasttext
import gzip
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Load the fastText model from a local file; `model` is read as a global by
# get_fasttext_vectors below.
model = fasttext.load_model("./cc.ru.300.bin/cc.ru.300.bin")

# X and y are not defined in this cell; they come from an earlier cell.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


def get_fasttext_vectors(texts):
    """Embed every text with the module-level fastText ``model``.

    ``get_word_vector`` treats its argument as one single token, which is not
    meaningful for whole documents, so ``get_sentence_vector`` is used instead.
    That call rejects newline characters, so all whitespace runs are collapsed
    to single spaces first.

    Returns:
        ``np.array`` built from one vector per input text.
    """
    return np.array(
        [model.get_sentence_vector(" ".join(str(text).split())) for text in texts]
    )


# Embed the vacancy and resume texts separately for the train and test rows.
X_train_vacancy_fasttext = get_fasttext_vectors(
    X_train["Текст вакансии от компании (from Вакансия)"].values
)
X_train_resume_fasttext = get_fasttext_vectors(
    X_train["Текст резюме Rollup (from Кандидат)"].values
)

X_test_vacancy_fasttext = get_fasttext_vectors(
    X_test["Текст вакансии от компании (from Вакансия)"].values
)
X_test_resume_fasttext = get_fasttext_vectors(
    X_test["Текст резюме Rollup (from Кандидат)"].values
)

# Vacancy vector followed by resume vector for every row.
X_train_combined = np.hstack((X_train_vacancy_fasttext, X_train_resume_fasttext))
X_test_combined = np.hstack((X_test_vacancy_fasttext, X_test_resume_fasttext))

# CatBoost with its built-in RMSE metric; the regressor gets its own name so
# the fastText `model` stays bound.
model_catboost = CatBoostRegressor(
    iterations=1000, learning_rate=0.1, depth=6, eval_metric="RMSE", verbose=100
)

# The test split is passed as eval_set during fitting.
model_catboost.fit(X_train_combined, y_train, eval_set=(X_test_combined, y_test))

y_pred = model_catboost.predict(X_test_combined)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"CatBoost with Russian FastText - Mean Squared Error: {mse}, R-squared: {r2}")


def custom_metric(y_true, y_pred):
    """Return the shares of rows with absolute error >= 1 and >= 0.5.

    ``np.abs`` is used instead of the pandas-only ``.abs()`` method, so plain
    NumPy arrays are accepted as well as Series.

    Returns:
        (share with error >= 1, share with error >= 0.5), as fractions of
        ``len(y_true)``.
    """
    deviation = np.abs(y_true - y_pred)
    n_rows = len(y_true)

    deviation_1 = (deviation >= 1).sum() / n_rows
    deviation_05 = (deviation >= 0.5).sum() / n_rows

    return deviation_1, deviation_05


# The second value returned by custom_metric in this cell is the share of
# errors >= 0.5, so the label says "or more" (it used to read "or less").
metric_5, metric_20 = custom_metric(y_test, y_pred)
print(
    f"Custom Metric - deviation of 1 or more: {metric_5}, deviation of 0.5 or more: {metric_20}"
)

# XGBoost

In [None]:
import re
import fasttext
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Load the fastText model from a local file; `model` is read as a global by
# get_fasttext_vectors below.
model = fasttext.load_model("./cc.ru.300.bin/cc.ru.300.bin")

# Raw text columns as features.
X = df[
    [
        "Текст вакансии от компании (from Вакансия)",
        "Текст резюме Rollup (from Кандидат)",
    ]
]
y = df["Оценка (для проекта)"].apply(convert_score)  # convert_score is defined in the "Init" cell

# 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


def get_fasttext_vectors(texts):
    """Embed every text with the module-level fastText ``model``.

    ``get_word_vector`` treats its argument as one single token, which is not
    meaningful for whole documents, so ``get_sentence_vector`` is used instead.
    That call rejects newline characters, so all whitespace runs are collapsed
    to single spaces first.

    Returns:
        ``np.array`` built from one vector per input text.
    """
    return np.array(
        [model.get_sentence_vector(" ".join(str(text).split())) for text in texts]
    )


# Embed the vacancy and resume texts separately for the train and test rows.
X_train_vacancy_fasttext = get_fasttext_vectors(
    X_train["Текст вакансии от компании (from Вакансия)"].values
)
X_train_resume_fasttext = get_fasttext_vectors(
    X_train["Текст резюме Rollup (from Кандидат)"].values
)

X_test_vacancy_fasttext = get_fasttext_vectors(
    X_test["Текст вакансии от компании (from Вакансия)"].values
)
X_test_resume_fasttext = get_fasttext_vectors(
    X_test["Текст резюме Rollup (from Кандидат)"].values
)

# Vacancy vector followed by resume vector for every row.
X_train_combined = np.hstack((X_train_vacancy_fasttext, X_train_resume_fasttext))
X_test_combined = np.hstack((X_test_vacancy_fasttext, X_test_resume_fasttext))

# eval_metric is set, but no eval_set is passed to fit() below.
model_xgboost = XGBRegressor(
    n_estimators=1000, learning_rate=0.1, max_depth=6, eval_metric="rmse", verbosity=1
)

model_xgboost.fit(X_train_combined, y_train)

y_pred = model_xgboost.predict(X_test_combined)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"XGBoost with Russian FastText - Mean Squared Error: {mse}, R-squared: {r2}")


def custom_metric(y_true, y_pred):
    """Return the shares of rows with absolute error >= 1 and >= 0.5.

    ``np.abs`` is used instead of the pandas-only ``.abs()`` method, so plain
    NumPy arrays are accepted as well as Series.

    Returns:
        (share with error >= 1, share with error >= 0.5), as fractions of
        ``len(y_true)``.
    """
    deviation = np.abs(y_true - y_pred)
    n_rows = len(y_true)

    deviation_1 = (deviation >= 1).sum() / n_rows
    deviation_05 = (deviation >= 0.5).sum() / n_rows

    return deviation_1, deviation_05


# The second value returned by custom_metric in this cell is the share of
# errors >= 0.5, so the label says "or more" (it used to read "or less").
metric_5, metric_20 = custom_metric(y_test, y_pred)
print(
    f"Custom Metric - deviation of 1 or more: {metric_5}, deviation of 0.5 or more: {metric_20}"
)

# Stacked

In [None]:
import pandas as pd
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import (
    RandomForestRegressor,
    StackingRegressor,
    GradientBoostingRegressor,
)
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

# `fasttext` is not imported in this cell; it relies on the import made by an
# earlier cell.
model = fasttext.load_model("./cc.ru.300.bin/cc.ru.300.bin")

# Take an explicit copy: the columns are overwritten below, and assigning into
# a selection of `df` triggers pandas' SettingWithCopyWarning and may not
# behave as intended.
X = df[
    [
        "Текст вакансии от компании (from Вакансия)",
        "Текст резюме Rollup (from Кандидат)",
    ]
].copy()
y = df["Оценка (для проекта)"].apply(convert_score)  # convert_score is defined in the "Init" cell

# NOTE(review): preprocess_text is not defined anywhere in the visible
# notebook; it has to be defined before this cell can run.
X["Текст вакансии от компании (from Вакансия)"] = X[
    "Текст вакансии от компании (from Вакансия)"
].apply(preprocess_text)
X["Текст резюме Rollup (from Кандидат)"] = X[
    "Текст резюме Rollup (from Кандидат)"
].apply(preprocess_text)

# 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


def get_fasttext_vectors(texts):
    """Embed every text with the module-level fastText ``model``.

    ``get_word_vector`` treats its argument as one single token, which is not
    meaningful for whole documents, so ``get_sentence_vector`` is used instead.
    That call rejects newline characters, so all whitespace runs are collapsed
    to single spaces first.

    Returns:
        ``np.array`` built from one vector per input text.
    """
    return np.array(
        [model.get_sentence_vector(" ".join(str(text).split())) for text in texts]
    )


# Embed the vacancy and resume texts separately for the train and test rows.
X_train_vacancy_fasttext = get_fasttext_vectors(
    X_train["Текст вакансии от компании (from Вакансия)"].values
)
X_train_resume_fasttext = get_fasttext_vectors(
    X_train["Текст резюме Rollup (from Кандидат)"].values
)

X_test_vacancy_fasttext = get_fasttext_vectors(
    X_test["Текст вакансии от компании (from Вакансия)"].values
)
X_test_resume_fasttext = get_fasttext_vectors(
    X_test["Текст резюме Rollup (from Кандидат)"].values
)

# Vacancy vector followed by resume vector for every row.
X_train_combined = np.hstack((X_train_vacancy_fasttext, X_train_resume_fasttext))
X_test_combined = np.hstack((X_test_vacancy_fasttext, X_test_resume_fasttext))

# First-level regressors whose predictions feed the final estimator.
base_models = [
    ("rf", RandomForestRegressor(n_estimators=100, random_state=42)),
    (
        "xgb",
        XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=6, random_state=42),
    ),
    (
        "catboost",
        CatBoostRegressor(iterations=1000, learning_rate=0.1, depth=6, verbose=0),
    ),
]

# Gradient boosting as the final estimator on top of the base models.
stacked_model = StackingRegressor(
    estimators=base_models,
    final_estimator=GradientBoostingRegressor(
        n_estimators=100, learning_rate=0.1, max_depth=6, random_state=42
    ),
)

stacked_model.fit(X_train_combined, y_train)

y_pred_stacked = stacked_model.predict(X_test_combined)

mse_final = mean_squared_error(y_test, y_pred_stacked)
r2_final = r2_score(y_test, y_pred_stacked)
print(f"Stacked Model - Mean Squared Error: {mse_final}, R-squared: {r2_final}")