In [None]:
!pip install stanza
!pip install transformers
!pip install catboost
!pip install faker

In [None]:
import os
import re

import numpy as np
import pandas as pd
from scipy import stats

import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge, LinearRegression
from sklearn.svm import SVR

import lightgbm as lgb
from catboost import CatBoostRegressor, Pool
from xgboost import XGBRegressor

import stanza
from transformers import AutoModel, AutoTokenizer, get_linear_schedule_with_warmup

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

from google.colab import drive
from tqdm.notebook import tqdm

from faker import Faker

# Pick the compute device once; the embedding model is moved onto it later.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# Directory for the cached embedding archives. Later cells read `save_path`
# when writing and loading the .npz files, so it has to exist before they run.
# Override the location with the EMBEDDINGS_DIR environment variable.
save_path = os.environ.get("EMBEDDINGS_DIR", "embeddings")
os.makedirs(save_path, exist_ok=True)

# Russian tokenizer + lemmatizer used by the text normalisation step.
nlp = stanza.Pipeline(lang="ru", processors="tokenize,lemma", use_gpu=True)

# Labor cost prediction

In [None]:
# Synthetic data generator (stands in for the NDA-protected dataset)
def generate_synthetic_data(num_rows=5000, seed=None):
    """Generate a DataFrame with the project's structure but fake content.

    Args:
        num_rows: number of task rows to generate.
        seed: optional integer. When given, both the Faker instance and
            NumPy's global RNG are seeded so the output is reproducible.
            Note that this reseeds ``np.random`` for the whole process.

    Returns:
        DataFrame with columns ``text``, ``is_subtask``, ``importance``,
        ``company_id``, ``project_name``, ``department_name``,
        ``labor_costs`` and ``labor_costs_log`` (``log1p`` of the target).
    """
    fake = Faker('ru_RU')
    if seed is not None:
        fake.seed_instance(seed)
        np.random.seed(seed)
    data = []

    # Small pools of fake IDs and names so that values repeat across rows
    company_ids = [fake.uuid4() for _ in range(50)]
    project_names = ['Проект ' + fake.bs().split(' ')[0].replace(',', '') for _ in range(30)]
    department_names = ['Отдел ' + fake.job().split(' ')[0].replace(',', '') for _ in range(10)]

    # 100 real parents + 900 missing ones. The pool MUST be an object array:
    # np.random.choice on a plain list mixing str and NaN coerces everything to
    # strings, turning NaN into the text 'nan' and making every row a "subtask".
    parent_uuids = np.array(
        [fake.uuid4() for _ in range(100)] + [np.nan] * 900,
        dtype=object,
    )

    print(f"Генерация {num_rows} строк синтетических данных (для защиты NDA)...")

    for _ in range(num_rows):
        data.append({
            'uuid': fake.uuid4(),
            'subject': fake.sentence(nb_words=np.random.randint(3, 8)),
            'details': fake.text(max_nb_chars=150) if np.random.rand() > 0.4 else np.nan,
            'parent_uuid': np.random.choice(parent_uuids),
            'importance': np.random.choice([1, 2, 0], p=[0.9, 0.08, 0.02]),
            'company_id': np.random.choice(company_ids),
            'project_name': np.random.choice(project_names),
            'department_name': np.random.choice(department_names),
            'labor_costs': np.random.choice([0.0, 10.2, 30.0, 60.0, 100.2, 180.0]) + np.random.randint(-5, 5)
        })

    df = pd.DataFrame(data)
    # The random offset can push the cost below zero; clamp it
    df['labor_costs'] = df['labor_costs'].clip(lower=0)

    # Fill missing categorical values with an explicit placeholder
    df['company_id'] = df['company_id'].fillna('__unknown__')
    df['project_name'] = df['project_name'].fillna('__unknown__')
    df['department_name'] = df['department_name'].fillna('__unknown__')

    # Flag rows that have a parent task
    df['is_subtask'] = df['parent_uuid'].notnull().astype(int)

    # Concatenate subject and details into a single text field
    df['text'] = (df['subject'].fillna('') + ' ' + df['details'].fillna('')).str.strip()

    # Drop the fields that are no longer needed
    df = df.drop(columns=['uuid', 'parent_uuid', 'subject', 'details'])

    # Log-transform the target
    df['labor_costs_log'] = np.log1p(df['labor_costs'])

    # Put text and the subtask flag first
    cols = ['text', 'is_subtask'] + [c for c in df.columns if c not in ['text', 'is_subtask']]
    df = df[cols]

    return df

# Build the working dataset: 10k rows instead of 580k so the notebook
# runs quickly for a reviewer.
df_synthetic = generate_synthetic_data(num_rows=10000)

print("Синтетические данные готовы и разделены.")

In [None]:
# Helper that reports on and removes target outliers

def analyze_and_trim_outliers(df, target_col='labor_costs', k=1.5, bins_before=100, bins_after=50, log_scale=True):
    """Trim rows whose target lies outside the Tukey fences and plot the effect.

    The fences are ``Q1 - k*IQR`` and ``Q3 + k*IQR`` computed on ``df[target_col]``.
    Prints a short report, draws before/after histograms and returns
    ``(trimmed_copy, lower_threshold, upper_threshold)``.
    """
    n_rows = len(df)
    print("Изначальное количество данных:", n_rows)

    target = df[target_col]
    q_low = target.quantile(0.25)
    q_high = target.quantile(0.75)
    spread = q_high - q_low

    lower_threshold = q_low - k * spread
    upper_threshold = q_high + k * spread

    print(f"IQR: {spread:.2f}, Нижняя граница: {lower_threshold:.2f}, Верхняя граница: {upper_threshold:.2f}")

    too_low = target < lower_threshold
    too_high = target > upper_threshold
    total_outliers = too_low.sum() + too_high.sum()
    print(f"Будет удалено всего: {total_outliers:,} записей "
          f"({(total_outliers/n_rows)*100:.3f}%), останется {n_rows-total_outliers}")

    # Keep rows inside the fences, both bounds inclusive
    keep_mask = (target >= lower_threshold) & (target <= upper_threshold)
    df_trimmed = df[keep_mask].copy()
    print("Статистика после обрезки:")
    print(df_trimmed[target_col].describe())

    # Side-by-side histograms: full data on the left, trimmed on the right
    fig, axes = plt.subplots(1, 2, figsize=(15,5))
    ax_before = axes[0]
    ax_after = axes[1]

    ax_before.hist(target, bins=bins_before, alpha=0.7, edgecolor='black')
    for bound, caption in ((lower_threshold, 'IQR Lower'), (upper_threshold, 'IQR Upper')):
        ax_before.axvline(bound, color='red', linestyle='--', label=caption)
    ax_before.set_title('Распределение ДО обрезки')
    ax_before.set_xlabel(target_col)
    ax_before.set_ylabel('Частота')
    if log_scale:
        ax_before.set_yscale('log')
    ax_before.legend()

    ax_after.hist(df_trimmed[target_col], bins=bins_after, alpha=0.7, color='green', edgecolor='black')
    ax_after.set_title('Распределение ПОСЛЕ обрезки')
    ax_after.set_xlabel(target_col)
    ax_after.set_ylabel('Частота')
    if log_scale:
        ax_after.set_yscale('log')

    plt.tight_layout()
    plt.show()

    return df_trimmed, lower_threshold, upper_threshold


In [None]:
# 10% hold-out for test, then 10% of the remainder for validation
train_full, test_df = train_test_split(df_synthetic, test_size=0.1, random_state=42)
train_df, val_df = train_test_split(train_full, test_size=0.1, random_state=42)

# Outlier fences are estimated on train only and then reused for val/test
train_df_trimmed, lower_threshold, upper_threshold = analyze_and_trim_outliers(train_df)
val_df_trimmed = val_df.loc[val_df['labor_costs'].between(lower_threshold, upper_threshold)].copy()
test_df_trimmed = test_df.loc[test_df['labor_costs'].between(lower_threshold, upper_threshold)].copy()

In [None]:
# Number of tasks per company and per project in the trimmed training split
company_task_count = train_df_trimmed["company_id"].value_counts()
project_task_count = train_df_trimmed["project_name"].value_counts()

print("Топ-10 компаний по количеству тасков:", company_task_count.head(10), sep="\n")
print()
print("Топ-10 проектов по количеству тасков:", project_task_count.head(10), sep="\n")

# Cell output: summary statistics of both count distributions
tuple(counts.describe() for counts in (company_task_count, project_task_count))

In [None]:
def create_aggregated_features(train_df, val_df, test_df, target_col='labor_costs', company_thresh=15, project_thresh=30):
    """Add group-aggregate, flag, text and size features to the three splits.

    Per-company, per-department and per-project statistics of ``target_col``
    are computed on ``train_df`` only and left-merged onto copies of all three
    frames. Aggregate values missing in val/test (groups unseen in train) are
    filled with 1 for the ``*_task_count`` columns and with the global train
    mean of the target for every other aggregate column.

    Args:
        train_df, val_df, test_df: input frames; they are copied, not mutated.
        target_col: name of the target column used for the aggregates.
        company_thresh: companies with fewer train rows than this get
            ``is_rare_company = True``.
        project_thresh: same threshold for ``is_rare_project``.

    Returns:
        ``(train_features, val_features, test_features)``.

    NOTE(review): the results come out of ``DataFrame.merge`` on columns, which
    presumably does not preserve the input index - confirm before joining
    anything back by index.

    NOTE(review): the ``cost_vs_*``, ``cost_zscore_*`` and
    ``cost_position_in_range`` columns are computed directly from
    ``target_col`` for every frame that still contains it. They are fine for
    EDA but must not be used as model inputs.

    NOTE(review): the text-based features depend on ``text_norm`` /
    ``text_length`` columns being present on the inputs; when they are absent
    those features are silently skipped.
    """

    train_features = train_df.copy()
    val_features = val_df.copy()
    test_features = test_df.copy()

    # All splits to process; position 0 is train, 1 is val, 2 is test
    datasets = [train_features, val_features, test_features]
    dataset_names = ['train', 'val', 'test']


    # -- company agg --
    company_agg = train_df.groupby('company_id')[target_col].agg(
        ['mean', 'median', 'std', 'count', 'min', 'max']
    ).reset_index()

    company_agg.columns = [
        'company_id', 'company_avg_cost', 'company_median_cost',
        'company_std_cost', 'company_task_count',
        'company_min_cost', 'company_max_cost'
    ]
    # std is NaN for single-row groups; treat it as zero spread
    company_agg['company_std_cost'] = company_agg['company_std_cost'].fillna(0)

    # Attach the aggregates to every split (list entries are replaced by the merged frames)
    for i, df in enumerate(datasets):
        datasets[i] = df.merge(company_agg, on='company_id', how='left')

    # Fill the gaps for every split except train
    global_mean = train_df[target_col].mean()
    company_cols = ['company_avg_cost', 'company_median_cost', 'company_std_cost',
                   'company_task_count', 'company_min_cost', 'company_max_cost']

    for i, df in enumerate(datasets):
        if i == 0:  # train: skip
            continue
        for col in company_cols:
            if col == 'company_task_count':
                df[col] = df[col].fillna(1)
            else:
                df[col] = df[col].fillna(global_mean)

    # Rare companies: fewer than company_thresh rows in train (unseen companies count as 0)
    company_task_count = train_df['company_id'].value_counts()
    for i, df in enumerate(datasets):
        df['is_rare_company'] = df['company_id'].map(
            lambda x: company_task_count.get(x, 0) < company_thresh
        )

    # -- department agg --
    dept_agg = train_df.groupby('department_name')[target_col].agg(
        ['mean', 'median', 'count', 'std']
    ).reset_index()

    dept_agg.columns = ['department_name', 'dept_avg_cost', 'dept_median_cost',
                       'dept_task_count', 'dept_std_cost']
    dept_agg['dept_std_cost'] = dept_agg['dept_std_cost'].fillna(0)

    # Attach the aggregates to every split
    for i, df in enumerate(datasets):
        datasets[i] = df.merge(dept_agg, on='department_name', how='left')

    # Fill the gaps in the department aggregates
    dept_cols = ['dept_avg_cost', 'dept_median_cost', 'dept_task_count', 'dept_std_cost']
    for i, df in enumerate(datasets):
        if i == 0:  # train: skip
            continue
        for col in dept_cols:
            if col == 'dept_task_count':
                df[col] = df[col].fillna(1)
            else:
                df[col] = df[col].fillna(global_mean)

    # -- project agg --
    project_agg = train_df.groupby('project_name')[target_col].agg(
        ['mean', 'median', 'count', 'std']
    ).reset_index()

    project_agg.columns = ['project_name', 'project_avg_cost', 'project_median_cost',
                          'project_task_count', 'project_std_cost']
    project_agg['project_std_cost'] = project_agg['project_std_cost'].fillna(0)

    # Attach the aggregates to every split
    for i, df in enumerate(datasets):
        datasets[i] = df.merge(project_agg, on='project_name', how='left')

    # Fill the gaps in the project aggregates
    project_cols = ['project_avg_cost', 'project_median_cost', 'project_task_count', 'project_std_cost']
    for i, df in enumerate(datasets):
        if i == 0:  # train: skip
            continue
        for col in project_cols:
            if col == 'project_task_count':
                df[col] = df[col].fillna(1)
            else:
                df[col] = df[col].fillna(global_mean)

    # Rare projects: fewer than project_thresh rows in train
    project_task_count = train_df['project_name'].value_counts()
    for i, df in enumerate(datasets):
        df['is_rare_project'] = df['project_name'].map(
            lambda x: project_task_count.get(x, 0) < project_thresh
        )

    # -- interaction features --
    for i, df in enumerate(datasets):
        if 'importance' in df.columns:
            df['company_avg_x_importance'] = df['company_avg_cost'] * df['importance']
            df['dept_avg_x_importance'] = df['dept_avg_cost'] * df['importance']

        # Cross-level keys
        df['company_department'] = df['company_id'].astype(str) + "_" + df['department_name'].astype(str)
        df['project_department'] = df['project_name'].astype(str) + "_" + df['department_name'].astype(str)
        df['company_project'] = df['company_id'].astype(str) + "_" + df['project_name'].astype(str)

    def add_features_to_df(df):
        """Add text-derived features in place; each group is skipped if its source column is absent."""
        # Features from the normalised text
        if 'text_norm' in df.columns:
            df['num_words'] = df['text_norm'].str.split().str.len().fillna(0)
            df['num_unique_words'] = df['text_norm'].apply(
                lambda x: len(set(str(x).split())) if pd.notna(x) else 0
            )
            df['avg_word_length'] = df['text_norm'].apply(
                lambda x: np.mean([len(word) for word in str(x).split()]) if pd.notna(x) and str(x).split() else 0
            )

        if 'text' in df.columns:
            df['num_numbers'] = df['text'].str.count(r'\d+').fillna(0)
            df['has_urgent_words'] = df['text'].str.contains(
                r'срочно|urgent|asap|немедленно|быстро|скорее', case=False, na=False
            ).astype(int)
            df['has_complex_words'] = df['text'].str.contains(
                r'анализ|исследование|разработка|integration|analysis|архитектура|оптимизация',
                case=False, na=False
            ).astype(int)

        # Information density: unique words per character (+1 avoids division by zero)
        if 'text_length' in df.columns and 'num_unique_words' in df.columns:
            df['info_density'] = df['num_unique_words'] / (df['text_length'] + 1)

        return df

    def add_ratio_features(df):
        """Add ratio / z-score features in place."""
        # Ratios of the row's target to the group averages.
        # NOTE(review): the original comment said "train only", but the guard
        # only checks that target_col exists, so this runs for any frame that
        # carries the target. Every column in this branch except
        # company_price_range is a function of the target itself.
        if target_col in df.columns:
            df['cost_vs_company_avg'] = df[target_col] / (df['company_avg_cost'] + 1)
            df['cost_vs_dept_avg'] = df[target_col] / (df['dept_avg_cost'] + 1)
            df['cost_vs_project_avg'] = df[target_col] / (df['project_avg_cost'] + 1)

            # Z-scores (+1 in the denominator avoids division by zero)
            df['cost_zscore_company'] = (df[target_col] - df['company_avg_cost']) / (df['company_std_cost'] + 1)
            df['cost_zscore_dept'] = (df[target_col] - df['dept_avg_cost']) / (df['dept_std_cost'] + 1)

            # Position of the row's target inside the company's min-max range
            df['company_price_range'] = df['company_max_cost'] - df['company_min_cost']
            df['cost_position_in_range'] = (df[target_col] - df['company_min_cost']) / (df['company_price_range'] + 1)

        # Coefficients of variation (built from aggregates only, so computed for every frame)
        df['company_cv'] = df['company_std_cost'] / (df['company_avg_cost'] + 1)
        df['dept_cv'] = df['dept_std_cost'] / (df['dept_avg_cost'] + 1)

        return df

    def add_complexity_features(df):
        """Add importance/text-length interaction features in place when their inputs exist."""
        if 'importance' in df.columns and 'text_length' in df.columns:
            df['complexity_score'] = df['text_length'] * (df['importance'] + 0.1)
            df['text_importance_ratio'] = df['text_length'] / (df['importance'] + 0.1)

            df['importance_category'] = pd.cut(
                df['importance'],
                bins=[-0.1, 0, 2, 5, float('inf')],
                labels=['zero', 'low', 'medium', 'high']
            )

        if 'num_words' in df.columns and 'num_unique_words' in df.columns:
            df['text_complexity'] = df['num_unique_words'] * df['avg_word_length']

        return df

    def add_size_categories(df):
        """Bucket the company / department / project task counts into ordered size labels."""
        df['company_size_category'] = pd.cut(
            df['company_task_count'],
            bins=[0, 10, 50, 200, float('inf')],
            labels=['small', 'medium', 'large', 'enterprise']
        )

        df['dept_size_category'] = pd.cut(
            df['dept_task_count'],
            bins=[0, 5, 20, 100, float('inf')],
            labels=['tiny', 'small', 'medium', 'large']
        )

        df['project_size_category'] = pd.cut(
            df['project_task_count'],
            bins=[0, 3, 15, 50, float('inf')],
            labels=['tiny', 'small', 'medium', 'large']
        )

        return df

    # Apply all helper functions; each mutates df in place and returns the same object
    for i, df in enumerate(datasets):
        datasets[i] = add_features_to_df(df)
        datasets[i] = add_ratio_features(df)
        datasets[i] = add_complexity_features(df)
        datasets[i] = add_size_categories(df)

        # Additional combined features
        if 'importance' in df.columns:
            df['company_size_x_importance'] = df['company_task_count'] * (df['importance'] + 0.1)
            df['dept_size_x_importance'] = df['dept_task_count'] * (df['importance'] + 0.1)

    # Final pass: any remaining NaN in numeric features becomes 0 (the target is left untouched)
    for i, df in enumerate(datasets):
        numeric_cols = df.select_dtypes(include=[np.number]).columns
        for col in numeric_cols:
            if col != target_col:
                df[col] = df[col].fillna(0)

    # Re-bind the names to the merged frames held in the list
    train_features = datasets[0]
    val_features = datasets[1]
    test_features = datasets[2]


    return train_features, val_features, test_features

In [None]:
# Build the engineered feature frames from the trimmed splits
train, val, test = create_aggregated_features(
    train_df=train_df_trimmed,
    val_df=val_df_trimmed,
    test_df=test_df_trimmed,
)

In [None]:
# Correlation of the numeric features with the target
def analyze_num_correlations(df, target_col='labor_costs_log', top_n=15):
    """Rank numeric features by absolute Pearson correlation with the target.

    Args:
        df: frame containing the numeric features and ``target_col``.
        target_col: numeric target column.
        top_n: how many features to print and plot.

    Returns:
        Series of absolute correlations indexed by feature name, sorted in
        descending order (features with undefined correlation come last).
        The target itself is never part of the result.
    """
    # Numeric feature columns only, without the target
    feature_cols = [
        name for name in df.select_dtypes(include=[np.number]).columns
        if name != target_col
    ]

    # Remove the target's self-correlation by label. Dropping "the first row
    # after sorting" is wrong whenever another feature ties with it at 1.0.
    correlations = (
        df[feature_cols + [target_col]]
        .corr()[target_col]
        .drop(labels=target_col)
        .abs()
        .sort_values(ascending=False)
    )

    print(f"ТОП-{top_n} признаков по корреляции с {target_col}:")
    print("="*50)
    for i, (feature, corr) in enumerate(correlations.head(top_n).items(), 1):
        print(f"{i:2d}. {feature:<30}: {corr:.4f}")

    # Horizontal bar chart of the strongest correlations
    top_corr = correlations.head(top_n)
    positions = range(len(top_corr))
    plt.figure(figsize=(12, 8))
    plt.barh(positions, top_corr.values)
    plt.yticks(positions, top_corr.index)
    plt.xlabel('Абсолютная корреляция')
    plt.title(f'ТОП-{top_n} признаков по корреляции с {target_col}')
    plt.gca().invert_yaxis()

    # Write each value next to its bar
    for i, value in enumerate(top_corr.values):
        plt.text(value + 0.01, i, f'{value:.3f}', va='center')

    plt.tight_layout()
    plt.show()

    return correlations

In [None]:
# Numeric feature correlations on the training split (default log target)
correlations = analyze_num_correlations(df=train)

In [None]:
# Association between the categorical features and the target
def analyze_cat_correlations(df, target_col='labor_costs'):
    """Run a one-way ANOVA of the target against each known categorical column.

    The columns examined are ``department_name``, ``project_name``,
    ``importance`` and ``company_id``; columns missing from ``df`` are skipped.

    Returns:
        dict mapping column name to a dict with ``f_statistic``, ``p_value``,
        ``eta_squared`` and ``significant`` (p < 0.05). For a column with
        fewer than two categories the F statistic and p-value are NaN and
        ``significant`` is False, because ANOVA is undefined there.
    """
    categorical_cols = ['department_name', 'project_name', 'importance', 'company_id']
    results = {}

    # Loop-invariant pieces of eta-squared
    target = df[target_col]
    grand_mean = target.mean()
    ss_total = ((target - grand_mean) ** 2).sum()

    for col in categorical_cols:
        if col not in df.columns:
            continue

        print(f"\n--- {col.upper()} ---")

        # Per-category summary statistics
        cat_stats = df.groupby(col)[target_col].agg([
            'count', 'mean', 'median', 'std'
        ]).round(2)
        cat_stats.columns = ['Count', 'Mean', 'Median', 'Std']
        cat_stats = cat_stats.sort_values('Mean', ascending=False)

        print(cat_stats)

        groups = [values.values for _, values in df.groupby(col)[target_col]]

        # ANOVA needs at least two groups; f_oneway fails otherwise
        if len(groups) >= 2:
            f_stat, p_value = stats.f_oneway(*groups)
        else:
            f_stat, p_value = np.nan, np.nan

        # Eta-squared (effect size): between-group share of the total variance
        ss_between = sum(len(group) * (np.mean(group) - grand_mean) ** 2 for group in groups)
        eta_squared = ss_between / ss_total if ss_total > 0 else np.nan

        significant = bool(p_value < 0.05)  # NaN compares False

        results[col] = {
            'f_statistic': f_stat,
            'p_value': p_value,
            'eta_squared': eta_squared,
            'significant': significant
        }

        print(f"F-statistic: {f_stat:.2f}")
        print(f"P-value: {p_value:.2e}")
        print(f"Eta-squared: {eta_squared:.4f}")
        print(f"Статистически значим: {'Да' if significant else 'Нет'}")

    return results

In [None]:
# ANOVA of the raw target against the categorical columns of the training split
categorical_results = analyze_cat_correlations(df=train)

## Lemmatization

In [None]:
def nlp_normalize(text):
    """Clean and lemmatise one text with the global Stanza pipeline ``nlp``.

    Steps: strip leading reply/forward prefixes, lower-case, replace every
    character that is not a Latin/Cyrillic letter, digit or whitespace with a
    space, lemmatise, collapse whitespace.

    Returns an empty string for non-string input and for text that contains
    nothing but stripped characters.
    """
    if not isinstance(text, str):
        return ""

    # 1. Reply/forward prefixes (RE:, FW:, FWD:), including stacked ones
    #    such as "RE: FW: ..."
    text = re.sub(r"^\s*(?:(?:re|fwd|fw)\s*[:\-]\s*)+", "", text, flags=re.IGNORECASE)

    # 2. Lower-case
    text = text.lower()

    # 3. Keep only letters, digits and whitespace
    text = re.sub(r"[^a-zа-яё0-9\s]", " ", text)

    # Nothing left to lemmatise: skip the (expensive) pipeline call
    if not text.strip():
        return ""

    # 4. Lemmatisation; fall back to the surface form if a word has no lemma
    doc = nlp(text)
    lemmas = [word.lemma or word.text for sent in doc.sentences for word in sent.words]
    text = " ".join(lemmas)

    # 5. Collapse runs of whitespace
    text = re.sub(r"\s+", " ", text).strip()

    return text

In [None]:
# Normalise the raw text of every split (one pipeline call per row, so this is slow)
train['text_norm'] = train['text'].map(nlp_normalize)
val['text_norm'] = val['text'].map(nlp_normalize)
test['text_norm'] = test['text'].map(nlp_normalize)

In [None]:
# Character length of the normalised text, per split
train['text_length'] = train['text_norm'].map(len)
val['text_length'] = val['text_norm'].map(len)
test['text_length'] = test['text_norm'].map(len)

# Summary statistics on the training split
train_lengths = train['text_length']
mean_len = train_lengths.mean()
max_len = train_lengths.max()
median_len = train_lengths.median()

print(f"Средняя длина текста: {mean_len:.2f}")
print(f"Максимальная длина текста: {max_len}")
print(f"Медианная длина текста: {median_len}")

# Histogram of text lengths with reference lines for mean, median and maximum
plt.figure(figsize=(10,6))
plt.hist(train_lengths, bins=50, color='skyblue', edgecolor='black')
plt.title('Распределение длины текстов')
plt.xlabel('Длина текста (символы)')
plt.ylabel('Количество текстов')

for _x, _colour, _style, _caption in (
    (mean_len, 'red', '--', f'Среднее ({mean_len:.0f})'),
    (median_len, 'green', '-', f'Медиана ({median_len:.0f})'),
    (max_len, 'purple', ':', f'Максимум ({max_len})'),
):
    plt.axvline(_x, color=_colour, linestyle=_style, label=_caption)

plt.legend()
plt.show()

In [None]:
# Pretrained XLM-R checkpoint used as a frozen text encoder
model_name = "Zamza/XLM-roberta-large-ftit-emb-lr01"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# Move to the selected device (GPU when available) and switch to inference mode
model.to(device)
model.eval()

def get_xlm_embedding(text: str) -> np.ndarray:
    """
    Encode one text with the global XLM-R model and return the hidden state
    of its first token as a NumPy array. Input is truncated to 512 tokens.
    """
    batch = tokenizer(text, max_length=512, truncation=True, return_tensors="pt")

    # Move every input tensor to the model's device
    on_device = {}
    for key, tensor in batch.items():
        on_device[key] = tensor.to(device)

    with torch.no_grad():
        hidden = model(**on_device).last_hidden_state

    # First-token vector; squeeze drops the batch axis of the single input
    first_token = hidden[:, 0, :]
    return first_token.squeeze(0).cpu().numpy()

def create_embeddings_and_save(df: pd.DataFrame, text_col, name_suffix, save_dir=None) -> None:
    """
    Embed every text in ``df[text_col]`` and store the result as a compressed .npz.

    The archive ``embeddings_xlm_roberta_{name_suffix}_full.npz`` holds two
    arrays: ``embeddings`` (one row per text) and ``indices`` (the matching
    ``df.index`` values, so rows can be re-aligned after loading).

    Args:
        df: frame holding the texts.
        text_col: name of the text column.
        name_suffix: split name inserted into the file name.
        save_dir: output directory; defaults to the notebook-level ``save_path``.
            It is created if it does not exist.

    Raises:
        ValueError: if ``df`` has no rows.
    """
    target_dir = save_path if save_dir is None else save_dir
    os.makedirs(target_dir, exist_ok=True)

    texts = df[text_col].tolist()
    indices = df.index.to_numpy()  # stored next to the vectors for re-alignment

    if not texts:
        # np.vstack would fail with an unrelated-looking message
        raise ValueError(f"no texts to embed for split {name_suffix!r}")

    # One forward pass per text, with a progress bar
    embeddings = [
        get_xlm_embedding(text)
        for text in tqdm(texts, desc="Creating XLM-R embeddings")
    ]
    embeddings_np = np.vstack(embeddings)

    save_file = os.path.join(target_dir, f"embeddings_xlm_roberta_{name_suffix}_full.npz")
    np.savez_compressed(save_file, embeddings=embeddings_np, indices=indices)

    print(f"Эмбеддинги сохранены в: {save_file}")

In [None]:
# Embed the training texts and cache them on disk
create_embeddings_and_save(df=train, text_col="text_norm", name_suffix="train")

In [None]:
# Same for the validation and test texts
create_embeddings_and_save(df=val, text_col="text_norm", name_suffix="val")
create_embeddings_and_save(df=test, text_col="text_norm", name_suffix="test")

In [None]:
def load_embeddings(npz_path):
    """Load the ``embeddings`` and ``indices`` arrays from an .npz archive.

    The archive is opened in a ``with`` block so its file handle is closed
    once both arrays have been read into memory.
    """
    with np.load(npz_path) as data:
        return data['embeddings'], data['indices']

# Locations of the cached embedding archives
train_npz = os.path.join(save_path, "embeddings_xlm_roberta_train_full.npz")
val_npz = os.path.join(save_path, "embeddings_xlm_roberta_val_full.npz")
test_npz = os.path.join(save_path, "embeddings_xlm_roberta_test_full.npz")

# Read the vectors together with the row indices they were saved with
train_emb, train_idx = load_embeddings(train_npz)
val_emb, val_idx = load_embeddings(val_npz)
test_emb, test_idx = load_embeddings(test_npz)

# Wrap them in DataFrames keyed by those indices
train_emb_df = pd.DataFrame(data=train_emb, index=train_idx)
val_emb_df = pd.DataFrame(data=val_emb, index=val_idx)
test_emb_df = pd.DataFrame(data=test_emb, index=test_idx)

In [None]:
# Fit PCA on the training embeddings only, then project every split with it
pca = PCA(n_components=256, random_state=42)
pca_columns = [f'pca_emb_{i}' for i in range(256)]

train_emb_pca = pca.fit_transform(train_emb_df)
val_emb_pca = pca.transform(val_emb_df)
test_emb_pca = pca.transform(test_emb_df)

# Back to DataFrames that keep the original embedding indices
train_emb_pca_df = pd.DataFrame(train_emb_pca, index=train_emb_df.index, columns=pca_columns)
val_emb_pca_df = pd.DataFrame(val_emb_pca, index=val_emb_df.index, columns=pca_columns)
test_emb_pca_df = pd.DataFrame(test_emb_pca, index=test_emb_df.index, columns=pca_columns)

# Index-join the reduced embeddings onto the tabular features
train_with_emb = train.join(train_emb_pca_df)
val_with_emb = val.join(val_emb_pca_df)
test_with_emb = test.join(test_emb_pca_df)

In [None]:
# Peek at the first 32 column names of the joined training frame
train_with_emb.columns[:32]

In [None]:
# Columns that must never reach the models.
target_cols = ['labor_costs', 'labor_costs_log']
unused_cols = ['text', 'text_norm', 'company_id']
# Features that create_aggregated_features derives directly from the target
# (ratio, z-score and range position of labor_costs). Leaving them in X lets
# the model read the answer off its inputs, so they are dropped here as well.
target_leak_cols = [
    'cost_vs_company_avg',
    'cost_vs_dept_avg',
    'cost_vs_project_avg',
    'cost_zscore_company',
    'cost_zscore_dept',
    'cost_position_in_range',
]
drop_cols = target_cols + unused_cols + target_leak_cols

X_train = train_with_emb.drop(columns=drop_cols, errors='ignore')
y_train = train_with_emb['labor_costs_log']

X_val = val_with_emb.drop(columns=drop_cols, errors='ignore')
y_val = val_with_emb['labor_costs_log']

X_test = test_with_emb.drop(columns=drop_cols, errors='ignore')
y_test = test_with_emb['labor_costs_log']

cat_features = [
   # Core categorical columns
   'importance',
   'project_name',
   'department_name',

   # Flags
   'is_subtask',
   'is_rare_company',
   'is_rare_project',
   'has_urgent_words',
   'has_complex_words',

   # Combined categorical keys
   'company_department',
   'project_department',
   'company_project',

   # Size buckets
   'company_size_category',
   'dept_size_category',
   'project_size_category'
]

## Models
Все параметры моделей были подобраны с помощью GridSearchCV; далее используются лучшие из найденных значений.

### CatBoost
Обучалась на полном датасете.

In [None]:
# CatBoost expects categorical features as strings: cast them in every split
for _frame in (X_train, X_val, X_test):
    _frame[cat_features] = _frame[cat_features].astype(str)

# One Pool per split, all sharing the same categorical feature list
train_pool = Pool(X_train, y_train, cat_features=cat_features)
val_pool = Pool(X_val, y_val, cat_features=cat_features)
test_pool = Pool(X_test, y_test, cat_features=cat_features)

In [None]:
# Train on GPU only when one is actually available, matching the device
# fallback used elsewhere in the notebook; a hard-coded 'GPU' fails on
# CPU-only machines.
catboost_task_type = 'GPU' if torch.cuda.is_available() else 'CPU'

model = CatBoostRegressor(
    iterations=3000,
    learning_rate=0.01,
    depth=7,
    eval_metric='RMSE',
    l2_leaf_reg=3,
    random_seed=42,
    task_type=catboost_task_type,
    early_stopping_rounds=500,
    verbose=200,
    od_type='Iter',
    use_best_model=True,
)

history = model.fit(train_pool, eval_set=val_pool, use_best_model=True)

# Predict in log space, then invert the log1p transform of the target
test_preds_log = model.predict(test_pool)
test_preds = np.expm1(test_preds_log)

mse_log = mean_squared_error(y_test, test_preds_log)
mse = mean_squared_error(test_with_emb['labor_costs'], test_preds)
catboost_r2 = r2_score(test_with_emb['labor_costs'], test_preds)

print(f"CatBoost R²: {catboost_r2:.4f}")
print(f"CatBoost MSE (log): {mse_log:.4f}")
print(f"CatBoost MSE: {mse:.4f}")

# Learning curves recorded during fit
evals = model.get_evals_result()
plt.figure(figsize=(8, 4))
plt.plot(evals['validation']['RMSE'], label="Validation RMSE")
plt.plot(evals['learn']['RMSE'], label="Train RMSE")
plt.xlabel("Iteration")
plt.ylabel("RMSE")
plt.title("CatBoost Learning Curve")
plt.legend()
plt.show()

In [None]:
# Feature importances of the fitted CatBoost model, evaluated on the training pool
feature_importances = model.get_feature_importance(train_pool)

# Feature names in the same order as the importances
feature_names = train_pool.get_feature_names()

# Rank ALL features first and only then keep the 20 strongest. Slicing the
# first 20 columns before sorting shows whichever features happen to come
# first in the frame, not the most important ones.
fi_df = (
    pd.DataFrame({'feature': feature_names, 'importance': feature_importances})
    .sort_values(by='importance', ascending=False)
    .head(20)
)

print(fi_df)

# Top-20 bar chart, strongest feature at the top
plt.figure(figsize=(10,6))
plt.barh(fi_df['feature'][::-1], fi_df['importance'][::-1], color='skyblue')
plt.xlabel('Importance')
plt.title('Feature Importance (CatBoost)')
plt.show()


### RandomForest
Обучалась на 15к.

In [None]:
# Label-encode the categorical columns for scikit-learn estimators
def prepare_data_for_sklearn(X_train, X_val, X_test, cat_features):
    """Return label-encoded copies of the three splits plus the fitted encoders.

    Each column in ``cat_features`` gets its own LabelEncoder, fitted on the
    string values of train, val and test together so that every category seen
    in any split has a code. The input frames are not modified.
    """
    sources = (X_train, X_val, X_test)
    encoded = [frame.copy() for frame in sources]
    label_encoders = {}

    for col in cat_features:
        print(f"  Кодируем {col}: {X_train[col].nunique()} уникальных значений")

        raw_columns = [frame[col] for frame in sources]

        # One encoder per feature, fitted on the union of all three splits
        le = LabelEncoder()
        le.fit(pd.concat(raw_columns).astype(str))

        for target, values in zip(encoded, raw_columns):
            target[col] = le.transform(values.astype(str))

        label_encoders[col] = le

    X_train_encoded, X_val_encoded, X_test_encoded = encoded
    return X_train_encoded, X_val_encoded, X_test_encoded, label_encoders

# Label-encoded copies of the feature matrices for the scikit-learn models
X_train_sklearn, X_val_sklearn, X_test_sklearn, encoders = prepare_data_for_sklearn(
    X_train=X_train,
    X_val=X_val,
    X_test=X_test,
    cat_features=cat_features,
)

In [None]:
# Random forest hyper-parameters
rf_params = dict(
    n_estimators=500,
    max_depth=15,
    max_features='sqrt',
    min_samples_split=20,
    min_samples_leaf=2,
    random_state=42,
    n_jobs=-1,
)
rf_model = RandomForestRegressor(**rf_params)

# Fit on the log target
rf_model.fit(X_train_sklearn, y_train)

# Predict in log space and map back to the original scale
test_preds_rf_log = rf_model.predict(X_test_sklearn)
test_preds_rf = np.expm1(test_preds_rf_log)

y_test_raw = test_with_emb['labor_costs']
mse_log_rf = mean_squared_error(y_test, test_preds_rf_log)
mse_rf = mean_squared_error(y_test_raw, test_preds_rf)
rf_r2 = r2_score(y_test_raw, test_preds_rf)

print(f"RF R²: {rf_r2:.4f}")
print(f"RF MSE (log): {mse_log_rf:.4f}")
print(f"RF MSE: {mse_rf:.4f}")

In [None]:
# Feature importances of the random forest, strongest first
rf_importance = pd.DataFrame({
    'feature': X_train_sklearn.columns,
    'importance': rf_model.feature_importances_
}).sort_values('importance', ascending=False)

# Share of the total importance carried by the PCA embedding components
emb_features_rf = [f for f in rf_importance['feature'] if 'pca_emb_' in f]
emb_importance_rf = rf_importance[rf_importance['feature'].isin(emb_features_rf)]['importance'].sum()
total_importance_rf = rf_importance['importance'].sum()

print(f"RandomForest - доля важности эмбеддингов: {emb_importance_rf/total_importance_rf:.1%}")

# rf_importance is already sorted. assign() returns a new frame, so adding the
# percentage column does not write into a slice of rf_importance.
top_features = rf_importance.head(15).assign(
    importance_pct=lambda frame: frame['importance'] * 100
)
print(top_features[['feature', 'importance_pct']])


plt.figure(figsize=(10, 6))
sns.barplot(x=top_features['importance_pct'], y=top_features['feature'])
plt.xlabel('Importance (%)')
plt.ylabel('Feature')
plt.title('Топ 15 признаков по важности (RandomForest)')
plt.tight_layout()
plt.show()

### LightGBM
Обучалась на полном датасете.

In [None]:
def prepare_data_for_lightgbm(X_train, X_val, X_test, cat_features):
    """Return copies of the three splits with categorical columns integer-encoded.

    For every column in ``cat_features`` a fresh ``LabelEncoder`` is fitted on the
    string form of the values from all three splits combined, so a category that
    appears only in validation or test still gets a code. The inputs are not modified.

    Args:
        X_train, X_val, X_test: feature DataFrames containing the ``cat_features`` columns.
        cat_features: names of the categorical columns to encode.

    Returns:
        Tuple ``(X_train_lgb, X_val_lgb, X_test_lgb)`` of encoded copies.
    """
    splits = (X_train, X_val, X_test)
    encoded = tuple(frame.copy() for frame in splits)

    for column in cat_features:
        # One shared vocabulary per column across train/val/test
        encoder = LabelEncoder()
        encoder.fit(pd.concat([frame[column] for frame in splits]).astype(str))

        for source, target in zip(splits, encoded):
            target[column] = encoder.transform(source[column].astype(str))

    return encoded

# Integer-encode the categorical columns of every split for LightGBM
X_train_lgb, X_val_lgb, X_test_lgb = prepare_data_for_lightgbm(X_train, X_val, X_test, cat_features)

# Wrap the splits in LightGBM Dataset objects. Validation and test reference the
# training Dataset, and all three declare `cat_features` as categorical columns.
lgb_train = lgb.Dataset(X_train_lgb, y_train, categorical_feature=cat_features)
lgb_val = lgb.Dataset(X_val_lgb, y_val, reference=lgb_train, categorical_feature=cat_features)
# NOTE(review): lgb_test is not referenced by the cells visible below — test predictions
# are made on X_test_lgb directly. Confirm whether it is still needed.
lgb_test = lgb.Dataset(X_test_lgb, y_test, reference=lgb_train, categorical_feature=cat_features)


In [None]:
# Filled in place by the record_evaluation callback with the per-iteration validation metric
evals_result = {}

# LightGBM hyperparameters: RMSE-evaluated regression with row/feature subsampling and L1/L2 regularization
lgb_params = {
    'objective': 'regression',
    'metric': 'rmse',
    'num_leaves': 100,
    'learning_rate': 0.05,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'min_data_in_leaf': 20,
    'lambda_l1': 0.1,
    'lambda_l2': 0.1,
    'verbose': -1
}

# Train for up to 1000 rounds, stopping once the validation metric
# has not improved for 100 consecutive rounds.
lgb_model = lgb.train(
    lgb_params,
    lgb_train,
    valid_sets=[lgb_val],
    num_boost_round=1000,
    callbacks=[
        lgb.early_stopping(100, verbose=True),
        lgb.record_evaluation(evals_result)
    ]
)

# Predictions are on the scale of `y_train`; expm1 maps them back to raw labor costs
# (presumably the targets were log1p-transformed earlier in the notebook — confirm).
lgb_pred_log = lgb_model.predict(X_test_lgb)
lgb_pred = np.expm1(lgb_pred_log)

# Error on the transformed scale, then error and R² on the original scale
lgb_mse_log = mean_squared_error(y_test, lgb_pred_log)
mse_lgb = mean_squared_error(test_with_emb['labor_costs'], lgb_pred)
lgb_r2 = r2_score(test_with_emb['labor_costs'], lgb_pred)

print(f"LightGBM R²: {lgb_r2:.4f}")
print(f"LightGBM MSE (log): {lgb_mse_log:.4f}")
print(f"LightGBM MSE: {mse_lgb:.4f}")
# 'valid_0' is the name under which the single validation set is reported
print(f"LightGBM лучший RMSE: {lgb_model.best_score['valid_0']['rmse']:.4f}")

In [None]:
# Feature importance for LightGBM, most important first.
# Booster.feature_importance() with its default importance_type returns raw split
# counts, not normalized shares, so the values are converted to percent explicitly below.
lgb_importance = pd.DataFrame({
    'feature': X_train_lgb.columns,
    'importance': lgb_model.feature_importance()
}).sort_values('importance', ascending=False)

# Share of the total importance carried by the columns whose name contains 'pca_emb_'
emb_importance_lgb = lgb_importance[lgb_importance['feature'].str.contains('pca_emb_')]['importance'].sum()
total_importance_lgb = lgb_importance['importance'].sum()

print(f"LightGBM - доля важности эмбеддингов: {emb_importance_lgb/total_importance_lgb:.1%}")

# lgb_importance is already sorted, so the first 15 rows are the top features.
# .copy() makes the new column land on an independent frame.
top_features = lgb_importance.head(15).copy()

# Percentage of the total importance, so the column name and the axis label are accurate
top_features['importance_pct'] = top_features['importance'] / total_importance_lgb * 100
print(top_features[['feature', 'importance_pct']])


plt.figure(figsize=(10, 6))
sns.barplot(x=top_features['importance_pct'], y=top_features['feature'])
plt.xlabel('Importance (%)')
plt.ylabel('Feature')
plt.title('Топ 15 признаков по важности (LGB)')
plt.tight_layout()
plt.show()

### XGBoost
Модель обучалась на 15 тыс. строк данных.

In [None]:
# XGBoost regressor.
# Fixes relative to the previous version of this cell:
#   * `subsample` was misspelled as `submsample`, so row subsampling was never applied;
#   * `verbose` is not a constructor parameter (logging is controlled by `verbosity`);
#   * the compute device follows the notebook-level `device` instead of a hard-coded
#     'cuda', so the cell also runs on CPU-only machines.
xgb_model = XGBRegressor(
    objective='reg:squarederror',
    colsample_bytree=1.0,
    learning_rate=0.01,
    max_depth=5,
    n_estimators=300,
    reg_alpha=0.1,
    reg_lambda=1,
    subsample=0.8,
    random_state=42,
    tree_method='hist',
    device=device.type,
    verbosity=1,
    n_jobs=1
)

xgb_model.fit(X_train_sklearn, y_train)

# Predictions are on the scale of `y_train`; expm1 maps them back to raw labor costs
# (presumably the targets were log1p-transformed earlier in the notebook — confirm).
test_preds_log = xgb_model.predict(X_test_sklearn)
test_preds = np.expm1(test_preds_log)

# Error on the transformed scale, then error and R² on the original scale
mse_log = mean_squared_error(y_test, test_preds_log)
mse_xgb = mean_squared_error(test_with_emb['labor_costs'], test_preds)
xgb_r2 = r2_score(test_with_emb['labor_costs'], test_preds)

# R² is reported as well, matching the RandomForest and LightGBM cells
print(f"XGBoost R²: {xgb_r2:.4f}")
print(f"XGBoost MSE (log): {mse_log:.4f}")
print(f"XGBoost MSE: {mse_xgb:.4f}")

In [None]:
# Feature importance for XGBoost, most important first
xgb_importance = pd.DataFrame({
    'feature': X_train_sklearn.columns,
    'importance': xgb_model.feature_importances_
}).sort_values('importance', ascending=False)

# Share of the total importance carried by the columns whose name contains 'pca_emb_'.
# XGBoost-specific names are used so the RandomForest results (`*_rf`) are not overwritten.
emb_features_xgb = [f for f in xgb_importance['feature'] if 'pca_emb_' in f]
emb_importance_xgb = xgb_importance[xgb_importance['feature'].isin(emb_features_xgb)]['importance'].sum()
total_importance_xgb = xgb_importance['importance'].sum()

print(f"XGBoost - доля важности эмбеддингов: {emb_importance_xgb/total_importance_xgb:.1%}")

# xgb_importance is already sorted, so the first 15 rows are the top features.
# .copy() makes the new column land on an independent frame.
top_features = xgb_importance.head(15).copy()

top_features['importance_pct'] = top_features['importance'] * 100
print(top_features[['feature', 'importance_pct']])


plt.figure(figsize=(10, 6))
sns.barplot(x=top_features['importance_pct'], y=top_features['feature'])
plt.xlabel('Importance (%)')
plt.ylabel('Feature')
plt.title('Топ 15 признаков по важности (XGBoost)')
plt.tight_layout()
plt.show()



### LinearRegression

In [None]:
def prepare_data_for_linear_models(X_train, X_val, cat_features, num_features):
    """Scale numeric columns and one-hot encode categorical ones for linear models.

    The preprocessing is fitted on ``X_train`` only and then applied unchanged to
    ``X_val``. Categories unseen during fitting are ignored by the one-hot encoder
    (``handle_unknown="ignore"``), and the encoder produces dense output.

    Args:
        X_train: training features used to fit the preprocessing.
        X_val: validation features transformed with the fitted preprocessing.
        cat_features: names of the categorical columns (one-hot encoded).
        num_features: names of the numeric columns (standardized).

    Returns:
        Tuple ``(X_train_prepared, X_val_prepared)`` of transformed feature matrices.
    """
    preprocessor = ColumnTransformer(
        transformers=[
            # Standardize the numeric columns
            ("num", StandardScaler(), num_features),
            # One-hot encode the categorical columns
            ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=False), cat_features),
        ]
    )

    # Fit on train first, then reuse the fitted statistics/vocabulary for validation
    train_matrix = preprocessor.fit_transform(X_train)
    val_matrix = preprocessor.transform(X_val)
    return train_matrix, val_matrix


# Every column that is not declared categorical is treated as numeric
num_features = [col for col in X_train.columns if col not in cat_features]
# Scaled + one-hot encoded matrices for the linear model (fitted on train only)
X_train_linear, X_val_linear = prepare_data_for_linear_models(X_train, X_val, cat_features, num_features)

In [None]:
# Small grid: with/without the non-negativity constraint on the coefficients
param_grid_lr = {
    'fit_intercept': [True],
    'positive': [False, True]
}

lr_model = LinearRegression(n_jobs=-1)

# 3-fold cross-validated search scored by (negated) MSE on the training targets
grid_search_lr = GridSearchCV(
    estimator=lr_model,
    param_grid=param_grid_lr,
    cv=3,
    scoring='neg_mean_squared_error',
    verbose=2,
    n_jobs=-1
)

grid_search_lr.fit(X_train_linear, y_train)
best_lr = grid_search_lr.best_estimator_

# Predict on the validation matrix produced by the SAME preprocessing the model was
# fitted on (scaled + one-hot). The label-encoded X_val_sklearn has a different
# feature layout and must not be used here.
val_preds_lr_log = best_lr.predict(X_val_linear)
# expm1 maps predictions back to raw labor costs
# (presumably the targets were log1p-transformed earlier in the notebook — confirm).
val_preds_lr = np.expm1(val_preds_lr_log)

# Model-specific names keep these validation metrics from overwriting the generic
# `mse` that the final comparison cell reads.
mse_log_lr = mean_squared_error(y_val, val_preds_lr_log)
mse_lr = mean_squared_error(val_with_emb['labor_costs'], val_preds_lr)

print(f"LR MSE (log): {mse_log_lr:.4f}")
print(f"LR MSE: {mse_lr:.4f}")

### NN
Модель обучалась на 100 тыс. строк данных.

In [None]:
# Neural-network training hyperparameters
# Learning rate passed to the AdamW optimizer
LR=0.003
# Maximum number of training epochs (also used to size the LR schedule)
EPOCHS=200
# Mini-batch size for the train and validation DataLoaders
BATCH_SIZE=1024
# Early-stopping patience: epochs without validation improvement before training stops
PATIENCE=25

In [None]:
# Подготовка данных
def prepare_data(X_train, X_val, y_train, y_val, batch_size):
    """Standardize features and wrap both splits in PyTorch DataLoaders.

    The ``StandardScaler`` is fitted on ``X_train`` only and reused for ``X_val``.
    Targets are reshaped into a single column so they line up with a
    one-output regression head.

    Args:
        X_train, X_val: feature tables for training and validation.
        y_train, y_val: targets exposing ``.values`` (e.g. pandas Series).
        batch_size: batch size used by both loaders.

    Returns:
        Tuple ``(train_dataloader, val_dataloader, scaler)``; the training loader
        shuffles, the validation loader keeps the original order.
    """
    scaler = StandardScaler()

    def _as_dataset(features, targets):
        # Features and column-shaped targets as float tensors
        return TensorDataset(
            torch.FloatTensor(features),
            torch.FloatTensor(targets.values.reshape(-1, 1)),
        )

    # Fit the scaler on train first, then apply it to validation
    train_dataset = _as_dataset(scaler.fit_transform(X_train), y_train)
    val_dataset = _as_dataset(scaler.transform(X_val), y_val)

    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
    return train_dataloader, val_dataloader, scaler

# Build the DataLoaders for the neural network from the encoded feature tables;
# `scaler` is the StandardScaler fitted on the training features.
train_dataloader, val_dataloader, scaler = prepare_data(
    X_train_sklearn, X_val_sklearn, y_train, y_val, batch_size=BATCH_SIZE
)


In [None]:
class FeedForwardNN(nn.Module):
    """Fully connected regression network with a clamped scalar output.

    Architecture: ``input_size -> 512 -> 1024 -> 512 -> 256 -> 128 -> 64 -> 1``.
    Every hidden layer is ``Linear -> BatchNorm1d -> GELU -> Dropout``; the last
    hidden layer uses half the dropout rate. The final output is clamped to
    ``[min_log_value, max_log_value]``.

    Args:
        input_size: number of input features.
        dropout_rate: dropout probability for the hidden layers.
        max_log_value: upper clamp bound for the output.
        min_log_value: lower clamp bound for the output (was hard-coded to -2.0).

    Raises:
        ValueError: if ``min_log_value`` is greater than ``max_log_value``.
    """

    # Widths of the hidden layers, in order
    HIDDEN_SIZES = (512, 1024, 512, 256, 128, 64)

    def __init__(self, input_size, dropout_rate=0.3, max_log_value=5, min_log_value=-2.0):
        super().__init__()

        if min_log_value > max_log_value:
            raise ValueError(
                f"min_log_value ({min_log_value}) must not exceed max_log_value ({max_log_value})"
            )

        self.max_log_value = max_log_value
        self.min_log_value = min_log_value

        layers = []
        in_features = input_size
        last_hidden = len(self.HIDDEN_SIZES) - 1
        for idx, out_features in enumerate(self.HIDDEN_SIZES):
            # Lighter regularization right before the output layer
            p = dropout_rate / 2 if idx == last_hidden else dropout_rate
            layers.extend([
                nn.Linear(in_features, out_features),
                nn.BatchNorm1d(out_features),
                nn.GELU(),
                nn.Dropout(p),
            ])
            in_features = out_features

        # Single regression output
        layers.append(nn.Linear(in_features, 1))

        # Same module order as before, so state_dict keys (network.0 ... network.24) are unchanged
        self.network = nn.Sequential(*layers)

    def forward(self, x):
        """Return the network output for ``x``, clamped to the configured range."""
        return torch.clamp(self.network(x), min=self.min_log_value, max=self.max_log_value)

In [None]:
# Build the model: one input unit per feature column
input_size = X_train_sklearn.shape[1]
print(input_size)
model = FeedForwardNN(input_size, dropout_rate=0.3)
# Move the parameters to the selected device (GPU when available)
model.to(device)

In [None]:
class EarlyStopping:
    """Stop training when the validation loss stops improving.

    Call the instance once per epoch with the current validation loss and the
    model. An epoch counts as an improvement only if the loss drops by more than
    ``min_delta`` below the best value seen so far. After ``patience`` consecutive
    epochs without improvement, ``early_stop`` is set to ``True`` and, if requested,
    the model is reset to the weights of the best epoch.

    Args:
        patience: number of consecutive non-improving epochs tolerated.
        min_delta: minimum decrease of the loss that counts as an improvement.
        restore_best_weights: whether to snapshot and later restore the best weights.
        verbose: whether to print progress messages.
    """

    def __init__(self, patience=3, min_delta=0.001, restore_best_weights=True, verbose=True):
        self.patience = patience
        self.min_delta = min_delta
        self.restore_best_weights = restore_best_weights
        self.verbose = verbose
        self.best_loss = float('inf')
        self.counter = 0
        self.best_weights = None
        self.early_stop = False

    def __call__(self, val_loss, model):
        """Record ``val_loss`` for this epoch and update the stopping state."""
        if val_loss < self.best_loss - self.min_delta:
            self.best_loss = val_loss
            self.counter = 0
            if self.restore_best_weights:
                # state_dict() tensors share storage with the live parameters, and a
                # shallow dict copy would keep following the training updates.
                # Clone every tensor to get a real snapshot of the best epoch.
                self.best_weights = {
                    name: tensor.detach().clone()
                    for name, tensor in model.state_dict().items()
                }
            if self.verbose:
                print(f"Validation loss improved to {val_loss:.4f}")
        else:
            # No improvement this epoch
            self.counter += 1
            if self.verbose:
                print(f"No improvement for {self.counter}/{self.patience} epochs")

        if self.counter >= self.patience:
            self.early_stop = True
            if self.restore_best_weights and self.best_weights is not None:
                model.load_state_dict(self.best_weights)
                if self.verbose:
                    print("Restored best weights")
            if self.verbose:
                print("Early stopping triggered!")

In [None]:
# Loss function for regression
loss_fn = torch.nn.MSELoss()


# Optimizer
optimizer = optim.AdamW(model.parameters(), lr=LR, weight_decay=0.01)

# Scheduler: linear warm-up over the first 10% of the steps.
# The scheduler is stepped once per batch in the training loop, so the step budget is
# batches-per-epoch * EPOCHS.
total_steps = len(train_dataloader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=int(0.1 * total_steps),
    num_training_steps=total_steps
)

In [None]:
def train_one_epoch(model, dataloader, optimizer, loss_fn, scheduler, grad_clip=1.0):
    """Run one training pass over ``dataloader``.

    For every batch: forward pass, loss, backward pass, gradient-norm clipping to
    ``grad_clip``, optimizer step and scheduler step (the schedule advances per batch).

    Args:
        model: network to train; switched to training mode.
        dataloader: iterable of ``(inputs, labels)`` batches.
        optimizer: optimizer updating ``model``'s parameters.
        loss_fn: callable ``loss_fn(outputs, labels)`` returning a scalar tensor.
        scheduler: learning-rate scheduler stepped after every batch.
        grad_clip: maximum gradient norm.

    Returns:
        Tuple ``(avg_loss, loss_history)``: the mean of the per-batch losses and
        the list of per-batch losses.
    """
    model.train()
    loss_history = []

    for features, targets in tqdm(dataloader):
        features, targets = features.to(device), targets.to(device)

        optimizer.zero_grad()
        batch_loss = loss_fn(model(features), targets)
        batch_loss.backward()

        # Clip before the update to keep the step size bounded
        torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
        optimizer.step()
        scheduler.step()

        loss_history.append(batch_loss.item())

    return sum(loss_history) / len(dataloader), loss_history

def validate(model, dataloader, loss_fn):
    """Evaluate ``model`` on ``dataloader`` without computing gradients.

    Args:
        model: network to evaluate; switched to evaluation mode.
        dataloader: iterable of ``(inputs, labels)`` batches.
        loss_fn: callable ``loss_fn(outputs, labels)`` returning a scalar tensor.

    Returns:
        Mean of the per-batch losses (every batch weighs the same, whatever its size).
    """
    model.eval()
    batch_losses = []

    with torch.no_grad():
        for features, targets in tqdm(dataloader):
            features, targets = features.to(device), targets.to(device)
            batch_losses.append(loss_fn(model(features), targets).item())

    return sum(batch_losses) / len(dataloader)

In [None]:
# Training history
losses = {'train': [], 'val': []}
batch_losses_list = []
best_val_loss = float('inf')
early_stopping = EarlyStopping(patience=PATIENCE, min_delta=0.001, verbose=True)
best_model_path = None

# Main training loop
for epoch in range(EPOCHS):
    print(f"\nEPOCH {epoch + 1}/{EPOCHS}")

    train_loss, train_loss_history = train_one_epoch(
        model, train_dataloader, optimizer, loss_fn, scheduler, grad_clip=1.0
    )
    print(f"Train Loss: {train_loss:.4f}")

    val_loss = validate(model, val_dataloader, loss_fn)
    print(f"Val Loss: {val_loss:.4f}")

    # Track the best validation loss seen so far.
    # Writing a checkpoint to disk is currently disabled (code kept below for reference),
    # so the message reports a new best loss rather than claiming that a model was saved.
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        # best_model_path = "some_local_path.pth"
        # torch.save({
        #     'model_state_dict': model.state_dict(),
        #     'optimizer_state_dict': optimizer.state_dict(),
        #     'scheduler_state_dict': scheduler.state_dict(),
        #     'epoch': epoch + 1,
        #     'input_size': input_size,
        #     'dropout_rate': 0.3,
        #     'train_loss': train_loss,
        #     'val_loss': val_loss,
        #     'best_val_loss': best_val_loss,
        # }, best_model_path)
        print(f"New best validation loss: {val_loss:.4f}")

    # Per-epoch averages and the flat per-batch loss trace (used for plotting)
    losses['train'].append(train_loss)
    losses['val'].append(val_loss)
    batch_losses_list.extend(train_loss_history)

    # Stop once the validation loss has not improved for PATIENCE epochs
    early_stopping(val_loss, model)
    if early_stopping.early_stop:
        print(f"\nEarly stopping at epoch {epoch+1}")
        print(f"Best validation loss: {early_stopping.best_loss:.4f}")
        break


In [None]:
# # загрузка лучшей модели
# print(f"Загружаем лучшую модель из: {best_model_path}")
# checkpoint = torch.load(best_model_path, map_location=device)
# model.load_state_dict(checkpoint['model_state_dict'])
# print(f"Лучшая модель загружена! Epoch: {checkpoint['epoch']}, Val Loss: {checkpoint['val_loss']:.4f}")

In [None]:
# Final predictions on the validation set
model.eval()
predictions = []

with torch.no_grad():
    # The validation loader is not shuffled, so predictions keep the order of y_val
    for input_data, _ in val_dataloader:
        input_data = input_data.to(device)
        outputs = model(input_data)
        predictions.extend(outputs.cpu().numpy().flatten())

val_preds_log = np.array(predictions)
# expm1 maps predictions back to raw labor costs
# (presumably the targets were log1p-transformed earlier in the notebook — confirm).
val_preds = np.expm1(val_preds_log)

# Metrics: error on the transformed scale, then on the original scale
mse_log = mean_squared_error(y_val, val_preds_log)
mse_nn = mean_squared_error(val_with_emb['labor_costs'], val_preds)

print(f"\nФинальные результаты:")
print(f"Neural Network MSE (log): {mse_log:.4f}")
print(f"Neural Network MSE: {mse_nn:.4f}")



In [None]:
# Training plots

# Per-batch training loss on a log scale
plt.figure(figsize=(10, 5))
plt.plot(batch_losses_list, label="Batch Loss")
plt.xlabel("Batch iteration")
plt.ylabel("MSE Loss")
plt.title("Training Loss per Batch")
plt.yscale("log")
plt.legend()
plt.show()

# Second figure with two panels. Previously only panel 2 of a 1x2 grid was drawn on a
# fresh default-sized figure, leaving the left half empty; the per-epoch losses that
# were collected during training now fill it.
fig, (ax_loss, ax_scatter) = plt.subplots(1, 2, figsize=(14, 5))

ax_loss.plot(losses['train'], label='Train')
ax_loss.plot(losses['val'], label='Validation')
ax_loss.set_xlabel('Epoch')
ax_loss.set_ylabel('MSE Loss')
ax_loss.set_title('Loss per Epoch')
ax_loss.legend()

ax_scatter.scatter(val_with_emb['labor_costs'], val_preds, alpha=0.5)
ax_scatter.set_xlabel('Actual')
ax_scatter.set_ylabel('Predicted')
ax_scatter.set_title('Predictions vs Actual')

fig.tight_layout()
plt.show()

In [None]:
# Collect the original-scale MSE of each model for the comparison chart below
mses = {}
# NOTE(review): `mse` is a generic name; presumably it holds the CatBoost MSE computed
# in an earlier cell — confirm it has not been overwritten since.
mses['Catboost'] = mse
mses['RandomForest'] = mse_rf
mses['LightGBM'] = mse_lgb
mses['XGBoost'] = mse_xgb

# Display the collected values as the cell output
mses

In [None]:
# Bar chart comparing the MSE of the models collected in `mses`
fig, ax = plt.subplots(figsize=(8, 5))
ax.bar(mses.keys(), mses.values())
ax.set_ylabel("MSE")
ax.set_title("Сравнение MSE разных моделей")
# Tilt the model names so they do not overlap
plt.setp(ax.get_xticklabels(), rotation=30)
plt.show()