# Снижение дисперсии на реальных данных

## 📋 Оглавление

1. [**Загрузка данных**](#DATA) - подготовка датасета для эксперимента
2. [**Модели снижения дисперсии**](#Модели):
   - [BaseLine](#BaseLine) - стандартная оценка без снижения дисперсии
   - [Base CUPED](#Base-CUPED) - классический CUPED с одной контрольной переменной
   - [AutoCUPAC](#AutoCUPAC) - наш улучшенный метод с автоматическим выбором признаков
   - [Ambrosia](#Ambrosia) - сравнение с внешней библиотекой (⚠️залагала в процессе выполнения)
   - [Cluster Experiments](#Cluster-Experiments) - сравнение с внешней библиотекой
3. [**Результаты**](#RESULTS) - сводная таблица с показателями снижения дисперсии


## DATA

Используем реальные данные для демонстрации эффективности различных методов снижения дисперсии.

In [None]:
import pandas as pd
import numpy as np
import time
from functools import wraps
pd.set_option(
    'display.float_format', '{:.2f}'.format,
    'display.max_columns', None,
    'display.max_rows', None
)

In [9]:
df.head()

Unnamed: 0,epk_id,pl_6m,npv_val,sd_resident_nflag,cla_full_active_nflag,sd_sbrf_employee_nflag,prl_employee_dzo_nflag,dep_payroll_client_nflag,dep_social_client_nflag,prd_crd_dc_active_nflag,prd_crd_cc_active_nflag,dep_activity_nflag,dep_curr_acc_activity_nflag,prd_da_active_nflag,prd_pl_active_nflag,prd_mg_active_nflag,prd_insurance_active_nflag,prd_invest_prod_active_nflag,prd_vzr_active_nflag,prd_dszh_plus_active_nflag,srv_thanks_nflag,crd_cc_overdue_nflag,salary_amt,pens_amt,sbp_in_amt,sbp_out_amt,cash_in,cash_out,pos_amt,sdo_amt,invest_amt,cc_debt,auto_debt,mg_debt,pl_debt,loan_redemption,pl_6m_lag,npv_val_lag,sd_resident_nflag_lag,cla_full_active_nflag_lag,sd_sbrf_employee_nflag_lag,prl_employee_dzo_nflag_lag,dep_payroll_client_nflag_lag,dep_social_client_nflag_lag,prd_crd_dc_active_nflag_lag,prd_crd_cc_active_nflag_lag,dep_activity_nflag_lag,dep_curr_acc_activity_nflag_lag,prd_da_active_nflag_lag,prd_pl_active_nflag_lag,prd_mg_active_nflag_lag,prd_insurance_active_nflag_lag,prd_invest_prod_active_nflag_lag,prd_vzr_active_nflag_lag,prd_dszh_plus_active_nflag_lag,srv_thanks_nflag_lag,crd_cc_overdue_nflag_lag,salary_amt_lag,pens_amt_lag,sbp_in_amt_lag,sbp_out_amt_lag,cash_in_lag,cash_out_lag,pos_amt_lag,sdo_amt_lag,invest_amt_lag,cc_debt_lag,auto_debt_lag,mg_debt_lag,pl_debt_lag,loan_redemption_lag,pl_6m_lag_2,npv_val_lag_2,sd_resident_nflag_lag_2,cla_full_active_nflag_lag_2,sd_sbrf_employee_nflag_lag_2,prl_employee_dzo_nflag_lag_2,dep_payroll_client_nflag_lag_2,dep_social_client_nflag_lag_2,prd_crd_dc_active_nflag_lag_2,prd_crd_cc_active_nflag_lag_2,dep_activity_nflag_lag_2,dep_curr_acc_activity_nflag_lag_2,prd_da_active_nflag_lag_2,prd_pl_active_nflag_lag_2,prd_mg_active_nflag_lag_2,prd_insurance_active_nflag_lag_2,prd_invest_prod_active_nflag_lag_2,prd_vzr_active_nflag_lag_2,prd_dszh_plus_active_nflag_lag_2,srv_thanks_nflag_lag_2,crd_cc_overdue_nflag_lag_2,salary_amt_lag_2,pens_amt_lag_2,sbp_in_amt_lag_2,sbp_out_amt_lag_2,cash_in_lag_2,cash_out_lag_2,pos_amt_lag_2,sdo_amt_lag_2,invest_amt_lag_2,cc_debt_lag_2,auto_debt_lag_2,mg_debt_lag_2,pl_debt_lag_2,loan_redemption_lag_2,d,sd_gender_cd_F,sd_gender_cd_M,seg_age_segment_ACTIVE AGE,seg_age_segment_KIDS,seg_age_segment_MASS,seg_age_segment_OLD,seg_age_segment_TEEN,seg_age_segment_YOUTH,sd_stlmnt_type_cd_CITY_MLNR,sd_stlmnt_type_cd_CITY_OTHER,sd_stlmnt_type_cd_UNKNOWN_,sd_stlmnt_type_cd_VILLAGE,seg_client_fl_segment_cd_GROWN_UP,seg_client_fl_segment_cd_KIDS,seg_client_fl_segment_cd_MVS_EXT,seg_client_fl_segment_cd_MVS_KEY,seg_client_fl_segment_cd_PB,seg_client_fl_segment_cd_PREADULT,seg_client_fl_segment_cd_PRIME_AGE,seg_client_fl_segment_cd_SENIOR,seg_client_fl_segment_cd_TEEN,seg_client_fl_segment_cd_TOP_AFFL,seg_client_fl_segment_cd_YOUTH,seg_service_channel_cd_MASS,seg_service_channel_cd_PB,seg_service_channel_cd_PON,seg_service_channel_cd_SB1,seg_service_channel_cd_SBP,seg_client_cx_segment_cd_GROWN_UP,seg_client_cx_segment_cd_KIDS,seg_client_cx_segment_cd_MVS,seg_client_cx_segment_cd_PB,seg_client_cx_segment_cd_PREADULT,seg_client_cx_segment_cd_PRIME_AGE,seg_client_cx_segment_cd_SENIOR,seg_client_cx_segment_cd_TEEN,seg_client_cx_segment_cd_TOP_AFFL,seg_client_cx_segment_cd_YOUTH,sd_gender_cd_lag_F,sd_gender_cd_lag_M,seg_age_segment_lag_ACTIVE AGE,seg_age_segment_lag_KIDS,seg_age_segment_lag_MASS,seg_age_segment_lag_OLD,seg_age_segment_lag_TEEN,seg_age_segment_lag_YOUTH,sd_stlmnt_type_cd_lag_CITY_MLNR,sd_stlmnt_type_cd_lag_CITY_OTHER,sd_stlmnt_type_cd_lag_UNKNOWN_,sd_stlmnt_type_cd_lag_VILLAGE,seg_client_fl_segment_cd_lag_GROWN_UP,seg_client_fl_segment_cd_lag_KIDS,seg_client_fl_segment_cd_lag_MVS_EXT,seg_client_fl_segment_cd_lag_MVS_KEY,seg_client_fl_segment_cd_lag_PB,seg_client_fl_segment_cd_lag_PREADULT,seg_client_fl_segment_cd_lag_PRIME_AGE,seg_client_fl_segment_cd_lag_SENIOR,seg_client_fl_segment_cd_lag_TEEN,seg_client_fl_segment_cd_lag_TOP_AFFL,seg_client_fl_segment_cd_lag_YOUTH,seg_service_channel_cd_lag_MASS,seg_service_channel_cd_lag_PB,seg_service_channel_cd_lag_PON,seg_service_channel_cd_lag_SB1,seg_service_channel_cd_lag_SBP,seg_client_cx_segment_cd_lag_GROWN_UP,seg_client_cx_segment_cd_lag_KIDS,seg_client_cx_segment_cd_lag_MVS,seg_client_cx_segment_cd_lag_PB,seg_client_cx_segment_cd_lag_PREADULT,seg_client_cx_segment_cd_lag_PRIME_AGE,seg_client_cx_segment_cd_lag_SENIOR,seg_client_cx_segment_cd_lag_TEEN,seg_client_cx_segment_cd_lag_TOP_AFFL,seg_client_cx_segment_cd_lag_YOUTH,sd_gender_cd_lag_2_F,sd_gender_cd_lag_2_M,seg_age_segment_lag_2_ACTIVE AGE,seg_age_segment_lag_2_KIDS,seg_age_segment_lag_2_MASS,seg_age_segment_lag_2_OLD,seg_age_segment_lag_2_TEEN,seg_age_segment_lag_2_YOUTH,sd_stlmnt_type_cd_lag_2_CITY_MLNR,sd_stlmnt_type_cd_lag_2_CITY_OTHER,sd_stlmnt_type_cd_lag_2_UNKNOWN_,sd_stlmnt_type_cd_lag_2_VILLAGE,seg_client_fl_segment_cd_lag_2_GROWN_UP,seg_client_fl_segment_cd_lag_2_KIDS,seg_client_fl_segment_cd_lag_2_MVS_EXT,seg_client_fl_segment_cd_lag_2_MVS_KEY,seg_client_fl_segment_cd_lag_2_PB,seg_client_fl_segment_cd_lag_2_PREADULT,seg_client_fl_segment_cd_lag_2_PRIME_AGE,seg_client_fl_segment_cd_lag_2_SENIOR,seg_client_fl_segment_cd_lag_2_TEEN,seg_client_fl_segment_cd_lag_2_TOP_AFFL,seg_client_fl_segment_cd_lag_2_YOUTH,seg_service_channel_cd_lag_2_MASS,seg_service_channel_cd_lag_2_PB,seg_service_channel_cd_lag_2_PON,seg_service_channel_cd_lag_2_SB1,seg_service_channel_cd_lag_2_SBP,seg_client_cx_segment_cd_lag_2_GROWN_UP,seg_client_cx_segment_cd_lag_2_KIDS,seg_client_cx_segment_cd_lag_2_MVS,seg_client_cx_segment_cd_lag_2_PB,seg_client_cx_segment_cd_lag_2_PREADULT,seg_client_cx_segment_cd_lag_2_PRIME_AGE,seg_client_cx_segment_cd_lag_2_SENIOR,seg_client_cx_segment_cd_lag_2_TEEN,seg_client_cx_segment_cd_lag_2_TOP_AFFL,seg_client_cx_segment_cd_lag_2_YOUTH
0,1126087760123139866,1651.14,24959.68,1,1,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0.0,22986.85,14507.0,11540.0,0.0,0.0,24052.73,12948.72,0.0,0.0,0.0,0.0,0.0,0.0,1574.86,17544.74,1,1,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0.0,22986.85,600.0,1311.0,0.0,0.0,15195.38,8596.42,0.0,0.0,0.0,0.0,0.0,0.0,1472.42,14829.3,1,1,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0.0,22986.85,0.0,5700.0,0.0,0.0,15656.21,7849.66,0.0,0.0,0.0,0.0,0.0,0.0,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0
1,1126087957692948301,-56.27,47.73,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,206.31,0.0,0.0,0.0,0.0,0.0,0.0,-53.59,47.69,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,205.53,0.0,0.0,0.0,0.0,0.0,0.0,-50.86,47.67,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,204.51,0.0,0.0,0.0,0.0,0.0,0.0,1,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0
2,1126088039297816510,-7.67,2198.91,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.05,0.0,0.0,0.0,0.0,0.0,0.0,-23.31,2470.8,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.05,0.0,0.0,0.0,0.0,0.0,0.0,-54.18,2462.33,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0.0,0.0,0.0,0.0,0.0,0.0,10.0,8.76,0.0,0.0,0.0,0.0,0.0,0.0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0
3,1126088284112300120,-62.76,24.65,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,102.96,0.0,0.0,0.0,0.0,0.0,0.0,-69.41,24.62,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,102.39,0.0,0.0,0.0,0.0,0.0,0.0,-81.34,24.62,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,102.06,0.0,0.0,0.0,0.0,0.0,0.0,1,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0
4,1126097801760484327,-69.07,1074.15,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,48.52,0.0,33666.84,0.0,0.0,0.0,0.0,-66.7,1073.39,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,48.52,0.0,33666.84,0.0,0.0,0.0,0.0,-70.83,1073.68,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,48.52,0.0,16833.42,0.0,0.0,0.0,0.0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0


Необходимо заполнить как снизу в примере

In [10]:
CUPED_FEATURE = 'pl_6m_lag' #только одна для купеда
CUPAC_FEATURE = df.filter(like='_lag').columns.tolist() 
TREATMENT = 'd'
TARGET = 'pl_6m'

In [11]:
GLOBAL_RESULTS = pd.DataFrame(columns=[
    "experiment", "df",
    "control mean", "test mean", "control means diff", "test means diff",
    "var_red", "execution time (sec)"
])

def log_execution_time(func):
    @wraps(func)
    def wrapper(self, df, *args, **kwargs):
        start = time.time()
        result = func(self, df, *args, **kwargs)
        self.execution_time = time.time() - start
        return result
    return wrapper

## Модели

Сравнение различных подходов к снижению дисперсии в экспериментах.

### BaseLine

Стандартная оценка ATE без применения методов снижения дисперсии. Базовый уровень для сравнения.

In [12]:

class Experiment:
    name = None
    counter = 1
    transformated_column = None
    execution_time = None

    def __init__(self):
        self.name = 'BaseLine'

    @log_execution_time
    def execute(self, df: pd.DataFrame):
        self.transformated_column = df[TARGET]

    def save_result(self, df: pd.DataFrame):
        global GLOBAL_RESULTS
        if self.transformated_column is None:
            return
        
        df = df.copy()
        df['y_transform'] = self.transformated_column
        control = df[df[TREATMENT] == 0]
        test = df[df[TREATMENT] == 1]

        control_mean_original = control[TARGET].mean()
        test_mean_original = test[TARGET].mean()

        control_mean_transformed = control['y_transform'].mean()
        test_mean_transformed = test['y_transform'].mean()

        var_original = df[TARGET].var()
        var_transformed = df['y_transform'].var()

        var_reduction = (var_original - var_transformed) / var_original if var_original != 0 else None

        control_means_diff = control_mean_transformed - control_mean_original
        test_means_diff = test_mean_transformed - test_mean_original

        summary = {
            "experiment": self.name,
            "df": f"df{self.counter}",
            "control mean": control_mean_original,
            "test mean": test_mean_original,
            "var_red": var_reduction if var_reduction is not None else None,
            "control means diff": control_means_diff,
            "test means diff": test_means_diff,
            "execution time (sec)": self.execution_time
        }

        GLOBAL_RESULTS = pd.concat([GLOBAL_RESULTS, pd.DataFrame([summary])], ignore_index=True)
        self.counter += 1
    
    def execute_and_save(self, df):
        self.execute(df)
        self.save_result(df)

### Base CUPED

Классический CUPED с использованием одной контрольной переменной. Простой и эффективный метод снижения дисперсии.

In [13]:
class CupedExperiment(Experiment):
    def __init__(self):
        self.name = 'BaseCuped'

    def cuped(data: pd.DataFrame, metric_col: str, covariate_col: str):
        cov_xy = data[[metric_col, covariate_col]].cov().loc[metric_col, covariate_col]
        std_y = data[metric_col].std()
        std_x = data[covariate_col].std()
        theta = cov_xy/(std_y*std_x)
        
        return data[metric_col] - theta*(data[covariate_col] - data[covariate_col].mean())

    @log_execution_time
    def execute(self, df: pd.DataFrame):
        self.transformated_column = CupedExperiment.cuped(df, TARGET, CUPED_FEATURE)

### AutoCUPAC

**Наш основной метод** - автоматический выбор признаков и модели для максимального снижения дисперсии. Использует ML-подходы для оптимальной комбинации контрольных переменных.

In [14]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from catboost import CatBoostRegressor
from sklearn.model_selection import KFold
from sklearn.metrics import r2_score
from typing import Optional, Dict, List, Union


class CUPACTransformer:
    """
    Улучшенный CUPAC трансформер с расширенной отчетностью
    """

    def __init__(
        self,
        target_col: str,
        lag_suffix: str = "_lag",
        target_counterfactual_suffix="0",
        models: Optional[Dict] = None,
        n_folds: int = 5,
        random_state: Optional[int] = None,
    ):
        self.target_col = target_col
        self.target_counterfactual_suffix = target_counterfactual_suffix
        self.lag_suffix = lag_suffix
        self.n_folds = n_folds
        self.random_state = random_state

        # Инициализация моделей
        self.models = models or {
            "Linear": LinearRegression(),
            "Ridge": Ridge(alpha=0.5),
            "Lasso": Lasso(alpha=0.01, max_iter=10000),
            "CatBoost": CatBoostRegressor(
                iterations=100,
                depth=4,
                learning_rate=0.1,
                silent=True,
                random_state=random_state,
                allow_writing_files=False,
            ),
        }

        # Состояние модели
        self.best_model = None
        self.best_model_name = None
        self.best_score = -np.inf
        self.variance_reduction = None
        self.lag_features = None
        self.current_features = None
        self.is_fitted = False
        self.model_results_ = {}
        self.feature_importances_ = None

    def _prepare_train_data(self, df: pd.DataFrame) -> tuple:
        """Подготовка данных для обучения"""
        target_counterfactual_name = (
            f"{self.target_col}{self.target_counterfactual_suffix}{self.lag_suffix}"
        )

        self.lag_features = [
            col
            for col in df.columns
            if col.endswith(self.lag_suffix)
            and col != f"{self.target_col}{self.lag_suffix}"
        ]

        if not self.lag_features:
            raise ValueError("Не найдены лаговые признаки для обучения")

        self.current_features = [
            col.replace(self.lag_suffix, "") for col in self.lag_features
        ]

        self.current_features.append(f"{target_counterfactual_name}_1")

        return df[self.lag_features], df[f"{target_counterfactual_name}_1"]

    def _prepare_inference_data(self, df: pd.DataFrame) -> pd.DataFrame:
        """Подготовка данных для применения"""
        if not self.current_features:
            raise RuntimeError("Сначала обучите модель (fit())")

        missing = [col for col in self.current_features if col not in df.columns]
        if missing:
            raise ValueError(f"Отсутствуют признаки: {missing}")

        self.current_features = ['X1_lag', 'X2_lag']

        return df[self.current_features].rename(
            columns=dict(zip(self.current_features, self.lag_features))
        )

    def _calculate_variance_reduction(self, y: pd.Series, pred: pd.Series) -> float:
        """Расчет снижения дисперсии"""
        pred_centered = pred - pred.mean()
        if pred_centered.var() < 1e-10:
            return 0.0

        theta = np.cov(y, pred_centered)[0, 1] / pred_centered.var()
        y_adj = y - theta * pred_centered
        return max(0, (1 - y_adj.var() / y.var()) * 100)

    def fit(self, df: pd.DataFrame) -> "CUPACTransformer":
        """Обучение модели на исторических данных"""
        # X, y = self._prepare_train_data(df)
        X, y = df[CUPAC_FEATURE], df[TARGET]

        kf = KFold(n_splits=self.n_folds, shuffle=True, random_state=self.random_state)
        results = {}

        for name, model in self.models.items():
            fold_scores = []
            fold_var_reductions = []
            status = "success"

            try:
                for train_idx, val_idx in kf.split(X):
                    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
                    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

                    if name == "CatBoost":
                        m = CatBoostRegressor(**model.get_params())
                        m.fit(X_train, y_train, verbose=False)
                    else:
                        m = model.__class__(**model.get_params())
                        m.fit(X_train, y_train)

                    pred = m.predict(X_val)
                    fold_scores.append(r2_score(y_val, pred))
                    fold_var_reductions.append(
                        self._calculate_variance_reduction(y_val, pred)
                    )

                results[name] = {
                    "r2": np.nanmean(fold_scores),
                    "var_reduction": np.nanmean(fold_var_reductions),
                    "status": status,
                }

            except Exception as e:
                error_msg = f"{type(e).__name__}: {str(e)}"
                results[name] = {
                    "r2": None,
                    "var_reduction": None,
                    "status": f"failed: {error_msg}",
                }
                print(f"Ошибка в {name}: {error_msg}")

        self.model_results_ = results

        # Выбор лучшей модели из успешных
        successful_models = {
            k: v for k, v in results.items() if v["status"] == "success"
        }
        if not successful_models:
            raise RuntimeError("Все модели завершились с ошибкой")

        self.best_model_name = max(
            successful_models, key=lambda x: successful_models[x]["var_reduction"]
        )
        self.best_score = successful_models[self.best_model_name]["r2"]
        self.variance_reduction = successful_models[self.best_model_name][
            "var_reduction"
        ]

        # Финальное обучение и feature importance
        # X, y = self._prepare_train_data(df)
        X, y = df[CUPAC_FEATURE], df[TARGET]
        best_model_params = self.models[self.best_model_name].get_params()

        if self.best_model_name == "CatBoost":
            self.best_model = CatBoostRegressor(**best_model_params)
            self.best_model.fit(X, y, verbose=False)
            self.feature_importances_ = dict(
                zip(X.columns, self.best_model.get_feature_importance())
            )
        else:
            self.best_model = self.models[self.best_model_name].__class__(
                **best_model_params
            )
            self.best_model.fit(X, y)
            if hasattr(self.best_model, "coef_"):
                self.feature_importances_ = dict(zip(X.columns, self.best_model.coef_))
            else:
                self.feature_importances_ = None

        self.is_fitted = True
        return self

    def transform(self, df: pd.DataFrame, inplace: bool = False) -> pd.DataFrame:
        """Применение модели к новым данным"""
        if not self.is_fitted:
            raise RuntimeError("Сначала вызовите fit()")

        # X = self._prepare_inference_data(df)
        # y = df[self.target_col]
        X, y = df[CUPAC_FEATURE], df[TARGET]
        pred = self.best_model.predict(X)

        # pred_centered = pred - pred.mean()
        # theta = np.cov(y, pred_centered)[0, 1] / pred_centered.var()
        # y_adj = y - theta * pred_centered
        y_adj = y - pred + y.mean()

        if inplace:
            df[f"{self.target_col}_cupac"] = y_adj
            return df
        return df.assign(**{f"{self.target_col}_cupac": y_adj})

    def get_report(self) -> str:
        """Генерация расширенного отчета"""
        if not self.is_fitted:
            return "Модель не обучена. Сначала вызовите fit()."

        # Сортировка фич по важности
        sorted_features = (
            sorted(
                self.feature_importances_.items(), key=lambda x: abs(x[1]), reverse=True
            )[:10]
            if self.feature_importances_
            else []
        )

        # Форматирование сравнения моделей
        model_comparison = []
        for name, data in self.model_results_.items():
            if data["status"] != "success":
                line = f"{name}: {data['status']}"
            else:
                line = (
                    f"{name}: R²={data['r2']:.3f}, "
                    f"Var.Red.={data['var_reduction']:.1f}%"
                )
            model_comparison.append(line)

        # Форматирование фич
        feature_analysis = []
        if sorted_features:
            max_coef = max(abs(v) for _, v in sorted_features)
            for feat, coef in sorted_features:
                rel_impact = abs(coef) / max_coef if max_coef != 0 else 0
                feature_analysis.append(
                    f"- {feat:<25} {coef:>7.3f} {'▇'*int(10*rel_impact)}"
                )

        report = [
            "Расширенный CUPAC Report",
            "=" * 40,
            "Сравнение моделей:",
            *model_comparison,
            "",
            f"Лучшая модель: {self.best_model_name}",
            f"Снижение дисперсии: {self.variance_reduction:.1f}%",
            f"Качество предсказания (R²): {self.best_score:.3f}",
            "",
            "Топ-10 значимых признаков:",
            *(
                feature_analysis
                if feature_analysis
                else ["Нет данных о важности признаков"]
            ),
            "",
            "Интерпретация:",
            "▇▇▇▇▇▇▇▇▇▇ - максимальное влияние",
            "Коэффициенты > 0: положительная связь с целевой переменной",
            "Коэффициенты < 0: отрицательная связь",
        ]
        return "\n".join(report)

    def fit_transform(
        self,
        df_train: pd.DataFrame,
        df_apply: Optional[pd.DataFrame] = None,
        inplace: bool = False,
    ) -> pd.DataFrame:
        self.fit(df_train)
        df_apply = df_train if df_apply is None else df_apply
        return self.transform(df_apply, inplace=inplace)

    def get_feature_mapping(self) -> Dict[str, str]:
        return dict(zip(self.lag_features, self.current_features))

In [15]:
class AutoCupacExperiment(Experiment):
    def __init__(self):
        self.name = 'AutoCupac'
        self.transformer = None

    @log_execution_time
    def execute(self, df: pd.DataFrame):
        transformer = CUPACTransformer(target_col=TARGET)
        transformer.fit(df)
        transformed_data = transformer.transform(df)
        
        self.transformer = transformer
        self.transformated_column = transformed_data[f'{TARGET}_cupac']

### Ambrosia

Сравнение с внешней библиотекой. **Примечание**: Ambrosia залагала в процессе выполнения.

In [16]:
from ambrosia.preprocessing import MultiCuped

class AmbrosiaCupacExperiment(Experiment):
    def __init__(self):
        self.name = 'AmbrosiaMultiCuped'

    @log_execution_time
    def execute(self, df: pd.DataFrame):
        cuped = MultiCuped(verbose=False)
        cuped.fit(
                df,
                TARGET,
                CUPAC_FEATURE)
        transformed_data = cuped.transform(df)

        self.transformated_column = transformed_data[f'{TARGET}_transformed']


### Cluster Experiments

Сравнение с внешней библиотекой.

In [17]:
from cluster_experiments import (
    AnalysisPlan, SimpleMetric, Variant,
    HypothesisTest, TargetAggregation
)

class CupacClusterExperiment(Experiment):
    def __init__(self):
        self.name = 'ClusterCupac'

    @log_execution_time
    def execute(self, df: pd.DataFrame):
        cur = df.copy()
        cur["user_id"] = np.arange(len(cur))
        cur = cur.rename(columns={TREATMENT: "variant"})
        cur["variant"] = cur["variant"].replace({0: "control", 1: "treatment"})

        cupac_model = TargetAggregation(
            agg_col="user_id"
        )

        cupac_model.fit(cur[["user_id"] + list(set(CUPAC_FEATURE) - set(CUPED_FEATURE))], cur[CUPED_FEATURE])
        pred = cupac_model.predict(cur[["user_id"] + CUPAC_FEATURE])

        pred_centered = pred - pred.mean()
        theta = np.cov(cur[TARGET], pred_centered)[0, 1] / pred_centered.var()
        self.transformated_column = cur[TARGET] - theta * pred_centered

## RESULTS

**Сводная таблица результатов** - сравнение эффективности методов по снижению дисперсии (var_red) и времени выполнения.

In [18]:
experiments = [Experiment(), CupedExperiment(), AutoCupacExperiment(), AmbrosiaCupacExperiment(), CupacClusterExperiment()]

In [None]:
from tqdm import tqdm
for exp in tqdm(experiments):
    for df in dfs:
        exp.execute_and_save(df)

100%|██████████| 5/5 [1:45:55<00:00, 1271.09s/it]  


In [23]:
GLOBAL_RESULTS

Unnamed: 0,experiment,df,control mean,test mean,control means diff,test means diff,var_red,execution time (sec)
0,BaseLine,df1,4854.04,4905.29,0.0,0.0,0.00,0.0
1,BaseCuped,df1,4854.04,4905.29,-8.86,8.86,0.30,0.08
2,AutoCupac,df1,4854.04,4905.29,-11.03,11.03,0.33,6325.33
3,AmbrosiaMultiCuped,df1,4854.04,4905.29,1.0286429183856296e+24,-1.0287969423798597e+24,-8481995866788892498083888852471408994273355366...,7.13
4,ClusterCupac,df1,4854.04,4905.29,-11.74,11.74,0.32,7.98
