In [1]:
# Enable IPython's autoreload extension. Mode 2 re-imports all loaded modules
# before each cell runs, so edits to imported modules are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
# Optional one-time setup, intentionally left commented out: installs
# `linearmodels` with pip for the interpreter running this kernel (sys.executable).
# import sys

# !{sys.executable} -m pip install linearmodels

# Сгенерируем синтетические данные

In [3]:
import pandas as pd
import numpy as np

In [4]:
from data_gen import DataGenerator

In [5]:
# Configure the project-local synthetic data generator.
# NOTE(review): `time_correlations` presumably sets the target correlation
# between a variable and its lagged column (compare the df.corr() output
# further down) -- confirm against data_gen.DataGenerator.
df_gen = DataGenerator(
    n_samples=10_000,
    time_correlations=dict(X1=0.3, X2=0.2, y0=0.8),
    distributions=dict(
        X1=dict(type='normal', mean=1, std=2),
        X2=dict(type='bernoulli', p=0.4),
        y0=dict(type='normal', mean=10, std=4),
    ),
    effect_size=5.0,
    seed=1114,  # fixed seed passed to the generator
)

# Build the frame that every later cell works on.
df = df_gen.generate()

In [6]:
# Summary statistics (count, mean, std, min, quartiles, max) of the generated frame.
df.describe()

Unnamed: 0,X1,X1_lag,X2,X2_lag,y0,y0_lag_1,y0_lag_2,z,U,D,d,y1,y
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,1.014193,1.000744,0.4061,0.3937,9.930242,9.94699,9.92388,0.4994,-0.002302,0.4968,0.3303,14.919766,11.759085
std,1.974633,2.012851,0.491128,0.488594,4.030378,4.034053,4.027346,0.500025,1.005845,0.500015,0.470345,4.725965,4.997801
min,-6.601822,-5.702579,0.0,0.0,-5.503725,-4.656782,-5.032439,0.0,-3.678372,0.0,0.0,-2.308571,-5.503725
25%,-0.300915,-0.350493,0.0,0.0,7.256198,7.225515,7.172954,0.0,-0.672154,0.0,0.0,11.754888,8.306735
50%,1.032543,0.985913,0.0,0.0,9.963009,9.995205,9.959684,0.0,-0.011273,0.0,0.0,14.9216,11.428904
75%,2.319681,2.334578,1.0,1.0,12.67869,12.709974,12.652646,1.0,0.674547,1.0,1.0,18.095607,14.995862
max,8.355516,8.613754,1.0,1.0,24.566237,24.039729,24.633057,1.0,3.526289,1.0,1.0,33.225206,33.225206


In [7]:
# Pairwise correlations between the columns (DataFrame.corr with default arguments);
# useful for comparing against the `time_correlations` passed to the generator.
df.corr()

Unnamed: 0,X1,X1_lag,X2,X2_lag,y0,y0_lag_1,y0_lag_2,z,U,D,d,y1,y
X1,1.0,0.284308,0.007335,0.00123,0.004969,0.020897,0.013754,0.006951,-0.002888,-0.006704,-0.007246,-0.002252,-0.004459
X1_lag,0.284308,1.0,-0.002579,-0.012452,2.8e-05,0.00891,0.00834,-0.001747,-0.010746,-0.024415,-0.019153,-0.005966,-0.007068
X2,0.007335,-0.002579,1.0,0.187625,0.006889,-0.003521,-0.00777,0.000381,-0.013351,-0.01731,-0.013572,-0.002256,-0.001944
X2_lag,0.00123,-0.012452,0.187625,1.0,0.012096,0.012911,0.012817,-0.000466,0.003258,0.01314,0.001571,0.009878,0.011815
y0,0.004969,2.8e-05,0.006889,0.012096,1.0,0.801618,0.643973,0.000976,-0.011477,-0.003107,0.004976,0.848736,0.80816
y0_lag_1,0.020897,0.00891,-0.003521,0.012911,0.801618,1.0,0.801331,-0.009141,-0.005001,-0.00699,-0.004596,0.686934,0.645514
y0_lag_2,0.013754,0.00834,-0.00777,0.012817,0.643973,0.801331,1.0,-0.005579,-0.002406,-0.002292,-0.000476,0.554681,0.519708
z,0.006951,-0.001747,0.000381,-0.000466,0.000976,-0.009141,-0.005579,1.0,-0.016875,0.328799,0.703129,-0.003604,0.367174
U,-0.002888,-0.010746,-0.013351,0.003258,-0.011477,-0.005001,-0.002406,-0.016875,1.0,0.503446,0.262801,0.306424,0.205024
D,-0.006704,-0.024415,-0.01731,0.01314,-0.003107,-0.00699,-0.002292,0.328799,0.503446,1.0,0.706795,0.152517,0.365792


In [8]:
# Variance of X1; the configured 'std': 2 corresponds to a variance of 4.
df['X1'].var()

3.899173568893451

In [9]:
# Variance of the X1_lag column, for comparison with the variance of X1.
df['X1_lag'].var()

4.05156882720308

# Используем auto CUPAC

In [11]:
from autocupac import CUPACTransformer

In [12]:
# Create the CUPAC transformer for target column 'y', fit it on the generated
# frame and apply it to that same frame.
transformer = CUPACTransformer(target_col='y')
transformer.fit(df)
transformed_data = transformer.transform(df)

# Alternative: do both steps in one call.
# NOTE(review): `train_data` / `new_data` are not defined in this notebook;
# substitute real frames before uncommenting.
# transformed_data = transformer.fit_transform(train_data, new_data)

# Show the model-comparison report produced by the transformer.
print(transformer.get_report())
print("\nTransformed Data:")
# Preview only the first rows rather than printing the whole 10,000-row frame.
print(transformed_data.head())

Расширенный CUPAC Report
Сравнение моделей:
Linear: R²=0.641, Var.Red.=64.2%
Ridge: R²=0.642, Var.Red.=64.2%
Lasso: R²=0.641, Var.Red.=64.2%
CatBoost: R²=0.638, Var.Red.=63.8%

Лучшая модель: Ridge
Снижение дисперсии: 64.2%
Качество предсказания (R²): 0.642

Топ-10 значимых признаков:
- y0_lag_2                    0.803 ▇▇▇▇▇▇▇▇▇▇
- X2_lag                      0.022 
- X1_lag                      0.005 

Интерпретация:
▇▇▇▇▇▇▇▇▇▇ - максимальное влияние
Коэффициенты > 0: положительная связь с целевой переменной
Коэффициенты < 0: отрицательная связь

Transformed Data:
            X1    X1_lag  X2  X2_lag         y0   y0_lag_1   y0_lag_2  z  \
0     1.407035  2.582219   0       1  10.054458   9.812256   9.592427  1   
1     3.850649  2.757775   0       1   5.968851   6.515070   7.316338  0   
2     3.287116  3.961634   1       0  11.414347   9.482436   8.523009  1   
3     4.293427  0.229035   1       0   6.488326   5.130305  10.111296  1   
4    -0.514052  1.328158   1       0  15.660215

In [13]:
# Summary statistics of the transformed frame, including the added `y_cupac` column.
# NOTE(review): in the stored output std(y) = 4.998 and std(y_cupac) = 3.817, i.e.
# roughly a 42% variance reduction, whereas the report printed earlier states
# 64.2% -- presumably that figure is computed differently; confirm in autocupac.
transformed_data.describe()

Unnamed: 0,X1,X1_lag,X2,X2_lag,y0,y0_lag_1,y0_lag_2,z,U,D,d,y1,y,y_cupac
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,1.014193,1.000744,0.4061,0.3937,9.930242,9.94699,9.92388,0.4994,-0.002302,0.4968,0.3303,14.919766,11.759085,11.759085
std,1.974633,2.012851,0.491128,0.488594,4.030378,4.034053,4.027346,0.500025,1.005845,0.500015,0.470345,4.725965,4.997801,3.817295
min,-6.601822,-5.702579,0.0,0.0,-5.503725,-4.656782,-5.032439,0.0,-3.678372,0.0,0.0,-2.308571,-5.503725,1.435019
25%,-0.300915,-0.350493,0.0,0.0,7.256198,7.225515,7.172954,0.0,-0.672154,0.0,0.0,11.754888,8.306735,9.021053
50%,1.032543,0.985913,0.0,0.0,9.963009,9.995205,9.959684,0.0,-0.011273,0.0,0.0,14.9216,11.428904,11.148921
75%,2.319681,2.334578,1.0,1.0,12.67869,12.709974,12.652646,1.0,0.674547,1.0,1.0,18.095607,14.995862,14.077516
max,8.355516,8.613754,1.0,1.0,24.566237,24.039729,24.633057,1.0,3.526289,1.0,1.0,33.225206,33.225206,28.91166
