### Библиотеки

In [16]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from itertools import product
from tqdm import tqdm

import statsmodels.api as sm
from scipy.stats import t, f, boxcox, skew, kurtosis, gmean
from statsmodels.stats.diagnostic import linear_reset, het_white

import warnings
warnings.filterwarnings('ignore')

### Обработка

In [17]:
# Load the dataset saved after the feature-selection step. The commented-out
# line below reads the earlier file written after the processing step instead.
# data = pd.read_csv('data_after_processing.csv', encoding='utf-8')
data = pd.read_csv('data_after_feature_selection.csv', encoding='utf-8')
# Bare expression: in a notebook this renders the loaded table.
data

# Disabled column drops, kept for reference only.
# data = data.drop(['title'], axis=1)
# data = data.drop(['author_Другой', 'publisher_Другой', 'publication_year_Другой',
                #   'cover_type_Мягкий заламинированный картон', 'reading_age_6+', ''], axis=1)

Unnamed: 0,price,const,avg_rating,cnt_reviews,pages_cnt,tirage,publisher_АСТ,publisher_Азбука,publisher_Иностранка,publisher_Эксмо,cover_type_Мягкий_переплёт,2024_or_2025,6-,16+
0,312,1.0,4.1,925,512,30000,1.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0
1,312,1.0,4.1,1341,320,30000,1.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0
2,284,1.0,4.5,363,192,25000,1.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0
3,339,1.0,4.3,872,288,12000,1.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0
4,284,1.0,4.2,3004,320,30000,1.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3185,1241,1.0,5.0,3,624,1500,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
3186,2207,1.0,3.7,53,448,1500,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
3187,1103,1.0,4.3,42,784,3000,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
3188,1011,1.0,3.8,29,704,2000,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0


### Логарифм цены

In [18]:
# Replace the raw `price` column with its natural log; `log_price` becomes the
# regression target and every remaining column is a regressor.
data = data.assign(log_price=np.log(data['price'])).drop(columns=['price'])
y = data['log_price']
X = data.drop(columns=['log_price'])

### Регрессоры, которые можно и нельзя логарифмировать

In [19]:
# List the regressor names so the candidates for a functional transform can be picked.
X.columns

Index(['const', 'avg_rating', 'cnt_reviews', 'pages_cnt', 'tirage',
       'publisher_АСТ', 'publisher_Азбука', 'publisher_Иностранка',
       'publisher_Эксмо', 'cover_type_Мягкий_переплёт', '2024_or_2025', '6-',
       '16+'],
      dtype='object')

In [20]:
# Regressors for which a functional transform (log, 1/x, ...) is tried.
cols_to_try_log = ['avg_rating', 'cnt_reviews', 'pages_cnt', 'tirage']

# All other regressors always enter the model untransformed.
unconditional_cols = [name for name in X.columns if name not in cols_to_try_log]

In [28]:
def generate_all_combinations(num_of_repeats):
    """Enumerate every on/off mask of length ``num_of_repeats``.

    Each mask is a tuple of booleans; position ``i`` tells whether the
    functional transform is applied to the i-th candidate feature.

    Returns:
        An ``itertools.product`` iterator over all ``2 ** num_of_repeats`` masks.
    """
    flag_values = [False, True]
    return product(flag_values, repeat=num_of_repeats)

def prepare_X(cols_to_try_func, combination, df, func):
    """Build the regressor matrix for one combination of transformed features.

    The matrix starts from the module-level ``unconditional_cols`` columns of
    ``df``. Then, for every candidate column, either ``func(column)`` is added
    under the name ``"<func name>_<column>"`` (mask entry is truthy) or the raw
    column is added under its own name.

    Args:
        cols_to_try_func: names of the candidate columns in ``df``.
        combination: sequence of booleans aligned with ``cols_to_try_func``.
        df: source DataFrame holding all columns.
        func: element-wise transform, e.g. ``np.log``, ``np.reciprocal`` or a
            custom callable.

    Returns:
        The assembled DataFrame passed through ``sm.add_constant``.
    """
    X_temp = df[unconditional_cols].copy()
    feature_name = getattr(func, '__name__', repr(func))
    # log and 1/x are undefined at zero, so only these two get the guard below.
    needs_nonzero_guard = func is np.log or func is np.reciprocal

    for i, col in enumerate(cols_to_try_func):
        if combination[i]:
            try:
                safe_col = df[col]
                if needs_nonzero_guard:
                    # np.reciprocal on an integer dtype performs integer
                    # division (0 for |x| > 1), so work in floats explicitly.
                    safe_col = safe_col.astype(float)
                    # Shift only when a non-positive value is present. The
                    # original `A and B or C` condition shifted every
                    # reciprocal column because `and` binds tighter than `or`.
                    if (safe_col <= 0).any():
                        safe_col = safe_col + 1e-6

                X_temp[f'{feature_name}_{col}'] = func(safe_col)

            except Exception as e:
                # Best effort: report and go on; the column is then absent
                # from the returned matrix.
                print(f"Ошибка при применении функции к {col}: {e}")
        else:
            X_temp[col] = df[col]

    return sm.add_constant(X_temp)

def getting_statistic(cols_to_try_func, df, func):
    """Fit one OLS model per transform mask and collect fit criteria.

    For every on/off mask over ``cols_to_try_func`` the regressor matrix is
    built with ``prepare_X`` and ``df['log_price']`` is regressed on it.
    A combination that raises is reported with ``print`` and skipped.

    Args:
        cols_to_try_func: names of the candidate columns to transform.
        df: DataFrame holding the regressors and the ``log_price`` column.
        func: element-wise transform applied to the selected columns.

    Returns:
        DataFrame with one row per fitted model and the columns
        ``combination``, ``aic``, ``bic``, ``adj_r2`` and
        ``"<func name>_columns"`` (names of the transformed columns).
    """
    n_candidates = len(cols_to_try_func)
    transform_label = getattr(func, '__name__', repr(func))
    masks = generate_all_combinations(num_of_repeats=n_candidates)

    rows = []
    for mask in tqdm(masks, total=2 ** n_candidates):
        try:
            design = prepare_X(cols_to_try_func, mask, df, func)
            fitted = sm.OLS(df['log_price'], design).fit()
            rows.append(dict(
                combination=mask,
                aic=fitted.aic,
                bic=fitted.bic,
                adj_r2=fitted.rsquared_adj,
            ))
        except Exception as e:
            print(f"Error in combination {mask}: {str(e)}")
            continue

    summary = pd.DataFrame(rows)

    def transformed_names(mask):
        # Translate a boolean mask back into the names of the transformed columns.
        return [cols_to_try_func[pos] for pos, flag in enumerate(mask) if flag]

    summary[f'{transform_label}_columns'] = summary['combination'].apply(transformed_names)

    return summary

In [29]:
# Check which candidate regressors are better entered in logs.
data_frame = getting_statistic(cols_to_try_func=cols_to_try_log, df=data, func=np.log)

# Winning row under each criterion: smallest AIC, smallest BIC, largest adjusted R².
best_aic = data_frame.loc[data_frame['aic'].idxmin()]
best_bic = data_frame.loc[data_frame['bic'].idxmin()]
best_adj_r2 = data_frame.loc[data_frame['adj_r2'].idxmax()]

print(
    "Лучшая модель по AIC:",
    f"Логарифмированные переменные: {best_aic['log_columns']}",
    f"AIC: {best_aic['aic']:.2f}\n",
    sep="\n",
)

print(
    "Лучшая модель по BIC:",
    f"Логарифмированные переменные: {best_bic['log_columns']}",
    f"BIC: {best_bic['bic']:.2f}\n",
    sep="\n",
)

print(
    "Лучшая модель по Adj.R²:",
    f"Логарифмированные переменные: {best_adj_r2['log_columns']}",
    f"Adj.R²: {best_adj_r2['adj_r2']:.4f}",
    sep="\n",
)

100%|██████████| 16/16 [00:00<00:00, 113.66it/s]

Лучшая модель по AIC:
Логарифмированные переменные: ['tirage']
AIC: 1702.39

Лучшая модель по BIC:
Логарифмированные переменные: ['tirage']
BIC: 1781.27

Лучшая модель по Adj.R²:
Логарифмированные переменные: ['tirage']
Adj.R²: 0.6062





In [30]:
# Check features of the form 1/x.
my_func = np.reciprocal
column_name = f'{my_func.__name__}_columns'
data_frame = getting_statistic(cols_to_try_func=cols_to_try_log, df=data, func=my_func)

# Winning row under each criterion: smallest AIC, smallest BIC, largest adjusted R².
best_aic = data_frame.loc[data_frame['aic'].idxmin()]
best_bic = data_frame.loc[data_frame['bic'].idxmin()]
best_adj_r2 = data_frame.loc[data_frame['adj_r2'].idxmax()]

print(
    "Лучшая модель по AIC:",
    f"Преобразованные переменные: {best_aic[column_name]}",
    f"AIC: {best_aic['aic']:.2f}\n",
    sep="\n",
)

print(
    "Лучшая модель по BIC:",
    f"Преобразованные переменные: {best_bic[column_name]}",
    f"BIC: {best_bic['bic']:.2f}\n",
    sep="\n",
)

print(
    "Лучшая модель по Adj.R²:",
    f"Преобразованные переменные: {best_adj_r2[column_name]}",
    f"Adj.R²: {best_adj_r2['adj_r2']:.4f}",
    sep="\n",
)

100%|██████████| 16/16 [00:00<00:00, 180.94it/s]

Лучшая модель по AIC:
Преобразованные переменные: []
AIC: 1774.09

Лучшая модель по BIC:
Преобразованные переменные: []
BIC: 1852.98

Лучшая модель по Adj.R²:
Преобразованные переменные: []
Adj.R²: 0.5973





Также функции будут работать, если мы решим проверить какие-то кастомные функции типа этой:

In [31]:
def quadratic_shift(x):
    """Return ``x`` raised to the second power (example of a custom transform)."""
    squared = pow(x, 2)
    return squared

### Итоговая модель с ln Y

После проверки всех функциональных форм обучаем итоговую модель:

In [32]:
# Columns that enter the final model in logs.
cols_to_log = ['tirage']
# Everything else enters in levels: remaining candidates first, then the
# always-included columns.
cols_not_to_log = [
    *(name for name in cols_to_try_log if name not in cols_to_log),
    *unconditional_cols,
]

# np.log keeps the column labels, so the log-transformed column is still
# called `tirage` in the fitted model's summary.
X_log = np.log(data[cols_to_log])
X_not_to_log = data[cols_not_to_log]

X_log_model = sm.add_constant(pd.concat([X_log, X_not_to_log], axis=1))

log_model = sm.OLS(data['log_price'], X_log_model).fit()
log_model.summary()

0,1,2,3
Dep. Variable:,log_price,R-squared:,0.608
Model:,OLS,Adj. R-squared:,0.606
Method:,Least Squares,F-statistic:,410.2
Date:,"Tue, 06 May 2025",Prob (F-statistic):,0.0
Time:,11:33:38,Log-Likelihood:,-838.2
No. Observations:,3190,AIC:,1702.0
Df Residuals:,3177,BIC:,1781.0
Df Model:,12,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
tirage,-0.2476,0.013,-18.999,0.000,-0.273,-0.222
avg_rating,0.1175,0.016,7.253,0.000,0.086,0.149
cnt_reviews,0.0006,5.01e-05,11.072,0.000,0.000,0.001
pages_cnt,0.0008,2.69e-05,28.267,0.000,0.001,0.001
const,7.6400,0.131,58.474,0.000,7.384,7.896
publisher_АСТ,-0.3519,0.030,-11.729,0.000,-0.411,-0.293
publisher_Азбука,-0.3447,0.032,-10.873,0.000,-0.407,-0.283
publisher_Иностранка,0.0240,0.039,0.607,0.544,-0.053,0.101
publisher_Эксмо,-0.3021,0.030,-10.153,0.000,-0.360,-0.244

0,1,2,3
Omnibus:,90.146,Durbin-Watson:,2.007
Prob(Omnibus):,0.0,Jarque-Bera (JB):,181.182
Skew:,0.18,Prob(JB):,4.54e-40
Kurtosis:,4.11,Cond. No.,12400.0


**мини-вопрос на подумать:** допустим, мы попробовали разные функциональные формы для признаков, и функция и для `log`, и для `custom_func` выдала один и тот же признак `tirage` как тот, который стоит преобразовывать. Что с ним делать?
<br>
<br>
**"ответ":** я предлагаю сильно не заморачиваться и выбрать что придется