In [112]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import gmean

import statsmodels.api as sm
from scipy.stats import t, f, boxcox, skew, kurtosis
from statsmodels.stats.diagnostic import linear_reset, het_white


import warnings
warnings.filterwarnings('ignore')

### Чтение и предобработка

In [113]:
data = pd.read_csv('data_after_processing.csv')
data.columns = [col.replace(' ', '_') for col in data.columns]

In [114]:
not_null = ['thickness', 'width', 'length', 'volume']
data = data[(data[not_null] > 0).all(axis=1)]

In [115]:
#data = data.drop(['title'], axis=1)
data = data.drop(['author_Другой', 'publisher_Другой', 'publication_year_Другой',
                  'cover_type_Мягкий_заламинированный_картон', 'cover_type_Твёрдый_переплёт', 'reading_age_12+', 'is_russian_author_0', 'is_russian_author_1'], axis=1)

In [116]:
data.columns

Index(['price', 'avg_rating', 'cnt_reviews', 'pages_cnt', 'tirage', 'weight',
       'author_Джейн_Остен', 'author_Джек_Лондон', 'author_Джордж_Оруэлл',
       'author_Лев_Толстой', 'author_Луиза_Мэй_Олкотт',
       'author_Михаил_Булгаков', 'author_Николай_Гоголь',
       'author_Федор_Достоевский', 'author_Эрих_Ремарк', 'publisher_АСТ',
       'publisher_Азбука', 'publisher_Иностранка',
       'publisher_Манн,_Иванов_и_Фербер', 'publisher_Эксмо',
       'publication_year_2021', 'publication_year_2022',
       'publication_year_2023', 'publication_year_2024',
       'publication_year_2025', 'cover_type_Мягкий_переплёт', 'reading_age_0+',
       'reading_age_16+', 'reading_age_18+', 'reading_age_6+', 'thickness',
       'width', 'length', 'volume'],
      dtype='object')

In [117]:
X, y = data.drop(['price'], axis=1), data['price']
X = sm.add_constant(X)

## Отбор объясняющих признаков

### Удаляем авторов
Было принято решение удалить всех авторов тк по ним очень мало наблюдений

In [118]:
authors_columns = ['author_Джейн_Остен', 'author_Джек_Лондон', 'author_Джордж_Оруэлл',
       'author_Лев_Толстой', 'author_Луиза_Мэй_Олкотт',
       'author_Михаил_Булгаков', 'author_Николай_Гоголь',
       'author_Федор_Достоевский', 'author_Эрих_Ремарк']
    
X[authors_columns].sum()

author_Джейн_Остен           68.0
author_Джек_Лондон           69.0
author_Джордж_Оруэлл         64.0
author_Лев_Толстой           46.0
author_Луиза_Мэй_Олкотт      48.0
author_Михаил_Булгаков       71.0
author_Николай_Гоголь        47.0
author_Федор_Достоевский    114.0
author_Эрих_Ремарк           66.0
dtype: float64

In [119]:
X.drop(columns = ['author_Джейн_Остен', 'author_Джек_Лондон', 'author_Джордж_Оруэлл',
       'author_Лев_Толстой', 'author_Луиза_Мэй_Олкотт',
       'author_Михаил_Булгаков', 'author_Николай_Гоголь',
       'author_Федор_Достоевский', 'author_Эрих_Ремарк'], inplace=True)

### Размерные признаки

Выкидываем по-очереди вес, толщину, объем, страницы, площадь, длину, ширину смотрим на R2 и на коэффициент при обложке

In [120]:
X['S'] = X['length'] * X['width']

Выбираем, какой признак удалить. Выводим $R^2_{adj}$ и коэффициент при pages_cnt

In [None]:
from itertools import combinations
groups = ['weight', 'thickness', 'volume', 'width', 'length', 'S']

to_data_frame = []

for r in range (1, len(groups) + 1):
    for group in combinations(groups, r=r):
        group = list(group)
# for group in groups_to_test:
        X_full = X.copy()
        X_reduced = X.drop(columns=group)
        model_full = sm.OLS(y, X_full).fit()
        model_reduced = sm.OLS(y, X_reduced).fit()

        f_stat, p_val, df_diff = model_full.compare_f_test(model_reduced)
        # print(f"Группа: {group}, F = {f_stat}, p = {p_val}")
        r = model_reduced.rsquared_adj
        # b_cover = model_reduced.params['cover_type_Мягкий_переплёт']
        b_pages = model_reduced.params['pages_cnt']
        aic = model_reduced.aic

        if (b_pages > 0):
            dct = {'group': group,
                  'r': r,
                  'b_pages' : b_pages}

            to_data_frame.append(dct)

data_frame_info = pd.DataFrame(to_data_frame)
data_frame_info.sort_values(by=['r'], ascending=False)

Unnamed: 0,group,r,b_pages
0,[weight],0.742979,0.198612
3,"[weight, width]",0.728274,0.216185
14,"[weight, width, S]",0.728023,0.215811
5,"[weight, S]",0.72802,0.215338
4,"[weight, length]",0.722819,0.215002
13,"[weight, width, length]",0.721443,0.210861
23,"[weight, width, length, S]",0.720517,0.210317
15,"[weight, length, S]",0.720431,0.210419
1,"[weight, thickness]",0.711167,0.005746
7,"[weight, thickness, width]",0.696173,0.02258


In [121]:
X.columns

Index(['const', 'avg_rating', 'cnt_reviews', 'pages_cnt', 'tirage', 'weight',
       'publisher_АСТ', 'publisher_Азбука', 'publisher_Иностранка',
       'publisher_Манн,_Иванов_и_Фербер', 'publisher_Эксмо',
       'publication_year_2021', 'publication_year_2022',
       'publication_year_2023', 'publication_year_2024',
       'publication_year_2025', 'cover_type_Мягкий_переплёт', 'reading_age_0+',
       'reading_age_16+', 'reading_age_18+', 'reading_age_6+', 'thickness',
       'width', 'length', 'volume', 'S'],
      dtype='object')

In [148]:
from tqdm import tqdm
from itertools import combinations

groups = [
        'avg_rating', 'cnt_reviews', 'pages_cnt', 'tirage', 
       'cover_type_Мягкий_переплёт', 
       'thickness', 'width', 'length', 'volume', 'S', 'weight',
       'publisher_АСТ', 'publisher_Азбука', 'publisher_Иностранка',
       'publisher_Манн,_Иванов_и_Фербер', 'publisher_Эксмо',
        ]

to_data_frame = []

for r in tqdm(range(1, len(groups) + 1)):
    for group in tqdm(combinations(groups, r=r)):
        group = list(group)
# for group in groups_to_test:
        X_full = X.copy()
        X_reduced = X.drop(columns=group)
        model_full = sm.OLS(y, X_full).fit()
        model_reduced = sm.OLS(y, X_reduced).fit()

        f_stat, p_val, df_diff = model_full.compare_f_test(model_reduced)
        # print(f"Группа: {group}, F = {f_stat}, p = {p_val}")
        r_sq_adj = model_reduced.rsquared_adj

        try:
            b_cover = model_reduced.params['cover_type_Мягкий_переплёт']
        except:
            b_cover = 0

        try:
            b_pages = model_reduced.params['pages_cnt']
        except:
            b_pages = 0 

        aic = model_reduced.aic

        # if (b_pages >= 0):
        dct = {'group': group,
                'r': r_sq_adj,
                'b_pages' : b_pages,
                'b_cover': b_cover}

        to_data_frame.append(dct)

data_frame_info = pd.DataFrame(to_data_frame)
data_frame_info.sort_values(by=['r'], ascending=False)

16it [00:00, 61.00it/s]00:00<?, ?it/s]
120it [00:01, 70.05it/s]0:00<00:03,  3.78it/s]
560it [00:08, 64.07it/s]0:01<00:15,  1.12s/it]
1820it [00:32, 56.68it/s]:10<00:59,  4.62s/it]
4368it [00:45, 96.68it/s]:42<03:05, 15.47s/it]
8008it [01:26, 92.39it/s]:28<04:48, 26.19s/it]
11440it [01:51, 102.25it/s]4<07:47, 46.75s/it]
12870it [01:52, 114.84it/s]6<10:12, 68.05s/it]
11440it [01:40, 114.00it/s]8<10:56, 82.06s/it]
8008it [01:12, 110.59it/s]19<10:14, 87.78s/it]
4368it [00:42, 103.28it/s]:31<08:18, 83.04s/it]
1820it [00:16, 110.84it/s]:13<05:52, 70.57s/it]
560it [00:04, 114.93it/s]0:30<03:36, 54.10s/it]
120it [00:01, 118.62it/s]0:35<01:57, 39.19s/it]
16it [00:00, 104.89it/s]10:36<00:55, 27.66s/it]
1it [00:00, 50.93it/s] [10:36<00:19, 19.37s/it]
100%|██████████| 16/16 [10:36<00:00, 39.77s/it]


Unnamed: 0,group,r,b_pages,b_cover
4,[cover_type_Мягкий_переплёт],0.842263,-0.446504,0.000000
8,[volume],0.842171,-0.447176,-18.576132
0,[avg_rating],0.842129,-0.432689,-23.978703
73,"[cover_type_Мягкий_переплёт, volume]",0.841959,-0.462564,0.000000
14,"[publisher_Манн,_Иванов_и_Фербер]",0.841802,-0.422804,-22.603478
...,...,...,...,...
65521,"[avg_rating, cnt_reviews, pages_cnt, tirage, c...",0.043505,0.000000,0.000000
65489,"[avg_rating, pages_cnt, tirage, cover_type_Мяг...",0.043455,0.000000,0.000000
65532,"[avg_rating, pages_cnt, tirage, cover_type_Мяг...",0.043331,0.000000,0.000000
65518,"[avg_rating, cnt_reviews, pages_cnt, tirage, c...",0.043275,0.000000,0.000000


In [153]:
df_filtered = data_frame_info[(data_frame_info['b_pages'] >= 0) & (data_frame_info['b_cover'] <= 0)].sort_values(by=['r'], ascending=False)
df_filtered

Unnamed: 0,group,r,b_pages,b_cover
2,[pages_cnt],0.830180,0.0,-64.593657
56,"[pages_cnt, publisher_Манн,_Иванов_и_Фербер]",0.829327,0.0,-62.913745
17,"[avg_rating, pages_cnt]",0.828891,0.0,-65.610969
402,"[pages_cnt, publisher_АСТ, publisher_Манн,_Ива...",0.828344,0.0,-63.073107
53,"[pages_cnt, publisher_АСТ]",0.828298,0.0,-62.954811
...,...,...,...,...
65521,"[avg_rating, cnt_reviews, pages_cnt, tirage, c...",0.043505,0.0,0.000000
65489,"[avg_rating, pages_cnt, tirage, cover_type_Мяг...",0.043455,0.0,0.000000
65532,"[avg_rating, pages_cnt, tirage, cover_type_Мяг...",0.043331,0.0,0.000000
65518,"[avg_rating, cnt_reviews, pages_cnt, tirage, c...",0.043275,0.0,0.000000


In [166]:
df_filtered[50:100]

Unnamed: 0,group,r,b_pages,b_cover
350,"[pages_cnt, cover_type_Мягкий_переплёт, publis...",0.825177,0.0,0.0
405,"[pages_cnt, publisher_Азбука, publisher_Манн,_...",0.825172,0.0,-65.345831
1758,"[pages_cnt, volume, publisher_АСТ, publisher_М...",0.825166,0.0,-53.867209
2586,"[avg_rating, cnt_reviews, pages_cnt, publisher...",0.825151,0.0,-61.139282
384,"[pages_cnt, volume, publisher_АСТ]",0.825145,0.0,-54.254161
704,"[avg_rating, cnt_reviews, pages_cnt, publisher...",0.825134,0.0,-61.488652
8598,"[avg_rating, pages_cnt, publisher_АСТ, publish...",0.825109,0.0,-69.082756
1799,"[pages_cnt, publisher_Азбука, publisher_Манн,_...",0.825032,0.0,-64.456491
5596,"[pages_cnt, publisher_Азбука, publisher_Иностр...",0.824972,0.0,-64.405686
808,"[avg_rating, pages_cnt, cover_type_Мягкий_пере...",0.824799,0.0,0.0


In [164]:
df_filtered[:50].loc[1797]['group']

['pages_cnt',
 'publisher_Азбука',
 'publisher_Иностранка',
 'publisher_Манн,_Иванов_и_Фербер']

In [167]:
from tqdm import tqdm
from itertools import combinations

groups = [
       'cover_type_Мягкий_переплёт', 
       'thickness', 'width', 'length', 'volume', 'S', 'weight',
       'publisher_АСТ', 'publisher_Азбука', 'publisher_Иностранка',
       'publisher_Манн,_Иванов_и_Фербер', 'publisher_Эксмо',
        ]

to_data_frame = []

for r in tqdm(range(1, len(groups) + 1)):
    for group in tqdm(combinations(groups, r=r)):
        group = list(group)
# for group in groups_to_test:
        X_full = X.copy()
        X_reduced = X.drop(columns=group)
        model_full = sm.OLS(y, X_full).fit()
        model_reduced = sm.OLS(y, X_reduced).fit()

        f_stat, p_val, df_diff = model_full.compare_f_test(model_reduced)
        # print(f"Группа: {group}, F = {f_stat}, p = {p_val}")
        r_sq_adj = model_reduced.rsquared_adj

        try:
            b_cover = model_reduced.params['cover_type_Мягкий_переплёт']
        except:
            b_cover = 0

        try:
            b_pages = model_reduced.params['pages_cnt']
        except:
            b_pages = 0 

        aic = model_reduced.aic

        # if (b_pages >= 0):
        dct = {'group': group,
                'r': r_sq_adj,
                'b_pages' : b_pages,
                'b_cover': b_cover}

        to_data_frame.append(dct)

data_frame_info_2 = pd.DataFrame(to_data_frame)
data_frame_info_2.sort_values(by=['r'], ascending=False)

12it [00:00, 38.21it/s]00:00<?, ?it/s]
66it [00:01, 48.57it/s]00:00<00:03,  3.15it/s]
220it [00:03, 60.55it/s]0:01<00:09,  1.07it/s]
495it [00:06, 73.29it/s]0:05<00:19,  2.18s/it]
792it [00:09, 83.00it/s]0:12<00:31,  3.99s/it]
924it [00:09, 93.41it/s]0:21<00:41,  5.99s/it]
792it [00:09, 81.17it/s]0:31<00:43,  7.32s/it]
495it [00:06, 73.63it/s]0:41<00:40,  8.12s/it]
220it [00:02, 85.80it/s]0:48<00:30,  7.69s/it]
66it [00:00, 74.45it/s]00:50<00:18,  6.09s/it]
12it [00:00, 89.55it/s][00:51<00:08,  4.48s/it]
1it [00:00, 36.62it/s] [00:51<00:03,  3.15s/it]
100%|██████████| 12/12 [00:51<00:00,  4.31s/it]


Unnamed: 0,group,r,b_pages,b_cover
0,[cover_type_Мягкий_переплёт],0.842263,-0.446504,0.000000
4,[volume],0.842171,-0.447176,-18.576132
15,"[cover_type_Мягкий_переплёт, volume]",0.841959,-0.462564,0.000000
10,"[publisher_Манн,_Иванов_и_Фербер]",0.841802,-0.422804,-22.603478
21,"[cover_type_Мягкий_переплёт, publisher_Манн,_И...",0.841480,-0.445161,0.000000
...,...,...,...,...
4086,"[cover_type_Мягкий_переплёт, thickness, width,...",0.320099,0.686124,0.000000
4085,"[cover_type_Мягкий_переплёт, thickness, width,...",0.313711,0.725821,0.000000
4019,"[cover_type_Мягкий_переплёт, thickness, width,...",0.313535,0.726377,0.000000
4082,"[cover_type_Мягкий_переплёт, thickness, width,...",0.306005,0.712602,0.000000


In [169]:
df_filtered_2 = data_frame_info_2.sort_values(by=['r'], ascending=False)
df_filtered_2

Unnamed: 0,group,r,b_pages,b_cover
0,[cover_type_Мягкий_переплёт],0.842263,-0.446504,0.000000
4,[volume],0.842171,-0.447176,-18.576132
15,"[cover_type_Мягкий_переплёт, volume]",0.841959,-0.462564,0.000000
10,"[publisher_Манн,_Иванов_и_Фербер]",0.841802,-0.422804,-22.603478
21,"[cover_type_Мягкий_переплёт, publisher_Манн,_И...",0.841480,-0.445161,0.000000
...,...,...,...,...
4086,"[cover_type_Мягкий_переплёт, thickness, width,...",0.320099,0.686124,0.000000
4085,"[cover_type_Мягкий_переплёт, thickness, width,...",0.313711,0.725821,0.000000
4019,"[cover_type_Мягкий_переплёт, thickness, width,...",0.313535,0.726377,0.000000
4082,"[cover_type_Мягкий_переплёт, thickness, width,...",0.306005,0.712602,0.000000


In [170]:
df_filtered_2[:10]

Unnamed: 0,group,r,b_pages,b_cover
0,[cover_type_Мягкий_переплёт],0.842263,-0.446504,0.0
4,[volume],0.842171,-0.447176,-18.576132
15,"[cover_type_Мягкий_переплёт, volume]",0.841959,-0.462564,0.0
10,"[publisher_Манн,_Иванов_и_Фербер]",0.841802,-0.422804,-22.603478
21,"[cover_type_Мягкий_переплёт, publisher_Манн,_И...",0.84148,-0.445161,0.0
70,"[publisher_АСТ, publisher_Манн,_Иванов_и_Фербер]",0.841256,-0.429168,-22.117287
7,[publisher_АСТ],0.841213,-0.429663,-22.183297
55,"[volume, publisher_Манн,_Иванов_и_Фербер]",0.841196,-0.450615,-16.026493
110,"[cover_type_Мягкий_переплёт, volume, publisher...",0.841051,-0.46385,0.0
125,"[cover_type_Мягкий_переплёт, publisher_АСТ, pu...",0.84095,-0.450966,0.0


In [186]:
sm.OLS(y, 
        X.drop(columns=[
       'thickness', 'width', 'length', 'volume', 'weight'])
).fit().summary()

0,1,2,3
Dep. Variable:,price,R-squared:,0.664
Model:,OLS,Adj. R-squared:,0.662
Method:,Least Squares,F-statistic:,313.4
Date:,"Tue, 06 May 2025",Prob (F-statistic):,0.0
Time:,22:37:13,Log-Likelihood:,-21360.0
No. Observations:,3190,AIC:,42760.0
Df Residuals:,3169,BIC:,42890.0
Df Model:,20,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-786.2062,67.402,-11.664,0.000,-918.362,-654.051
avg_rating,63.4577,10.264,6.182,0.000,43.332,83.583
cnt_reviews,0.3496,0.035,10.084,0.000,0.282,0.418
pages_cnt,0.4473,0.017,26.108,0.000,0.414,0.481
tirage,-0.0194,0.002,-12.540,0.000,-0.022,-0.016
publisher_АСТ,-117.2113,28.426,-4.123,0.000,-172.947,-61.476
publisher_Азбука,-106.6558,29.201,-3.652,0.000,-163.911,-49.401
publisher_Иностранка,38.3213,32.722,1.171,0.242,-25.837,102.480
"publisher_Манн,_Иванов_и_Фербер",-16.2341,36.387,-0.446,0.656,-87.579,55.111

0,1,2,3
Omnibus:,1600.424,Durbin-Watson:,2.036
Prob(Omnibus):,0.0,Jarque-Bera (JB):,45337.35
Skew:,1.806,Prob(JB):,0.0
Kurtosis:,21.112,Cond. No.,168000.0


In [143]:
data_frame_info['b_pages'].max()

0.0

In [133]:
data_frame_info = pd.DataFrame(to_data_frame)
data_frame_info

Unnamed: 0,group,r,b_pages
0,"[cnt_reviews, publication_year_2025, cover_typ...",0.360659,0.641586
1,"[cnt_reviews, publication_year_2025, cover_typ...",0.360659,0.641586
2,"[cnt_reviews, publication_year_2025, cover_typ...",0.360659,0.641586


In [125]:
to_data_frame

[]

In [109]:
dff = pd.DataFrame(to_data_frame).sort_values(by='r', ascending=False)[:10]
dff

Unnamed: 0,group,r,b_pages
15690,"[publisher_Манн,_Иванов_и_Фербер, publication_...",0.651297,0.465441
15195,"[publisher_Иностранка, publication_year_2022, ...",0.651272,0.466996
41106,"[publisher_Иностранка, publisher_Манн,_Иванов_...",0.651266,0.469725
4436,"[publisher_Манн,_Иванов_и_Фербер, publication_...",0.651234,0.465414
4485,"[publisher_Манн,_Иванов_и_Фербер, publication_...",0.651213,0.46558
4430,"[publisher_Манн,_Иванов_и_Фербер, publication_...",0.651213,0.465448
14777,"[publisher_Иностранка, publisher_Манн,_Иванов_...",0.651206,0.469651
4216,"[publisher_Иностранка, publication_year_2022, ...",0.651206,0.467003
4854,"[publication_year_2022, publication_year_2024,...",0.651204,0.46533
14771,"[publisher_Иностранка, publisher_Манн,_Иванов_...",0.651189,0.469613


In [None]:
dff

In [100]:
X['width'] = data['width']
# X.drop(columns=['weight', 'volume', 'thickness', 'length', 'width', 'S'], inplace = True)
model = sm.OLS(y, X).fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                  price   R-squared:                       0.653
Model:                            OLS   Adj. R-squared:                  0.651
Method:                 Least Squares   F-statistic:                     298.2
Date:                Tue, 06 May 2025   Prob (F-statistic):               0.00
Time:                        21:14:25   Log-Likelihood:                -21412.
No. Observations:                3190   AIC:                         4.287e+04
Df Residuals:                    3169   BIC:                         4.299e+04
Df Model:                          20                                         
Covariance Type:            nonrobust                                         
                                      coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------------------
const     

Тк часть публикационных годов незначима, давайте объединим - судя по ящику с усами 24 и 25 год стоит дороже

In [74]:
X['2024_or_2025'] = X['publication_year_2024'] + X['publication_year_2025']
X.drop(columns=['publication_year_2021', 'publication_year_2022',
       'publication_year_2023', 'publication_year_2024', 'publication_year_2025'], inplace=True)

In [75]:
groups_to_test = [['2024_or_2025']]
for group in groups_to_test:
    X_full = X.copy()
    X_reduced = X.drop(columns=group)
    model_full = sm.OLS(y, X_full).fit()
    model_reduced = sm.OLS(y, X_reduced).fit()

    f_stat, p_val, df_diff = model_full.compare_f_test(model_reduced)
    print(f"Группа: {group}, F = {f_stat}, p = {p_val}, коэффициент оказался значим")

Группа: ['2024_or_2025'], F = 9.262100071087392, p = 0.0023586145341476824, коэффициент оказался значим


In [76]:
model = sm.OLS(y, X).fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                  price   R-squared:                       0.527
Model:                            OLS   Adj. R-squared:                  0.525
Method:                 Least Squares   F-statistic:                     235.6
Date:                Tue, 06 May 2025   Prob (F-statistic):               0.00
Time:                        11:36:13   Log-Likelihood:                -21907.
No. Observations:                3190   AIC:                         4.385e+04
Df Residuals:                    3174   BIC:                         4.394e+04
Df Model:                          15                                         
Covariance Type:            nonrobust                                         
                                      coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------------------
const     

Некоторые рейтинги незначимы, давайте сделаем точно также, 0+ и 6+ стоят дороже. Оставшиеся объединим

In [77]:
X['6-'] = X['reading_age_0+'] + X['reading_age_6+']
X['16+'] = X['reading_age_16+'] + X['reading_age_18+']
X.drop(columns=['reading_age_0+', 'reading_age_6+', 'reading_age_16+', 'reading_age_18+'], inplace=True)

In [78]:
model = sm.OLS(y, X).fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                  price   R-squared:                       0.524
Model:                            OLS   Adj. R-squared:                  0.522
Method:                 Least Squares   F-statistic:                     268.6
Date:                Tue, 06 May 2025   Prob (F-statistic):               0.00
Time:                        11:36:16   Log-Likelihood:                -21918.
No. Observations:                3190   AIC:                         4.386e+04
Df Residuals:                    3176   BIC:                         4.395e+04
Df Model:                          13                                         
Covariance Type:            nonrobust                                         
                                      coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------------------
const     

Остался незначимым коэффициент при издательстве `publisher_Манн,_Иванов_и_Фербер` По нему мало наблюдений, дропнем его, отнеся в категорию другое

In [79]:
X.drop(columns=['publisher_Манн,_Иванов_и_Фербер'], inplace=True)

In [80]:
model = sm.OLS(y, X).fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                  price   R-squared:                       0.524
Model:                            OLS   Adj. R-squared:                  0.522
Method:                 Least Squares   F-statistic:                     291.0
Date:                Tue, 06 May 2025   Prob (F-statistic):               0.00
Time:                        11:36:20   Log-Likelihood:                -21918.
No. Observations:                3190   AIC:                         4.386e+04
Df Residuals:                    3177   BIC:                         4.394e+04
Df Model:                          12                                         
Covariance Type:            nonrobust                                         
                                 coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------------
const               

Теперь сделаем возвращение удаленных признаков, посмотрим, как они себя ведут

In [81]:
groups_to_test = [['publication_year_2021', 'publication_year_2022', 'publication_year_2023'],
                  ['publisher_Манн,_Иванов_и_Фербер']]
for group in groups_to_test:
    X_full = X.copy()
    X_reduced = X.copy()
    X_full[group] = data[group]
    model_full = sm.OLS(y, X_full).fit()
    model_reduced = sm.OLS(y, X_reduced).fit()

    f_stat, p_val, df_diff = model_full.compare_f_test(model_reduced)
    print(f"Группа: {group}, F = {f_stat}, p = {p_val}")
print('Все выкинутые группы по-прежнему незначимы')

Группа: ['publication_year_2021', 'publication_year_2022', 'publication_year_2023'], F = 1.0349866339120282, p = 0.3758769904309481
Группа: ['publisher_Манн,_Иванов_и_Фербер'], F = 0.011764867815294514, p = 0.9136329206973712
Все выкинутые группы по-прежнему незначимы


In [82]:
X.columns

Index(['const', 'avg_rating', 'cnt_reviews', 'pages_cnt', 'tirage',
       'publisher_АСТ', 'publisher_Азбука', 'publisher_Иностранка',
       'publisher_Эксмо', 'cover_type_Мягкий_переплёт', '2024_or_2025', '6-',
       '16+'],
      dtype='object')

### Итоговый набор признаков:
['const', 'avg_rating', 'cnt_reviews', 'pages_cnt', 'tirage',
       'publisher_АСТ', 'publisher_Азбука', 'publisher_Иностранка',
       'publisher_Эксмо', 'cover_type_Мягкий_переплёт', '2024_or_2025', '6-',
       '16+']

### Сохраняем данные

In [83]:
data_to_csv = pd.concat([y, X], axis=1)
data_to_csv

Unnamed: 0,price,const,avg_rating,cnt_reviews,pages_cnt,tirage,publisher_АСТ,publisher_Азбука,publisher_Иностранка,publisher_Эксмо,cover_type_Мягкий_переплёт,2024_or_2025,6-,16+
0,312,1.0,4.1,925,512,30000,1.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0
1,312,1.0,4.1,1341,320,30000,1.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0
2,284,1.0,4.5,363,192,25000,1.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0
3,339,1.0,4.3,872,288,12000,1.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0
4,284,1.0,4.2,3004,320,30000,1.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3185,1241,1.0,5.0,3,624,1500,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
3186,2207,1.0,3.7,53,448,1500,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
3187,1103,1.0,4.3,42,784,3000,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
3188,1011,1.0,3.8,29,704,2000,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0


In [84]:
data_to_csv.to_csv('data_after_feature_selection.csv', index=False)

## Сравнение функциональных форм на основе теста Бокса-Кокса и PE теста:

**Тест Бокса-Кокса с преобразование Зарембки (адекватный и рабочий)**

In [50]:
geo_mean = gmean(y)
y_help = y/geo_mean
ln_y_help = np.log(y_help)

lin_model = sm.OLS(y_help, X).fit()
rss1 = np.sum(lin_model.resid**2)

semi_log_model = sm.OLS(ln_y_help, X).fit()
rss2 = np.sum(semi_log_model.resid**2)

chi_stat = X.shape[0]/2 * np.abs(np.log(rss1/rss2))
print(f'{chi_stat} > 3.84 значит, между моделями есть существенное различие, выбираем на основе RSS')

1456.1341744874328 > 3.84 значит, между моделями есть существенное различие, выбираем на основе RSS


In [51]:
if rss2 < rss1:
  print('Полулогарифмическая модель предпочтительнее')
else:
  print('Линейная модель предпочтительнее')

Полулогарифмическая модель предпочтительнее


**PE тест**

In [54]:
model_log = sm.OLS(ln_y, X).fit()

yhat_lin = model.fittedvalues
ln_yhat_log = model_log.fittedvalues
yhat_log = np.exp(ln_yhat_log)

mask = yhat_lin >= 0

yhat_lin = yhat_lin[mask]
yhat_log = yhat_log[mask]
ln_y = ln_y[mask]
ln_yhat_log = ln_yhat_log[mask]
X_masked = X[mask]

# Шаг 2: Вспомогательные регрессии
aux_log_X = X_masked.copy()
aux_log_X['delta'] = yhat_lin - yhat_log
aux_log_model = sm.OLS(ln_y, aux_log_X).fit()

aux_lin_X = X_masked.copy()
aux_lin_X['delta'] = ln_yhat_log - np.log(yhat_lin)
aux_lin_model = sm.OLS(y[mask], aux_lin_X).fit()

print("Модель для полулогарифмической спецификации")
print()
print(aux_log_model.summary())

print("Модель для линейной спецификации")
print()
print(aux_lin_model.summary())


Модель для полулогарифмической спецификации

                            OLS Regression Results                            
Dep. Variable:                  price   R-squared:                       0.611
Model:                            OLS   Adj. R-squared:                  0.609
Method:                 Least Squares   F-statistic:                     382.6
Date:                Tue, 06 May 2025   Prob (F-statistic):               0.00
Time:                        11:23:12   Log-Likelihood:                -824.59
No. Observations:                3181   AIC:                             1677.
Df Residuals:                    3167   BIC:                             1762.
Df Model:                          13                                         
Covariance Type:            nonrobust                                         
                                 coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------

**Вывод:** все значимо, нужны другие функциональные формы

## Сравнение функциональных форм на основе нормированной R^2:

In [55]:
model.summary()

0,1,2,3
Dep. Variable:,price,R-squared:,0.524
Model:,OLS,Adj. R-squared:,0.522
Method:,Least Squares,F-statistic:,291.0
Date:,"Tue, 06 May 2025",Prob (F-statistic):,0.0
Time:,11:23:19,Log-Likelihood:,-21918.0
No. Observations:,3190,AIC:,43860.0
Df Residuals:,3177,BIC:,43940.0
Df Model:,12,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,239.0553,58.455,4.090,0.000,124.442,353.668
avg_rating,71.4526,12.078,5.916,0.000,47.770,95.135
cnt_reviews,0.3925,0.041,9.640,0.000,0.313,0.472
pages_cnt,0.5622,0.020,28.217,0.000,0.523,0.601
tirage,-0.0243,0.002,-13.623,0.000,-0.028,-0.021
publisher_АСТ,-195.0463,22.296,-8.748,0.000,-238.761,-151.331
publisher_Азбука,-172.3658,23.565,-7.314,0.000,-218.570,-126.162
publisher_Иностранка,73.1580,29.507,2.479,0.013,15.303,131.013
publisher_Эксмо,-155.6886,22.102,-7.044,0.000,-199.023,-112.354

0,1,2,3
Omnibus:,2435.121,Durbin-Watson:,2.015
Prob(Omnibus):,0.0,Jarque-Bera (JB):,98417.195
Skew:,3.231,Prob(JB):,0.0
Kurtosis:,29.433,Cond. No.,75400.0


In [None]:
ln_X = np.log(data[num_features])
ln_X = pd.concat([ln_X, X.drop(columns=num_features)], axis=1)
ln_X = sm.add_constant(ln_X)

model_ln_X = sm.OLS(y, ln_X).fit()

In [None]:
model_ln_X.summary()

0,1,2,3
Dep. Variable:,price,R-squared:,0.72
Model:,OLS,Adj. R-squared:,0.717
Method:,Least Squares,F-statistic:,251.1
Date:,"Mon, 05 May 2025",Prob (F-statistic):,0.0
Time:,07:49:09,Log-Likelihood:,-21756.0
No. Observations:,3256,AIC:,43580.0
Df Residuals:,3222,BIC:,43790.0
Df Model:,33,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
avg_rating,89.7436,49.336,1.819,0.069,-6.990,186.478
cnt_reviews,13.7996,3.077,4.484,0.000,7.766,19.833
pages_cnt,-321.2715,18.825,-17.066,0.000,-358.181,-284.362
tirage,-84.9547,7.907,-10.744,0.000,-100.459,-69.451
weight,717.8490,17.604,40.778,0.000,683.333,752.365
thickness,-63.6595,20.762,-3.066,0.002,-104.367,-22.952
width,94.4715,26.354,3.585,0.000,42.800,146.143
length,43.6833,37.006,1.180,0.238,-28.873,116.240
volume,74.4954,14.031,5.309,0.000,46.985,102.006

0,1,2,3
Omnibus:,3254.983,Durbin-Watson:,2.013
Prob(Omnibus):,0.0,Jarque-Bera (JB):,337197.789
Skew:,4.616,Prob(JB):,0.0
Kurtosis:,51.993,Cond. No.,1.01e+16


**Вывод:** при сравнении ln_X - y с X - y нормированный R^2 лучше у X - y

In [None]:
ln_X.shape[0], len(ln_y)

(3256, 3252)

In [None]:
model_ln_X_ln_y = sm.OLS(ln_y, ln_X).fit()
model_ln_X_ln_y.summary()

ValueError: The indices for endog and exog are not aligned

In [None]:
model_ln_y = sm.OLS(ln_y, X).fit()
model_ln_y.summary()

ValueError: The indices for endog and exog are not aligned

**Вывод:** незначительно лучше ln_y - X, чем ln_y - ln_X