In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import gmean

import statsmodels.api as sm
from scipy.stats import t, f, boxcox, skew, kurtosis
from statsmodels.stats.diagnostic import linear_reset, het_white


import warnings
warnings.filterwarnings('ignore')

### Чтение и предобработка

In [2]:
data = pd.read_csv('data_after_processing.csv')
data.columns = [col.replace(' ', '_') for col in data.columns]

In [3]:
not_null = ['thickness', 'width', 'length', 'volume']
data = data[(data[not_null] > 0).all(axis=1)]

In [4]:
#data = data.drop(['title'], axis=1)
data = data.drop(['author_Другой', 'publisher_Другой', 'publication_year_Другой',
                  'cover_type_Мягкий_заламинированный_картон', 'cover_type_Твёрдый_переплёт', 'reading_age_12+', 'is_russian_author_0', 'is_russian_author_1'], axis=1)

In [5]:
data.columns

Index(['price', 'avg_rating', 'cnt_reviews', 'pages_cnt', 'tirage', 'weight',
       'author_Джейн_Остен', 'author_Джек_Лондон', 'author_Джордж_Оруэлл',
       'author_Лев_Толстой', 'author_Луиза_Мэй_Олкотт',
       'author_Михаил_Булгаков', 'author_Николай_Гоголь',
       'author_Федор_Достоевский', 'author_Эрих_Ремарк', 'publisher_АСТ',
       'publisher_Азбука', 'publisher_Иностранка',
       'publisher_Манн,_Иванов_и_Фербер', 'publisher_Эксмо',
       'publication_year_2021', 'publication_year_2022',
       'publication_year_2023', 'publication_year_2024',
       'publication_year_2025', 'cover_type_Мягкий_переплёт', 'reading_age_0+',
       'reading_age_16+', 'reading_age_18+', 'reading_age_6+', 'thickness',
       'width', 'length', 'volume'],
      dtype='object')

In [6]:
X, y = data.drop(['price'], axis=1), data['price']
X = sm.add_constant(X)

## Отбор объясняющих признаков

### Удаляем авторов
Было принято решение удалить всех авторов тк по ним очень мало наблюдений

In [7]:
authors_columns = ['author_Джейн_Остен', 'author_Джек_Лондон', 'author_Джордж_Оруэлл',
       'author_Лев_Толстой', 'author_Луиза_Мэй_Олкотт',
       'author_Михаил_Булгаков', 'author_Николай_Гоголь',
       'author_Федор_Достоевский', 'author_Эрих_Ремарк']
    
X[authors_columns].sum()

author_Джейн_Остен           68.0
author_Джек_Лондон           69.0
author_Джордж_Оруэлл         64.0
author_Лев_Толстой           46.0
author_Луиза_Мэй_Олкотт      48.0
author_Михаил_Булгаков       71.0
author_Николай_Гоголь        47.0
author_Федор_Достоевский    114.0
author_Эрих_Ремарк           66.0
dtype: float64

In [8]:
X.drop(columns = ['author_Джейн_Остен', 'author_Джек_Лондон', 'author_Джордж_Оруэлл',
       'author_Лев_Толстой', 'author_Луиза_Мэй_Олкотт',
       'author_Михаил_Булгаков', 'author_Николай_Гоголь',
       'author_Федор_Достоевский', 'author_Эрих_Ремарк'], inplace=True)

### Размерные признаки

Выкидываем по-очереди вес, толщину, объем, страницы, площадь, длину, ширину смотрим на R2 и на коэффициент при обложке

In [9]:
X['S'] = X['length'] * X['width']

Выбираем, какой признак удалить. Выводим $R^2_{adj}$ и коэффициент при pages_cnt

In [10]:
from itertools import combinations
groups = ['weight', 'thickness', 'volume', 'width', 'length', 'S']

to_data_frame = []

for r in range (1, len(groups) + 1):
    for group in combinations(groups, r=r):
        group = list(group)
# for group in groups_to_test:
        X_full = X.copy()
        X_reduced = X.drop(columns=group)
        model_full = sm.OLS(y, X_full).fit()
        model_reduced = sm.OLS(y, X_reduced).fit()

        f_stat, p_val, df_diff = model_full.compare_f_test(model_reduced)
        # print(f"Группа: {group}, F = {f_stat}, p = {p_val}")
        r = model_reduced.rsquared_adj
        # b_cover = model_reduced.params['cover_type_Мягкий_переплёт']
        b_pages = model_reduced.params['pages_cnt']
        aic = model_reduced.aic

        if (b_pages > 0):
            dct = {'group': group,
                  'r': r,
                  'b_pages' : b_pages}

            to_data_frame.append(dct)

data_frame_info = pd.DataFrame(to_data_frame)
data_frame_info.sort_values(by=['r'], ascending=False)

Unnamed: 0,group,r,b_pages
0,[weight],0.742979,0.198612
3,"[weight, width]",0.728274,0.216185
14,"[weight, width, S]",0.728023,0.215811
5,"[weight, S]",0.72802,0.215338
4,"[weight, length]",0.722819,0.215002
13,"[weight, width, length]",0.721443,0.210861
23,"[weight, width, length, S]",0.720517,0.210317
15,"[weight, length, S]",0.720431,0.210419
1,"[weight, thickness]",0.711167,0.005746
7,"[weight, thickness, width]",0.696173,0.02258


In [11]:
X.columns

Index(['const', 'avg_rating', 'cnt_reviews', 'pages_cnt', 'tirage', 'weight',
       'publisher_АСТ', 'publisher_Азбука', 'publisher_Иностранка',
       'publisher_Манн,_Иванов_и_Фербер', 'publisher_Эксмо',
       'publication_year_2021', 'publication_year_2022',
       'publication_year_2023', 'publication_year_2024',
       'publication_year_2025', 'cover_type_Мягкий_переплёт', 'reading_age_0+',
       'reading_age_16+', 'reading_age_18+', 'reading_age_6+', 'thickness',
       'width', 'length', 'volume', 'S'],
      dtype='object')

In [12]:
from tqdm import tqdm
from itertools import combinations

groups = [
        'avg_rating', 'cnt_reviews', 'pages_cnt', 'tirage', 
       'cover_type_Мягкий_переплёт', 
       'thickness', 'width', 'length', 'volume', 'S', 'weight',
       'publisher_АСТ', 'publisher_Азбука', 'publisher_Иностранка',
       'publisher_Манн,_Иванов_и_Фербер', 'publisher_Эксмо',
        ]

to_data_frame = []

for r in tqdm(range(1, len(groups) + 1)):
    for group in tqdm(combinations(groups, r=r)):
        group = list(group)
# for group in groups_to_test:
        X_full = X.copy()
        X_reduced = X.drop(columns=group)
        model_full = sm.OLS(y, X_full).fit()
        model_reduced = sm.OLS(y, X_reduced).fit()

        f_stat, p_val, df_diff = model_full.compare_f_test(model_reduced)
        # print(f"Группа: {group}, F = {f_stat}, p = {p_val}")
        r_sq_adj = model_reduced.rsquared_adj

        try:
            b_cover = model_reduced.params['cover_type_Мягкий_переплёт']
        except:
            b_cover = 0

        try:
            b_pages = model_reduced.params['pages_cnt']
        except:
            b_pages = 0 

        aic = model_reduced.aic

        # if (b_pages >= 0):
        dct = {'group': group,
                'r': r_sq_adj,
                'b_pages' : b_pages,
                'b_cover': b_cover}

        to_data_frame.append(dct)

data_frame_info = pd.DataFrame(to_data_frame)
data_frame_info.sort_values(by=['r'], ascending=False)

  0%|                                                    | 0/16 [00:00<?, ?it/s]
0it [00:00, ?it/s][A
16it [00:00, 81.64it/s][A
  6%|██▊                                         | 1/16 [00:00<00:02,  5.07it/s]
0it [00:00, ?it/s][A
7it [00:00, 69.03it/s][A
17it [00:00, 86.95it/s][A
28it [00:00, 94.57it/s][A
38it [00:00, 91.99it/s][A
48it [00:00, 90.25it/s][A
58it [00:00, 91.21it/s][A
69it [00:00, 95.02it/s][A
79it [00:00, 94.52it/s][A
90it [00:00, 97.98it/s][A
100it [00:01, 97.86it/s][A
120it [00:01, 94.94it/s][A
 12%|█████▌                                      | 2/16 [00:01<00:11,  1.20it/s]
0it [00:00, ?it/s][A
13it [00:00, 124.91it/s][A
26it [00:00, 113.82it/s][A
38it [00:00, 115.79it/s][A
50it [00:00, 112.95it/s][A
62it [00:00, 112.87it/s][A
75it [00:00, 117.93it/s][A
87it [00:00, 112.47it/s][A
99it [00:00, 94.43it/s] [A
110it [00:01, 96.67it/s][A
121it [00:01, 98.76it/s][A
133it [00:01, 102.81it/s][A
144it [00:01, 102.36it/s][A
157it [00:01, 108.98it/s][A


Unnamed: 0,group,r,b_pages,b_cover
4,[cover_type_Мягкий_переплёт],0.842263,-0.446504,0.000000
8,[volume],0.842171,-0.447176,-18.576132
0,[avg_rating],0.842129,-0.432689,-23.978703
73,"[cover_type_Мягкий_переплёт, volume]",0.841959,-0.462564,0.000000
14,"[publisher_Манн,_Иванов_и_Фербер]",0.841802,-0.422804,-22.603478
...,...,...,...,...
65521,"[avg_rating, cnt_reviews, pages_cnt, tirage, c...",0.043505,0.000000,0.000000
65489,"[avg_rating, pages_cnt, tirage, cover_type_Мяг...",0.043455,0.000000,0.000000
65532,"[avg_rating, pages_cnt, tirage, cover_type_Мяг...",0.043331,0.000000,0.000000
65518,"[avg_rating, cnt_reviews, pages_cnt, tirage, c...",0.043275,0.000000,0.000000


In [13]:
df_filtered = data_frame_info[(data_frame_info['b_pages'] >= 0) & (data_frame_info['b_cover'] <= 0)].sort_values(by=['r'], ascending=False)
df_filtered

Unnamed: 0,group,r,b_pages,b_cover
2,[pages_cnt],0.830180,0.0,-64.593657
56,"[pages_cnt, publisher_Манн,_Иванов_и_Фербер]",0.829327,0.0,-62.913745
17,"[avg_rating, pages_cnt]",0.828891,0.0,-65.610969
402,"[pages_cnt, publisher_АСТ, publisher_Манн,_Ива...",0.828344,0.0,-63.073107
53,"[pages_cnt, publisher_АСТ]",0.828298,0.0,-62.954811
...,...,...,...,...
65521,"[avg_rating, cnt_reviews, pages_cnt, tirage, c...",0.043505,0.0,0.000000
65489,"[avg_rating, pages_cnt, tirage, cover_type_Мяг...",0.043455,0.0,0.000000
65532,"[avg_rating, pages_cnt, tirage, cover_type_Мяг...",0.043331,0.0,0.000000
65518,"[avg_rating, cnt_reviews, pages_cnt, tirage, c...",0.043275,0.0,0.000000


In [14]:
df_filtered[50:100]

Unnamed: 0,group,r,b_pages,b_cover
350,"[pages_cnt, cover_type_Мягкий_переплёт, publis...",0.825177,0.0,0.0
405,"[pages_cnt, publisher_Азбука, publisher_Манн,_...",0.825172,0.0,-65.345831
1758,"[pages_cnt, volume, publisher_АСТ, publisher_М...",0.825166,0.0,-53.867209
2586,"[avg_rating, cnt_reviews, pages_cnt, publisher...",0.825151,0.0,-61.139282
384,"[pages_cnt, volume, publisher_АСТ]",0.825145,0.0,-54.254161
704,"[avg_rating, cnt_reviews, pages_cnt, publisher...",0.825134,0.0,-61.488652
8598,"[avg_rating, pages_cnt, publisher_АСТ, publish...",0.825109,0.0,-69.082756
1799,"[pages_cnt, publisher_Азбука, publisher_Манн,_...",0.825032,0.0,-64.456491
5596,"[pages_cnt, publisher_Азбука, publisher_Иностр...",0.824972,0.0,-64.405686
808,"[avg_rating, pages_cnt, cover_type_Мягкий_пере...",0.824799,0.0,0.0


In [15]:
df_filtered[:50].loc[1797]['group']

['pages_cnt',
 'publisher_Азбука',
 'publisher_Иностранка',
 'publisher_Манн,_Иванов_и_Фербер']

In [16]:
from tqdm import tqdm
from itertools import combinations

groups = [
       'cover_type_Мягкий_переплёт', 
       'thickness', 'width', 'length', 'volume', 'S', 'weight',
       'publisher_АСТ', 'publisher_Азбука', 'publisher_Иностранка',
       'publisher_Манн,_Иванов_и_Фербер', 'publisher_Эксмо',
        ]

to_data_frame = []

for r in tqdm(range(1, len(groups) + 1)):
    for group in tqdm(combinations(groups, r=r)):
        group = list(group)
# for group in groups_to_test:
        X_full = X.copy()
        X_reduced = X.drop(columns=group)
        model_full = sm.OLS(y, X_full).fit()
        model_reduced = sm.OLS(y, X_reduced).fit()

        f_stat, p_val, df_diff = model_full.compare_f_test(model_reduced)
        # print(f"Группа: {group}, F = {f_stat}, p = {p_val}")
        r_sq_adj = model_reduced.rsquared_adj

        try:
            b_cover = model_reduced.params['cover_type_Мягкий_переплёт']
        except:
            b_cover = 0

        try:
            b_pages = model_reduced.params['pages_cnt']
        except:
            b_pages = 0 

        aic = model_reduced.aic

        # if (b_pages >= 0):
        dct = {'group': group,
                'r': r_sq_adj,
                'b_pages' : b_pages,
                'b_cover': b_cover}

        to_data_frame.append(dct)

data_frame_info_2 = pd.DataFrame(to_data_frame)
data_frame_info_2.sort_values(by=['r'], ascending=False)

  0%|                                                    | 0/12 [00:00<?, ?it/s]
12it [00:00, 154.78it/s]

0it [00:00, ?it/s][A
11it [00:00, 108.27it/s][A
23it [00:00, 114.83it/s][A
38it [00:00, 129.61it/s][A
51it [00:00, 127.34it/s][A
66it [00:00, 120.14it/s][A
 17%|███████▎                                    | 2/12 [00:00<00:03,  3.14it/s]
0it [00:00, ?it/s][A
6it [00:00, 52.96it/s][A
16it [00:00, 73.33it/s][A
26it [00:00, 83.26it/s][A
39it [00:00, 99.86it/s][A
52it [00:00, 107.30it/s][A
67it [00:00, 118.76it/s][A
81it [00:00, 124.31it/s][A
94it [00:00, 115.45it/s][A
106it [00:00, 112.71it/s][A
119it [00:01, 117.07it/s][A
131it [00:01, 104.62it/s][A
146it [00:01, 115.78it/s][A
160it [00:01, 121.10it/s][A
173it [00:01, 119.60it/s][A
186it [00:01, 122.19it/s][A
199it [00:01, 118.57it/s][A
220it [00:02, 106.59it/s][A
 25%|███████████                                 | 3/12 [00:02<00:09,  1.05s/it]
0it [00:00, ?it/s][A
13it [00:00, 87.62it/s][A
22it [00:00, 35.81

Unnamed: 0,group,r,b_pages,b_cover
0,[cover_type_Мягкий_переплёт],0.842263,-0.446504,0.000000
4,[volume],0.842171,-0.447176,-18.576132
15,"[cover_type_Мягкий_переплёт, volume]",0.841959,-0.462564,0.000000
10,"[publisher_Манн,_Иванов_и_Фербер]",0.841802,-0.422804,-22.603478
21,"[cover_type_Мягкий_переплёт, publisher_Манн,_И...",0.841480,-0.445161,0.000000
...,...,...,...,...
4086,"[cover_type_Мягкий_переплёт, thickness, width,...",0.320099,0.686124,0.000000
4085,"[cover_type_Мягкий_переплёт, thickness, width,...",0.313711,0.725821,0.000000
4019,"[cover_type_Мягкий_переплёт, thickness, width,...",0.313535,0.726377,0.000000
4082,"[cover_type_Мягкий_переплёт, thickness, width,...",0.306005,0.712602,0.000000


In [17]:
df_filtered_2 = data_frame_info_2.sort_values(by=['r'], ascending=False)
df_filtered_2

Unnamed: 0,group,r,b_pages,b_cover
0,[cover_type_Мягкий_переплёт],0.842263,-0.446504,0.000000
4,[volume],0.842171,-0.447176,-18.576132
15,"[cover_type_Мягкий_переплёт, volume]",0.841959,-0.462564,0.000000
10,"[publisher_Манн,_Иванов_и_Фербер]",0.841802,-0.422804,-22.603478
21,"[cover_type_Мягкий_переплёт, publisher_Манн,_И...",0.841480,-0.445161,0.000000
...,...,...,...,...
4086,"[cover_type_Мягкий_переплёт, thickness, width,...",0.320099,0.686124,0.000000
4085,"[cover_type_Мягкий_переплёт, thickness, width,...",0.313711,0.725821,0.000000
4019,"[cover_type_Мягкий_переплёт, thickness, width,...",0.313535,0.726377,0.000000
4082,"[cover_type_Мягкий_переплёт, thickness, width,...",0.306005,0.712602,0.000000


In [18]:
df_filtered_2[:10]

Unnamed: 0,group,r,b_pages,b_cover
0,[cover_type_Мягкий_переплёт],0.842263,-0.446504,0.0
4,[volume],0.842171,-0.447176,-18.576132
15,"[cover_type_Мягкий_переплёт, volume]",0.841959,-0.462564,0.0
10,"[publisher_Манн,_Иванов_и_Фербер]",0.841802,-0.422804,-22.603478
21,"[cover_type_Мягкий_переплёт, publisher_Манн,_И...",0.84148,-0.445161,0.0
70,"[publisher_АСТ, publisher_Манн,_Иванов_и_Фербер]",0.841256,-0.429168,-22.117287
7,[publisher_АСТ],0.841213,-0.429663,-22.183297
55,"[volume, publisher_Манн,_Иванов_и_Фербер]",0.841196,-0.450615,-16.026493
110,"[cover_type_Мягкий_переплёт, volume, publisher...",0.841051,-0.46385,0.0
125,"[cover_type_Мягкий_переплёт, publisher_АСТ, pu...",0.84095,-0.450966,0.0


In [19]:
sm.OLS(y, 
        X.drop(columns=[
       'thickness', 'width', 'length', 'volume', 'weight'])
).fit().summary()

0,1,2,3
Dep. Variable:,price,R-squared:,0.664
Model:,OLS,Adj. R-squared:,0.662
Method:,Least Squares,F-statistic:,313.4
Date:,"Thu, 08 May 2025",Prob (F-statistic):,0.0
Time:,21:43:20,Log-Likelihood:,-21360.0
No. Observations:,3190,AIC:,42760.0
Df Residuals:,3169,BIC:,42890.0
Df Model:,20,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-786.2062,67.402,-11.664,0.000,-918.362,-654.051
avg_rating,63.4577,10.264,6.182,0.000,43.332,83.583
cnt_reviews,0.3496,0.035,10.084,0.000,0.282,0.418
pages_cnt,0.4473,0.017,26.108,0.000,0.414,0.481
tirage,-0.0194,0.002,-12.540,0.000,-0.022,-0.016
publisher_АСТ,-117.2113,28.426,-4.123,0.000,-172.947,-61.476
publisher_Азбука,-106.6558,29.201,-3.652,0.000,-163.911,-49.401
publisher_Иностранка,38.3213,32.722,1.171,0.242,-25.837,102.480
"publisher_Манн,_Иванов_и_Фербер",-16.2341,36.387,-0.446,0.656,-87.579,55.111

0,1,2,3
Omnibus:,1600.424,Durbin-Watson:,2.036
Prob(Omnibus):,0.0,Jarque-Bera (JB):,45337.35
Skew:,1.806,Prob(JB):,0.0
Kurtosis:,21.112,Cond. No.,168000.0


In [20]:
data_frame_info['b_pages'].max()

0.7513694349976455

In [21]:
data_frame_info = pd.DataFrame(to_data_frame)
data_frame_info

Unnamed: 0,group,r,b_pages,b_cover
0,[cover_type_Мягкий_переплёт],0.842263,-0.446504,0.000000
1,[thickness],0.840834,-0.498145,-11.909964
2,[width],0.838651,-0.435703,9.115644
3,[length],0.837201,-0.444235,19.435177
4,[volume],0.842171,-0.447176,-18.576132
...,...,...,...,...
4090,"[cover_type_Мягкий_переплёт, thickness, width,...",0.560537,0.522558,0.000000
4091,"[cover_type_Мягкий_переплёт, thickness, length...",0.627190,0.487757,0.000000
4092,"[cover_type_Мягкий_переплёт, width, length, vo...",0.387221,0.003965,0.000000
4093,"[thickness, width, length, volume, S, weight, ...",0.494150,0.606361,-304.822362


In [22]:
to_data_frame

[{'group': ['cover_type_Мягкий_переплёт'],
  'r': 0.8422632177009954,
  'b_pages': -0.44650416875498816,
  'b_cover': 0},
 {'group': ['thickness'],
  'r': 0.8408335785690062,
  'b_pages': -0.49814493830416307,
  'b_cover': -11.909964159552912},
 {'group': ['width'],
  'r': 0.8386510517125491,
  'b_pages': -0.43570318743375414,
  'b_cover': 9.115643855753332},
 {'group': ['length'],
  'r': 0.8372006493583477,
  'b_pages': -0.4442351257955762,
  'b_cover': 19.435176572629494},
 {'group': ['volume'],
  'r': 0.8421711383013855,
  'b_pages': -0.44717625611759015,
  'b_cover': -18.576132109285748},
 {'group': ['S'],
  'r': 0.8379378020737981,
  'b_pages': -0.43559596801765776,
  'b_cover': 8.789212836642777},
 {'group': ['weight'],
  'r': 0.7429792625383166,
  'b_pages': 0.19861235686084108,
  'b_cover': -120.47311113668451},
 {'group': ['publisher_АСТ'],
  'r': 0.8412130490341261,
  'b_pages': -0.4296625287334824,
  'b_cover': -22.183297157502015},
 {'group': ['publisher_Азбука'],
  'r': 0.

In [23]:
dff = pd.DataFrame(to_data_frame).sort_values(by='r', ascending=False)[:10]
dff

Unnamed: 0,group,r,b_pages,b_cover
0,[cover_type_Мягкий_переплёт],0.842263,-0.446504,0.0
4,[volume],0.842171,-0.447176,-18.576132
15,"[cover_type_Мягкий_переплёт, volume]",0.841959,-0.462564,0.0
10,"[publisher_Манн,_Иванов_и_Фербер]",0.841802,-0.422804,-22.603478
21,"[cover_type_Мягкий_переплёт, publisher_Манн,_И...",0.84148,-0.445161,0.0
70,"[publisher_АСТ, publisher_Манн,_Иванов_и_Фербер]",0.841256,-0.429168,-22.117287
7,[publisher_АСТ],0.841213,-0.429663,-22.183297
55,"[volume, publisher_Манн,_Иванов_и_Фербер]",0.841196,-0.450615,-16.026493
110,"[cover_type_Мягкий_переплёт, volume, publisher...",0.841051,-0.46385,0.0
125,"[cover_type_Мягкий_переплёт, publisher_АСТ, pu...",0.84095,-0.450966,0.0


In [24]:
dff

Unnamed: 0,group,r,b_pages,b_cover
0,[cover_type_Мягкий_переплёт],0.842263,-0.446504,0.0
4,[volume],0.842171,-0.447176,-18.576132
15,"[cover_type_Мягкий_переплёт, volume]",0.841959,-0.462564,0.0
10,"[publisher_Манн,_Иванов_и_Фербер]",0.841802,-0.422804,-22.603478
21,"[cover_type_Мягкий_переплёт, publisher_Манн,_И...",0.84148,-0.445161,0.0
70,"[publisher_АСТ, publisher_Манн,_Иванов_и_Фербер]",0.841256,-0.429168,-22.117287
7,[publisher_АСТ],0.841213,-0.429663,-22.183297
55,"[volume, publisher_Манн,_Иванов_и_Фербер]",0.841196,-0.450615,-16.026493
110,"[cover_type_Мягкий_переплёт, volume, publisher...",0.841051,-0.46385,0.0
125,"[cover_type_Мягкий_переплёт, publisher_АСТ, pu...",0.84095,-0.450966,0.0


In [25]:
X['width'] = data['width']
# X.drop(columns=['weight', 'volume', 'thickness', 'length', 'width', 'S'], inplace = True)
model = sm.OLS(y, X).fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                  price   R-squared:                       0.844
Model:                            OLS   Adj. R-squared:                  0.843
Method:                 Least Squares   F-statistic:                     684.1
Date:                Thu, 08 May 2025   Prob (F-statistic):               0.00
Time:                        21:43:21   Log-Likelihood:                -20138.
No. Observations:                3190   AIC:                         4.033e+04
Df Residuals:                    3164   BIC:                         4.049e+04
Df Model:                          25                                         
Covariance Type:            nonrobust                                         
                                      coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------------------
const     

Тк часть публикационных годов незначима, давайте объединим - судя по ящику с усами 24 и 25 год стоит дороже

In [26]:
X['2024_or_2025'] = X['publication_year_2024'] + X['publication_year_2025']
X.drop(columns=['publication_year_2021', 'publication_year_2022',
       'publication_year_2023', 'publication_year_2024', 'publication_year_2025'], inplace=True)

In [27]:
groups_to_test = [['2024_or_2025']]
for group in groups_to_test:
    X_full = X.copy()
    X_reduced = X.drop(columns=group)
    model_full = sm.OLS(y, X_full).fit()
    model_reduced = sm.OLS(y, X_reduced).fit()

    f_stat, p_val, df_diff = model_full.compare_f_test(model_reduced)
    print(f"Группа: {group}, F = {f_stat}, p = {p_val}, коэффициент оказался значим")

Группа: ['2024_or_2025'], F = 10.135973258262572, p = 0.001468126150113724, коэффициент оказался значим


In [28]:
model = sm.OLS(y, X).fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                  price   R-squared:                       0.844
Model:                            OLS   Adj. R-squared:                  0.843
Method:                 Least Squares   F-statistic:                     814.4
Date:                Thu, 08 May 2025   Prob (F-statistic):               0.00
Time:                        21:43:21   Log-Likelihood:                -20140.
No. Observations:                3190   AIC:                         4.032e+04
Df Residuals:                    3168   BIC:                         4.046e+04
Df Model:                          21                                         
Covariance Type:            nonrobust                                         
                                      coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------------------
const     

Некоторые рейтинги незначимы, давайте сделаем точно также, 0+ и 6+ стоят дороже. Оставшиеся объединим

In [29]:
X['6-'] = X['reading_age_0+'] + X['reading_age_6+']
X['16+'] = X['reading_age_16+'] + X['reading_age_18+']
X.drop(columns=['reading_age_0+', 'reading_age_6+', 'reading_age_16+', 'reading_age_18+'], inplace=True)

In [30]:
model = sm.OLS(y, X).fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                  price   R-squared:                       0.842
Model:                            OLS   Adj. R-squared:                  0.841
Method:                 Least Squares   F-statistic:                     888.3
Date:                Thu, 08 May 2025   Prob (F-statistic):               0.00
Time:                        21:43:21   Log-Likelihood:                -20159.
No. Observations:                3190   AIC:                         4.036e+04
Df Residuals:                    3170   BIC:                         4.048e+04
Df Model:                          19                                         
Covariance Type:            nonrobust                                         
                                      coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------------------
const     

Остался незначимым коэффициент при издательстве `publisher_Манн,_Иванов_и_Фербер` По нему мало наблюдений, дропнем его, отнеся в категорию другое

In [31]:
X.drop(columns=['publisher_Манн,_Иванов_и_Фербер'], inplace=True)

In [32]:
model = sm.OLS(y, X).fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                  price   R-squared:                       0.841
Model:                            OLS   Adj. R-squared:                  0.840
Method:                 Least Squares   F-statistic:                     931.9
Date:                Thu, 08 May 2025   Prob (F-statistic):               0.00
Time:                        21:43:21   Log-Likelihood:                -20167.
No. Observations:                3190   AIC:                         4.037e+04
Df Residuals:                    3171   BIC:                         4.049e+04
Df Model:                          18                                         
Covariance Type:            nonrobust                                         
                                 coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------------
const               

Теперь сделаем возвращение удаленных признаков, посмотрим, как они себя ведут

In [33]:
groups_to_test = [['publication_year_2021', 'publication_year_2022', 'publication_year_2023'],
                  ['publisher_Манн,_Иванов_и_Фербер']]
for group in groups_to_test:
    X_full = X.copy()
    X_reduced = X.copy()
    X_full[group] = data[group]
    model_full = sm.OLS(y, X_full).fit()
    model_reduced = sm.OLS(y, X_reduced).fit()

    f_stat, p_val, df_diff = model_full.compare_f_test(model_reduced)
    print(f"Группа: {group}, F = {f_stat}, p = {p_val}")
print('Все выкинутые группы по-прежнему незначимы')

Группа: ['publication_year_2021', 'publication_year_2022', 'publication_year_2023'], F = 1.1379738634341303, p = 0.3322728437803809
Группа: ['publisher_Манн,_Иванов_и_Фербер'], F = 17.294522031221078, p = 3.2859613708560274e-05
Все выкинутые группы по-прежнему незначимы


In [34]:
X.columns

Index(['const', 'avg_rating', 'cnt_reviews', 'pages_cnt', 'tirage', 'weight',
       'publisher_АСТ', 'publisher_Азбука', 'publisher_Иностранка',
       'publisher_Эксмо', 'cover_type_Мягкий_переплёт', 'thickness', 'width',
       'length', 'volume', 'S', '2024_or_2025', '6-', '16+'],
      dtype='object')

### Итоговый набор признаков:
['const', 'avg_rating', 'cnt_reviews', 'pages_cnt', 'tirage',
       'publisher_АСТ', 'publisher_Азбука', 'publisher_Иностранка',
       'publisher_Эксмо', 'cover_type_Мягкий_переплёт', '2024_or_2025', '6-',
       '16+']

### Сохраняем данные

In [35]:
data_to_csv = pd.concat([y, X], axis=1)
data_to_csv

Unnamed: 0,price,const,avg_rating,cnt_reviews,pages_cnt,tirage,weight,publisher_АСТ,publisher_Азбука,publisher_Иностранка,publisher_Эксмо,cover_type_Мягкий_переплёт,thickness,width,length,volume,S,2024_or_2025,6-,16+
0,312,1.0,4.1,925,512,30000,250.0,1.0,0.0,0.0,0.0,1.0,2.1,11.5,18.0,434.700,207.00,1.0,0.0,1.0
1,312,1.0,4.1,1341,320,30000,200.0,1.0,0.0,0.0,0.0,1.0,2.4,11.5,18.0,496.800,207.00,1.0,0.0,1.0
2,284,1.0,4.5,363,192,25000,132.0,1.0,0.0,0.0,0.0,1.0,1.5,11.5,18.0,310.500,207.00,1.0,0.0,1.0
3,339,1.0,4.3,872,288,12000,209.0,1.0,0.0,0.0,0.0,1.0,1.8,11.5,18.0,372.600,207.00,1.0,0.0,1.0
4,284,1.0,4.2,3004,320,30000,200.0,1.0,0.0,0.0,0.0,1.0,1.6,11.4,17.9,326.496,204.06,1.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3185,1241,1.0,5.0,3,624,1500,960.0,0.0,0.0,0.0,1.0,0.0,3.5,16.7,24.0,1402.800,400.80,0.0,0.0,1.0
3186,2207,1.0,3.7,53,448,1500,760.0,0.0,0.0,0.0,1.0,0.0,2.6,17.0,24.0,1060.800,408.00,0.0,0.0,1.0
3187,1103,1.0,4.3,42,784,3000,860.0,0.0,0.0,1.0,0.0,0.0,3.8,14.5,21.7,1195.670,314.65,0.0,0.0,1.0
3188,1011,1.0,3.8,29,704,2000,638.0,0.0,0.0,0.0,1.0,0.0,3.4,14.0,20.6,980.560,288.40,0.0,0.0,1.0


In [36]:
data_to_csv.to_csv('data_after_feature_selection.csv', index=False)

## Сравнение функциональных форм на основе теста Бокса-Кокса и PE теста:

**Тест Бокса-Кокса с преобразование Зарембки (адекватный и рабочий)**

In [37]:
geo_mean = gmean(y)
y_help = y/geo_mean
ln_y_help = np.log(y_help)

lin_model = sm.OLS(y_help, X).fit()
rss1 = np.sum(lin_model.resid**2)

semi_log_model = sm.OLS(ln_y_help, X).fit()
rss2 = np.sum(semi_log_model.resid**2)

chi_stat = X.shape[0]/2 * np.abs(np.log(rss1/rss2))
print(f'{chi_stat} > 3.84 значит, между моделями есть существенное различие, выбираем на основе RSS')

758.2368721562596 > 3.84 значит, между моделями есть существенное различие, выбираем на основе RSS


In [38]:
if rss2 < rss1:
    print('Полулогарифмическая модель предпочтительнее')
else:
    print('Линейная модель предпочтительнее')

Полулогарифмическая модель предпочтительнее


**PE тест**

In [62]:
ln_y = np.log(data['price'])
model_log = sm.OLS(ln_y, X).fit()

yhat_lin = model.fittedvalues
ln_yhat_log = model_log.fittedvalues
yhat_log = np.exp(ln_yhat_log)

mask = yhat_lin >= 0

yhat_lin = yhat_lin[mask].reset_index(drop=True)
yhat_log = yhat_log[mask].reset_index(drop=True)
ln_y = ln_y[mask].reset_index(drop=True)
ln_yhat_log = ln_yhat_log[mask].reset_index(drop=True)
X_masked = X[mask].reset_index(drop=True)
y_masked = y[mask].reset_index(drop=True) 

# Шаг 2: Вспомогательные регрессии
aux_log_X = X_masked.copy()
aux_log_X['delta'] = yhat_lin - yhat_log
aux_log_model = sm.OLS(ln_y, aux_log_X).fit()

aux_lin_X = X_masked.copy()
aux_lin_X['delta'] = ln_yhat_log - np.log(yhat_lin)
aux_lin_model = sm.OLS(y_masked, aux_lin_X).fit()

print("Модель для полулогарифмической спецификации")
print()
print(aux_log_model.summary())

print("Модель для линейной спецификации")
print()
print(aux_lin_model.summary())


Модель для полулогарифмической спецификации

                            OLS Regression Results                            
Dep. Variable:                  price   R-squared:                       0.809
Model:                            OLS   Adj. R-squared:                  0.808
Method:                 Least Squares   F-statistic:                     704.8
Date:                Thu, 08 May 2025   Prob (F-statistic):               0.00
Time:                        22:13:03   Log-Likelihood:                 306.58
No. Observations:                3185   AIC:                            -573.2
Df Residuals:                    3165   BIC:                            -451.8
Df Model:                          19                                         
Covariance Type:            nonrobust                                         
                                 coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------

**Вывод:** все значимо, нужны другие функциональные формы

## Сравнение функциональных форм на основе нормированной R^2:

In [63]:
model.summary()

0,1,2,3
Dep. Variable:,price,R-squared:,0.841
Model:,OLS,Adj. R-squared:,0.84
Method:,Least Squares,F-statistic:,931.9
Date:,"Thu, 08 May 2025",Prob (F-statistic):,0.0
Time:,22:13:24,Log-Likelihood:,-20167.0
No. Observations:,3190,AIC:,40370.0
Df Residuals:,3171,BIC:,40490.0
Df Model:,18,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,2589.0195,254.219,10.184,0.000,2090.569,3087.470
avg_rating,21.2645,7.048,3.017,0.003,7.446,35.083
cnt_reviews,0.1856,0.024,7.819,0.000,0.139,0.232
pages_cnt,-0.4240,0.027,-15.810,0.000,-0.477,-0.371
tirage,-0.0138,0.001,-13.286,0.000,-0.016,-0.012
weight,1.6054,0.036,44.322,0.000,1.534,1.676
publisher_АСТ,-45.4872,13.321,-3.415,0.001,-71.606,-19.368
publisher_Азбука,-76.7769,14.108,-5.442,0.000,-104.438,-49.116
publisher_Иностранка,-94.5112,17.476,-5.408,0.000,-128.776,-60.246

0,1,2,3
Omnibus:,1338.935,Durbin-Watson:,2.019
Prob(Omnibus):,0.0,Jarque-Bera (JB):,38232.371
Skew:,1.39,Prob(JB):,0.0
Kurtosis:,19.73,Cond. No.,535000.0


In [66]:
data.columns

Index(['price', 'avg_rating', 'cnt_reviews', 'pages_cnt', 'tirage', 'weight',
       'author_Джейн_Остен', 'author_Джек_Лондон', 'author_Джордж_Оруэлл',
       'author_Лев_Толстой', 'author_Луиза_Мэй_Олкотт',
       'author_Михаил_Булгаков', 'author_Николай_Гоголь',
       'author_Федор_Достоевский', 'author_Эрих_Ремарк', 'publisher_АСТ',
       'publisher_Азбука', 'publisher_Иностранка',
       'publisher_Манн,_Иванов_и_Фербер', 'publisher_Эксмо',
       'publication_year_2021', 'publication_year_2022',
       'publication_year_2023', 'publication_year_2024',
       'publication_year_2025', 'cover_type_Мягкий_переплёт', 'reading_age_0+',
       'reading_age_16+', 'reading_age_18+', 'reading_age_6+', 'thickness',
       'width', 'length', 'volume'],
      dtype='object')

In [76]:
y

0        312
1        312
2        284
3        339
4        284
        ... 
3185    1241
3186    2207
3187    1103
3188    1011
3189     505
Name: price, Length: 3190, dtype: int64

In [77]:
num_features = ['avg_rating', 'cnt_reviews', 'pages_cnt', 'tirage', 'weight', 
                'thickness', 'width', 'length', 'volume']
ln_X = np.log(X_masked[num_features])
ln_X = pd.concat([ln_X, X_masked.drop(columns=num_features)], axis=1)
ln_X = sm.add_constant(ln_X)

model_ln_X = sm.OLS(y_masked, ln_X).fit()

In [78]:
model_ln_X.summary()

0,1,2,3
Dep. Variable:,price,R-squared:,0.75
Model:,OLS,Adj. R-squared:,0.748
Method:,Least Squares,F-statistic:,558.3
Date:,"Thu, 08 May 2025",Prob (F-statistic):,0.0
Time:,22:32:19,Log-Likelihood:,-20860.0
No. Observations:,3185,AIC:,41760.0
Df Residuals:,3167,BIC:,41860.0
Df Model:,17,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
avg_rating,33.9292,43.469,0.781,0.435,-51.302,119.160
cnt_reviews,10.0181,2.685,3.731,0.000,4.753,15.283
pages_cnt,-292.3390,18.496,-15.806,0.000,-328.604,-256.074
tirage,-84.6104,6.838,-12.374,0.000,-98.017,-71.204
weight,667.0528,19.570,34.085,0.000,628.681,705.424
thickness,89.0255,24.443,3.642,0.000,41.101,136.950
width,9.8270,26.420,0.372,0.710,-41.975,61.629
length,-237.6714,38.855,-6.117,0.000,-313.855,-161.488
volume,-138.8190,19.578,-7.091,0.000,-177.206,-100.432

0,1,2,3
Omnibus:,2206.066,Durbin-Watson:,2.016
Prob(Omnibus):,0.0,Jarque-Bera (JB):,84210.904
Skew:,2.794,Prob(JB):,0.0
Kurtosis:,27.563,Cond. No.,2.68e+17


**Вывод:** при сравнении ln_X - y с X - y нормированный R^2 лучше у X - y

In [69]:
ln_X.shape[0], len(ln_y)

(3190, 3185)

In [73]:
ln_X

Unnamed: 0,avg_rating,cnt_reviews,pages_cnt,tirage,weight,thickness,width,length,volume,const,publisher_АСТ,publisher_Азбука,publisher_Иностранка,publisher_Эксмо,cover_type_Мягкий_переплёт,S,2024_or_2025,6-,16+
0,1.410987,6.829794,6.238325,10.308953,5.521461,0.741937,2.442347,2.890372,6.074656,1.0,1.0,0.0,0.0,0.0,1.0,207.00,1.0,0.0,1.0
1,1.410987,7.201171,5.768321,10.308953,5.298317,0.875469,2.442347,2.890372,6.208188,1.0,1.0,0.0,0.0,0.0,1.0,207.00,1.0,0.0,1.0
2,1.504077,5.894403,5.257495,10.126631,4.882802,0.405465,2.442347,2.890372,5.738184,1.0,1.0,0.0,0.0,0.0,1.0,207.00,1.0,0.0,1.0
3,1.458615,6.770789,5.662960,9.392662,5.342334,0.587787,2.442347,2.890372,5.920505,1.0,1.0,0.0,0.0,0.0,1.0,207.00,1.0,0.0,1.0
4,1.435085,8.007700,5.768321,10.308953,5.298317,0.470004,2.433613,2.884801,5.788418,1.0,1.0,0.0,0.0,0.0,1.0,204.06,1.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3185,1.609438,1.098612,6.436150,7.313220,6.866933,1.252763,2.815409,3.178054,7.246226,1.0,0.0,0.0,0.0,1.0,0.0,400.80,0.0,0.0,1.0
3186,1.308333,3.970292,6.104793,7.313220,6.633318,0.955511,2.833213,3.178054,6.966779,1.0,0.0,0.0,0.0,1.0,0.0,408.00,0.0,0.0,1.0
3187,1.458615,3.737670,6.664409,8.006368,6.756932,1.335001,2.674149,3.077312,7.086462,1.0,0.0,0.0,1.0,0.0,0.0,314.65,0.0,0.0,1.0
3188,1.335001,3.367296,6.556778,7.600902,6.458338,1.223775,2.639057,3.025291,6.888124,1.0,0.0,0.0,0.0,1.0,0.0,288.40,0.0,0.0,1.0


In [79]:
mask = yhat_lin.to_numpy() >= 0
ln_X = ln_X.to_numpy()[mask]
ln_y_masked = ln_y.to_numpy()[mask]

model_ln_X_ln_y = sm.OLS(ln_y, ln_X).fit()
model_ln_X_ln_y.summary()

0,1,2,3
Dep. Variable:,price,R-squared:,0.798
Model:,OLS,Adj. R-squared:,0.797
Method:,Least Squares,F-statistic:,735.0
Date:,"Thu, 08 May 2025",Prob (F-statistic):,0.0
Time:,22:32:31,Log-Likelihood:,217.22
No. Observations:,3185,AIC:,-398.4
Df Residuals:,3167,BIC:,-289.2
Df Model:,17,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
x1,0.1435,0.058,2.470,0.014,0.030,0.257
x2,0.0200,0.004,5.565,0.000,0.013,0.027
x3,-0.5518,0.025,-22.320,0.000,-0.600,-0.503
x4,-0.1574,0.009,-17.223,0.000,-0.175,-0.139
x5,1.2094,0.026,46.232,0.000,1.158,1.261
x6,-0.1319,0.033,-4.038,0.000,-0.196,-0.068
x7,0.1031,0.035,2.921,0.004,0.034,0.172
x8,-0.0099,0.052,-0.191,0.848,-0.112,0.092
x9,-0.0387,0.026,-1.480,0.139,-0.090,0.013

0,1,2,3
Omnibus:,505.661,Durbin-Watson:,2.018
Prob(Omnibus):,0.0,Jarque-Bera (JB):,9101.893
Skew:,-0.009,Prob(JB):,0.0
Kurtosis:,11.282,Cond. No.,2.68e+17


In [81]:
model_ln_y = sm.OLS(ln_y_masked, X_masked).fit()
model_ln_y.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.799
Model:,OLS,Adj. R-squared:,0.798
Method:,Least Squares,F-statistic:,700.1
Date:,"Thu, 08 May 2025",Prob (F-statistic):,0.0
Time:,22:33:09,Log-Likelihood:,228.46
No. Observations:,3185,AIC:,-418.9
Df Residuals:,3166,BIC:,-303.7
Df Model:,18,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,3.2745,0.425,7.699,0.000,2.441,4.108
avg_rating,0.0609,0.012,5.164,0.000,0.038,0.084
cnt_reviews,0.0005,4.01e-05,11.747,0.000,0.000,0.001
pages_cnt,-0.0007,4.7e-05,-14.383,0.000,-0.001,-0.001
tirage,-3.13e-05,1.8e-06,-17.380,0.000,-3.48e-05,-2.78e-05
weight,0.0026,6.17e-05,41.924,0.000,0.002,0.003
publisher_АСТ,-0.1191,0.022,-5.335,0.000,-0.163,-0.075
publisher_Азбука,-0.1625,0.024,-6.877,0.000,-0.209,-0.116
publisher_Иностранка,-0.1447,0.029,-4.946,0.000,-0.202,-0.087

0,1,2,3
Omnibus:,567.504,Durbin-Watson:,2.019
Prob(Omnibus):,0.0,Jarque-Bera (JB):,11569.208
Skew:,-0.217,Prob(JB):,0.0
Kurtosis:,12.327,Cond. No.,530000.0


**Вывод:** незначительно лучше ln_y - X, чем ln_y - ln_X