In [5]:
import pandas as pd
import numpy as np
from pathlib import Path
import statsmodels.formula.api as smf
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

Линейная регрессия на чек и трафик

In [6]:
input_path = Path('Х5_with_region_index_2024_population_patched_with_flags.xlsx')
df = pd.read_excel(input_path)

In [9]:
col_np = 'Населенный пункт'
col_region = 'Регион'
col_index = 'Индекс_РИА_2024'
col_is_season = 'is_season'
col_traffic_flag = 'traffic_flag'
col_pedestrian = 'Трафик пеший, в час'
col_check = 'Средний чек'
col_schools = 'Школы (300 м)'
col_size_cat = 'Торговая площадь, категориальный'

df[col_index] = pd.to_numeric(df[col_index], errors='coerce')
df[col_pedestrian] = pd.to_numeric(df[col_pedestrian], errors='coerce')
df[col_check] = pd.to_numeric(df[col_check], errors='coerce')
df[col_schools] = pd.to_numeric(df[col_schools], errors='coerce').fillna(0)

df['has_school'] = df[col_schools] > 0

size_map = {
    'Маленький': 'small',
    'Средний': 'medium',
    'Большой': 'large_plus',
    'Очень большой': 'large_plus',
}
df['size_group'] = df[col_size_cat].map(size_map)

# ищем колонку с численностью населения
pop_col = None
for c in df.columns:
    name = str(c)
    if 'числен' in name.lower() and 'насел' in name.lower():
        pop_col = c
        break
print('Колонка с населением:', pop_col)

if pop_col is not None:
    df[pop_col] = pd.to_numeric(df[pop_col], errors='coerce')
    df['population'] = df[pop_col]
else:
    df['population'] = np.nan

mask = (
    (df[col_is_season] == 0)
    & (df[col_traffic_flag] == 1)
    & (df[col_pedestrian] > 0)
    & df[col_index].notna()
    & df['size_group'].notna()
)

base_df = df[mask].copy()
print('Размер base_df после фильтрации:', base_df.shape)
base_df[[col_np, col_region, col_pedestrian, col_check, col_index, 'population', 'has_school', 'size_group']].head()

Колонка с населением: Численность населения
Размер base_df после фильтрации: (178677, 24)


Unnamed: 0,Населенный пункт,Регион,"Трафик пеший, в час",Средний чек,Индекс_РИА_2024,population,has_school,size_group
0,Абинск г,Краснодарский край,68.25,976.170936,76.58,38231,False,medium
1,Абинск г,Краснодарский край,68.25,1025.462154,76.58,38231,False,medium
2,Абинск г,Краснодарский край,68.25,1158.15089,76.58,38231,False,medium
5,Абинск г,Краснодарский край,68.25,1031.000127,76.58,38231,False,medium
6,Абинск г,Краснодарский край,68.25,1035.065767,76.58,38231,False,medium


In [11]:
reg_df = base_df.copy()
reg_df = reg_df.rename(columns={
    col_np: 'np_name',
    col_region: 'region',
    col_pedestrian: 'traffic_ped',
    col_check: 'avg_check',
    col_index: 'ria_index',
})

# убираем строки без населения, если оно есть
if reg_df['population'].notna().any():
    reg_df = reg_df[reg_df['population'].notna()].copy()

reg_df = reg_df[(reg_df['traffic_ped'] > 0) & (reg_df['avg_check'] > 0)].copy()
reg_df['log_traffic'] = np.log(reg_df['traffic_ped'])
reg_df['log_check'] = np.log(reg_df['avg_check'])
reg_df['log_population'] = np.log(reg_df['population'].replace({0: np.nan}))

print('Размер reg_df для регрессии:', reg_df.shape)
reg_df[['np_name', 'region', 'traffic_ped', 'avg_check', 'ria_index', 'population', 'has_school', 'size_group']].head()

Размер reg_df для регрессии: (178666, 27)


Unnamed: 0,np_name,region,traffic_ped,avg_check,ria_index,population,has_school,size_group
0,Абинск г,Краснодарский край,68.25,976.170936,76.58,38231,False,medium
1,Абинск г,Краснодарский край,68.25,1025.462154,76.58,38231,False,medium
2,Абинск г,Краснодарский край,68.25,1158.15089,76.58,38231,False,medium
5,Абинск г,Краснодарский край,68.25,1031.000127,76.58,38231,False,medium
6,Абинск г,Краснодарский край,68.25,1035.065767,76.58,38231,False,medium


In [12]:
reg_df

Unnamed: 0,new_id,Месяц,Трафик,avg_check,"Дата открытия, категориальный","Торговая площадь, категориальный",np_name,region,Численность населения,Количество домохозяйств,...,Пятерочки (500 м),ria_index,traffic_flag,is_season,has_school,size_group,population,log_traffic,log_check,log_population
0,0,10,59662,976.170936,Средний по возрасту,Средний,Абинск г,Краснодарский край,38231,728,...,0,76.58,1,0,False,medium,38231,4.223177,6.883638,10.551402
1,0,5,56674,1025.462154,Средний по возрасту,Средний,Абинск г,Краснодарский край,38231,728,...,0,76.58,1,0,False,medium,38231,4.223177,6.932899,10.551402
2,0,1,51488,1158.150890,Средний по возрасту,Средний,Абинск г,Краснодарский край,38231,728,...,0,76.58,1,0,False,medium,38231,4.223177,7.054580,10.551402
5,0,12,59476,1031.000127,Средний по возрасту,Средний,Абинск г,Краснодарский край,38231,728,...,0,76.58,1,0,False,medium,38231,4.223177,6.938285,10.551402
6,0,9,55856,1035.065767,Средний по возрасту,Средний,Абинск г,Краснодарский край,38231,728,...,0,76.58,1,0,False,medium,38231,4.223177,6.942220,10.551402
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
256705,21741,12,52388,1183.302760,Новый,Большой,Преображенская ст-ца,Волгоградская обл,4889,275,...,1,53.66,1,0,False,large_plus,4889,4.921197,7.076065,8.494743
256706,21742,10,51676,1167.101083,Новый,Средний,Октябрьский рп,Волгоградская обл,6071,262,...,0,53.66,1,0,False,medium,6071,4.831309,7.062278,8.711279
256707,21742,11,51516,1252.914118,Новый,Средний,Октябрьский рп,Волгоградская обл,6071,262,...,0,53.66,1,0,False,medium,6071,4.831309,7.133227,8.711279
256708,21742,9,49593,1130.823998,Новый,Средний,Октябрьский рп,Волгоградская обл,6071,262,...,0,53.66,1,0,False,medium,6071,4.831309,7.030702,8.711279


In [13]:
formula_traffic = """
traffic_ped ~ has_school + ria_index + population + C(size_group)
"""

formula_check = """
avg_check   ~ has_school + ria_index + population + C(size_group)
"""
formula_traffic_log = """
log_traffic ~ has_school + log_population + ria_index + C(size_group) 
"""

formula_check_log = """
log_check ~ has_school + log_population + ria_index + C(size_group) 
"""
# -----------------------------
# 2. Запуск моделей OLS с робастными ошибками
# -----------------------------
model_traf = smf.ols(formula_traffic, data=reg_df).fit(cov_type='HC3')
model_check = smf.ols(formula_check, data=reg_df).fit(cov_type='HC3')
model_traf_log = smf.ols(formula_traffic_log, data=reg_df).fit(cov_type='HC3')
model_check_log = smf.ols(formula_check_log, data=reg_df).fit(cov_type='HC3')
print("\n=== Модель: трафика ===")
print(model_traf.summary())




=== Модель: трафика ===
                            OLS Regression Results                            
Dep. Variable:            traffic_ped   R-squared:                       0.074
Model:                            OLS   Adj. R-squared:                  0.074
Method:                 Least Squares   F-statistic:                     1674.
Date:                Sat, 13 Dec 2025   Prob (F-statistic):               0.00
Time:                        00:59:31   Log-Likelihood:            -1.0217e+06
No. Observations:              178666   AIC:                         2.044e+06
Df Residuals:                  178660   BIC:                         2.044e+06
Df Model:                           5                                         
Covariance Type:                  HC3                                         
                              coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------------------
I

In [14]:
print("\n=== Модель: среднего чека ===")
print(model_check.summary())


=== Модель: среднего чека ===
                            OLS Regression Results                            
Dep. Variable:              avg_check   R-squared:                       0.311
Model:                            OLS   Adj. R-squared:                  0.311
Method:                 Least Squares   F-statistic:                 1.233e+04
Date:                Sat, 13 Dec 2025   Prob (F-statistic):               0.00
Time:                        00:59:48   Log-Likelihood:            -1.2502e+06
No. Observations:              178666   AIC:                         2.500e+06
Df Residuals:                  178660   BIC:                         2.501e+06
Df Model:                           5                                         
Covariance Type:                  HC3                                         
                              coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------------

In [15]:
print("\n=== Модель: лог трафика ===") # При наличии школы трафик в среднем выше примерно на 9.9%
print(model_traf_log.summary())


=== Модель: лог трафика ===
                            OLS Regression Results                            
Dep. Variable:            log_traffic   R-squared:                       0.113
Model:                            OLS   Adj. R-squared:                  0.113
Method:                 Least Squares   F-statistic:                     4609.
Date:                Sat, 13 Dec 2025   Prob (F-statistic):               0.00
Time:                        00:59:57   Log-Likelihood:            -1.3736e+05
No. Observations:              178666   AIC:                         2.747e+05
Df Residuals:                  178660   BIC:                         2.748e+05
Df Model:                           5                                         
Covariance Type:                  HC3                                         
                              coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------------

In [16]:
print("\n=== Модель: лог чека ===") # При наличии школы средний чек в среднем ниже примерно на 1.3% 
print(model_check_log.summary())


=== Модель: лог чека ===
                            OLS Regression Results                            
Dep. Variable:              log_check   R-squared:                       0.265
Model:                            OLS   Adj. R-squared:                  0.265
Method:                 Least Squares   F-statistic:                 1.250e+04
Date:                Sat, 13 Dec 2025   Prob (F-statistic):               0.00
Time:                        01:00:03   Log-Likelihood:                -23840.
No. Observations:              178666   AIC:                         4.769e+04
Df Residuals:                  178660   BIC:                         4.775e+04
Df Model:                           5                                         
Covariance Type:                  HC3                                         
                              coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------------------


In [17]:
vif_df = reg_df.copy()

# Числовые признаки
base_features = ['has_school', 'population', 'ria_index']

# Добавим дамми для size_group и region
dummies_size = pd.get_dummies(vif_df['size_group'], prefix='size', drop_first=True)
dummies_reg = pd.get_dummies(vif_df['region'], prefix='region', drop_first=True)

X = pd.concat([vif_df[base_features], dummies_size, dummies_reg], axis=1).dropna()
X  = X.select_dtypes(include=[np.number])
# формируем таблицу VIF
vif_data = pd.DataFrame()
vif_data['feature'] = X.columns
vif_data['VIF'] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]

print("\n=== VIF ===")
print(vif_data.sort_values('VIF', ascending=False))


=== VIF ===
      feature       VIF
1   ria_index  1.316708
0  population  1.316708


Теперь проверку на парных сравнениях
### t-тесты по разностям

Отдельно для small / medium / large_plus считаем t-тесты по разностям
`with_school - without_school` для трафика и чека.

In [18]:
from scipy import stats

group_cols = [col_np, 'size_group', 'Индекс_РИА_2024']

agg = (
    base_df
    .groupby(group_cols + ['has_school'])
    .agg(
        mean_traffic=(col_pedestrian, 'mean'),
        mean_check=(col_check, 'mean'),
        count=('has_school', 'size'),
    )
    .reset_index()
)

pivot_pairs = agg.pivot_table(
    index=group_cols,
    columns='has_school',
    values=['mean_traffic', 'mean_check'],
)

pivot_pairs.columns = [f"{metric}_{'with' if hs else 'without'}" for metric, hs in pivot_pairs.columns]
pivot_pairs = pivot_pairs.dropna().reset_index()

pivot_pairs['diff_traffic'] = pivot_pairs['mean_traffic_with'] - pivot_pairs['mean_traffic_without']
pivot_pairs['diff_check'] = pivot_pairs['mean_check_with'] - pivot_pairs['mean_check_without']

def ria_bin(x):
    if x < 40:
        return 'x<40'
    elif x <= 70:
        return '40<x<70'
    else:
        return 'x>70'
pivot_pairs['ria_group'] = pivot_pairs['Индекс_РИА_2024'].apply(ria_bin)

print('Примеры пар:')
pivot_pairs.head()

Примеры пар:


Unnamed: 0,Населенный пункт,size_group,Индекс_РИА_2024,mean_check_without,mean_check_with,mean_traffic_without,mean_traffic_with,diff_traffic,diff_check,ria_group
0,1-я Моква д,medium,50.02,1311.874036,590.25495,86.555556,95.666667,9.111111,-721.619086,40<x<70
1,Абакан г,large_plus,31.09,704.011246,718.690294,335.230769,357.4,22.169231,14.679048,x<40
2,Абакан г,medium,31.09,835.349556,768.072178,279.76589,355.622103,75.856213,-67.277378,x<40
3,Абинск г,medium,76.58,852.761283,572.440678,91.267857,133.375,42.107143,-280.320605,x>70
4,Абинск г,small,76.58,560.248126,584.288377,128.3,69.875,-58.425,24.040252,x>70


In [None]:
def ttests(pivot_pairs, group):
    alpha = 0.05
    result = []
    for size in ["small", "medium", "large_plus"]:
        sub = pivot_pairs[pivot_pairs["size_group"] == size]
        if sub.empty:
            print(f"\n[WARN] Нет пар для size_group={size}")
            continue
        dif_tr = sub["diff_traffic"].dropna()
        dif_ch = sub["diff_check"].dropna()
        print(
            f"\n=== size_group = {size} ===",
            "Число пар (НП):",
            len(sub),
            "Группа по индексу:",
            group,
        )
        print()

        if len(dif_tr) >= 3:
            t_tr, p_tr = stats.ttest_1samp(dif_tr, popmean=0.0)
            print(f"Трафик: mean_diff={dif_tr.mean():.2f}, t={t_tr:.3f}, p={p_tr:.4g}")
        else:
            print("Недостаточно пар для трафика")

        if len(dif_ch) >= 3:
            t_ch, p_ch = stats.ttest_1samp(dif_ch, popmean=0.0)
            print(f"Чек: mean_diff={dif_ch.mean():.2f}, t={t_ch:.3f}, p={p_ch:.4g}")
        else:
            print("Недостаточно пар для чека")
        result.append(
            {
                "mean_diff_traf": dif_tr.mean(),
                "mean_diff_check": dif_ch.mean(),
                "t_traf": t_tr.mean(),
                "t_check": t_ch.mean(),
                "p_traf": p_tr.mean(),
                "p_check": p_ch.mean(),
                'size': size,
            }
        )
    return result

In [None]:
results = {}
for ria in pivot_pairs['ria_group'].unique():
    results[ria] = {}
    df_ria = pivot_pairs[pivot_pairs['ria_group'] == ria]

    result = ttests(df_ria, ria)
    results[ria] = result

results


=== size_group = small === Число пар (НП): 148 Группа по индексу: 40<x<70

Трафик: mean_diff=14.73, t=3.657, p=0.0003549
Чек: mean_diff=-40.52, t=-3.810, p=0.0002035

=== size_group = medium === Число пар (НП): 339 Группа по индексу: 40<x<70

Трафик: mean_diff=12.02, t=4.684, p=4.081e-06
Чек: mean_diff=-33.62, t=-3.230, p=0.001358

=== size_group = large_plus === Число пар (НП): 140 Группа по индексу: 40<x<70

Трафик: mean_diff=-1.65, t=-0.296, p=0.7678
Чек: mean_diff=-7.58, t=-0.379, p=0.7054

=== size_group = small === Число пар (НП): 30 Группа по индексу: x<40

Трафик: mean_diff=2.32, t=0.239, p=0.813
Чек: mean_diff=-21.09, t=-0.702, p=0.4886

=== size_group = medium === Число пар (НП): 64 Группа по индексу: x<40

Трафик: mean_diff=17.17, t=3.436, p=0.00105
Чек: mean_diff=-48.58, t=-2.308, p=0.02429

=== size_group = large_plus === Число пар (НП): 21 Группа по индексу: x<40

Трафик: mean_diff=-3.23, t=-0.365, p=0.7188
Чек: mean_diff=-114.37, t=-2.736, p=0.01274

=== size_group = sm

{'40<x<70': [{'mean_diff_traf': 14.733752435915326,
   'mean_diff_check': -40.523752299332095,
   't_traf': 3.6568468393261404,
   't_check': -3.8102077022316716,
   'p_traf': 0.0003549296362950166,
   'p_check': 0.0002035399266191013,
   'size': 'small'},
  {'mean_diff_traf': 12.024786352081975,
   'mean_diff_check': -33.62438266065923,
   't_traf': 4.684052580827773,
   't_check': -3.230387334261782,
   'p_traf': 4.08108396113864e-06,
   'p_check': 0.0013576907150908712,
   'size': 'medium'},
  {'mean_diff_traf': -1.6492657096040473,
   'mean_diff_check': -7.578264895251384,
   't_traf': -0.2957830056759138,
   't_check': -0.3788471663027171,
   'p_traf': 0.7678372139399948,
   'p_check': 0.7053794119066495,
   'size': 'large_plus'}],
 'x<40': [{'mean_diff_traf': 2.319740604843305,
   'mean_diff_check': -21.08818390344766,
   't_traf': 0.23874344177501597,
   't_check': -0.701535723792822,
   'p_traf': 0.8129836323919966,
   'p_check': 0.48856082519703237,
   'size': 'small'},
  {'me

In [33]:
rows = []

for ria_group, entries in results.items():
    for d in entries:
        row = d.copy()
        row["ria_group"] = ria_group
        rows.append(row)

df_res = pd.DataFrame(rows)

# ставим порядок столбцов
cols_order = [
    "ria_group", "size",
    "mean_diff_traf", "t_traf", "p_traf",
    "mean_diff_check", "t_check", "p_check"
]
df_res = df_res[cols_order]

df_res

Unnamed: 0,ria_group,size,mean_diff_traf,t_traf,p_traf,mean_diff_check,t_check,p_check
0,40<x<70,small,14.733752,3.656847,0.000355,-40.523752,-3.810208,0.000204
1,40<x<70,medium,12.024786,4.684053,4e-06,-33.624383,-3.230387,0.001358
2,40<x<70,large_plus,-1.649266,-0.295783,0.767837,-7.578265,-0.378847,0.705379
3,x<40,small,2.319741,0.238743,0.812984,-21.088184,-0.701536,0.488561
4,x<40,medium,17.170054,3.436053,0.00105,-48.581197,-2.308124,0.024288
5,x<40,large_plus,-3.231556,-0.365155,0.718828,-114.367443,-2.735582,0.012744
6,x>70,small,9.746723,2.093791,0.038808,-58.282156,-3.136855,0.002243
7,x>70,medium,13.601498,4.280978,2.9e-05,-26.410781,-2.028364,0.043905
8,x>70,large_plus,2.88212,0.567593,0.571535,-96.866039,-3.374063,0.001042


In [5]:
df_res.to_excel('result_ttest_season.xlsx', index=False)

NameError: name 'df_res' is not defined

In [37]:
# Пивотируем таблицу в широкий формат
pivot_cols = [
    "mean_diff_traf", "t_traf", "p_traf",
    "mean_diff_check", "t_check", "p_check"
]

wide = (
    df_res
    .pivot(index="ria_group", columns="size", values=pivot_cols)
)

# Чтобы убрать MultiIndex колонок
wide.columns = [f"{metric}_{size}" for metric, size in wide.columns]

wide = wide[[el for el in wide.columns if not el.startswith("t_")]].reset_index()

wide


Unnamed: 0,ria_group,mean_diff_traf_large_plus,mean_diff_traf_medium,mean_diff_traf_small,p_traf_large_plus,p_traf_medium,p_traf_small,mean_diff_check_large_plus,mean_diff_check_medium,mean_diff_check_small,p_check_large_plus,p_check_medium,p_check_small
0,40<x<70,-1.649266,12.024786,14.733752,0.767837,4e-06,0.000355,-7.578265,-33.624383,-40.523752,0.705379,0.001358,0.000204
1,x<40,-3.231556,17.170054,2.319741,0.718828,0.00105,0.812984,-114.367443,-48.581197,-21.088184,0.012744,0.024288,0.488561
2,x>70,2.88212,13.601498,9.746723,0.571535,2.9e-05,0.038808,-96.866039,-26.410781,-58.282156,0.001042,0.043905,0.002243


In [38]:
wide.to_excel('result_ttest_wide.xlsx', index=False)