In [1]:
import numpy as np
import pandas as pd
from catboost import CatBoostRegressor, Pool
from sklearn.linear_model import LinearRegression
from sklearn.cluster import KMeans
from sklearn import metrics as metrics
from sklearn.preprocessing import MinMaxScaler
from category_encoders import TargetEncoder  # Другие варианты: CatBoostEncoder, WOEEncoder
from sklearn.model_selection import train_test_split

In [4]:
# 1. Load the coffee-shop revenue table and name the regression target.
target = "Daily_Revenue"
df = pd.read_csv("../../datasets/coffee_shop_dataset/coffee_shop_revenue.csv")

# Split the columns by dtype: object columns go to the categorical list,
# every other column (except the target) to the numeric list.
cat_features = df.select_dtypes(include='object').columns.tolist()
num_features = df.select_dtypes(exclude='object').columns.tolist()
num_features.remove(target)  # the target itself must not be used as a feature

# Feature order used by the rest of the notebook: numeric first, then categorical.
catboost_features = num_features + cat_features

In [5]:
# 2. Hold out 33% of the rows for testing; the split is seeded so it is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df[catboost_features],
    df[target],
    test_size=0.33,
    random_state=2025,
)
# Reset every index to 0..n-1: the column-wise pd.concat calls below align on the
# index, so features and targets must share the same fresh index.
X_train = X_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

# 3. Encode the categorical features (fitted on train only, test is only transformed).
encoder = TargetEncoder(cols=cat_features)
X_train_encoded = encoder.fit_transform(X_train, y_train)
X_test_encoded = encoder.transform(X_test)

# Min-max scale the encoded features; the scaler is also fitted on train only.
# Both frames are selected in `catboost_features` order before scaling so that the
# column labels assigned afterwards are guaranteed to match the scaled values
# (the original selected the columns explicitly only on the test side).
scaler = MinMaxScaler()
scaled_X_train_encoded = pd.DataFrame(
    scaler.fit_transform(X_train_encoded[catboost_features]),
    columns=catboost_features,
)
df_train = pd.concat([scaled_X_train_encoded, y_train], axis=1)

scaled_X_test_encoded = pd.DataFrame(
    scaler.transform(X_test_encoded[catboost_features]),
    columns=catboost_features,
)
df_test = pd.concat([scaled_X_test_encoded, y_test], axis=1)


In [6]:
# 4. A single depth-3 CatBoost tree; its leaf assignments are used further down
#    to partition the rows.
cat_model = CatBoostRegressor(
    depth=3,
    num_trees=1,
    verbose=0,
    cat_features=cat_features,  # categorical columns, given by column name
)
# Fit on the original (un-encoded, unscaled) features: CatBoost handles the
# categorical columns itself.
cat_model.fit(X=X_train, y=y_train)

<catboost.core.CatBoostRegressor at 0x14aef1210>

In [7]:
# 5. Leaf index of every row. These are computed on the original X_train / X_test
#    frames (the same un-encoded features the CatBoost model was fitted on), not on
#    the encoded/scaled copies.
train_pool = Pool(X_train, y_train, cat_features=cat_features)
test_pool = Pool(X_test, cat_features=cat_features)

# The model has a single tree, so the flattened result is used as one leaf id per row.
train_leaf_indices = cat_model.calc_leaf_indexes(train_pool).flatten()
test_leaf_indices = cat_model.calc_leaf_indexes(test_pool).flatten()

In [8]:
def get_leaf_stats(leaf_indices, X, y):
    """Summarise the target and every feature column separately for each leaf.

    Parameters
    ----------
    leaf_indices : array of leaf ids, one entry per row of ``X`` / ``y``.
    X : pandas.DataFrame with the feature columns to summarise.
    y : target values, indexable with a boolean mask over the rows.

    Returns
    -------
    pandas.DataFrame
        One row per distinct leaf id, in the order returned by ``np.unique``.
        Columns are ``leaf``, ``n_samples``, ``target_mean``, ``target_std`` and,
        for each column ``c`` of ``X``, ``c_mean`` and ``c_std``. Means and
        standard deviations are computed with ``np.mean`` / ``np.std``.
    """

    def summarize(leaf_id):
        # Boolean mask selecting the rows that ended up in this leaf.
        in_leaf = leaf_indices == leaf_id
        leaf_target = y[in_leaf]

        row = {
            'leaf': leaf_id,
            'n_samples': np.sum(in_leaf),
            'target_mean': np.mean(leaf_target),
            'target_std': np.std(leaf_target),
        }
        for name in X.columns:
            values = X.loc[in_leaf, name]
            row[f'{name}_mean'] = np.mean(values)
            row[f'{name}_std'] = np.std(values)
        return row

    return pd.DataFrame([summarize(leaf_id) for leaf_id in np.unique(leaf_indices)])

# 6. Per-leaf statistics of the scaled training features and the training target.
leaf_stats_df = get_leaf_stats(
    leaf_indices=train_leaf_indices,
    X=df_train[catboost_features],
    y=df_train[target],
)

In [9]:
# 7. Cluster the leaves by their per-leaf statistics.
cluster_ids = ['leaf', 'n_samples']  # identifier / size columns, kept out of the clustering features
cluster_features = [col for col in leaf_stats_df.columns if col not in cluster_ids]

# Min-max scale the statistics, then put the untouched id columns back in front.
leaf_scaler = MinMaxScaler()
scaled_leaf_stats_df = pd.concat(
    [
        leaf_stats_df[cluster_ids],
        pd.DataFrame(
            leaf_scaler.fit_transform(leaf_stats_df[cluster_features]),
            columns=cluster_features,
        ),
    ],
    axis=1,
)

# Seeded KMeans assigns every leaf to one of three clusters.
kmeans = KMeans(n_clusters=3, random_state=2025)
scaled_leaf_stats_df['cluster'] = kmeans.fit_predict(scaled_leaf_stats_df[cluster_features])

In [10]:
# 8. Attach to every training row the cluster of the leaf it fell into.
df_train['leaf'] = train_leaf_indices

# A leaf -> cluster lookup applied with Series.map instead of re-assigning
# `df_train = df_train.merge(...)`:
#   * re-running the cell just overwrites the column (a repeated merge would
#     produce `cluster_x` / `cluster_y` columns);
#   * row order and row count are guaranteed unchanged, which the later
#     comparison of per-row predictions against `y_train` relies on.
leaf_to_cluster = scaled_leaf_stats_df.set_index('leaf')['cluster']
df_train['cluster'] = df_train['leaf'].map(leaf_to_cluster)

# The leaf statistics were computed from these same training leaf ids, so every
# training row must have received a cluster.
assert df_train['cluster'].notna().all(), "training rows with a leaf missing from scaled_leaf_stats_df"

In [11]:
# Inspect the training frame after the `leaf` and `cluster` columns were attached.
df_train 

Unnamed: 0,Number_of_Customers_Per_Day,Average_Order_Value,Operating_Hours_Per_Day,Number_of_Employees,Marketing_Spend_Per_Day,Location_Foot_Traffic,Daily_Revenue,leaf,cluster
0,0.389755,0.226667,0.545455,0.750000,0.803072,0.569020,1433.99,4,0
1,0.193764,0.120000,0.636364,0.333333,0.842878,0.554268,1053.07,0,0
2,0.481069,0.892000,0.272727,0.500000,0.999428,0.102213,2742.44,6,2
3,0.841871,0.797333,0.636364,0.583333,0.233120,0.126449,3418.71,7,1
4,0.416481,0.962667,0.454545,0.916667,0.174666,0.986301,2441.69,6,2
...,...,...,...,...,...,...,...,...,...
1335,0.917595,0.394667,0.272727,0.750000,0.320657,0.340358,2609.64,5,0
1336,0.808463,0.290667,1.000000,1.000000,0.654508,0.268704,2339.46,5,0
1337,0.487751,0.760000,0.636364,0.083333,0.190964,0.414120,2183.37,6,2
1338,0.175947,0.765333,0.454545,0.833333,0.403721,0.742887,989.97,2,2


In [12]:
# 9. Fit one linear regression per leaf cluster.
linear_models = {}
for cluster in sorted(scaled_leaf_stats_df['cluster'].unique()):
    # Training rows whose leaf belongs to this cluster.
    cluster_data = df_train.loc[df_train['cluster'] == cluster]
    y_cluster = cluster_data[target]
    # Everything except the target and the two bookkeeping columns is a feature.
    X_cluster = cluster_data.drop(columns=[target, 'leaf', 'cluster'])
    print(X_cluster.shape)

    lr = LinearRegression().fit(X_cluster, y_cluster)
    linear_models[cluster] = lr

(670, 6)
(297, 6)
(373, 6)


In [13]:
# Number of training rows routed to each cluster.
df_train['cluster'].value_counts()

cluster
0    670
2    373
1    297
Name: count, dtype: int64

In [15]:
# 10. Predict on the test set: every row is scored by the linear model of the
#     cluster its leaf belongs to.
df_test['leaf'] = test_leaf_indices

# Leaf -> cluster lookup via Series.map: keeps row order and count, and the cell can
# be re-run safely (a repeated merge would produce `cluster_x` / `cluster_y`).
leaf_to_cluster = scaled_leaf_stats_df.set_index('leaf')['cluster']
df_test['cluster'] = (
    df_test['leaf']
    .map(leaf_to_cluster)
    .fillna(-1)   # -1 marks a leaf that never occurred in the training data
    .astype(int)
)

# Rows in unseen leaves (cluster -1) have no linear model. They fall back to the
# mean of the training target instead of silently keeping a prediction of 0.
test_predictions = np.full(len(df_test), y_train.mean(), dtype=float)
for cluster, model in linear_models.items():
    mask = (df_test['cluster'] == cluster).to_numpy()
    if mask.any():
        # `catboost_features` is the column order both df_train and df_test were built with.
        test_predictions[mask] = model.predict(df_test.loc[mask, catboost_features])

In [16]:
# Number of test rows routed to each cluster.
df_test['cluster'].value_counts()

cluster
0    336
2    177
1    147
Name: count, dtype: int64

In [17]:
# Hold-out metrics of the per-cluster linear models.
test_r2 = metrics.r2_score(y_test, test_predictions)
test_mse = metrics.mean_squared_error(y_test, test_predictions)
test_mae = metrics.mean_absolute_error(y_test, test_predictions)
test_mape = metrics.mean_absolute_percentage_error(y_test, test_predictions)

print(f"Test R2: {test_r2}")
print(f"Test MSE: {test_mse}")
print(f"Test RMSE: {np.sqrt(test_mse)}")  # square root of the MSE computed above
print(f"Test MAE: {test_mae}")
print(f"Test MAPE: {test_mape}")

Test R2: 0.9448021555748747
Test MSE: 51765.67945594959
Test RMSE: 227.5207231351676
Test MAE: 182.70935572043405
Test MAPE: 0.1345230356182867


In [18]:
# In-sample predictions: score every training row with its cluster's linear model.
feature_columns = X_train_encoded.columns
train_predictions = np.zeros(len(X_train))
for cluster, model in linear_models.items():
    mask = df_train['cluster'] == cluster
    if not mask.any():
        # No training row in this cluster: nothing to predict.
        continue
    train_predictions[mask] = model.predict(df_train.loc[mask, feature_columns])

In [19]:
# In-sample metrics, for comparison with the hold-out metrics.
train_r2 = metrics.r2_score(y_train, train_predictions)
train_mse = metrics.mean_squared_error(y_train, train_predictions)
train_mae = metrics.mean_absolute_error(y_train, train_predictions)
train_mape = metrics.mean_absolute_percentage_error(y_train, train_predictions)

print(f"Train R2: {train_r2}")
print(f"Train MSE: {train_mse}")
print(f"Train RMSE: {np.sqrt(train_mse)}")  # square root of the MSE computed above
print(f"Train MAE: {train_mae}")
print(f"Train MAPE: {train_mape}")

Train R2: 0.9477790126492928
Train MSE: 50117.0708541751
Train RMSE: 223.86842308413017
Train MAE: 178.96426926026137
Train MAPE: 0.14558621978771116
