In [1]:
import numpy as np
import pandas as pd
from catboost import CatBoostRegressor, Pool
from sklearn.linear_model import LinearRegression
from sklearn.cluster import KMeans
from sklearn import metrics as metrics
from sklearn.preprocessing import MinMaxScaler
from category_encoders import TargetEncoder  # Другие варианты: CatBoostEncoder, WOEEncoder
from sklearn.model_selection import train_test_split

In [7]:
# Load the training table and discard the row identifier column.
df = pd.read_csv("../../datasets/house_prices_dataset/house_prices_train.csv").drop(columns='Id')

target = "SalePrice"

# Partition the columns by dtype: object columns are treated as categorical,
# everything else (minus the target itself) as numeric.
cat_features = df.select_dtypes(include='object').columns.tolist()
num_features = df.select_dtypes(exclude='object').columns.tolist()
num_features.remove(target)
catboost_features = num_features + cat_features

# Impute missing values: 0 for numeric columns, the literal 'other' for
# categorical ones.
df.loc[:, num_features] = df.loc[:, num_features].fillna(0)
df.loc[:, cat_features] = df.loc[:, cat_features].fillna('other')

In [8]:
# Hold out a third of the rows; every part gets a fresh 0..n-1 index so the
# frames built below line up positionally when concatenated.
X_train, X_test, y_train, y_test = (
    part.reset_index(drop=True)
    for part in train_test_split(
        df[catboost_features],
        df[target],
        test_size=0.33,
        random_state=2025,
    )
)

# 3. Encode categorical features (encoder is fitted on the training part only)
encoder = TargetEncoder(cols=cat_features)
X_train_encoded = encoder.fit_transform(X_train, y_train)
X_test_encoded = encoder.transform(X_test)

# Min-max scale the encoded features; the scaler is fitted on train and
# re-used unchanged for test.
scaler = MinMaxScaler()
scaled_X_train_encoded = pd.DataFrame(
    scaler.fit_transform(X_train_encoded),
    columns=catboost_features,
)
df_train = pd.concat([scaled_X_train_encoded, y_train], axis=1)

scaled_X_test_encoded = pd.DataFrame(
    scaler.transform(X_test_encoded[catboost_features]),
    columns=catboost_features,
)
df_test = pd.concat([scaled_X_test_encoded, y_test], axis=1)


In [9]:
# One shallow tree (depth 3). Categorical columns are passed by column name,
# and the model is fitted on the original, un-encoded frame so CatBoost
# handles the categories itself.
catboost_params = dict(
    num_trees=1,
    depth=3,
    cat_features=cat_features,
    verbose=0,
)
cat_model = CatBoostRegressor(**catboost_params)
cat_model.fit(X_train, y_train)

<catboost.core.CatBoostRegressor at 0x161c8bd50>

In [10]:
# 5. Leaf index of every sample in the fitted tree.
# The pools are built from the raw (un-encoded) frames, the same
# representation cat_model was fitted on.
train_leaf_indices = cat_model.calc_leaf_indexes(Pool(X_train, y_train, cat_features=cat_features)).flatten()
test_leaf_indices = cat_model.calc_leaf_indexes(Pool(X_test, cat_features=cat_features)).flatten()

# Flattening yields one value per sample only when the model has a single
# tree; with more trees the vector would silently contain several values per
# sample and misalign with the rows of the train/test frames.
assert len(train_leaf_indices) == len(X_train), "expected exactly one leaf index per training row"
assert len(test_leaf_indices) == len(X_test), "expected exactly one leaf index per test row"

In [11]:
def get_leaf_stats(leaf_indices, X, y):
    """Summarise the samples that fall into each distinct leaf.

    Parameters
    ----------
    leaf_indices : numpy.ndarray
        Leaf identifier of every sample, positionally aligned with the rows
        of ``X`` and the entries of ``y``.
    X : pandas.DataFrame
        Feature table; each column contributes a ``<col>_mean`` and a
        ``<col>_std`` entry to the result.
    y : pandas.Series or numpy.ndarray
        Target values, indexable with a boolean mask.

    Returns
    -------
    pandas.DataFrame
        One row per unique leaf (in ``np.unique`` order) with the columns
        ``leaf``, ``n_samples``, ``target_mean``, ``target_std`` followed by
        the mean/std pair of every feature in the column order of ``X``.
        Means and standard deviations are computed with ``np.mean`` and
        ``np.std``.
    """
    rows = []
    for leaf_id in np.unique(leaf_indices):
        in_leaf = leaf_indices == leaf_id
        y_leaf = y[in_leaf]
        row = {
            'leaf': leaf_id,
            'n_samples': np.sum(in_leaf),
            'target_mean': np.mean(y_leaf),
            'target_std': np.std(y_leaf),
        }

        # Mean and spread of every feature over the samples of this leaf.
        for name in X.columns:
            feature_in_leaf = X.loc[in_leaf, name]
            row[f'{name}_mean'] = np.mean(feature_in_leaf)
            row[f'{name}_std'] = np.std(feature_in_leaf)

        rows.append(row)

    return pd.DataFrame(rows)

# Per-leaf summary statistics, computed on the scaled, target-encoded training
# features and the training target stored in df_train.
leaf_stats_df = get_leaf_stats(train_leaf_indices, df_train[catboost_features], df_train[target])

In [12]:
# 7. Cluster the leaves by their min-max scaled summary statistics.
# 'leaf' and 'n_samples' are carried along as identifiers and are not used as
# clustering features.
cluster_ids = ['leaf', 'n_samples']
cluster_features = [col for col in leaf_stats_df.columns if col not in cluster_ids]

leaf_scaler = MinMaxScaler()
scaled_leaf_stats_df = pd.concat(
    [
        leaf_stats_df[cluster_ids],
        pd.DataFrame(
            leaf_scaler.fit_transform(leaf_stats_df[cluster_features]),
            columns=cluster_features,
        ),
    ],
    axis=1,
)

# KMeans cannot form more clusters than there are rows, so cap the requested
# number by the number of leaves that actually received training samples.
n_leaf_clusters = min(3, len(scaled_leaf_stats_df))
kmeans = KMeans(n_clusters=n_leaf_clusters, random_state=2025)
scaled_leaf_stats_df['cluster'] = kmeans.fit_predict(scaled_leaf_stats_df[cluster_features])

In [13]:
# 8. Attach to every training row its leaf and the cluster of that leaf.
# A label lookup is used instead of an inner merge because:
#   * it never drops or reorders rows, so df_train stays positionally aligned
#     with y_train / X_train (later cells compare predictions with y_train
#     by position);
#   * re-running the cell simply overwrites the two columns, whereas a second
#     merge would produce 'cluster_x' / 'cluster_y' duplicates.
leaf_to_cluster = scaled_leaf_stats_df.set_index('leaf')['cluster']
df_train['leaf'] = train_leaf_indices
df_train['cluster'] = df_train['leaf'].map(leaf_to_cluster)

# Every training leaf was summarised and clustered, so nothing may be missing.
assert df_train['cluster'].notna().all(), "training row mapped to a leaf without a cluster"
assert len(df_train) == len(y_train), "df_train is no longer aligned with y_train"

In [14]:
# Inspect the training frame with the 'leaf' and 'cluster' columns attached.
df_train 

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,GarageCond,PavedDrive,PoolQC,Fence,MiscFeature,SaleType,SaleCondition,SalePrice,leaf,cluster
0,0.058824,0.285714,0.048966,0.444444,0.875,0.379562,0.933333,0.000000,0.000000,0.000000,...,0.400593,0.0,0.0,1.000000,1.0,0.126092,0.173698,109500,0,0
1,0.000000,0.357143,0.056671,0.555556,0.500,0.970803,0.933333,0.000000,0.012389,0.000000,...,1.000000,1.0,0.0,1.000000,1.0,1.000000,1.000000,169990,0,0
2,0.176471,0.434066,0.051947,0.444444,0.500,0.503650,0.000000,0.000000,0.162832,0.027815,...,1.000000,1.0,0.0,1.000000,1.0,0.126092,0.173698,105000,0,0
3,0.176471,0.329670,0.060241,0.333333,0.375,0.562044,0.000000,0.000000,0.000000,0.000000,...,0.000000,1.0,0.0,1.000000,1.0,0.126092,0.000000,135000,0,0
4,0.058824,0.329670,0.031858,0.333333,0.625,0.401460,0.000000,0.000000,0.000000,0.000000,...,1.000000,1.0,0.0,1.000000,1.0,0.126092,0.173698,68500,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
973,0.000000,0.000000,0.062270,0.555556,0.625,0.817518,0.566667,0.000000,0.292035,0.000000,...,1.000000,1.0,0.0,1.000000,1.0,0.126092,0.173698,162500,0,0
974,0.294118,0.527473,0.075029,0.444444,0.500,0.306569,0.000000,0.000000,0.000000,0.000000,...,0.400593,0.0,0.0,1.000000,1.0,0.126092,0.173698,138887,1,0
975,0.000000,0.269231,0.028662,0.222222,0.875,0.605839,0.916667,0.000000,0.113274,0.000000,...,1.000000,1.0,0.0,1.000000,1.0,0.126092,0.173698,126175,0,0
976,0.000000,0.445055,0.053088,0.555556,0.500,0.817518,0.583333,0.000000,0.149558,0.000000,...,1.000000,1.0,0.0,0.716823,1.0,0.126092,0.173698,152000,0,0


In [15]:
# 9. Fit one linear regression per leaf cluster.
# NOTE(review): the recorded outputs show a cluster with only 95 rows for 79
# features and a train R2 of about 0.94 against a test R2 of about -0.04; the
# unregularised fit looks overfit - consider a regularised linear model.
train_features = df_train.drop([target, 'leaf', 'cluster'], axis=1)

linear_models = {}
for cluster in sorted(scaled_leaf_stats_df['cluster'].unique()):
    in_cluster = df_train['cluster'] == cluster
    X_cluster = train_features[in_cluster]
    print(X_cluster.shape)
    y_cluster = df_train.loc[in_cluster, target]

    # fit() returns the fitted estimator itself.
    linear_models[cluster] = LinearRegression().fit(X_cluster, y_cluster)

(541, 79)
(95, 79)
(342, 79)


In [16]:
# Number of training rows assigned to each leaf cluster.
df_train.cluster.value_counts()

cluster
0    541
2    342
1     95
Name: count, dtype: int64

In [17]:
# 10. Predict on the test set with the linear model of each row's cluster.
# The leaf -> cluster lookup keeps the row order of df_test (so predictions
# stay aligned with y_test) and makes the cell safe to re-run.
leaf_to_cluster = scaled_leaf_stats_df.set_index('leaf')['cluster']
df_test['leaf'] = test_leaf_indices
# Leaves that no training row reached have no cluster; mark them with -1.
df_test['cluster'] = df_test['leaf'].map(leaf_to_cluster).fillna(-1).astype(int)

# Rows in unseen leaves have no linear model. Start every prediction at the
# training-target mean so such rows get a baseline value instead of silently
# staying at 0; rows with a known cluster are overwritten below.
test_predictions = np.full(len(X_test), y_train.mean(), dtype=float)
for cluster, model in linear_models.items():
    mask = (df_test['cluster'] == cluster).to_numpy()
    if mask.any():
        test_predictions[mask] = model.predict(df_test.loc[mask, X_test_encoded.columns])

n_unseen = int((df_test['cluster'] == -1).sum())
if n_unseen:
    print(f"{n_unseen} test rows fell into leaves unseen in training; predicted with the training-target mean")

In [18]:
# Number of test rows assigned to each leaf cluster (-1 would mark unseen leaves).
df_test.cluster.value_counts()

cluster
0    279
2    159
1     44
Name: count, dtype: int64

In [19]:
# Hold-out quality of the per-cluster linear models; the MSE is computed once
# and reused for the RMSE line.
test_r2 = metrics.r2_score(y_test, test_predictions)
test_mse = metrics.mean_squared_error(y_test, test_predictions)
test_mae = metrics.mean_absolute_error(y_test, test_predictions)
test_mape = metrics.mean_absolute_percentage_error(y_test, test_predictions)

print(f"Test R2: {test_r2}")
print(f"Test MSE: {test_mse}")
print(f"Test RMSE: {np.sqrt(test_mse)}")
print(f"Test MAE: {test_mae}")
print(f"Test MAPE: {test_mape}")

Test R2: -0.039242072166173037
Test MSE: 7085900443.604636
Test RMSE: 84177.79067904215
Test MAE: 26711.732267101364
Test MAPE: 0.1422227631048248


In [20]:
# In-sample predictions: each training row is scored by the linear model of
# its cluster. Rows are addressed by position, so df_train must have the same
# row order as X_train / y_train.
train_predictions = np.zeros(len(X_train))
for cluster, model in linear_models.items():
    mask = df_train['cluster'] == cluster
    if not mask.sum() > 0:
        continue
    cluster_rows = df_train.loc[mask, X_train_encoded.columns]
    train_predictions[mask] = model.predict(cluster_rows)

In [21]:
# In-sample quality of the per-cluster linear models; the MSE is computed
# once and reused for the RMSE line.
train_r2 = metrics.r2_score(y_train, train_predictions)
train_mse = metrics.mean_squared_error(y_train, train_predictions)
train_mae = metrics.mean_absolute_error(y_train, train_predictions)
train_mape = metrics.mean_absolute_percentage_error(y_train, train_predictions)

print(f"Train R2: {train_r2}")
print(f"Train MSE: {train_mse}")
print(f"Train RMSE: {np.sqrt(train_mse)}")
print(f"Train MAE: {train_mae}")
print(f"Train MAPE: {train_mape}")

Train R2: 0.944388209538754
Train MSE: 336711358.709807
Train RMSE: 18349.696420099353
Train MAE: 12204.717868005422
Train MAPE: 0.07306765370384945
