Перед запуском убедитесь, что в корне проекта есть файл `.env` и в нём заполнены выданные вам учётные данные (креды) для подключения к базам данных и объектному хранилищу.

In [1]:
# Enable IPython's autoreload extension and switch it to mode 2 for this session.
%load_ext autoreload
%autoreload 2

In [2]:
import os
import pandas as pd
from dotenv import load_dotenv, find_dotenv
from sqlalchemy import create_engine

In [3]:
# Load the variables defined in the .env file into the process environment
load_dotenv()

True

In [4]:
# Read every credential from the environment.
# A variable that is not set yields None rather than raising.

# --- source database ---
src_host = os.getenv('DB_SOURCE_HOST')
src_port = os.getenv('DB_SOURCE_PORT')
src_username = os.getenv('DB_SOURCE_USER')
src_password = os.getenv('DB_SOURCE_PASSWORD')
src_db = os.getenv('DB_SOURCE_NAME')

# --- destination database ---
dst_host = os.getenv('DB_DESTINATION_HOST')
dst_port = os.getenv('DB_DESTINATION_PORT')
dst_username = os.getenv('DB_DESTINATION_USER')
dst_password = os.getenv('DB_DESTINATION_PASSWORD')
dst_db = os.getenv('DB_DESTINATION_NAME')

# --- S3 object storage ---
s3_bucket = os.getenv('S3_BUCKET_NAME')
s3_access_key = os.getenv('AWS_ACCESS_KEY_ID')
s3_secret_access_key = os.getenv('AWS_SECRET_ACCESS_KEY')

In [5]:
# Create the SQLAlchemy engines for the source and destination databases.
# The URLs are assembled with URL.create instead of string interpolation so
# that special characters in the username or password (e.g. '@', ':', '/')
# are escaped correctly rather than corrupting the connection string.
from sqlalchemy.engine import URL

src_conn = create_engine(
    URL.create(
        drivername='postgresql',
        username=src_username,
        password=src_password,
        host=src_host,
        # env values are strings; URL.create expects an integer port or None
        port=int(src_port) if src_port else None,
        database=src_db,
    )
)
dst_conn = create_engine(
    URL.create(
        drivername='postgresql',
        username=dst_username,
        password=dst_password,
        host=dst_host,
        port=int(dst_port) if dst_port else None,
        database=dst_db,
    )
)

In [6]:
# Flats table: pull every row and column through the destination connection
TABLE = 'flats_churn'
SQL = f'select * from {TABLE}'
data = pd.read_sql(sql=SQL, con=dst_conn)
data

Unnamed: 0,id,flat_id,price,target,floor,kitchen_area,living_area,rooms,is_apartment,studio,total_area,build_year,build_age,building_type_int,latitude,longitude,ceiling_height,flats_count,floors_total,has_elevator
0,1,0,9500000,9500000.0,9,9.90,19.900000,1,false,false,35.099998,1965,60,6,55.717113,37.781120,2.64,84,12,true
1,2,1,13500000,13500000.0,7,9.00,16.600000,1,false,false,43.000000,2001,24,2,55.794849,37.608013,3.00,97,10,true
2,3,2,13500000,13500000.0,9,9.00,32.000000,2,false,false,56.000000,2000,25,4,55.740040,37.761742,2.70,80,10,true
3,4,5,8490104,8490104.0,9,9.00,30.799999,2,false,false,51.009998,2017,8,4,55.724728,37.743069,2.70,192,17,true
4,5,6,9500000,9500000.0,1,6.18,29.340000,2,false,false,44.520000,1964,61,4,55.795589,37.722622,2.64,180,5,false
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73597,73598,141248,8000000,8000000.0,5,8.50,29.000000,2,false,false,44.000000,1960,65,0,55.828831,37.587704,2.70,45,5,false
73598,73599,141249,14500000,14500000.0,7,13.00,34.000000,2,false,false,60.000000,2011,14,4,55.830299,37.501556,2.74,390,17,true
73599,73600,141250,8800000,8800000.0,5,5.40,28.400000,2,false,false,45.900002,1962,63,6,55.779945,37.472790,2.48,60,5,false
73600,73601,141251,9870000,9870000.0,6,7.00,37.000000,3,false,false,56.000000,1972,53,4,55.781586,37.479717,2.64,384,9,true


In [7]:
# Remove columns that add no information for modelling:
#   - 'studio'     : contains only false values;
#   - 'price'      : equals 'target' in the rows displayed above, so keeping it
#                    would presumably leak the label;
#   - 'build_year' : mirrored by 'build_age' in the rows displayed above.
# Rebinding `data` (instead of inplace=True) together with errors='ignore'
# keeps this cell safe to re-run once the columns are already gone.
data = data.drop(columns=['studio', 'price', 'build_year'], errors='ignore')
data

Unnamed: 0,id,flat_id,target,floor,kitchen_area,living_area,rooms,is_apartment,total_area,build_age,building_type_int,latitude,longitude,ceiling_height,flats_count,floors_total,has_elevator
0,1,0,9500000.0,9,9.90,19.900000,1,false,35.099998,60,6,55.717113,37.781120,2.64,84,12,true
1,2,1,13500000.0,7,9.00,16.600000,1,false,43.000000,24,2,55.794849,37.608013,3.00,97,10,true
2,3,2,13500000.0,9,9.00,32.000000,2,false,56.000000,25,4,55.740040,37.761742,2.70,80,10,true
3,4,5,8490104.0,9,9.00,30.799999,2,false,51.009998,8,4,55.724728,37.743069,2.70,192,17,true
4,5,6,9500000.0,1,6.18,29.340000,2,false,44.520000,61,4,55.795589,37.722622,2.64,180,5,false
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73597,73598,141248,8000000.0,5,8.50,29.000000,2,false,44.000000,65,0,55.828831,37.587704,2.70,45,5,false
73598,73599,141249,14500000.0,7,13.00,34.000000,2,false,60.000000,14,4,55.830299,37.501556,2.74,390,17,true
73599,73600,141250,8800000.0,5,5.40,28.400000,2,false,45.900002,63,6,55.779945,37.472790,2.48,60,5,false
73600,73601,141251,9870000.0,6,7.00,37.000000,3,false,56.000000,53,4,55.781586,37.479717,2.64,384,9,true


In [8]:
## 

# Кодируем бинарные признаки

In [9]:
# Object-dtype columns are treated as the categorical candidates
cat_features = data.select_dtypes(include=['object'])

In [10]:
# Boolean mask over the categorical columns: True where a column has exactly two distinct values
potential_binary_features = cat_features.nunique().eq(2)
potential_binary_features

is_apartment    True
has_elevator    True
dtype: bool

In [11]:
# Keep only the categorical columns flagged as two-valued
binary_cat_features = cat_features.loc[:, potential_binary_features]
binary_cat_features

Unnamed: 0,is_apartment,has_elevator
0,false,true
1,false,true
2,false,true
3,false,true
4,false,false
...,...,...
73597,false,false
73598,false,true
73599,false,false
73600,false,true


In [12]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the two-valued columns. drop='if_binary' keeps a single
# indicator column per feature; sparse_output=False yields a dense array.
one_hot_drop = OneHotEncoder(drop='if_binary', sparse_output=False)
one_hot_drop.fit(binary_cat_features)
drop_res = pd.DataFrame(
    data=one_hot_drop.transform(binary_cat_features),
    columns=one_hot_drop.get_feature_names_out(),
)
drop_res

Unnamed: 0,is_apartment_true,has_elevator_true
0,0.0,1.0
1,0.0,1.0
2,0.0,1.0
3,0.0,1.0
4,0.0,0.0
...,...,...
73597,0.0,0.0
73598,0.0,1.0
73599,0.0,0.0
73600,0.0,1.0


# Нормализация

## Масштабируем целочисленные числовые признаки (MinMaxScaler)

In [13]:
from sklearn.preprocessing import MinMaxScaler

# Min-max scale the integer-valued columns; the result is a NumPy array
scaler = MinMaxScaler()
Min_Max_Columns = scaler.fit_transform(
    data.loc[:, ['build_age', 'floor', 'rooms', 'floors_total']]
)
Min_Max_Columns

array([[0.53211009, 0.42105263, 0.        , 0.39285714],
       [0.20183486, 0.31578947, 0.        , 0.32142857],
       [0.21100917, 0.42105263, 0.25      , 0.32142857],
       ...,
       [0.55963303, 0.21052632, 0.25      , 0.14285714],
       [0.46788991, 0.26315789, 0.5       , 0.28571429],
       [0.04587156, 0.26315789, 0.5       , 0.57142857]])

## Кодируем целочисленный категориальный признак (OneHotEncoder)

In [14]:
# Inspect the distinct building type codes present in the data
data['building_type_int'].unique()

array([6, 2, 4, 1, 3, 0])

In [15]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the building type code into a dense array,
# one indicator column per distinct code (no category is dropped here).
one_hot_drop = OneHotEncoder(sparse_output=False)
one_hot_drop.fit(data[['building_type_int']])
building_type = one_hot_drop.transform(data[['building_type_int']])
print('shape: ', building_type.shape)
building_type

shape:  (73602, 6)


array([[0., 0., 0., 0., 0., 1.],
       [0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0.],
       ...,
       [0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 1., 0.],
       [0., 0., 1., 0., 0., 0.]])

# Стандартизируем вещественные (float) признаки (StandardScaler)

In [16]:
# Float columns without the target.
# TODO: try encoding the coordinates separately.
num_features = data.select_dtypes(['float']).drop(columns=['target'])
num_features

Unnamed: 0,kitchen_area,living_area,total_area,latitude,longitude,ceiling_height
0,9.90,19.900000,35.099998,55.717113,37.781120,2.64
1,9.00,16.600000,43.000000,55.794849,37.608013,3.00
2,9.00,32.000000,56.000000,55.740040,37.761742,2.70
3,9.00,30.799999,51.009998,55.724728,37.743069,2.70
4,6.18,29.340000,44.520000,55.795589,37.722622,2.64
...,...,...,...,...,...,...
73597,8.50,29.000000,44.000000,55.828831,37.587704,2.70
73598,13.00,34.000000,60.000000,55.830299,37.501556,2.74
73599,5.40,28.400000,45.900002,55.779945,37.472790,2.48
73600,7.00,37.000000,56.000000,55.781586,37.479717,2.64


In [17]:
from sklearn.preprocessing import StandardScaler

# Standardise the float features and show the result both as a raw array
# and as a DataFrame labelled with the scaler's output feature names.
scaler = StandardScaler().fit(num_features)
scaler_res = scaler.transform(num_features)
print(scaler_res)
print(pd.DataFrame(scaler_res, columns=scaler.get_feature_names_out()))

[[ 0.69920967 -0.99467974 -1.00377067 -0.12248416  1.14054233 -0.31628219]
 [ 0.26275246 -1.31768062 -0.47940055  0.59521098  0.05341575  2.58761635]
 [ 0.26275246  0.18965714  0.38348682  0.08918273  1.0188428   0.16770058]
 ...
 [-1.48307706 -0.16270758 -0.28691019  0.4576099  -0.79579784 -1.60690483]
 [-0.70715285  0.67905253  0.38348682  0.47275412 -0.75229265 -0.31628219]
 [ 2.05707721  0.78671937  1.15012927 -0.5240173   0.91669185  1.3776575 ]]
       kitchen_area  living_area  total_area  latitude  longitude  \
0          0.699210    -0.994680   -1.003771 -0.122484   1.140542   
1          0.262752    -1.317681   -0.479401  0.595211   0.053416   
2          0.262752     0.189657    0.383487  0.089183   1.018843   
3          0.262752     0.072202    0.052271 -0.052187   0.901575   
4         -1.104814    -0.070701   -0.378509  0.602043   0.773168   
...             ...          ...         ...       ...        ...   
73597      0.020276    -0.103980   -0.413025  0.908943  -0.07

# Результат

In [18]:
import pandas as pd
import yaml
import os
import joblib
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from catboost import CatBoostClassifier
from category_encoders import CatBoostEncoder
from catboost import CatBoostRegressor
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler
from category_encoders import CatBoostEncoder

In [19]:
# Object-dtype columns are treated as the categorical candidates
cat_features = data.select_dtypes(include=['object'])
cat_features

Unnamed: 0,is_apartment,has_elevator
0,false,true
1,false,true
2,false,true
3,false,true
4,false,false
...,...,...
73597,false,false
73598,false,true
73599,false,false
73600,false,true


In [20]:
# Columns for the binary one-hot encoder:
# object-dtype columns that take exactly two distinct values.
cat_features = data.select_dtypes(include=['object'])
potential_binary_features = cat_features.nunique().eq(2)
binary_cat_features = cat_features.loc[:, potential_binary_features]
binary_cat_features

Unnamed: 0,is_apartment,has_elevator
0,false,true
1,false,true
2,false,true
3,false,true
4,false,false
...,...,...
73597,false,false
73598,false,true
73599,false,false
73600,false,true


In [21]:
# Column for the multi-category one-hot encoder (kept as a one-column DataFrame)
OneHotEncoder_Columns = data.loc[:, ['building_type_int']]
OneHotEncoder_Columns

Unnamed: 0,building_type_int
0,6
1,2
2,4
3,4
4,4
...,...
73597,0
73598,4
73599,6
73600,4


In [22]:
# Columns for MinMaxScaler (integer-valued features)
Min_Max_Columns = data.loc[:, ['build_age', 'floor', 'rooms', 'floors_total']]
Min_Max_Columns

Unnamed: 0,build_age,floor,rooms,floors_total
0,60,9,1,12
1,24,7,1,10
2,25,9,2,10
3,8,9,2,17
4,61,1,2,5
...,...,...,...,...
73597,65,5,2,5
73598,14,7,2,17
73599,63,5,2,5
73600,53,6,3,9


In [23]:
# Columns for StandardScaler: every float column except the target
num_features = data.select_dtypes(['float']).drop(columns=['target'])
num_features

Unnamed: 0,kitchen_area,living_area,total_area,latitude,longitude,ceiling_height
0,9.90,19.900000,35.099998,55.717113,37.781120,2.64
1,9.00,16.600000,43.000000,55.794849,37.608013,3.00
2,9.00,32.000000,56.000000,55.740040,37.761742,2.70
3,9.00,30.799999,51.009998,55.724728,37.743069,2.70
4,6.18,29.340000,44.520000,55.795589,37.722622,2.64
...,...,...,...,...,...,...
73597,8.50,29.000000,44.000000,55.828831,37.587704,2.70
73598,13.00,34.000000,60.000000,55.830299,37.501556,2.74
73599,5.40,28.400000,45.900002,55.779945,37.472790,2.48
73600,7.00,37.000000,56.000000,55.781586,37.479717,2.64


In [43]:
# Put the two-valued categoricals and the building type code side by side
one_hot_encoder_features = pd.concat(
    objs=[binary_cat_features, data[['building_type_int']]],
    axis='columns',
)
one_hot_encoder_features

Unnamed: 0,is_apartment,has_elevator,building_type_int
0,false,true,6
1,false,true,2
2,false,true,4
3,false,true,4
4,false,false,4
...,...,...,...
73597,false,false,0
73598,false,true,4
73599,false,false,6
73600,false,true,4


## Обучим модель

In [29]:
from sklearn.model_selection import train_test_split

# Split into training and validation parts.
# The target column is removed from the feature frame so it can never reach
# the model as an input; previously it was only kept out by the
# preprocessor's remainder='drop'. This also matches the cross-validation
# cell, which already drops the target from X.
X_tr, X_val, y_tr, y_val = train_test_split(
    data.drop(columns=['target']),
    data['target'],
    test_size=0.2,    # 20% validation, 80% training
    random_state=42,  # fixed seed for reproducibility
    shuffle=True,     # shuffle the rows before splitting
)

In [30]:
# Preprocessing step of the pipeline. Columns that are not listed in any
# transformer below are discarded (remainder='drop').
preprocessor = ColumnTransformer(
    [
        # two-valued categoricals -> a single indicator column each
        ('binary', OneHotEncoder(drop='if_binary'), binary_cat_features.columns.tolist()),
        # building type code -> one indicator per category; handle_unknown='ignore'
        # encodes a category that was absent during fit (e.g. missing from a CV
        # fold or new at inference) as all zeros instead of raising an error
        ('one_hot', OneHotEncoder(handle_unknown='ignore'), OneHotEncoder_Columns.columns.tolist()),
        # integer-valued features -> min-max scaling
        ('minmax', MinMaxScaler(), Min_Max_Columns.columns.tolist()),
        # float features -> standardisation
        ('num', StandardScaler(), num_features.columns.tolist()),
    ],
    remainder='drop',
    verbose_feature_names_out=False,
)

In [31]:
# Preview: fit the preprocessor on the full dataset and print the transformed
# matrix, first raw and then labelled with the output feature names.
data_transformed = preprocessor.fit_transform(X=data, y=data['target'])
print(data_transformed)
print(
    pd.DataFrame(
        data_transformed,
        columns=preprocessor.get_feature_names_out(),
    )
)

[[ 0.          1.          0.         ... -0.12248416  1.14054233
  -0.31628219]
 [ 0.          1.          0.         ...  0.59521098  0.05341575
   2.58761635]
 [ 0.          1.          0.         ...  0.08918273  1.0188428
   0.16770058]
 ...
 [ 0.          0.          0.         ...  0.4576099  -0.79579784
  -1.60690483]
 [ 0.          1.          0.         ...  0.47275412 -0.75229265
  -0.31628219]
 [ 0.          1.          0.         ... -0.5240173   0.91669185
   1.3776575 ]]
       is_apartment_true  has_elevator_true  building_type_int_0  \
0                    0.0                1.0                  0.0   
1                    0.0                1.0                  0.0   
2                    0.0                1.0                  0.0   
3                    0.0                1.0                  0.0   
4                    0.0                0.0                  0.0   
...                  ...                ...                  ...   
73597                0.0         

In [32]:
# CatBoost regressor configuration, kept in one dict so the hyperparameters
# are easy to read and tweak.
catboost_params = {
    'iterations': 1000,
    'depth': 8,
    'learning_rate': 0.05,
    'l2_leaf_reg': 3,
    'random_seed': 42,
    'verbose': 100,
}
model = CatBoostRegressor(**catboost_params)

In [33]:
# Full pipeline: preprocessing followed by the regressor
pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('model', model),
    ]
)

In [34]:
# Fit the pipeline (preprocessor + model) on the training split
pipeline.fit(X_tr, y_tr)

0:	learn: 3577170.1926987	total: 74.9ms	remaining: 1m 14s
100:	learn: 2186713.8408293	total: 1.32s	remaining: 11.7s
200:	learn: 2122944.7503545	total: 2.56s	remaining: 10.2s
300:	learn: 2082266.0208376	total: 3.78s	remaining: 8.78s
400:	learn: 2051470.1869076	total: 5.01s	remaining: 7.49s
500:	learn: 2024837.0842048	total: 6.25s	remaining: 6.23s
600:	learn: 2000110.6051004	total: 7.49s	remaining: 4.97s
700:	learn: 1979074.7710303	total: 8.73s	remaining: 3.72s
800:	learn: 1958791.4266428	total: 9.99s	remaining: 2.48s
900:	learn: 1939816.2711714	total: 11.3s	remaining: 1.24s
999:	learn: 1922423.3299977	total: 12.6s	remaining: 0us


In [35]:
# Get predictions for the held-out validation split
y_pred = pipeline.predict(X_val)

In [36]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Print the regression metrics for the validation split.
# RMSE is derived from MSE directly: the `squared=False` argument of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
# so relying on it breaks on current releases.
mse = mean_squared_error(y_val, y_pred)
print('MAE:', mean_absolute_error(y_val, y_pred))  # mean absolute error
print('MSE:', mse)  # mean squared error
print('RMSE:', mse ** 0.5)  # square root of the MSE
print('R²:', r2_score(y_val, y_pred))  # coefficient of determination

MAE: 1698352.4828766857
MSE: 4492959251351.498
RMSE: 2119660.173554124
R²: 0.6625918037491009




## Кросс-валидация

In [38]:
from sklearn.model_selection import KFold, cross_validate

n_splits = 5
n_jobs = -1
target_col = 'target'

# Regression task, so plain KFold is used rather than StratifiedKFold
cv_strategy = KFold(n_splits=n_splits, shuffle=True, random_state=42)

# Regression metrics: RMSE, R^2 and MAE (the two error scorers are the negated variants)
scoring = ['neg_root_mean_squared_error', 'r2', 'neg_mean_absolute_error']

cv_res = cross_validate(
    estimator=pipeline,
    X=data.drop(columns=[target_col]),
    y=data[target_col],
    scoring=scoring,
    cv=cv_strategy,
    n_jobs=n_jobs,
)

In [39]:
# Replace every entry of cv_res (in place) with its mean rounded to 3 decimals
cv_res.update({name: round(scores.mean(), 3) for name, scores in cv_res.items()})

In [40]:
# Display the cross-validation results
cv_res

{'fit_time': 29.231,
 'score_time': 0.181,
 'test_neg_root_mean_squared_error': -2099148.204,
 'test_r2': 0.672,
 'test_neg_mean_absolute_error': -1680116.407}

0:	learn: 3577170.1926987	total: 73.2ms	remaining: 1m 13s
100:	learn: 2189946.1277893	total: 3.32s	remaining: 29.6s
200:	learn: 2122440.5754587	total: 6.35s	remaining: 25.2s
300:	learn: 2082458.7427610	total: 9.52s	remaining: 22.1s
400:	learn: 2051766.0962514	total: 12.6s	remaining: 18.8s
500:	learn: 2025445.1238127	total: 15.8s	remaining: 15.7s
600:	learn: 2000467.6653291	total: 19s	remaining: 12.6s
700:	learn: 1978101.0433408	total: 22.2s	remaining: 9.45s
800:	learn: 1958108.8445348	total: 25.4s	remaining: 6.3s
900:	learn: 1940212.9014052	total: 28.6s	remaining: 3.14s
999:	learn: 1922881.6013862	total: 31.7s	remaining: 0us
0:	learn: 3563868.7350051	total: 19.7ms	remaining: 19.6s
100:	learn: 2199030.0808863	total: 3.2s	remaining: 28.5s
200:	learn: 2134441.3558143	total: 6.33s	remaining: 25.2s
300:	learn: 2092340.5029213	total: 9.48s	remaining: 22s
400:	learn: 2059297.7821526	total: 12.6s	remaining: 18.8s
500:	learn: 2033005.4729497	total: 15.7s	remaining: 15.6s
600:	learn: 2008472.935

In [51]:
# Mean of each requested metric (looked up under its 'test_' key), rounded to 3 decimals
avg_results = {}
for metric in scoring:
    avg_results[metric] = round(cv_res[f'test_{metric}'].mean(), 3)

print("Cross-validation results:", avg_results)

Cross-validation results: {'neg_root_mean_squared_error': -2099148.204, 'r2': 0.672, 'neg_mean_absolute_error': -1680116.407}
0:	learn: 3577170.1926987	total: 93.4ms	remaining: 1m 33s
100:	learn: 2189946.1277893	total: 3.26s	remaining: 29s
200:	learn: 2122440.5754587	total: 6.37s	remaining: 25.3s
300:	learn: 2082458.7427610	total: 9.52s	remaining: 22.1s
400:	learn: 2051766.0962514	total: 12.6s	remaining: 18.8s
500:	learn: 2025445.1238127	total: 15.7s	remaining: 15.6s
600:	learn: 2000467.6653291	total: 18.9s	remaining: 12.6s
700:	learn: 1978101.0433408	total: 22.1s	remaining: 9.44s
800:	learn: 1958108.8445348	total: 25.3s	remaining: 6.29s
900:	learn: 1940212.9014052	total: 28.5s	remaining: 3.13s
999:	learn: 1922881.6013862	total: 31.3s	remaining: 0us
0:	learn: 3563868.7350051	total: 23.7ms	remaining: 23.7s
100:	learn: 2199030.0808863	total: 2.89s	remaining: 25.7s
200:	learn: 2134441.3558143	total: 6.03s	remaining: 24s
300:	learn: 2092340.5029213	total: 9.22s	remaining: 21.4s
400:	learn: