In [109]:
import numpy as np
import pandas as pd
import re
import matplotlib.pyplot as plt 
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_absolute_error
from catboost import CatBoostRegressor

In [110]:
# Load the train and test splits from CSV files in the working directory.
train_data, test_data = (
    pd.read_csv(csv_path) for csv_path in ("./train.csv", "./test.csv")
)

In [112]:
# Display the training frame; the notebook renders pandas' truncated preview
# (first and last rows, middle columns elided).
# NOTE(review): train_data.head() would keep the saved output smaller.
train_data

Unnamed: 0,id,date,street_id,build_tech,floor,area,rooms,balcon,metro_dist,g_lift,...,kw5,kw6,kw7,kw8,kw9,kw10,kw11,kw12,kw13,price
0,0,2011-1,560,,2,59,3,0,30.0,1.0,...,0,0,0,0,0,0,0,0,0,4510000
1,1,2011-1,667,,10,50,2,1,25.0,,...,0,0,0,0,0,0,0,0,0,13231000
2,2,2011-1,90,0.0,1,48,2,0,25.0,0.0,...,0,0,0,0,0,0,0,0,0,2008000
3,3,2011-1,94,1.0,3,62,3,1,30.0,,...,0,0,0,0,0,0,0,0,0,12680000
4,4,2011-1,232,0.0,3,60,3,0,25.0,,...,0,0,0,0,0,0,0,0,0,3335000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,99995,2012-3,401,0.0,5,30,1,0,30.0,,...,0,0,0,0,0,0,0,0,0,1697000
99996,99996,2012-3,59,0.0,4,34,1,0,30.0,,...,0,0,0,0,0,0,0,0,0,3620000
99997,99997,2012-3,253,,3,52,2,0,5.0,,...,0,0,0,0,0,0,0,0,0,6712000
99998,99998,2012-3,305,1.0,3,181,5,1,5.0,,...,0,0,0,0,0,0,0,0,0,20835000


In [113]:
# Display the test frame; the notebook renders pandas' truncated preview
# (first and last rows, middle columns elided).
# NOTE(review): test_data.head() would keep the saved output smaller.
test_data

Unnamed: 0,id,date,street_id,build_tech,floor,area,rooms,balcon,metro_dist,g_lift,...,kw4,kw5,kw6,kw7,kw8,kw9,kw10,kw11,kw12,kw13
0,100000,2012-3,459,,1,60,3,1,30.0,0.0,...,0,0,0,0,0,0,0,0,0,0
1,100001,2012-3,344,1.0,10,52,2,1,,,...,0,0,0,0,0,0,0,0,0,0
2,100002,2012-3,585,0.0,4,54,3,0,30.0,0.0,...,0,0,0,0,0,0,0,0,0,0
3,100003,2012-3,494,,2,52,2,1,25.0,1.0,...,0,0,0,0,0,0,0,0,0,0
4,100004,2012-3,622,1.0,9,60,3,1,15.0,,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,199995,2013-9,613,1.0,2,34,1,1,25.0,,...,0,0,0,0,0,0,0,0,0,0
99996,199996,2013-9,167,1.0,10,61,3,1,30.0,,...,0,0,0,0,0,0,0,0,0,0
99997,199997,2013-9,391,1.0,9,34,1,1,30.0,,...,0,0,0,0,0,0,0,0,0,0
99998,199998,2013-9,21,0.0,7,58,3,0,30.0,0.0,...,0,0,0,0,0,0,0,0,0,0


Уберём id и n_photos, так как они не повышают точность модели.
Преобразуем дату в тип datetime и разделим её на год и месяц.
Используем эвристики для заполнения пропусков в колонках build_tech (по числу балконов и площади) и g_lift (наличие лифта, по этажу).

In [68]:
def preprocess_data(df):
    """Drop identifier columns and split ``date`` into ``year`` and ``month``.

    The function returns a new frame; the caller's DataFrame is not modified.
    It is idempotent: calling it on an already-preprocessed frame returns an
    equal frame instead of raising ``KeyError``.

    Args:
        df: Input DataFrame. ``id`` and ``n_photos`` are removed when present.
            ``date``, when present, must be parseable with the ``%Y-%m``
            format.

    Returns:
        A new DataFrame without ``id``, ``n_photos`` and ``date``; when
        ``date`` was present, ``year`` and ``month`` columns are appended.
    """
    # errors="ignore" matches the guard already used for ``date`` below, so a
    # re-run on an already-processed frame does not fail.
    df = df.drop(columns=["id", "n_photos"], errors="ignore")

    if "date" in df.columns:
        parsed = pd.to_datetime(df["date"], format="%Y-%m")
        df = df.drop(columns=["date"]).assign(
            year=parsed.dt.year,
            month=parsed.dt.month,
        )
    return df

def fill_build_tech(data):
    """Impute missing ``build_tech`` values and cast the column to ``str``.

    A missing value is replaced by the first rule that matches:

    * ``balcon == 2``  -> 2
    * ``area >= 100``  -> 1
    * otherwise        -> 0

    Afterwards the whole column is converted to ``int`` and then to ``str``.
    The imputation is vectorised and touches only the rows where
    ``build_tech`` is missing, instead of calling a Python function per row.

    Args:
        data: DataFrame with a ``build_tech`` column; ``balcon`` and ``area``
            are read only for rows where ``build_tech`` is missing.
            Modified in place.

    Returns:
        The same DataFrame object that was passed in.
    """
    missing = data["build_tech"].isna()
    if missing.any():
        gaps = data.loc[missing, ["balcon", "area"]]
        # np.select picks the first matching condition, which preserves the
        # priority of the balcony rule over the area rule.
        data.loc[missing, "build_tech"] = np.select(
            [
                (gaps["balcon"] == 2).to_numpy(),
                (gaps["area"] >= 100).to_numpy(),
            ],
            [2, 1],
            default=0,
        )
    data["build_tech"] = data["build_tech"].astype(int).astype(str)
    return data

In [69]:
def fill_g_lift(data):
    """Impute missing ``g_lift`` values from the floor number.

    A missing value becomes 0 when ``floor <= 5`` and 1 otherwise (a missing
    floor therefore also yields 1). Existing values are left untouched. The
    fill is vectorised and compares ``floor`` only on rows where ``g_lift`` is
    missing, instead of calling a Python function per row.

    Args:
        data: DataFrame with a ``g_lift`` column; ``floor`` is read only for
            rows where ``g_lift`` is missing. Modified in place.

    Returns:
        The same DataFrame object that was passed in.
    """
    missing = data["g_lift"].isna()
    if missing.any():
        low_floor = (data.loc[missing, "floor"] <= 5).to_numpy()
        data.loc[missing, "g_lift"] = np.where(low_floor, 0, 1)
    return data

def full_preprocessing(train, test):
    """Apply the complete preprocessing pipeline to the train and test frames.

    Each frame goes through ``preprocess_data``, ``fill_build_tech`` and
    ``fill_g_lift``; afterwards the categorical feature columns that exist in
    the frame are converted to ``str``.

    Args:
        train: Training DataFrame.
        test: Test DataFrame.

    Returns:
        A ``(train, test)`` tuple with the processed frames.
    """
    # Features handed to CatBoost as categorical; they are cast to strings.
    categorical = ('street_id', 'build_tech', 'floor', 'balcon', 'metro_dist')

    prepared = []
    for frame in (train, test):
        frame = fill_g_lift(fill_build_tech(preprocess_data(frame)))
        for name in categorical:
            if name in frame.columns:
                frame[name] = frame[name].astype(str)
        prepared.append(frame)

    train, test = prepared
    return train, test

In [70]:
# Run the shared preprocessing on both splits; this rebinds the raw frames.
train_data, test_data = full_preprocessing(train_data, test_data)

# Columns passed to CatBoost as categorical (same list as in full_preprocessing).
cat_features = ['street_id', 'build_tech', 'floor', 'balcon', 'metro_dist']

# Separate the target from the features, then hold out 20% of rows for validation.
y = train_data["price"]
X = train_data.drop(columns=["price"])
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=12
)

# Hyper-parameter grid: 3 * 3 * 3 * 2 = 54 combinations.
param_grid = dict(
    depth=[4, 6, 8],
    learning_rate=[0.05, 0.1, 0.15],
    l2_leaf_reg=[1, 3, 5],
    iterations=[500, 1000],
)

In [71]:
# Base estimator: CatBoost regressor trained with the MAE loss, silent output.
model = CatBoostRegressor(
    random_state=42,
    verbose=0,
    cat_features=cat_features,
    loss_function='MAE',
)

# 3-fold grid search over param_grid, scored by negative MAE, with n_jobs=-1.
grid_search = GridSearchCV(
    model,
    param_grid,
    cv=3,
    n_jobs=-1,
    scoring='neg_mean_absolute_error',
)

In [None]:
# Run the grid search on the training split.
grid_search.fit(X_train, y_train)

# Keep the estimator the search exposes for the best parameter combination.
best_model = grid_search.best_estimator_
print(f"Лучшие параметры: {grid_search.best_params_}")
# best_score_ is negated because the search scores with 'neg_mean_absolute_error'.
print(f"Лучшее MAE: {-grid_search.best_score_:.2f}")

In [72]:
# Refit the selected model with early stopping monitored on the hold-out split.
best_model.fit(
    X_train, y_train,
    eval_set=(X_val, y_val),
    early_stopping_rounds=50,
    verbose=100
)

# Report the hold-out MAE (it was previously computed but never shown).
# NOTE: X_val also drives early stopping above, so this estimate is optimistic.
y_val_pred = best_model.predict(X_val)
mae = mean_absolute_error(y_val, y_val_pred)
print(f"MAE на валидации: {mae:.2f}")

X_test = test_data.drop(columns=["price"]) if "price" in test_data else test_data
y_pred_res = best_model.predict(X_test)

# Preprocessing dropped the ``id`` column, so take the real ids from the source
# file instead of assuming they are contiguous and start at 100000.
test_ids = pd.read_csv("./test.csv", usecols=["id"])["id"]
if len(test_ids) != len(y_pred_res):
    raise ValueError(
        f"Row count mismatch: {len(test_ids)} ids in ./test.csv "
        f"vs {len(y_pred_res)} predictions"
    )

test_result = pd.DataFrame({
    "id": test_ids.to_numpy(),
    "price": y_pred_res.round(2),  # round to 2 decimal places
})
test_result[["id", "price"]].to_csv("SampleSubmission.csv", index=False)