In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_curve, auc, mean_absolute_error
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from catboost import CatBoostRegressor
import warnings
warnings.filterwarnings("ignore")


### Подгружаем датасеты

In [2]:
train_df = pd.read_csv("train.csv")
contest_df = pd.read_csv("test.csv")
train_df.head()

Unnamed: 0,id,date,street_id,build_tech,floor,area,rooms,balcon,metro_dist,g_lift,...,kw5,kw6,kw7,kw8,kw9,kw10,kw11,kw12,kw13,price
0,0,2011-1,560,,2,59,3,0,30.0,1.0,...,0,0,0,0,0,0,0,0,0,4510000
1,1,2011-1,667,,10,50,2,1,25.0,,...,0,0,0,0,0,0,0,0,0,13231000
2,2,2011-1,90,0.0,1,48,2,0,25.0,0.0,...,0,0,0,0,0,0,0,0,0,2008000
3,3,2011-1,94,1.0,3,62,3,1,30.0,,...,0,0,0,0,0,0,0,0,0,12680000
4,4,2011-1,232,0.0,3,60,3,0,25.0,,...,0,0,0,0,0,0,0,0,0,3335000


In [3]:
contest_df.shape

(100000, 24)

### Производим небольшую предобработку. Делаем поле даты более информативным, удаляем поле id

In [4]:
train_df.drop("id", axis=1, inplace=True)
contest_df.drop("id", axis=1, inplace=True)

train_df["date_value"] = train_df["date"].apply(
    lambda date: (int(date.split('-')[0]) * 12) + int(date.split('-')[1])
)
contest_df["date_value"] = contest_df["date"].apply(
    lambda date: (int(date.split('-')[0]) * 12) + int(date.split('-')[1])
)
train_df.drop("date", axis=1, inplace=True)
contest_df.drop("date", axis=1, inplace=True)

train_df

Unnamed: 0,street_id,build_tech,floor,area,rooms,balcon,metro_dist,g_lift,n_photos,kw1,...,kw6,kw7,kw8,kw9,kw10,kw11,kw12,kw13,price,date_value
0,560,,2,59,3,0,30.0,1.0,5,0,...,0,0,0,0,0,0,0,0,4510000,24133
1,667,,10,50,2,1,25.0,,1,0,...,0,0,0,0,0,0,0,0,13231000,24133
2,90,0.0,1,48,2,0,25.0,0.0,1,0,...,0,0,0,0,0,0,0,0,2008000,24133
3,94,1.0,3,62,3,1,30.0,,3,0,...,0,0,0,0,0,0,0,0,12680000,24133
4,232,0.0,3,60,3,0,25.0,,3,0,...,0,0,0,0,0,0,0,0,3335000,24133
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,401,0.0,5,30,1,0,30.0,,1,0,...,0,0,0,0,0,0,0,0,1697000,24147
99996,59,0.0,4,34,1,0,30.0,,1,0,...,0,0,0,0,0,0,0,0,3620000,24147
99997,253,,3,52,2,0,5.0,,4,0,...,0,0,0,0,0,0,0,0,6712000,24147
99998,305,1.0,3,181,5,1,5.0,,3,0,...,0,0,0,0,0,0,0,0,20835000,24147


In [5]:
x_train, x_test = train_test_split(train_df, test_size=0.25, shuffle=False)
y_test = x_test["price"]
y_train = x_train["price"]
x_test.drop("price", axis=1)
x_train.drop("price", axis=1)

Unnamed: 0,street_id,build_tech,floor,area,rooms,balcon,metro_dist,g_lift,n_photos,kw1,...,kw5,kw6,kw7,kw8,kw9,kw10,kw11,kw12,kw13,date_value
0,560,,2,59,3,0,30.0,1.0,5,0,...,0,0,0,0,0,0,0,0,0,24133
1,667,,10,50,2,1,25.0,,1,0,...,0,0,0,0,0,0,0,0,0,24133
2,90,0.0,1,48,2,0,25.0,0.0,1,0,...,0,0,0,0,0,0,0,0,0,24133
3,94,1.0,3,62,3,1,30.0,,3,0,...,0,0,0,0,0,0,0,0,0,24133
4,232,0.0,3,60,3,0,25.0,,3,0,...,0,0,0,0,0,0,0,0,0,24133
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
74995,157,,3,48,2,0,0.0,,0,0,...,0,0,0,0,0,0,0,0,0,24154
74996,151,,11,33,1,0,30.0,,0,0,...,0,0,0,0,0,0,0,0,0,24154
74997,645,0.0,4,53,3,0,15.0,1.0,2,0,...,0,0,0,0,0,0,0,0,0,24154
74998,562,1.0,10,51,2,1,15.0,,4,0,...,0,0,0,0,0,0,0,0,0,24154


### Ниже представлено описание датасета. Можем выделить признаки:
</br>
- ["area", "metro_dist", "date_value"] как числовые
</br>
- ["street_id", "floor", "rooms", "balcon", "n_photos"] как бинарные
</br>
- ["g_lift", "kw1", "kw2", "kw3", "kw4", "kw5", "kw6", "kw7", "kw8", 
    "kw9", "kw10", "kw11", "kw12", "kw13"] как категориальные

In [6]:
x_train.describe()

Unnamed: 0,street_id,build_tech,floor,area,rooms,balcon,metro_dist,g_lift,n_photos,kw1,...,kw6,kw7,kw8,kw9,kw10,kw11,kw12,kw13,price,date_value
count,75000.0,52705.0,75000.0,75000.0,75000.0,75000.0,71340.0,52437.0,75000.0,75000.0,...,75000.0,75000.0,75000.0,75000.0,75000.0,75000.0,75000.0,75000.0,75000.0,75000.0
mean,334.814907,0.530519,5.265253,52.31112,2.119747,0.39816,22.131833,0.501097,2.475227,0.055853,...,0.00156,0.000947,0.008187,0.007813,0.002147,0.000347,0.001173,0.000147,5005430.0,24139.518093
std,194.086457,0.574041,3.90818,17.90216,0.830134,0.54802,8.296504,0.500004,1.948277,0.22964,...,0.039466,0.030754,0.09011,0.088048,0.046283,0.018616,0.034234,0.01211,4092644.0,4.576517
min,0.0,0.0,1.0,29.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,448000.0,24133.0
25%,167.0,0.0,2.0,40.0,1.0,0.0,15.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2577000.0,24136.0
50%,335.0,0.0,4.0,52.0,2.0,0.0,25.0,1.0,2.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3976500.0,24139.0
75%,502.0,1.0,7.0,60.0,3.0,1.0,30.0,1.0,4.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6113000.0,24143.0
max,671.0,2.0,25.0,217.0,6.0,2.0,30.0,1.0,11.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,80548000.0,24154.0


### Опишем общую функцию для работы с моделями
</br>
Данная функция принимает модель, обучающий, валидационный и тестовый датасеты и делает предикт. Также можно получить лучшие гиперпараметры с помощью кросс-валидации

In [7]:
def trainModel(x_train, y_train, x_test, y_test, x_contest, model, 
               roc_auc=False, cv=False, grid_params=None, write_csv=False):
    numeric_features = ["area", "metro_dist", "date_value"]
    categorical_features = ["street_id", "floor", "rooms", "balcon", 
                            "n_photos"]
    binary_features = ["g_lift", "kw1", "kw2", "kw3", "kw4", "kw5", "kw6", "kw7", "kw8",
                    "kw9", "kw10", "kw11", "kw12", "kw13"]

    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler())
    ])

    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])

    binary_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categorical_features),
            ('bin', binary_transformer, binary_features), # Заменить
        ]
    )

    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('model', model)
    ])

    model = pipeline.fit(x_train, y_train)
    y_valid = model.predict(x_test)
    print(mean_absolute_error(y_test, y_valid))

    if cv:
        grid_search = GridSearchCV(model, grid_params, cv=3, n_jobs=-1, verbose=4)
        grid_search.fit(x_train, y_train)
        print("Best parameters:", grid_search.best_params_)
        return

    if write_csv:
        y_predict = model.predict(x_contest)
        result_df = pd.DataFrame({'id': x_contest.index + 100_000, 'price': y_predict})
        result_df.to_csv('submission.csv', index=False)

### Модель линейной регрессии

In [8]:
model = LinearRegression()

trainModel(x_train, y_train, x_test, y_test, contest_df, model, write_csv=True)

1119647.339652951


__Результат:__ Получили MAE 1119647.339652951

### Модель RandomForestRegressor

Подберем оптимальные гиперпараметры

In [15]:
model = RandomForestRegressor()
grid_params = {
    'model__n_estimators': [200, 220],
    'model__max_depth': [10],
    'model__min_samples_split': [10, 12],
    'model__min_samples_leaf': [2, 3],
}

trainModel(x_train, y_train, x_test, y_test, contest_df, model, cv=True, grid_params=grid_params)

1478678.9574999998
Fitting 3 folds for each of 8 candidates, totalling 24 fits
Best parameters: {'model__max_depth': 10, 'model__min_samples_leaf': 2, 'model__min_samples_split': 10, 'model__n_estimators': 200}


__Результат:__ Best parameters: {'model__max_depth': 10, 'model__min_samples_leaf': 2, 'model__min_samples_split': 10, 'model__n_estimators': 200}

Получим предикт

In [14]:
model = RandomForestRegressor(max_depth=10, n_estimators=220, min_samples_leaf=2, min_samples_split=10)

trainModel(x_train, y_train, x_test, y_test, contest_df, model, write_csv=True)

1908044.5161523316


__Результат:__ Получили MAE 1908044.5161523316

### Модель GradientBoostingRegressor

Подберем оптимальные параметры

In [19]:
model = GradientBoostingRegressor()
grid_params = {
    'model__n_estimators': [400, 420],
    'model__max_depth': [8, 10],
    'model__learning_rate': [0.1, 0.01],
}

trainModel(x_train, y_train, x_test, y_test, contest_df, model, cv=True, grid_params=grid_params)

1887893.0492994662
Fitting 3 folds for each of 8 candidates, totalling 24 fits
Best parameters: {'model__learning_rate': 0.1, 'model__max_depth': 10, 'model__n_estimators': 420}


__Результат:__ Best parameters: {'model__learning_rate': 0.1, 'model__max_depth': 10, 'model__n_estimators': 420}

Получим предикт

In [20]:
model = GradientBoostingRegressor(learning_rate=0.1, max_depth=10, n_estimators=420)

trainModel(x_train, y_train, x_test, y_test, contest_df, model, write_csv=True)

1125907.7445560887


__Результат:__ Получили MAE 1125907.7445560887

### Модель CatBoostRegressor

Подберем оптимальные параметры

In [9]:
model = CatBoostRegressor()
grid_params = {
    'model__learning_rate': [0.1, 0.01],
    'model__depth': [8, 10],
    'model__iterations': [1000, 1200],
}
trainModel(x_train, y_train, x_test, y_test, contest_df, model, cv=True, grid_params=grid_params)

Learning rate set to 0.080991
0:	learn: 3934025.8391675	total: 152ms	remaining: 2m 32s
1:	learn: 3796693.2939461	total: 160ms	remaining: 1m 19s
2:	learn: 3673742.3914602	total: 167ms	remaining: 55.6s
3:	learn: 3567102.1628122	total: 174ms	remaining: 43.4s
4:	learn: 3471155.0309148	total: 181ms	remaining: 36.1s
5:	learn: 3389565.4208490	total: 188ms	remaining: 31.1s
6:	learn: 3310759.9517384	total: 195ms	remaining: 27.7s
7:	learn: 3242378.6001618	total: 203ms	remaining: 25.1s
8:	learn: 3182225.3627464	total: 209ms	remaining: 23s
9:	learn: 3127207.5861496	total: 215ms	remaining: 21.3s
10:	learn: 3079338.7667407	total: 222ms	remaining: 20s
11:	learn: 3037775.1127837	total: 229ms	remaining: 18.8s
12:	learn: 3001015.8361867	total: 235ms	remaining: 17.9s
13:	learn: 2970269.0337867	total: 242ms	remaining: 17s
14:	learn: 2942354.7750698	total: 248ms	remaining: 16.3s
15:	learn: 2916642.1149732	total: 255ms	remaining: 15.7s
16:	learn: 2894598.5970997	total: 262ms	remaining: 15.1s
17:	learn: 2873

__Результат:__ Best parameters: {'model__depth': 10, 'model__iterations': 1200, 'model__learning_rate': 0.1}

Получим предикт

In [10]:
model = CatBoostRegressor(learning_rate=0.1, max_depth=10, n_estimators=1200)

trainModel(x_train, y_train, x_test, y_test, contest_df, model, write_csv=True)

0:	learn: 3891608.0588345	total: 30.7ms	remaining: 36.8s
1:	learn: 3720241.5012659	total: 61.6ms	remaining: 36.9s
2:	learn: 3570703.7257349	total: 92.4ms	remaining: 36.9s
3:	learn: 3440912.2938332	total: 121ms	remaining: 36.2s
4:	learn: 3332689.9944619	total: 151ms	remaining: 36.1s
5:	learn: 3232205.7146648	total: 181ms	remaining: 36s
6:	learn: 3149785.7261189	total: 211ms	remaining: 35.9s
7:	learn: 3077901.6581842	total: 241ms	remaining: 35.9s
8:	learn: 3017929.8409464	total: 273ms	remaining: 36.1s
9:	learn: 2962340.0195605	total: 305ms	remaining: 36.3s
10:	learn: 2914834.7203501	total: 337ms	remaining: 36.4s
11:	learn: 2874614.2281650	total: 367ms	remaining: 36.4s
12:	learn: 2839796.4334782	total: 398ms	remaining: 36.4s
13:	learn: 2810661.0895232	total: 430ms	remaining: 36.4s
14:	learn: 2782635.7984489	total: 460ms	remaining: 36.4s
15:	learn: 2757757.5104403	total: 491ms	remaining: 36.4s
16:	learn: 2734245.7041021	total: 522ms	remaining: 36.3s
17:	learn: 2713800.7524621	total: 553ms	

__Результат:__ Получили MAE 948239.9468025046

### Выберем две модели, показавшие лучшие результаты на валидационной выборке. Попробуем подставить параметры вручную

In [13]:
model = GradientBoostingRegressor(learning_rate=0.1, max_depth=10, n_estimators=1000)

trainModel(x_train, y_train, x_test, y_test, contest_df, model, write_csv=True)

949490.0736967775


In [12]:
model = CatBoostRegressor(learning_rate=0.1, max_depth=10, n_estimators=12000)

trainModel(x_train, y_train, x_test, y_test, contest_df, model, write_csv=True)

0:	learn: 3891608.0588345	total: 32.1ms	remaining: 6m 24s
1:	learn: 3720241.5012659	total: 62.7ms	remaining: 6m 15s
2:	learn: 3570703.7257349	total: 92.9ms	remaining: 6m 11s
3:	learn: 3440912.2938332	total: 123ms	remaining: 6m 9s
4:	learn: 3332689.9944619	total: 154ms	remaining: 6m 10s
5:	learn: 3232205.7146648	total: 184ms	remaining: 6m 8s
6:	learn: 3149785.7261189	total: 216ms	remaining: 6m 10s
7:	learn: 3077901.6581842	total: 247ms	remaining: 6m 10s
8:	learn: 3017929.8409464	total: 281ms	remaining: 6m 13s
9:	learn: 2962340.0195605	total: 323ms	remaining: 6m 26s
10:	learn: 2914834.7203501	total: 365ms	remaining: 6m 38s
11:	learn: 2874614.2281650	total: 401ms	remaining: 6m 40s
12:	learn: 2839796.4334782	total: 432ms	remaining: 6m 38s
13:	learn: 2810661.0895232	total: 462ms	remaining: 6m 35s
14:	learn: 2782635.7984489	total: 494ms	remaining: 6m 34s
15:	learn: 2757757.5104403	total: 524ms	remaining: 6m 32s
16:	learn: 2734245.7041021	total: 554ms	remaining: 6m 30s
17:	learn: 2713800.7524

### Вывод

Модель CatBoostRegressor показала лучший результат в контесте