In [27]:
import pandas as pd

In [28]:
df = pd.read_csv('D:/ds/leopard-challenge/train.csv')

In [29]:
test_new = pd.read_csv('D:/ds/leopard-challenge/test.csv')
test_new['Price'] = 1

In [25]:
df.shape

(18373, 22)

# Выполняем предобработку данных и избавляемся от NaN

In [31]:
df.isna().mean()

Suburb           0.000000
Address          0.000000
Rooms            0.000000
Type             0.000000
Price            0.000000
Method           0.000000
SellerG          0.000000
Date             0.000000
Distance         0.000000
Postcode         0.000000
Bedroom2         0.188755
Bathroom         0.188864
Car              0.194579
Landsize         0.260926
BuildingArea     0.578185
YearBuilt        0.513199
CouncilArea      0.000054
Lattitude        0.181353
Longtitude       0.181353
Regionname       0.000054
Propertycount    0.000054
id               0.000000
dtype: float64

In [32]:
categorical_columns = [c for c in df.columns if df[c].dtype.name == 'object']
numerical_columns   = [c for c in df.columns if df[c].dtype.name != 'object' and df[c].dtype.name != 'datetime64[ns]' and c != 'price']

In [33]:
from sklearn.impute import SimpleImputer
import numpy as np
imputer = SimpleImputer(missing_values=np.nan, strategy='median')

In [34]:
df[numerical_columns] = imputer.fit_transform(df[numerical_columns])
test_new[numerical_columns] = imputer.transform(test_new[numerical_columns])

In [41]:
df['CouncilArea'] = df['CouncilArea'].transform(lambda x: x.fillna('no_info'))
df['Regionname'] = df['Regionname'].transform(lambda x: x.fillna('no_info'))

test_new['CouncilArea'] = test_new['CouncilArea'].transform(lambda x: x.fillna('no_info'))
test_new['Regionname'] = test_new['Regionname'].transform(lambda x: x.fillna('no_info'))

In [45]:
df.isna().mean()

Suburb           0.0
Address          0.0
Rooms            0.0
Type             0.0
Price            0.0
Method           0.0
SellerG          0.0
Date             0.0
Distance         0.0
Postcode         0.0
Bedroom2         0.0
Bathroom         0.0
Car              0.0
Landsize         0.0
BuildingArea     0.0
YearBuilt        0.0
CouncilArea      0.0
Lattitude        0.0
Longtitude       0.0
Regionname       0.0
Propertycount    0.0
id               0.0
dtype: float64

# Начинаем сборку модели машинного обучения

In [47]:
from sklearn.model_selection import train_test_split
train, test =  train_test_split(df,test_size=0.2,random_state=42)

In [52]:
y  = ['Price']
X = list(df.drop(columns = ['Price']).columns)
cat_features = list(df.select_dtypes(include = ['object']).columns)

In [53]:
from catboost import Pool

train_data = Pool(data=train[X], 
                  label=train[y], 
                  cat_features=cat_features)

# Подбор параметров

In [54]:
def objective(trial):
    global train_data

    param = {
        #'task_type':trial.suggest_categorical("task_type", 'CPU'),
        #'devices':trial.suggest_int('devices', 0, 1),
        "n_estimators": trial.suggest_int("n_estimators", 1000, 4000, 200),
        "max_depth": trial.suggest_int("max_depth", 2, 10, 1),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1, 8),
        "random_strength": trial.suggest_float("random_strength", 0.9, 1.4),
        "learning_rate": trial.suggest_float("eta", 1e-2, 1e-1, log=True),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 3, 5),
        "grow_policy": trial.suggest_categorical("grow_policy", ["Lossguide"]),
        #"boosting_type": trial.suggest_categorical("boosting_type", ["Ordered", "Plain"]),
        "eval_metric": trial.suggest_categorical("eval_metric", ["MAPE"]),
        "loss_function": trial.suggest_categorical("loss_function", ["MAE"]),
        "silent": trial.suggest_categorical("silent", [True]),
        #'logging_level': trial.suggest_categorical('logging_level', ['Silent']),
    }
    
    scores = cv(train_data, param, fold_count=5)

    return scores['test-MAPE-mean'].values[-1]

In [56]:
! pip install optuna

Collecting optuna
  Downloading optuna-3.1.1-py3-none-any.whl (365 kB)
     -------------------------------------- 365.7/365.7 kB 3.2 MB/s eta 0:00:00
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.10.4-py3-none-any.whl (212 kB)
     ------------------------------------- 212.9/212.9 kB 12.7 MB/s eta 0:00:00
Collecting cmaes>=0.9.1 (from optuna)
  Downloading cmaes-0.9.1-py3-none-any.whl (21 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.7.0-py2.py3-none-any.whl (11 kB)
Collecting sqlalchemy>=1.3.0 (from optuna)
  Downloading SQLAlchemy-2.0.12-cp310-cp310-win_amd64.whl (2.0 MB)
     ---------------------------------------- 2.0/2.0 MB 11.4 MB/s eta 0:00:00
Collecting PyYAML (from optuna)
  Downloading PyYAML-6.0-cp310-cp310-win_amd64.whl (151 kB)
     -------------------------------------- 151.7/151.7 kB 9.4 MB/s eta 0:00:00
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.2.4-py3-none-any.whl (78 kB)
     -------------------------------

In [60]:
import optuna
from catboost import cv

In [62]:
%time
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=5, show_progress_bar = True)

[32m[I 2023-05-07 14:35:00,703][0m A new study created in memory with name: no-name-e20538ee-38f0-4c35-8ed5-6d68bd980a70[0m


CPU times: total: 0 ns
Wall time: 0 ns


  0%|                                                                                            | 0/5 [00:00<?, ?it/s]

Training on fold [0/5]

bestTest = 0.1493512955
bestIteration = 1396

Training on fold [1/5]

bestTest = 0.1482837712
bestIteration = 1399

Training on fold [2/5]

bestTest = 0.1470203849
bestIteration = 1398

Training on fold [3/5]

bestTest = 0.1497812353
bestIteration = 1399

Training on fold [4/5]


Best trial: 0. Best value: 0.14818:  20%|████████▊                                   | 1/5 [18:23<1:13:34, 1103.53s/it]


bestTest = 0.1464602014
bestIteration = 1399

[32m[I 2023-05-07 14:53:24,223][0m Trial 0 finished with value: 0.14818004568188886 and parameters: {'n_estimators': 1400, 'max_depth': 5, 'subsample': 0.992822084894645, 'l2_leaf_reg': 3.5210642588270127, 'random_strength': 1.3405212959779882, 'eta': 0.027493292466393284, 'min_data_in_leaf': 3, 'grow_policy': 'Lossguide', 'eval_metric': 'MAPE', 'loss_function': 'MAE', 'silent': True}. Best is trial 0 with value: 0.14818004568188886.[0m
Training on fold [0/5]

bestTest = 0.1457338151
bestIteration = 2229

Training on fold [1/5]


Best trial: 0. Best value: 0.14818:  20%|████████▊                                   | 1/5 [35:28<2:21:55, 2128.95s/it]


[33m[W 2023-05-07 15:10:29,642][0m Trial 1 failed with parameters: {'n_estimators': 2400, 'max_depth': 9, 'subsample': 0.8756759818967023, 'l2_leaf_reg': 4.046819277837337, 'random_strength': 1.0173646454211678, 'eta': 0.04249855759074931, 'min_data_in_leaf': 4, 'grow_policy': 'Lossguide', 'eval_metric': 'MAPE', 'loss_function': 'MAE', 'silent': True} because of the following error: KeyboardInterrupt('').[0m
Traceback (most recent call last):
  File "C:\Users\ev708\AppData\Local\Programs\Python\Python310\lib\site-packages\optuna\study\_optimize.py", line 200, in _run_trial
    value_or_values = func(trial)
  File "C:\Users\ev708\AppData\Local\Temp\ipykernel_16284\3597717925.py", line 22, in objective
    scores = cv(train_data, param, fold_count=5)
  File "C:\Users\ev708\AppData\Local\Programs\Python\Python310\lib\site-packages\catboost\core.py", line 6655, in cv
    return _cv(params, pool, fold_count, inverted, partition_random_seed, shuffle, stratified,
  File "_catboost.pyx", li

KeyboardInterrupt: 

In [68]:
study.best_params

{'n_estimators': 1400,
 'max_depth': 5,
 'subsample': 0.992822084894645,
 'l2_leaf_reg': 3.5210642588270127,
 'random_strength': 1.3405212959779882,
 'eta': 0.027493292466393284,
 'min_data_in_leaf': 3,
 'grow_policy': 'Lossguide',
 'eval_metric': 'MAPE',
 'loss_function': 'MAE',
 'silent': True}

In [69]:
from catboost import CatBoostRegressor

In [70]:
model = CatBoostRegressor(**study.best_params)

In [71]:
model.fit(train_data)

<catboost.core.CatBoostRegressor at 0x26bebc559c0>

In [73]:
! pip install joblib



In [76]:
import joblib


In [77]:
joblib.dump(model, "catboost.pkl", compress=9)

['catboost.pkl']

In [78]:
model_catboost_plk = joblib.load("catboost.pkl")

In [79]:
test['predict'] = model_catboost_plk.predict(test[X])

In [84]:
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error

def error(y_true,y_pred):
  print(mean_absolute_error(y_true,y_pred))
  print(mean_absolute_percentage_error(y_true,y_pred))

In [85]:
error(test['Price'],test['predict'])

168080.85415749435
0.1489306372044923


# Загружаем датасет для submission

In [86]:
test_new.shape

(8818, 22)

In [88]:
test_new['Price'] = model_catboost_plk.predict(test_new[X])

In [89]:
test_new.head()

Unnamed: 0,Suburb,Address,Rooms,Type,Method,SellerG,Date,Distance,Postcode,Bedroom2,...,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount,id,Price
0,Abbotsford,513/6 Acacia Pl,3.0,u,S,Dingle,7/10/2017,3.0,3067.0,3.0,...,440.0,126.0,1970.0,Yarra City Council,-37.8036,145.0011,Northern Metropolitan,4019.0,30130.0,906778.0
1,Abbotsford,60 Charles St,3.0,h,S,Jellis,7/10/2017,3.0,3067.0,3.0,...,192.0,109.0,1900.0,Yarra City Council,-37.80792,144.99508,Northern Metropolitan,4019.0,18311.0,1418212.0
2,Abbotsford,38 Studley St,3.0,h,S,Nelson,7/10/2017,3.0,3067.0,3.0,...,159.0,93.0,1890.0,Yarra City Council,-37.80124,144.99459,Northern Metropolitan,4019.0,18934.0,1274792.0
3,Airport West,18 Glenys Av,3.0,h,S,Raine,7/10/2017,10.4,3042.0,3.0,...,536.0,108.0,1980.0,Moonee Valley City Council,-37.72285,144.87539,Western Metropolitan,3464.0,1824.0,774727.4
4,Airport West,35 Thomas St,3.0,h,S,Nelson,7/10/2017,10.4,3042.0,3.0,...,440.0,130.0,2009.0,Moonee Valley City Council,-37.71754,144.87704,Western Metropolitan,3464.0,31339.0,739488.7


In [90]:
test_new[['id','Price']].to_csv('my_sub.csv', index=False)

In [91]:
test_ = pd.read_csv('D:/ds/leopard-challenge/my_sub.csv')

In [92]:
test_

Unnamed: 0,id,Price
0,30130.0,9.067780e+05
1,18311.0,1.418212e+06
2,18934.0,1.274792e+06
3,1824.0,7.747274e+05
4,31339.0,7.394887e+05
...,...,...
8813,11120.0,1.293813e+06
8814,3950.0,6.705707e+05
8815,2209.0,6.391828e+05
8816,23872.0,1.041797e+06
