In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('D:/ds/leopard-challenge/train.csv')

In [3]:
test_new = pd.read_csv('D:/ds/leopard-challenge/test.csv')
test_new['Price'] = 1

In [4]:
test_new.shape

(8818, 22)

In [5]:
df.shape

(18373, 22)

# Выполняем предобработку данных и избавляемся от NaN

In [6]:
df.isna().mean()

Suburb           0.000000
Address          0.000000
Rooms            0.000000
Type             0.000000
Price            0.000000
Method           0.000000
SellerG          0.000000
Date             0.000000
Distance         0.000000
Postcode         0.000000
Bedroom2         0.188755
Bathroom         0.188864
Car              0.194579
Landsize         0.260926
BuildingArea     0.578185
YearBuilt        0.513199
CouncilArea      0.000054
Lattitude        0.181353
Longtitude       0.181353
Regionname       0.000054
Propertycount    0.000054
id               0.000000
dtype: float64

In [7]:
categorical_columns = [c for c in df.columns if df[c].dtype.name == 'object']
numerical_columns   = [c for c in df.columns if df[c].dtype.name != 'object' and df[c].dtype.name != 'datetime64[ns]' and c != 'price']

In [8]:
from sklearn.impute import SimpleImputer
import numpy as np
imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')

In [9]:
df[numerical_columns] = imputer.fit_transform(df[numerical_columns])
test_new[numerical_columns] = imputer.transform(test_new[numerical_columns])

In [10]:
df['CouncilArea'] = df['CouncilArea'].transform(lambda x: x.fillna('no_info'))
df['Regionname'] = df['Regionname'].transform(lambda x: x.fillna('no_info'))

test_new['CouncilArea'] = test_new['CouncilArea'].transform(lambda x: x.fillna('no_info'))
test_new['Regionname'] = test_new['Regionname'].transform(lambda x: x.fillna('no_info'))

In [11]:
df.isna().mean()

Suburb           0.0
Address          0.0
Rooms            0.0
Type             0.0
Price            0.0
Method           0.0
SellerG          0.0
Date             0.0
Distance         0.0
Postcode         0.0
Bedroom2         0.0
Bathroom         0.0
Car              0.0
Landsize         0.0
BuildingArea     0.0
YearBuilt        0.0
CouncilArea      0.0
Lattitude        0.0
Longtitude       0.0
Regionname       0.0
Propertycount    0.0
id               0.0
dtype: float64

# Начинаем сборку модели машинного обучения

In [12]:
from sklearn.model_selection import train_test_split
train, test =  train_test_split(df,test_size=0.2,random_state=42)

In [13]:
y  = ['Price']
X = list(df.drop(columns = ['Price']).columns)
cat_features = list(df.select_dtypes(include = ['object']).columns)

In [14]:
from catboost import Pool

train_data = Pool(data=train[X], 
                  label=train[y], 
                  cat_features=cat_features)

# Подбор параметров

In [15]:
def objective(trial):
    global train_data

    param = {
        #'task_type':trial.suggest_categorical("task_type", 'CPU'),
        #'devices':trial.suggest_int('devices', 0, 1),
        "n_estimators": trial.suggest_int("n_estimators", 1000, 4000, 200),
        "max_depth": trial.suggest_int("max_depth", 2, 10, 1),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1, 8),
        "random_strength": trial.suggest_float("random_strength", 0.9, 1.4),
        "learning_rate": trial.suggest_float("eta", 1e-2, 1e-1, log=True),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 3, 5),
        "grow_policy": trial.suggest_categorical("grow_policy", ["Lossguide"]),
        #"boosting_type": trial.suggest_categorical("boosting_type", ["Ordered", "Plain"]),
        "eval_metric": trial.suggest_categorical("eval_metric", ["MAPE"]),
        "loss_function": trial.suggest_categorical("loss_function", ["MAE"]),
        "silent": trial.suggest_categorical("silent", [True]),
        #'logging_level': trial.suggest_categorical('logging_level', ['Silent']),
    }
    
    scores = cv(train_data, param, fold_count=5)

    return scores['test-MAPE-mean'].values[-1]

In [16]:
import optuna
from catboost import cv
#https://habr.com/ru/articles/704432/

  from .autonotebook import tqdm as notebook_tqdm


In [17]:
# %time
# study = optuna.create_study(direction="maximize")
# study.optimize(objective, n_trials=5, show_progress_bar = True)

[32m[I 2023-05-08 18:35:59,275][0m A new study created in memory with name: no-name-9ba9ebe9-3189-4343-a0d5-6038b56807ab[0m
  self._init_valid()


CPU times: total: 0 ns
Wall time: 0 ns


  0%|                                                                                            | 0/5 [00:00<?, ?it/s]

Training on fold [0/5]

bestTest = 0.1492435183
bestIteration = 1999

Training on fold [1/5]

bestTest = 0.1492415044
bestIteration = 1999

Training on fold [2/5]

bestTest = 0.1477802833
bestIteration = 1996

Training on fold [3/5]

bestTest = 0.1500945053
bestIteration = 1999

Training on fold [4/5]


Best trial: 0. Best value: 0.148857:  20%|████████▌                                  | 1/5 [35:39<2:22:36, 2139.24s/it]


bestTest = 0.1479239844
bestIteration = 1999

[32m[I 2023-05-08 19:11:38,511][0m Trial 0 finished with value: 0.14885676143491938 and parameters: {'n_estimators': 2000, 'max_depth': 7, 'subsample': 0.8134539266733086, 'l2_leaf_reg': 7.951565489233008, 'random_strength': 1.09983765402723, 'eta': 0.010363814896155194, 'min_data_in_leaf': 3, 'grow_policy': 'Lossguide', 'eval_metric': 'MAPE', 'loss_function': 'MAE', 'silent': True}. Best is trial 0 with value: 0.14885676143491938.[0m
Training on fold [0/5]

bestTest = 0.1591381588
bestIteration = 1978

Training on fold [1/5]

bestTest = 0.1608159356
bestIteration = 1997

Training on fold [2/5]

bestTest = 0.1581104602
bestIteration = 1999

Training on fold [3/5]

bestTest = 0.1618389411
bestIteration = 1998

Training on fold [4/5]


Best trial: 1. Best value: 0.159927:  40%|██████████████████                           | 2/5 [40:12<52:04, 1041.39s/it]


bestTest = 0.1596416611
bestIteration = 1964

[32m[I 2023-05-08 19:16:11,411][0m Trial 1 finished with value: 0.15992656026208046 and parameters: {'n_estimators': 2000, 'max_depth': 2, 'subsample': 0.7932186677292388, 'l2_leaf_reg': 5.626725698216935, 'random_strength': 1.0604184280973554, 'eta': 0.050340813245235205, 'min_data_in_leaf': 3, 'grow_policy': 'Lossguide', 'eval_metric': 'MAPE', 'loss_function': 'MAE', 'silent': True}. Best is trial 1 with value: 0.15992656026208046.[0m
Training on fold [0/5]

bestTest = 0.1545232374
bestIteration = 3579

Training on fold [1/5]

bestTest = 0.1569753945
bestIteration = 3599

Training on fold [2/5]

bestTest = 0.1547240494
bestIteration = 3599

Training on fold [3/5]

bestTest = 0.1591887145
bestIteration = 3598

Training on fold [4/5]


Best trial: 1. Best value: 0.159927:  60%|███████████████████████████▌                  | 3/5 [49:04<26:57, 808.85s/it]


bestTest = 0.1542523876
bestIteration = 3595

[32m[I 2023-05-08 19:25:03,523][0m Trial 2 finished with value: 0.15593693333975292 and parameters: {'n_estimators': 3600, 'max_depth': 2, 'subsample': 0.7640703244792189, 'l2_leaf_reg': 4.659490860139067, 'random_strength': 1.0959848389344362, 'eta': 0.05770304503017329, 'min_data_in_leaf': 5, 'grow_policy': 'Lossguide', 'eval_metric': 'MAPE', 'loss_function': 'MAE', 'silent': True}. Best is trial 1 with value: 0.15992656026208046.[0m
Training on fold [0/5]

bestTest = 0.1505366622
bestIteration = 1595

Training on fold [1/5]

bestTest = 0.1495758026
bestIteration = 1599

Training on fold [2/5]

bestTest = 0.1480476673
bestIteration = 1594

Training on fold [3/5]

bestTest = 0.1512713568
bestIteration = 1599

Training on fold [4/5]


Best trial: 1. Best value: 0.159927:  80%|████████████████████████████████████▊         | 4/5 [59:59<12:28, 748.16s/it]


bestTest = 0.1481322012
bestIteration = 1599

[32m[I 2023-05-08 19:35:58,656][0m Trial 3 finished with value: 0.14951614164215438 and parameters: {'n_estimators': 1600, 'max_depth': 4, 'subsample': 0.9676595526289506, 'l2_leaf_reg': 2.2645427456137512, 'random_strength': 1.3545613028140522, 'eta': 0.04241055507634931, 'min_data_in_leaf': 3, 'grow_policy': 'Lossguide', 'eval_metric': 'MAPE', 'loss_function': 'MAE', 'silent': True}. Best is trial 1 with value: 0.15992656026208046.[0m
Training on fold [0/5]

bestTest = 0.1570389519
bestIteration = 998

Training on fold [1/5]

bestTest = 0.1576113695
bestIteration = 999

Training on fold [2/5]

bestTest = 0.154196392
bestIteration = 999

Training on fold [3/5]

bestTest = 0.1579949588
bestIteration = 999

Training on fold [4/5]


Best trial: 1. Best value: 0.159927: 100%|████████████████████████████████████████████| 5/5 [1:04:01<00:00, 768.26s/it]


bestTest = 0.1550682095
bestIteration = 999

[32m[I 2023-05-08 19:40:00,559][0m Trial 4 finished with value: 0.15638208721597313 and parameters: {'n_estimators': 1000, 'max_depth': 4, 'subsample': 0.6192691458871227, 'l2_leaf_reg': 5.549068223772672, 'random_strength': 1.148230182466269, 'eta': 0.01809462862480176, 'min_data_in_leaf': 5, 'grow_policy': 'Lossguide', 'eval_metric': 'MAPE', 'loss_function': 'MAE', 'silent': True}. Best is trial 1 with value: 0.15992656026208046.[0m





In [18]:
study.best_params

{'n_estimators': 2000,
 'max_depth': 2,
 'subsample': 0.7932186677292388,
 'l2_leaf_reg': 5.626725698216935,
 'random_strength': 1.0604184280973554,
 'eta': 0.050340813245235205,
 'min_data_in_leaf': 3,
 'grow_policy': 'Lossguide',
 'eval_metric': 'MAPE',
 'loss_function': 'MAE',
 'silent': True}

In [19]:
best_params1 = {'n_estimators': 2000,
 'max_depth': 2,
 'subsample': 0.7932186677292388,
 'l2_leaf_reg': 5.626725698216935,
 'random_strength': 1.0604184280973554,
 'eta': 0.050340813245235205,
 'min_data_in_leaf': 3,
 'grow_policy': 'Lossguide',
 'eval_metric': 'MAPE',
 'loss_function': 'MAE',
 'silent': True}

In [20]:
from catboost import CatBoostRegressor

In [21]:
model = CatBoostRegressor(**best_params1)

In [22]:
model.fit(train_data)

<catboost.core.CatBoostRegressor at 0x1c6129cf850>

In [23]:
import joblib

In [24]:
joblib.dump(model, "catboost_v2_mode.pkl", compress=9)

['catboost_v2_mode.pkl']

In [25]:
model_catboost_plk = joblib.load("catboost_v2_mode.pkl")

In [26]:
test['predict'] = model_catboost_plk.predict(test[X])

In [27]:
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error

def error(y_true,y_pred):
  print(mean_absolute_error(y_true,y_pred))
  print(mean_absolute_percentage_error(y_true,y_pred))

In [28]:
error(test['Price'],test['predict'])

182575.05277768575
0.1634546585410933


In [29]:
model.get_feature_importance(prettified=True)

Unnamed: 0,Feature Id,Importances
0,Type,22.129792
1,Suburb,20.344826
2,Rooms,16.744124
3,CouncilArea,8.296153
4,Distance,6.356356
5,Regionname,5.932562
6,SellerG,4.699545
7,Bathroom,3.906006
8,BuildingArea,3.370081
9,YearBuilt,2.137879


# Загружаем датасет для submission

In [191]:
test_new.shape

(8818, 22)

In [30]:
test_new['Price'] = model_catboost_plk.predict(test_new[X])

In [31]:
test_new.head()

Unnamed: 0,Suburb,Address,Rooms,Type,Method,SellerG,Date,Distance,Postcode,Bedroom2,...,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount,id,Price
0,Abbotsford,513/6 Acacia Pl,3.0,u,S,Dingle,7/10/2017,3.0,3067.0,3.0,...,0.0,120.0,1970.0,Yarra City Council,-37.8361,144.9966,Northern Metropolitan,4019.0,30130.0,900309.8
1,Abbotsford,60 Charles St,3.0,h,S,Jellis,7/10/2017,3.0,3067.0,3.0,...,192.0,109.0,1900.0,Yarra City Council,-37.80792,144.99508,Northern Metropolitan,4019.0,18311.0,1454189.0
2,Abbotsford,38 Studley St,3.0,h,S,Nelson,7/10/2017,3.0,3067.0,3.0,...,159.0,93.0,1890.0,Yarra City Council,-37.80124,144.99459,Northern Metropolitan,4019.0,18934.0,1307728.0
3,Airport West,18 Glenys Av,3.0,h,S,Raine,7/10/2017,10.4,3042.0,3.0,...,536.0,108.0,1980.0,Moonee Valley City Council,-37.72285,144.87539,Western Metropolitan,3464.0,1824.0,766372.1
4,Airport West,35 Thomas St,3.0,h,S,Nelson,7/10/2017,10.4,3042.0,3.0,...,0.0,130.0,2009.0,Moonee Valley City Council,-37.71754,144.87704,Western Metropolitan,3464.0,31339.0,727820.3


In [32]:
test_new_id_price = test_new[['id','Price']]

In [33]:
test_new_id_price['id'] = test_new_id_price['id'].astype(int)
test_new_id_price

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_new_id_price['id'] = test_new_id_price['id'].astype(int)


Unnamed: 0,id,Price
0,30130,9.003098e+05
1,18311,1.454189e+06
2,18934,1.307728e+06
3,1824,7.663721e+05
4,31339,7.278203e+05
...,...,...
8813,11120,1.276129e+06
8814,3950,7.540082e+05
8815,2209,5.800789e+05
8816,23872,1.007986e+06


In [34]:
test_new_id_price.to_csv('my_sub_new.csv', index=False)

In [234]:
test_ = pd.read_csv('D:/ds/leopard-challenge/my_sub_new.csv')

In [235]:
test_

Unnamed: 0,id,Price
0,30130,8.877361e+05
1,18311,1.399765e+06
2,18934,1.283037e+06
3,1824,7.740891e+05
4,31339,7.637507e+05
...,...,...
8813,11120,1.231989e+06
8814,3950,6.784068e+05
8815,2209,6.244651e+05
8816,23872,1.034314e+06
