In [1]:
import numpy as np
import pandas as pd
from scipy import stats

from sklearn.metrics import mean_absolute_error, make_scorer
from sklearn.model_selection import train_test_split, cross_val_score

from tqdm import tqdm
tqdm.pandas()

import seaborn as sns
sns.set(font_scale=1.5)

import matplotlib.pyplot as plt
import matplotlib.style as style
%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

  from pandas import Panel


In [3]:
# Read the pre-split tables in one pass. The file named "X_test" is bound to
# X_val because it is used as the validation split for the rest of the notebook.
X_train, X_val = map(pd.read_csv, ('dataset/X_train.csv', 'dataset/X_test.csv'))

In [4]:
# The last column of the training table is taken as the target; every other
# column stays as a feature.
# NOTE(review): the split is positional, so re-running this cell on the already
# trimmed frame would strip one more column — run it once per load.
X_train, y_train = X_train.iloc[:, :-1], X_train.iloc[:, -1]

In [5]:
# Same positional split for the validation table: last column is the target,
# the rest are features.
# NOTE(review): not idempotent — a second run removes another column.
X_val, y_val = X_val.iloc[:, :-1], X_val.iloc[:, -1]

In [6]:
# Preview the first rows of the feature table (head() shows 5 rows by default).
X_train.head()

Unnamed: 0,id,date,street_id,build_tech,floor,area,rooms,balcon,metro_dist,g_lift,...,kw8,kw9,kw10,kw11,kw12,kw13,mean_square_root_price,avg_room_area,area_and_balcon,mean_street_floor_square_price
0,0,2011-1,616,0.0,4,43,2,0,0.0,1.0,...,0,0,0,0,0,0,60749.113126,21.5,43.0,64713.953488
1,1,2011-1,112,0.0,3,33,1,0,15.0,1.0,...,0,0,0,0,0,0,70040.491118,33.0,33.0,61434.343434
2,2,2011-1,230,0.0,9,34,1,0,5.0,1.0,...,0,0,0,0,0,0,120319.574889,34.0,34.0,115369.747899
3,3,2011-1,302,1.0,4,60,3,0,15.0,0.0,...,0,0,0,0,0,0,67595.6,20.0,60.0,68161.458333
4,4,2011-1,578,0.0,3,49,2,0,0.0,0.0,...,0,0,0,0,0,0,46023.34988,24.5,49.0,46278.571429


In [7]:
from catboost import CatBoostRegressor

In [8]:
# Columns handed to CatBoost as categorical features (see the model cell below).
cat_features = ['street_id', 'build_tech', 'balcon', 'date']

# Remove the `id` column before training (presumably a row identifier rather
# than a real feature — confirm against the dataset description).
# `columns=` already names the axis, so the previous `axis=0` argument was
# ignored by pandas and only misled the reader. Reassigning the result instead
# of using `inplace=True` avoids mutating a frame obtained from an `.iloc` slice.
X_train = X_train.drop(columns=['id'])
X_val = X_val.drop(columns=['id'])

In [9]:
# Cast the float-encoded `build_tech` and `metro_dist` columns to integers in a
# single pass; `build_tech` is listed in `cat_features`.
X_train = X_train.astype({'build_tech': int, 'metro_dist': int})

In [10]:
# Apply the same integer casts to the validation features so both splits share
# identical dtypes.
X_val = X_val.astype({'build_tech': int, 'metro_dist': int})

In [11]:
# One shared CatBoost configuration: fitted first on the train/validation split
# and later re-fitted on the full training table.
# NOTE(review): no `loss_function` is passed, so the library's default objective
# is optimised while MAE is only the reported metric — confirm this is intended.
Catboost = CatBoostRegressor(
    # boosting schedule
    n_estimators=1000,
    learning_rate=0.03,
    # tree shape, regularisation and sampling
    depth=6,
    l2_leaf_reg=3,
    bagging_temperature=1,
    # leaf value estimation
    leaf_estimation_method='Newton',
    leaf_estimation_iterations=5,
    # categorical feature handling
    cat_features=cat_features,
    max_ctr_complexity=4,
    # metric shown in the training log
    eval_metric='MAE',
)

In [12]:
# Fit on the training split while reporting the metric on the validation split;
# a log line is printed every 100 iterations.
Catboost.fit(
    X=X_train,
    y=y_train,
    eval_set=[(X_val, y_val)],
    verbose=100,
)

0:	learn: 2582247.1040332	test: 2836503.9910044	best: 2836503.9910044 (0)	total: 187ms	remaining: 3m 6s
100:	learn: 849963.5227978	test: 1128635.0484014	best: 1128635.0484014 (100)	total: 10.6s	remaining: 1m 34s
200:	learn: 705782.8075413	test: 999534.3057777	best: 998517.1597952 (197)	total: 24.7s	remaining: 1m 38s
300:	learn: 632605.2679091	test: 942928.1263135	best: 942928.1263135 (300)	total: 38.1s	remaining: 1m 28s
400:	learn: 576567.5918752	test: 899057.8634852	best: 899057.8634852 (400)	total: 51.8s	remaining: 1m 17s
500:	learn: 538355.7823400	test: 874120.7055154	best: 874120.7055154 (500)	total: 1m 4s	remaining: 1m 4s
600:	learn: 512479.7747196	test: 853323.1085379	best: 852789.4934990 (599)	total: 1m 18s	remaining: 51.8s
700:	learn: 494371.6178123	test: 841126.7615332	best: 841126.7615332 (700)	total: 1m 30s	remaining: 38.8s
800:	learn: 478219.6880390	test: 829334.7478146	best: 829334.7478146 (800)	total: 1m 42s	remaining: 25.5s
900:	learn: 464394.6975354	test: 820686.9603955

<catboost.core.CatBoostRegressor at 0x1f2557b7d48>

In [13]:
# Predict the validation split, then derive a single multiplicative correction:
# the mean ratio of true to predicted target.
# NOTE(review): `coef` is estimated from the very validation targets that the
# scaled predictions are scored against in the next cell, so that score is
# optimistic — confirm on data not used to fit `coef`.
y_pred = Catboost.predict(X_val)
coef = (y_val / y_pred).mean()

In [14]:
# Validation MAE of the predictions after applying the scaling coefficient.
mean_absolute_error(y_true=y_val, y_pred=coef * y_pred)

550846.3526445916

In [15]:
# Load the full training table and the test table used for the submission.
Train, Test = map(pd.read_csv, ('dataset/Train_f.csv', 'dataset/Test_f.csv'))

In [17]:
# Keep the `price` column as the regression target, then remove it and the `id`
# column from the features. `id` is likewise removed from the test features.
# `columns=` already names the axis, so the previous `axis=0` argument was
# ignored by pandas; reassigning instead of `inplace=True` leaves the frames
# returned by `read_csv` untouched.
target = Train['price']
Train = Train.drop(columns=['id', 'price'])
Test = Test.drop(columns=['id'])

In [18]:
# Same integer casts as applied to the train/validation split, here for the
# full training table.
Train = Train.astype({'build_tech': int, 'metro_dist': int})

In [19]:
# Integer casts for the test table so its dtypes match the training features.
Test = Test.astype({'build_tech': int, 'metro_dist': int})

In [21]:
# Re-fit the same estimator object on the full training table. No eval_set is
# given here, so the log below reports only the training metric.
Catboost.fit(X=Train, y=target, verbose=100)

0:	learn: 2656830.0056109	total: 119ms	remaining: 1m 59s
100:	learn: 874781.4736265	total: 10.4s	remaining: 1m 32s
200:	learn: 712743.9535773	total: 21.6s	remaining: 1m 25s
300:	learn: 638722.5282593	total: 36.2s	remaining: 1m 23s
400:	learn: 584516.6889713	total: 49.8s	remaining: 1m 14s
500:	learn: 549770.0854802	total: 1m 2s	remaining: 1m 2s
600:	learn: 524800.3635548	total: 1m 15s	remaining: 50.2s
700:	learn: 505313.8312938	total: 1m 28s	remaining: 37.8s
800:	learn: 492033.9193854	total: 1m 42s	remaining: 25.4s
900:	learn: 478974.8247241	total: 1m 55s	remaining: 12.7s
999:	learn: 468669.7007672	total: 2m 8s	remaining: 0us


<catboost.core.CatBoostRegressor at 0x1f2557b7d48>

In [26]:
# Start from the sample submission file and fill `price` with the test
# predictions, scaled by `coef` and rounded to the nearest thousand.
submission = pd.read_csv('dataset/property-prices/SampleSubmission.csv')
submission['price'] = (coef * Catboost.predict(Test)).round(-3)

In [28]:
# Sanity check: average submitted price.
submission.price.mean()

5904740.43

In [27]:
# Write the submission without the row index. `index` is documented as a bool,
# so pass `False` explicitly rather than relying on `None` being falsy.
submission.to_csv('submission_1.csv', index=False)

**submission_0**: all-zeros baseline — error *5851954.56518* <br>
**submission_1**: CatBoost (predictions scaled by `coef`) — error *820071.17342*, with the model configured as:

```python
Catboost = CatBoostRegressor(
    depth=6,
    n_estimators=1000,
    learning_rate=0.03,
    max_ctr_complexity=4,
    leaf_estimation_iterations=5,
    l2_leaf_reg=3,
    bagging_temperature=1,
    leaf_estimation_method='Newton',
    cat_features=cat_features,
    eval_metric='MAE',
)
```