In [2]:
import math
import numpy as np
import pandas as pd
import xgboost as xgb
import matplotlib.pyplot as plt
%matplotlib inline
import os

os.chdir("./sber/")

In [19]:
# From here: https://www.kaggle.com/robertoruiz/sberbank-russian-housing-market/dealing-with-multicollinearity/notebook
macro_vif = ["balance_trade", "balance_trade_growth", "eurrub", "average_provision_of_build_contract",
"micex_rgbi_tr", "micex_cbi_tr", "deposits_rate", "mortgage_value", "mortgage_rate",
"income_per_cap", "rent_price_4+room_bus", "museum_visitis_per_100_cap", "apartment_build"]

In [41]:
# load
df_train = pd.read_csv("train.csv", parse_dates=['timestamp'])
df_test = pd.read_csv("test.csv", parse_dates=['timestamp'])
df_macro = pd.read_csv("macro.csv", parse_dates=['timestamp'], usecols=['timestamp'] + macro_vif)
df_fix = pd.read_csv("BAD_ADDRESS_FIX.csv", sep=",")

id_test = df_test['id']
id_train = df_train['id']

In [42]:
# fix
df_all = pd.concat([df_train, df_test])

fix_cols = df_fix.columns[1:].values
fix_ids = df_fix.iloc[:, 0]

df_all.set_index('id', inplace=True)
df_fix.set_index('id', inplace=True)

def replace_col_value_by_ids(col, df_fix):
    return df_fix.loc[:, col.name]

df_all.loc[fix_ids, fix_cols] = df_all.loc[fix_ids, fix_cols].apply(replace_col_value_by_ids, df_fix=df_fix)

In [43]:
num_train = len(df_train)
y = np.log1p(df_train['price_doc'].values)  # log(y + 1)
df_all.drop('price_doc', axis=1, inplace=True)
df_all = pd.merge_ordered(df_all, df_macro, on='timestamp', how='left')

In [44]:
# Add month-year counts
month_year = (df_all.timestamp.dt.month + df_all.timestamp.dt.year * 100)
month_year_cnt_map = month_year.value_counts().to_dict()
df_all['month_year_cnt'] = month_year.map(month_year_cnt_map)

# Add week-year count
week_year = (df_all.timestamp.dt.weekofyear + df_all.timestamp.dt.year * 100)
week_year_cnt_map = week_year.value_counts().to_dict()
df_all['week_year_cnt'] = week_year.map(week_year_cnt_map)

# Add month and day-of-week
df_all['month'] = df_all.timestamp.dt.month
df_all['dow'] = df_all.timestamp.dt.dayofweek

# Relative squares and floor
df_all['rel_floor'] = df_all['floor'] / df_all['max_floor'].astype(float)
df_all['rel_kitch_sq'] = df_all['kitch_sq'] / df_all['full_sq'].astype(float)

# feature macro selection (VIF)
s = set(df_macro.columns)
macro_columns = list(s.difference(macro_vif))
df_all.drop(macro_columns, axis=1, inplace=True)

# df_all.drop(['timestamp'], axis=1, inplace=True)

In [45]:
# Deal with categorical values
df_numeric = df_all.select_dtypes(exclude=['object'])
df_obj = df_all.select_dtypes(include=['object']).copy()

for c in df_obj:
    df_obj[c] = pd.factorize(df_obj[c])[0]

df_values = pd.concat([df_numeric, df_obj], axis=1)
df_columns = df_values.columns
X_all = df_values.values

In [46]:
# split 

# Create a validation set, with last 20% of data
num_val = int(num_train * 0.2)

X_train_all = X_all[:num_train]
X_train = X_all[:num_train-num_val]
X_val = X_all[num_train-num_val:num_train]
y_train = y[:-num_val]
y_val = y[-num_val:]

X_test = X_all[num_train:]

In [86]:
from xgboost import XGBRegressor

default_params = {
    'learning_rate': 0.05,
    'max_depth': 5,
    'subsample': 1,
    'colsample_bytree': 0.7,
    'gamma': 0,
    'objective': 'reg:linear',
    'silent': 1,
    #'updater': 'grow_gpu',
    'n_estimators': 2000
}

# get best n estimators  
model = XGBRegressor(**default_params)
model.fit(X_train, y_train, eval_set=[(X_val, y_val)], early_stopping_rounds=20, verbose=100)
model.n_estimators = model.best_ntree_limit

[0]	validation_0-rmse:14.4748
Will train until validation_0-rmse hasn't improved in 20 rounds.
[100]	validation_0-rmse:0.454533
[200]	validation_0-rmse:0.4225
Stopping. Best iteration:
[268]	validation_0-rmse:0.421698



In [93]:
model.fit(X_train_all, y)

XGBRegressor(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.7,
       gamma=0, learning_rate=0.05, max_delta_step=0, max_depth=5,
       min_child_weight=1, missing=None, n_estimators=269, nthread=-1,
       objective='reg:linear', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=1, subsample=1)

In [94]:
from sklearn.metrics import mean_squared_error
y_pred = model.predict(X_train_all)

err = np.sqrt(mean_squared_error(y, y_pred) / y.shape[0])
err

0.0023172704156614833

In [32]:
# from xgboost import XGBRegressor

# default_params = {
#     'learning_rate': 0.075,
#     'max_depth': 5,
#     'subsample': 1.0,
#     'colsample_bytree': 0.7,
#     'objective': 'reg:linear',
#     'eval_metric': 'rmse',
#     'silent': 1,
#     'n_estimators': 1000,
#     'updater': 'grow_gpu',
# }

# xgb_model = XGBRegressor(**default_params)

# fit_params = {
#     'eval_set': [(X_val, y_val)], 
#     'early_stopping_rounds': 20, 
#     'verbose': 50
# }

# params = dict(learning_rate = [0.05],
#               max_depth = [5],
#               subsample = [0.9],
#               colsample_bytree = [0.9],
#               gamma = [0, 0.5])

# from sklearn.model_selection import GridSearchCV

# grid_search = GridSearchCV(xgb_model, param_grid=params, cv=3, fit_params=fit_params)
# grid_search.fit(X_train, y_train)

In [33]:
# d = grid_search.cv_results_
# v = list(d.values())
# k = list(d.keys())
# pd.DataFrame(data=v, index=k)

In [74]:
ylog_pred = model.predict(X_test)
y_pred = np.exp(ylog_pred) - 1

df_sub = pd.DataFrame({'id': id_test, 'price_doc': y_pred})
df_sub.to_csv('sub_xgb.csv', index=False)