In [None]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn import model_selection, preprocessing
warnings.filterwarnings("ignore")

In [None]:
# Macro-economic columns to load from macro.csv. List taken from:
# https://www.kaggle.com/robertoruiz/sberbank-russian-housing-market/dealing-with-multicollinearity/notebook
macro_cols = [
    "balance_trade",
    "balance_trade_growth",
    "eurrub",
    "average_provision_of_build_contract",
    "micex_rgbi_tr",
    "micex_cbi_tr",
    "deposits_rate",
    "mortgage_value",
    "mortgage_rate",
    "income_per_cap",
    "rent_price_4+room_bus",
    "museum_visitis_per_100_cap",
    "apartment_build",
]

In [None]:
# Load the three input tables; `timestamp` is parsed as a datetime in each of
# them because it is the key used later to join the macro data.
df_train = pd.read_csv(
    "../input/train.csv",
    parse_dates=['timestamp'],
)
df_test = pd.read_csv(
    "../input/test.csv",
    parse_dates=['timestamp'],
)
df_macro = pd.read_csv(
    "../input/macro.csv",
    parse_dates=['timestamp'],
    # Only the join key plus the selected macro indicators are read.
    usecols=['timestamp'] + macro_cols,
)

# Preview the first rows of the training table.
df_train.head()

In [None]:
# Histogram (50 bins) of the raw, untransformed target column.
ax = df_train.price_doc.hist(bins=50)

In [None]:
# ylog will be log(1+y), as suggested by https://github.com/dmlc/xgboost/issues/446#issuecomment-135555130
ylog_train_all = np.log1p(df_train['price_doc'].values)
id_test = df_test['id']

# Drop the identifier/target columns into NEW frames rather than mutating
# df_train / df_test in place, so this cell can be re-run without a KeyError.
train_features = df_train.drop(['id', 'price_doc'], axis=1)
test_features = df_test.drop(['id'], axis=1)

# Build df_all = (train + test).join(df_macro)
num_train = len(train_features)
df_all = pd.concat([train_features, test_features])
num_rows_before_join = len(df_all)
# A plain left merge keeps the left rows in their original order. The later
# cells slice train/validation/test purely by position (using num_train), and
# the labels in ylog_train_all are aligned by position too, so the join must
# neither reorder nor duplicate rows (an ordered merge would sort by timestamp).
df_all = pd.merge(df_all, df_macro, on='timestamp', how='left')
assert len(df_all) == num_rows_before_join, \
    "macro join changed the number of rows (duplicate timestamps in df_macro?)"
print(df_all.shape)

timestamp_parts = df_all.timestamp.dt

# Add month-year count: number of rows sharing the same (year, month)
month_year = (timestamp_parts.month + timestamp_parts.year * 100)
month_year_cnt_map = month_year.value_counts().to_dict()
df_all['month_year_cnt'] = month_year.map(month_year_cnt_map)

# Add week-year count: number of rows sharing the same (year, ISO week).
# `.dt.weekofyear` was removed in pandas 2.0; prefer `.dt.isocalendar().week`
# when the installed pandas provides it and fall back to the old accessor otherwise.
if hasattr(timestamp_parts, 'isocalendar'):
    # Cast the nullable UInt32 result to float so a missing timestamp becomes NaN
    # (as it did with weekofyear) instead of raising.
    week_of_year = timestamp_parts.isocalendar().week.astype('float64')
else:
    week_of_year = timestamp_parts.weekofyear
week_year = (week_of_year + timestamp_parts.year * 100)
week_year_cnt_map = week_year.value_counts().to_dict()
df_all['week_year_cnt'] = week_year.map(week_year_cnt_map)

# Add month and day-of-week
df_all['month'] = timestamp_parts.month
df_all['dow'] = timestamp_parts.dayofweek

# Other feature engineering
# NOTE: these ratios are computed from the raw columns; a max_floor or full_sq
# of 0 produces inf here.
df_all['rel_floor'] = df_all['floor'] / df_all['max_floor'].astype(float)
df_all['rel_kitch_sq'] = df_all['kitch_sq'] / df_all['full_sq'].astype(float)

# Remove timestamp column (may overfit the model in train)
df_all = df_all.drop(['timestamp'], axis=1)

In [None]:
 # max_floor cleaning
# Blank out implausible max_floor values: more than 57 storeys, zero storeys,
# a one-storey building whose flat is above floor 1, or a flat located above
# the top floor. Comparisons against NaN are False, so applying the union of
# the conditions at once matches applying them one after another.
bad_max_floor = (
    (df_all.max_floor > 57)
    | (df_all.max_floor == 0)
    | ((df_all.floor > 1) & (df_all.max_floor == 1))
    | (df_all.floor > df_all.max_floor)
)
# np.nan (lowercase) is used because the np.NAN alias was removed in NumPy 2.0.
df_all.loc[bad_max_floor, 'max_floor'] = np.nan

# build_year cleaning
# Known bad values mapped to their replacement; np.nan marks values that are
# simply discarded. No replacement is itself a key, so the order is irrelevant.
build_year_fixes = {
    1691: 1961,
    215: 2015,
    4965: 1965,
    20: 2014,
    20052009: 2009,
    0: np.nan,
    1: np.nan,
    71: np.nan,
}
for wrong_year, fixed_year in build_year_fixes.items():
    df_all.loc[df_all.build_year == wrong_year, 'build_year'] = fixed_year

# Rows with build_year == 3 get individual replacements, in row order.
# zip() stops at the shorter sequence, so this does not raise IndexError when
# fewer than two such rows exist (for example when the cell is run a second time).
rows_with_year_3 = df_all.index[df_all.build_year == 3]
for row_label, fixed_year in zip(rows_with_year_3, (2013, 1960)):
    df_all.loc[row_label, 'build_year'] = fixed_year

# full_sq cleaning
# Specific bad values mapped to their replacement.
full_sq_fixes = {
    5326: 53,
    603: 60,
    412: 41,
    407: 40,
    403: 40,
    394: 39,
    388: 39,
    353: 35,
}
for wrong_sq, fixed_sq in full_sq_fixes.items():
    df_all.loc[df_all.full_sq == wrong_sq, 'full_sq'] = fixed_sq

# Areas below 10 are treated as missing.
df_all.loc[df_all.full_sq < 10, 'full_sq'] = np.nan
# Remaining large values are scaled down. The two steps run in sequence, so a
# value that is still above 250 after the division by 100 is divided again.
df_all.loc[df_all.full_sq > 1000, 'full_sq'] /= 100
df_all.loc[df_all.full_sq > 250, 'full_sq'] /= 10

# The ratio features were derived from max_floor / full_sq before the cleaning
# above, so they still carry the bad values (including inf from max_floor == 0).
# Recompute them from the cleaned columns.
df_all['rel_floor'] = df_all['floor'] / df_all['max_floor'].astype(float)
df_all['rel_kitch_sq'] = df_all['kitch_sq'] / df_all['full_sq'].astype(float)

In [None]:
# Deal with categorical values: every object-dtype column is replaced by the
# integer codes from pd.factorize (missing values receive the code -1); all
# other columns are passed through unchanged.
df_numeric = df_all.select_dtypes(exclude=['object'])
object_columns = df_all.select_dtypes(include=['object']).columns

df_obj = pd.DataFrame(
    {col: pd.factorize(df_all[col])[0] for col in object_columns},
    index=df_all.index,
    columns=object_columns,
)

# Non-object columns first, encoded object columns last.
df_values = pd.concat([df_numeric, df_obj], axis=1)

In [None]:
# Convert to numpy values
X_all = df_values.values
print(X_all.shape)

# Create a validation set from the last 20% of the training rows.
num_val = int(num_train * 0.2)
# Single boundary shared by the features and the labels. Slicing the labels
# with [:-num_val] would disagree with the feature slices when num_val is 0
# (an empty training target next to a full training matrix).
split_at = num_train - num_val

X_train_all = X_all[:num_train]
X_train = X_all[:split_at]
X_val = X_all[split_at:num_train]
ylog_train = ylog_train_all[:split_at]
ylog_val = ylog_train_all[split_at:]

X_test = X_all[num_train:]

df_columns = df_values.columns

# Features and labels are matched purely by position, so fail loudly if the
# row counts ever drift apart.
assert X_train.shape[0] == ylog_train.shape[0], "train features/labels are misaligned"
assert X_val.shape[0] == ylog_val.shape[0], "validation features/labels are misaligned"

print('X_train_all shape is', X_train_all.shape)
print('X_train shape is', X_train.shape)
print('y_train shape is', ylog_train.shape)
print('X_val shape is', X_val.shape)
print('y_val shape is', ylog_val.shape)
print('X_test shape is', X_test.shape)

In [None]:
# Wrap each split in an XGBoost DMatrix.
# feature_names is passed as a plain list of strings: XGBoost documents it as a
# sequence of strings, and a pandas Index is not guaranteed to be accepted by
# every XGBoost version.
feature_names = list(df_columns)

dtrain_all = xgb.DMatrix(X_train_all, ylog_train_all, feature_names=feature_names)  # all labelled rows
dtrain = xgb.DMatrix(X_train, ylog_train, feature_names=feature_names)              # training part
dval = xgb.DMatrix(X_val, ylog_val, feature_names=feature_names)                    # validation part
dtest = xgb.DMatrix(X_test, feature_names=feature_names)                            # unlabelled test rows

In [None]:
# NOTE(review): newer XGBoost releases name this objective 'reg:squarederror'
# and replace 'silent' with 'verbosity' — confirm against the installed version
# before changing these keys.
xgb_params = {
    'eta': 0.05,
    'max_depth': 5,
    'subsample': 1.0,
    'colsample_bytree': 0.7,
    'objective': 'reg:linear',
    'eval_metric': 'rmse',
    'silent': 1
}

# Tune the number of boosting rounds: train on the training part and stop once
# the validation RMSE has not improved for 20 consecutive rounds.
partial_model = xgb.train(xgb_params, dtrain, num_boost_round=1000, evals=[(dval, 'val')],
                          early_stopping_rounds=20, verbose_eval=20)

# best_iteration is the zero-based index of the best round, so the number of
# rounds needed to reach it is one more than that index.
num_boost_round = partial_model.best_iteration + 1

In [None]:
# Feature importance of the early-stopped model, limited to the top 50 features.
# A tall single-axes figure keeps the 50 bar labels readable.
fig, ax = plt.subplots(figsize=(8, 16))
xgb.plot_importance(partial_model, ax=ax, height=0.5, max_num_features=50)

In [None]:
# best_iteration is the zero-based index of the best round found by early
# stopping; the number of rounds to train is therefore that index plus one.
num_boost_round = partial_model.best_iteration + 1

In [None]:
# Final model: same hyper-parameters with 'silent' switched to 0, trained on
# every labelled row for the number of rounds chosen above.
final_params = dict(xgb_params)
final_params['silent'] = 0
model = xgb.train(final_params, dtrain_all, num_boost_round=num_boost_round)

In [None]:
# Persist the final booster; the path is relative to the working directory.
model_path = 'naivexgb.model'
model.save_model(model_path)

In [None]:
# Feature importance of the final model (trained on all labelled rows),
# limited to the top 50 features.
fig, ax = plt.subplots(figsize=(8, 16))
xgb.plot_importance(model, ax=ax, height=0.5, max_num_features=50)

In [None]:
# The model was trained on log1p(price), so predictions come back on that scale.
ylog_pred = model.predict(dtest)
# expm1 is the exact inverse of the log1p applied to the target and avoids the
# rounding loss of computing exp(x) - 1 by hand.
y_pred = np.expm1(ylog_pred)

# Submission file: one predicted price per test id.
df_sub = pd.DataFrame({'id': id_test, 'price_doc': y_pred})

df_sub.to_csv('sub.csv', index=False)

In [None]:
# In-sample predictions of the final model on every labelled training row.
# dtrain_all is reused instead of rebinding `dtrain` / `dtest`: overwriting
# `dtrain` with an unlabelled full-train matrix would break the early-stopping
# cell if it were run again afterwards. Labels attached to a DMatrix are not
# used by predict().
# expm1 undoes the log1p transform applied to the target.
naive_train_pred = pd.DataFrame({'price_doc': np.expm1(model.predict(dtrain_all))})

# The train file keeps the default row index as its first column; the test file
# is the submission frame written without an index.
naive_train_pred.to_csv('naive_train_pred.csv')
df_sub.to_csv('naive_test_pred.csv', index = False)