In [1]:
import math
import numpy as np
import pandas as pd
import xgboost as xgb
import matplotlib.pyplot as plt
%matplotlib inline
import os

os.chdir("./sber/")

In [2]:
# From here: https://www.kaggle.com/robertoruiz/sberbank-russian-housing-market/dealing-with-multicollinearity/notebook
macro_vif = ["balance_trade", "balance_trade_growth", "eurrub", "average_provision_of_build_contract",
"micex_rgbi_tr", "micex_cbi_tr", "deposits_rate", "mortgage_value", "mortgage_rate",
"income_per_cap", "rent_price_4+room_bus", "museum_visitis_per_100_cap", "apartment_build"]

In [3]:
# load
df_train = pd.read_csv("train.csv", parse_dates=['timestamp'])
df_test = pd.read_csv("test.csv", parse_dates=['timestamp'])
df_macro = pd.read_csv("macro.csv", parse_dates=['timestamp'])
df_fix = pd.read_csv("BAD_ADDRESS_FIX.csv", sep=",")

id_test = df_test['id']
id_train = df_train['id']

In [4]:
# fix
df_all = pd.concat([df_train, df_test])

fix_cols = df_fix.columns[1:].values
fix_ids = df_fix.iloc[:, 0]

df_all.set_index('id', inplace=True)
df_fix.set_index('id', inplace=True)

def replace_col_value_by_ids(col, df_fix):
    return df_fix.loc[:, col.name]

df_all.loc[fix_ids, fix_cols] = df_all.loc[fix_ids, fix_cols].apply(replace_col_value_by_ids, df_fix=df_fix)

In [5]:
num_train = len(df_train)
y = np.log1p(df_train['price_doc'].values)  # log(y + 1)
df_all.drop('price_doc', axis=1, inplace=True)
df_all = pd.merge_ordered(df_all, df_macro, on='timestamp', how='left')

In [6]:
# Add month-year counts
month_year = (df_all.timestamp.dt.month + df_all.timestamp.dt.year * 100)
month_year_cnt_map = month_year.value_counts().to_dict()
df_all['month_year_cnt'] = month_year.map(month_year_cnt_map)

# Add week-year count
week_year = (df_all.timestamp.dt.weekofyear + df_all.timestamp.dt.year * 100)
week_year_cnt_map = week_year.value_counts().to_dict()
df_all['week_year_cnt'] = week_year.map(week_year_cnt_map)

# Add month and day-of-week
df_all['month'] = df_all.timestamp.dt.month
df_all['dow'] = df_all.timestamp.dt.dayofweek

# Relative squares and floor
df_all['rel_floor'] = df_all['floor'] / df_all['max_floor'].astype(float)
df_all['rel_kitch_sq'] = df_all['kitch_sq'] / df_all['full_sq'].astype(float)

# feature macro selection (VIF)
s = set(df_macro.columns)
macro_columns = list(s.difference(macro_vif))
df_all.drop(macro_columns, axis=1, inplace=True)

# df_all.drop(['timestamp'], axis=1, inplace=True)

In [7]:
# one hot encoding

def make_dummies(df):
    
    """ make one hot encoding of object columns with pd dummies
    + delete last column
    df: pandas dataframe 
    """

    from sklearn.preprocessing import OneHotEncoder
    cat_vars = df.loc[:, df.dtypes == 'object'].columns
    tmp = pd.DataFrame()
    for var in cat_vars:
        dummy = pd.get_dummies(df.loc[:, var], prefix=var, prefix_sep="_").iloc[:, :-1]
        if tmp.empty:
            tmp = dummy
        else:
            tmp = pd.concat([tmp, dummy], axis=1)
    df.drop(cat_vars, axis=1, inplace=True)
    
    return pd.concat([df, tmp], axis=1)

df_all_dummy = make_dummies(df_all)
df_all_dummy.shape

df_all_dummy.loc[np.isinf(df_all_dummy.loc[:, 'rel_floor']), 'rel_floor' ] = np.nan # fix inf in rel floor
df_all = df_all_dummy
X_all = df_all.values

In [8]:
# split 

# Create a validation set, with last 20% of data
num_val = int(num_train * 0.2)

X_train_all = X_all[:num_train]
X_train = X_all[:num_train-num_val]
X_val = X_all[num_train-num_val:num_train]
y_train = y[:-num_val]
y_val = y[-num_val:]

X_test = X_all[num_train:]

In [9]:
# # split
# X, X_test = df_values.loc[id_train], df_values.loc[id_test]
# # X, X_test = df_all_dummy.loc[id_train], df_all_dummy.loc[id_test]

# from sklearn.model_selection import train_test_split
# X_train, X_val, y_train, y_val = train_test_split(
#     X, y, test_size=0.2, random_state=42)

In [10]:
from sklearn.preprocessing import Imputer
imp = Imputer(strategy="median")

X_imp = imp.fit_transform(X_train_all)
X_train_imp = imp.transform(X_train)
X_val_imp = imp.transform(X_val)
X_test_imp = imp.transform(X_test)

# scaling
from sklearn.preprocessing import StandardScaler
scl = StandardScaler()

X_scl = scl.fit_transform(X_imp)
X_train_scl = scl.transform(X_train_imp)
X_val_scl = scl.transform(X_val_imp)
X_test_scl = scl.transform(X_test_imp)

In [11]:
# from sklearn.feature_selection import SelectKBest
# from sklearn.feature_selection import f_regression
# X_F = SelectKBest(f_regression, k=200).fit_transform(X_scl, y)

In [12]:
# from sklearn.linear_model import Ridge
# from sklearn.feature_selection import RFE
# rfe = RFE(estimator=Ridge(), n_features_to_select=1, step=1)
# rfe.fit(X_scl, y)

In [40]:
# X_rfe = X_scl[:, rfe.ranking_ < 101]
# X_train_rfe = X_train_scl[:, rfe.ranking_ < 101]
# X_val_rfe = X_val_scl[:, rfe.ranking_ < 101]

In [52]:
from xgboost import XGBRegressor

default_params = {
    'learning_rate': 0.05,
    'max_depth': 5,
    'subsample': 1,
    'colsample_bytree': 0.7,
    'gamma': 0,
    'objective': 'reg:linear',
    'silent': 1,
    #'updater': 'grow_gpu',
    'n_estimators': 300
}

# get best n estimators  
model = XGBRegressor(**default_params)
# model.fit(X_train_rfe, y_train, eval_set=[(X_val_rfe, y_val)], early_stopping_rounds=20, verbose=100)

In [10]:
# from sklearn.decomposition import PCA, KernelPCA
# # kpca = KernelPCA(kernel="rbf", fit_inverse_transform=True, gamma=10) n_components + n_jobs 
# # X_kpca = kpca.fit_transform(X_train_imp)

# pca = PCA(n_components=150)
# train_pca = pca.fit_transform(X_train_imp)
# val_pca = pca.transform(X_val_imp)

In [95]:
# from sklearn.linear_model import Ridge
# ridge = Ridge(alpha=0.01)
# ridge.fit(X_train_scl, y_train)

Ridge(alpha=0.01, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

In [57]:
# from sklearn.model_selection import GridSearchCV
# from sklearn.pipeline import Pipeline

# pipe = Pipeline([('pca', PCA()), ('ada', AdaBoostRegressor())])
# params = dict(pca__n_components=[100, 150],
#               ada__n_estimators=[20],
#               ada__learning_rate=[0.5],
#              )
# grid_search = GridSearchCV(pipe, param_grid=params, cv=5, verbose=2)
# grid_search.fit(X_train_imp, y_train)

In [58]:
from sklearn.decomposition import PCA, KernelPCA
from sklearn.preprocessing import PolynomialFeatures

# pca
pca = PCA(n_components=20)
pca_train = pca.fit(X_train_scl)
pca_val = pca.transform(X_train_val)

# kpca = KernelPCA(kernel="rbf", fit_inverse_transform=True, gamma=10)

NameError: name 'X_train_val' is not defined

In [57]:
# poly feat
poly2 = PolynomialFeatures(degree=2)
X_train_poly2 = poly2.fit_transform(pca_train)
X_train_poly2

NameError: name 'pca_train' is not defined

In [52]:
from sklearn.ensemble import AdaBoostRegressor
from sklearn.linear_model import Ridge
from sklearn.svm import SVR
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet

model = ElasticNet(alpha=0.01, l1_ratio=0.01, max_iter=2000)
# model.fit(X_train_scl, y_train)

In [51]:
from sklearn.metrics import mean_squared_error
y_pred = model.predict(X_val_scl)
err = np.sqrt(mean_squared_error(y_val, y_pred) / y_val.shape[0])
err

0.0056597256776318673

In [53]:
model = model.fit(X_scl, y)

ylog_pred = model.predict(X_test_scl)
y_pred = np.exp(ylog_pred) - 1

df_sub = pd.DataFrame({'id': id_test, 'price_doc': y_pred})
df_sub.to_csv('sub_elastic.csv', index=False)



In [31]:
# ensemble 
ada = pd.read_csv('sub_ada.csv', usecols=['price_doc'])
svr = pd.read_csv('sub_svr_rbf.csv', usecols=['price_doc'])
ela = pd.read_csv('sub_elastic.csv', usecols=['price_doc'])
xgb = pd.read_csv('sub_xgb.csv', usecols=['price_doc'])
rid = pd.read_csv('sub_ridge.csv', usecols=['price_doc'])
las = pd.read_csv('sub_lasso.csv', usecols=['price_doc'])

ens = pd.concat([ada, svr, ela, xgb, rid, las], axis=1)
ens.columns = ['ada','svr','ela','xgb', 'rid', 'las']
ens.corr()
ens = ens.loc[:, ['ada','svr','xgb','rid']]

Unnamed: 0,ada,svr,xgb,rid
0,5.135655e+06,6.090720e+06,4960626.00,5.178996e+06
1,7.674368e+06,9.355186e+06,7137405.00,8.123915e+06
2,5.135752e+06,5.537884e+06,5276688.00,5.941323e+06
3,5.441998e+06,6.418087e+06,5018870.50,8.596488e+06
4,5.135752e+06,5.622275e+06,4686823.00,5.478934e+06
5,5.177093e+06,1.111516e+07,6251699.50,9.683693e+06
6,3.767814e+06,4.650287e+06,4312354.00,5.338707e+06
7,3.767814e+06,4.268967e+06,4252905.00,4.222250e+06
8,3.778604e+06,5.711753e+06,4582966.50,5.799146e+06
9,3.767814e+06,5.611669e+06,4193743.25,5.000234e+06


In [41]:
from xgboost import XGBRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.linear_model import ElasticNet
from sklearn.svm import SVR

default_params = {
    'learning_rate': 0.05,
    'max_depth': 5,
    'subsample': 1,
    'colsample_bytree': 0.7,
    'gamma': 0,
    'objective': 'reg:linear',
    'silent': 1,
    #'updater': 'grow_gpu',
    'n_estimators': 400
}

xgb_reg = XGBRegressor(**default_params)
ela_reg = ElasticNet(alpha=0.01, l1_ratio=0.01, max_iter=2000)
svr_reg = SVR()
ada_reg = AdaBoostRegressor(n_estimators=100, learning_rate=0.001)

In [42]:
xgb_reg.fit(X_train_scl, y_train)
ela_reg.fit(X_train_scl, y_train)
svr_reg.fit(X_train_scl, y_train)
ada_reg.fit(X_train_scl, y_train)



AdaBoostRegressor(base_estimator=None, learning_rate=0.001, loss='linear',
         n_estimators=100, random_state=None)

In [67]:
xgb_pred = xgb_reg.predict(X_train_scl)
ela_pred = ela_reg.predict(X_train_scl)
svr_pred = svr_reg.predict(X_train_scl)
ada_pred = ada_reg.predict(X_train_scl)

In [104]:
train_ens = np.stack([xgb_pred, ela_pred, svr_pred, ada_pred], axis=1)

default_params = {
    'learning_rate': 0.05,
    'max_depth': 2,
    'subsample': 1,
    'colsample_bytree': 1,
    'gamma': 0,
    'objective': 'reg:linear',
    'silent': 1,
    #'updater': 'grow_gpu',
    'n_estimators': 200
}

xgb_ens = XGBRegressor(**default_params)

In [105]:
model = xgb_ens
model.fit(train_ens, y_train)

XGBRegressor(base_score=0.5, colsample_bylevel=1, colsample_bytree=1, gamma=0,
       learning_rate=0.05, max_delta_step=0, max_depth=2,
       min_child_weight=1, missing=None, n_estimators=200, nthread=-1,
       objective='reg:linear', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=1, subsample=1)

In [106]:
xgb_val_pred = xgb_reg.predict(X_val_scl)
ela_val_pred = ela_reg.predict(X_val_scl)
svr_val_pred = svr_reg.predict(X_val_scl)
ada_val_pred = ada_reg.predict(X_val_scl)

In [107]:
val_ens = np.stack([xgb_val_pred, ela_val_pred, svr_val_pred, ada_val_pred], axis=1)

from sklearn.metrics import mean_squared_error
y_pred = model.predict(val_ens)
err = np.sqrt(mean_squared_error(y_val, y_pred) / y_val.shape[0])
err

0.0057221455222936099

In [108]:
xgb_pred = xgb_reg.predict(X_scl)
ela_pred = ela_reg.predict(X_scl)
svr_pred = svr_reg.predict(X_scl)
ada_pred = ada_reg.predict(X_scl)

In [112]:
ens = np.stack([xgb_pred, ela_pred, svr_pred, ada_pred], axis=1)

In [115]:
xgb_test_pred = xgb_reg.predict(X_test_scl)
ela_test_pred = ela_reg.predict(X_test_scl)
svr_test_pred = svr_reg.predict(X_test_scl)
ada_test_pred = ada_reg.predict(X_test_scl)

In [116]:
ens_test = np.stack([xgb_test_pred, ela_test_pred, 
                     svr_test_pred, ada_test_pred], axis=1)

In [117]:
model = model.fit(ens, y)

ylog_pred = model.predict(ens_test)
y_pred = np.exp(ylog_pred) - 1

df_sub = pd.DataFrame({'id': id_test, 'price_doc': y_pred})
df_sub.to_csv('sub_ensemble.csv', index=False)