In [17]:
%matplotlib inline
%load_ext ipycache

import pandas as pd
import numpy as np
import scipy
import sklearn as sk
import xgboost as xgb

from eli5 import show_weights

import seaborn as sns
sns.set()

import matplotlib.pyplot as plt

The ipycache extension is already loaded. To reload it, use:
  %reload_ext ipycache


In [18]:
import math

#A function to calculate Root Mean Squared Logarithmic Error (RMSLE)
def rmsle(y, y_pred):
    assert len(y) == len(y_pred)
    terms_to_sum = [
        (math.log(y_pred[i] + 1) - math.log(y[i] + 1)) ** 2.0 
        for i,pred in enumerate(y_pred)
    ]
    return (sum(terms_to_sum) * (1.0/len(y))) ** 0.5

def rmse(y, y_pred):
    return np.sqrt(((y_pred - y) ** 2).mean())

# Препроцессинг фич

In [19]:
# train_raw = pd.read_csv("data/train.csv")
train_raw = pd.read_csv("data/train_without_noise.csv")
test = pd.read_csv("data/test.csv")
macro = pd.read_csv("data/macro.csv")
train_raw.head()

Unnamed: 0,id,timestamp,full_sq,life_sq,floor,max_floor,material,build_year,num_room,kitch_sq,...,cafe_count_5000_price_2500,cafe_count_5000_price_4000,cafe_count_5000_price_high,big_church_count_5000,church_count_5000,mosque_count_5000,leisure_count_5000,sport_count_5000,market_count_5000,price_doc
0,1,2011-08-20,43,27.0,4.0,,,,,,...,9,4,0,13,22,1,0,52,4,5850000
1,2,2011-08-23,34,19.0,3.0,,,,,,...,15,3,0,15,29,1,10,66,14,6000000
2,3,2011-08-27,43,29.0,2.0,,,,,,...,10,3,0,11,27,0,4,67,10,5700000
3,4,2011-09-01,89,50.0,9.0,,,,,,...,11,2,1,4,4,0,0,26,3,13100000
4,5,2011-09-05,77,77.0,4.0,,,,,,...,319,108,17,135,236,2,91,195,14,16331452


In [20]:
def preprocess_anomaly(df):
    df["full_sq"] = map(lambda x: x if x > 10 else float("NaN"), df["full_sq"])
    df["life_sq"] = map(lambda x: x if x > 5 else float("NaN"), df["life_sq"])
    df["kitch_sq"] = map(lambda x: x if x > 2 else float("NaN"), df["kitch_sq"])
    
    # superclean
    # https://www.kaggle.com/keremt/very-extensive-cleaning-by-sberbank-discussions
    df.ix[df[df.life_sq > df.full_sq].index, "life_sq"] = np.NaN
    df.ix[df[df.kitch_sq >= df.life_sq].index, "kitch_sq"] = np.NaN

    df.ix[df[df.kitch_sq == 0].index, "kitch_sq"] = np.NaN
    df.ix[df[df.kitch_sq == 1].index, "kitch_sq"] = np.NaN

    df.ix[df[df.build_year < 1500].index, "build_year"] = np.NaN

    df.ix[df[df.build_year > 1500].index, "build_year"] = np.NaN

    df.ix[df[df.num_room == 0].index, "num_room"] = np.NaN
    
    df.ix[df[df.floor == 0].index, "floor"] = np.NaN
    df.ix[df[df.max_floor == 0].index, "max_floor"] = np.NaN
    
    df.ix[df[df.floor > df.max_floor].index, "max_floor"] = np.NaN
    
    df.ix[df[df.state == 33].index, "state"] = np.NaN
    return df

In [37]:
def preprocess_categorial(df):
#     df = mess_y_categorial(df, 5)

    for c in df.columns:
        if df[c].dtype == 'object':
            lbl = sk.preprocessing.LabelEncoder()
            lbl.fit(list(train_raw[c].values) + list(test[c].values)) 
            df[c] = lbl.transform(list(df[c].values))

    df = df.select_dtypes(exclude=['object'])
    return df

def apply_categorial(test, train):
#     test = mess_y_categorial_fold(test, train)
#     test = test.select_dtypes(exclude=['object'])
    return preprocess_categorial(test)


def smoothed_likelihood(targ_mean, nrows, globalmean, alpha=10):
    try:
        return (targ_mean * nrows + globalmean * alpha) / (nrows + alpha)
    except Exception:
        return float("NaN")


def mess_y_categorial(df, nfolds=3, alpha=10):
    from sklearn.utils import shuffle
    from copy import copy

    folds = np.array_split(shuffle(df), nfolds)
    newfolds = []
    for i in range(nfolds):
        fold = folds[i]

        other_folds = copy(folds)
        other_folds.pop(i)
        other_fold = pd.concat(other_folds)

        newfolds.append(mess_y_categorial_fold(fold, other_fold, alpha=10))

    return pd.concat(newfolds)

def mess_y_categorial_fold(fold_raw, other_fold, cols=None, y_col="price_doc", alpha=10):
    fold = fold_raw.copy()
    if not cols:
        cols = list(fold.select_dtypes(include=["object"]).columns)
    globalmean = other_fold[y_col].mean()
    for c in cols:

        target_mean = other_fold[[c, y_col]].groupby(c).mean().to_dict()[y_col]
        nrows = other_fold[c].value_counts().to_dict()

        fold[c + "_sll"] = fold[c].apply(
            lambda x: smoothed_likelihood(target_mean.get(x), nrows.get(x), globalmean, alpha) if x else float("NaN")
        )
    return fold

In [38]:
def apply_macro(df):
    macro_cols = [
        'timestamp', "balance_trade", "balance_trade_growth", "eurrub", "average_provision_of_build_contract",
        "micex_rgbi_tr", "micex_cbi_tr", "deposits_rate", "mortgage_value", "mortgage_rate",
        "income_per_cap", "rent_price_4+room_bus", "museum_visitis_per_100_cap", "apartment_build"
    ]
    return pd.merge(df, macro, on='timestamp', how='left')

In [39]:
def preprocess(df):
    from sklearn.preprocessing import OneHotEncoder, FunctionTransformer
    
#     df = apply_macro(df)
    
#     df["timestamp_year"] = df["timestamp"].apply(lambda x: x.split("-")[0])
#     df["timestamp_month"] = df["timestamp"].apply(lambda x: x.split("-")[1])
#     df["timestamp_year_month"] = df["timestamp"].apply(lambda x: x.split("-")[0] + "-" + x.split("-")[1]) 

    ecology = ["no data", "poor", "satisfactory", "good", "excellent"]
    df["ecology_index"] = map(ecology.index, df["ecology"].values)
    
    bool_feats = [
        "thermal_power_plant_raion",
        "incineration_raion",
        "oil_chemistry_raion",
        "radiation_raion",
        "railroad_terminal_raion",
        "big_market_raion",
        "nuclear_reactor_raion",
        "detention_facility_raion",
        "water_1line",
        "big_road1_1line",
        "railroad_1line",
        "culture_objects_top_25"
    ]
    for bf in bool_feats:
        df[bf + "_bool"] = map(lambda x: x == "yes", df[bf].values)

    df = preprocess_anomaly(df)

    df['rel_floor'] = df['floor'] / df['max_floor'].astype(float)
    df['rel_kitch_sq'] = df['kitch_sq'] / df['full_sq'].astype(float)
    df['rel_life_sq'] = df['life_sq'] / df['full_sq'].astype(float)

    df["material_cat"] = df.material.fillna(0).astype(int).astype(str).replace("0", "")
    df["state_cat"] = df.state.fillna(0).astype(int).astype(str).replace("0", "")
    df["num_room_cat"] = df.num_room.fillna(0).astype(int).astype(str).replace("0", "")

    df = df.drop(["id", "timestamp"], axis=1)

    return df

In [41]:
train_pr = preprocess(train_raw)
train = preprocess_categorial(train_pr)
# train = train.fillna(-1)

X = train.drop(["price_doc"], axis=1)
y = train["price_doc"].values

# Обучение моделей

In [42]:
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(X.values, y, test_size=0.20, random_state=43)

dtrain_all = xgb.DMatrix(X.values, y, feature_names=X.columns)
dtrain = xgb.DMatrix(X_train, y_train, feature_names=X.columns)
dval = xgb.DMatrix(X_val, y_val, feature_names=X.columns)

In [43]:
xgb_params = {
    'max_depth': 5,
    'n_estimators': 200,
    'learning_rate': 0.01,
    'objective': 'reg:linear',
    'eval_metric': 'rmse',
    'silent': 1
}

# Uncomment to tune XGB `num_boost_rounds`
model = xgb.train(xgb_params, dtrain, num_boost_round=4000, evals=[(dval, 'val')],
                  early_stopping_rounds=40, verbose_eval=40)

num_boost_round = model.best_iteration

[0]	val-rmse:8.28101e+06
Will train until val-rmse hasn't improved in 40 rounds.
[40]	val-rmse:5.9672e+06
[80]	val-rmse:4.52525e+06
[120]	val-rmse:3.67436e+06
[160]	val-rmse:3.20598e+06
[200]	val-rmse:2.95497e+06
[240]	val-rmse:2.82e+06
[280]	val-rmse:2.74392e+06
[320]	val-rmse:2.70376e+06
[360]	val-rmse:2.67989e+06
[400]	val-rmse:2.66425e+06
[440]	val-rmse:2.6541e+06
[480]	val-rmse:2.64534e+06
[520]	val-rmse:2.63860e+06
[560]	val-rmse:2.63046e+06
[600]	val-rmse:2.62637e+06
[640]	val-rmse:2.62211e+06
[680]	val-rmse:2.61746e+06
[720]	val-rmse:2.61364e+06
[760]	val-rmse:2.61086e+06
[800]	val-rmse:2.60873e+06
[840]	val-rmse:2.60563e+06
[880]	val-rmse:2.60277e+06
[920]	val-rmse:2.60014e+06
[960]	val-rmse:2.59738e+06
[1000]	val-rmse:2.59514e+06
[1040]	val-rmse:2.59305e+06
[1080]	val-rmse:2.59082e+06
[1120]	val-rmse:2.58863e+06
[1160]	val-rmse:2.5872e+06
[1200]	val-rmse:2.58576e+06
[1240]	val-rmse:2.58395e+06
[1280]	val-rmse:2.58229e+06
[1320]	val-rmse:2.58091e+06
[1360]	val-rmse:2.57875e+06

In [44]:
cv_output = xgb.cv(dict(xgb_params, silent=0), dtrain_all, num_boost_round=num_boost_round, verbose_eval=40)
cv_output[['train-rmse-mean', 'test-rmse-mean']].plot()

[0]	train-rmse:8.41652e+06+41855.1	test-rmse:8.41738e+06+84637.2
[40]	train-rmse:6.03464e+06+30061.6	test-rmse:6.08904e+06+72461.4
[80]	train-rmse:4.52836e+06+25723.2	test-rmse:4.65191e+06+68243
[120]	train-rmse:3.60111e+06+26054.8	test-rmse:3.80062e+06+64695
[160]	train-rmse:3.04323e+06+26940	test-rmse:3.32263e+06+68157.9
[200]	train-rmse:2.71488e+06+27223.9	test-rmse:3.06315e+06+69270.3
[240]	train-rmse:2.5176e+06+26093.6	test-rmse:2.92318e+06+66330.6
[280]	train-rmse:2.39955e+06+23368.6	test-rmse:2.84928e+06+66488.7
[320]	train-rmse:2.32403e+06+21979	test-rmse:2.80575e+06+67530.8
[360]	train-rmse:2.2727e+06+20888.1	test-rmse:2.77913e+06+68534.1
[400]	train-rmse:2.23188e+06+20350.9	test-rmse:2.76139e+06+68714.3
[440]	train-rmse:2.19573e+06+19302.3	test-rmse:2.74886e+06+68376.3
[480]	train-rmse:2.16652e+06+20289.7	test-rmse:2.73983e+06+68790.6
[520]	train-rmse:2.14135e+06+18704.5	test-rmse:2.73268e+06+69483.4
[560]	train-rmse:2.12125e+06+19128.8	test-rmse:2.72623e+06+69241.9
[600]	tra

KeyboardInterrupt: 

In [45]:
model = xgb.train(dict(xgb_params, silent=0), dtrain_all, num_boost_round=num_boost_round, verbose_eval=40)
print "predict-train:", rmse(model.predict(dtrain_all), y)

predict-train: 1923466.41923


In [None]:
model = xgb.XGBRegressor(max_depth=5, n_estimators=100, learning_rate=0.01, nthread=-1, silent=False)
model.fit(X.values, y, verbose=20)

with open("scores.tsv", "a") as sf:
    sf.write("%s\n" % rmsle(model.predict(X.values), y))

!tail scores.tsv

In [None]:
show_weights(model, feature_names=list(X.columns), importance_type="weight")

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer

def validate(clf):c
    cval = np.abs(cross_val_score(clf, X.values, y, cv=3, 
                                  scoring=make_scorer(rmsle, False), verbose=2))
    return np.mean(cval), cval

print validate(model)

# Submission

In [46]:
test_pr = preprocess(test)
test_pr = apply_categorial(test_pr, train_pr)
# test_pr = test_pr.fillna(-1)

dtest = xgb.DMatrix(test_pr.values, feature_names=test_pr.columns)
y_pred = model.predict(dtest)

# y_pred = model.predict(test_pr.values)

# y_pred = np.exp(y_pred) - 1

submdf = pd.DataFrame({"id": test["id"], "price_doc": y_pred})
submdf.to_csv("data/submission.csv", header=True, index=False)
!head data/submission.csv

id,price_doc
30474,5449376.0
30475,8106252.0
30476,5591976.0
30477,6180818.0
30478,5165608.5
30479,8126853.0
30480,4380783.5
30481,3898090.25
30482,4637226.5


without noise xgb logarithmic y

    val-rmse:0.478924

macro 10*400
    
    val-rmse:0.480618

macro 5*200

    val-rmse:0.476849

macro 5*200 no month and year

    val-rmse:0.477861

macro 5*200 no month and year
    
    val-rmse:0.473012

macro 5*200 no month and year

    val-rmse:0.471758
    predict-train: 0.427215115875

macro 5*200 no month and year, train_without_noise

    val-rmse:0.461684
    train-rmse:0.411116+0.00299259	test-rmse:0.472202+0.00166791
    predict-train: 0.423849149218
    kaggle: 0.36027

5*200, no macro no add features, train_without_noise:
    
    val-rmse:0.471989
    train-rmse:0.425924+0.00643495	test-rmse:0.473873+0.0131213
    predict-train: 0.43508730101

5*200, no macro add rel features, train_without_noise:

    val-rmse:0.471808
    train-rmse:0.425264+0.00595741	test-rmse:0.47383+0.0130655
    predict-train: 0.435635092773
    kaggle: 0.32837

5*200, no macro, add rel features, no log price, train_without_noise:
    
    val-rmse:2.63772e+06
    train-rmse:1.9989e+06+10986.4	test-rmse:2.69158e+06+53020
    predict-train: 2076010.27131
    kaggle: 0.31720

5*200, no macro, add rel features, no log price, train_with_noise:

    val-rmse:2.53378e+06
    train-rmse:1.95069e+06+16166.4	test-rmse:2.69703e+06+61455.1
    predict-train: 2054421.59869
    kaggle: 0.32056

5*200, macro, add rel features, no log price, train_without_noise:
    
    val-rmse:2.79632e+06
    train-rmse:1.81015e+06+19781.2	test-rmse:2.6641e+06+123875
    predict-train: 1904063.27368
    kaggle: 0.32976

5*200, no macro, add rel features, no log price, train_without_noise:
    
    val-rmse:2.61682e+06
    train-rmse:1.81123e+06+27681.2	test-rmse:2.66923e+06+53925.7
    predict-train: 1899129.43771
    kaggle: 0.31592

5*200, no macro, add rel features, no log price, train_without_noise, 4000 iter:

    val-rmse:2.61055e+06
    train-rmse:1.71826e+06+30076.1	test-rmse:2.66515e+06+54583.5
    predict-train: 1814572.97424
    kaggle: 0.31602

7*300, no macro, add rel features, no log price, train_without_noise, 4000 iter:

    val-rmse:2.59955e+06
    train-rmse:1.41393e+06+21208.1	test-rmse:2.6763e+06+35553.3
    predict-train: 1548257.49121
    kaggle: 0.31768

4*300, no macro, add rel features, no log price, train_without_noise, 4000 iter:
    
    val-rmse:2.63407e+06
    train-rmse:1.96513e+06+21470.8	test-rmse:2.69417e+06+74288.3
    predict-train: 2062299.41091
    kaggle: 0.31952

7*200, no macro, add rel features, no log price, train_without_noise, 4000 iter:

    val-rmse:2.59955e+06
    train-rmse:1.41393e+06+21208.1	test-rmse:2.6763e+06+35553.3
    predict-train: 1548257.49121

5*300, no macro, add rel features, no log price, train_without_noise, 4000 iter:
    
    val-rmse:2.61055e+06
    train-rmse:1.71826e+06+30076.1	test-rmse:2.66515e+06+54583.5
    predict-train: 1814572.97424

5*200, no macro, add rel features, no log price, train_without_noise, 4000 iter, not fillna:

    val-rmse:2.61664e+06
    train-rmse:1.77892e+06+23111	test-rmse:2.65829e+06+56398.6
    predict-train: 1875799.54634
    kaggle: 0.31521

5*200, no macro, add rel features, no log price, train_without_noise, 4000 iter, not fillna, superclean:
    
    val-rmse:2.6265e+06
    train-rmse:1.78478e+06+22545.4	test-rmse:2.66179e+06+60626.3
    predict-train: 1881672.27588
    