In [1]:
import math
from copy import deepcopy

import holoviews as hv
import numpy as np
import pandas as pd
from scipy.stats import ttest_rel

from sklearn.model_selection import train_test_split
from sklearn.datasets import fetch_california_housing

from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor

In [2]:
# Select matplotlib as the HoloViews plotting backend for the rest of the notebook.
hv.extension('matplotlib')

  return inst.__call__(*args,**params)


In [3]:
# Hyper-parameters shared by every library; build_regressor_params copies this
# dict and eval_all_models lets callers override individual entries via **kwargs.
DEFAULT_PARAMS = dict(
    max_depth=4,
    learning_rate=0.1,
    n_estimators=1000,
    random_seed=0,
)

# CatBoostRegressor must stay first: eval_all_models keeps its squared errors
# as the baseline that every later model is compared against.
MODEL_TYPES = (
    CatBoostRegressor,
    XGBRegressor,
    LGBMRegressor,
)

def build_model_name(model):
    """Return a display label for ``model`` that mentions its monotone constraints.

    Args:
        model: Any estimator exposing ``get_params()`` that returns a dict.

    Returns:
        ``"<ClassName> with monotone constraints <value>"`` when the
        ``monotone_constraints`` parameter is set, otherwise
        ``"<ClassName> without monotone constraints"``.
    """
    constraint_param_name = 'monotone_constraints'
    class_name = model.__class__.__name__
    # Look at the value, not just the key: an estimator may list the parameter
    # in get_params() with a None value when no constraints were requested.
    constraints = model.get_params().get(constraint_param_name)
    if constraints is None:
        return "{} without monotone constraints".format(class_name)
    return "{} with monotone constraints {}".format(class_name, constraints)

def build_regressor_params(model_type, monotone_constraints):
    """Assemble the constructor keyword arguments for one regressor class.

    Args:
        model_type: One of the classes listed in ``MODEL_TYPES``.
        monotone_constraints: Iterable with one entry per feature, or a falsy
            value (``None``, empty tuple) to train without constraints.

    Returns:
        A fresh dict based on ``DEFAULT_PARAMS``. CatBoost additionally gets
        ``leaf_estimation_method='Gradient'`` and ``verbose=False``. When
        constraints are given they are encoded as a comma-separated string,
        bare for LightGBM and wrapped in parentheses for the other libraries.
    """
    assert model_type in MODEL_TYPES
    regressor_params = deepcopy(DEFAULT_PARAMS)

    if model_type == CatBoostRegressor:
        regressor_params['leaf_estimation_method'] = 'Gradient'
        regressor_params['verbose'] = False

    # Nothing more to add when the caller asked for an unconstrained model.
    if not monotone_constraints:
        return regressor_params

    joined = ",".join(map(str, monotone_constraints))
    if model_type == LGBMRegressor:
        encoded = joined
    else:
        encoded = "({})".format(joined)
    regressor_params['monotone_constraints'] = encoded
    return regressor_params

def plot_level_contours(regressor):
    """Display the prediction surface of a two-feature regressor on the unit square.

    The regressor is evaluated on a 100 x 100 grid covering ``[0, 1) x [0, 1)``.
    The displayed layout has two panels:

    * an image of the predictions with 70 contour levels drawn on top;
    * an overlay of curves showing the prediction as a function of the second
      feature for 25 fixed values of the first feature.

    Args:
        regressor: Fitted model whose ``predict`` accepts an ``(n, 2)`` array.

    Returns:
        None. The layout is rendered via ``display``.
    """
    step_count = 100
    step = 1 / float(step_count)
    X_synthetic = np.mgrid[0:1:step, 0:1:step].reshape(2, -1).T
    y = regressor.predict(X_synthetic)

    # Index level 0 is the first feature, level 1 the second feature.
    data = pd.Series(data=y, index=pd.MultiIndex.from_arrays(X_synthetic.T))
    # Rows are flipped so that the first feature grows upwards in the image.
    image_data = data.unstack().values[::-1, :]

    # Long-format table built once and reused for every slice below.
    predictions = data.to_frame('predict').reset_index()

    def plot_line(df):
        return hv.Curve(
            (df['level_1'].values, df['predict'].values),
            kdims='$x_2$',
            vdims='predict value'
        )

    # Every 4th grid value of the first feature gets its own curve.
    holomap = hv.HoloMap({
        v: plot_line(predictions.loc[predictions['level_0'] == v])
        for v in np.linspace(0, 1, step_count // 4, endpoint=False)
    })

    image = (
        hv.Image(
            image_data,
            bounds=(0, 0, 1, 1),
            kdims=['$x_2$', '$x_1$']
        )
        .opts(colorbar=True)
    )

    contour_levels = 70
    contours = (
        hv.operation.contours(image, levels=contour_levels)
        .opts(show_legend=False, cmap='fire_r', linewidth=0.8)
    )

    # Plain-Python equivalents of the former `%%output` / `%%opts` magics, which
    # are not valid Python inside a function body and only worked when IPython
    # rewrote them as line magics.
    hv.output(fig='svg')
    hv.opts.defaults(
        hv.opts.Image(fig_size=200),
        hv.opts.Curve(color=hv.Palette('copper_r')),
    )
    display((
        image * contours
        + holomap.overlay().opts(show_legend=False)
    ).relabel(label=build_model_name(regressor)))

def eval_all_models(X, y, monotone_constraints,
                    test_part=0.5, **kwargs):
    """Train every model in ``MODEL_TYPES`` and compare their test errors.

    The data is split into a learn part and a test part (``test_part`` is the
    test fraction). 20% of the learn part is held out as a validation set that
    drives early stopping (20 rounds). Both splits use ``random_state=0``.

    Args:
        X: 2-D feature array.
        y: Target array aligned with ``X``.
        monotone_constraints: One entry per feature column, or a falsy value
            to train without constraints.
        test_part: Fraction of the rows reserved for testing.
        **kwargs: Extra constructor arguments; they override the values
            produced by ``build_regressor_params`` (e.g. ``learning_rate``).

    Returns:
        ``pandas.DataFrame`` with columns ``model``, ``mse``, ``pvalue`` and
        ``best_iteration``. ``pvalue`` is the paired t-test p-value of the
        per-sample squared errors against CatBoost (``None`` for CatBoost).

    Raises:
        AssertionError: If the constraint count does not match the number of
            features, if early stopping did not trigger for a model, or if a
            non-CatBoost model is evaluated before the CatBoost baseline.

    Side effects:
        When ``X`` has exactly two columns, ``plot_level_contours`` is called
        for each fitted model.
    """
    assert not monotone_constraints or X.shape[1] == len(monotone_constraints)
    X_learn, X_test, y_learn, y_test = train_test_split(X, y, test_size=test_part, random_state=0)
    X_train, X_validate, y_train, y_validate = train_test_split(X_learn, y_learn, test_size=0.2, random_state=0)
    results = []
    catboost_errs = None
    for model_type in MODEL_TYPES:
        params = build_regressor_params(model_type, monotone_constraints)
        params.update(kwargs)
        model = model_type(**params)
        model.fit(X_train, y_train, eval_set=[(X_validate, y_validate)],
                  early_stopping_rounds=20, verbose=False)

        # The libraries expose the early-stopping result under different names;
        # only a missing attribute should trigger the fallback.
        try:
            best_iteration = model.best_iteration
        except AttributeError:
            best_iteration = model.best_iteration_
        err_msg = "overfitting detector was not triggered."
        assert best_iteration is not None, err_msg
        # Compare against the effective tree budget, which the caller may have
        # overridden through **kwargs.
        assert best_iteration < params["n_estimators"] - 1, err_msg

        y_predicted = model.predict(X_test)
        errs = (y_test - y_predicted)**2
        mse_value = errs.mean()
        pvalue_vs_catboost = None
        if model_type == CatBoostRegressor:
            catboost_errs = errs
        else:
            assert catboost_errs is not None, \
                "CatBoostRegressor must precede other models in MODEL_TYPES"
            pvalue_vs_catboost = ttest_rel(catboost_errs, errs).pvalue

        results.append([model_type.__name__, mse_value, pvalue_vs_catboost, best_iteration])
        if X.shape[1] == 2:
            plot_level_contours(model)
    return pd.DataFrame(results, columns=["model", "mse", "pvalue", "best_iteration"])


In [4]:
def create_synthetic_data(sample_size, random_seed=1):
    """Generate a noisy two-feature regression dataset on the unit square.

    The target is ``5*x1 + sin(10*pi*x1) - 5*x2 - cos(10*pi*x2)`` plus Gaussian
    noise with standard deviation 0.01.

    Args:
        sample_size: Number of rows to generate.
        random_seed: Seed of the private random generator.

    Returns:
        Tuple ``(X, y)`` where ``X`` has shape ``(sample_size, 2)`` with values
        drawn uniformly from ``[0, 1)`` and ``y`` has shape ``(sample_size,)``.
    """
    # A private RandomState yields the same stream as seeding the global NumPy
    # generator, but leaves the global state untouched for the rest of the
    # notebook.
    rng = np.random.RandomState(random_seed)
    X = rng.random_sample(size=(sample_size, 2))
    x1, x2 = X.T
    y = (
        + 5 * x1 + np.sin(5 * 2 * math.pi * x1)
        - 5 * x2 - np.cos(5 * 2 * math.pi * x2)
        + rng.normal(loc=0.0, scale=0.01, size=sample_size)
    )
    return X, y


In [5]:
# Build a 2000-row synthetic dataset (two features, default seed).
X_synthetic, y_synthetic = create_synthetic_data(2000)
# One constraint entry per feature column, as eval_all_models asserts.
monotone_constraints = (1, -1)
# Two-feature input, so eval_all_models also draws the contour plots per model.
eval_all_models(X_synthetic, y_synthetic, monotone_constraints)

Unnamed: 0,model,mse,pvalue,best_iteration
0,CatBoostRegressor,0.604393,,165
1,XGBRegressor,0.54028,1.194554e-13,65
2,LGBMRegressor,0.548859,1.366633e-10,76


In [6]:
# Same synthetic data without monotone constraints; learning_rate=0.5 overrides
# the DEFAULT_PARAMS value through **kwargs.
eval_all_models(X_synthetic, y_synthetic, None, learning_rate=0.5)

Unnamed: 0,model,mse,pvalue,best_iteration
0,CatBoostRegressor,0.066497,,233
1,XGBRegressor,0.049131,3.250563e-05,119
2,LGBMRegressor,0.04298,1.71237e-08,389


## Geo

In [7]:
def read_geo_data(path='geo_data/geo_data_example.tsv'):
    """Load the geo dataset from a header-less, tab-separated file.

    Args:
        path: Location of the TSV file. Defaults to the example file that ships
            next to the notebook, so existing calls without arguments keep
            working.

    Returns:
        Tuple ``(X, y)``: ``y`` holds column 0 and ``X`` holds columns 1-4 of
        the de-duplicated rows, both as NumPy arrays.
    """
    data = pd.read_csv(path, sep='\t', header=None).drop_duplicates()
    y = data[0].values
    X = data[[1, 2, 3, 4]].values
    return X, y
# Load the geo features/target once; later cells reuse X_geo and y_geo.
X_geo, y_geo = read_geo_data()
# Show (rows, features) as a quick sanity check of the loaded data.
print(X_geo.shape)

(6829L, 4L)


In [8]:
# Geo data with a +1 constraint on each of the four features; learning_rate is
# overridden to 0.01 through **kwargs.
eval_all_models(X_geo, y_geo, (1, 1, 1, 1), learning_rate=0.01)

Unnamed: 0,model,mse,pvalue,best_iteration
0,CatBoostRegressor,0.201728,,560
1,XGBRegressor,0.202283,0.02627,114
2,LGBMRegressor,0.202163,0.052314,122


In [9]:
# Geo data without monotone constraints, same learning_rate override (0.01).
eval_all_models(X_geo, y_geo, None, learning_rate=0.01)

Unnamed: 0,model,mse,pvalue,best_iteration
0,CatBoostRegressor,0.18481,,868
1,XGBRegressor,0.188733,3.089233e-07,365
2,LGBMRegressor,0.187267,0.00351536,407


## California housing

In [10]:
# Load the California housing dataset through scikit-learn.
housing_data = fetch_california_housing()
# One constraint entry per feature column (8 in total). This rebinds the global
# name previously set to (1, -1) for the synthetic data.
# NOTE(review): presumably ordered like housing_data.feature_names — confirm.
monotone_constraints=(1, 0, 1, 1, 0, 1, 0, 0)
# Show (rows, features) as a quick sanity check.
print(housing_data.data.shape)
# print(housing_data.DESCR)

(20640L, 8L)


In [11]:
# Housing data with the monotone constraints defined above; test_part=0.9 holds
# out 90% of the rows for testing.
eval_all_models(housing_data.data, housing_data.target, monotone_constraints, test_part=0.9)

Unnamed: 0,model,mse,pvalue,best_iteration
0,CatBoostRegressor,0.407897,,241
1,XGBRegressor,0.350355,5.291923e-49,142
2,LGBMRegressor,0.33498,1.38244e-134,122


In [12]:
# Housing data without monotone constraints, same 90% test hold-out.
eval_all_models(housing_data.data, housing_data.target, None, test_part=0.9)

Unnamed: 0,model,mse,pvalue,best_iteration
0,CatBoostRegressor,0.315234,,340
1,XGBRegressor,0.299345,2.832657e-12,199
2,LGBMRegressor,0.292451,2.201725e-24,242
