In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf

import uci_datasets as uci

from sklearn.preprocessing import MinMaxScaler, PowerTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.compose import TransformedTargetRegressor
from sklearn.metrics import r2_score, mean_squared_error as mse

from tmpnn import TMPNNRegressor
from catboost import CatBoostRegressor

import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm



# Average R² across UCI datasets

In [2]:
def score(model, *,
          exclude=('challenger','forest','solar','breastcancer','fertility','tamielectric'),
          max_features=50, max_train=15000, max_train_frac=0.8):
    """Fit `model` on every selected UCI dataset and summarise its R2 scores.

    For each entry of ``uci.all_datasets`` that passes the filters, the data is
    split with ``train_test_split(..., random_state=0)``, `model` is refitted on
    the training part, and R2 is computed on both parts.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``
        The same object is refitted for every dataset.
    exclude : collection of str, keyword-only
        Dataset names to skip.
    max_features : int, keyword-only
        Datasets whose ``shape[1]`` exceeds this value are skipped.
    max_train : int, keyword-only
        Numerator of the training-fraction cap ``max_train / shape[0]``
        (presumably shape[0] is the row count, so this bounds the number of
        training rows -- confirm against uci_datasets).
    max_train_frac : float, keyword-only
        Upper bound on the training fraction passed to ``train_test_split``.

    Returns
    -------
    str
        ``'tr: <mean>+<std> ts: <mean>+<std>'`` with the mean and standard
        deviation of the train / test R2 over the selected datasets.
    """
    tr, ts = [], []
    for name, shape in uci.all_datasets.items():
        if shape[1] > max_features or name in exclude:
            continue
        # NOTE(review): the third positional argument (False) is passed through
        # unchanged; its meaning is defined by uci_datasets, not visible here.
        data = uci.Dataset(name, np.float32, False)
        train_size = min(max_train_frac, max_train/shape[0])
        x_tr, x_ts, y_tr, y_ts = train_test_split(data.x, data.y, train_size=train_size, random_state=0)
        model.fit(x_tr, y_tr)

        tr.append( r2_score(y_tr, model.predict(x_tr)) )
        ts.append( r2_score(y_ts, model.predict(x_ts)) )
    return f'tr: {np.mean(tr):.3f}+{np.std(tr):.3f} ts: {np.mean(ts):.3f}+{np.std(ts):.3f}'

In [119]:
# Baseline: CatBoost with a fixed seed and its training log silenced.
cat = CatBoostRegressor(verbose=0, random_state=0)
print('cat  ', score(cat))

cat   tr: 0.969+0.060 ts: 0.873+0.150


In [5]:
class ClipMinMaxScaler(MinMaxScaler):
    """MinMaxScaler whose ``inverse_transform`` honours ``clip`` as well.

    When ``clip`` is truthy, values handed to ``inverse_transform`` are first
    limited to ``feature_range`` and only then mapped back by the parent class.
    ``fit`` and every other method are inherited unchanged from MinMaxScaler.
    """

    def inverse_transform(self, X) -> np.ndarray:
        """Undo the scaling, clipping `X` to ``feature_range`` first if ``clip`` is set.

        The input is never modified: clipping writes into a fresh array rather
        than into `X`, so callers keep their original values and array-likes
        that cannot serve as an ``out=`` target (lists, integer arrays) work too.
        """
        if self.clip:
            low, high = self.feature_range
            # No `out=`: np.clip returns a new array and leaves the caller's X untouched.
            X = np.clip(X, low, high)
        return super().inverse_transform(X)

# TMPNN on inputs scaled to [-0.5, 0.5]; the target goes through the same
# range via ClipMinMaxScaler, so predictions are clipped before un-scaling.
tmpnn = TransformedTargetRegressor(
    regressor=Pipeline(steps=[
        ('imp', MinMaxScaler(feature_range=(-0.5, 0.5))),
        # disabled options: max_epochs=200, regularizer=tf.keras.regularizers.L2(1e-4)
        ('est', TMPNNRegressor(random_state=0)),
    ]),
    transformer=Pipeline(steps=[
        # ('pt', PowerTransformer()),
        ('mms', ClipMinMaxScaler(feature_range=(-0.5, 0.5), clip=True)),
    ]),
)
print('tmpnn', score(tmpnn))

tmpnn tr: 0.903+0.148 ts: 0.849+0.175


# Dataset-wise

In [109]:
# tmpnn heuristic
# Same pipeline as the averaged run, but with L2(1e-4) regularisation enabled,
# reported per dataset (train / test R2) instead of as a single average.
tmpnn = TransformedTargetRegressor(
    regressor=Pipeline([
        ('imp', MinMaxScaler((-0.5,0.5))),
        ('est', TMPNNRegressor(random_state=0, regularizer=tf.keras.regularizers.L2(1e-4)))
    ]),
    transformer=Pipeline([
        # ('pt', PowerTransformer()),
        ('mms', ClipMinMaxScaler((-0.5,0.5), clip=True))
    ])
)
for name, shape in uci.all_datasets.items():
    # Skip datasets whose shape[1] exceeds 50.
    if shape[1] > 50: continue
    data = uci.Dataset(name, np.float32, False)
    x_tr, x_ts, y_tr, y_ts = train_test_split(data.x, data.y, train_size=min(0.8, 15000/shape[0]), random_state=0)
    tmpnn.fit(x_tr, y_tr)

    # A failed score is printed as 'nan' so one bad dataset does not stop the
    # sweep. Only Exception is caught: a bare `except:` would also swallow
    # KeyboardInterrupt / SystemExit and make the loop impossible to interrupt.
    row = 'tr: '
    try: row += f'{r2_score(y_tr, tmpnn.predict(x_tr)):1.4f}'
    except Exception: row += 'nan   '
    row += '\tts: '
    try: row += f'{r2_score(y_ts, tmpnn.predict(x_ts)):1.4f}'
    except Exception: row += 'nan   '
    print(row + '\t' + name)

tr: 0.4690	ts: 0.4810	3droad
tr: 0.9054	ts: 0.8685	autompg
tr: 0.9844	ts: 0.9825	bike
tr: 0.4317	ts: 0.0000	challenger
tr: 0.9866	ts: 0.9151	concreteslump
tr: 0.9946	ts: 0.9915	energy
tr: 0.3642	ts: -0.3544	forest
tr: 0.9958	ts: 0.9955	houseelectric
tr: 0.9912	ts: 0.9904	keggdirected
tr: 0.9560	ts: 0.9535	kin40k
tr: 0.9901	ts: 0.9748	parkinsons
tr: 0.9916	ts: 0.9915	pol
tr: 0.9755	ts: 0.8990	pumadyn32nm
tr: 0.3701	ts: -0.1665	solar
tr: 0.9393	ts: 0.8692	stock
tr: 0.9949	ts: 0.9870	yacht
tr: 0.8485	ts: 0.8242	airfoil
tr: 0.9864	ts: 0.8367	autos
tr: 0.9962	ts: -0.6207	breastcancer
tr: 0.9127	ts: 0.8446	concrete
tr: 0.8764	ts: 0.8694	elevators
tr: 0.9094	ts: -2.6063	fertility
tr: 0.9594	ts: 0.8745	housing
tr: 0.9847	ts: 0.9825	keggundirected
tr: 0.8756	ts: 0.8116	machine
tr: 0.9342	ts: 0.8047	pendulum
tr: 0.4932	ts: 0.4741	protein
tr: 0.8669	ts: 0.8794	servo
tr: 0.7166	ts: 0.3865	skillcraft
tr: 0.9947	ts: 0.9944	sml
tr: 0.0007	ts: -0.0006	tamielectric
tr: 0.8552	ts: 0.7129	wine


In [108]:
# catboost
# Per-dataset train / test R2 for the CatBoost baseline, using the same
# dataset filter and split as the tmpnn per-dataset run.
cat = CatBoostRegressor(random_state=0, verbose=0)
for name, shape in uci.all_datasets.items():
    # Skip datasets whose shape[1] exceeds 50.
    if shape[1] > 50: continue
    data = uci.Dataset(name, np.float32, False)
    x_tr, x_ts, y_tr, y_ts = train_test_split(data.x, data.y, train_size=min(0.8, 15000/shape[0]), random_state=0)
    cat.fit(x_tr, y_tr)

    # A failed score is printed as 'nan' so one bad dataset does not stop the
    # sweep. Only Exception is caught: a bare `except:` would also swallow
    # KeyboardInterrupt / SystemExit and make the loop impossible to interrupt.
    row = 'tr: '
    try: row += f'{r2_score(y_tr, cat.predict(x_tr)):1.4f}'
    except Exception: row += 'nan   '
    row += '\tts: '
    try: row += f'{r2_score(y_ts, cat.predict(x_ts)):1.4f}'
    except Exception: row += 'nan   '
    print(row + '\t' + name)

tr: 0.8279	ts: 0.7909	3droad
tr: 0.9923	ts: 0.9050	autompg
tr: 0.9999	ts: 0.9996	bike
tr: 0.9999	ts: 0.0000	challenger
tr: 0.9999	ts: 0.5125	concreteslump
tr: 0.9998	ts: 0.9984	energy
tr: 0.9305	ts: -0.0506	forest
tr: 0.9984	ts: 0.9977	houseelectric
tr: 0.9945	ts: 0.9924	keggdirected
tr: 0.9382	ts: 0.8803	kin40k
tr: 0.9987	ts: 0.9965	parkinsons
tr: 0.9937	ts: 0.9863	pol
tr: 0.9800	ts: 0.9355	pumadyn32nm
tr: 0.5426	ts: -0.5067	solar
tr: 0.9953	ts: 0.8788	stock
tr: 1.0000	ts: 0.9915	yacht
tr: 0.9843	ts: 0.9587	airfoil
tr: 0.9960	ts: 0.8883	autos
tr: 0.9999	ts: 0.3517	breastcancer
tr: 0.9875	ts: 0.9397	concrete
tr: 0.9088	ts: 0.8679	elevators
tr: 0.9701	ts: -0.0091	fertility
tr: 0.9973	ts: 0.9024	housing
tr: 0.9878	ts: 0.9858	keggundirected
tr: 0.9932	ts: 0.8987	machine
tr: 0.9934	ts: 0.5812	pendulum
tr: 0.7442	ts: 0.6275	protein
tr: 0.9973	ts: 0.9278	servo
tr: 0.9089	ts: 0.5073	skillcraft
tr: 0.9993	ts: 0.9981	sml
tr: 0.0839	ts: -0.0287	tamielectric
tr: 0.9802	ts: 0.7607	wine
