# GBRT experimets - environment test

## import python packages

In [1]:
import pandas as pd
import lightgbm as lgb
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as mse
import matplotlib.pyplot as plt
from matplotlib.patches import Rectangle

%matplotlib inline
%config InlineBackend.figure_format = 'retina'
from tqdm.notebook import trange, tqdm
from itertools import product, chain
from scipy.stats.qmc import LatinHypercube as LHS

import seaborn as sns
sns.set()

from scipy.stats import norm, invgamma, rv_discrete
from scipy import integrate
from ibug import IBUGWrapper
from sklearn.neighbors import NearestCentroid
from sklearn.preprocessing import OrdinalEncoder
from scipy.stats import ttest_ind, chisquare
from sklearn.linear_model import LinearRegression
from scipy.optimize import minimize_scalar, minimize

## set up julia runtime

In [2]:
%%time
import julia, os

julia_path_ = '/home/jungadam/idms/julia-1.9.2/bin/julia' 
# julia_path_ = 'julia-1.9.2/bin/julia'

julia.Julia(runtime=julia_path_, compiled_modules=False)

%load_ext julia.magic

%julia include("Julia_CRPS_Scoring.jl")
%julia using .Julia_CRPS_Scoring

print("done Julia_CRPS_Scoring.jl include")

Initializing Julia interpreter. This may take some time...
done Julia_CRPS_Scoring.jl include
CPU times: user 1min 5s, sys: 3.15 s, total: 1min 8s
Wall time: 1min 8s


## test run with a selection of models

### model implementations

In [3]:
from scipy.stats import ecdf as ecdf_special
class ECDF_RV:
    """wrapper for work with ecdf result as standard scipy RVs"""
    
    def __init__(self, ecdf_rv, samples):
        self.ecdf_rv = ecdf_rv
        self.samples = samples
        
    def cdf(self, x):
        return self.ecdf_rv.cdf.evaluate(x)
    
    def ppf(self, x):
        i = np.searchsorted(self.ecdf_rv.cdf.probabilities, x)
        return self.ecdf_rv.cdf.quantiles[i]
    
    def mean(self):
        return self.samples.mean()
    
    def var(self):
        return self.samples.var()
    
ecdf = lambda samples : ECDF_RV(ecdf_special(samples), samples)

In [7]:
def fit_MM_inv_gamma(m, s):
    """
    fit inv gamma to first two moments 'm' and 's',
    with method of moments.
    """
    
    # assert s > 0, "elfajult szoras"
    if s == 0:
        return rv_discrete(name='custm', values=(m, 1))
        
    
    fit_alpha = m**2 / s + 2
    fit_scale = m * (fit_alpha - 1)
    
    rv = invgamma(a = fit_alpha, scale = fit_scale, loc = 0)
    
    if np.isnan(rv.mean()) or np.isnan(rv.var()):
        raise Exception(f"fit error {(m, s,  fit_alpha, fit_scale)}")
    
    return rv

fit_MM_inv_gamma = np.vectorize(fit_MM_inv_gamma)

In [4]:
def auto_scale_variance(loc, var, y):
    """return optimal_variance_coeff, CRPS_score"""
    def f(s, loc = loc, y = y):
        var_scaled = var * s
        crps = %julia CRPS_distribution($loc, $var_scaled, $y)
        return crps.mean()

    res = minimize_scalar(f)
    return res.x, res.fun

In [5]:
class UncertaintyMeasure:
    def __init__(self, model_params = {}):
        self.model = lgb.LGBMRegressor(**model_params)
        
    def fit(self, X_train, y_train, X_opt, y_opt, model_fit_params = {}):
        self.X_train = X_train
        self.y_train = y_train
        self.model.fit(self.X_train, 
                       np.log(self.y_train), **model_fit_params)
        return self
        
    def predict_loc(self, X_pred):
        return np.exp(self.model.predict(X_pred))

    def predict_rv(self, X):
        """pont becsles"""
        y = np.exp(self.model.predict(X))
        return np.array([rv_discrete(values=(m, 1)) for m in y])

    
    def crps_score(self, X_val, y_val):
        loc = self.predict_loc(X_val)
        samples = loc.reshape(-1,1)
        crps_score = %julia CRPS_ecdf($samples, $y_val)   
        return crps_score
    
    
    def skill_scores(self, X_val, y_val, ref_scores):
        crps_score = self.crps_score(X_val, y_val)
        skill_scores = {"crps" : crps_score, 
                          **{n : skill_score(crps_score, ref_scores[n].mean()) 
                                                 for n in ref_scores.columns[:-1]}}
        return pd.DataFrame.from_dict(skill_scores)

    
    def score(self, X_val, y_val, ref_scores):
        """
        retrun skill scores and p value
        on validation set
        """
        skill_scores = self.skill_scores(X_val, y_val, ref_scores)
        pvalue = ttest_ind(ref_scores["location_ig"], skill_scores["crps"]).pvalue
        return {**skill_scores.mean().to_dict(), "p_value" : pvalue}

In [6]:
class UMMoments(UncertaintyMeasure):
    """
    uncertainty measure using a 2 parameter
    distribution (e.g. inverse gamma)
    """
    
    def __init__(self, model_params = {}, distribution = "InverseGamma"):
        super().__init__(model_params)
        self.distribution = distribution
        self.affin_coeffs = (1,0) # (a,b) : var := a * var + b
        
    def get_moments(self, X_pred):
        """point predict"""
        loc = self.predict_loc(X_pred)
        var = np.zeros_like(loc)
        return loc, var
        
    def predict_rv(self, X):
        loc, var = self.get_moments(X)
        
        assert self.distribution == "InverseGamma", f"{self.distribution} is not implemented"
        return fit_MM_inv_gamma(loc, var)
        
    def crps_score(self, X_val, y_val):
        loc, var = self.get_moments(X_val)
        distr = self.distribution
        crps_score = %julia CRPS_distribution($loc, $var, $y_val, $distr)
        return crps_score
    
    
    def auto_tune_affin_global(self, X_opt, y_opt):
        """find optimal variance scaling on `opt` dataset"""
        
        loc, var = self.get_moments(X_opt)
        a, crps = auto_scale_variance(loc, var, y_opt)
        
        self.affin_coeffs = (a,0)
        
        return crps

In [8]:
class LocAffin_var(UMMoments):
    """
    loc = gbdt prediction
    var = a * loc + b
    """
    
    def fit(self, X_train, y_train, X_opt, y_opt):
        super().fit(X_train, y_train, X_opt, y_opt)
        self.optimize_var_coeffs(X_opt, y_opt)  
        return self
    
    def get_moments(self, X_pred):
        loc = np.exp(self.model.predict(X_pred))
        var = loc * self.var_coeffs[0] + self.var_coeffs[1]
        var[var <= 0] = 0
        return loc, var
    
    def optimize_var_coeffs(self, X_opt, y_opt):
        loc = np.exp(self.model.predict(X_opt))
        def f(x, y_opt = y_opt):
            var = loc * x[0] + x[1]
            var[var <= 0] = 0
            crps = %julia CRPS_distribution($loc, $var, $y_opt)
            return crps.mean()

        res = minimize(f, x0 = [1,0])
        self.var_coeffs = res.x

In [9]:
class Constant_var(UMMoments):
    """
    loc = gbdt prediction
    var = const
    """

    def fit(self, X_train, y_train, X_opt, y_opt):
        super().fit(X_train, y_train, X_opt, y_opt)
        self.optimize_var_coeff(X_opt, y_opt)  
        return self
    
    def get_moments(self, X_pred):
        loc = np.exp(self.model.predict(X_pred))
        var = np.ones_like(loc) * self.var_coeff
        var[var <= 0] = 0
        return loc, var
    
    def optimize_var_coeff(self, X_opt, y_opt):
        loc = np.exp(self.model.predict(X_opt))
        def f(x, y_opt = y_opt):
            var = np.ones_like(loc) * x
            var[var <= 0] = 0
            crps = %julia CRPS_distribution($loc, $var, $y_opt)
            return crps.mean()

        res = minimize_scalar(f)
        self.var_coeff = res.x

In [66]:
class InstanceBasedUncertainty(UMMoments):
    def fit(self,X_train, y_train, X_opt, y_opt, model_params = {"k" : 40}):
        super().fit(X_train, y_train, X_opt, y_opt)
        self.pmodel = IBUGWrapper(**model_params)
        self.pmodel.fit(self.model, 
                        self.X_train.to_numpy(), 
                        np.log(self.y_train.to_numpy()))
        
        # self.auto_fit_k()
        # self.auto_tune_affin_global(X_opt, y_opt)
        self.affin_coeffs = (0.08,0)
        return self
    
    def get_moments(self, X, return_kneighbours_y = False):
    
        loc, _, _, train_vals = self.pmodel.pred_dist(X.to_numpy(), 
                                                      return_kneighbors=True)
        loc = np.exp(loc)
        train_vals = np.exp(train_vals)

        mean, var = train_vals.mean(axis = 1), train_vals.var(axis = 1)
        var = self.affin_coeffs[0] * var + self.affin_coeffs[1]
        
        if return_kneighbours_y: return loc, var, train_vals
        else:                    return loc, var
        
    
    def auto_fit_k(self):
        def f(k):
            if k <= 0:
                return np.inf
            k = int(k)
            
            self.pmodel = IBUGWrapper(k = k)
            self.pmodel.fit(self.model, 
                            self.X_train.to_numpy(), 
                            np.log(self.y_train.to_numpy()))
            
            return self.crps_score(X_opt, y_opt).mean()

        res = minimize_scalar(f, bounds = [2,300])
        return res.fun, int(res.x)

In [222]:
class NpyInstanceBasedUncertainty(UMMoments):
    def fit(self,X_train, y_train, X_opt, y_opt, k = 40, model_fit_params = {}):
        super().fit(X_train, y_train, X_opt, y_opt)
        
        self.L_train = self.model.predict(X_train, pred_leaf = True)
        self.k = k
        
        self.auto_tune_affin_global(X_opt, y_opt)
        self.auto_tune_k(X_opt, y_opt)
        return self
    
    def get_y_simmilarities(self, X_pred):
        L_pred = self.model.predict(X_pred, pred_leaf = True)
        L_train_b = self.L_train.T[np.newaxis, :,:]
        L_pred_b = np.broadcast_to(L_pred[:,:,np.newaxis], 
                                   shape = (*L_pred.shape, self.L_train.shape[0]))
        leaf_matches = L_train_b == L_pred_b
        
        sum_of_same_leaf = leaf_matches.sum(axis = 1)
        y_train_br = np.broadcast_to(y_train.values[np.newaxis, :], shape = sum_of_same_leaf.shape)
        perm = np.argsort(sum_of_same_leaf, axis = 1)
        simmilar_train_y = np.take_along_axis(y_train_br, perm, axis = 1)
        return simmilar_train_y
    
    def get_moments(self, X_pred):
        loc =  np.exp(self.model.predict(X_pred))
        

        simmilar_train_y = self.get_y_simmilarities(X_pred)
        k = self.k
        var = simmilar_train_y[:,-k:].var(axis = 1)
        var = self.affin_coeffs[0] * var + self.affin_coeffs[1]
        return loc, var
    
    def auto_tune_k(self, X_opt, y_opt):
        simmilar_train_y = self.get_y_simmilarities(X_opt)
        loc =  np.exp(self.model.predict(X_opt))

        def f(k, y_opt = y_opt):
            if k <= 0:
                return np.inf
            k = int(k)
            
            var = simmilar_train_y[:,-k:].var(axis = 1)
            var = self.affin_coeffs[0] * var + self.affin_coeffs[1]
            lloc, ly_opt = loc, y_opt
            crps = %julia CRPS_distribution($lloc, $var, $ly_opt)
            return crps.mean()
        
        # res = minimize_scalar(f)
        # self.k = int(res.x)
        test_k = [*range(2,100)]#[3,4,5,10,20,30,40,60,80,100]
        test = [(k, f(k)) for k in tqdm(test_k, desc= "optimise k", leave=False)]
        
        k, m = min(test, key = lambda t : t[1])
        self.k = k
        
        
        return k, m, test
            

### test models

In [11]:
#load sgem matrix product dataset
df = pd.read_csv("sgemm_product.csv")

runCols = ["Run1 (ms)", "Run2 (ms)", "Run3 (ms)", "Run4 (ms)"]
df["minRun"] = df[runCols].min(axis = 1)
df["meanRun"] = df[runCols].mean(axis = 1)

X = df.drop(columns = [*runCols, "minRun", "meanRun"])
target = "minRun"
Y = df[target]

In [12]:
#split data 

# validate model:
X_data, X_val, y_data, y_val = train_test_split(X, Y, test_size = 50000, random_state=1)
# fine tune hyperparam:
X_data, X_opt, y_data, y_opt = train_test_split(X_data, y_data, test_size = 999, random_state=1)
# train model:
X_train, X_data, y_train, y_data = train_test_split(X_data, y_data, train_size = 1000, random_state=1)

In [13]:
train_set = X_train, y_train, X_opt, y_opt

In [272]:
models = [UMMoments, LocAffin_var, Constant_var, NpyInstanceBasedUncertainty]

In [273]:
scores = {}
for Model in tqdm(models):
    m = Model().fit(X_train, y_train, X_opt, y_opt)
    crps = m.crps_score(X_val, y_val)
    scores[Model.__name__] = crps

  0%|          | 0/4 [00:00<?, ?it/s]

optimise k:   0%|          | 0/98 [00:00<?, ?it/s]

In [274]:
pd.DataFrame(scores).mean()

UMMoments                      53.395407
LocAffin_var                   39.045181
Constant_var                   44.126079
NpyInstanceBasedUncertainty    39.544588
dtype: float64

In [98]:
m = InstanceBasedUncertainty()

In [99]:
m.fit(X_train, y_train, X_opt, y_opt)

In [69]:
crps = m.crps_score(X_val, y_val)

In [71]:
crps.mean()

46.05919077734371

In [108]:
m.auto_tune_affin_global(X_opt, y_opt)

42.70783223603656

In [100]:
m.auto_tune_affin_global(X_opt, y_opt)
m.auto_fit_k()

43.4650382760263

In [110]:
crps = m.crps_score(X_val, y_val)

In [111]:
crps.mean()

39.21474971943074

#### replace ibug

In [254]:
m_ibug = InstanceBasedUncertainty().fit(*train_set, model_params={'k': 1})

In [257]:
m_npy = NpyInstanceBasedUncertainty()
m_npy.model = m_ibug.model
m_npy.L_train = m_npy.model.predict(X_train, pred_leaf = True)

In [258]:
simmilar_train_y = m_npy.get_y_simmilarities(X_opt)

In [261]:
_,_,sv_ind,sv = m_ibug.pmodel.pred_dist(X_opt.to_numpy(), return_kneighbors=True)

In [262]:
np_y = simmilar_train_y[:, -1]
ibug_y = np.exp( sv[:,0])

In [270]:
np.all(np.isclose(kdf["np"], kdf["ibug"]))

True

In [223]:
m = NpyInstanceBasedUncertainty().fit(*train_set)

optimise k:   0%|          | 0/98 [00:00<?, ?it/s]

In [225]:
m.crps_score(X_val, y_val).mean()

39.544587957557695

In [None]:
class combinedmodel:
    """
    loc = gbrt prediction
    var = b + a1 * nn_y_var + a2 * loc
    """

In [None]:
m.auto_tune_affin_global(X_opt, y_opt)

43.69787865471943

In [None]:
m.affin_coeffs = (0.19665441948315218, 0)