In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
# System
import os
import sys
sys.path.append('/home/helfrech/Tools/Toolbox/utils')

# Maths
import numpy as np
from numpy.random import default_rng

# Plotting
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D

# ML
from soap import extract_species_pair_groups
from skcosmo.decomposition import PCovR

from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.metrics import make_scorer
from sklearn.linear_model import Ridge, RidgeCV
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, KFold, StratifiedKFold, PredefinedSplit
from sklearn.compose import TransformedTargetRegressor, ColumnTransformer

# Utilities
import functools
import h5py
import json
import itertools
from tempfile import mkdtemp
from shutil import rmtree
from copy import deepcopy
#from tqdm.notebook import tqdm
from tqdm.auto import tqdm
import project_utils as utils
from tools import load_json, save_json

# Import COSMO style toolkit
import cosmoplot.colorbars as cosmocbars
import cosmoplot.utils as cosmoutils
import cosmoplot.style as cosmostyle

# Apply the cosmoplot 'article' style and keep a handle on its color cycle
cosmostyle.set_style('article')
colorList = cosmostyle.color_cycle

# Load train and test splits

In [3]:
# Load SOAP cutoffs
# Read the stored SOAP hyperparameters and keep their 'interaction_cutoff' entry.
# NOTE(review): `cutoffs` is not referenced in the cells visible here, which
# hard-code the '6.0' sub-directory instead -- confirm whether it is still needed.
soap_hyperparameters = load_json('../Processed_Data/soap_hyperparameters.json')   
cutoffs = soap_hyperparameters['interaction_cutoff']

In [4]:
# Load train sets for IZA and Deem
# Integer row indices, used below to select the training subset of each dataset
iza_train_idxs = np.loadtxt('../Processed_Data/IZA_230/svm_train.idxs', dtype=int)
# Permutation that sorts the IZA indices, and the permutation that undoes that sort
iza_sort_idxs = np.argsort(iza_train_idxs)
iza_unsort_idxs = np.argsort(iza_sort_idxs)
deem_train_idxs = np.loadtxt('../Processed_Data/DEEM_330k/svm_train.idxs', dtype=int)

In [5]:
# Load cantons for IZA and Deem
# IZA labels are read from the second column (usecols=1) of the raw cantons file
iza_cantons = np.loadtxt('../Raw_Data/IZA_230/cantons.dat', usecols=1, dtype=int)
# Deem labels come in two variants, one per classification problem (2-class and 4-class)
deem_cantons_2 = np.loadtxt('../Processed_Data/DEEM_330k/Data/cantons_2-class.dat', dtype=int)
deem_cantons_4 = np.loadtxt('../Processed_Data/DEEM_330k/Data/cantons_4-class.dat', dtype=int)

In [6]:
# Build set of "master" canton labels, keyed by the number of classes.
# For each key the IZA training labels come first, followed by the Deem training labels.
cantons = {
    # 4-class problem: IZA keeps its own canton labels
    4: np.concatenate((
        iza_cantons[iza_train_idxs],
        deem_cantons_4[deem_train_idxs],
    )),
    # 2-class problem: every IZA training structure is labelled 1
    2: np.concatenate((
        np.ones(len(iza_train_idxs), dtype=int),
        deem_cantons_2[deem_train_idxs],
    )),
}

# Build set of class weights (by sample) for centering and scaling,
# one weight array per class count
class_weights = {
    n_classes: utils.balanced_class_weights(cantons[n_classes])
    for n_classes in (2, 4)
}

In [7]:
# Load dummy Deem cantons to test the "null" case, keeping only the Deem training rows
dummy_cantons = {
    2: np.loadtxt('../Processed_Data/DEEM_330k/Data/dummy_cantons_2-class.dat', dtype=int)[deem_train_idxs],
    4: np.loadtxt('../Processed_Data/DEEM_330k/Data/dummy_cantons_4-class.dat', dtype=int)[deem_train_idxs],
}

# Build set of dummy class weights (by sample) for centering and scaling
dummy_class_weights = {
    n_classes: utils.balanced_class_weights(dummy_cantons[n_classes])
    for n_classes in (2, 4)
}

# Model setup

In [8]:
# Directory for model files (not referenced in the cells visible here)
model_dir = '../Processed_Data/Models'

# Dataset names and the per-dataset data directories derived from them
deem_name = 'DEEM_330k'
iza_name = 'IZA_230'
deem_dir = f'../Processed_Data/{deem_name}/Data'
iza_dir = f'../Processed_Data/{iza_name}/Data'

In [9]:
# CV splits
# Number of folds used by every cross-validation splitter below
n_splits = 2

# Keyword arguments for the StandardNormScaler applied to the targets (y)
y_scaler_parameters = dict(featurewise=False)

# Fixed settings shared by the PCovR and Ridge estimators in this notebook
pcovr_parameters = dict(n_components=4)
ridge_parameters = dict(fit_intercept=False, tol=1.0E-12)

# Grid for the PCovR pipelines. Keys follow the nesting
#   pipeline step 'pcovr' -> TransformedTargetRegressor.regressor (PCovR) -> parameter,
# giving 3 mixing values in [0, 1] and 3 alphas (1e-10, 1e-5, 1) for PCovR's inner regressor.
pcovr_parameter_grid = dict(
    pcovr__regressor__mixing=np.linspace(0.0, 1.0, 3),
    pcovr__regressor__regressor__alpha=np.logspace(-10, 0, 3)
)

# Grid for the Ridge pipelines: the same 3 alphas, addressed through step 'ridge'
ridge_parameter_grid = dict(ridge__regressor__alpha=np.logspace(-10, 0, 3))

# PCovR

In [10]:
# Load SOAPs
iza_file = f'{iza_dir}/6.0/soaps_power_full_avg_nonorm.hdf5'

# The IZA rows are requested in ascending index order and then permuted back, so
# that row i of `iza_soaps` corresponds to `iza_train_idxs[i]`.
# NOTE(review): presumably `utils.load_hdf5` needs increasing indices (as h5py
# fancy indexing does) -- confirm against the helper.
idxs_sort = np.argsort(iza_train_idxs)
idxs_rev = np.argsort(idxs_sort)
iza_soaps = utils.load_hdf5(iza_file, indices=iza_train_idxs[idxs_sort])
iza_soaps = iza_soaps[idxs_rev]
n_iza_soaps = iza_soaps.shape[0]

# Half of the IZA structures go to training (alternative sizes kept for reference)
# n_train_iza = 50
n_train_iza = n_iza_soaps // 2
# n_train_iza = n_iza_soaps - 2
n_test_iza = n_iza_soaps - n_train_iza

# NOTE(review): the Deem indices are passed to the loader as stored, without the
# sort/unsort step used for IZA -- presumably they are already ascending; verify.
deem_file = f'{deem_dir}/6.0/soaps_power_full_avg_nonorm.hdf5'
deem_soaps = utils.load_hdf5(deem_file, indices=deem_train_idxs)
n_deem_soaps = deem_soaps.shape[0]

# Half of the Deem structures go to training (alternative sizes kept for reference)
# n_train_deem = 5000
n_train_deem = n_deem_soaps // 2
# n_train_deem = n_deem_soaps - 2
n_test_deem = n_deem_soaps - n_train_deem

# Seeded generator so that the random train/test partition is identical on every
# "Restart & Run All". The unseeded global NumPy RNG used previously produced a
# different partition (and therefore different downstream numbers) on each run.
split_rng = default_rng(0)

# Shuffle positions (not the original dataset indices) and cut at the training size
deem_idxs = np.arange(0, n_deem_soaps)
split_rng.shuffle(deem_idxs)
deem_train_idxs2, deem_test_idxs2 = np.split(deem_idxs, [n_train_deem])

iza_idxs = np.arange(0, n_iza_soaps)
split_rng.shuffle(iza_idxs)
iza_train_idxs2, iza_test_idxs2 = np.split(iza_idxs, [n_train_iza])

# Feature matrices: IZA rows first, then Deem rows
train_soaps = np.vstack((iza_soaps[iza_train_idxs2], deem_soaps[deem_train_idxs2]))
test_soaps = np.vstack((iza_soaps[iza_test_idxs2], deem_soaps[deem_test_idxs2]))

# Class labels in the same row order as the feature matrices, keyed by class count
train_cantons = {}
test_cantons = {}

# 2-class labels: 1 for every IZA structure, 2 for every Deem structure
train_cantons[2] = np.concatenate((
    np.ones(n_train_iza, dtype=int),
    np.ones(n_train_deem, dtype=int) * 2
))

test_cantons[2] = np.concatenate((
    np.ones(n_test_iza, dtype=int),
    np.ones(n_test_deem, dtype=int) * 2
))

# 4-class labels: IZA keeps its canton label, every Deem structure is labelled 4
train_cantons[4] = np.concatenate((
    iza_cantons[iza_train_idxs][iza_train_idxs2],
    np.ones(n_train_deem, dtype=int) * 4
))

test_cantons[4] = np.concatenate((
    iza_cantons[iza_train_idxs][iza_test_idxs2],
    np.ones(n_test_deem, dtype=int) * 4
))

# Per-sample class weights for the training labels.
# This rebinds `class_weights`, replacing the dictionary built from the "master" cantons.
class_weights = {}
class_weights[2] = utils.balanced_class_weights(train_cantons[2])
class_weights[4] = utils.balanced_class_weights(train_cantons[4])

In [11]:
# Class count of the classification problem used below: it selects the
# `{n_cantons}-Class` decision-function files and the matching label arrays
n_cantons = 2

In [12]:
# Load decision functions
# Structure-level SVC decision functions stored under LSVC/{n_cantons}-Class, for IZA and Deem
iza_dfs = np.loadtxt(f'{iza_dir}/6.0/LSVC/{n_cantons}-Class/Power/OO+OSi+SiSi/svc_structure_dfs.dat')
deem_dfs = np.loadtxt(f'{deem_dir}/6.0/LSVC/{n_cantons}-Class/Power/OO+OSi+SiSi/svc_structure_dfs.dat')

# Regression targets: restrict to the SVM training structures, then apply the
# train/test partition from the SOAP-loading cell (IZA rows first, then Deem),
# which matches the row order of `train_soaps` / `test_soaps`
train_dfs = np.concatenate((iza_dfs[iza_train_idxs][iza_train_idxs2], deem_dfs[deem_train_idxs][deem_train_idxs2]))
test_dfs = np.concatenate((iza_dfs[iza_train_idxs][iza_test_idxs2], deem_dfs[deem_train_idxs][deem_test_idxs2]))

# Single point

## Ridge

In [13]:
# Reference ridge regression at a single regularization (alpha=1e-6).
# The features pass through a StandardNormScaler; the targets are scaled by a second
# StandardNormScaler inside TransformedTargetRegressor, which also maps the
# predictions back to the original target scale.
ridge_pipeline = Pipeline(
    [
        ('norm_scaler', utils.StandardNormScaler()),
        ('ridge', TransformedTargetRegressor(
            regressor=Ridge(alpha=1.0E-6, **ridge_parameters),
            transformer=utils.StandardNormScaler(**y_scaler_parameters)
        ))
    ],
)

ridge_pipeline.fit(train_soaps, train_dfs)
# Cell output: predictions for the held-out rows
ridge_pipeline.predict(test_soaps)

array([-2.87091954, -2.25153113, -1.87853216, ...,  2.34189772,
        2.41276949,  2.06307019])

## PCovR

In [14]:
# PCovR at mixing=0.0 with the same inner Ridge settings as the single-point
# ridge cell; the stored outputs of the two cells agree to about 1e-7.
pipeline = Pipeline(
    [
        ('norm_scaler', utils.StandardNormScaler()),
        ('pcovr', TransformedTargetRegressor(
            regressor=PCovR(
                **pcovr_parameters, 
                mixing=0.0,
                regressor=Ridge(alpha=1.0E-6, **ridge_parameters),
            ),
            transformer=utils.StandardNormScaler(**y_scaler_parameters)
        ))
    ],
)

pipeline.fit(train_soaps, train_dfs)
# Cell output: predictions for the held-out rows
pipeline.predict(test_soaps)

array([-2.87091957, -2.25153115, -1.87853219, ...,  2.34189773,
        2.41276949,  2.06307019])

# CV

## Ridge

In [15]:
# Ridge pipeline whose alpha is left unset here; it is supplied by the grid search
pipeline = Pipeline(
    [
        ('norm_scaler', utils.StandardNormScaler()),
        ('ridge', TransformedTargetRegressor(
            regressor=Ridge(**ridge_parameters),
            transformer=utils.StandardNormScaler(**y_scaler_parameters)
        ))
    ],
)

# IZA + DEEM classification
# Plain (unstratified) shuffled K-fold scored by negative RMSE.
# refit=False: only `cv_results_` is produced, no final model is fitted.
# error_score='raise': a failing fit aborts the search instead of being scored as NaN.
gscv = GridSearchCV(
    pipeline, ridge_parameter_grid,
    scoring='neg_root_mean_squared_error',
    cv=KFold(n_splits=n_splits, shuffle=True, random_state=0),
    refit=False, return_train_score=True, error_score='raise',
    verbose=2
)

gscv.fit(train_soaps, train_dfs)

Fitting 2 folds for each of 3 candidates, totalling 6 fits
[CV] END ......................ridge__regressor__alpha=1e-10; total time=   0.4s
[CV] END ......................ridge__regressor__alpha=1e-10; total time=   0.3s
[CV] END ......................ridge__regressor__alpha=1e-05; total time=   0.4s
[CV] END ......................ridge__regressor__alpha=1e-05; total time=   0.3s
[CV] END ........................ridge__regressor__alpha=1.0; total time=   0.4s
[CV] END ........................ridge__regressor__alpha=1.0; total time=   0.3s


GridSearchCV(cv=KFold(n_splits=2, random_state=0, shuffle=True),
             error_score='raise',
             estimator=Pipeline(steps=[('norm_scaler', StandardNormScaler()),
                                       ('ridge',
                                        TransformedTargetRegressor(regressor=Ridge(fit_intercept=False,
                                                                                   tol=1e-12),
                                                                   transformer=StandardNormScaler()))]),
             param_grid={'ridge__regressor__alpha': array([1.e-10, 1.e-05, 1.e+00])},
             refit=False, return_train_score=True,
             scoring='neg_root_mean_squared_error', verbose=2)

In [16]:
gscv.cv_results_

{'mean_fit_time': array([0.33721697, 0.31307101, 0.31380606]),
 'std_fit_time': array([0.02304709, 0.00983119, 0.01015139]),
 'mean_score_time': array([0.02914107, 0.03050268, 0.03041935]),
 'std_score_time': array([0.0023073 , 0.00031483, 0.00027728]),
 'param_ridge__regressor__alpha': masked_array(data=[1e-10, 1e-05, 1.0],
              mask=[False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'ridge__regressor__alpha': 1e-10},
  {'ridge__regressor__alpha': 1e-05},
  {'ridge__regressor__alpha': 1.0}],
 'split0_test_score': array([-1.14701053e-10, -8.77499805e-06, -5.47710108e-02]),
 'split1_test_score': array([-4.04637623e-10, -9.30601010e-06, -5.56151714e-02]),
 'mean_test_score': array([-2.59669338e-10, -9.04050407e-06, -5.51930911e-02]),
 'std_test_score': array([1.44968285e-10, 2.65506027e-07, 4.22080282e-04]),
 'rank_test_score': array([1, 2, 3], dtype=int32),
 'split0_train_score': array([-4.00044401e-11, -3.52366354e-06, -5.26510378e-02]),
 '

## PCovR

In [17]:
# PCovR pipeline; mixing and the inner regressor's alpha are supplied by the grid search
pipeline = Pipeline(
    [
        ('norm_scaler', utils.StandardNormScaler()),
        ('pcovr', TransformedTargetRegressor(
            regressor=PCovR(**pcovr_parameters),
            transformer=utils.StandardNormScaler(**y_scaler_parameters)
        ))
    ],
)

# IZA + DEEM classification
# Same splitter and options as the ridge grid search, but scored with the project's
# `utils.pcovr_score`, passed as a scorer callable.
# NOTE(review): the definition of that score is not visible here -- see project_utils.
gscv = GridSearchCV(
    pipeline, pcovr_parameter_grid,
    scoring=utils.pcovr_score,
    cv=KFold(n_splits=n_splits, shuffle=True, random_state=0),
    refit=False, return_train_score=True, error_score='raise',
    verbose=2
)

gscv.fit(train_soaps, train_dfs)

Fitting 2 folds for each of 9 candidates, totalling 18 fits
[CV] END pcovr__regressor__mixing=0.0, pcovr__regressor__regressor__alpha=1e-10; total time=   6.2s
[CV] END pcovr__regressor__mixing=0.0, pcovr__regressor__regressor__alpha=1e-10; total time=   6.3s
[CV] END pcovr__regressor__mixing=0.0, pcovr__regressor__regressor__alpha=1e-05; total time=   6.2s
[CV] END pcovr__regressor__mixing=0.0, pcovr__regressor__regressor__alpha=1e-05; total time=   6.1s
[CV] END pcovr__regressor__mixing=0.0, pcovr__regressor__regressor__alpha=1.0; total time=   6.1s
[CV] END pcovr__regressor__mixing=0.0, pcovr__regressor__regressor__alpha=1.0; total time=   6.1s
[CV] END pcovr__regressor__mixing=0.5, pcovr__regressor__regressor__alpha=1e-10; total time=   6.2s
[CV] END pcovr__regressor__mixing=0.5, pcovr__regressor__regressor__alpha=1e-10; total time=   6.3s
[CV] END pcovr__regressor__mixing=0.5, pcovr__regressor__regressor__alpha=1e-05; total time=   6.2s
[CV] END pcovr__regressor__mixing=0.5, pcovr

GridSearchCV(cv=KFold(n_splits=2, random_state=0, shuffle=True),
             error_score='raise',
             estimator=Pipeline(steps=[('norm_scaler', StandardNormScaler()),
                                       ('pcovr',
                                        TransformedTargetRegressor(regressor=PCovR(n_components=4),
                                                                   transformer=StandardNormScaler()))]),
             param_grid={'pcovr__regressor__mixing': array([0. , 0.5, 1. ]),
                         'pcovr__regressor__regressor__alpha': array([1.e-10, 1.e-05, 1.e+00])},
             refit=False, return_train_score=True,
             scoring=<function pcovr_score at 0x7fd7463299d8>, verbose=2)

In [18]:
gscv.cv_results_

{'mean_fit_time': array([6.10389733, 6.02509153, 5.99428022, 6.13617504, 6.11217666,
        6.11929381, 6.13292754, 6.11214542, 6.08698165]),
 'std_fit_time': array([0.04745603, 0.02182877, 0.00472414, 0.0087117 , 0.00712395,
        0.00991237, 0.00554502, 0.01084042, 0.00543916]),
 'mean_score_time': array([0.14243543, 0.13907838, 0.11555803, 0.10922492, 0.10851419,
        0.14373469, 0.13766146, 0.12782061, 0.11375105]),
 'std_score_time': array([4.05156612e-03, 4.31466103e-03, 7.97998905e-03, 6.29782677e-04,
        4.88758087e-06, 4.83274460e-04, 4.08768654e-03, 6.09719753e-03,
        5.34784794e-03]),
 'param_pcovr__regressor__mixing': masked_array(data=[0.0, 0.0, 0.0, 0.5, 0.5, 0.5, 1.0, 1.0, 1.0],
              mask=[False, False, False, False, False, False, False, False,
                    False],
        fill_value='?',
             dtype=object),
 'param_pcovr__regressor__regressor__alpha': masked_array(data=[1e-10, 1e-05, 1.0, 1e-10, 1e-05, 1.0, 1e-10, 1e-05,
          

# CV with oversampling

In [19]:
class DataPrinter():
    """Debugging pass-through step that echoes the data it receives.

    Exposes the `fit` / `transform` pair expected of an intermediate
    pipeline step, printing its inputs and leaving them unmodified.
    """

    def fit(self, X, y=None):
        """Print `X`, then `y` when it is given, and return this object."""
        shown = (X,) if y is None else (X, y)
        for item in shown:
            print(item)
        return self

    def transform(self, X):
        """Print `X` and hand it back unchanged."""
        print(X)
        return X

## Ridge

In [20]:
# Target-side pipeline. In this section y is passed as two columns,
# [decision function, class label] (see the column_stack in the fit call below);
# column 0 is selected here and then scaled.
# NOTE(review): assumes utils.ColumnTransformerInverse acts like sklearn's
# ColumnTransformer with an added inverse_transform -- confirm in project_utils.
y_pipeline = Pipeline(
    [
        ('drop_features', utils.ColumnTransformerInverse([('drop_features', 'passthrough', [0])])),
        ('norm_scaler', utils.StandardNormScaler(**y_scaler_parameters))
    ]
)

# check_inverse=False skips TransformedTargetRegressor's check that the target
# transformer's inverse_transform undoes its transform.
pipeline = Pipeline(
    [
        ('norm_scaler', utils.StandardNormScaler()),
#         ('print', DataPrinter()),
        ('ridge', TransformedTargetRegressor(
            regressor=Ridge(**ridge_parameters),
            transformer=y_pipeline,
            check_inverse=False
        ))
    ],
)

# IZA + DEEM classification
# The scorer wraps the project's class-balanced metric around mean_squared_error;
# the class label is taken from the last y column (class_col=-1) and
# greater_is_better=False makes GridSearchCV treat the value as a loss.
# NOTE(review): `squared=False` is presumably forwarded to mean_squared_error, where
# that keyword was removed in scikit-learn 1.6 -- verify in project_utils.
# The splitter stratifies on the same last column (stratify_col=-1).
gscv = GridSearchCV(
    pipeline, ridge_parameter_grid,
    scoring=make_scorer(
        utils.class_balanced_metric_score,
        scorer=mean_squared_error,
        class_col=-1,
        squared=False,
        greater_is_better=False
    ),
    cv=utils.ReplicatedStratifiedKFold(n_splits=n_splits, stratify_col=-1, shuffle=True, random_state=0),
    refit=True, return_train_score=True, error_score='raise',
    verbose=2
)

# y carries the class label as an extra last column
gscv.fit(train_soaps, np.column_stack((train_dfs, train_cantons[n_cantons])))

Fitting 2 folds for each of 3 candidates, totalling 6 fits
[CV] END ......................ridge__regressor__alpha=1e-10; total time=   0.5s
[CV] END ......................ridge__regressor__alpha=1e-10; total time=   0.5s
[CV] END ......................ridge__regressor__alpha=1e-05; total time=   0.5s
[CV] END ......................ridge__regressor__alpha=1e-05; total time=   0.5s
[CV] END ........................ridge__regressor__alpha=1.0; total time=   0.5s
[CV] END ........................ridge__regressor__alpha=1.0; total time=   0.5s


GridSearchCV(cv=ReplicatedStratifiedKFold(n_splits=2, random_state=0, shuffle=True,
             stratify_col=-1),
             error_score='raise',
             estimator=Pipeline(steps=[('norm_scaler', StandardNormScaler()),
                                       ('ridge',
                                        TransformedTargetRegressor(check_inverse=False,
                                                                   regressor=Ridge(fit_intercept=False,
                                                                                   tol=1e-12),
                                                                   transformer=Pipeline(steps=[('drop_features',
                                                                                                ColumnTransformerInve...ransformers=[('drop_features',
                                                                                                                                        'passthrough',
                      

In [21]:
gscv.predict(test_soaps)

array([[-2.8709197 ],
       [-2.25153163],
       [-1.87853221],
       ...,
       [ 2.34189766],
       [ 2.41276934],
       [ 2.06307012]])

## PCovR

In [22]:
# Target-side pipeline. y is passed as two columns, [decision function, class label]
# (see the column_stack in the fit call below); column 0 is selected here and then scaled.
# NOTE(review): assumes utils.ColumnTransformerInverse acts like sklearn's
# ColumnTransformer with an added inverse_transform -- confirm in project_utils.
y_pipeline = Pipeline(
    [
        ('drop_features', utils.ColumnTransformerInverse([('drop_features', 'passthrough', [0])])),
        ('norm_scaler', utils.StandardNormScaler(**y_scaler_parameters))
    ]
)

# check_inverse=False skips TransformedTargetRegressor's check that the target
# transformer's inverse_transform undoes its transform.
pipeline = Pipeline(
    [
        ('norm_scaler', utils.StandardNormScaler()),
        ('pcovr', TransformedTargetRegressor(
#             regressor=PCovR(mixing=0.0, **pcovr_parameters),
            regressor=PCovR(**pcovr_parameters),
            transformer=y_pipeline,
            check_inverse=False
        ))
    ],
)

# IZA + DEEM classification
# Two metrics are recorded per candidate: the project's class-balanced PCovR score
# and a class-balanced RMSE. Both read the class label from the last y column
# (class_col=-1). refit='pcovr' picks the final model by the 'pcovr' metric.
# NOTE(review): `squared=False` is presumably forwarded to mean_squared_error, where
# that keyword was removed in scikit-learn 1.6 -- verify in project_utils.
gscv = GridSearchCV(
    pipeline, pcovr_parameter_grid,
    scoring={
        'pcovr': functools.partial(utils.class_balanced_pcovr_score, class_col=-1),
        'rmse': make_scorer(
            utils.class_balanced_metric_score,
            scorer=mean_squared_error,
            class_col=-1,
            squared=False,
            greater_is_better=False
        )
    },
    cv=utils.ReplicatedStratifiedKFold(n_splits=n_splits, stratify_col=-1, shuffle=True, random_state=0),
    refit='pcovr', return_train_score=True, error_score='raise',
    verbose=2
)

# y carries the class label as an extra last column
gscv.fit(train_soaps, np.column_stack((train_dfs, train_cantons[n_cantons])))

Fitting 2 folds for each of 9 candidates, totalling 18 fits
[CV] END pcovr__regressor__mixing=0.0, pcovr__regressor__regressor__alpha=1e-10; total time=   6.6s
[CV] END pcovr__regressor__mixing=0.0, pcovr__regressor__regressor__alpha=1e-10; total time=   6.4s
[CV] END pcovr__regressor__mixing=0.0, pcovr__regressor__regressor__alpha=1e-05; total time=   6.4s
[CV] END pcovr__regressor__mixing=0.0, pcovr__regressor__regressor__alpha=1e-05; total time=   6.5s
[CV] END pcovr__regressor__mixing=0.0, pcovr__regressor__regressor__alpha=1.0; total time=   6.4s
[CV] END pcovr__regressor__mixing=0.0, pcovr__regressor__regressor__alpha=1.0; total time=   6.3s
[CV] END pcovr__regressor__mixing=0.5, pcovr__regressor__regressor__alpha=1e-10; total time=   6.5s
[CV] END pcovr__regressor__mixing=0.5, pcovr__regressor__regressor__alpha=1e-10; total time=   6.5s
[CV] END pcovr__regressor__mixing=0.5, pcovr__regressor__regressor__alpha=1e-05; total time=   6.4s
[CV] END pcovr__regressor__mixing=0.5, pcovr

GridSearchCV(cv=ReplicatedStratifiedKFold(n_splits=2, random_state=0, shuffle=True,
             stratify_col=-1),
             error_score='raise',
             estimator=Pipeline(steps=[('norm_scaler', StandardNormScaler()),
                                       ('pcovr',
                                        TransformedTargetRegressor(check_inverse=False,
                                                                   regressor=PCovR(n_components=4),
                                                                   transformer=Pipeline(steps=[('drop_features',
                                                                                                ColumnTransformerInverse(transformer...
                         'pcovr__regressor__regressor__alpha': array([1.e-10, 1.e-05, 1.e+00])},
             refit='pcovr', return_train_score=True,
             scoring={'pcovr': functools.partial(<function class_balanced_pcovr_score at 0x7fd746329a60>, class_col=-1),
                

In [23]:
gscv.predict(test_soaps)

array([[-2.83344095],
       [-2.19005616],
       [-1.83995736],
       ...,
       [ 2.30944517],
       [ 2.38278481],
       [ 2.0797624 ]])

In [24]:
gscv.cv_results_

{'mean_fit_time': array([6.3054899 , 6.29282749, 6.17224383, 6.31191695, 6.28120601,
        6.29823864, 6.3026067 , 6.29776394, 6.28737533]),
 'std_fit_time': array([0.07514679, 0.04826152, 0.01819658, 0.00365031, 0.01173365,
        0.00025427, 0.00025403, 0.00641119, 0.00362766]),
 'mean_score_time': array([0.1761775 , 0.18165267, 0.17947423, 0.1976105 , 0.1871109 ,
        0.17372298, 0.17897904, 0.18496096, 0.18070209]),
 'std_score_time': array([0.01060009, 0.01162302, 0.0087558 , 0.00382245, 0.0078907 ,
        0.0043757 , 0.00212562, 0.00732172, 0.01073492]),
 'param_pcovr__regressor__mixing': masked_array(data=[0.0, 0.0, 0.0, 0.5, 0.5, 0.5, 1.0, 1.0, 1.0],
              mask=[False, False, False, False, False, False, False, False,
                    False],
        fill_value='?',
             dtype=object),
 'param_pcovr__regressor__regressor__alpha': masked_array(data=[1e-10, 1e-05, 1.0, 1e-10, 1e-05, 1.0, 1e-10, 1e-05,
                    1.0],
              mask=[False, F

# Sample weight pipeline

In [None]:
from sklearn.svm import LinearSVC

## Single point

In [None]:
# Linear SVC on scaled SOAPs, with the scaler's sample weights passed explicitly
pipeline = Pipeline(
    [
        ('norm_scaler', utils.StandardNormScaler()),
        ('svc', LinearSVC(class_weight='balanced', max_iter=10000, dual=False))
    ],
)

# Per-sample balanced class weights, routed to the 'norm_scaler' step through the
# Pipeline `<step>__<parameter>` fit-parameter convention
wts = utils.balanced_class_weights(train_cantons[n_cantons])
fit_params = {'norm_scaler__sample_weight': wts}
pipeline.fit(train_soaps, train_cantons[n_cantons], **fit_params)
# Cell output: decision function values for the held-out rows
pipeline.decision_function(test_soaps)

In [None]:
# Same two steps as the plain Pipeline variant, but built with the project's
# ClassBalancedPipeline, which is given the name of the fit parameter to fill
# (`keys`) instead of an explicit weight array.
# NOTE(review): presumably the pipeline computes balanced class weights from y
# itself for each listed key -- confirm in project_utils.
pipeline = utils.ClassBalancedPipeline(
    [
        ('norm_scaler', utils.StandardNormScaler()),
        ('svc', LinearSVC(class_weight='balanced', max_iter=10000, dual=False))
    ],
)

pipeline.fit(train_soaps, train_cantons[n_cantons], keys=['norm_scaler__sample_weight'])
# Cell output: decision function values for the held-out rows
pipeline.decision_function(test_soaps)

## CV

In [None]:
# Grid over the LinearSVC regularization parameter C: 0.1, 1, 10
svc_parameter_grid = {'svc__C': np.logspace(-1, 1, 3)}

In [None]:
# No weights
# Baseline: the scaler is fitted without sample weights
pipeline = Pipeline(
    [
        ('norm_scaler', utils.StandardNormScaler()),
        ('svc', LinearSVC(class_weight='balanced', max_iter=10000, dual=False))
    ],
)

# Stratified shuffled K-fold scored by balanced accuracy; refit=True fits the best
# candidate on all training rows so that `decision_function` is available afterwards
gscv = GridSearchCV(
    pipeline, svc_parameter_grid,
    scoring='balanced_accuracy',
    cv=StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=0),
    refit=True, return_train_score=True, error_score='raise',
    verbose=2
)

gscv.fit(train_soaps, train_cantons[n_cantons])
gscv.decision_function(test_soaps)

In [None]:
gscv.cv_results_

In [None]:
# fit_params weights
# Variant: per-sample weights are computed once on the full training labels and
# handed to GridSearchCV.fit as a fit parameter for the 'norm_scaler' step
pipeline = Pipeline(
    [
        ('norm_scaler', utils.StandardNormScaler()),
        ('svc', LinearSVC(class_weight='balanced', max_iter=10000, dual=False))
    ],
)

gscv = GridSearchCV(
    pipeline, svc_parameter_grid,
    scoring='balanced_accuracy',
    cv=StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=0),
    refit=True, return_train_score=True, error_score='raise',
    verbose=2
)

wts = utils.balanced_class_weights(train_cantons[n_cantons])
fit_params = {'norm_scaler__sample_weight': wts}
gscv.fit(train_soaps, train_cantons[n_cantons], **fit_params)
gscv.decision_function(test_soaps)

In [None]:
gscv.cv_results_

In [None]:
# ClassBalancedPipeline weights
# Variant: the project's ClassBalancedPipeline receives only the name of the fit
# parameter to fill (`keys`), not a weight array.
# NOTE(review): presumably the weights are then derived from the y of each fit
# (i.e. per CV fold) -- confirm in project_utils.
pipeline = utils.ClassBalancedPipeline(
    [
        ('norm_scaler', utils.StandardNormScaler()),
        ('svc', LinearSVC(class_weight='balanced', max_iter=10000, dual=False))
    ],
)

gscv = GridSearchCV(
    pipeline, svc_parameter_grid,
    scoring='balanced_accuracy',
    cv=StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=0),
    refit=True, return_train_score=True, error_score='raise',
    verbose=2
)

gscv.fit(train_soaps, train_cantons[n_cantons], keys=['norm_scaler__sample_weight'])
gscv.decision_function(test_soaps)

In [None]:
gscv.cv_results_

# Testing weights

This might work, but for the time being it is probably better to work with replicated samples, despite the computational expense.

In [None]:
# Per-sample balanced class weights, placed on the diagonal of a dense square
# matrix W (one row/column per training sample)
w = utils.balanced_class_weights(train_cantons[n_cantons])
W = np.diagflat(w)
# Elementwise square root; for a diagonal W this just takes the square root of each diagonal weight
Wsqrt = np.sqrt(W)

In [None]:
# Weights as they should be
np.trace(Wsqrt @ train_soaps @ train_soaps.T @ Wsqrt)

In [None]:
# Weighted covariance
train_soaps.T @ W @ train_soaps

In [None]:
# Replicate training rows and labels: entry i is repeated rpt_idxs[i] times.
# NOTE(review): `repeat_idxs` is neither defined nor imported in the visible part of
# this notebook, so this line raises NameError on a fresh kernel unless it is
# defined elsewhere; possibly `utils.repeat_idxs` was intended -- confirm.
rpt_idxs = repeat_idxs(train_cantons[n_cantons])
replicated_train_soaps = np.repeat(train_soaps, rpt_idxs, axis=0)
replicated_train_cantons = np.repeat(train_cantons[n_cantons], rpt_idxs)

In [None]:
# "Weights" from replication
np.trace(replicated_train_soaps @ replicated_train_soaps.T) / len(replicated_train_soaps)

In [None]:
# Replicated covariance
replicated_train_soaps.T @ replicated_train_soaps / len(replicated_train_soaps)

In [None]:
# Eigenvalues (wv) and eigenvectors (wu) of the weighted Gram matrix Wsqrt X X^T Wsqrt
wv, wu = np.linalg.eigh(Wsqrt @ train_soaps @ train_soaps.T @ Wsqrt)

In [None]:
# Eigenvalues (rv) and eigenvectors (ru) of the replicated-sample Gram matrix,
# divided by the number of replicated rows
rv, ru = np.linalg.eigh(replicated_train_soaps @ replicated_train_soaps.T / len(replicated_train_soaps))

In [None]:
np.count_nonzero(wv > 1.0E-12), np.count_nonzero(rv  > 1.0E-12)

In [None]:
# Build per-sample weights `wr` from the replication counts, for comparison with
# the trace of the replicated Gram matrix computed earlier.
unique_labels, label_counts = np.unique(train_cantons[n_cantons], return_counts=True)
# Cut the repeat counts into one chunk per class at the cumulative class sizes;
# np.split leaves an empty trailing chunk, which [0:-1] drops.
# NOTE(review): this assumes the samples are ordered by class, in the order of
# `unique_labels`. That holds for the 2-class labels (all 1s, then all 2s) but may
# not hold for the 4-class labels, whose IZA part follows a shuffled order -- confirm.
split_rpt_idxs = np.split(rpt_idxs, np.cumsum(label_counts))[0:-1]
# Original and replicated number of samples in each chunk
class_counts = np.array([len(counts) for counts in split_rpt_idxs])
replicated_class_counts = np.array([np.sum(counts) for counts in split_rpt_idxs])
# Replicated samples in excess of the last class, expressed per original sample
extra_samples = (replicated_class_counts - replicated_class_counts[-1]) / class_counts

labels = train_cantons[n_cantons]
n_samples = len(train_cantons[n_cantons])
n_classes = len(unique_labels)
wr = np.zeros(n_samples)
# Balanced-weight formula n_total / (n_classes * n_class), with the class sizes and
# the total adjusted by the extra-sample terms computed above
for ul, lc in zip(unique_labels, label_counts+extra_samples):
    label_weight = (n_samples + np.sum(extra_samples * class_counts)) / (n_classes * lc)
    wr[labels == ul] = label_weight
    
# Normalize the weights to sum to 1, then form the diagonal matrix and its
# elementwise square root
wr /= np.sum(wr)
Wr = np.diagflat(wr)
Wrsqrt = np.sqrt(Wr)

In [None]:
np.trace(Wrsqrt @ train_soaps @ train_soaps.T @ Wrsqrt)

# Testing RR with replicated samples vs. sample weights

In [None]:
# Repeats
# Ridge with built-in 2-fold selection of alpha (RidgeCV), fitted on explicitly
# replicated samples: feature row i and target i are both repeated rpt_idxs[i] times.
# Unlike the earlier ridge cells, the target scaler here uses featurewise=True.
ridge_pipeline = Pipeline(
    [
        ('norm_scaler', utils.StandardNormScaler()),
        ('ridge', TransformedTargetRegressor(
            regressor=RidgeCV(alphas=np.logspace(-8, 0, 5), fit_intercept=False, cv=2),
            #regressor=Ridge(alpha=1.0E-6, **ridge_parameters),
            transformer=utils.StandardNormScaler(featurewise=True)
        ))
    ],
)

ridge_pipeline.fit(np.repeat(train_soaps, rpt_idxs, axis=0), np.repeat(train_dfs, rpt_idxs, axis=0))
# Cell output: predictions for the (unreplicated) held-out rows
ridge_pipeline.predict(test_soaps)

In [None]:
# Test-set RMSE of the replicated-sample ridge model: one value per class, the
# overall value, and the unweighted mean over classes.
#
# RMSE is computed as the square root of the per-output MSE, averaged over outputs.
# This is the value `mean_squared_error(..., squared=False)` returned; the `squared`
# keyword was deprecated in scikit-learn 1.4 and removed in 1.6, so it is avoided.
rmse = np.zeros(len(unique_labels))
for udx, ul in enumerate(unique_labels):
    # Boolean mask selecting the test rows of this class
    in_class = test_cantons[n_cantons] == ul
    rmse[udx] = np.mean(np.sqrt(mean_squared_error(
        test_dfs[in_class],
        ridge_pipeline.predict(test_soaps[in_class]),
        multioutput='raw_values'
    )))

# Overall RMSE across all test rows
print(np.mean(np.sqrt(mean_squared_error(
    test_dfs, ridge_pipeline.predict(test_soaps), multioutput='raw_values'
))))
# Per-class RMSE and its class average
print(rmse)
print(np.mean(rmse))

In [None]:
# Weights
# Sample-weight counterpart of the replicated-sample ridge fit.
# NOTE(review): `ww` is uniform (1 / n_samples for every row), not class-balanced
# -- confirm this is the intended comparison.
ww = np.ones(len(train_dfs)) / len(train_dfs)
ridge_pipeline = Pipeline(
    [
        ('norm_scaler', utils.StandardNormScaler()),
        ('ridge', TransformedTargetRegressor(
            regressor=RidgeCV(alphas=np.logspace(-8, 0, 5), fit_intercept=False, cv=2),
            #regressor=Ridge(alpha=1.0E-6, **ridge_parameters),
            # y is passed as [targets..., weight] (see the column_stack below).
            # NOTE(review): presumably DataSplitter feeds all but the last column to
            # the wrapped scaler and uses the last column as its sample weights --
            # confirm in project_utils.
            transformer=utils.DataSplitter(
                model=utils.StandardNormScaler(featurewise=True),
                X_cols=slice(0, -1),
                weight_col=-1
            ),
            check_inverse=False
        ))
    ],
)
# The same weights are also routed to the feature scaler and to the 'ridge' step
fit_params = dict(
    norm_scaler__sample_weight=ww,
    ridge__sample_weight=ww
)
ridge_pipeline.fit(train_soaps, np.column_stack((train_dfs, ww)), **fit_params)
# Cell output: predictions for the held-out rows
ridge_pipeline.predict(test_soaps)

In [None]:
# Test-set RMSE of the sample-weighted ridge model: one value per class, the
# overall value, and the unweighted mean over classes.
#
# RMSE is computed as the square root of the per-output MSE, averaged over outputs.
# This is the value `mean_squared_error(..., squared=False)` returned; the `squared`
# keyword was deprecated in scikit-learn 1.4 and removed in 1.6, so it is avoided.
rmse = np.zeros(len(unique_labels))
for udx, ul in enumerate(unique_labels):
    # Boolean mask selecting the test rows of this class
    in_class = test_cantons[n_cantons] == ul
    rmse[udx] = np.mean(np.sqrt(mean_squared_error(
        test_dfs[in_class],
        ridge_pipeline.predict(test_soaps[in_class]),
        multioutput='raw_values'
    )))

# Overall RMSE across all test rows (predictions squeezed, as in the original cell)
print(np.mean(np.sqrt(mean_squared_error(
    test_dfs, ridge_pipeline.predict(test_soaps).squeeze(), multioutput='raw_values'
))))
# Per-class RMSE and its class average
print(rmse)
print(np.mean(rmse))

# Testing PCA with replicated samples vs. sample weights