In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
import pickle
import logging
from itertools import combinations
import os
import sys
# Make the parent directory and the local lgcg checkout importable for the imports below.
module_path = os.path.abspath('..')
lgcg_path = os.path.abspath('../../../lazified-generalized-conditional-gradient/lgcg')
for extra_path in (module_path, lgcg_path):
    if extra_path not in sys.path:
        sys.path.append(extra_path)
from sisso import SISSO
from lgcg import LGCG_finite

# Load Data

In [3]:
# Load the pickled evaluation matrix. The recorded shape is (233799, 75); the 75 columns
# line up with the 75 entries of the target vector y.
# NOTE: unpickling can execute arbitrary code - only load files from a trusted source.
with open("evals_thermal.pkl", "rb") as evals_file:  # context manager closes the handle
    K = pickle.load(evals_file)
K.shape

(233799, 75)

In [4]:
# Target vector: the "log kappa_L" column of the thermal-conductivity CSV as a NumPy array.
y = pd.read_csv("thermal_conductivity_data.csv")["log kappa_L"].values
y.shape

(75,)

# SISSO

In [5]:
# Set up SISSO on the transposed matrix (K is (233799, 75), so K.T has one row per entry of y).
exp_sisso = SISSO(K=K.T, target=y)

In [6]:
# Run the SISSO fit; the recorded log reports the best feature combination and error per iteration.
exp_sisso.fit()

INFO:root:Iteration: 1
INFO:root:Error: 0.30373924013773684
INFO:root:Number of combinations: 25
INFO:root:Optimal combination: (17654,)
INFO:root:Optimal coefficients: [11.06288295  0.11840932]
INFO:root:------------------


INFO:root:Iteration: 2
INFO:root:Error: 0.23759122318118392
INFO:root:Number of combinations: 1225
INFO:root:Optimal combination: (17654, 55857)
INFO:root:Optimal coefficients: [ 9.27143569 -2.28259027  0.44046387]
INFO:root:------------------
INFO:root:Iteration: 3
INFO:root:Error: 0.17777442750366648
INFO:root:Number of combinations: 67525
INFO:root:Optimal combination: (55857, 113455, 228993)
INFO:root:Optimal coefficients: [-2.45898648  9.9177373  -1.60420282  0.47186572]
INFO:root:------------------
INFO:root:Iteration: 4
INFO:root:Error: 0.1638370934699184
INFO:root:Number of combinations: 3921225
INFO:root:Optimal combination: (17654, 55857, 151583, 228993)
INFO:root:Optimal coefficients: [ 9.2733953  -2.0677772  -0.9916738  -1.52842136  0.57010248]
INFO:root:------------------


[array([0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.11840932]),
 array([0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.44046387]),
 array([0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.47186572]),
 array([0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.57010248])]

In [7]:
# Drop the SISSO experiment object (presumably to release memory before the LGCG runs).
del exp_sisso

# LGCG

In [8]:
# LGCG experiment on the same transposed matrix and target, with alpha=0.1.
exp = LGCG_finite(K=K.T, target=y, alpha=0.1)

In [9]:
# Run the solver with tol=1e-11; the returned mapping is read below through "u" and "support".
result = exp.solve(tol=1e-11)

DEBUG:root:SSN in 2 dimensions converged in 2 iterations to tolerance 1.000E-01
INFO:root:1: Phi 3.644E-01, epsilon 2.550E+00, support [ 46695 233799], Psi 1.000E-01
DEBUG:root:SSN in 3 dimensions converged in 1 iterations to tolerance 1.000E-01
INFO:root:2: Phi 1.864E-01, epsilon 3.195E+00, support [ 19855  46695 233799], Psi 1.000E-01
DEBUG:root:SSN in 4 dimensions converged in 1 iterations to tolerance 1.000E-01
INFO:root:3: Phi 1.565E-01, epsilon 3.250E+00, support [ 19855  46695 153142 233799], Psi 1.000E-01
DEBUG:root:SSN in 5 dimensions converged in 1 iterations to tolerance 1.000E-01
INFO:root:4: Phi 1.210E-01, epsilon 3.107E+00, support [ 19855  46695  85007 153142 233799], Psi 1.000E-01
DEBUG:root:SSN in 6 dimensions converged in 1 iterations to tolerance 1.000E-01
INFO:root:5: Phi 5.902E-02, epsilon 2.905E+00, support [ 19855  85007 114978 153142 233799], Psi 1.000E-01
DEBUG:root:SSN in 6 dimensions converged in 0 iterations to tolerance 1.000E-01
INFO:root:6: Phi 5.193E-02,

DEBUG:root:SSN in 7 dimensions converged in 1 iterations to tolerance 6.250E-03
INFO:root:19: Phi 7.140E-03, epsilon 1.223E+00, support [ 30460  40241  46171 110190 115194 126919 149760], Psi 6.250E-03
DEBUG:root:SSN in 8 dimensions converged in 0 iterations to tolerance 6.250E-03
INFO:root:20: Phi 6.237E-03, epsilon 1.171E+00, support [ 30460  36431  40241  46171 110190 115194 126919 149760], Psi 6.250E-03
DEBUG:root:SSN in 8 dimensions converged in 2 iterations to tolerance 3.125E-03
INFO:root:21: Phi 8.775E-03, epsilon 1.122E+00, support [ 30460  36431  40241  46171 110190 115194 149760], Psi 3.125E-03
DEBUG:root:SSN in 8 dimensions converged in 1 iterations to tolerance 3.125E-03
INFO:root:22: Phi 3.255E-03, epsilon 1.078E+00, support [ 30460  36431  40241  46171 110190 115194 149760 183341], Psi 3.125E-03
DEBUG:root:SSN in 9 dimensions converged in 0 iterations to tolerance 3.125E-03
INFO:root:23: Phi 2.901E-03, epsilon 1.037E+00, support [ 19073  30460  36431  40241  46171 110190

In [10]:
# Coefficients and the indices they belong to, as returned by the solver.
u = result["u"]
print(u)
support = result["support"]
print(support)
# Scatter the sparse coefficients into a dense vector with one entry per column of exp.K.
u_bar = np.zeros(exp.K.shape[1])
u_bar[support] = u

[ 5.63073546e-01  4.30162553e-04  1.06077913e-02  8.12262120e-03
 -8.50070612e-02 -4.64990300e-01  1.75735707e-04  1.14962932e-04
  1.46636015e-03  2.82628591e-03  6.31656823e-03]
[  3664  18331  19073  21404  36431  46171  74388  75210 110190 126955
 149760]


In [11]:
# Renormalize u_bar (disabled for this run)
# NOTE(review): this cell contains only commented-out code; the same steps are active later
# for exp_exact (producing u_bar_r). Confirm whether it should be removed or re-enabled.
# u_bar = u_bar/exp.target_norm
# for iter, nor in enumerate(exp.K_norms):
#     u_bar[iter] *= nor

In [12]:
# RMSE of the LGCG coefficients against the target, evaluated on K.T extended by a ones column.
n_samples = K.shape[1]
residual = np.append(K.T, np.ones((n_samples, 1)), axis=1) @ u_bar - y
np.sqrt(np.mean(np.square(residual)))

0.24840872239600664

### Correlations

In [13]:
# Columns of exp.K selected by the recovered support.
K_support = exp.K[:, support]
K_support.shape

(75, 11)

In [14]:
# Wrap the support columns in a DataFrame so pandas can compute their correlations.
df =pd.DataFrame(K_support)

In [15]:
# Pairwise correlations (pandas default: Pearson) between the support columns,
# rendered as a colour-graded table with two decimals.
corr = df.corr()
corr.style.background_gradient(cmap='BrBG_r', axis=None).format(precision=2)

DEBUG:matplotlib:matplotlib data path: /vol/cs-hu/hnatiuar@hu-berlin.de/miniconda3/envs/sissopp_env/lib/python3.9/site-packages/matplotlib/mpl-data
DEBUG:matplotlib:CONFIGDIR=/vol/cs-hu/hnatiuar@hu-berlin.de/.config/matplotlib
DEBUG:matplotlib:interactive is False
DEBUG:matplotlib:platform is linux
DEBUG:matplotlib:CACHEDIR=/vol/cs-hu/hnatiuar@hu-berlin.de/.cache/matplotlib
DEBUG:matplotlib.font_manager:Using fontManager instance from /vol/cs-hu/hnatiuar@hu-berlin.de/.cache/matplotlib/fontlist-v330.json


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,1.0,0.58,0.59,0.39,-0.77,-0.92,0.63,0.54,0.47,0.56,0.61
1,0.58,1.0,0.88,0.69,-0.6,-0.74,0.96,0.98,0.65,0.82,0.84
2,0.59,0.88,1.0,0.6,-0.51,-0.7,0.86,0.86,0.85,0.77,0.89
3,0.39,0.69,0.6,1.0,-0.26,-0.57,0.69,0.68,0.41,0.42,0.59
4,-0.77,-0.6,-0.51,-0.26,1.0,0.72,-0.64,-0.59,-0.43,-0.78,-0.5
5,-0.92,-0.74,-0.7,-0.57,0.72,1.0,-0.76,-0.7,-0.49,-0.63,-0.74
6,0.63,0.96,0.86,0.69,-0.64,-0.76,1.0,0.97,0.64,0.8,0.77
7,0.54,0.98,0.86,0.68,-0.59,-0.7,0.97,1.0,0.66,0.79,0.77
8,0.47,0.65,0.85,0.41,-0.43,-0.49,0.64,0.66,1.0,0.67,0.78
9,0.56,0.82,0.77,0.42,-0.78,-0.63,0.8,0.79,0.67,1.0,0.77


### SISSO Step

In [16]:
# SO step restricted to the LGCG support: for every subset size n, fit ordinary least
# squares on each n-subset of the support columns of exp.K and keep the subset with the
# lowest RMSE against y.
constant_index = exp.K.shape[1] - 1  # last column of exp.K is treated as the constant term
# Loop-invariant work, hoisted out of the combination loop: decide once whether an explicit
# intercept column must be appended, and allocate that column once.
constant_in_support = constant_index in support
intercept_column = np.ones((exp.K.shape[0], 1))

for n in range(1, len(support) + 1):
    min_error = np.inf  # any finite RMSE beats this; no magic starting value needed
    optimal_combination = None
    optimal_coefficients = None
    combinatorial_counter = 0
    for combination in combinations(support, n):
        combinatorial_counter += 1
        submatrix = exp.K[:, np.array(combination)]
        if not constant_in_support:
            submatrix = np.append(submatrix, intercept_column, axis=1)
        try:
            least_squares, _res, _rank, _sv = np.linalg.lstsq(
                submatrix, y, rcond=None
            )
        except np.linalg.LinAlgError:
            continue  # solver failed on this subset; it cannot become the optimum
        local_error = np.sqrt(
            np.mean(np.square(np.matmul(submatrix, least_squares) - y))
        )  # RMSE
        if local_error < min_error:
            min_error = local_error
            optimal_combination = combination
            optimal_coefficients = least_squares
    error = min_error

    logging.info(f"Iteration: {n}")
    logging.info(f"Error: {error}")
    logging.info(f"Number of combinations: {combinatorial_counter}")
    logging.info(f"Optimal combination: {optimal_combination}")
    logging.info(f"Optimal coefficients: {optimal_coefficients}")
    logging.info("------------------")

INFO:root:Iteration: 1
INFO:root:Error: 0.34652639094464455
INFO:root:Number of combinations: 11
INFO:root:Optimal combination: (149760,)
INFO:root:Optimal coefficients: [13.24568038 -0.18019397]
INFO:root:------------------
INFO:root:Iteration: 2
INFO:root:Error: 0.24529862495500807
INFO:root:Number of combinations: 55
INFO:root:Optimal combination: (19073, 46171)
INFO:root:Optimal coefficients: [ 6.38343945 -5.226458    0.0979673 ]
INFO:root:------------------
INFO:root:Iteration: 3
INFO:root:Error: 0.20590683397571385
INFO:root:Number of combinations: 165
INFO:root:Optimal combination: (3664, 75210, 149760)
INFO:root:Optimal coefficients: [ 3.92391854  3.49652666  6.13769723 -0.12596177]
INFO:root:------------------
INFO:root:Iteration: 4
INFO:root:Error: 0.19873944223662543
INFO:root:Number of combinations: 330
INFO:root:Optimal combination: (3664, 75210, 110190, 149760)
INFO:root:Optimal coefficients: [ 3.96084385  3.30661799  1.55174112  4.95399608 -0.14549949]
INFO:root:--------

INFO:root:------------------
INFO:root:Iteration: 7
INFO:root:Error: 0.1824317242001786
INFO:root:Number of combinations: 330
INFO:root:Optimal combination: (3664, 18331, 19073, 21404, 36431, 75210, 149760)
INFO:root:Optimal coefficients: [ 2.62309049 -4.30774824  1.83661171  1.13710739 -1.65992088  5.04153592
  6.40590392 -0.20380974]
INFO:root:------------------
INFO:root:Iteration: 8
INFO:root:Error: 0.18194086682365085
INFO:root:Number of combinations: 165
INFO:root:Optimal combination: (3664, 18331, 19073, 21404, 36431, 75210, 110190, 149760)
INFO:root:Optimal coefficients: [ 2.70419694 -3.4472118   1.38594985  1.16144992 -1.5749378   4.44188589
  0.60232498  5.95842242 -0.19915728]
INFO:root:------------------
INFO:root:Iteration: 9
INFO:root:Error: 0.1813068119643968
INFO:root:Number of combinations: 55
INFO:root:Optimal combination: (3664, 18331, 19073, 21404, 36431, 46171, 75210, 110190, 149760)
INFO:root:Optimal coefficients: [ 1.98854963 -3.12614072  1.19670116  1.0761242  -

# Exact GCG

In [17]:
# Second LGCG experiment with identical inputs, used for the exact solve below.
exp_exact = LGCG_finite(K=K.T, target=y, alpha=0.1)

In [18]:
# Run solve_exact with tol=1e-11; the returned mapping is read below through "u" and "support".
result_exact = exp_exact.solve_exact(tol=1e-11)

DEBUG:root:SSN in 2 dimensions converged in 9 iterations to tolerance 1.000E-11
INFO:root:1: Phi 3.635E-01, support [ 46695 233799]
DEBUG:root:SSN in 3 dimensions converged in 11 iterations to tolerance 1.000E-11
INFO:root:2: Phi 2.020E-01, support [ 19855  46695 233799]
DEBUG:root:SSN in 4 dimensions converged in 8 iterations to tolerance 1.000E-11
INFO:root:3: Phi 1.728E-01, support [ 19855  46695 173837 233799]
DEBUG:root:SSN in 5 dimensions converged in 8 iterations to tolerance 1.000E-11
INFO:root:4: Phi 9.462E-02, support [ 19855  85007 173837 233799]
DEBUG:root:SSN in 5 dimensions converged in 8 iterations to tolerance 1.000E-11
INFO:root:5: Phi 7.677E-02, support [ 19855  85007 105954 173837 233799]
DEBUG:root:SSN in 6 dimensions converged in 8 iterations to tolerance 1.000E-11
INFO:root:6: Phi 5.863E-02, support [ 19855  85007 105954 110190 173837 233799]
DEBUG:root:SSN in 7 dimensions converged in 10 iterations to tolerance 1.000E-11
INFO:root:7: Phi 4.675E-02, support [ 1043

In [19]:
# Coefficients and the indices they belong to, as returned by the exact solver.
u = result_exact["u"]
print(u)
support = result_exact["support"]
print(support)
# Scatter the sparse coefficients into a dense vector with one entry per column of exp_exact.K.
u_bar = np.zeros(exp_exact.K.shape[1])
u_bar[support] = u

[ 5.63073546e-01  4.30162553e-04  1.06077913e-02  8.12262120e-03
 -8.50070612e-02 -4.64990300e-01  1.75735707e-04  1.14962932e-04
  1.46636015e-03  2.82628591e-03  6.31656823e-03]
[  3664  18331  19073  21404  36431  46171  74388  75210 110190 126955
 149760]


In [20]:
# Renormalize u_bar: divide by the target norm, then scale entry k by the k-th entry of
# exp_exact.K_norms. The result goes into a new array, so u_bar itself is left untouched.
u_bar_r = u_bar / exp_exact.target_norm
# Loop variables are deliberately not called `iter`: at cell level that name would
# overwrite the builtin iter() for the rest of the notebook session.
for col_idx, col_norm in enumerate(exp_exact.K_norms):
    u_bar_r[col_idx] *= col_norm

In [21]:
# Element-wise absolute value of exp_exact.p evaluated at the renormalised coefficients.
# NOTE(review): p is presumably the dual variable of the problem - confirm in lgcg.
p_bar = np.abs(exp_exact.p(u_bar_r))

In [22]:
# Show |p| at every support index, one value per line.
for support_idx in support:
    print(p_bar[support_idx])

0.10000000000000006
0.10000000000000006
0.10000000000000006
0.10000000000000006
0.10000000000000006
0.10000000000000006
0.10000000000000006
0.10000000000000009
0.10000000000000006
0.10000000000000007
0.10000000000000006


In [23]:
# Sigma in our theoretical considerations of the finite setting:
# half the gap between 0.1 (the alpha passed to LGCG_finite) and the largest |p| entry
# below it. The 0.0999999 cut-off excludes the support entries, whose |p| prints as ~0.1.
0.5*(0.1-np.max(p_bar[p_bar<0.0999999]))

2.8623519061785063e-05

In [24]:
# RMSE of the exact-solver coefficients against the target, on K.T extended by a ones column.
n_samples = K.shape[1]
residual = np.append(K.T, np.ones((n_samples, 1)), axis=1) @ u_bar - y
np.sqrt(np.mean(np.square(residual)))

0.24840872239600628

### Correlations

In [25]:
# Columns of exp_exact.K selected by the support of the exact solution.
K_support = exp_exact.K[:, support]
K_support.shape

(75, 11)

In [26]:
# Wrap the support columns in a DataFrame so pandas can compute their correlations.
df =pd.DataFrame(K_support)

In [27]:
# Pairwise correlations (pandas default: Pearson) between the support columns,
# rendered as a colour-graded table with two decimals.
corr = df.corr()
corr.style.background_gradient(cmap='BrBG_r', axis=None).format(precision=2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,1.0,0.58,0.59,0.39,-0.77,-0.92,0.63,0.54,0.47,0.56,0.61
1,0.58,1.0,0.88,0.69,-0.6,-0.74,0.96,0.98,0.65,0.82,0.84
2,0.59,0.88,1.0,0.6,-0.51,-0.7,0.86,0.86,0.85,0.77,0.89
3,0.39,0.69,0.6,1.0,-0.26,-0.57,0.69,0.68,0.41,0.42,0.59
4,-0.77,-0.6,-0.51,-0.26,1.0,0.72,-0.64,-0.59,-0.43,-0.78,-0.5
5,-0.92,-0.74,-0.7,-0.57,0.72,1.0,-0.76,-0.7,-0.49,-0.63,-0.74
6,0.63,0.96,0.86,0.69,-0.64,-0.76,1.0,0.97,0.64,0.8,0.77
7,0.54,0.98,0.86,0.68,-0.59,-0.7,0.97,1.0,0.66,0.79,0.77
8,0.47,0.65,0.85,0.41,-0.43,-0.49,0.64,0.66,1.0,0.67,0.78
9,0.56,0.82,0.77,0.42,-0.78,-0.63,0.8,0.79,0.67,1.0,0.77


# Cross Validation

In [35]:
def get_sisso_errors(X, y, X_test, y_test):
    """Fit SISSO on the training split and score each returned solution on the test split.

    SISSO is fitted on (X, y) with ``max_iterations=3``. Every solution vector returned by
    ``fit`` is applied to ``X_test`` extended by a trailing column of ones.

    Returns:
        list: test RMSE of each solution, in the order ``fit`` returned them.
    """
    model = SISSO(X, y)
    solutions = model.fit(max_iterations=3)

    # The solution vectors carry one extra trailing coefficient, matched here by a ones column.
    intercept = np.ones((X_test.shape[0], 1))
    X_test_aug = np.append(X_test, intercept, axis=1)

    rmse_per_solution = []
    for solution in solutions:
        residual = np.matmul(X_test_aug, solution) - y_test
        rmse_per_solution.append(np.sqrt(np.mean(np.square(residual))))  # RMSE
    return rmse_per_solution

def _rmse(prediction, target):
    """Root-mean-square error between two equally shaped arrays."""
    return np.sqrt(np.mean(np.square(prediction - target)))


def _best_subset_fit(X, y, candidates, n_terms):
    """Least-squares fit on every ``n_terms``-subset of ``candidates``; return the best one.

    Returns ``(columns, coefficients)`` for the subset with the lowest training RMSE, or
    ``(None, None)`` if there is no subset of that size or every fit failed.
    """
    best_error = np.inf
    best_columns = None
    best_coefficients = None
    for combination in combinations(candidates, n_terms):
        columns = np.array(combination)
        submatrix = X[:, columns]
        try:
            coefficients, _res, _rank, _sv = np.linalg.lstsq(submatrix, y, rcond=None)
        except np.linalg.LinAlgError:
            continue  # solver failed on this subset; it cannot become the optimum
        train_error = _rmse(np.matmul(submatrix, coefficients), y)
        if train_error < best_error:  # strict '<' keeps the first subset on ties
            best_error = train_error
            best_columns = columns
            best_coefficients = coefficients
    return best_columns, best_coefficients


def get_gcg_errors(X, y, X_test, y_test, alpha=0.1, tol=1e-11, max_terms=3):
    """Test RMSE of the exact GCG solution and of least-squares refits on its support.

    Args:
        X, y: training features and target, passed to ``LGCG_finite``.
        X_test, y_test: held-out features and target used for scoring.
        alpha: forwarded to ``LGCG_finite`` (default 0.1).
        tol: forwarded to ``solve_exact`` (default 1e-11).
        max_terms: largest subset size refitted on the support (default 3).

    Returns:
        list of length ``1 + max_terms``: the test RMSE of the full GCG solution, followed
        by the test RMSE of the best (by training RMSE) least-squares fit using 1, 2, ...,
        ``max_terms`` support columns. An entry is ``nan`` when the support (plus constant)
        has fewer candidates than the requested subset size.
    """
    exp = LGCG_finite(K=X, target=y, alpha=alpha)
    result = exp.solve_exact(tol=tol)
    u_bar = result["u"]
    support = result["support"]
    constant_index = exp.K.shape[1] - 1  # last column of exp.K is treated as the constant

    # Append a scaled constant column to both matrices so `constant_index` is a valid column.
    # NOTE(review): the scale is the norm of the *test*-sized ones vector and is applied to
    # the training matrix as well - confirm this matches LGCG_finite's own constant column.
    ones_norm = np.linalg.norm(np.ones((X_test.shape[0], 1)))
    X_test = np.append(X_test, np.ones((X_test.shape[0], 1)) / ones_norm, axis=1)
    X = np.append(X, np.ones((X.shape[0], 1)) / ones_norm, axis=1)

    errors = [_rmse(np.matmul(X_test[:, support], u_bar), y_test)]

    if constant_index not in support:  # always offer the constant as a candidate
        support = np.append(support, constant_index)

    for n_terms in range(1, max_terms + 1):
        columns, coefficients = _best_subset_fit(X, y, support, n_terms)
        if columns is None:
            # Fewer candidates than n_terms: report "not available" instead of crashing.
            errors.append(np.nan)
            continue
        errors.append(_rmse(np.matmul(X_test[:, columns], coefficients), y_test))

    return errors

In [29]:
# Design matrix for cross-validation: K transposed, so rows can be indexed by split.
X = K.T
print(X.shape)

(75, 233799)


In [30]:
# Raise the root logger threshold to CRITICAL so solver logs do not flood the CV output.
logging.getLogger().setLevel(logging.CRITICAL) # Suppress logging

In [39]:
# 3-fold shuffled cross-validation (fixed seed): each split contributes one row holding
# the SISSO test errors followed by the GCG test errors.
columns = ["sisso_1", "sisso_2", "sisso_3", "gcg_all", "gcg_1", "gcg_2", "gcg_3"]
cv = KFold(n_splits=3, random_state=7, shuffle=True)
errors = []
for i, (train_index, test_index) in enumerate(cv.split(X)):
    print(f"Split {i},train size {len(train_index)}, test size {len(test_index)}")
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    # SISSO is evaluated first, then GCG; the two error lists are concatenated in that order.
    errors.append(
        get_sisso_errors(X_train, y_train, X_test, y_test)
        + get_gcg_errors(X_train, y_train, X_test, y_test)
    )
error_df = pd.DataFrame(errors, columns=columns)

Split 0,train size 50, test size 25
Split 1,train size 50, test size 25
Split 2,train size 50, test size 25


In [32]:
# Up to 4 combinations
# NOTE(review): the recorded output has 5 rows and sisso_4/gcg_4 columns, so it stems from
# an earlier run (5 splits, up to 4 terms), not from the 3-split, 3-term CV cell.
# Re-run or remove.
error_df

Unnamed: 0,sisso_1,siso_2,sisso_3,sisso_4,gcg_all,gcg_1,gcg_2,gcg_3,gcg_4
0,0.397652,0.329251,0.196044,0.219888,0.271997,0.340561,0.384616,0.326953,0.312405
1,0.28881,0.226036,0.203377,0.217295,0.263853,0.460637,0.204641,0.310384,0.286214
2,0.355225,0.482745,0.456003,0.462067,0.270788,0.847289,0.304853,0.254495,0.267928
3,0.299355,0.254563,0.208269,0.223966,0.273509,0.445146,0.326076,0.256542,0.316305
4,0.301245,0.242842,0.202979,0.200465,0.293968,0.355174,0.316207,0.264887,0.250785


In [33]:
# Up to 4 combinations
# NOTE(review): column means of the earlier 4-term run; stale relative to the current CV cell.
error_df.mean()

sisso_1    0.328457
siso_2     0.307087
sisso_3    0.253335
sisso_4    0.264736
gcg_all    0.274823
gcg_1      0.489762
gcg_2      0.307279
gcg_3      0.282652
gcg_4      0.286727
dtype: float64

In [37]:
# Up to 3 combinations
# NOTE(review): the recorded output has 10 rows, i.e. an earlier 10-split run rather than
# the current n_splits=3 configuration. Re-run or remove.
error_df

Unnamed: 0,sisso_1,siso_2,sisso_3,gcg_all,gcg_1,gcg_2,gcg_3
0,0.447166,0.326363,0.210053,0.29602,0.374789,0.410996,0.368792
1,0.281492,0.307067,0.214096,0.232736,0.157616,0.303559,0.250761
2,0.307585,0.198688,0.20538,0.244891,0.585415,0.207238,0.387743
3,0.255499,0.363984,0.223242,0.364316,0.488329,0.248658,0.463877
4,0.431701,0.412297,0.25857,0.237611,1.12161,0.382852,0.277265
5,0.202724,0.264904,0.126728,0.279661,0.549367,0.426031,0.202979
6,0.362688,0.281513,0.234432,0.329212,0.39124,0.253092,0.303016
7,0.257493,0.190936,0.199664,0.263272,0.361454,0.257421,0.205387
8,0.276093,0.225918,0.165432,0.314314,0.31173,0.38464,0.301817
9,0.331224,0.245872,0.264488,0.263544,0.389717,0.276477,0.223146


In [38]:
# Up to 3 combinations
# NOTE(review): column means of the earlier 10-split run; stale relative to the current CV cell.
error_df.mean()

sisso_1    0.315366
siso_2     0.281754
sisso_3    0.210209
gcg_all    0.282558
gcg_1      0.473127
gcg_2      0.315096
gcg_3      0.298478
dtype: float64

In [40]:
# Per-split test RMSE table of the current run (one row per KFold split).
error_df

Unnamed: 0,sisso_1,sisso_2,sisso_3,gcg_all,gcg_1,gcg_2,gcg_3
0,0.358891,0.29721,0.312224,0.254531,0.451146,0.455089,0.2854
1,0.345205,0.316713,0.389222,0.455302,0.472608,0.708366,0.45645
2,0.28623,0.29916,0.204644,0.268442,0.358407,0.315897,0.241552


In [41]:
# Mean test RMSE per method across the splits.
error_df.mean()

sisso_1    0.330109
sisso_2    0.304361
sisso_3    0.302030
gcg_all    0.326092
gcg_1      0.427387
gcg_2      0.493117
gcg_3      0.327800
dtype: float64