In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
import pickle
import logging
from itertools import combinations
import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
lgcg_path = os.path.abspath(os.path.join('../../../lazified-generalized-conditional-gradient/lgcg'))
if lgcg_path not in sys.path:
    sys.path.append(lgcg_path)
from sisso import SISSO
from lgcg import LGCG_finite

# Load Data

In [2]:
# Load the pre-computed feature evaluations. The file is opened in a context
# manager so the handle is closed as soon as unpickling finishes.
# NOTE: pickle can execute arbitrary code while loading; only open trusted files.
with open("evals_thermal_2M.pkl", "rb") as evals_file:
    K = pickle.load(evals_file)
K.shape

(1991889, 75)

In [3]:
# Regression target: the "log kappa_L" column of the thermal conductivity table,
# taken as a plain NumPy array.
y = pd.read_csv("thermal_conductivity_data.csv")["log kappa_L"].values
y.shape

(75,)

# SISSO

In [4]:
# Build the SISSO experiment on the transposed feature matrix, so that its
# first axis has the same length as the target y.
exp_sisso = SISSO(K=K.T, target=y)

In [5]:
# Run the SISSO fit; per-iteration progress is written to the root logger.
exp_sisso.fit()

INFO:root:Iteration: 1
INFO:root:Error: 0.3037392401377369
INFO:root:Number of combinations: 25
INFO:root:Optimal combination: (502413,)
INFO:root:Optimal coefficients: [11.06288295  0.11840932]
INFO:root:------------------
INFO:root:Iteration: 2
INFO:root:Error: 0.23759122318118403
INFO:root:Number of combinations: 1225
INFO:root:Optimal combination: (343109, 502413)
INFO:root:Optimal coefficients: [-2.28259027  9.27143569  0.44046387]
INFO:root:------------------
INFO:root:Iteration: 3
INFO:root:Error: 0.17777442750366654
INFO:root:Number of combinations: 67525
INFO:root:Optimal combination: (343109, 502413, 1158923)
INFO:root:Optimal coefficients: [-2.45898648  9.9177373  -1.60420282  0.47186572]
INFO:root:------------------


In [6]:
# Drop the reference to the SISSO experiment before the LGCG runs
# (presumably to release its memory — confirm).
del exp_sisso

# LGCG

In [4]:
# Set up the LGCG problem on the same data.
# NOTE(review): alpha=0.1 is presumably the sparsity/regularization weight — confirm against LGCG_finite.
exp = LGCG_finite(K=K.T, target=y, alpha=0.1)

In [5]:
# Solve with tolerance 1e-11; the returned mapping is read for its "u" and
# "support" entries in the next cell.
result = exp.solve(tol=1e-11)

DEBUG:root:SSN in 2 dimensions converged in 2 iterations to tolerance 1.000E-01
INFO:root:1: Phi 3.060E-01, epsilon 2.550E+00, support [ 393284 1991889], Psi 1.000E-01
DEBUG:root:SSN in 3 dimensions converged in 1 iterations to tolerance 1.000E-01
INFO:root:2: Phi 1.497E-01, epsilon 3.195E+00, support [ 393284 1486605 1991889], Psi 1.000E-01
DEBUG:root:SSN in 4 dimensions converged in 1 iterations to tolerance 1.000E-01
INFO:root:3: Phi 1.718E-01, epsilon 3.250E+00, support [ 330868  393284 1486605 1991889], Psi 1.000E-01
DEBUG:root:SSN in 5 dimensions converged in 1 iterations to tolerance 1.000E-01
INFO:root:4: Phi 7.787E-02, epsilon 3.107E+00, support [ 330868 1191023 1486605 1991889], Psi 1.000E-01
DEBUG:root:SSN in 5 dimensions converged in 0 iterations to tolerance 1.000E-01
INFO:root:5: Phi 6.804E-02, epsilon 2.905E+00, support [ 330868  492537 1191023 1486605 1991889], Psi 1.000E-01
DEBUG:root:SSN in 5 dimensions converged in 1 iterations to tolerance 5.000E-02
INFO:root:6: Phi

In [6]:
# Pull the sparse solution out of the solver result and show both parts.
support = result["support"]
u = result["u"]
print(u)
print(support)
# Scatter the sparse coefficients into a dense vector with one entry per
# column of exp.K; every entry outside the support stays zero.
u_bar = np.zeros(exp.K.shape[1])
u_bar[support] = u

[ 4.89623489e-04 -3.87462191e-04 -9.03777735e-01  2.76966278e-01
 -1.34086555e-01  6.50345069e-02  3.50170060e-04  8.12021573e-03
 -9.97713886e-05 -6.82283170e-05  7.85053862e-04  1.47932508e-01
 -7.08149341e-04  5.58445930e-04  7.88147064e-05  7.23932216e-03
  2.49456545e-02  2.23869843e-01  1.14736704e-01]
[  69716   69721  340637  386839  492542  501065  546103  549795  831208
  831259 1118941 1119764 1171818 1380117 1380121 1512372 1551504 1709307
 1709308]


In [10]:
# Renormalize u_bar (disabled here; the rescaling below is intentionally commented out)
# u_bar = u_bar/exp.target_norm
# for iter, nor in enumerate(exp.K_norms):
#     u_bar[iter] *= nor

In [17]:
# RMSE with respect to the target.
# K.T is extended by a column of ones before it is multiplied with u_bar, so
# the product has one prediction per entry of y.
# NOTE(review): u_bar is used as returned by the solver (the renormalization
# cell above is commented out) — confirm this is the intended scaling.
np.sqrt(np.mean(np.square(np.matmul(np.append(K.T, np.ones((K.shape[1], 1)), axis=1), u_bar) - y)))

0.20710212704026942

### Correlations

In [10]:
# Keep only the columns of the LGCG feature matrix that are in the support.
K_support = exp.K[:, support]
K_support.shape

(75, 19)

In [11]:
# Wrap the support columns in a DataFrame so pandas can compute correlations.
df =pd.DataFrame(K_support)

In [12]:
# Pairwise correlation between the support columns, displayed as a table with
# two decimals; axis=None applies one colour scale across the whole table.
corr = df.corr()
corr.style.background_gradient(cmap='BrBG_r', axis=None).format(precision=2)

DEBUG:matplotlib:matplotlib data path: /vol/cs-hu/hnatiuar@hu-berlin.de/miniconda3/envs/sissopp_env/lib/python3.9/site-packages/matplotlib/mpl-data
DEBUG:matplotlib:CONFIGDIR=/vol/cs-hu/hnatiuar@hu-berlin.de/.config/matplotlib
DEBUG:matplotlib:interactive is False
DEBUG:matplotlib:platform is linux
DEBUG:matplotlib:CACHEDIR=/vol/cs-hu/hnatiuar@hu-berlin.de/.cache/matplotlib
DEBUG:matplotlib.font_manager:Using fontManager instance from /vol/cs-hu/hnatiuar@hu-berlin.de/.cache/matplotlib/fontlist-v330.json


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18
0,1.0,-0.71,-0.79,0.68,-0.66,0.62,0.76,0.59,-0.82,-0.82,0.53,0.65,-0.58,0.74,0.76,0.73,0.7,0.59,0.47
1,-0.71,1.0,0.77,-0.67,0.68,-0.64,-0.81,-0.7,0.91,0.88,-0.6,-0.76,0.66,-0.82,-0.8,-0.68,-0.65,-0.53,-0.72
2,-0.79,0.77,1.0,-0.79,0.8,-0.63,-0.85,-0.78,0.88,0.86,-0.45,-0.71,0.63,-0.71,-0.81,-0.65,-0.71,-0.66,-0.67
3,0.68,-0.67,-0.79,1.0,-0.95,0.54,0.69,0.71,-0.72,-0.69,0.58,0.7,-0.49,0.72,0.77,0.56,0.63,0.7,0.7
4,-0.66,0.68,0.8,-0.95,1.0,-0.56,-0.7,-0.75,0.74,0.72,-0.57,-0.74,0.47,-0.71,-0.76,-0.56,-0.63,-0.74,-0.74
5,0.62,-0.64,-0.63,0.54,-0.56,1.0,0.82,0.75,-0.72,-0.67,0.69,0.8,-0.33,0.78,0.65,0.73,0.69,0.48,0.51
6,0.76,-0.81,-0.85,0.69,-0.7,0.82,1.0,0.83,-0.91,-0.89,0.57,0.85,-0.54,0.89,0.84,0.73,0.76,0.69,0.71
7,0.59,-0.7,-0.78,0.71,-0.75,0.75,0.83,1.0,-0.79,-0.75,0.56,0.84,-0.42,0.74,0.71,0.62,0.69,0.67,0.74
8,-0.82,0.91,0.88,-0.72,0.74,-0.72,-0.91,-0.79,1.0,0.98,-0.56,-0.83,0.61,-0.84,-0.82,-0.77,-0.8,-0.67,-0.7
9,-0.82,0.88,0.86,-0.69,0.72,-0.67,-0.89,-0.75,0.98,1.0,-0.51,-0.79,0.61,-0.81,-0.81,-0.76,-0.76,-0.66,-0.67


### SISSO Step

In [18]:
# SO
# Best-subset least squares restricted to the GCG support: for every model
# size n, fit each n-element subset of `support` and keep the subset with the
# lowest training RMSE.

# Whether the constant column (the last column of exp.K) already belongs to
# the support cannot change inside the loops, so it is evaluated once.
constant_in_support = (exp.K.shape[1] - 1) in support
intercept_column = np.ones((exp.K.shape[0], 1))

error = 10
for n in range(1, len(support) + 1):
    # Start from +inf rather than 10 * (previous error): after an exact fit the
    # old threshold became 0 and no combination could be selected any more.
    min_error = np.inf
    combinatorial_combinations = combinations(support, n)
    optimal_combination = None
    optimal_coefficients = None
    combinatorial_counter = 0
    for combination in combinatorial_combinations:
        combinatorial_counter += 1
        submatrix = exp.K[:, np.array(combination)]
        if not constant_in_support:
            # Add an explicit intercept; its coefficient is the last entry of
            # the least-squares solution.
            submatrix = np.append(submatrix, intercept_column, axis=1)
        try:
            least_squares, res, rank, s = np.linalg.lstsq(
                submatrix, y, rcond=None
            )
        except np.linalg.LinAlgError:
            # The fit did not converge: skip this subset so that coefficients
            # left over from a previous subset can never be recorded for it.
            continue
        local_error = np.sqrt(
            np.mean(np.square(np.matmul(submatrix, least_squares) - y))
        )  # RMSE
        if local_error < min_error:
            min_error = local_error
            optimal_combination = combination
            optimal_coefficients = least_squares
    error = min_error

    logging.info(f"Iteration: {n}")
    logging.info(f"Error: {error}")
    logging.info(f"Number of combinations: {combinatorial_counter}")
    logging.info(f"Optimal combination: {optimal_combination}")
    logging.info(f"Optimal coefficients: {optimal_coefficients}")
    logging.info("------------------")

INFO:root:Iteration: 1
INFO:root:Error: 0.31120865504101186
INFO:root:Number of combinations: 19
INFO:root:Optimal combination: (831208,)
INFO:root:Optimal coefficients: [-9.61334526  0.31968491]
INFO:root:------------------
INFO:root:Iteration: 2
INFO:root:Error: 0.2259618927147472
INFO:root:Number of combinations: 171
INFO:root:Optimal combination: (386839, 831208)
INFO:root:Optimal coefficients: [ 5.32160806 -6.44398923  0.07882843]
INFO:root:------------------


INFO:root:Iteration: 3
INFO:root:Error: 0.17748825343095706
INFO:root:Number of combinations: 969
INFO:root:Optimal combination: (340637, 1118941, 1709307)
INFO:root:Optimal coefficients: [-4.22342437  4.13017976  4.86835214 -0.02190325]
INFO:root:------------------
INFO:root:Iteration: 4
INFO:root:Error: 0.15329345703971878
INFO:root:Number of combinations: 3876
INFO:root:Optimal combination: (69721, 340637, 1118941, 1709307)
INFO:root:Optimal coefficients: [-1.94695499 -3.02671729  3.40333775  4.666747    0.00833095]
INFO:root:------------------
INFO:root:Iteration: 5
INFO:root:Error: 0.14524057355800452
INFO:root:Number of combinations: 11628
INFO:root:Optimal combination: (69721, 340637, 1118941, 1512372, 1709307)
INFO:root:Optimal coefficients: [-1.64514645 -2.90472181  3.22254763  1.2168242   4.31870819 -0.02511948]
INFO:root:------------------
INFO:root:Iteration: 6
INFO:root:Error: 0.1320662719674624
INFO:root:Number of combinations: 27132
INFO:root:Optimal combination: (69721,

# Exact GCG

In [4]:
# Second LGCG_finite instance with the same data and alpha, used for the
# exact solve below.
exp_exact = LGCG_finite(K=K.T, target=y, alpha=0.1)

In [5]:
# Exact variant of the solver at tolerance 1e-11; the result is read for its
# "u" and "support" entries in the next cell.
result_exact = exp_exact.solve_exact(tol=1e-11)

DEBUG:root:SSN in 2 dimensions converged in 9 iterations to tolerance 1.000E-11
INFO:root:1: Phi 3.043E-01, support [ 393284 1991889]
DEBUG:root:SSN in 3 dimensions converged in 9 iterations to tolerance 1.000E-11
INFO:root:2: Phi 1.784E-01, support [ 393284 1486605 1991889]
DEBUG:root:SSN in 4 dimensions converged in 10 iterations to tolerance 1.000E-11
INFO:root:3: Phi 1.752E-01, support [ 330868 1486605 1991889]
DEBUG:root:SSN in 4 dimensions converged in 8 iterations to tolerance 1.000E-11
INFO:root:4: Phi 7.907E-02, support [ 330868 1191023 1486605 1991889]
DEBUG:root:SSN in 5 dimensions converged in 8 iterations to tolerance 1.000E-11
INFO:root:5: Phi 8.330E-02, support [ 330868  491904 1191023 1486605 1991889]
DEBUG:root:SSN in 6 dimensions converged in 8 iterations to tolerance 1.000E-11
INFO:root:6: Phi 7.560E-02, support [ 330868  491904  501065 1191023 1486605 1991889]
DEBUG:root:SSN in 7 dimensions converged in 8 iterations to tolerance 1.000E-11
INFO:root:7: Phi 4.959E-02,

In [6]:
# Pull the sparse solution out of the exact-solver result and show both parts.
support = result_exact["support"]
u = result_exact["u"]
print(u)
print(support)
# Scatter the sparse coefficients into a dense vector with one entry per
# column of exp_exact.K; every entry outside the support stays zero.
u_bar = np.zeros(exp_exact.K.shape[1])
u_bar[support] = u

[ 4.89623490e-04 -3.87462181e-04 -9.03777731e-01  2.76966279e-01
 -1.34086555e-01  6.50345070e-02  3.50170052e-04  8.12021569e-03
 -9.97713898e-05 -6.82283160e-05  7.85053862e-04  1.47932525e-01
 -7.08149344e-04  5.58445930e-04  7.88147065e-05  7.23932217e-03
  2.49456544e-02  2.23869843e-01  1.14736705e-01]
[  69716   69721  340637  386839  492542  501065  546103  549795  831208
  831259 1118941 1119764 1171818 1380117 1380121 1512372 1551504 1709307
 1709308]


In [10]:
# Renormalize u_bar
# Divide by the target norm, then multiply entry i by K_norms[i].
# The loop variables are named explicitly so the builtin `iter` is not
# overwritten in the notebook's global namespace.
u_bar_r = u_bar/exp_exact.target_norm
for column_index, column_norm in enumerate(exp_exact.K_norms):
    u_bar_r[column_index] *= column_norm

In [11]:
# Absolute value of exp_exact.p evaluated at the renormalized solution.
# NOTE(review): p is presumably the dual variable whose magnitude equals alpha
# on the support (the printout below shows values of about 0.1) — confirm
# against LGCG_finite.
p_bar = np.abs(exp_exact.p(u_bar_r))

In [12]:
# Print |p| at every support index, one value per line.
for x in support:
    print(p_bar[x])

0.10000000000000002
0.1
0.1
0.09999999999999999
0.09999999999999999
0.1
0.09999999999999999
0.1
0.1
0.1
0.09999999999999998
0.09999999999999998
0.1
0.09999999999999998
0.1
0.09999999999999998
0.09999999999999998
0.1
0.1


In [13]:
# Sigma in our theoretical considerations of the finite setting:
# half the gap between 0.1 (the alpha passed to LGCG_finite) and the largest
# |p| that lies below the 0.0999999 cutoff, i.e. the cutoff excludes the
# entries that equal 0.1 up to rounding.
0.5*(0.1-np.max(p_bar[p_bar<0.0999999]))

6.896156955940591e-06

In [14]:
# Error wrt target (RMSE).
# K.T is extended by a column of ones before it is multiplied with u_bar.
# NOTE(review): this uses u_bar, not the renormalized u_bar_r computed above —
# confirm which scaling is intended here.
np.sqrt(np.mean(np.square(np.matmul(np.append(K.T, np.ones((K.shape[1], 1)), axis=1), u_bar) - y)))

0.20710212706474032

### Correlations

In [12]:
# Keep only the columns of the exact-GCG feature matrix that are in the support.
K_support = exp_exact.K[:, support]
K_support.shape

(3702, 18)

In [13]:
# Wrap the support columns in a DataFrame so pandas can compute correlations.
df =pd.DataFrame(K_support)

In [14]:
# Pairwise correlation between the support columns, displayed as a table with
# two decimals; axis=None applies one colour scale across the whole table.
corr = df.corr()
corr.style.background_gradient(cmap='BrBG_r', axis=None).format(precision=2)

DEBUG:matplotlib:matplotlib data path: /vol/cs-hu/hnatiuar@hu-berlin.de/miniconda3/envs/sissopp_env/lib/python3.9/site-packages/matplotlib/mpl-data
DEBUG:matplotlib:CONFIGDIR=/vol/cs-hu/hnatiuar@hu-berlin.de/.config/matplotlib
DEBUG:matplotlib:interactive is False
DEBUG:matplotlib:platform is linux
DEBUG:matplotlib:CACHEDIR=/vol/cs-hu/hnatiuar@hu-berlin.de/.cache/matplotlib
DEBUG:matplotlib.font_manager:Using fontManager instance from /vol/cs-hu/hnatiuar@hu-berlin.de/.cache/matplotlib/fontlist-v330.json


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
0,1.0,0.29,-0.27,0.87,0.32,-0.38,-0.31,-0.53,-0.26,0.06,-0.42,-0.01,-0.12,-0.03,-0.01,-0.18,-0.12,0.05
1,0.29,1.0,-0.1,0.41,0.89,-0.23,-0.25,-0.08,-0.33,-0.16,-0.43,-0.12,0.02,-0.06,-0.03,-0.04,-0.14,0.02
2,-0.27,-0.1,1.0,-0.27,-0.11,0.2,0.11,0.1,0.16,-0.01,0.07,-0.02,0.09,0.01,0.0,0.11,0.06,0.03
3,0.87,0.41,-0.27,1.0,0.47,-0.53,-0.53,-0.36,-0.41,0.11,-0.59,0.06,-0.0,-0.0,-0.04,-0.16,-0.27,0.13
4,0.32,0.89,-0.11,0.47,1.0,-0.33,-0.35,-0.1,-0.25,-0.2,-0.33,-0.2,0.0,0.03,0.01,-0.13,-0.16,0.01
5,-0.38,-0.23,0.2,-0.53,-0.33,1.0,0.37,0.06,0.39,0.03,0.37,0.05,0.04,-0.03,-0.03,0.01,0.11,0.1
6,-0.31,-0.25,0.11,-0.53,-0.35,0.37,1.0,0.52,0.76,-0.1,0.55,-0.1,-0.14,-0.05,0.07,-0.02,0.34,0.07
7,-0.53,-0.08,0.1,-0.36,-0.1,0.06,0.52,1.0,0.39,-0.04,0.22,0.02,0.19,0.02,0.01,0.12,0.09,0.21
8,-0.26,-0.33,0.16,-0.41,-0.25,0.39,0.76,0.39,1.0,0.1,0.67,-0.05,0.02,-0.02,0.12,0.09,0.25,0.19
9,0.06,-0.16,-0.01,0.11,-0.2,0.03,-0.1,-0.04,0.1,1.0,-0.06,0.87,0.01,-0.08,-0.07,0.35,-0.1,0.05


# Cross Validation

In [4]:
def get_sisso_errors(X, y, X_test, y_test):
    """Fit SISSO on the training split and score every returned solution.

    Args:
        X: Training feature matrix, one row per sample.
        y: Training targets.
        X_test: Held-out feature matrix with the same columns as ``X``.
        y_test: Held-out targets.

    Returns:
        A list with the test RMSE of each coefficient vector returned by
        ``SISSO.fit()``. Each vector is applied to ``X_test`` extended by a
        trailing column of ones.
    """
    model = SISSO(X, y)
    solutions = model.fit()

    intercept_column = np.ones((X_test.shape[0], 1))
    design_test = np.append(X_test, intercept_column, axis=1)

    rmse_per_solution = []
    for coefficients in solutions:
        residual = np.matmul(design_test, coefficients) - y_test
        rmse_per_solution.append(np.sqrt(np.mean(np.square(residual))))  # RMSE

    del model
    return rmse_per_solution

def get_gcg_errors(X, y, X_test, y_test, alpha=0.1, tol=1e-11, max_terms=4):
    """Score the exact GCG solution and its best small sub-models on a test split.

    The exact solver is run on the training split. Its solution is scored on
    the test split first. Then, for every model size ``n`` in
    ``1..max_terms``, all ``n``-element subsets of the support (with the
    constant column added when it is missing) are fitted by least squares on
    the training split, and the subset with the lowest training RMSE is scored
    on the test split.

    Args:
        X: Training feature matrix, one row per sample.
        y: Training targets.
        X_test: Held-out feature matrix with the same columns as ``X``.
        y_test: Held-out targets.
        alpha: Passed to ``LGCG_finite`` (default keeps the previous value).
        tol: Tolerance passed to ``solve_exact`` (default keeps the previous value).
        max_terms: Largest sub-model size to evaluate (default keeps the previous 4).

    Returns:
        A list of ``1 + max_terms`` test RMSEs: the full GCG solution followed
        by the best sub-model of each size. A size for which no subset could
        be fitted (for example when the support has fewer than ``n`` entries)
        is reported as ``nan`` so the list length stays fixed.
    """
    exp = LGCG_finite(K=X, target=y, alpha=alpha)
    result = exp.solve_exact(tol=tol)
    u_bar = result["u"]
    support = result["support"]

    # Append a scaled constant column to both splits so the index of the
    # constant (last column of exp.K) is valid for them as well.
    # NOTE(review): the scale is the norm of the *test* ones vector for both
    # splits — confirm this matches the normalization used inside LGCG_finite.
    ones_norm = np.linalg.norm(np.ones((X_test.shape[0], 1)))
    X_test = np.append(X_test, np.ones((X_test.shape[0], 1)) / ones_norm, axis=1)
    X = np.append(X, np.ones((X.shape[0], 1)) / ones_norm, axis=1)

    def rmse(design, coefficients, target):
        # Root mean squared error of the linear model design @ coefficients.
        return np.sqrt(np.mean(np.square(np.matmul(design, coefficients) - target)))

    # The full solution is scored before the support is extended below.
    errors = [rmse(X_test[:, support], u_bar, y_test)]

    constant_index = exp.K.shape[1] - 1
    if constant_index not in support:  # Constant is not in the support
        support = np.append(support, constant_index)

    for n in range(1, max_terms + 1):
        # Start from +inf rather than 10 * (previous error): after an exact
        # fit the old threshold became 0 and no subset could be selected.
        min_error = np.inf
        optimal_combination = None
        optimal_coefficients = None
        for combination in combinations(support, n):
            column_indices = np.array(combination)
            submatrix = X[:, column_indices]
            try:
                least_squares = np.linalg.lstsq(submatrix, y, rcond=None)[0]
            except np.linalg.LinAlgError:
                # The fit did not converge; this subset cannot be a candidate.
                continue
            local_error = rmse(submatrix, least_squares, y)
            if local_error < min_error:
                min_error = local_error
                optimal_combination = column_indices
                optimal_coefficients = least_squares

        if optimal_combination is None:
            # No subset of this size exists or none could be fitted.
            errors.append(np.nan)
        else:
            errors.append(rmse(X_test[:, optimal_combination], optimal_coefficients, y_test))

    return errors

In [5]:
# Cross-validation works on the transposed matrix, whose first axis is the one
# the folds are split along.
X = K.T
print(X.shape)

(75, 1991889)


In [6]:
logging.getLogger().setLevel(logging.CRITICAL) # Suppress logging

In [7]:
# 5-fold cross-validation: each fold contributes one row holding the SISSO
# errors followed by the GCG errors, labelled by `columns`.
# NOTE(review): "siso_2" looks like a typo for "sisso_2"; the label is kept
# unchanged because it is the column name shown in the results below.
columns = ["sisso_1", "siso_2", "sisso_3", "sisso_4", "gcg_all", "gcg_1", "gcg_2", "gcg_3", "gcg_4"]
cv = KFold(n_splits=5, random_state=7, shuffle=True)
errors = []
for i, (train_index, test_index) in enumerate(cv.split(X)):
    print(f"Split {i},train size {len(train_index)}, test size {len(test_index)}")
    X_train = X[train_index]
    y_train = y[train_index]
    X_test = X[test_index]
    y_test = y[test_index]
    # SISSO is evaluated first, then GCG; the two error lists are concatenated.
    combined_errors = (
        get_sisso_errors(X_train, y_train, X_test, y_test)
        + get_gcg_errors(X_train, y_train, X_test, y_test)
    )
    print(combined_errors)
    errors.append(combined_errors)
error_df = pd.DataFrame(errors, columns=columns)

Split 0,train size 60, test size 15


[0.251071446726552, 0.3619772041827458, 0.3347893342392812, 0.27940929378714613, 0.26916365097460887]
[0.5490251761483282, 0.39356915784422924, 0.2972473066690002, 0.317870660667793, 0.251071446726552, 0.3619772041827458, 0.3347893342392812, 0.27940929378714613, 0.26916365097460887]
Split 1,train size 60, test size 15


In [8]:
# Per-fold test RMSE, one row per split and one column per model.
error_df

Unnamed: 0,sisso_1,siso_2,sisso_3,sisso_4,gcg_all,gcg_1,gcg_2,gcg_3,gcg_4
0,0.512786,0.417551,0.28703,0.317502,0.251071,0.361977,0.334789,0.279409,0.269164
1,0.342832,0.40089,0.432458,0.267557,0.264991,0.406026,0.29678,0.375567,0.259899
2,0.636159,0.681065,22.292779,0.622592,0.320384,0.580963,0.916014,0.197845,0.261443
3,0.357317,0.283029,0.30619,0.297573,0.24019,0.445455,0.284558,0.191103,0.178037
4,0.361304,2.398399,2.603525,2.693551,0.268457,0.369019,0.259546,0.225441,0.242335


In [9]:
# Mean test RMSE of each model across the folds.
error_df.mean()

sisso_1    0.442080
siso_2     0.836187
sisso_3    5.184396
sisso_4    0.839755
gcg_all    0.269019
gcg_1      0.432688
gcg_2      0.418337
gcg_3      0.253873
gcg_4      0.242176
dtype: float64