In [1]:
import numpy as np
import pandas as pd
from scipy.linalg import logm, expm, eigh


In [13]:
def is_spd(A, tol=1e-8):
    # Check symmetry
    if not np.allclose(A, A.T, atol=tol):
        return False
    # Check eigenvalues > 0
    eigvals = np.linalg.eigvalsh(A)
    return np.all(eigvals > tol)

def project_to_spd(A, tol=1e-8):
    # Make symmetric
    A = (A + A.T) / 2
    eigvals, eigvecs = eigh(A)
    eigvals_clipped = np.clip(eigvals, tol, None)  # set eigenvalues < tol to tol
    return eigvecs @ np.diag(eigvals_clipped) @ eigvecs.T

def make_psd(K, min_eig=1e-6):
    K = (K + K.T) / 2
    eigvals = np.linalg.eigvalsh(K)
    if np.min(eigvals) < min_eig:
        K += np.eye(K.shape[0]) * (min_eig - np.min(eigvals))
    return K

def weighted_log_euclidean_mean(kernels, weights):
    weights = np.array(weights)
    weights = weights / weights.sum()
    
    log_sum = np.zeros_like(kernels[0])
    for w, K in zip(weights, kernels):
        K = make_psd(K)
        log_K = logm(K)
        # Handle potential complex results more carefully
        if np.iscomplexobj(log_K) and np.allclose(log_K.imag, 0, atol=1e-10):
            log_K = log_K.real
        log_sum += w * log_K
    
    return expm(log_sum)

In [14]:
# Set seed for reproducibility
np.random.seed(42)

# Parameters
n_samples = 100  # total samples
n_pos = 30       # number of positive samples
n_neg = n_samples - n_pos
n_features_per_set = 5
feature_list = ['ppi_2019', 'bioconcept', 'uniport', 'esm2']

# Generate random features
data = {}
for feature in feature_list:
    for i in range(n_features_per_set):
        col_name = f'{feature}_f{i}'
        data[col_name] = np.random.rand(n_samples)

# Create DataFrame
df = pd.DataFrame(data)

# Generate labels: 1 for positive, 0 for negative
y = np.array([1] * n_pos + [0] * n_neg)

# Shuffle positive indices and split into train/test
pos_indices = np.where(y == 1)[0]
np.random.shuffle(pos_indices)
train_pos_indices = pos_indices[:20]
test_pos_indices = pos_indices[20:]

# Get train/test DataFrames for positive samples
train_pos_df = df.iloc[train_pos_indices]
test_pos_df = df.iloc[test_pos_indices]

In [4]:
df

Unnamed: 0,ppi_2019_f0,ppi_2019_f1,ppi_2019_f2,ppi_2019_f3,ppi_2019_f4,bioconcept_f0,bioconcept_f1,bioconcept_f2,bioconcept_f3,bioconcept_f4,uniport_f0,uniport_f1,uniport_f2,uniport_f3,uniport_f4,esm2_f0,esm2_f1,esm2_f2,esm2_f3,esm2_f4
0,0.374540,0.031429,0.642032,0.051682,0.103124,0.698162,0.168935,0.532589,0.707239,0.207886,0.185133,0.057843,0.758263,0.031586,0.167042,0.519082,0.191867,0.965822,0.956501,0.554227
1,0.950714,0.636410,0.084140,0.531355,0.902553,0.536096,0.278590,0.051824,0.152539,0.026532,0.541901,0.969103,0.024587,0.936212,0.167619,0.479182,0.323372,0.432498,0.737508,0.005230
2,0.731994,0.314356,0.161629,0.540635,0.505252,0.309528,0.177010,0.336604,0.576288,0.181435,0.872946,0.883786,0.022124,0.051971,0.036671,0.025642,0.226656,0.311816,0.353251,0.760991
3,0.598658,0.508571,0.898554,0.637430,0.826457,0.813795,0.088703,0.134415,0.606715,0.583042,0.732225,0.927752,0.323610,0.541296,0.736402,0.341248,0.354996,0.506142,0.296536,0.035311
4,0.156019,0.907566,0.606429,0.726091,0.320050,0.684731,0.120636,0.063375,0.424131,0.421425,0.806561,0.994908,0.488643,0.709061,0.663805,0.380196,0.069424,0.439512,0.349703,0.745734
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.493796,0.349210,0.522243,0.930757,0.353352,0.473962,0.610620,0.872124,0.204984,0.091582,0.176528,0.871537,0.200150,0.828883,0.668213,0.578387,0.068172,0.084792,0.733071,0.656955
96,0.522733,0.725956,0.769994,0.858413,0.583656,0.667558,0.288631,0.932118,0.293148,0.917314,0.220486,0.973489,0.167483,0.430888,0.619490,0.274161,0.025812,0.716323,0.615985,0.956615
97,0.427541,0.897110,0.215821,0.428994,0.077735,0.172320,0.581238,0.565133,0.896336,0.136819,0.186438,0.968878,0.104568,0.248714,0.463494,0.079419,0.135166,0.072084,0.188025,0.068958
98,0.025419,0.887086,0.622890,0.750871,0.974395,0.192289,0.154363,0.696651,0.013002,0.950237,0.779584,0.749652,0.636430,0.617145,0.379786,0.085658,0.963115,0.071257,0.355385,0.057055


In [5]:
neg_num = n_pos

In [7]:
from sklearn.metrics.pairwise import rbf_kernel
from sklearn.neighbors import NearestNeighbors
from sklearn import svm

In [15]:
if 'linear_fused' in feature_list:
    feature_list.remove('linear_fused')
if 'geo_fused' in feature_list:
    feature_list.remove('geo_fused')


# Work with DataFrames to maintain indices
neg_df = df[y == 0]

# Randomly select 'neg_num' samples from negative class
train_neg_df = neg_df.sample(n=neg_num, random_state=42)

# Get the all negative samples
test_neg_df = neg_df

# Combine positive and negative samples for training
train_df = pd.concat([train_pos_df, train_neg_df])
test_df = pd.concat([test_pos_df, test_neg_df])

X_train_mats = []
X_test_mats = []
for feature_name in feature_list:
    select_columns = [col for col in df.columns if col.startswith(feature_name)]
    X_train_mats.append(train_df[select_columns].values)
    X_test_mats.append(test_df[select_columns].values)

feature_weights = [1 / len(feature_list)] * len(feature_list)

y_train = np.array([1] * len(train_pos_df) + [0] * len(train_neg_df))
y_test = np.array([1] * len(test_pos_df) + [0] * len(test_neg_df))

kernels_all = []
kernels_train = []
kernels_test = []

# For each feature set
for X_tr, X_te in zip(X_train_mats, X_test_mats):
    X_all = np.concatenate([X_tr, X_te], axis=0)

    nbrs = NearestNeighbors(n_neighbors=2).fit(X_all)
    distances, _ = nbrs.kneighbors(X_all)
    avg_nn_dist = np.mean(distances[:, 1])  # skip self-distance
    gamma = 1 / (2 * avg_nn_dist ** 2)
    K_full = rbf_kernel(X_all, X_all, gamma=gamma)
    kernels_all.append(K_full)

    n_train = len(X_tr)
    kernels_train.append(K_full[:n_train, :n_train])
    kernels_test.append(K_full[n_train:, :n_train])

K_linear_all = sum(w * K_train_i for w, K_train_i in zip(feature_weights, kernels_all))
kernels_train.append(K_linear_all[:n_train, :n_train])
kernels_test.append(K_linear_all[n_train:, :n_train])
feature_list.append('linear_fused')

K_geo_all = weighted_log_euclidean_mean(kernels_all, feature_weights)
kernels_train.append(K_geo_all[:n_train, :n_train])
kernels_test.append(K_geo_all[n_train:, :n_train])
feature_list.append('geo_fused')

# Store original indices for training set
train_indices = train_df.index.values
# Store original indices for test set
test_indices = test_df.index.values

for feature_index, feature_name in enumerate(feature_list):

    best_svm = svm.SVC(kernel='precomputed')
    best_svm.fit(kernels_train[feature_index], y_train)
    y_scores = best_svm.decision_function(kernels_test[feature_index])



In [16]:
y_scores

array([-0.3600256 , -0.40212777, -0.37958595, -0.35768611, -0.31949512,
       -0.42627313, -0.41499389, -0.42664193, -0.36613789, -0.36357097,
       -1.00005111, -0.53678294, -0.43380382, -0.45338878, -1.00048439,
       -0.99981887, -0.32776124, -0.99994971, -0.38756416, -0.99983823,
       -0.99973707, -0.35530394, -0.99989028, -0.32431224, -0.32283358,
       -0.27697504, -0.99965087, -0.33448737, -0.99967178, -0.99993025,
       -0.3526812 , -0.29879896, -1.00057826, -0.48550251, -0.44699192,
       -0.41531682, -0.5232809 , -0.37559849, -1.00031418, -0.33251132,
       -0.9998895 , -0.99999807, -0.15749358, -1.00024024, -1.00029009,
       -1.00030917, -0.44275377, -0.39106283, -0.28904986, -0.40078049,
       -0.99986656, -0.39948145, -0.99989397, -0.30905025, -0.99996719,
       -0.99986251, -0.99987454, -0.99998889, -0.67199598, -0.99997117,
       -0.4018168 , -0.35855996, -0.45191801, -1.00061512, -0.99969031,
       -0.38311595, -0.34286891, -0.99975839, -0.99987518, -0.47