In [1]:
import time
from tqdm import trange
from joblib import Parallel, delayed

import numpy as np
import pandas as pd
from scipy import stats
from sklearn.model_selection import RepeatedKFold
from sklearn_rvm import EMRVR

In [2]:
data = pd.read_csv("./wholeSFC.csv")
# first column: y, other columns: feature
data.columns = ["y"] + [f"feature{i}" for i in range(len(data.columns)-1)]

In [3]:
# get all data
X = data.iloc[:, 1:].values
y = data.iloc[:, 0].values

In [4]:
permu_num = 1000
random_state = 88
coefs, pvalues = np.zeros(permu_num), np.zeros(permu_num)
# permutation test
def regression(y_shuffle, train_index, test_index):
    # select feature based on p value
    feature_index = np.argwhere(np.array([stats.pearsonr(X[train_index, i], y_shuffle[train_index])[1] for i in range(X.shape[1])]) < 0.01).reshape(-1, )
    # get train&&test data
    X_train, X_test = X[np.ix_(train_index, feature_index)], X[np.ix_(test_index, feature_index)]
    y_train, y_test = y_shuffle[train_index], y_shuffle[test_index]
    # fit data and predict
    # https://github.com/JamesRitchie/scikit-rvm/issues/9
    rvr = EMRVR(kernel="linear", bias_used=False)
    rvr.fit(X_train, y_train)
    y_p = rvr.predict(X_test)
    # add predict value and ground truth
    return dict(y_t=y_test.tolist(), y_p=y_p.tolist())

with Parallel(n_jobs=32) as parallel:
    for i in trange(permu_num):
        # shuffle y
        y_shuffle = np.random.RandomState(seed=i).permutation(y)
        # define kfold
        rkf = RepeatedKFold(n_splits=10, n_repeats=10, random_state=random_state)
        # save predict and actual value
        y_actual, y_predict = list(), list()
        result = parallel(delayed(regression)(y_shuffle, train_index, test_index)
            for train_index, test_index in rkf.split(X))
        # save all predict value and actual value
        [(y_actual.extend(r["y_t"]), y_predict.extend(r["y_p"])) for r in result]
        # get correlation coefficient
        coefs[i], pvalues[i] = stats.spearmanr(y_actual, y_predict)
        print(f"permutation {i}: coef is {coefs[i]:.5f}, p value is {pvalues[i]:.5f}")
    # get p_value of permutation
    permutation_p = (coefs > 0.1928).mean()
    print(permutation_p)

 50%|█████     | 1/2 [00:09<00:09,  9.50s/it]

permutation 0: coef is 0.02991, p value is 0.61199


100%|██████████| 2/2 [00:18<00:00,  9.11s/it]

permutation 1: coef is 0.07385, p value is 0.20989
0.0





In [5]:
# save coefficient and pvalue
df = pd.DataFrame(dict(coef=coefs, pvalue=pvalues))
df.to_csv(f"./spearman_permutation-{permu_num}_datetime-{time.strftime('%Y%m%d-%H-%M-%S', time.localtime())}.csv", index=False)