In [6]:
from sklearn.model_selection import cross_val_score, LeaveOneOut
from sklearn.metrics import mean_squared_error 
from sklearn.preprocessing import StandardScaler as zscore # zscore
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Lasso #LRlasso
from joblib import dump, load #to save models in files
from sklearn.model_selection import KFold
import pandas as pd
import numpy as np
import math as m
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
import re
import json
import os

In [7]:
from sklearn.model_selection import GridSearchCV
def gridcv(X, y, model, param_grid, naimpute=False, prepy=True, scorer = 'neg_mean_squared_error', cv_meth = LeaveOneOut()):
    """
    Tune a z-score + regressor pipeline with GridSearchCV (default: LOOCV).

    The regressor is wrapped in a ``Pipeline([('scaler', StandardScaler), ('regressor', model)])``,
    every combination in ``param_grid`` is cross-validated, and the best pipeline
    (refit on all of ``X``/``y`` by GridSearchCV) is used to report summary metrics.

    Parameters:
    ----------
    X : pandas DataFrame or numpy array
        The feature matrix. With a DataFrame the selected features are reported
        by column name, otherwise by column index.

    y : pandas Series or numpy array
        The target variable. It is never modified; clipping (see ``prepy``)
        is applied to a copy.

    model : scikit-learn estimator
        Regressor instance to tune, e.g. ``Lasso()`` or ``Lasso(max_iter=4000)``.
        Parameters listed in ``param_grid`` are overridden during the search.
        The fitted regressor must expose ``coef_``.

    param_grid : dict
        Dictionary containing the hyperparameters to be tuned and their possible values.
        The keys should be prefixed with 'regressor__' to work with the pipeline.

    naimpute : bool, optional (default=False)
        Toggle imputation for missing values.
        Currently not implemented; will print a message and return 0 if set to True
        (note: 0, not the usual 3-tuple).

    prepy : bool, optional (default=True)
        Toggle preprocessing target variable 'y' by setting any negative values to zero.

    scorer : str, callable, or None, optional (default='neg_mean_squared_error')
        A string or a scorer callable object / function with signature scorer(estimator, X, y).
        For valid scoring strings, see the scikit-learn documentation.

    cv_meth : cross-validation generator, optional (default=LeaveOneOut())
        A cross-validation splitting strategy.
        Possible inputs for cv are integers to specify the number of folds in a (Stratified)KFold,
        CV splitter, cross-validation generator iterators, or an iterable.

    Returns:
    -------
    overall_metric : dict
        JSON-friendly summary with the keys 'CV', 'scoring_metric',
        'correlation_true_pred' (2x2 correlation matrix of y vs. the in-sample
        prediction of the best pipeline), 'fold_scores' (per-fold test score of
        the best parameter set), 'non_zero_coefficients' and 'non_zero_features'.

    out_model : GridSearchCV object
        Fitted GridSearchCV object.

    best_params : dict
        Dictionary containing the best hyperparameters found by GridSearchCV.

    Call:
    ------
    from sklearn.model_selection import KFold

    # set up KFold cross-validator
    kfold_cv = KFold(n_splits=3, shuffle=True, random_state=42)

    param_grid = {
        'regressor__alpha': np.array(np.arange(0.0125, 0.0425, 0.0025)),
        'regressor__fit_intercept': [True, False]
    }
    print(param_grid)

    # Call the gridcv function with KFold as the cross-validation method
    lasso_fullkfold_scores, lasso_fullkfold_model, best_param = gridcv(
        X,
        y,
        Lasso(max_iter=4000),
        param_grid,
        scorer='r2',
        cv_meth=kfold_cv
    )
    dump(lasso_fullkfold_model, './models/lasso_fullkfold_model.pkl') # save the model as .pkl
    """

    # Stored as strings so the whole dict can be passed to json.dump later.
    overall_metric = {'CV': str(cv_meth), 'scoring_metric': str(scorer)}

    if prepy:
        # Clip on a copy: the caller's Series/array (often a row of a shared
        # DataFrame) must not be changed as a side effect of model fitting.
        y = y.copy()
        y[y < 0] = 0

    if naimpute:
        print("not implemented")
        return 0

    pipeline = Pipeline([
        ('scaler', zscore()),
        ('regressor', model)        # Regression model
    ])

    # score : https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter
    # The grid keys need the 'regressor__' prefix because the model is a pipeline step.
    out_model = GridSearchCV(pipeline, param_grid=param_grid, cv=cv_meth, scoring=scorer).fit(X, y)

    best_pipeline = out_model.best_estimator_
    y_pred = best_pipeline.predict(X)

    # In-sample correlation between observed and predicted target, as plain nested lists.
    overall_metric['correlation_true_pred'] = np.corrcoef(list(y), list(y_pred)).tolist()

    # One test score per CV split, taken at the winning parameter combination.
    best_idx = out_model.best_index_
    overall_metric['fold_scores'] = [
        float(out_model.cv_results_[f'split{i}_test_score'][best_idx])
        for i in range(out_model.n_splits_)
    ]
    best_params = out_model.best_params_

    # access the 'regressor' step from the best pipeline and then its coefficients
    coefficients = best_pipeline.named_steps['regressor'].coef_
    non_zero = coefficients != 0
    non_zero_idx = np.where(non_zero)[0]
    overall_metric['non_zero_coefficients'] = coefficients[non_zero].tolist()
    if hasattr(X, 'columns'):
        overall_metric['non_zero_features'] = list(X.columns[non_zero_idx])
    else:
        # numpy input has no column labels; fall back to positional indices.
        overall_metric['non_zero_features'] = non_zero_idx.tolist()

    # printing section
    print("best parameter from gridsearch>>\n", out_model.best_params_)
    print(overall_metric['CV'])
    print(overall_metric['scoring_metric'])
    print("correlation Matrix>>\n", overall_metric['correlation_true_pred'])
    print("non_zero_features>>\n",overall_metric['non_zero_features'])
    print("scores for each fold>>\n",overall_metric['fold_scores'])

    return overall_metric, out_model, best_params


In [126]:
def nX_cross_validation(X, target, param_grid, scorer_estimate, output_prefix, random_states, output_path='./models/10xKfold/', n_splits=3):
    """
    Repeat a shuffled K-fold Lasso grid search once per seed and keep the best run.

    For every seed in ``random_states`` a ``KFold(n_splits, shuffle=True)``
    splitter is built and passed to ``gridcv`` together with a fresh ``Lasso()``.
    The run with the highest mean fold score is recorded as the best one and
    the summary is written to ``<output_path>/<output_prefix>_nXcv.json``.

    Parameters:
    ----------
    X : pandas DataFrame
        Feature matrix.
    target : pandas Series or numpy array
        Target values; passed to ``gridcv`` with ``prepy=False`` (no clipping).
    param_grid : dict
        Grid for GridSearchCV with 'regressor__'-prefixed keys.
    scorer_estimate : str or callable
        Scoring passed through to ``gridcv``; higher is treated as better.
    output_prefix : str
        File-name prefix of the JSON result file.
    random_states : iterable of int
        Seeds for the shuffled K-fold splits; must not be empty.
    output_path : str, optional (default='./models/10xKfold/')
        Existing directory the JSON file is written to.
    n_splits : int, optional (default=3)
        Number of folds per repetition.

    Returns:
    -------
    cv_results : dict
        Keys: 'random_state' (list of seeds), 'scores' (seed -> metrics dict
        from ``gridcv``), 'mean_scores' (mean fold score per seed),
        'best_param' (tuple: best parameters, seed, mean fold score) and
        'selected_features' (non-zero-coefficient features of the best run).

    Raises:
    ------
    FileNotFoundError
        If ``output_path`` does not exist.
    ValueError
        If ``random_states`` is empty.
    """
    if os.path.exists(output_path):
        print(f"The path {output_path} exists.")
    else:
        print(f"The path {output_path} does not exist.")
        raise FileNotFoundError(f"The path {output_path} does not exist.")

    random_states = list(random_states)
    if not random_states:
        raise ValueError("random_states must contain at least one seed.")

    best_fold_mean = float('-inf')
    cv_results = {'random_state': [], 'scores': {}, 'mean_scores': [], 'selected_features': {}, 'best_param': []}
    for ran_state in random_states:
        print(ran_state)
        kfold_cv = KFold(n_splits=n_splits, shuffle=True, random_state=ran_state)
        scores, _, best_param = gridcv(
            X,
            target,
            Lasso(),
            param_grid,
            prepy=False,
            scorer=scorer_estimate,
            cv_meth=kfold_cv
        )
        fold_mean = float(np.mean(scores['fold_scores']))
        cv_results['random_state'].append(ran_state)
        cv_results['scores'][ran_state] = scores
        cv_results['mean_scores'].append(fold_mean)

        # The first run always seeds the result so 'best_param' is populated even
        # if every mean is NaN; afterwards only a strictly better mean replaces it.
        if not cv_results['best_param'] or fold_mean > best_fold_mean:
            cv_results['best_param'] = best_param, ran_state, fold_mean
            if not np.isnan(fold_mean):
                best_fold_mean = fold_mean

    best_state = cv_results['best_param'][1]
    print(f"best estimator>>\n found in split: {cv_results['best_param'][1]}\n param_grid: {cv_results['best_param'][0]}\n mean fold score {cv_results['best_param'][2]}")

    # Report the features of the model that was actually cross-validated: gridcv
    # already refits the scaler + Lasso pipeline with the best parameters on the
    # full data. Refitting a bare, unscaled Lasso here would select features with
    # an alpha tuned for standardised inputs and would fail for grids that lack
    # 'regressor__fit_intercept'.
    cv_results['selected_features'] = list(cv_results['scores'][best_state]['non_zero_features'])

    #save to json
    with open(os.path.join(output_path, f"{output_prefix}_nXcv.json"), 'w') as file:
        json.dump(cv_results, file)

    return cv_results




In [9]:
# Directory containing the input tables. Defaults to the original location;
# set the PW_DATA_DIR environment variable to run on another machine.
DATA_DIR = os.environ.get("PW_DATA_DIR", "/home/t44p/PW_rawdata/tr_gc_mutual")

tr_mut = pd.read_csv(os.path.join(DATA_DIR, "tr_mut.csv"), sep=",")
# gcms_mut / lcms_mut: one row per metabolite; the 'metabolite' column holds the
# label and the remaining columns are used as regression targets below.
gcms_mut = pd.read_csv(os.path.join(DATA_DIR, "gcms_mut.csv"), sep=",")
lcms_mut = pd.read_csv(os.path.join(DATA_DIR, "lcms_mut.csv"), sep=",")

# Feature matrix passed as X to the cross-validation helpers.
X = pd.read_csv(os.path.join(DATA_DIR, "tr_mut_transposed.csv"), sep=",")


In [10]:
def to_valid_variable_name(name, max_length=30):
    """
    Turn an arbitrary label into a short identifier-like string.

    Parameters:
    ----------
    name : str
        Label to sanitise (e.g. a metabolite name).
    max_length : int, optional (default=30)
        Non-negative length the sanitised name is truncated to. A '_' prefix
        added by the final digit check is not counted against it.

    Returns:
    -------
    str
        ``name`` with every non-word character replaced by '_', runs of '_'
        collapsed, truncated to ``max_length`` and guaranteed not to start with
        a digit. An empty result is returned as '_'.
    """
    # Replace special characters with underscores; the zero-width alternative
    # also inserts a '_' in front of a leading decimal digit.
    name = re.sub(r'\W|^(?=\d)', '_', name)
    # Reduce multiple consecutive underscores to one
    name = re.sub(r'_{2,}', '_', name)
    # Truncate length if necessary
    name = name[:max_length]
    # An empty string is not a usable name (and name[0] below would fail).
    if not name:
        return "_"
    # Still needed: str.isdigit() accepts characters (e.g. superscript digits)
    # that the regex \d does not match.
    if name[0].isdigit():
        name = "_" + name
    return name

In [11]:
# Map each sanitised GC-MS metabolite name to its original label.
# Later labels overwrite earlier ones if two of them sanitise to the same key.
gcms_target_dict = {
    to_valid_variable_name(label): str(label)
    for label in gcms_mut['metabolite']
}

In [12]:
# Map each sanitised LC-MS metabolite name to its original label.
# Later labels overwrite earlier ones if two of them sanitise to the same key.
lcms_target_dict = {
    to_valid_variable_name(label): str(label)
    for label in lcms_mut['metabolite']
}
    

In [13]:
# Show the sanitised name next to the original LC-MS label for a visual check.
for safe_name in lcms_target_dict:
    original_label = lcms_target_dict[safe_name]
    print(f"{safe_name} \t {original_label}")

alpha_L_Fucose_1_Phosphate 	 alpha-L-(-)-Fucose-1-Phosphate
Trehalose 	 Trehalose
Trehalose_phenolic_acid_ 	 Trehalose + phenolic acid?*
Maltotriose_ 	 Maltotriose*
_366_9939681_1_170199352 	 366.9939681/1.170199352
_R_2_Phenylglycin 	 (R)-(-)-2-Phenylglycin
Oxidized_glutathione_ 	 Oxidized glutathione*
_312_0943959_2_614228811 	 312.0943959/2.614228811
Guanosine_ 	 Guanosine*
gamma_Glutamylvaline_ 	 gamma-Glutamylvaline*
_512_1445532_3_132648409 	 512.1445532/3.132648409
Catechol_ 	 Catechol?*
_303_0720727_3_335904942 	 303.0720727/3.335904942
Quinic_acid_derivative_3_56 	 Quinic acid derivative*_3.56
_374_1568694_3_674168475 	 374.1568694/3.674168475
Phaseoloidin 	 Phaseoloidin
_496_1502126_3_792994891 	 496.1502126/3.792994891
Vanilloloside 	 Vanilloloside
CGA_hexose_ 	 CGA+hexose?*
Phenolic_glycoside_ 	 Phenolic? - glycoside*
gamma_Glutamylisoleucine_ 	 gamma-Glutamylisoleucine*
L_gamma_glutamyl_L_isoleucine 	 L-gamma-glutamyl-L-isoleucine
_658_1575839_4_528042834 	 658.1575839/4.5

In [5]:
def _target_row(table, row):
    """Print the label in column 0 of ``table`` at position ``row`` and return the rest of that row."""
    values = table.iloc[row, 1:]
    print(table.iloc[row, 0])
    return values


# primary metabolites (rows of the GC-MS table)
y_sucrose = _target_row(gcms_mut, 59)
y_citricAcid = _target_row(gcms_mut, 13)
y_glucose = _target_row(gcms_mut, 23)

# branched-chain amino acids (rows of the GC-MS table)
y_isoleucine = _target_row(gcms_mut, 38)
y_leucine = _target_row(gcms_mut, 39)
y_valine = _target_row(gcms_mut, 66)

# secondary metabolites (rows of the LC-MS table)
# NOTE(review): the labels printed for these three rows do not literally match
# the variable names (e.g. row 2 prints "Trehalose + phenolic acid?*") --
# confirm that these row positions are the intended features.
y_trehalose = _target_row(lcms_mut, 2)
y_rutin = _target_row(lcms_mut, 44)
y_oxGlut = _target_row(lcms_mut, 7)




sucrose_437_361_rt13.77
citric_acid_273_375_rt9.72
glucose_160_rt9.81
isoleucin_158_233_rt.5.21
leucine_158_232_rt4.97
valine_144_218_rt4.42
Trehalose + phenolic acid?*
641.171822/6.885700966
312.0943959/2.614228811


In [27]:
# Seeds for the repeated K-fold runs. Reduced list for a quick run; the longer
# list used previously was [42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52].
tenX = [42, 43]

out = './py/10xKfold/'
param_grid = {
    'regressor__alpha': np.arange(0.0125, 0.0425, 0.0025),
    'regressor__fit_intercept': [True, False],
}


def _run_10x(y_target, prefix):
    """Run the repeated K-fold search for one target on the first 50 feature columns, scored by r2."""
    return nX_cross_validation(
        X.iloc[:, 0:50],
        y_target,
        param_grid,
        'r2',
        prefix,
        random_states=tenX,
        output_path=out,
    )


glucose_10xKfold = _run_10x(y_glucose, "glucose")
sucrose_10xKfold = _run_10x(y_sucrose, "sucrose")
citricAcid_10xKfold = _run_10x(y_citricAcid, "citricAcid")
isoleucine_10xKfold = _run_10x(y_isoleucine, "isoleucine")
leucine_10xKfold = _run_10x(y_leucine, "leucine")
valine_10xKfold = _run_10x(y_valine, "valine")
trehalose_10xKfold = _run_10x(y_trehalose, "trehalose")
rutin_10xKfold = _run_10x(y_rutin, "rutin")
oxGlut_10xKfold = _run_10x(y_oxGlut, "oxGlut")

The path ./py/10xKfold/ exists.
42
best parameter from gridsearch>>
 {'regressor__alpha': 0.04000000000000001, 'regressor__fit_intercept': True}
KFold(n_splits=3, random_state=42, shuffle=True)
r2
correlation Matrix>>
 [[1.0, 0.8817032281542764], [0.8817032281542764, 0.9999999999999999]]
non_zero_features>>
 ['Xele.ptg000045l.7', 'Xele.ptg000045l.8', 'Xele.ptg000045l.11', 'Xele.ptg000045l.14', 'Xele.ptg000045l.18', 'Xele.ptg000045l.33', 'Xele.ptg000045l.34', 'Xele.ptg000045l.36', 'Xele.ptg000045l.38', 'Xele.ptg000045l.41', 'Xele.ptg000045l.48', 'Xele.ptg000045l.50', 'Xele.ptg000045l.51', 'Xele.ptg000045l.55', 'Xele.ptg000045l.56']
scores for each fold>>
 [-1.317083485777105, 0.045600854000389046, 0.06066564394285734]
43
best parameter from gridsearch>>
 {'regressor__alpha': 0.025, 'regressor__fit_intercept': True}
KFold(n_splits=3, random_state=43, shuffle=True)
r2
correlation Matrix>>
 [[1.0, 0.92373405813201], [0.92373405813201, 0.9999999999999999]]
non_zero_features>>
 ['Xele.ptg000

In [127]:
# Seeds for the repeated K-fold runs. Reduced list for a quick run; the longer
# list used previously was [42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52].
tenX = [42, 43]

out = './py/10xKfold/test_full/'
param_grid = {
    'regressor__alpha': np.array(np.arange(0.0125, 0.0425, 0.0025)),
    'regressor__fit_intercept': [True, False]
}

# Iterate over the table rows themselves rather than over lcms_target_dict:
# the dict holds one entry per *unique* sanitised name, so pairing its i-th key
# with the i-th row would silently shift targets (and skip trailing rows) as
# soon as two metabolite labels sanitise to the same key.
# Rows whose labels collide still write to the same JSON file name.
for i, orig_str in enumerate(lcms_mut['metabolite']):
    lcms_target = to_valid_variable_name(orig_str)
    print(f"{lcms_mut.iloc[i,0]}\t{lcms_target}")
    tmp_10xKfold = nX_cross_validation(X.iloc[:,0:25], lcms_mut.iloc[i,1:], param_grid, 'r2', str(lcms_target), random_states=tenX, output_path=out)


# Save the sanitised-name -> original-label mapping next to the per-target results.
with open(os.path.join(out, "lcms_dict_nXcv.json"), 'w') as file:
    json.dump(lcms_target_dict, file)

alpha-L-(-)-Fucose-1-Phosphate	alpha_L_Fucose_1_Phosphate
The path ./py/10xKfold/test_full/ exists.
42
best parameter from gridsearch>>
 {'regressor__alpha': 0.04000000000000001, 'regressor__fit_intercept': True}
KFold(n_splits=3, random_state=42, shuffle=True)
r2
correlation Matrix>>
 [[1.0, 0.8057166076597204], [0.8057166076597204, 1.0]]
non_zero_features>>
 ['Xele.ptg000045l.1', 'Xele.ptg000045l.9', 'Xele.ptg000045l.10', 'Xele.ptg000045l.14', 'Xele.ptg000045l.15', 'Xele.ptg000045l.24', 'Xele.ptg000045l.26']
scores for each fold>>
 [-0.790537231898585, 0.4586400584996887, 0.16023900239379718]
43
best parameter from gridsearch>>
 {'regressor__alpha': 0.030000000000000002, 'regressor__fit_intercept': True}
KFold(n_splits=3, random_state=43, shuffle=True)
r2
correlation Matrix>>
 [[1.0, 0.8150757650104326], [0.8150757650104326, 1.0]]
non_zero_features>>
 ['Xele.ptg000045l.1', 'Xele.ptg000045l.2', 'Xele.ptg000045l.9', 'Xele.ptg000045l.10', 'Xele.ptg000045l.14', 'Xele.ptg000045l.15', 'Xel

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

best parameter from gridsearch>>
 {'regressor__alpha': 0.04000000000000001, 'regressor__fit_intercept': True}
KFold(n_splits=3, random_state=42, shuffle=True)
r2
correlation Matrix>>
 [[0.9999999999999999, 0.8564760156177935], [0.8564760156177934, 1.0]]
non_zero_features>>
 ['Xele.ptg000045l.1', 'Xele.ptg000045l.6', 'Xele.ptg000045l.7', 'Xele.ptg000045l.8', 'Xele.ptg000045l.9', 'Xele.ptg000045l.10', 'Xele.ptg000045l.11', 'Xele.ptg000045l.15', 'Xele.ptg000045l.16', 'Xele.ptg000045l.18', 'Xele.ptg000045l.19', 'Xele.ptg000045l.22', 'Xele.ptg000045l.23', 'Xele.ptg000045l.24', 'Xele.ptg000045l.25', 'Xele.ptg000045l.26', 'Xele.ptg000045l.27', 'Xele.ptg000045l.28', 'Xele.ptg000045l.29', 'Xele.ptg000045l.31']
scores for each fold>>
 [-0.7799357729814855, -1.1612370406644628, -3.038269185460603]
43


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


best parameter from gridsearch>>
 {'regressor__alpha': 0.04000000000000001, 'regressor__fit_intercept': True}
KFold(n_splits=3, random_state=43, shuffle=True)
r2
correlation Matrix>>
 [[0.9999999999999999, 0.8564760156177935], [0.8564760156177934, 1.0]]
non_zero_features>>
 ['Xele.ptg000045l.1', 'Xele.ptg000045l.6', 'Xele.ptg000045l.7', 'Xele.ptg000045l.8', 'Xele.ptg000045l.9', 'Xele.ptg000045l.10', 'Xele.ptg000045l.11', 'Xele.ptg000045l.15', 'Xele.ptg000045l.16', 'Xele.ptg000045l.18', 'Xele.ptg000045l.19', 'Xele.ptg000045l.22', 'Xele.ptg000045l.23', 'Xele.ptg000045l.24', 'Xele.ptg000045l.25', 'Xele.ptg000045l.26', 'Xele.ptg000045l.27', 'Xele.ptg000045l.28', 'Xele.ptg000045l.29', 'Xele.ptg000045l.31']
scores for each fold>>
 [-3.3895871278884835, -0.768624690826859, -1.0275183966995747]
best estimator>>
 found in split: 42
 param_grid: {'regressor__alpha': 0.04000000000000001, 'regressor__fit_intercept': True}
 mean fold score -1.659813999702184
427.1824019/5.95992104	_427_1824019_5_95

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


best parameter from gridsearch>>
 {'regressor__alpha': 0.04000000000000001, 'regressor__fit_intercept': True}
KFold(n_splits=3, random_state=42, shuffle=True)
r2
correlation Matrix>>
 [[1.0, 0.9661565811561071], [0.9661565811561071, 1.0]]
non_zero_features>>
 ['Xele.ptg000212l.1', 'Xele.ptg000045l.1', 'Xele.ptg000045l.2', 'Xele.ptg000045l.6', 'Xele.ptg000045l.7', 'Xele.ptg000045l.8', 'Xele.ptg000045l.9', 'Xele.ptg000045l.10', 'Xele.ptg000045l.11', 'Xele.ptg000045l.14', 'Xele.ptg000045l.15', 'Xele.ptg000045l.16', 'Xele.ptg000045l.18', 'Xele.ptg000045l.19', 'Xele.ptg000045l.20', 'Xele.ptg000045l.21', 'Xele.ptg000045l.22', 'Xele.ptg000045l.24', 'Xele.ptg000045l.25', 'Xele.ptg000045l.26', 'Xele.ptg000045l.27', 'Xele.ptg000045l.29']
scores for each fold>>
 [-3.5870448109367103, -0.01019097124032875, 0.6080482049001665]
43


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


best parameter from gridsearch>>
 {'regressor__alpha': 0.027500000000000004, 'regressor__fit_intercept': True}
KFold(n_splits=3, random_state=43, shuffle=True)
r2
correlation Matrix>>
 [[1.0, 0.9692054945467412], [0.9692054945467411, 1.0]]
non_zero_features>>
 ['Xele.ptg000212l.1', 'Xele.ptg000045l.1', 'Xele.ptg000045l.2', 'Xele.ptg000045l.6', 'Xele.ptg000045l.7', 'Xele.ptg000045l.8', 'Xele.ptg000045l.9', 'Xele.ptg000045l.10', 'Xele.ptg000045l.11', 'Xele.ptg000045l.14', 'Xele.ptg000045l.15', 'Xele.ptg000045l.16', 'Xele.ptg000045l.18', 'Xele.ptg000045l.19', 'Xele.ptg000045l.20', 'Xele.ptg000045l.21', 'Xele.ptg000045l.22', 'Xele.ptg000045l.24', 'Xele.ptg000045l.25', 'Xele.ptg000045l.26', 'Xele.ptg000045l.27', 'Xele.ptg000045l.29']
scores for each fold>>
 [-5.052666676081658, 0.008625961976717833, 0.4155223730343851]
best estimator>>
 found in split: 42
 param_grid: {'regressor__alpha': 0.04000000000000001, 'regressor__fit_intercept': True}
 mean fold score -0.9963958590922909
441.1978503

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


best parameter from gridsearch>>
 {'regressor__alpha': 0.04000000000000001, 'regressor__fit_intercept': True}
KFold(n_splits=3, random_state=42, shuffle=True)
r2
correlation Matrix>>
 [[1.0, 0.7705635601986093], [0.7705635601986093, 1.0]]
non_zero_features>>
 ['Xele.ptg000212l.1', 'Xele.ptg000045l.1', 'Xele.ptg000045l.2', 'Xele.ptg000045l.7', 'Xele.ptg000045l.8', 'Xele.ptg000045l.9', 'Xele.ptg000045l.11', 'Xele.ptg000045l.15', 'Xele.ptg000045l.18', 'Xele.ptg000045l.19', 'Xele.ptg000045l.22', 'Xele.ptg000045l.25', 'Xele.ptg000045l.26', 'Xele.ptg000045l.27', 'Xele.ptg000045l.29']
scores for each fold>>
 [-0.4391237912215169, -1.9350631971041712, -1.544515102304561]
43


  model = cd_fast.enet_coordinate_descent(


best parameter from gridsearch>>
 {'regressor__alpha': 0.04000000000000001, 'regressor__fit_intercept': True}
KFold(n_splits=3, random_state=43, shuffle=True)
r2
correlation Matrix>>
 [[1.0, 0.7705635601986093], [0.7705635601986093, 1.0]]
non_zero_features>>
 ['Xele.ptg000212l.1', 'Xele.ptg000045l.1', 'Xele.ptg000045l.2', 'Xele.ptg000045l.7', 'Xele.ptg000045l.8', 'Xele.ptg000045l.9', 'Xele.ptg000045l.11', 'Xele.ptg000045l.15', 'Xele.ptg000045l.18', 'Xele.ptg000045l.19', 'Xele.ptg000045l.22', 'Xele.ptg000045l.25', 'Xele.ptg000045l.26', 'Xele.ptg000045l.27', 'Xele.ptg000045l.29']
scores for each fold>>
 [-2.0956235952553364, -1.5437013892709426, -0.24127668798610502]
best estimator>>
 found in split: 43
 param_grid: {'regressor__alpha': 0.04000000000000001, 'regressor__fit_intercept': True}
 mean fold score -1.2935338908374614
Kaempferol-rhamnose-glucose	Kaempferol_rhamnose_glucose
The path ./py/10xKfold/test_full/ exists.
42
best parameter from gridsearch>>
 {'regressor__alpha': 0.04000

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


best parameter from gridsearch>>
 {'regressor__alpha': 0.04000000000000001, 'regressor__fit_intercept': True}
KFold(n_splits=3, random_state=42, shuffle=True)
r2
correlation Matrix>>
 [[1.0, 0.771898900194149], [0.771898900194149, 0.9999999999999999]]
non_zero_features>>
 ['Xele.ptg000212l.1', 'Xele.ptg000045l.1', 'Xele.ptg000045l.2', 'Xele.ptg000045l.6', 'Xele.ptg000045l.7', 'Xele.ptg000045l.8', 'Xele.ptg000045l.9', 'Xele.ptg000045l.10', 'Xele.ptg000045l.11', 'Xele.ptg000045l.16', 'Xele.ptg000045l.19', 'Xele.ptg000045l.20', 'Xele.ptg000045l.21', 'Xele.ptg000045l.22', 'Xele.ptg000045l.23', 'Xele.ptg000045l.24', 'Xele.ptg000045l.25', 'Xele.ptg000045l.26', 'Xele.ptg000045l.27', 'Xele.ptg000045l.28', 'Xele.ptg000045l.29', 'Xele.ptg000045l.31']
scores for each fold>>
 [-3.827076164469884, -2.335500649685778, -1.53241490848366]
43


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

best parameter from gridsearch>>
 {'regressor__alpha': 0.04000000000000001, 'regressor__fit_intercept': True}
KFold(n_splits=3, random_state=43, shuffle=True)
r2
correlation Matrix>>
 [[1.0, 0.771898900194149], [0.771898900194149, 0.9999999999999999]]
non_zero_features>>
 ['Xele.ptg000212l.1', 'Xele.ptg000045l.1', 'Xele.ptg000045l.2', 'Xele.ptg000045l.6', 'Xele.ptg000045l.7', 'Xele.ptg000045l.8', 'Xele.ptg000045l.9', 'Xele.ptg000045l.10', 'Xele.ptg000045l.11', 'Xele.ptg000045l.16', 'Xele.ptg000045l.19', 'Xele.ptg000045l.20', 'Xele.ptg000045l.21', 'Xele.ptg000045l.22', 'Xele.ptg000045l.23', 'Xele.ptg000045l.24', 'Xele.ptg000045l.25', 'Xele.ptg000045l.26', 'Xele.ptg000045l.27', 'Xele.ptg000045l.28', 'Xele.ptg000045l.29', 'Xele.ptg000045l.31']
scores for each fold>>
 [-3.583487930975825, -4.123227340969362, -1.262421406004976]
best estimator>>
 found in split: 42
 param_grid: {'regressor__alpha': 0.04000000000000001, 'regressor__fit_intercept': True}
 mean fold score -2.564997240879774
Di

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


best parameter from gridsearch>>
 {'regressor__alpha': 0.04000000000000001, 'regressor__fit_intercept': True}
KFold(n_splits=3, random_state=42, shuffle=True)
r2
correlation Matrix>>
 [[1.0, 0.8227529010616078], [0.8227529010616078, 1.0]]
non_zero_features>>
 ['Xele.ptg000212l.1', 'Xele.ptg000045l.1', 'Xele.ptg000045l.2', 'Xele.ptg000045l.6', 'Xele.ptg000045l.7', 'Xele.ptg000045l.8', 'Xele.ptg000045l.9', 'Xele.ptg000045l.10', 'Xele.ptg000045l.14', 'Xele.ptg000045l.15', 'Xele.ptg000045l.16', 'Xele.ptg000045l.19', 'Xele.ptg000045l.20', 'Xele.ptg000045l.21', 'Xele.ptg000045l.22', 'Xele.ptg000045l.23', 'Xele.ptg000045l.25', 'Xele.ptg000045l.27', 'Xele.ptg000045l.29', 'Xele.ptg000045l.31']
scores for each fold>>
 [-4.162776408699378, -2.1226833527220865, -0.07746233494979338]
43


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


best parameter from gridsearch>>
 {'regressor__alpha': 0.04000000000000001, 'regressor__fit_intercept': True}
KFold(n_splits=3, random_state=43, shuffle=True)
r2
correlation Matrix>>
 [[1.0, 0.8227529010616078], [0.8227529010616078, 1.0]]
non_zero_features>>
 ['Xele.ptg000212l.1', 'Xele.ptg000045l.1', 'Xele.ptg000045l.2', 'Xele.ptg000045l.6', 'Xele.ptg000045l.7', 'Xele.ptg000045l.8', 'Xele.ptg000045l.9', 'Xele.ptg000045l.10', 'Xele.ptg000045l.14', 'Xele.ptg000045l.15', 'Xele.ptg000045l.16', 'Xele.ptg000045l.19', 'Xele.ptg000045l.20', 'Xele.ptg000045l.21', 'Xele.ptg000045l.22', 'Xele.ptg000045l.23', 'Xele.ptg000045l.25', 'Xele.ptg000045l.27', 'Xele.ptg000045l.29', 'Xele.ptg000045l.31']
scores for each fold>>
 [-3.8897650251963034, -0.9071244055646139, -2.212630775140475]
best estimator>>
 found in split: 42
 param_grid: {'regressor__alpha': 0.04000000000000001, 'regressor__fit_intercept': True}
 mean fold score -2.120974032123753
Saponin*_8.8	Saponin_8_8
The path ./py/10xKfold/test_full

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


best parameter from gridsearch>>
 {'regressor__alpha': 0.04000000000000001, 'regressor__fit_intercept': True}
KFold(n_splits=3, random_state=42, shuffle=True)
r2
correlation Matrix>>
 [[1.0, 0.8545790612524204], [0.8545790612524204, 1.0]]
non_zero_features>>
 ['Xele.ptg000212l.1', 'Xele.ptg000045l.1', 'Xele.ptg000045l.2', 'Xele.ptg000045l.6', 'Xele.ptg000045l.7', 'Xele.ptg000045l.8', 'Xele.ptg000045l.9', 'Xele.ptg000045l.10', 'Xele.ptg000045l.11', 'Xele.ptg000045l.14', 'Xele.ptg000045l.15', 'Xele.ptg000045l.16', 'Xele.ptg000045l.18', 'Xele.ptg000045l.19', 'Xele.ptg000045l.20', 'Xele.ptg000045l.21', 'Xele.ptg000045l.22', 'Xele.ptg000045l.23', 'Xele.ptg000045l.24', 'Xele.ptg000045l.25', 'Xele.ptg000045l.26', 'Xele.ptg000045l.31']
scores for each fold>>
 [-0.6383367277872625, -1.5632746173235628, -3.139725586467846]
43


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


best parameter from gridsearch>>
 {'regressor__alpha': 0.04000000000000001, 'regressor__fit_intercept': True}
KFold(n_splits=3, random_state=43, shuffle=True)
r2
correlation Matrix>>
 [[1.0, 0.8545790612524204], [0.8545790612524204, 1.0]]
non_zero_features>>
 ['Xele.ptg000212l.1', 'Xele.ptg000045l.1', 'Xele.ptg000045l.2', 'Xele.ptg000045l.6', 'Xele.ptg000045l.7', 'Xele.ptg000045l.8', 'Xele.ptg000045l.9', 'Xele.ptg000045l.10', 'Xele.ptg000045l.11', 'Xele.ptg000045l.14', 'Xele.ptg000045l.15', 'Xele.ptg000045l.16', 'Xele.ptg000045l.18', 'Xele.ptg000045l.19', 'Xele.ptg000045l.20', 'Xele.ptg000045l.21', 'Xele.ptg000045l.22', 'Xele.ptg000045l.23', 'Xele.ptg000045l.24', 'Xele.ptg000045l.25', 'Xele.ptg000045l.26', 'Xele.ptg000045l.31']
scores for each fold>>
 [-1.2884062637218423, -0.590576598217684, -3.1980955570108343]
best estimator>>
 found in split: 43
 param_grid: {'regressor__alpha': 0.04000000000000001, 'regressor__fit_intercept': True}
 mean fold score -1.6923594729834537
1241.580464/

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


best parameter from gridsearch>>
 {'regressor__alpha': 0.0175, 'regressor__fit_intercept': True}
KFold(n_splits=3, random_state=42, shuffle=True)
r2
correlation Matrix>>
 [[1.0, 0.8574934275784368], [0.8574934275784368, 0.9999999999999998]]
non_zero_features>>
 ['Xele.ptg000212l.1', 'Xele.ptg000045l.1', 'Xele.ptg000045l.2', 'Xele.ptg000045l.6', 'Xele.ptg000045l.7', 'Xele.ptg000045l.9', 'Xele.ptg000045l.10', 'Xele.ptg000045l.11', 'Xele.ptg000045l.15', 'Xele.ptg000045l.16', 'Xele.ptg000045l.18', 'Xele.ptg000045l.19', 'Xele.ptg000045l.20', 'Xele.ptg000045l.22', 'Xele.ptg000045l.23', 'Xele.ptg000045l.24', 'Xele.ptg000045l.25', 'Xele.ptg000045l.26', 'Xele.ptg000045l.27', 'Xele.ptg000045l.28', 'Xele.ptg000045l.29', 'Xele.ptg000045l.31']
scores for each fold>>
 [-4.950872996974973, -2.4312367923518976, -3.9938453706190726]
43


  model = cd_fast.enet_coordinate_descent(


best parameter from gridsearch>>
 {'regressor__alpha': 0.04000000000000001, 'regressor__fit_intercept': True}
KFold(n_splits=3, random_state=43, shuffle=True)
r2
correlation Matrix>>
 [[1.0, 0.8353425249042398], [0.8353425249042398, 0.9999999999999998]]
non_zero_features>>
 ['Xele.ptg000212l.1', 'Xele.ptg000045l.1', 'Xele.ptg000045l.6', 'Xele.ptg000045l.7', 'Xele.ptg000045l.8', 'Xele.ptg000045l.9', 'Xele.ptg000045l.10', 'Xele.ptg000045l.11', 'Xele.ptg000045l.15', 'Xele.ptg000045l.16', 'Xele.ptg000045l.18', 'Xele.ptg000045l.19', 'Xele.ptg000045l.20', 'Xele.ptg000045l.23', 'Xele.ptg000045l.24', 'Xele.ptg000045l.25', 'Xele.ptg000045l.26', 'Xele.ptg000045l.27', 'Xele.ptg000045l.28', 'Xele.ptg000045l.29', 'Xele.ptg000045l.31']
scores for each fold>>
 [-3.8171408561074207, -2.2426333052840435, -1.9677822685860096]
best estimator>>
 found in split: 43
 param_grid: {'regressor__alpha': 0.04000000000000001, 'regressor__fit_intercept': True}
 mean fold score -2.675852143325825
1021.486521/9.1412

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

best parameter from gridsearch>>
 {'regressor__alpha': 0.04000000000000001, 'regressor__fit_intercept': True}
KFold(n_splits=3, random_state=42, shuffle=True)
r2
correlation Matrix>>
 [[1.0, 0.868838139855373], [0.868838139855373, 1.0]]
non_zero_features>>
 ['Xele.ptg000212l.1', 'Xele.ptg000045l.2', 'Xele.ptg000045l.6', 'Xele.ptg000045l.8', 'Xele.ptg000045l.9', 'Xele.ptg000045l.10', 'Xele.ptg000045l.11', 'Xele.ptg000045l.14', 'Xele.ptg000045l.15', 'Xele.ptg000045l.16', 'Xele.ptg000045l.20', 'Xele.ptg000045l.22', 'Xele.ptg000045l.23', 'Xele.ptg000045l.24', 'Xele.ptg000045l.25', 'Xele.ptg000045l.27', 'Xele.ptg000045l.28', 'Xele.ptg000045l.29', 'Xele.ptg000045l.31']
scores for each fold>>
 [-5.438111914951817, -1.7898662854567169, -0.12027846749602356]
43


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


best parameter from gridsearch>>
 {'regressor__alpha': 0.04000000000000001, 'regressor__fit_intercept': True}
KFold(n_splits=3, random_state=43, shuffle=True)
r2
correlation Matrix>>
 [[1.0, 0.868838139855373], [0.868838139855373, 1.0]]
non_zero_features>>
 ['Xele.ptg000212l.1', 'Xele.ptg000045l.2', 'Xele.ptg000045l.6', 'Xele.ptg000045l.8', 'Xele.ptg000045l.9', 'Xele.ptg000045l.10', 'Xele.ptg000045l.11', 'Xele.ptg000045l.14', 'Xele.ptg000045l.15', 'Xele.ptg000045l.16', 'Xele.ptg000045l.20', 'Xele.ptg000045l.22', 'Xele.ptg000045l.23', 'Xele.ptg000045l.24', 'Xele.ptg000045l.25', 'Xele.ptg000045l.27', 'Xele.ptg000045l.28', 'Xele.ptg000045l.29', 'Xele.ptg000045l.31']
scores for each fold>>
 [-1.692814627219846, -1.1486817856746923, -0.41519617716543644]
best estimator>>
 found in split: 43
 param_grid: {'regressor__alpha': 0.04000000000000001, 'regressor__fit_intercept': True}
 mean fold score -1.0855641966866583
Saponin*_10.183	Saponin_10_183
The path ./py/10xKfold/test_full/ exists.
42
b

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


best parameter from gridsearch>>
 {'regressor__alpha': 0.04000000000000001, 'regressor__fit_intercept': True}
KFold(n_splits=3, random_state=42, shuffle=True)
r2
correlation Matrix>>
 [[1.0, 0.7385954896947532], [0.7385954896947532, 1.0]]
non_zero_features>>
 ['Xele.ptg000212l.1', 'Xele.ptg000045l.6', 'Xele.ptg000045l.7', 'Xele.ptg000045l.9', 'Xele.ptg000045l.11', 'Xele.ptg000045l.14', 'Xele.ptg000045l.16', 'Xele.ptg000045l.18', 'Xele.ptg000045l.19', 'Xele.ptg000045l.20', 'Xele.ptg000045l.23', 'Xele.ptg000045l.26', 'Xele.ptg000045l.27', 'Xele.ptg000045l.28', 'Xele.ptg000045l.29', 'Xele.ptg000045l.31']
scores for each fold>>
 [-1.6753055703886246, -1.8561657180127313, -1.3245748122558414]
43


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


best parameter from gridsearch>>
 {'regressor__alpha': 0.04000000000000001, 'regressor__fit_intercept': True}
KFold(n_splits=3, random_state=43, shuffle=True)
r2
correlation Matrix>>
 [[1.0, 0.7385954896947532], [0.7385954896947532, 1.0]]
non_zero_features>>
 ['Xele.ptg000212l.1', 'Xele.ptg000045l.6', 'Xele.ptg000045l.7', 'Xele.ptg000045l.9', 'Xele.ptg000045l.11', 'Xele.ptg000045l.14', 'Xele.ptg000045l.16', 'Xele.ptg000045l.18', 'Xele.ptg000045l.19', 'Xele.ptg000045l.20', 'Xele.ptg000045l.23', 'Xele.ptg000045l.26', 'Xele.ptg000045l.27', 'Xele.ptg000045l.28', 'Xele.ptg000045l.29', 'Xele.ptg000045l.31']
scores for each fold>>
 [-2.449124811746174, -0.9171317645879942, -1.4110523069468153]
best estimator>>
 found in split: 43
 param_grid: {'regressor__alpha': 0.04000000000000001, 'regressor__fit_intercept': True}
 mean fold score -1.5924362944269947
Ribulose 5-phosphate_L	Ribulose_5_phosphate_L
The path ./py/10xKfold/test_full/ exists.
42
best parameter from gridsearch>>
 {'regressor__alp

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


best parameter from gridsearch>>
 {'regressor__alpha': 0.022500000000000003, 'regressor__fit_intercept': True}
KFold(n_splits=3, random_state=42, shuffle=True)
r2
correlation Matrix>>
 [[1.0, 0.8984065778372593], [0.8984065778372593, 1.0]]
non_zero_features>>
 ['Xele.ptg000212l.1', 'Xele.ptg000045l.1', 'Xele.ptg000045l.2', 'Xele.ptg000045l.7', 'Xele.ptg000045l.8', 'Xele.ptg000045l.9', 'Xele.ptg000045l.10', 'Xele.ptg000045l.11', 'Xele.ptg000045l.14', 'Xele.ptg000045l.15', 'Xele.ptg000045l.16', 'Xele.ptg000045l.18', 'Xele.ptg000045l.19', 'Xele.ptg000045l.20', 'Xele.ptg000045l.21', 'Xele.ptg000045l.22', 'Xele.ptg000045l.23', 'Xele.ptg000045l.24', 'Xele.ptg000045l.25', 'Xele.ptg000045l.26', 'Xele.ptg000045l.27', 'Xele.ptg000045l.28', 'Xele.ptg000045l.29', 'Xele.ptg000045l.31']
scores for each fold>>
 [-2.3206059099438403, -10.342569089865588, -1.2001527425973668]
43


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


best parameter from gridsearch>>
 {'regressor__alpha': 0.04000000000000001, 'regressor__fit_intercept': True}
KFold(n_splits=3, random_state=43, shuffle=True)
r2
correlation Matrix>>
 [[1.0, 0.8856538896490241], [0.8856538896490241, 0.9999999999999999]]
non_zero_features>>
 ['Xele.ptg000212l.1', 'Xele.ptg000045l.2', 'Xele.ptg000045l.7', 'Xele.ptg000045l.8', 'Xele.ptg000045l.9', 'Xele.ptg000045l.10', 'Xele.ptg000045l.11', 'Xele.ptg000045l.14', 'Xele.ptg000045l.15', 'Xele.ptg000045l.16', 'Xele.ptg000045l.18', 'Xele.ptg000045l.19', 'Xele.ptg000045l.20', 'Xele.ptg000045l.21', 'Xele.ptg000045l.22', 'Xele.ptg000045l.23', 'Xele.ptg000045l.24', 'Xele.ptg000045l.25', 'Xele.ptg000045l.26', 'Xele.ptg000045l.27', 'Xele.ptg000045l.28', 'Xele.ptg000045l.29', 'Xele.ptg000045l.31']
scores for each fold>>
 [-0.9793584224178773, -0.06074454828283282, -4.742580841321035]
best estimator>>
 found in split: 43
 param_grid: {'regressor__alpha': 0.04000000000000001, 'regressor__fit_intercept': True}
 mean fol

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


best parameter from gridsearch>>
 {'regressor__alpha': 0.04000000000000001, 'regressor__fit_intercept': True}
KFold(n_splits=3, random_state=42, shuffle=True)
r2
correlation Matrix>>
 [[0.9999999999999998, 0.7014884038753383], [0.7014884038753383, 0.9999999999999998]]
non_zero_features>>
 ['Xele.ptg000045l.2', 'Xele.ptg000045l.6', 'Xele.ptg000045l.7', 'Xele.ptg000045l.8', 'Xele.ptg000045l.10', 'Xele.ptg000045l.11', 'Xele.ptg000045l.14', 'Xele.ptg000045l.16', 'Xele.ptg000045l.19', 'Xele.ptg000045l.20', 'Xele.ptg000045l.21', 'Xele.ptg000045l.22', 'Xele.ptg000045l.23', 'Xele.ptg000045l.24', 'Xele.ptg000045l.25', 'Xele.ptg000045l.26', 'Xele.ptg000045l.27', 'Xele.ptg000045l.28', 'Xele.ptg000045l.29', 'Xele.ptg000045l.31']
scores for each fold>>
 [0.016478885875355398, -4.182021281182931, -11.448364431350013]
43


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


best parameter from gridsearch>>
 {'regressor__alpha': 0.04000000000000001, 'regressor__fit_intercept': True}
KFold(n_splits=3, random_state=43, shuffle=True)
r2
correlation Matrix>>
 [[0.9999999999999998, 0.7014884038753383], [0.7014884038753383, 0.9999999999999998]]
non_zero_features>>
 ['Xele.ptg000045l.2', 'Xele.ptg000045l.6', 'Xele.ptg000045l.7', 'Xele.ptg000045l.8', 'Xele.ptg000045l.10', 'Xele.ptg000045l.11', 'Xele.ptg000045l.14', 'Xele.ptg000045l.16', 'Xele.ptg000045l.19', 'Xele.ptg000045l.20', 'Xele.ptg000045l.21', 'Xele.ptg000045l.22', 'Xele.ptg000045l.23', 'Xele.ptg000045l.24', 'Xele.ptg000045l.25', 'Xele.ptg000045l.26', 'Xele.ptg000045l.27', 'Xele.ptg000045l.28', 'Xele.ptg000045l.29', 'Xele.ptg000045l.31']
scores for each fold>>
 [-35.58877810523797, -0.11621891218626157, -6.587277960206546]
best estimator>>
 found in split: 42
 param_grid: {'regressor__alpha': 0.04000000000000001, 'regressor__fit_intercept': True}
 mean fold score -5.204635608885863
Lactobionic acid_L	Lacto

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


best parameter from gridsearch>>
 {'regressor__alpha': 0.04000000000000001, 'regressor__fit_intercept': True}
KFold(n_splits=3, random_state=42, shuffle=True)
r2
correlation Matrix>>
 [[1.0, 0.7189461790622763], [0.7189461790622763, 1.0]]
non_zero_features>>
 ['Xele.ptg000045l.1', 'Xele.ptg000045l.2', 'Xele.ptg000045l.6', 'Xele.ptg000045l.7', 'Xele.ptg000045l.8', 'Xele.ptg000045l.9', 'Xele.ptg000045l.10', 'Xele.ptg000045l.11', 'Xele.ptg000045l.14', 'Xele.ptg000045l.15', 'Xele.ptg000045l.16', 'Xele.ptg000045l.18', 'Xele.ptg000045l.19', 'Xele.ptg000045l.20', 'Xele.ptg000045l.21', 'Xele.ptg000045l.23', 'Xele.ptg000045l.24', 'Xele.ptg000045l.25', 'Xele.ptg000045l.26', 'Xele.ptg000045l.27', 'Xele.ptg000045l.29', 'Xele.ptg000045l.31']
scores for each fold>>
 [-16.986998412105784, -0.6720048896506481, -2.7390815044946666]
43


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


best parameter from gridsearch>>
 {'regressor__alpha': 0.04000000000000001, 'regressor__fit_intercept': True}
KFold(n_splits=3, random_state=43, shuffle=True)
r2
correlation Matrix>>
 [[1.0, 0.7189461790622763], [0.7189461790622763, 1.0]]
non_zero_features>>
 ['Xele.ptg000045l.1', 'Xele.ptg000045l.2', 'Xele.ptg000045l.6', 'Xele.ptg000045l.7', 'Xele.ptg000045l.8', 'Xele.ptg000045l.9', 'Xele.ptg000045l.10', 'Xele.ptg000045l.11', 'Xele.ptg000045l.14', 'Xele.ptg000045l.15', 'Xele.ptg000045l.16', 'Xele.ptg000045l.18', 'Xele.ptg000045l.19', 'Xele.ptg000045l.20', 'Xele.ptg000045l.21', 'Xele.ptg000045l.23', 'Xele.ptg000045l.24', 'Xele.ptg000045l.25', 'Xele.ptg000045l.26', 'Xele.ptg000045l.27', 'Xele.ptg000045l.29', 'Xele.ptg000045l.31']
scores for each fold>>
 [-5.2626857739284105, -1.980044492976956, -36.62605556012134]
best estimator>>
 found in split: 42
 param_grid: {'regressor__alpha': 0.04000000000000001, 'regressor__fit_intercept': True}
 mean fold score -6.7993616020836996
Glucaric aci

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


best parameter from gridsearch>>
 {'regressor__alpha': 0.04000000000000001, 'regressor__fit_intercept': True}
KFold(n_splits=3, random_state=42, shuffle=True)
r2
correlation Matrix>>
 [[1.0, 0.8400108426579092], [0.8400108426579093, 0.9999999999999999]]
non_zero_features>>
 ['Xele.ptg000212l.1', 'Xele.ptg000045l.1', 'Xele.ptg000045l.2', 'Xele.ptg000045l.7', 'Xele.ptg000045l.8', 'Xele.ptg000045l.9', 'Xele.ptg000045l.10', 'Xele.ptg000045l.11', 'Xele.ptg000045l.14', 'Xele.ptg000045l.15', 'Xele.ptg000045l.16', 'Xele.ptg000045l.18', 'Xele.ptg000045l.20', 'Xele.ptg000045l.21', 'Xele.ptg000045l.23', 'Xele.ptg000045l.24', 'Xele.ptg000045l.25', 'Xele.ptg000045l.26', 'Xele.ptg000045l.27', 'Xele.ptg000045l.31']
scores for each fold>>
 [-0.8977623184675754, -0.04158066831611085, -0.07087881920222538]
43
best parameter from gridsearch>>
 {'regressor__alpha': 0.04000000000000001, 'regressor__fit_intercept': True}
KFold(n_splits=3, random_state=43, shuffle=True)
r2
correlation Matrix>>
 [[1.0, 0.8400

  model = cd_fast.enet_coordinate_descent(


best parameter from gridsearch>>
 {'regressor__alpha': 0.04000000000000001, 'regressor__fit_intercept': True}
KFold(n_splits=3, random_state=43, shuffle=True)
r2
correlation Matrix>>
 [[0.9999999999999999, 0.9222543574788268], [0.9222543574788268, 1.0]]
non_zero_features>>
 ['Xele.ptg000212l.1', 'Xele.ptg000045l.1', 'Xele.ptg000045l.2', 'Xele.ptg000045l.6', 'Xele.ptg000045l.7', 'Xele.ptg000045l.8', 'Xele.ptg000045l.9', 'Xele.ptg000045l.10', 'Xele.ptg000045l.11', 'Xele.ptg000045l.14', 'Xele.ptg000045l.15', 'Xele.ptg000045l.16', 'Xele.ptg000045l.18', 'Xele.ptg000045l.19', 'Xele.ptg000045l.20', 'Xele.ptg000045l.22', 'Xele.ptg000045l.23', 'Xele.ptg000045l.24', 'Xele.ptg000045l.25']
scores for each fold>>
 [-0.28138115975926525, 0.4450510589027251, -0.021000732624241625]
best estimator>>
 found in split: 42
 param_grid: {'regressor__alpha': 0.04000000000000001, 'regressor__fit_intercept': True}
 mean fold score 0.3566731723814584
5-hydroxy ferulic acid glucoside_2_L	_5_hydroxy_ferulic_acid_

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

best parameter from gridsearch>>
 {'regressor__alpha': 0.04000000000000001, 'regressor__fit_intercept': True}
KFold(n_splits=3, random_state=42, shuffle=True)
r2
correlation Matrix>>
 [[0.9999999999999999, 0.9246488261436769], [0.9246488261436769, 0.9999999999999999]]
non_zero_features>>
 ['Xele.ptg000212l.1', 'Xele.ptg000045l.1', 'Xele.ptg000045l.6', 'Xele.ptg000045l.7', 'Xele.ptg000045l.8', 'Xele.ptg000045l.9', 'Xele.ptg000045l.10', 'Xele.ptg000045l.11', 'Xele.ptg000045l.14', 'Xele.ptg000045l.15', 'Xele.ptg000045l.16', 'Xele.ptg000045l.19', 'Xele.ptg000045l.20', 'Xele.ptg000045l.21', 'Xele.ptg000045l.22', 'Xele.ptg000045l.23', 'Xele.ptg000045l.24', 'Xele.ptg000045l.25', 'Xele.ptg000045l.26', 'Xele.ptg000045l.29']
scores for each fold>>
 [-4.104975363888796, -1.165788162879394, -4.22887292990323]
43


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


best parameter from gridsearch>>
 {'regressor__alpha': 0.04000000000000001, 'regressor__fit_intercept': True}
KFold(n_splits=3, random_state=43, shuffle=True)
r2
correlation Matrix>>
 [[0.9999999999999999, 0.9246488261436769], [0.9246488261436769, 0.9999999999999999]]
non_zero_features>>
 ['Xele.ptg000212l.1', 'Xele.ptg000045l.1', 'Xele.ptg000045l.6', 'Xele.ptg000045l.7', 'Xele.ptg000045l.8', 'Xele.ptg000045l.9', 'Xele.ptg000045l.10', 'Xele.ptg000045l.11', 'Xele.ptg000045l.14', 'Xele.ptg000045l.15', 'Xele.ptg000045l.16', 'Xele.ptg000045l.19', 'Xele.ptg000045l.20', 'Xele.ptg000045l.21', 'Xele.ptg000045l.22', 'Xele.ptg000045l.23', 'Xele.ptg000045l.24', 'Xele.ptg000045l.25', 'Xele.ptg000045l.26', 'Xele.ptg000045l.29']
scores for each fold>>
 [-1.0755665362666567, -1.1255459715739398, 0.30194416998220364]
best estimator>>
 found in split: 43
 param_grid: {'regressor__alpha': 0.04000000000000001, 'regressor__fit_intercept': True}
 mean fold score -0.6330561126194643
Quercitin-3-O-Glucoside_

  model = cd_fast.enet_coordinate_descent(


best parameter from gridsearch>>
 {'regressor__alpha': 0.04000000000000001, 'regressor__fit_intercept': True}
KFold(n_splits=3, random_state=42, shuffle=True)
r2
correlation Matrix>>
 [[1.0, 0.9452524036842073], [0.9452524036842072, 1.0]]
non_zero_features>>
 ['Xele.ptg000212l.1', 'Xele.ptg000045l.1', 'Xele.ptg000045l.6', 'Xele.ptg000045l.7', 'Xele.ptg000045l.9', 'Xele.ptg000045l.11', 'Xele.ptg000045l.14', 'Xele.ptg000045l.15', 'Xele.ptg000045l.16', 'Xele.ptg000045l.18', 'Xele.ptg000045l.22', 'Xele.ptg000045l.23', 'Xele.ptg000045l.25', 'Xele.ptg000045l.26', 'Xele.ptg000045l.29', 'Xele.ptg000045l.31']
scores for each fold>>
 [-0.8216504779782214, 0.3400300203202755, 0.5747356597924327]
43
best parameter from gridsearch>>
 {'regressor__alpha': 0.0175, 'regressor__fit_intercept': True}
KFold(n_splits=3, random_state=43, shuffle=True)
r2
correlation Matrix>>
 [[1.0, 0.9554192684529924], [0.9554192684529924, 1.0]]
non_zero_features>>
 ['Xele.ptg000212l.1', 'Xele.ptg000045l.1', 'Xele.ptg0000

  model = cd_fast.enet_coordinate_descent(
  c /= stddev[:, None]


best parameter from gridsearch>>
 {'regressor__alpha': 0.0125, 'regressor__fit_intercept': True}
KFold(n_splits=3, random_state=42, shuffle=True)
r2
correlation Matrix>>
 [[nan, nan], [nan, nan]]
non_zero_features>>
 []
scores for each fold>>
 [0.0, 0.0, 0.0]
43


  model = cd_fast.enet_coordinate_descent(
  c /= stddev[:, None]
  model = cd_fast.enet_coordinate_descent(


best parameter from gridsearch>>
 {'regressor__alpha': 0.0125, 'regressor__fit_intercept': True}
KFold(n_splits=3, random_state=43, shuffle=True)
r2
correlation Matrix>>
 [[nan, nan], [nan, nan]]
non_zero_features>>
 []
scores for each fold>>
 [0.0, 0.0, 0.0]
best estimator>>
 found in split: 42
 param_grid: {'regressor__alpha': 0.0125, 'regressor__fit_intercept': True}
 mean fold score 0.0
Naringenin chalcone_L	Naringenin_chalcone_L
The path ./py/10xKfold/test_full/ exists.
42
best parameter from gridsearch>>
 {'regressor__alpha': 0.035, 'regressor__fit_intercept': True}
KFold(n_splits=3, random_state=42, shuffle=True)
r2
correlation Matrix>>
 [[1.0, 0.9682577321398181], [0.9682577321398183, 1.0]]
non_zero_features>>
 ['Xele.ptg000212l.1', 'Xele.ptg000045l.2', 'Xele.ptg000045l.6', 'Xele.ptg000045l.7', 'Xele.ptg000045l.9', 'Xele.ptg000045l.11', 'Xele.ptg000045l.14', 'Xele.ptg000045l.15', 'Xele.ptg000045l.16', 'Xele.ptg000045l.19', 'Xele.ptg000045l.20', 'Xele.ptg000045l.22', 'Xele.ptg00

In [86]:
# Driver cell: run the repeated K-fold Lasso grid search for every GC-MS target
# and store the target-name mapping next to the results.

# Random seeds, one repeated-CV run per seed. Shortened to two seeds for a quick
# test run; the full experiment used [42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52].
# NOTE(review): that full list holds 11 seeds (42..52), not 10 — confirm intent.
tenX = [42, 43]

out = './py/10xKfold/test_full/'
param_grid = {
    # 12 evenly spaced alphas from 0.0125 to 0.04 (inclusive). linspace fixes the
    # number of grid points explicitly; a float-step arange can gain or lose its
    # last element through rounding and yields values like 0.04000000000000001.
    'regressor__alpha': np.linspace(0.0125, 0.04, num=12),
    'regressor__fit_intercept': [True, False],
}

# NOTE(review): row i of gcms_mut is assumed to correspond to the i-th key of
# gcms_target_dict (positional alignment) — verify where both are built.
# orig_str is not used in this loop.
for i, (gcms_target, orig_str) in enumerate(gcms_target_dict.items()):
    # Column 0 of gcms_mut is printed as the row label next to the dict key.
    print(f"{gcms_mut.iloc[i,0]}\t{gcms_target}")
    # Only the first 25 feature columns of X are used; the remaining columns of
    # the gcms_mut row are passed as the target values. Scoring metric is r2.
    # tmp_10xKfold is overwritten on every iteration (only the last result stays).
    tmp_10xKfold = nX_cross_validation(
        X.iloc[:, 0:25],
        gcms_mut.iloc[i, 1:],
        param_grid,
        'r2',
        str(gcms_target),
        random_states=tenX,
        output_path=out,
    )


# Make sure the output directory exists even if the loop above never ran
# (e.g. empty gcms_target_dict), then persist the target-name mapping.
os.makedirs(out, exist_ok=True)
with open(os.path.join(out, "gcms_dict_nXcv.json"), 'w') as file:
    json.dump(gcms_target_dict, file)

6-phospho-gluconate	_6_phospho_gluconate
The path ./py/10xKfold/test_full/ exists.
42
best parameter from gridsearch>>
 {'regressor__alpha': 0.04000000000000001, 'regressor__fit_intercept': True}
KFold(n_splits=3, random_state=42, shuffle=True)
r2
correlation Matrix>>
 [[1.0, 0.7944558762693695], [0.7944558762693695, 1.0]]
non_zero_features>>
 ['Xele.ptg000045l.2', 'Xele.ptg000045l.6', 'Xele.ptg000045l.7', 'Xele.ptg000045l.9', 'Xele.ptg000045l.14', 'Xele.ptg000045l.16', 'Xele.ptg000045l.19', 'Xele.ptg000045l.20', 'Xele.ptg000045l.23']
scores for each fold>>
 [0.26745080031097324, -2.332077024869586, 0.12147071758762451]
43
best parameter from gridsearch>>
 {'regressor__alpha': 0.037500000000000006, 'regressor__fit_intercept': True}
KFold(n_splits=3, random_state=43, shuffle=True)
r2
correlation Matrix>>
 [[1.0, 0.8053983639204454], [0.8053983639204454, 1.0]]
non_zero_features>>
 ['Xele.ptg000045l.2', 'Xele.ptg000045l.6', 'Xele.ptg000045l.7', 'Xele.ptg000045l.9', 'Xele.ptg000045l.14', '

  c /= stddev[:, None]
  c /= stddev[None, :]


best parameter from gridsearch>>
 {'regressor__alpha': 0.04000000000000001, 'regressor__fit_intercept': True}
KFold(n_splits=3, random_state=42, shuffle=True)
r2
correlation Matrix>>
 [[1.0, nan], [nan, nan]]
non_zero_features>>
 []
scores for each fold>>
 [-0.004298633957063203, -0.0363175658433561, -0.04032403038081789]
43


  c /= stddev[:, None]
  c /= stddev[None, :]


best parameter from gridsearch>>
 {'regressor__alpha': 0.04000000000000001, 'regressor__fit_intercept': True}
KFold(n_splits=3, random_state=43, shuffle=True)
r2
correlation Matrix>>
 [[1.0, nan], [nan, nan]]
non_zero_features>>
 []
scores for each fold>>
 [-0.05280050910087053, -0.008255054544565876, -0.09440393032603644]
asparagine_188_216_rt7.84	asparagine_188_216_rt7_84
The path ./py/10xKfold/test_full/ exists.
42
best parameter from gridsearch>>
 {'regressor__alpha': 0.04000000000000001, 'regressor__fit_intercept': True}
KFold(n_splits=3, random_state=42, shuffle=True)
r2
correlation Matrix>>
 [[1.0, 0.6809271779333511], [0.6809271779333512, 1.0]]
non_zero_features>>
 ['Xele.ptg000045l.9', 'Xele.ptg000045l.11', 'Xele.ptg000045l.15', 'Xele.ptg000045l.26']
scores for each fold>>
 [0.0880711476285565, 0.30168979960303255, -0.03210707748502828]
43
best parameter from gridsearch>>
 {'regressor__alpha': 0.04000000000000001, 'regressor__fit_intercept': True}
KFold(n_splits=3, random_stat

  c /= stddev[:, None]
  c /= stddev[None, :]


best parameter from gridsearch>>
 {'regressor__alpha': 0.04000000000000001, 'regressor__fit_intercept': True}
KFold(n_splits=3, random_state=42, shuffle=True)
r2
correlation Matrix>>
 [[1.0, nan], [nan, nan]]
non_zero_features>>
 []
scores for each fold>>
 [-0.045283720084051904, -0.15629695850055159, -0.21773994690683662]
43


  c /= stddev[:, None]
  c /= stddev[None, :]


best parameter from gridsearch>>
 {'regressor__alpha': 0.04000000000000001, 'regressor__fit_intercept': True}
KFold(n_splits=3, random_state=43, shuffle=True)
r2
correlation Matrix>>
 [[1.0, nan], [nan, nan]]
non_zero_features>>
 []
scores for each fold>>
 [-0.05497835525964878, -0.002843342603738197, -0.03339588403442173]
ornithine_142_174_rt9.34	ornithine_142_174_rt9_34
The path ./py/10xKfold/test_full/ exists.
42
best parameter from gridsearch>>
 {'regressor__alpha': 0.04000000000000001, 'regressor__fit_intercept': True}
KFold(n_splits=3, random_state=42, shuffle=True)
r2
correlation Matrix>>
 [[1.0, 0.47270363737815185], [0.47270363737815185, 1.0]]
non_zero_features>>
 ['Xele.ptg000045l.2', 'Xele.ptg000045l.22']
scores for each fold>>
 [-0.004913635257368654, -0.024104634690445437, -0.023257355764515708]
43
best parameter from gridsearch>>
 {'regressor__alpha': 0.04000000000000001, 'regressor__fit_intercept': True}
KFold(n_splits=3, random_state=43, shuffle=True)
r2
correlation Mat

  c /= stddev[:, None]
  c /= stddev[None, :]


best parameter from gridsearch>>
 {'regressor__alpha': 0.04000000000000001, 'regressor__fit_intercept': True}
KFold(n_splits=3, random_state=42, shuffle=True)
r2
correlation Matrix>>
 [[1.0, nan], [nan, nan]]
non_zero_features>>
 []
scores for each fold>>
 [-0.034497406234753836, -0.0009491516250772492, -0.17546231409854074]
43
best parameter from gridsearch>>
 {'regressor__alpha': 0.020000000000000004, 'regressor__fit_intercept': True}
KFold(n_splits=3, random_state=43, shuffle=True)
r2
correlation Matrix>>
 [[1.0, 0.46585360834220646], [0.4658536083422065, 1.0]]
non_zero_features>>
 ['Xele.ptg000045l.7', 'Xele.ptg000045l.22', 'Xele.ptg000045l.26']
scores for each fold>>
 [0.0022308550934807547, -0.005687139489020554, 0.02552055735060288]
proline_142_130_rt.5.53	proline_142_130_rt_5_53
The path ./py/10xKfold/test_full/ exists.
42
best parameter from gridsearch>>
 {'regressor__alpha': 0.04000000000000001, 'regressor__fit_intercept': True}
KFold(n_splits=3, random_state=42, shuffle=True