In [112]:
from sklearn.model_selection import cross_val_score, LeaveOneOut
from sklearn.preprocessing import StandardScaler as zscore # zscore
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Lasso #LRlasso
from collections import OrderedDict
from joblib import dump, load #to save models in files
from sklearn.model_selection import KFold
import pandas as pd
import numpy as np
import re
import json
import os

In [113]:
from sklearn.model_selection import GridSearchCV
def gridcv(X, y, model, param_grid, naimpute=False, prepy=True, scorer = 'neg_mean_squared_error', cv_meth = LeaveOneOut()):
    """
    Perform Cross-Validation (defaukt: LOOCV) with hyperparameter tuning using GridSearchCV.
    
    Parameters:
    ----------
    X : pandas DataFrame or numpy array
        The feature matrix.
        
    y : pandas Series or numpy array
        The target variable.
        
    model : scikit-learn estimator
        The machine learning model to be used, should be an uninitialized model instance 
        (e.g., Lasso(), not Lasso(alpha=1.0)).
        
    param_grid : dict
        Dictionary containing the hyperparameters to be tuned and their possible values. 
        The keys should be prefixed with 'regressor__' to work with the pipeline.
        
    naimpute : bool, optional (default=False)
        Toggle imputation for missing values. 
        Currently not implemented; will print a message and return 0 if set to True.
        
    prepy : bool, optional (default=True)
        Toggle preprocessing target variable 'y' by setting any negative values to zero.
        
    scorer : str, callable, or None, optional (default='neg_mean_squared_error')
        A string or a scorer callable object / function with signature scorer(estimator, X, y). 
        For valid scoring strings, see the scikit-learn documentation.
        
    cv_meth : cross-validation generator, optional (default=LeaveOneOut())
        A cross-validation splitting strategy. 
        Possible inputs for cv are integers to specify the number of folds in a (Stratified)KFold, 
        CV splitter, cross-validation generator iterators, or an iterable.
        
    Returns:
    -------
    overall_metric : dict
        Dictionary containing the overall metrics and other details from the GridSearchCV.
        
    out_model : GridSearchCV object
        Fitted GridSearchCV object.
        
    best_params : dict
        Dictionary containing the best hyperparameters found by GridSearchCV.

    Call:
    ------
    from sklearn.model_selection import KFold

    # set up KFold cross-validator
    kfold_cv = KFold(n_splits=3, shuffle=True, random_state=42)

    param_grid = {
        'regressor__alpha': np.array(np.arange(0.0125, 0.0425, 0.0025)),
        'regressor__fit_intercept': [True, False]
    }
    print(param_grid)

    # Call the gridcv function with KFold as the cross-validation method
    lasso_fullkfold_scores, lasso_fullkfold_model, best_param = gridcv(
        X, 
        y,
        Lasso(max_iter=4000),
        param_grid,
        scorer='r2', 
        cv_meth=kfold_cv
    )
    dump(lasso_fullkfold_model, './models/lasso_fullkfold_model.pkl') # save the model as .pkl
    """

    # overall_metric = {'CV': cv_meth, 'scoring_metric': scorer} originally
    overall_metric = {'CV': str(cv_meth), 'scoring_metric': str(scorer)} # transformed to string because json dump scores later

    if prepy:
        y[y < 0] = 0
    
    if naimpute:
      print("not implemented")
      return 0


    pipeline = Pipeline([
        ('scaler', zscore()), 
        ('regressor', model)        # Regression model
    ])

    
    # declaring an Grid object
    # score : https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter
    out_model = GridSearchCV(pipeline, param_grid=param_grid, cv=cv_meth, scoring=scorer).fit(X,y)
    # GridSearchCV need the regressor__ prefix for the pipiline object in the para_grid later when called

    best_pipeline = out_model.best_estimator_
    y_pred = best_pipeline.predict(X)

    overall_metric['correlation_true_pred'] = list(np.corrcoef(list(y), list(y_pred)))
    overall_metric['correlation_true_pred'][0] = list(overall_metric['correlation_true_pred'][0])
    overall_metric['correlation_true_pred'][1] = list(overall_metric['correlation_true_pred'][1])


    # LOOCV folds: split{i}_test_score (number of data points minus one) 
    overall_metric['fold_scores'] = [out_model.cv_results_[f'split{i}_test_score'][out_model.best_index_] for i in range(out_model.n_splits_)]
    best_params = out_model.best_params_


    # printing section
    print("best parameter from gridsearch>>\n", out_model.best_params_)
    print(overall_metric['CV'])
    print(overall_metric['scoring_metric'])
    print("correlation Matrix>>\n", overall_metric['correlation_true_pred'])
    print("scores for each fold>>\n",overall_metric['fold_scores'])

    if str(model).startswith("Lasso"):
        # access the 'regressor' step from the best pipeline and then its coefficients
        coefficients = best_pipeline.named_steps['regressor'].coef_
        overall_metric['non_zero_coefficients'] = coefficients[coefficients != 0]
        overall_metric['non_zero_coefficients'] = overall_metric['non_zero_coefficients'].tolist()
        overall_metric['non_zero_features'] = list(X.columns[np.where(coefficients != 0)[0]])
        print("non_zero_features>>\n",overall_metric['non_zero_features'])

    if str(model).startswith("RandomForestRegressor"):
        
        feature_names = X.columns
        feature_importances = best_pipeline.named_steps['regressor'].feature_importances_
        feature_importance_dict = dict(zip(feature_names, feature_importances))
        sorted_feature_importance = sorted(feature_importance_dict.items(), key=lambda x: x[1], reverse=True)
        overall_metric['feature_importances'] = OrderedDict(sorted_feature_importance)
        print("feature_importances>>\n",overall_metric['feature_importances'])
       

    return overall_metric, out_model, best_params


In [None]:
def nX_cross_validation2(X, target, param_grid, scorer_estimate, output_prefix, random_states, output_path='./models/10xKfold/', n_splits=3):
    if os.path.exists(output_path):
        print(f"The path {output_path} exists.")
    else:
        print(f"The path {output_path} does not exist.")
        raise FileNotFoundError(f"The path {output_path} does not exist.")

    cv_results = {'random_state': [], 'scores': {}, 'mean_scores': [], 'common_features': {}, 'model': {}}
    for ran_state in random_states:
        print(ran_state)
        kfold_cv = KFold(n_splits=n_splits, shuffle=True, random_state=ran_state)
        scores, model, best_param = gridcv(
            X, 
            target,
            Lasso(),
            param_grid,
            prepy=False,
            scorer=scorer_estimate, 
            cv_meth=kfold_cv
        )
        cv_results['random_state'].append(ran_state)
        cv_results['scores'][ran_state] = scores
        cv_results['mean_scores'].append(np.mean(scores['fold_scores']))
        # cv_results['model'][ran_state] = model

    # Determine common features...
    cv_results['common_features'] = set(cv_results['scores'][42]['non_zero_features'])
    for r in cv_results['random_state'][1:]:
        current_features = set(cv_results['scores'][r]['non_zero_features'])
        cv_results['common_features'] = cv_results['common_features'].intersection(current_features)
    cv_results['common_features'] = list(cv_results['common_features'])

    #save to json
    with open(f"{output_path}{output_prefix}_nXcv.json", 'w') as file:
       json.dump(cv_results, file)
    file.close()

    return cv_results




In [166]:
def nX_cross_validation(X, target, param_grid, scorer_estimate, output_prefix, random_states, output_path='./models/10xKfold/', n_splits=3):
    if os.path.exists(output_path):
        print(f"The path {output_path} exists.")
    else:
        print(f"The path {output_path} does not exist.")
        raise FileNotFoundError(f"The path {output_path} does not exist.")

    cv_results = {'random_state': [], 'scores': {}, 'mean_scores': [], 'selected_features': {}, 'common_features': {}, 'model': {}}
    for ran_state in random_states:
        print(ran_state)
        kfold_cv = KFold(n_splits=n_splits, shuffle=True, random_state=ran_state)
        scores, model, best_param = gridcv(
            X, 
            target,
            RandomForestRegressor(),
            param_grid,
            prepy=False,
            scorer=scorer_estimate, 
            cv_meth=kfold_cv
        )
        cv_results['random_state'].append(ran_state)

        cv_results['scores'][ran_state] = scores
        cv_results['mean_scores'].append(np.mean(scores['fold_scores']))
        
        if np.mean(scores['fold_scores']) > 0.3:
            print(f"\n >> TRUE, mean fold scores {np.mean(scores['fold_scores'])} is bigger than tresh << \n")
            # select feature based on cumulative importance
            cumulative_importance = 0.0
            selected_features = []
            for feature, importance in scores['feature_importances'].items():
                cumulative_importance += importance
                selected_features.append(feature)
                if cumulative_importance >= 0.95:
                    break
            cv_results['selected_features'][ran_state] = selected_features
        # cv_results['model'][ran_state] = model

    
    # Determine common features selected on cumulative importance
    first_key = list(cv_results['selected_features'])[0]
    cv_results['common_features'] = set(cv_results['selected_features'][first_key])

    for r in list(cv_results['selected_features'].keys())[1:]:
        current_features = set(cv_results['selected_features'][r])
        cv_results['common_features'] = cv_results['common_features'].intersection(current_features)
    cv_results['common_features'] = list(cv_results['common_features'])

    #save to json
    with open(f"{output_path}{output_prefix}_nXcv.json", 'w') as file:
       json.dump(cv_results, file)
    file.close()
    
    return cv_results




In [115]:
def to_valid_variable_name(name):
    # Replace special characters with underscores
    name = re.sub(r'\W|^(?=\d)', '_', name)
    # Reduce multiple consecutive underscores to one
    name = re.sub(r'_{2,}', '_', name)
    # Truncate length if necessary
    max_length = 30
    if len(name) > max_length:
        name = name[:max_length]
    # Ensure it doesn't start with a number
    if name[0].isdigit():
        name = "_" + name
    return name

In [38]:
tr_mut = pd.read_csv("/home/t44p/PW_rawdata/tr_gc_mutual/tr_mut.csv", sep=",")
gcms_mut = pd.read_csv("/home/t44p/PW_rawdata/tr_gc_mutual/gcms_mut.csv", sep=",")
lcms_mut = pd.read_csv("/home/t44p/PW_rawdata/tr_gc_mutual/lcms_mut.csv", sep=",")

X = pd.read_csv("/home/t44p/PW_rawdata/tr_gc_mutual/tr_mut_transposed.csv", sep=",")

In [40]:
"""from sklearn.datasets import make_regression
X, y = make_regression(n_features=4, n_informative=2,
                       random_state=0, shuffle=False)"""
regr = RandomForestRegressor(max_depth=2, random_state=0)
regr.fit(X.iloc[:,:10], gcms_mut.iloc[59,1:])
print(regr.predict(X.iloc[:,:10]))

[0.0844354  0.27029761 0.24435731 0.25930299 0.31700472 0.37935929
 0.01227968 0.25631101 0.29332108 0.32620919 0.27003129 0.3508714
 0.38192    0.08663085 0.25436918 0.29424821 0.29044632 0.28093038
 0.35387837 0.35452822 0.29928585 0.42814777 0.41850387 0.40778452
 0.35611601 0.33236527 0.32649929 0.41752302 0.41007262 0.38604956
 0.38263844 0.32426934 0.42306265 0.43122422 0.42213342 0.35779345
 0.35602408 0.32028502 0.39819233 0.43004471]


In [123]:
kfold_cv = KFold(n_splits=3, shuffle=True, random_state=42)

param_grid = {
    'regressor__alpha': np.array(np.arange(0.0125, 0.0425, 0.0025)),
    'regressor__fit_intercept': [True, False]
}   
lasso_fullkfold_scores, lasso_fullkfold_model, lasso_best_param = gridcv(
    X.iloc[:,0:10], 
    gcms_mut.iloc[59,1:],
    Lasso(max_iter=4000),
    param_grid,
    scorer='r2', 
    cv_meth=kfold_cv
)

best parameter from gridsearch>>
 {'regressor__alpha': 0.0175, 'regressor__fit_intercept': True}
KFold(n_splits=3, random_state=42, shuffle=True)
r2
correlation Matrix>>
 [[1.0, 0.7481332260894714], [0.7481332260894714, 0.9999999999999998]]
scores for each fold>>
 [0.7720139897938894, 0.3313252228931619, 0.25279959004526065]
non_zero_features>>
 ['Xele.ptg000045l.1', 'Xele.ptg000045l.6', 'Xele.ptg000045l.8', 'Xele.ptg000045l.10']


In [69]:
feature_names = X.columns
feature_importances = regr.feature_importances_
feature_importance_dict = dict(zip(feature_names, feature_importances))

# Optionally, you might want to sort this dictionary based on importances
sorted_feature_importance = OrderedDict(sorted(feature_importance_dict.items(), key=lambda x: x[1], reverse=True))

print(type(sorted_feature_importance))


<class 'collections.OrderedDict'>


In [104]:
kfold_cv = KFold(n_splits=3, shuffle=True, random_state=42)

param_grid = {
    'regressor__n_estimators': np.array(np.arange(2, 15, 1)),
    'regressor__max_features': np.array(np.arange(2, 10, 1))
}   
rfr_fullkfold_scores, rfr_fullkfold_model, rfr_best_param = gridcv(
    X.iloc[:,0:1000], 
    gcms_mut.iloc[59,1:],
    RandomForestRegressor(),
    param_grid,
    scorer='r2', 
    cv_meth=kfold_cv
)
#for key, value in rfr_fullkfold_scores.items():
#    print(f"{key} >>>>\n {value}\n\n")

best parameter from gridsearch>>
 {'regressor__max_features': 4, 'regressor__n_estimators': 8}
KFold(n_splits=3, random_state=42, shuffle=True)
r2
correlation Matrix>>
 [[1.0, 0.9460937748832431], [0.9460937748832432, 1.0]]
scores for each fold>>
 [0.38941522592499245, 0.49696222787038236, 0.3799830813119881]
feature_importances>>
 OrderedDict([('Xele.ptg000045l.169', 0.07208411929780768), ('Xele.ptg000037l.69', 0.06292678820869439), ('Xele.ptg000045l.7', 0.060756550840301175), ('Xele.ptg000011l.440', 0.060019652459365525), ('Xele.ptg000011l.383', 0.054512034415653096), ('Xele.ptg000011l.236', 0.05327827220567377), ('Xele.ptg000011l.93', 0.03746056980894684), ('Xele.ptg000045l.88', 0.0356973138771508), ('Xele.ptg000045l.188', 0.03364640034544176), ('Xele.ptg000045l.252', 0.033402446510724576), ('Xele.ptg000037l.181', 0.025868372915816486), ('Xele.ptg000037l.234', 0.025790207078347584), ('Xele.ptg000045l.121', 0.024496885395953946), ('Xele.ptg000011l.451', 0.023967958537301676), ('Xele.

In [167]:
print(f"RFR START >>> {to_valid_variable_name(str(gcms_mut.iloc[59,0]))}\n\n")
tenX = [42, 43, 44]#, 45, 46, 47, 48, 49, 50, 51, 52]
out = './py/10xKfold/test_rfr/'
param_grid = {
    'regressor__n_estimators': np.array(np.arange(10, 15, 1)),
    'regressor__max_features': np.array(np.arange(7, 10, 1)),
    'regressor__bootstrap': [False, True]
}   
sucrose_10xKfold = nX_cross_validation(X.iloc[:,:], gcms_mut.iloc[59,1:], param_grid, 'r2', to_valid_variable_name(str(gcms_mut.iloc[59,0])), tenX, output_path=out)


RFR START >>> sucrose_437_361_rt13_77


The path ./py/10xKfold/test_rfr/ exists.
42
best parameter from gridsearch>>
 {'regressor__bootstrap': False, 'regressor__max_features': 7, 'regressor__n_estimators': 14}
KFold(n_splits=3, random_state=42, shuffle=True)
r2
correlation Matrix>>
 [[1.0, 0.9999999999999998], [0.9999999999999998, 1.0]]
scores for each fold>>
 [0.4005141944600831, 0.5886815481823096, 0.14790873728694187]
feature_importances>>
 OrderedDict([('Xele.ptg000011l.23', 0.03719601172680732), ('Xele.ptg000001l.104', 0.03719601172680732), ('Xele.ptg000002l.318', 0.0371960117268073), ('Xele.ptg000012l.397', 0.03569926452413251), ('Xele.ptg000003l.1568', 0.035699264524132454), ('Xele.ptg000012l.924', 0.03407046834573952), ('Xele.ptg000046l.264', 0.03363274803477348), ('Xele.ptg000022l.993', 0.031810088024038195), ('Xele.ptg000024l.174', 0.030905543749592066), ('Xele.ptg000037l.213', 0.025703164528986525), ('Xele.ptg000046l.359', 0.02392945524261029), ('Xele.ptg000063l.86', 0.0176

In [168]:
#for ele in sucrose_10xKfold['selected_features']:
#    print(f"{sucrose_10xKfold['selected_features'][ele]}")
sucrose_10xKfold['mean_scores']
sucrose_10xKfold['common_features']
#sucrose_10xKfold['selected_features']

[]

In [105]:
med_sel_feets = []
for key, val in rfr_fullkfold_scores['feature_importances'].items():
    if val >= np.median(list(rfr_fullkfold_scores['feature_importances'].values())):
        med_sel_feets.append(key)


cumulative_importance = 0.0
selected_features = []

for feature, importance in rfr_fullkfold_scores['feature_importances'].items():
    cumulative_importance += importance
    selected_features.append(feature)
    if cumulative_importance >= 0.95:
        break

print(f"features >>\n {len(rfr_fullkfold_scores['feature_importances'].keys())}\nmedian >> \n {np.median(list(rfr_fullkfold_scores['feature_importances'].values()))}\n med_sel_feat>>\n{len(med_sel_feets)}\nselected features >> \n{len(selected_features)}")

features >>
 1000
median >> 
 0.0
 med_sel_feat>>
1000
selected features >> 
62


In [139]:
list(rfr_fullkfold_scores['feature_importances'])[0]

'Xele.ptg000045l.169'