In [1]:
from sklearn.linear_model import ElasticNetCV
import pandas as pd
import numpy as np
import os
import warnings
import datetime
warnings.filterwarnings(action='ignore')
import os
from sklearn.metrics import r2_score
from scipy import stats
print('Last run', datetime.datetime.now().strftime('%Y-%m-%d'))
import matplotlib.pyplot as plt

Last run 2024-01-08


# 1. Apply models on test set
Use tabix

In [None]:
%%bash
lip_type=class
threshold=pval_1e-04_maf_0.01
python ML_06_apply_regression_model_on_test_set.py \
--vcf_dir /data100t1/home/wanying/CCHC/lipidomics/20231211_rerun/inputs/genotype_dosage/test/lipid_${lip_type} \
--vcf_files lipid_${lip_type}_chr*.pval_0.001_maf_0.05.test.vcf.gz \
--coeff_db /data100t1/home/wanying/CCHC/lipidomics/20231211_rerun/outputs/prediction_models/elastic_net/${lip_type}/merged_model_params/train_coeff_${lip_type}_${threshold}.db \
--output_dir /data100t1/home/wanying/CCHC/lipidomics/20231211_rerun/outputs/prediction_models/elastic_net/predicted_values \
--output_fn_prefix test_${lip_type}_${threshold} \
--lipid_list /data100t1/home/wanying/CCHC/lipidomics/20231211_rerun/inputs/lipid_trait/lipid_${lip_type}.list \
--overwrite False

In [8]:
# Create commands to run
fh_cmd = open('cmd_apply_model_on_test_set.txt', 'w')
lst_threshold = [f'pval_1e-0{x}' for x in range(3, 8)] + [f'pval_1e-0{x}_maf_0.01' for x in range(3, 8)]
for lip_type in ['class', 'species']:
    for threshold in lst_threshold:
        cmd = 'lip_type=' + lip_type + ';'
        cmd += 'threshold=' + threshold + ';'
        cmd += '''python /data100t1/home/wanying/CCHC/lipidomics/20231211_rerun/code/ML_06_apply_regression_model_on_test_set.py \
    --vcf_dir /data100t1/home/wanying/CCHC/lipidomics/20231211_rerun/inputs/genotype_dosage/test/lipid_${lip_type} \
    --vcf_files lipid_${lip_type}_chr*.pval_0.001_maf_0.05.test.vcf.gz \
    --coeff_db /data100t1/home/wanying/CCHC/lipidomics/20231211_rerun/outputs/prediction_models/elastic_net/${lip_type}/merged_model_params/train_coeff_${lip_type}_${threshold}.db \
    --output_dir /data100t1/home/wanying/CCHC/lipidomics/20231211_rerun/outputs/prediction_models/elastic_net/predicted_values \
    --output_fn_prefix test_${lip_type}_${threshold} \
    --lipid_list /data100t1/home/wanying/CCHC/lipidomics/20231211_rerun/inputs/lipid_trait/lipid_${lip_type}.list \
    --overwrite False'''
        fh_cmd.write(cmd+'\n')
fh_cmd.close()
print('# DONE')

# DONE


In [10]:
# Create commands to run
fh_cmd = open('cmd_apply_model_on_training_set.txt', 'w')
c = 0
lst_threshold = [f'pval_1e-0{x}' for x in range(3, 8)] + [f'pval_1e-0{x}_maf_0.01' for x in range(3, 8)]
for lip_type in ['class', 'species']:
    for threshold in lst_threshold:
        cmd = 'lip_type=' + lip_type + ';'
        cmd += 'threshold=' + threshold + ';'
        cmd += '''python /data100t1/home/wanying/CCHC/lipidomics/20231211_rerun/code/ML_06_apply_regression_model_on_test_set.py \
    --vcf_dir /data100t1/home/wanying/CCHC/lipidomics/20231211_rerun/inputs/genotype_dosage/train/lipid_${lip_type} \
    --vcf_files lipid_${lip_type}_chr*.pval_0.001_maf_0.05.vcf.gz \
    --coeff_db /data100t1/home/wanying/CCHC/lipidomics/20231211_rerun/outputs/prediction_models/elastic_net/${lip_type}/merged_model_params/train_coeff_${lip_type}_${threshold}.db \
    --output_dir /data100t1/home/wanying/CCHC/lipidomics/20231211_rerun/outputs/prediction_models/elastic_net/predicted_values \
    --output_fn_prefix train_${lip_type}_${threshold} \
    --lipid_list /data100t1/home/wanying/CCHC/lipidomics/20231211_rerun/inputs/lipid_trait/lipid_${lip_type}.list \
    --overwrite False'''
        fh_cmd.write(cmd+'\n')
fh_cmd.close()
print('# DONE')

# DONE


# 2. Evaluation model performance

## 2.1 Fix missing intercept in predicted values of test set

### (1) Merge true predicted values from model training

In [7]:
# Issue found in old models: did not save intercept!!!
# Use this temporary fix to calculate intercept and save to file: (on training data)
# Intercept = predicted values from model train - predicted values by model application code

# Merge true predicted values of training set
def merge_pred_value(in_dir, out_fn, out_dir='', overwrite=False):
    if out_dir == '': out_dir = in_dir
    if os.path.isfile(os.path.join(out_dir, out_fn)) and not overwrite:
        print('# - Output file exists; Skip saving')
        return
    out_fh = open(os.path.join(out_dir, out_fn), 'w')
    print('# - Output saved:', os.path.join(out_dir, out_fn))
    count = 0
    for fn in os.listdir(in_dir):
        if fn.endswith('.pred') and ('all_' not in fn):
            in_fh = open(os.path.join(in_dir, fn))
            if count == 0:
                line = in_fh.readline()
                out_fh.write(line)
                line = in_fh.readline()
                out_fh.write(line)
            else:
                in_fh.readline()
                line = in_fh.readline()
                out_fh.write(line)
            in_fh.close()
            count += 1
    out_fh.close()
    print(f'# {count} files merged')

lst_thresholds = [f'pval_1e-0{i}' for i in range(3, 8)] + [f'pval_1e-0{i}_maf_0.01' for i in range(3, 8)]
for threshold in lst_thresholds:
    for lip_type in ['class', 'species']:
        print(f'\n# Process {lip_type}, {threshold}')
        in_dir = f'/data100t1/home/wanying/CCHC/lipidomics/20231211_rerun/outputs/prediction_models/elastic_net/{lip_type}/{threshold}'
        out_fn = f'true_pred_{lip_type}_{threshold}.pred'
        out_dir = '/data100t1/home/wanying/CCHC/lipidomics/20231211_rerun/outputs/prediction_models/elastic_net/predicted_values/model_intercepts'
        if os.path.isdir(in_dir) and os.path.isdir(out_dir):
            merge_pred_value(in_dir=in_dir, out_fn=out_fn, out_dir=out_dir)
        else:
            print('# ERROR:  In or out directory not exist')
print('# DONE merging files')

# Re do this one due to missing pred values in training
# Training was terminated due to out of memory in SLURM
lst_thresholds = ['pval_1e-03']
for threshold in lst_thresholds:
    for lip_type in ['species']:
        print(f'\n# Process {lip_type}, {threshold}')
        in_dir = f'/data100t1/home/wanying/CCHC/lipidomics/20231211_rerun/outputs/prediction_models/elastic_net/{lip_type}/{threshold}'
        out_fn = f'true_pred_{lip_type}_{threshold}.pred'
        out_dir = '/data100t1/home/wanying/CCHC/lipidomics/20231211_rerun/outputs/prediction_models/elastic_net/predicted_values/model_intercepts'
        if os.path.isdir(in_dir) and os.path.isdir(out_dir):
            merge_pred_value(in_dir=in_dir, out_fn=out_fn, out_dir=out_dir, overwrite=True)
        else:
            print('# ERROR:  In or out directory not exist')



# Process class, pval_1e-03
# - Output file exists; Skip saving

# Process species, pval_1e-03
# - Output file exists; Skip saving

# Process class, pval_1e-04
# - Output file exists; Skip saving

# Process species, pval_1e-04
# - Output file exists; Skip saving

# Process class, pval_1e-05
# - Output file exists; Skip saving

# Process species, pval_1e-05
# - Output file exists; Skip saving

# Process class, pval_1e-06
# - Output file exists; Skip saving

# Process species, pval_1e-06
# - Output file exists; Skip saving

# Process class, pval_1e-07
# - Output file exists; Skip saving

# Process species, pval_1e-07
# - Output file exists; Skip saving

# Process class, pval_1e-03_maf_0.01
# - Output file exists; Skip saving

# Process species, pval_1e-03_maf_0.01
# - Output file exists; Skip saving

# Process class, pval_1e-04_maf_0.01
# - Output file exists; Skip saving

# Process species, pval_1e-04_maf_0.01
# - Output file exists; Skip saving

# Process class, pval_1e-05_maf_0.01
# 

**Note**: Eight lipid species did not train due to low memory.Need to:
1. Redo them and repopulate weights database of pval 1e-03
2. Apply model on these lipid species and get "wrong pred values" on test set

In [14]:
# Apply models on missing lipid species on training set:
# PC(P-42:5)
# LPC(20:2) [sn1]
# PC(O-46:7) (b)
# AC(20:4)
# AC(20:3) (a)
# TG(56:6) [NL-20:4]
# PE(P-20:1/22:6)
# LPC(22:6) [sn1]
cmd = '''
lip_type=species;threshold=pval_1e-03; \
python /data100t1/home/wanying/CCHC/lipidomics/20231211_rerun/code/ML_06_apply_regression_model_on_test_set.py \
--vcf_dir /data100t1/home/wanying/CCHC/lipidomics/20231211_rerun/inputs/genotype_dosage/train/lipid_${lip_type}  \
--vcf_files lipid_${lip_type}_chr*.pval_0.001_maf_0.05.vcf.gz \
--coeff_db /data100t1/home/wanying/CCHC/lipidomics/20231211_rerun/outputs/prediction_models/elastic_net/${lip_type}/merged_model_params/train_coeff_${lip_type}_${threshold}.db \
--output_dir /data100t1/home/wanying/CCHC/lipidomics/20231211_rerun/outputs/prediction_models/elastic_net/predicted_values/wrong_pred_values_train     \
--output_fn_prefix train_${lip_type}_${threshold}_redo \
--lipid_list /data100t1/home/wanying/CCHC/lipidomics/20231211_rerun/inputs/lipid_trait/lipid_species_failed_redo.list \
--overwrite True
'''
print('\nTraining set:\n'+cmd)

# Same for test set
cmd = '''
lip_type=species;threshold=pval_1e-03; \
python /data100t1/home/wanying/CCHC/lipidomics/20231211_rerun/code/ML_06_apply_regression_model_on_test_set.py \
--vcf_dir /data100t1/home/wanying/CCHC/lipidomics/20231211_rerun/inputs/genotype_dosage/test/lipid_${lip_type}  \
--vcf_files lipid_${lip_type}_chr*.pval_0.001_maf_0.05.test.vcf.gz \
--coeff_db /data100t1/home/wanying/CCHC/lipidomics/20231211_rerun/outputs/prediction_models/elastic_net/${lip_type}/merged_model_params/train_coeff_${lip_type}_${threshold}.db \
--output_dir /data100t1/home/wanying/CCHC/lipidomics/20231211_rerun/outputs/prediction_models/elastic_net/predicted_values/wrong_pred_values_test     \
--output_fn_prefix test_${lip_type}_${threshold}_redo \
--lipid_list /data100t1/home/wanying/CCHC/lipidomics/20231211_rerun/inputs/lipid_trait/lipid_species_failed_redo.list \
--overwrite True
'''
print('\nTest set:\n'+cmd)



Training set:

lip_type=species;threshold=pval_1e-03; python /data100t1/home/wanying/CCHC/lipidomics/20231211_rerun/code/ML_06_apply_regression_model_on_test_set.py --vcf_dir /data100t1/home/wanying/CCHC/lipidomics/20231211_rerun/inputs/genotype_dosage/train/lipid_${lip_type}  --vcf_files lipid_${lip_type}_chr*.pval_0.001_maf_0.05.vcf.gz --coeff_db /data100t1/home/wanying/CCHC/lipidomics/20231211_rerun/outputs/prediction_models/elastic_net/${lip_type}/merged_model_params/train_coeff_${lip_type}_${threshold}.db --output_dir /data100t1/home/wanying/CCHC/lipidomics/20231211_rerun/outputs/prediction_models/elastic_net/predicted_values/wrong_pred_values_train     --output_fn_prefix train_${lip_type}_${threshold}_redo --lipid_list /data100t1/home/wanying/CCHC/lipidomics/20231211_rerun/inputs/lipid_trait/lipid_species_failed_redo.list --overwrite True


Test set:

lip_type=species;threshold=pval_1e-03; python /data100t1/home/wanying/CCHC/lipidomics/20231211_rerun/code/ML_06_apply_regression_

In [20]:
def add_intercept_to_pred_values(fn_true_pred_train, fn_wrong_pred_train, fn_wrong_pred_test,
                                 output_dir, output_fn, overwrite=False):
    '''
    Calculate intercept from training data, then add to predicted values of test data, and save new result
    :param fn_true_pred_train: predicted values from model training on training set
    :param fn_wrong_pred_train: predicted values from application of model on training set
    :param fn_wrong_pred_test: predicted values from application of model on test set
    :param output_dir, output_fn: output directory and filename
    :param overwrite: overwrite existing file
    :return:
    '''
    if not os.path.isdir(output_dir):
        print('# Output directory does not exist. Create one at', output_dir)
    if os.path.isfile(os.path.join(output_dir, output_fn)) and not overwrite:
        print('# Output file exists. Skip saving or set overwrite=True')
        return
    df_true_pred_train = pd.read_csv(fn_true_pred_train, sep='\t')
    df_wrong_pred_train = pd.read_csv(fn_wrong_pred_train, sep='\t')
    df_wrong_pred_test = pd.read_csv(fn_wrong_pred_test, sep='\t')

    # Reorder lipids in true pred values to match order in wrong pred values
    assert 'trait' in df_wrong_pred_test.columns
    assert 'trait' in df_wrong_pred_train.columns
    lst_lipid = df_wrong_pred_train['trait']
    df_true_pred_train = df_true_pred_train.set_index(keys='Lipid').reindex(index=lst_lipid).reset_index()
    df_residuals = df_true_pred_train.iloc[:, 1:]-df_wrong_pred_train.iloc[:, 1:]
    df_residuals = pd.concat([df_wrong_pred_train[['trait']], df_residuals], axis=1)

    df_true_pred_test = df_wrong_pred_test.iloc[:, 1:] + df_residuals.iloc[:, 1:df_wrong_pred_test.shape[-1]].values
    df_true_pred_test = pd.concat([df_wrong_pred_train[['trait']], df_true_pred_test], axis=1)

    # Save true predicted values of test
    df_true_pred_test.to_csv(os.path.join(output_dir, output_fn), sep='\t', index=False)
    # Save intercept for future use
    df_residuals.to_csv(os.path.join(output_dir, output_fn+'.intercept'), sep='\t', index=False)
    return df_wrong_pred_test, df_true_pred_test, df_residuals


lst_thresholds = [f'pval_1e-0{i}' for i in range(3, 8)] + [f'pval_1e-0{i}_maf_0.01' for i in range(3, 8)]
for threshold in lst_thresholds:
    for lip_type in ['class', 'species']:
        print(f'# Process {lip_type}, {threshold}')
        true_pred_dir_train = '/data100t1/home/wanying/CCHC/lipidomics/20231211_rerun/outputs/prediction_models/elastic_net/predicted_values/true_pred_values_train'
        fn_true_pred_train = os.path.join(true_pred_dir_train, f'true_pred_{lip_type}_{threshold}.pred')

        wrong_pred_dir_train = '/data100t1/home/wanying/CCHC/lipidomics/20231211_rerun/outputs/prediction_models/elastic_net/predicted_values/wrong_pred_values_train'
        fn_wrong_pred_train = os.path.join(wrong_pred_dir_train, f'train_{lip_type}_{threshold}.pred')

        wrong_pred_dir_test = '/data100t1/home/wanying/CCHC/lipidomics/20231211_rerun/outputs/prediction_models/elastic_net/predicted_values/wrong_pred_values_test'
        fn_wrong_pred_test = os.path.join(wrong_pred_dir_test, f'test_{lip_type}_{threshold}.pred')

        output_dir = '/data100t1/home/wanying/CCHC/lipidomics/20231211_rerun/outputs/prediction_models/elastic_net/predicted_values/true_pred_values_test'
        output_fn = f'true_pred_values_test_{lip_type}_{threshold}.pred'
        # add_intercept_to_pred_values(fn_true_pred_train, fn_wrong_pred_train, fn_wrong_pred_test,
        #                                  output_dir, output_fn, overwrite=True)
        try:
            add_intercept_to_pred_values(fn_true_pred_train, fn_wrong_pred_train, fn_wrong_pred_test,
                                         output_dir, output_fn, overwrite=True)
        except:
            print('# - Somthing wrong')
print('# DONE')

# Process class, pval_1e-03
# Process species, pval_1e-03
# Process class, pval_1e-04
# Process species, pval_1e-04
# Process class, pval_1e-05
# Process species, pval_1e-05
# Process class, pval_1e-06
# Process species, pval_1e-06
# Process class, pval_1e-07
# Process species, pval_1e-07
# Process class, pval_1e-03_maf_0.01
# Process species, pval_1e-03_maf_0.01
# Process class, pval_1e-04_maf_0.01
# Process species, pval_1e-04_maf_0.01
# Process class, pval_1e-05_maf_0.01
# Process species, pval_1e-05_maf_0.01
# Process class, pval_1e-06_maf_0.01
# Process species, pval_1e-06_maf_0.01
# Process class, pval_1e-07_maf_0.01
# Process species, pval_1e-07_maf_0.01
# DONE


## 2.2 Calculate pearson r2

In [34]:
# Calculate pearson r2 between true values and predicted values
# Do the same for train and test
def get_pearson_r2(lipid_type = 'species',
                   data_type = 'train',
                   threshold = 'pval_1e-03_maf_0.01',
                   dir_true = '/data100t1/home/wanying/CCHC/lipidomics/20231211_rerun/inputs/lipid_trait',
                   dir_pred = '/data100t1/home/wanying/CCHC/lipidomics/20231211_rerun/outputs/prediction_models/elastic_net/predicted_values/true_pred_values_train',
                   fn_true = '', output_fn = ''):
    print('# Result saved:', output_fn)
    if fn_true=='': fn_true = f'lipid_{lipid_type}_ID_matched.no_dup.residual.{data_type}.txt'
    if data_type == 'train':
        fn_pred = f'true_pred_{lipid_type}_{threshold}.pred'
    else:
        fn_pred = f'true_pred_values_{data_type}_{lipid_type}_{threshold}.pred'

    df_pred = pd.read_csv(os.path.join(dir_pred, fn_pred), sep='\t').T.reset_index()
    df_pred.columns = ['genotype_ID'] + list(df_pred.iloc[0, 1:])
    df_pred.drop(index=0, inplace=True)
    df_true = pd.read_csv(os.path.join(dir_true, fn_true), sep='\t')

    # Re-order lipid ans samples so that pred and true dataframe matches
    # remove sample HD0145_HD4145 oterwise NaN values will cause error in r2 calcualtion
    df_pred = df_pred.set_index(keys='genotype_ID').reindex(df_true['genotype_ID']).reset_index()
    print('# Load true and predicted values:', df_pred.shape, df_true.shape)

    print('# Check and remove NaN values')
    sample_ids_nan = df_pred[df_pred.isna().any(axis=1)]['genotype_ID']
    if len(sample_ids_nan) != 0:
        print('# Found NaN in predicted values: ')
        for val in sample_ids_nan:
            print('# -', val, '; Remove')
            df_pred = df_pred[df_pred['genotype_ID']!=val].copy()
            df_true = df_true[df_true['genotype_ID']!=val].copy()
    print('# Cleaned true and predicted values:', df_pred.shape, df_true.shape)

    print('# Calculate model fitting r2, pearson r2 and spearman r2')
    c = 0
    model_fitting_r2 = [] # Test quality of model fitting
    pearson_r2, pearson_pval = [], [] # Test pearson r2 for linear relationship
    spearman_r2 = [] # Test monotonicity of the relationship. Non-parametric
    lst_lipid = df_pred.columns[1:]
    for lipid in lst_lipid:
        model_fitting_r2.append(r2_score(df_true[lipid], df_pred[lipid]))
        # Note: scipy fucntion returns pearson r not r2!!!!!
        ps_r, ps_pval = stats.pearsonr(df_true[lipid], df_pred[lipid])
        pearson_pval.append(ps_pval)
        pearson_r2.append(ps_r**2)
        spearman_r2.append(stats.spearmanr(df_true[lipid], df_pred[lipid])[0]**2)
        c += 1
        print(f'\r# {c}/{len(lst_lipid)}', end='', flush=True)
    # Save result to output file
    df_result = pd.DataFrame({'Lipid':lst_lipid, 'pearson_r2':pearson_r2, 'pearson_pval':pearson_pval, 'model_fitting_r2':model_fitting_r2})
    df_result.to_csv(output_fn, sep='\t', index=False)
    return df_result



In [35]:
lst_thresholds = [f'pval_1e-0{i}' for i in range(3, 8)] + [f'pval_1e-0{i}_maf_0.01' for i in range(3, 8)]
for lipid_type in ['class', 'species']:
    for data_type in ['train', 'test']:
        for threshold in lst_thresholds:
            output_dir = '/data100t1/home/wanying/CCHC/lipidomics/20231211_rerun/outputs/prediction_models/elastic_net/model_performance'
            output_fn = f'model_performance_{data_type}_{lipid_type}_{threshold}.txt'
            get_pearson_r2(lipid_type = lipid_type,
                           data_type = data_type,
                           threshold = threshold,
                           dir_true = '/data100t1/home/wanying/CCHC/lipidomics/20231211_rerun/inputs/lipid_trait',
                           dir_pred = f'/data100t1/home/wanying/CCHC/lipidomics/20231211_rerun/outputs/prediction_models/elastic_net/predicted_values/true_pred_values_{data_type}',
                           fn_true = f'lipid_{lipid_type}_ID_matched.no_dup.residual.{data_type}.txt',
                           output_fn = os.path.join(output_dir, output_fn))
    #         break
    #     break
    # break

# Result saved: /data100t1/home/wanying/CCHC/lipidomics/20231211_rerun/outputs/prediction_models/elastic_net/model_performance/model_performance_train_class_pval_1e-03.txt
# Load true and predicted values: (1606, 50) (1606, 52)
# Check and remove NaN values
# Found NaN in predicted values: 
# - HD0145_HD4145 ; Remove
# Cleaned true and predicted values: (1605, 50) (1605, 52)
# Calculate model fitting r2, pearson r2 and spearman r2
# 49/49# Result saved: /data100t1/home/wanying/CCHC/lipidomics/20231211_rerun/outputs/prediction_models/elastic_net/model_performance/model_performance_train_class_pval_1e-04.txt
# Load true and predicted values: (1606, 50) (1606, 52)
# Check and remove NaN values
# Found NaN in predicted values: 
# - HD0145_HD4145 ; Remove
# Cleaned true and predicted values: (1605, 50) (1605, 52)
# Calculate model fitting r2, pearson r2 and spearman r2
# 49/49# Result saved: /data100t1/home/wanying/CCHC/lipidomics/20231211_rerun/outputs/prediction_models/elastic_net/model_p

# 3. Plot model performance against heritability

In [59]:
def load_h2_data_calcualte_95CI(lip_type = 'species',
                                h2_merged_dir = '/data100t1/home/wanying/CCHC/lipidomics/20231211_rerun/outputs/fastGWA/heritability_estimation/merged_heritability_results', h2_fn=''):
    '''
    Load heritability and calculate 95% confidence interval
    :param lip_type:
    :param h2_merged_dir:
    :param h2_fn:
    :return: a dataframe of heritability
    '''
    if h2_fn=='': h2_fn = f'heritability_{lip_type}.train_only.txt'
    print('# - Load heritability')
    df_h2 = pd.read_csv(os.path.join(h2_merged_dir, h2_fn), sep='\t')
    print('# - Calculate 95% CI')
    # calcualte 95% confidence interval
    # "For normal distribution, a 95% confidence interval is obtained as the values 1.96×SE either side of the mean."
    df_h2['95CI'] = 1.96*df_h2['se']
    return df_h2

def load_performance(fn_performance):
    print('# - Load performance', fn_performance)
    df_performance = pd.read_csv(fn_performance, sep='\t')
    # Create a column of lipid names for mapping
    df_performance['Lipid_name'] = df_performance['Lipid'].apply(lambda x: x.replace('\\', '-').replace('/', '-').replace('(','-').replace(')','-').replace(' ', '_'))
    return df_performance

def plot_h2_r2(df_r2_heritability_merged, title = '', save_fig=False):
    fig, ax = plt.subplots(dpi=100, figsize=(10, 15), nrows=3)
    ax[0].plot(df_r2_heritability_merged.index, df_r2_heritability_merged['model_fitting_r2'],
                    ls='', marker='.', color='r', label='Model fitting R2')
    ax[0].legend()

    # ax[1].plot(df_r2_heritability_merged['model_fitting_r2'], df_r2_heritability_merged['pearson_r2'],
    #                 ls='', marker='.', label='Pearson R2')
    # ax[1].legend()
    ax[1].plot(df_r2_heritability_merged.index, df_r2_heritability_merged['pearson_r2'],
               ls='', marker='.', label='Pearson R2')
    ax[1].legend()
    
    ax[2].errorbar(df_r2_heritability_merged.index, df_r2_heritability_merged['h2'], yerr=df_r2_heritability_merged['95CI'],
                   ls='', marker='.', color='k', ecolor='lightgrey', label='heritability with 95% CI')
    ax[2].plot(df_r2_heritability_merged.index, df_r2_heritability_merged['pearson_r2'],
                    ls='', marker='.', color='r', label='Test set pearson r2')
    ax[2].set_title(title)
    ax[2].set_xlabel('Lipid sorted by heritability')
    ax[2].set_ylabel('h2 or r2')
    ax[2].legend()
    fig.suptitle(title)
    fig.tight_layout()
    if save_fig: fig.savefig(f'{lipid_type}_{threshold}_{data_type}.jpeg')

def load_data_and_plot(lipid_type = 'species',
                       threshold = 'pval_1e-03_maf_0.01',
                       data_type = 'test', save_fig=False, plot=True):
    h2_fn = f'heritability_{lipid_type}.train_only.txt'
    df_h2 = load_h2_data_calcualte_95CI(lip_type = data_type,
                                        h2_merged_dir = '/data100t1/home/wanying/CCHC/lipidomics/20231211_rerun/outputs/fastGWA/heritability_estimation/merged_heritability_results',
                                        h2_fn=h2_fn)

    # print('\n# Load performance data')
    performance_dir = '/data100t1/home/wanying/CCHC/lipidomics/20231211_rerun/outputs/prediction_models/elastic_net/model_performance'
    fn_performance = f'model_performance_{data_type}_{lipid_type}_{threshold}.txt'
    df_performance = load_performance(os.path.join(performance_dir, fn_performance))

    print('# Merge performance and heritability by lipid name')
    df_h2_performance_merged = df_h2.merge(df_performance, on='Lipid_name')
    df_h2_performance_merged.sort_values(by='h2', inplace=True)
    df_h2_performance_merged.reset_index(drop=True, inplace=True)
    df_h2_performance_merged.head()

    title = f'{lipid_type}, {threshold}, {data_type}'
    if plot: plot_h2_r2(df_h2_performance_merged, title, save_fig)
    return df_h2_performance_merged
    

In [84]:
# load_data_and_plot(lipid_type='species',threshold='pval_1e-05',data_type='test', save_fig=True)
lst_threshold = [f'pval_1e-0{x}' for x in range(3, 8)] + [f'pval_1e-0{x}_maf_0.01' for x in range(3, 8)]
for lipid_type in ['class', 'species']:
    for threshold in lst_threshold:
        for data_type in ['test', 'train']:
            print('\n#', '#'*20, 'Process', data_type, lipid_type, theshold)
            df_h2_performance_merged = load_data_and_plot(lipid_type=lipid_type,
                                                          threshold=threshold,
                                                          data_type=data_type,
                                                          save_fig=False,
                                                          plot=False)
            # Save merged result
            merged_result_dir = '/data100t1/home/wanying/CCHC/lipidomics/20231211_rerun/outputs/prediction_models/elastic_net/merged_h2_r2'
            df_h2_performance_merged.to_csv(os.path.join(merged_result_dir, f'merged_h2_r2_{data_type}_{lipid_type}_{threshold}.txt'),
                                            sep='\t', index=False)
            
print('# DONE')


# #################### Process test class pval_1e-07_maf_0.01
# - Load heritability
# - Calculate 95% CI
# - Load performance /data100t1/home/wanying/CCHC/lipidomics/20231211_rerun/outputs/prediction_models/elastic_net/model_performance/model_performance_test_class_pval_1e-03.txt
# Merge performance and heritability by lipid name

# #################### Process train class pval_1e-07_maf_0.01
# - Load heritability
# - Calculate 95% CI
# - Load performance /data100t1/home/wanying/CCHC/lipidomics/20231211_rerun/outputs/prediction_models/elastic_net/model_performance/model_performance_train_class_pval_1e-03.txt
# Merge performance and heritability by lipid name

# #################### Process test class pval_1e-07_maf_0.01
# - Load heritability
# - Calculate 95% CI
# - Load performance /data100t1/home/wanying/CCHC/lipidomics/20231211_rerun/outputs/prediction_models/elastic_net/model_performance/model_performance_test_class_pval_1e-04.txt
# Merge performance and heritability by lipid name

In [60]:
df_h2_performance_merged = load_data_and_plot(lipid_type='species',
                                              threshold='pval_1e-05',
                                              data_type='train',
                                              plot=False)

# - Load heritability
# - Calculate 95% CI
# - Load performance /data100t1/home/wanying/CCHC/lipidomics/20231211_rerun/outputs/prediction_models/elastic_net/model_performance/model_performance_train_species_pval_1e-05.txt
# Merge performance and heritability by lipid name
