Create commands to train boosting models (AdaBoost and gradient boosting)

In [1]:
import pandas as pd
import os
import matplotlib.pyplot as plt

# 1. AdaBoost

In [None]:
%%bash
# Example call: train an AdaBoost model for one lipid species.
# Expansions and the chr* file patterns are double-quoted so that each one
# reaches the script as a single literal argument (no word splitting and no
# filename expansion by the shell against the current directory).
# NOTE(review): assumes the script resolves the chr* pattern itself inside
# --dosage_dir_train / --dosage_dir_test -- confirm.
lip_type=species
lipid="PC(44:5)"
output_prefix=PC-44:5-
python ML_08_boosting_model_train.py \
--output_prefix "${output_prefix}" \
--output_dir "/data100t1/home/wanying/CCHC/lipidomics/20231211_rerun/outputs/prediction_models/AdaBoost/${lip_type}" \
--dosage_dir_train "/data100t1/home/wanying/CCHC/lipidomics/20231211_rerun/inputs/genotype_dosage/train/lipid_${lip_type}" \
--dosage_fn_train "lipid_${lip_type}_chr*.pval_0.001_maf_0.05.vcf.dosage.gz" \
--dosage_dir_test "/data100t1/home/wanying/CCHC/lipidomics/20231211_rerun/inputs/genotype_dosage/test/lipid_${lip_type}" \
--dosage_fn_test "lipid_${lip_type}_chr*.pval_0.001_maf_0.05.test.vcf.dosage.gz" \
--gwas_snp_dir "/data100t1/home/wanying/CCHC/lipidomics/20231211_rerun/outputs/fastGWA/lipid_${lip_type}_filter_by_pval_1e-07" \
--gwas_snp_fn "${output_prefix}_SNPs.pval_1e-07.txt" \
--lipid_name "${lipid}" \
--trait_fn_train "/data100t1/home/wanying/CCHC/lipidomics/20231211_rerun/inputs/lipid_trait/lipid_${lip_type}_ID_matched.no_dup.residual.train.txt" \
--trait_fn_test "/data100t1/home/wanying/CCHC/lipidomics/20231211_rerun/inputs/lipid_trait/lipid_${lip_type}_ID_matched.no_dup.residual.test.txt" \
--multiallelic False \
--train True \
--n_estimator 10 \
--boost_type Ada

## 1.1 Create commands to run

In [4]:
# Build one AdaBoost training command per lipid and write the commands to a
# text file, one command per line.
lip_type = 'species'
threshold = 'pval_1e-05'
n_estimator = 500  # passed as --n_estimator and used to tag the output names
base_dir = '/data100t1/home/wanying/CCHC/lipidomics/20231211_rerun'
fn_lst_lipid = f'{base_dir}/inputs/lipid_trait/lipid_{lip_type}.list'
cmd_output_fn = f'cmd_AdaBoost_train.{lip_type}.{threshold}.{n_estimator}_estimator.txt'

# Load list of lipid. Blank lines are skipped so they do not turn into
# commands with an empty lipid name.
with open(fn_lst_lipid) as fh:
    lst_lipids = [line.strip() for line in fh if line.strip()]

# The arguments are joined with single spaces into a one-line command.
# ${...} is left for the shell to expand; %(...)s is filled in by Python.
# Values are double-quoted because lipid names may contain shell glob
# characters (e.g. square brackets) and the dosage file patterns contain *.
cmd_template = ' '.join([
    'python ML_08_boosting_model_train.py',
    '--output_prefix "${output_prefix}.%(n_estimator)s_estimator"',
    '--output_dir "%(base_dir)s/outputs/prediction_models/AdaBoost/${lip_type}"',
    '--dosage_dir_train "%(base_dir)s/inputs/genotype_dosage/train/lipid_${lip_type}"',
    '--dosage_fn_train "lipid_${lip_type}_chr*.pval_0.001_maf_0.05.vcf.dosage.gz"',
    '--dosage_dir_test "%(base_dir)s/inputs/genotype_dosage/test/lipid_${lip_type}"',
    '--dosage_fn_test "lipid_${lip_type}_chr*.pval_0.001_maf_0.05.test.vcf.dosage.gz"',
    '--gwas_snp_dir "%(base_dir)s/outputs/fastGWA/lipid_${lip_type}_filter_by_%(threshold)s"',
    '--gwas_snp_fn "%(lipid_name)s_SNPs.%(threshold)s.txt"',
    '--lipid_name "${lipid}"',
    '--trait_fn_train "%(base_dir)s/inputs/lipid_trait/lipid_${lip_type}_ID_matched.no_dup.residual.train.txt"',
    '--trait_fn_test "%(base_dir)s/inputs/lipid_trait/lipid_${lip_type}_ID_matched.no_dup.residual.test.txt"',
    '--multiallelic False',
    '--train True',
    '--n_estimator %(n_estimator)s',
    '--boost_type Ada',
])

c = 0
# The context manager closes the command file even if the loop raises.
with open(cmd_output_fn, 'w') as fh_cmd:
    for lipid in lst_lipids:
        # File-name-safe version of the lipid name (used for prefixes and the SNP file name)
        lipid_name = lipid.replace('\\', '-').replace('/', '-').replace('(', '-').replace(')', '-').replace(' ', '_')
        # Shell variable assignments come first; the python call refers to them
        cmd = f'lip_type={lip_type};'
        cmd += f'lipid="{lipid}";'
        cmd += f'output_prefix="{lipid_name}";'
        cmd += cmd_template % {
            'base_dir': base_dir,
            'threshold': threshold,
            'lipid_name': lipid_name,
            'n_estimator': n_estimator,
        }
        fh_cmd.write(cmd + '\n')
        c += 1
        print(f'\rProcess {c}', end='', flush=True)

Process 830

In [8]:
# Write a random subset of the AdaBoost commands (83 lines) for a trial run.
# NOTE(review): the species command-generation cell writes
# 'cmd_AdaBoost_train.species.pval_1e-05.500_estimator.txt'; confirm that the
# input name below is the intended file.
cmd_fn = 'cmd_AdaBoost_train.species.pval_1e-05.txt'
sample_fn = 'cmd_AdaBoost_train.species.pval_1e-05.83sample.txt'
n_sample = 83

# Read the commands as plain text lines: they are shell commands, not CSV, so
# a comma inside a command must not split it into several columns.
with open(cmd_fn) as i_fh:
    cmds = [line.rstrip('\n') for line in i_fh if line.strip()]
df = pd.DataFrame({0: cmds})  # single column, one command per row

# Draw the subset once, without replacement, so no command is written twice.
# Capping at len(df) avoids an error when the file has fewer than n_sample rows.
# A fixed seed makes re-running the cell reproduce the same subset.
sampled_cmds = df[0].sample(n=min(n_sample, len(df)), random_state=42)

with open(sample_fn, 'w') as o_fh:
    for sampled_cmd in sampled_cmds:
        o_fh.write(sampled_cmd + '\n')

## 1.2 Create a quick run using 20 lipid species with the highest h2

In [17]:
# Create a quick run using 20 lipid species with the highest h2
lip_type = 'species'
threshold = 'pval_1e-05'
n_estimator = 100  # passed as --n_estimator
base_dir = '/data100t1/home/wanying/CCHC/lipidomics/20231211_rerun'

# Load h2 data to select lipid with highest h2
h2_dir = f'{base_dir}/outputs/prediction_models/elastic_net/merged_h2_r2'
fn = f'merged_h2_r2_test_{lip_type}_{threshold}.txt'
print(fn)
df_h2 = pd.read_csv(os.path.join(h2_dir, fn), sep='\t').sort_values('h2', ascending=False)
top_20 = df_h2.head(20)
bottom_20 = df_h2.tail(20)
display(df_h2.head(5))
display(df_h2.tail(5))

# Not used in this cell; retained so the notebook namespace is unchanged.
fn_lst_lipid = f'{base_dir}/inputs/lipid_trait/lipid_{lip_type}.list'
cmd_output_fn = f'cmd_AdaBoost_train.{lip_type}.{threshold}.top_20_h2.txt'
lst_lipids = top_20['Lipid']

# The arguments are joined with single spaces into a one-line command.
# ${...} is left for the shell to expand; %(...)s is filled in by Python.
# Values are double-quoted because lipid names may contain shell glob
# characters (e.g. square brackets) and the dosage file patterns contain *.
cmd_template = ' '.join([
    'python ML_08_boosting_model_train.py',
    '--output_prefix "${output_prefix}"',
    '--output_dir "%(base_dir)s/outputs/prediction_models/AdaBoost/${lip_type}"',
    '--dosage_dir_train "%(base_dir)s/inputs/genotype_dosage/train/lipid_${lip_type}"',
    '--dosage_fn_train "lipid_${lip_type}_chr*.pval_0.001_maf_0.05.vcf.dosage.gz"',
    '--dosage_dir_test "%(base_dir)s/inputs/genotype_dosage/test/lipid_${lip_type}"',
    '--dosage_fn_test "lipid_${lip_type}_chr*.pval_0.001_maf_0.05.test.vcf.dosage.gz"',
    '--gwas_snp_dir "%(base_dir)s/outputs/fastGWA/lipid_${lip_type}_filter_by_%(threshold)s"',
    '--gwas_snp_fn "%(lipid_name)s_SNPs.%(threshold)s.txt"',
    '--lipid_name "${lipid}"',
    '--trait_fn_train "%(base_dir)s/inputs/lipid_trait/lipid_${lip_type}_ID_matched.no_dup.residual.train.txt"',
    '--trait_fn_test "%(base_dir)s/inputs/lipid_trait/lipid_${lip_type}_ID_matched.no_dup.residual.test.txt"',
    '--multiallelic False',
    '--train True',
    '--n_estimator %(n_estimator)s',
    '--boost_type Ada',
])

c = 0
# The context manager closes the command file even if the loop raises.
with open(cmd_output_fn, 'w') as fh_cmd:
    for lipid in lst_lipids:
        # File-name-safe version of the lipid name (used for prefixes and the SNP file name)
        lipid_name = lipid.replace('\\', '-').replace('/', '-').replace('(', '-').replace(')', '-').replace(' ', '_')
        # Shell variable assignments come first; the python call refers to them
        cmd = f'lip_type={lip_type};'
        cmd += f'lipid="{lipid}";'
        cmd += f'output_prefix="{lipid_name}";'
        cmd += cmd_template % {
            'base_dir': base_dir,
            'threshold': threshold,
            'lipid_name': lipid_name,
            'n_estimator': n_estimator,
        }
        fh_cmd.write(cmd + '\n')
        c += 1
        print(f'\rProcess {c}', end='', flush=True)
    

merged_h2_r2_test_species_pval_1e-05.txt


Unnamed: 0,Lipid_name,h2,se,95CI,Lipid,pearson_r2,pearson_pval,model_fitting_r2
829,CE-20:5-,0.999999,0.188218,0.368907,CE(20:5),0.053283,1.092933e-09,-0.14657
828,CE-20:4-,0.999999,0.185907,0.364378,CE(20:4),0.041802,7.340825e-08,-0.265375
827,DE-20:4-,0.99833,0.191236,0.374823,DE(20:4),0.037772,3.20064e-07,-0.24745
826,PC-16:0_20:4-,0.976646,0.179809,0.352426,PC(16:0_20:4),0.101952,1.3226300000000002e-17,-0.000614
825,PC-18:0_20:4-,0.9466,0.180321,0.353429,PC(18:0_20:4),0.122469,4.731261e-21,0.071183


Unnamed: 0,Lipid_name,h2,se,95CI,Lipid,pearson_r2,pearson_pval,model_fitting_r2
140,PE-O-16:0-22:6-,1e-06,0.188499,0.369458,PE(O-16:0/22:6),0.005595276,0.051037,-0.247109
139,PE-P-18:0-22:5-_-n6-,1e-06,0.188001,0.368482,PE(P-18:0/22:5) (n6),0.001195597,0.367619,-0.378982
138,TG-50:0-_[NL-18:0],1e-06,0.187011,0.366542,TG(50:0) [NL-18:0],0.002674842,0.177633,-0.275413
137,PC-18:0_18:1-,1e-06,0.188027,0.368533,PC(18:0_18:1),1.08324e-07,0.99316,-0.426993
0,CE-18:2-,1e-06,0.18328,0.359229,CE(18:2),0.0004966405,0.561534,-0.373347


Process 20

In [9]:
# Do the same for lipid class: build one AdaBoost training command per lipid
# class and write the commands to a text file, one command per line.
lip_type = 'class'
threshold = 'pval_1e-05'
n_estimator = 100  # passed as --n_estimator
base_dir = '/data100t1/home/wanying/CCHC/lipidomics/20231211_rerun'
fn_lst_lipid = f'{base_dir}/inputs/lipid_trait/lipid_{lip_type}.list'
cmd_output_fn = f'cmd_AdaBoost_train.{lip_type}.{threshold}.txt'

# Load list of lipid. Blank lines are skipped so they do not turn into
# commands with an empty lipid name.
with open(fn_lst_lipid) as fh:
    lst_lipids = [line.strip() for line in fh if line.strip()]

# The arguments are joined with single spaces into a one-line command.
# ${...} is left for the shell to expand; %(...)s is filled in by Python.
# Values are double-quoted so names containing shell glob characters and the
# dosage file patterns (which contain *) reach the script unchanged.
cmd_template = ' '.join([
    'python ML_08_boosting_model_train.py',
    '--output_prefix "${output_prefix}"',
    '--output_dir "%(base_dir)s/outputs/prediction_models/AdaBoost/${lip_type}"',
    '--dosage_dir_train "%(base_dir)s/inputs/genotype_dosage/train/lipid_${lip_type}"',
    '--dosage_fn_train "lipid_${lip_type}_chr*.pval_0.001_maf_0.05.vcf.dosage.gz"',
    '--dosage_dir_test "%(base_dir)s/inputs/genotype_dosage/test/lipid_${lip_type}"',
    '--dosage_fn_test "lipid_${lip_type}_chr*.pval_0.001_maf_0.05.test.vcf.dosage.gz"',
    '--gwas_snp_dir "%(base_dir)s/outputs/fastGWA/lipid_${lip_type}_filter_by_%(threshold)s"',
    '--gwas_snp_fn "%(lipid_name)s_SNPs.%(threshold)s.txt"',
    '--lipid_name "${lipid}"',
    '--trait_fn_train "%(base_dir)s/inputs/lipid_trait/lipid_${lip_type}_ID_matched.no_dup.residual.train.txt"',
    '--trait_fn_test "%(base_dir)s/inputs/lipid_trait/lipid_${lip_type}_ID_matched.no_dup.residual.test.txt"',
    '--multiallelic False',
    '--train True',
    '--n_estimator %(n_estimator)s',
    '--boost_type Ada',
])

c = 0
# The context manager closes the command file even if the loop raises.
with open(cmd_output_fn, 'w') as fh_cmd:
    for lipid in lst_lipids:
        # File-name-safe version of the lipid name (used for prefixes and the SNP file name)
        lipid_name = lipid.replace('\\', '-').replace('/', '-').replace('(', '-').replace(')', '-').replace(' ', '_')
        # Shell variable assignments come first; the python call refers to them
        cmd = f'lip_type={lip_type};'
        cmd += f'lipid="{lipid}";'
        cmd += f'output_prefix="{lipid_name}";'
        cmd += cmd_template % {
            'base_dir': base_dir,
            'threshold': threshold,
            'lipid_name': lipid_name,
            'n_estimator': n_estimator,
        }
        fh_cmd.write(cmd + '\n')
        c += 1
        print(f'\rProcess {c}', end='', flush=True)

Process 49

# 2. Gradient boosting

In [None]:
%%bash
# Example call: train and test a gradient boosting model for one lipid species
# over a grid of --n_estimator and --learning_rate values.
# Expansions and the chr* file patterns are double-quoted so that each one
# reaches the script as a single literal argument (no word splitting and no
# filename expansion by the shell against the current directory).
# Paths are derived from lip_type / output_prefix instead of repeating their
# values, so changing the variables changes every argument consistently.
# NOTE(review): assumes the script resolves the chr* pattern itself inside
# --dosage_dir_train / --dosage_dir_test -- confirm.
lip_type=species
lipid="AC(16:1)-OH"
output_prefix=AC-16:1--OH
python ML_08v2_boosting_model_train_and_test.py \
--output_prefix "${output_prefix}" \
--output_dir "/data100t1/home/wanying/CCHC/lipidomics/20231211_rerun/outputs/prediction_models/GradientBoosting/${lip_type}" \
--dosage_dir_train "/data100t1/home/wanying/CCHC/lipidomics/20231211_rerun/inputs/genotype_dosage/train/lipid_${lip_type}" \
--dosage_fn_train "lipid_${lip_type}_chr*.pval_0.001_maf_0.05.vcf.dosage.gz" \
--dosage_dir_test "/data100t1/home/wanying/CCHC/lipidomics/20231211_rerun/inputs/genotype_dosage/test/lipid_${lip_type}" \
--dosage_fn_test "lipid_${lip_type}_chr*.pval_0.001_maf_0.05.test.vcf.dosage.gz" \
--gwas_snp_fn "/data100t1/home/wanying/CCHC/lipidomics/20231211_rerun/outputs/fastGWA/lipid_${lip_type}_filter_by_pval_1e-07/${output_prefix}_SNPs.pval_1e-07.txt" \
--lipid_name "${lipid}" \
--trait_fn_train "/data100t1/home/wanying/CCHC/lipidomics/20231211_rerun/inputs/lipid_trait/lipid_${lip_type}_ID_matched.no_dup.residual.train.txt" \
--trait_fn_test "/data100t1/home/wanying/CCHC/lipidomics/20231211_rerun/inputs/lipid_trait/lipid_${lip_type}_ID_matched.no_dup.residual.test.txt" \
--multiallelic False \
--n_estimator 20,30,50,75,100,150,200,250,300,500 \
--learning_rate 0.002,0.004,0.006,0.008,0.01,0.02,0.04,0.06,0.08,0.1,0.2,0.4,0.5 \
--model Gradient

In [2]:
# Create cmds: one gradient boosting train-and-test command per lipid, written
# to a text file, one command per line.
lip_type = 'species'
threshold = 'pval_1e-05'
base_dir = '/data100t1/home/wanying/CCHC/lipidomics/20231211_rerun'
fn_lst_lipid = f'{base_dir}/inputs/lipid_trait/lipid_{lip_type}.list'
cmd_output_fn = f'cmd_GradientBoosting_train.{lip_type}.{threshold}.txt'

# Load all lipid names. Blank lines are skipped so they do not turn into
# commands with an empty lipid name.
with open(fn_lst_lipid) as fh:
    lst_lipids = [line.strip() for line in fh if line.strip()]

# The arguments are joined with single spaces into a one-line command.
# ${...} is left for the shell to expand; %(...)s is filled in by Python.
# Values are double-quoted because lipid names may contain shell glob
# characters (e.g. square brackets) and the dosage file patterns contain *.
# --output_dir and --gwas_snp_fn use ${lip_type} like every other path, so the
# command stays consistent if lip_type is changed.
cmd_template = ' '.join([
    'python ML_08v2_boosting_model_train_and_test.py',
    '--output_prefix "${output_prefix}"',
    '--output_dir "%(base_dir)s/outputs/prediction_models/GradientBoosting/${lip_type}"',
    '--dosage_dir_train "%(base_dir)s/inputs/genotype_dosage/train/lipid_${lip_type}"',
    '--dosage_fn_train "lipid_${lip_type}_chr*.pval_0.001_maf_0.05.vcf.dosage.gz"',
    '--dosage_dir_test "%(base_dir)s/inputs/genotype_dosage/test/lipid_${lip_type}"',
    '--dosage_fn_test "lipid_${lip_type}_chr*.pval_0.001_maf_0.05.test.vcf.dosage.gz"',
    '--gwas_snp_fn "%(base_dir)s/outputs/fastGWA/lipid_${lip_type}_filter_by_%(threshold)s/%(lipid_name)s_SNPs.%(threshold)s.txt"',
    '--lipid_name "${lipid}"',
    '--trait_fn_train "%(base_dir)s/inputs/lipid_trait/lipid_${lip_type}_ID_matched.no_dup.residual.train.txt"',
    '--trait_fn_test "%(base_dir)s/inputs/lipid_trait/lipid_${lip_type}_ID_matched.no_dup.residual.test.txt"',
    '--multiallelic False',
    # Comma-separated grids of hyperparameter values passed to the script
    '--n_estimator 20,30,50,75,100,150,200,250,300,500',
    '--learning_rate 0.002,0.004,0.006,0.008,0.01,0.02,0.04,0.06,0.08,0.1,0.2,0.4,0.5',
    '--model Gradient',
])

c = 0
# The context manager closes the command file even if the loop raises.
with open(cmd_output_fn, 'w') as fh_cmd:
    for lipid in lst_lipids:
        # File-name-safe version of the lipid name (used for prefixes and the SNP file name)
        lipid_name = lipid.replace('\\', '-').replace('/', '-').replace('(', '-').replace(')', '-').replace(' ', '_')
        # Shell variable assignments come first; the python call refers to them
        cmd = f'lip_type={lip_type};'
        cmd += f'lipid="{lipid}";'
        cmd += f'output_prefix="{lipid_name}";'
        cmd += cmd_template % {
            'base_dir': base_dir,
            'threshold': threshold,
            'lipid_name': lipid_name,
        }
        fh_cmd.write(cmd + '\n')
        c += 1
        print(f'\rProcess {c}', end='', flush=True)

Process 349