In [1]:
import pandas as pd
import datetime
import os
# Record the (local) date of this run in the notebook output
print('Last run', datetime.date.today().isoformat())

Last run 2024-01-03


# 1. Create a master file containing all filtered GWAS SNPs

In [2]:
def merge_gwas_snps(input_dir, input_suffix, output_fn, output_dir=None):
    '''
    Merge the SNP columns of all filtered fastGWA output files in a directory into
    one table with a single row per SNP, and save it as a tab-delimited file.

    :param input_dir: input directory with filtered fastGWA output files ("~" is expanded)
    :param input_suffix: suffix of input files; only file names ending with it are merged
    :param output_fn: file name of the merged output
    :param output_dir: directory to save the merged file in ("~" is expanded).
                       The merged file is saved in input_dir if None
    :return: merged DataFrame (duplicated SNPs dropped, sorted by CHR and POS)
    :raises ValueError: if no file in input_dir ends with input_suffix
    '''
    print()
    input_dir = os.path.expanduser(input_dir)
    output_dir = input_dir if output_dir is None else os.path.expanduser(output_dir)
    output_path = os.path.join(output_dir, output_fn)
    output_realpath = os.path.realpath(output_path)

    # - output_fn may itself end with input_suffix, so a merged file left by a previous
    #   run must never be read back as an input (whichever way output_dir was given)
    # - File names are sorted so that the row kept for a duplicated SNP does not depend
    #   on the arbitrary order returned by os.listdir
    lst_fns = [fn for fn in sorted(os.listdir(input_dir))
               if fn.endswith(input_suffix)
               and os.path.realpath(os.path.join(input_dir, fn)) != output_realpath]
    total = len(lst_fns)  # Total number of files to be processed
    if total == 0:
        raise ValueError(f'No input file ending with "{input_suffix}" found in {input_dir}')

    cols_to_save = ['CHR', 'SNP', 'POS', 'A1', 'A2', 'N', 'AF1']
    lst_dfs = []
    for c, fn in enumerate(lst_fns, start=1):
        lst_dfs.append(pd.read_csv(os.path.join(input_dir, fn), sep='\t')[cols_to_save])
        print(f'\r# File processed: {c}/{total}    ', flush=True, end='')

    print('\n# - Concat dfs and save to output')
    # Concatenate once at the end (cheaper than growing a DataFrame inside the loop);
    # the first occurrence of each SNP is kept
    df = pd.concat(lst_dfs).drop_duplicates(subset='SNP').sort_values(by=['CHR', 'POS'])
    df.to_csv(output_path, sep='\t', index=False)
    return df

In [8]:
# Merged SNPs of GWAS on lipid class, for step 2: subset dosage files
# Optional, but could speed up training process
# Only need to do this on filtered SNP file using p value threshold of 1e-3
# NOTE: variable names keep the historical "pval005" tag, but the files merged here were
# filtered at p < 1e-3 (see directory names and file suffixes), so the printed labels say 0.001
df_lipid_class_pval005 = merge_gwas_snps(input_dir='~/CCHC/lipidomics/20231211_rerun/outputs/fastGWA/lipid_class_filter_by_pval_1e-3',
                                                input_suffix='.pval_0.001.txt',
                                                output_fn='all_SNPs.pval_0.001.txt',
                                                output_dir=None)
print('# Filter by pval<0.001', df_lipid_class_pval005.shape)

df_lipid_class_pval005_maf001 = merge_gwas_snps(input_dir='~/CCHC/lipidomics/20231211_rerun/outputs/fastGWA/lipid_class_filter_by_pval_1e-3_MAF_1e-2',
                                                output_fn='all_SNPs.pval_0.001_maf_0.01.txt',
                                                input_suffix='.pval_0.001.maf_0.01.txt',
                                                output_dir=None)
print('# Filter by pval<0.001 and MAF>0.01', df_lipid_class_pval005_maf001.shape)

df_lipid_class_pval005_maf005 = merge_gwas_snps(input_dir='~/CCHC/lipidomics/20231211_rerun/outputs/fastGWA/lipid_class_filter_by_pval_1e-3_MAF_5e-2',
                                                output_fn='all_SNPs.pval_0.001_maf_0.05.txt',
                                                input_suffix='.pval_0.001.maf_0.05.txt',
                                                output_dir=None)
print('# Filter by pval<0.001 and MAF>0.05', df_lipid_class_pval005_maf005.shape)





# File processed: 49/49    
# - Concat dfs and save to output
# Filter by pval<0.05 (1400163, 7)

# File processed: 49/49    
# - Concat dfs and save to output
# Filter by pval<0.05 adn MAF>0.01 (280013, 7)

# File processed: 49/49    
# - Concat dfs and save to output
# Filter by pval<0.05 adn MAF>0.05 (193613, 7)


In [9]:
# Merged SNPs of GWAS on lipid species
# NOTE: variable names keep the historical "pval005" tag, but the files merged here were
# filtered at p < 1e-3 (see directory names and file suffixes), so the printed labels say 0.001
df_lipid_species_pval005 = merge_gwas_snps(input_dir='~/CCHC/lipidomics/20231211_rerun/outputs/fastGWA/lipid_species_filter_by_pval_1e-3',
                                                input_suffix='.pval_0.001.txt',
                                                output_fn='all_SNPs.pval_0.001.txt',
                                                output_dir=None)
print('# Filter by pval<0.001', df_lipid_species_pval005.shape)

df_lipid_species_pval005_maf001 = merge_gwas_snps(input_dir='~/CCHC/lipidomics/20231211_rerun/outputs/fastGWA/lipid_species_filter_by_pval_1e-3_MAF_1e-2',
                                                output_fn='all_SNPs.pval_0.001_maf_0.01.txt',
                                                input_suffix='.pval_0.001.maf_0.01.txt',
                                                output_dir=None)
print('# Filter by pval<0.001 and MAF>0.01', df_lipid_species_pval005_maf001.shape)

df_lipid_species_pval005_maf005 = merge_gwas_snps(input_dir='~/CCHC/lipidomics/20231211_rerun/outputs/fastGWA/lipid_species_filter_by_pval_1e-3_MAF_5e-2',
                                                output_fn='all_SNPs.pval_0.001_maf_0.05.txt',
                                                input_suffix='.pval_0.001.maf_0.05.txt',
                                                output_dir=None)
print('# Filter by pval<0.001 and MAF>0.05', df_lipid_species_pval005_maf005.shape)



# File processed: 830/830    
# - Concat dfs and save to output
# Filter by pval<0.05 (8925292, 7)

# File processed: 830/830    
# - Concat dfs and save to output
# Filter by pval<0.05 adn MAF>0.01 (1845793, 7)

# File processed: 830/830    
# - Concat dfs and save to output
# Filter by pval<0.05 adn MAF>0.05 (1263037, 7)


# 2. Get dosage from VCFs
(Based on the merged filtered SNP files from step 1. Multiallelic sites are skipped.) \
Run the same extraction for both the train and test VCFs.

In [1]:
%%bash
# Extract the merged GWAS SNPs from the training VCFs and save their dosages
# Modified from ~/CCHC/lipidomics/code/3-1_extract_snps_and_get_dosage.py
# Saved in ~/CCHC/lipidomics/20231211_rerun/code/ML_02_extract_snps_and_get_dosage.py
# Run in terminal
#
# File-name patterns containing "*" are quoted so the shell passes them to the script
# literally instead of expanding them against files in the current directory
# (presumably the script substitutes the chromosome number for "*" - confirm in ML_02)
# NOTE(review): output names carry a "maf_0.05" tag while the SNP list used is
# all_SNPs.pval_0.001.txt (no MAF filter in its name) - confirm the naming is intended

# Lipid class
python ML_02_extract_snps_and_get_dosage.py \
--input_dir /data100t1/home/wanying/CCHC/lipidomics/input_docs/lipidomic_sample_vcfs/202312_redo_training_vcfs \
--input_fn 'max_unrelated_set_chr*.vcf.gz' \
--output_dir /data100t1/home/wanying/CCHC/lipidomics/20231211_rerun/inputs/genotype_dosage/train/lipid_class \
--output_fn 'lipid_class_chr*.pval_0.001_maf_0.05.vcf' \
--result_dir /data100t1/home/wanying/CCHC/lipidomics/20231211_rerun/outputs/fastGWA/lipid_class_filter_by_pval_1e-3 \
--all_snps_fn all_SNPs.pval_0.001.txt \
--chr_range 1-22 # or something like --chr_range 3

# Lipid species
python ML_02_extract_snps_and_get_dosage.py \
--input_dir /data100t1/home/wanying/CCHC/lipidomics/input_docs/lipidomic_sample_vcfs/202312_redo_training_vcfs \
--input_fn 'max_unrelated_set_chr*.vcf.gz' \
--output_dir /data100t1/home/wanying/CCHC/lipidomics/20231211_rerun/inputs/genotype_dosage/train/lipid_species \
--output_fn 'lipid_species_chr*.pval_0.001_maf_0.05.vcf' \
--result_dir /data100t1/home/wanying/CCHC/lipidomics/20231211_rerun/outputs/fastGWA/lipid_species_filter_by_pval_1e-3 \
--all_snps_fn all_SNPs.pval_0.001.txt \
--chr_range 1-22


In [None]:
%%bash
# Create a tabix index for one dosage file (the .gz file must already be bgzip-compressed)
# -b / -e: start and end coordinates are both read from column 2; -f: overwrite an existing index
dosage_fn=lipid_species_chr9.pval_0.001_maf_0.05.vcf.dosage.gz
tabix -f -b 2 -e 2 "${dosage_fn}"

In [None]:
%%bash
# Extract the merged GWAS SNPs from the test VCFs and save their dosages
# For test vcfs
# /data100t1/home/wanying/CCHC/lipidomics/input_docs/lipidomic_sample_vcfs/202312_redo_test_vcfs/test_set_chr*.vcf.gz
#
# File-name patterns containing "*" are quoted so the shell passes them to the script
# literally instead of expanding them against files in the current directory
# NOTE(review): --result_dir / --all_snps_fn below use the "pval_1e-03" spelling, while the
# merge step and the training-set cell in this notebook use "lipid_*_filter_by_pval_1e-3"
# and "all_SNPs.pval_0.001.txt" - confirm these paths exist before running
# NOTE(review): the lipid species --output_dir ends with "/tmp", unlike lipid class - confirm

# Lipid class
python ML_02_extract_snps_and_get_dosage.py \
--input_dir /data100t1/home/wanying/CCHC/lipidomics/input_docs/lipidomic_sample_vcfs/202312_redo_test_vcfs \
--input_fn 'test_set_chr*.vcf.gz' \
--output_dir /data100t1/home/wanying/CCHC/lipidomics/20231211_rerun/inputs/genotype_dosage/test/lipid_class \
--output_fn 'lipid_class_chr*.pval_0.001_maf_0.05.test.vcf' \
--result_dir /data100t1/home/wanying/CCHC/lipidomics/20231211_rerun/outputs/fastGWA/lipid_class_filter_by_pval_1e-03 \
--all_snps_fn all_SNPs.pval_1e-03.txt \
--chr_range 1-22 # or something like --chr_range 3

# Lipid species
python ML_02_extract_snps_and_get_dosage.py \
--input_dir /data100t1/home/wanying/CCHC/lipidomics/input_docs/lipidomic_sample_vcfs/202312_redo_test_vcfs \
--input_fn 'test_set_chr*.vcf.gz' \
--output_dir /data100t1/home/wanying/CCHC/lipidomics/20231211_rerun/inputs/genotype_dosage/test/lipid_species/tmp \
--output_fn 'lipid_species_chr*.pval_0.001_maf_0.05.test.vcf' \
--result_dir /data100t1/home/wanying/CCHC/lipidomics/20231211_rerun/outputs/fastGWA/lipid_species_filter_by_pval_1e-03 \
--all_snps_fn all_SNPs.pval_1e-03.txt \
--chr_range 1-22

# 3. Format trait file

In [33]:
# Use residuals of lipidomics measures
# Rows are samples, columns are lipid traits
# Must have a column "genotype_ID" matching genotype IDs
# (presumably supplied by the ID mapping file - verify; the merge itself is keyed on 'RRID')
id_mapping_fn = '/data100t1/home/wanying/CCHC/doc/samples_IDs/20220916_IDs_genotyped_all.txt'
output_dir = '/data100t1/home/wanying/CCHC/lipidomics/20231211_rerun/inputs/lipid_trait'
df_id_mapping = pd.read_csv(id_mapping_fn, sep='\t')

for lip_type in ['class', 'species']:
    trait_fn = f'/data100t1/home/wanying/CCHC/lipidomics/input_docs/lipidomic_measures/lipid_{lip_type}_with_covar.no_dup.residual.train.txt'
    output_fn = os.path.join(output_dir, f'lipid_{lip_type}_ID_matched.no_dup.residual.train.txt')
    if os.path.isfile(output_fn):
        # Never overwrite an existing trait file; delete it manually to regenerate
        print(f'# File already exists. Skip saving: {output_fn}')
        continue
    # Only load the trait table when it is actually needed
    df_trait = pd.read_csv(trait_fn, sep='\t')
    # Inner merge: only samples present in both the ID mapping and the trait file are kept
    df_id_mapping.merge(df_trait, on='RRID').to_csv(output_fn, sep='\t', index=False)
    # Report success only after the file has been written
    print(f'# Lipid {lip_type} file saved: {output_fn}')

# Lipid class file saved: /data100t1/home/wanying/CCHC/lipidomics/20231211_rerun/inputs/lipid_trait/lipid_class_ID_matched.no_dup.residual.train.txt
# Lipid species file saved: /data100t1/home/wanying/CCHC/lipidomics/20231211_rerun/inputs/lipid_trait/lipid_species_ID_matched.no_dup.residual.train.txt


# 4. Create commands for model training
Train models

In [None]:
%%bash

# Example run for a single lipid
# Use ~/CCHC/lipidomics/20231211_rerun/code/ML_03_model_train.py
# Values that may contain "(", ")", spaces or "*" are quoted so the shell passes them
# through unchanged (same quoting of --lip_name as in the generated command files)
# NOTE(review): no --trait_fn is passed here, unlike the commands generated by
# write_cmds_model_train - confirm the script's default
lip_type=species
lipid="PC(44:5)"
OMP_NUM_THREADS=1 python ML_03_model_train.py \
--output_prefix test_run \
--output_dir "/data100t1/home/wanying/CCHC/lipidomics/20231211_rerun/outputs/prediction_models/elastic_net/${lip_type}" \
--dosage_dir "/data100t1/home/wanying/CCHC/lipidomics/20231211_rerun/inputs/genotype_dosage/train/lipid_${lip_type}" \
--dosage_fn "lipid_${lip_type}_chr*.pval_0.001_maf_0.05.vcf.dosage.gz" \
--gwas_snp_dir "/data100t1/home/wanying/CCHC/lipidomics/20231211_rerun/outputs/fastGWA/lipid_${lip_type}_filter_by_pval_1e-05" \
--gwas_snp_fn PC-44:5-_SNPs.pval_1e-05.txt \
--lip_name "${lipid}" \
--n_alphas 100 \
--multiallelic False \
--train True \
--reg_type elastic_net




In [2]:
def write_cmds_model_train(lipid_type = 'species', filter_threshold = 'pval_1e-07', output_fn_prefix='run_ML_model_train', lipid_fn=None):
    '''
    Write model training commands (ML_03_model_train.py) to a file, one command per line,
    one line per lipid. The file is saved as {output_fn_prefix}_{filter_threshold}.txt
    :param lipid_type: 'class' or 'species'; selects the lipid measure file and the
                       lipid-type part of every path in the command
    :param filter_threshold: SNP filter tag by p value (and MAF), such as 'pval_1e-07'
                             or 'pval_1e-07_maf_0.01'
    :param output_fn_prefix: prefix (may include a directory) of the command file
    :param lipid_fn: tab-delimited lipid measure file. Lipid names are taken from its header
                     line, skipping the first two columns. Use the default lipid_{lipid_type}.txt
                     under input_docs/lipidomic_measures if None
    :return: list of original (unmodified) lipid names found in the header
    '''
    # Load Lipid names of class or species
    if lipid_fn is None:
        lipid_fn = f'/data100t1/home/wanying/CCHC/lipidomics/input_docs/lipidomic_measures/lipid_{lipid_type}.txt'
    with open(lipid_fn) as fh:
        lst_lipid = fh.readline().strip().split('\t')[2:]
    print(f'# Number of lipid found in lipid measures ({lipid_type}):', len(lst_lipid))

    # Special characters in lipid names are replaced to get file-name friendly names
    special_chars = str.maketrans({'\\': '-', '/': '-', '(': '-', ')': '-', ' ': '_'})
    output_fn = f'{output_fn_prefix}_{filter_threshold}.txt'
    # Context manager closes the command file even if writing fails part way
    with open(output_fn, 'w') as fh:
        for lipid in lst_lipid:
            # Modify special characters
            lipid_name = lipid.translate(special_chars)
            # Backslash-newline inside the string literal is a line continuation,
            # so each command ends up on a single line of the output file
            cmd = f'''OMP_NUM_THREADS=1 python ML_03_model_train.py \
    --output_prefix {lipid_name}_{filter_threshold} \
    --output_dir /data100t1/home/wanying/CCHC/lipidomics/20231211_rerun/outputs/prediction_models/elastic_net/{lipid_type}/{filter_threshold} \
    --dosage_dir /data100t1/home/wanying/CCHC/lipidomics/20231211_rerun/inputs/genotype_dosage/train/lipid_{lipid_type} \
    --dosage_fn lipid_{lipid_type}_chr*.pval_0.001_maf_0.05.vcf.dosage.gz \
    --gwas_snp_dir /data100t1/home/wanying/CCHC/lipidomics/20231211_rerun/outputs/fastGWA/lipid_{lipid_type}_filter_by_{filter_threshold} \
    --gwas_snp_fn {lipid_name}_SNPs.{filter_threshold.replace('_maf','.maf')}.txt \
    --lip_name "{lipid}" \
    --trait_fn /data100t1/home/wanying/CCHC/lipidomics/20231211_rerun/inputs/lipid_trait/lipid_{lipid_type}_ID_matched.no_dup.residual.train.txt \
    --n_alphas 100 \
    --multiallelic False \
    --train True \
    --reg_type elastic_net'''
            fh.write(cmd+'\n')
    return lst_lipid


In [14]:
# One command file is written per lipid type and filtering threshold.
# Every p value tag is used twice: on its own, then with the '_maf_0.01' tag appended
lst_pval_tags = ['pval_1e-03', 'pval_1e-05', 'pval_1e-04',
                 'pval_1e-06', 'pval_1e-07', 'pval_5e-08']
lst_filter_thresholds = [tag
                         for pval_tag in lst_pval_tags
                         for tag in (pval_tag, pval_tag + '_maf_0.01')]

for lipid_type in ('species', 'class'):
    cmd_fn_prefix = f'run_ML_model_train_{lipid_type}'
    for filter_threshold in lst_filter_thresholds:
        # lst_lipid keeps the lipid names returned by the most recent call
        lst_lipid = write_cmds_model_train(lipid_type=lipid_type,
                                           filter_threshold=filter_threshold,
                                           output_fn_prefix=cmd_fn_prefix)

# Number of lipid found in lipid measures (species): 830
# Number of lipid found in lipid measures (species): 830
# Number of lipid found in lipid measures (species): 830
# Number of lipid found in lipid measures (species): 830
# Number of lipid found in lipid measures (species): 830
# Number of lipid found in lipid measures (species): 830
# Number of lipid found in lipid measures (species): 830
# Number of lipid found in lipid measures (species): 830
# Number of lipid found in lipid measures (species): 830
# Number of lipid found in lipid measures (species): 830
# Number of lipid found in lipid measures (species): 830
# Number of lipid found in lipid measures (species): 830
# Number of lipid found in lipid measures (class): 49
# Number of lipid found in lipid measures (class): 49
# Number of lipid found in lipid measures (class): 49
# Number of lipid found in lipid measures (class): 49
# Number of lipid found in lipid measures (class): 49
# Number of lipid found in lipid measures (cla

In [19]:
%%bash
# Output screen log to a file for future reference (Use SLURM if it is free)
# Example (meant to be typed in a terminal: screen opens an interactive session).
# Command files are written by write_cmds_model_train as
# run_ML_model_train_{lipid_type}_{filter_threshold}.txt, e.g. filter_threshold=pval_1e-05
screen_name=class_pval_1e-05; screen -L -Logfile "${screen_name}.log" -S "${screen_name}"
bash run_ML_model_train_class_pval_1e-05.txt


'/Users/wanying/CCHC/lipidomics/input_docs/lipidomic_measures/lipid_species.txt'

In [57]:
# For pval 1e-03, select 10+10 lipid traits with top and bottom heritability to save time
# (Training others later)

# Load most recent heritability estimation
heritability_fn = '/data100t1/home/wanying/CCHC/lipidomics/output/traininig_set_lipid_species_GWAS/heritability_by_GREML_species/heritability_all.txt'

# Sorted in ascending order of 'Variance'; the first column is used as the lipid name.
# NOTE(review): with an ascending sort, top_lipid_species holds the 10 smallest values and
# bottom_lipid_species the 10 largest - naming looks swapped, but only their union is used below
df_h2 = pd.read_csv(heritability_fn, sep='\t').sort_values(by='Variance')
top_lipid_species = df_h2.iloc[:10, 0]
bottom_lipid_species = df_h2.iloc[-10:, 0]
file_dir = '/data100t1/home/wanying/CCHC/lipidomics/20231211_rerun/code/cmd_ML_model_train/'
input_cmd_fn = os.path.join(file_dir, 'run_ML_model_train_species_pval_1e-03.txt')
output_fn = os.path.join(file_dir, 'run_ML_model_train_species_pval_1e-03.top10_bottom10.txt')
output_fn_leftover = os.path.join(file_dir, 'run_ML_model_train_species_pval_1e-03.leftover.txt')

# Command fragments identifying the selected lipids. A set is used so that a lipid present
# in both lists (heritability file with fewer than 20 rows) is written and counted only once
selected_prefixes = {'--output_prefix '+lip+'_pval_1e-03'
                     for lip in list(top_lipid_species)+list(bottom_lipid_species)}

c = 0  # Number of commands selected
# The input is opened first, so the output files are not truncated if the input is missing;
# all three files are closed even if an error occurs while splitting
with open(input_cmd_fn) as fh_input, \
     open(output_fn, 'w') as fh_output, \
     open(output_fn_leftover, 'w') as fh_output_leftover:
    for line in fh_input:
        if any(prefix in line for prefix in selected_prefixes):
            # Create commands for selected lipid species
            c += 1
            fh_output.write(line)
        else:
            # Write rest of commands to another file
            fh_output_leftover.write(line)
print(f'# {c} commands selected')

# 20 commands selected


# 5. Create commands to save model parameters to SQL database

In [6]:
# Write one ML_04_convert_coefficients_to_sql_database.py command per lipid type and
# filter threshold (each p value threshold with and without the '_maf_0.01' tag).
# $threshold inside a command is a shell variable, set at the start of that command line.
# NOTE(review): 'pval_5e-08' is among the thresholds used to create training commands in
# this notebook but is not listed here - confirm whether that is intended
# Context manager closes the command file even if an error occurs
with open('cmd_save_coefficients_to_SQL.txt', 'w') as output_fh:
    for lip_type in ['class', 'species']:
        for pval_threshold in ['pval_1e-03', 'pval_1e-04', 'pval_1e-05',
                          'pval_1e-06', 'pval_1e-07']:
            for threshold in [pval_threshold, pval_threshold+'_maf_0.01']:
                # Backslash-newline inside the string literal is a line continuation,
                # so each command ends up on a single line of the output file
                cmd = f'''threshold={threshold}; python  /data100t1/home/wanying/CCHC/lipidomics/20231211_rerun/code/ML_04_convert_coefficients_to_sql_database.py \
    --search_dir /data100t1/home/wanying/CCHC/lipidomics/20231211_rerun/outputs/prediction_models/elastic_net/{lip_type}/$threshold \
    --model_param_file_suffix elastic_net \
    --merged_model_param_dir /data100t1/home/wanying/CCHC/lipidomics/20231211_rerun/outputs/prediction_models/elastic_net/{lip_type}/$threshold \
    --merged_model_param_fn all_{lip_type}_merged_$threshold.txt \
    --output_dir /data100t1/home/wanying/CCHC/lipidomics/20231211_rerun/outputs/prediction_models/elastic_net/{lip_type}/merged_model_params \
    --output_fn_prefix train_coeff_{lip_type}_$threshold \
    --overwrite False'''
                output_fh.write(cmd+'\n')
