In [6]:
# Build predixcan style model for lipidomics

In [140]:
import logging
import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn.linear_model import LassoCV, RidgeCV, ElasticNetCV
import warnings
warnings.filterwarnings(action='ignore')

import os
import subprocess
import time

import matplotlib.pyplot as plt
import seaborn as sns
import gzip

import sys
sys.path.append('/data100t1/home/wanying/lab_code/utils/')
from extract_snps_for_large_vcf_v2_xopen import find_variants
from get_dosage_from_vcf import get_dosage

from multiprocessing import Pool
import argparse

import datetime
print('Last run:', datetime.datetime.now().strftime('%Y-%m-%d'))

Last run: 2023-08-22


In [5]:
import logging

logging.debug('This is a debug message')
logging.info('This is an info message')
logging.warning('This is a warning message')
logging.error('This is an error message')
logging.critical('This is a critical message')

ERROR:root:This is an error message
CRITICAL:root:This is a critical message


## 1. Subset genotype files (in vcf)
* Get SNPs with pval<1e-3 from lipidome-wide GWAS for each lipid class and lipid species

### 1.1 Extract SNPs based on GWAS pvalue, subset vcfs

#### 1.1.1 Extract SNPs based on GWAS pvalue, threshold=1e-3

In [5]:
# Already have code in 2-2_subest_SNPs_for_model_building.py
# Run it in terminal
# Lipid class
dir_lip_class_in = '/data100t1/home/wanying/CCHC/lipidomics/output/lip_class_GWAS/'
dir_lip_class_output = '/data100t1/home/wanying/CCHC/lipidomics/output/lip_class_GWAS_snps_pval_1e-3/'

# Lipid species
dir_lip_species_in = '/data100t1/home/wanying/CCHC/lipidomics/output/lip_species_GWAS/'
dir_lip_species_output = '/data100t1/home/wanying/CCHC/lipidomics/output/lip_species_GWAS_snps_pval_1e-3/'

cmd_file = 'vgi02_1.sh'
fh = open(cmd_file, 'w')
for lip_type in ['class', 'species']:
    if lip_type == 'class':
        dir_in, dir_out = dir_lip_class_in, dir_lip_class_output
    else:
        dir_in, dir_out = dir_lip_species_in, dir_lip_species_output
    
    for fn in os.listdir(dir_in):
        if fn.endswith('fastGWA'):
            lip = fn.split('.fastGWA')[0]
            cmd = '2-2_subest_SNPs_for_model_building.py'
            cmd = f'python ./{cmd} --input {dir_in+fn} --output {dir_out}'
            fh.write(cmd+'\n')
            # print(cmd)
fh.close()

#### 1.1.2 Combine SNPs from all lipid species and classes in to one master table (remove duplicates)

In [16]:
# Combine SNPs from all lipid species and classes in to one master table (remove duplicates)
# Then subset SNPs from vcf file

print('# Combine SNPs from all GWAS result (threshold used: pval<1e-3)')
for lip_type in ['class', 'species']:
    if lip_type == 'class':
        subset_dir = dir_lip_class_output # Extracted SNPs are saved in this directory
    else:
        subset_dir = dir_lip_species_output # Extracted SNPs are saved in this directory
    
    df_list = [] # Store dataframes for concatnation later
    for fn in os.listdir(subset_dir):
        df_list.append(pd.read_csv(subset_dir+fn, sep='\t'))
    df_merged = pd.concat(df_list).drop_duplicates(subset='SNP').sort_values(by=['CHR', 'POS'])
    
    output_fn = subset_dir + f'all_SNPs_combined_no_dup_{lip_type}.txt'
    if not os.path.isfile(output_fn):
        df_merged.to_csv(output_fn, sep='\t', index=False)
        print('# Merged file saved to:', output_fn)
    else:
        print('# File already exist, skip saving')
print('# DONE')

# Combine SNPs from all GWAS result (threshold used: pval<1e-3)
# Merged fiel saved to: /data100t1/home/wanying/CCHC/lipidomics/output/lip_class_GWAS_snps_pval_1e-3/all_SNPs_combined_no_dup_class.txt
# Merged fiel saved to: /data100t1/home/wanying/CCHC/lipidomics/output/lip_species_GWAS_snps_pval_1e-3/all_SNPs_combined_no_dup_species.txt
# DONE


#### 1.1.3 Subset all SNPs from vcf file

In [18]:
help(find_variants)

Help on function find_variants in module extract_snps_for_large_vcf_v2_xopen:

find_variants(lst_pos, output_fn, input_fn, input_col_name='POS', threads=1, verbose=False)
    This function takes in positions of a list of snp (no duplication),
    then checks the input genotype file and output found SNP into a output file
    Parameters:
        - lst_pos: positions (int) of snps. Must be array-like for iteration
        - output_fn: path and file name to output found SNPs
        - input_fn: path and file name of input genotype file. Must be a single chromosome
                    Assume all variants are sorted base on position in the input file
        - compression='gzip': compression type. Most genotype files are in .gz format, so use '.gzip' as default
        - input_col_name='POS': column name of position in input genotype file. Usually 'POS', could be 'current_pos'
        - threads: multi-threading
        - verbose: whether to print number of snps processed
    Returns:
      

In [20]:
# Subset all SNPs from vcf file
# import sys
# sys.path.append('/data100t1/home/wanying/lab_code/utils/')
# from extract_snps_for_large_vcf_v2_xopen import find_variants
# from get_dosage_from_vcf import get_dosage

# from multiprocessing import Pool

# Use unrealted samples in training and testing
# Save others for validation
input_dir = '/data100t1/home/wanying/CCHC/lipidomics/input_docs/lipidomic_sample_vcfs/training_max_unrelated_sampels_3rd_degree/'
output_dir = '/data100t1/home/wanying/CCHC/lipidomics/prediction_models/input_docs/subset_vcfs/train/'
lip_type = 'species'

# Load SNPs to be extracted
print('# Load SNPs to be extracted')
all_snps_fn = '/data100t1/home/wanying/CCHC/lipidomics/output/lip_species_GWAS_snps_pval_1e-3/all_SNPs_combined_no_dup_species.txt'
df_snps = pd.read_csv(all_snps_fn, sep='\t')

for chr_num, df in df_snps.groupby('CHR'):
    input_fn = f'max_unrelated_set_chr{chr_num}.vcf.gz'
    output_fn = f'{lip_type}_chr{chr_num}.vcf'
    find_variants(lst_pos = df['POS'],
                  output_fn = output_dir+output_fn,
                  input_fn = input_dir+input_fn,
                  input_col_name = 'POS',
                  threads = 8)


In [21]:
df_snps

Unnamed: 0,CHR,SNP,POS,A1,A2,N,AF1,BETA,SE,P
0,1,chr1:24963:GT:G,24963,G,GT,2096,0.005010,-0.718337,0.209048,0.000590
1,1,chr1:29187:AG:A,29187,A,AG,2096,0.000239,-3.181940,0.945625,0.000766
2,1,chr1:47647:G:T,47647,T,G,2096,0.000477,-2.070370,0.614724,0.000757
3,1,chr1:56439:T:C,56439,C,T,2096,0.000716,-1.805160,0.529122,0.000646
4,1,chr1:62046:A:G,62046,G,A,2096,0.000716,1.977740,0.580189,0.000653
...,...,...,...,...,...,...,...,...,...,...
9750613,22,chr22:50804600:C:G,50804600,G,C,2096,0.000239,-3.268530,0.945269,0.000545
9750614,22,chr22:50804608:TTCTAC:T,50804608,T,TTCTAC,2096,0.000239,-3.268530,0.945269,0.000545
9750615,22,chr22:50804826:A:G,50804826,G,A,2096,0.000477,2.327240,0.688984,0.000731
9750616,22,chr22:50804830:T:A,50804830,A,T,2096,0.000477,-2.315530,0.677886,0.000636


### 1.2 Subset training set

In [None]:
%%bash
# Run in terminal: ./subset_vcfs.sh 22
chr_num=$1
output_dir=/data100t1/home/wanying/CCHC/lipidomics/input_docs/lipidomic_sample_vcfs/training_max_unrelated_sampels_3rd_degree/
vcf=/data100t1/home/wanying/CCHC/lipidomics/input_docs/lipidomic_sample_vcfs/subset_chr${chr_num}.dose.vcf.gz
sample_list=/data100t1/home/wanying/CCHC/lipidomics/input_docs/lipidomic_sample_vcfs/training_max_unrelated_sampels_3rd_degree/genome_maximum_independent_set_PRIMUS_3rd_degree.txt
output=${output_dir}max_unrelated_set_chr${chr_num}.vcf

# Use this to exclude samples: bcftools view -o ${output} -S ^${sample_list} ${vcf}
echo "#Run bcftools on chr${chr_num}"
bcftools view -o ${output} -S ${sample_list} ${vcf}

echo "#bgzip output"
bgzip ${output}

### 1.3 Subset validation set

In [None]:
%%bash
chr_num=$1
output_dir=/data100t1/home/wanying/CCHC/lipidomics/input_docs/lipidomic_sample_vcfs/validation/
vcf=/data100t1/home/wanying/CCHC/lipidomics/input_docs/lipidomic_sample_vcfs/subset_chr${chr_num}.dose.vcf.gz
sample_list=/data100t1/home/wanying/CCHC/lipidomics/input_docs/lipidomic_sample_vcfs/training_max_unrelated_sampels_3rd_degree/genome_maximum_independent_set_PRIMUS_3rd_degree.txt
output=${output_dir}validataion_set_chr${chr_num}.vcf

# Print chromosome, position, ref allele, the first alternate allele, AF field of INFO column and genotype
# bcftools query -f '%CHROM\t%POS\t%REF\t%ALT{0}\t%INFO/AF\t%FORMAT\n' -S ${sample_list} -o ${output} ${vcf}
echo "#Run bcftools on chr${chr_num}"
bcftools view -o ${output} -S ^${sample_list} ${vcf}

echo "#bgzip output"
bgzip ${output}

In [11]:
# screen -dmS chr22; screen -S chr22 -X stuff "plink2 --vcf ../subset_chr22.dose.vcf.gz 'dosage'=DS --out subset_chr22;exit\n"
for i in range(1, 23):
    cmd = f'./subset_vcfs.sh {i}'
    cmd = f'screen -dmS chr{i}; screen -S chr{i} -X stuff "{cmd};exit\\n"'
    print(cmd)

screen -dmS chr1; screen -S chr1 -X stuff "./subset_vcfs.sh 1;exit\n"
screen -dmS chr2; screen -S chr2 -X stuff "./subset_vcfs.sh 2;exit\n"
screen -dmS chr3; screen -S chr3 -X stuff "./subset_vcfs.sh 3;exit\n"
screen -dmS chr4; screen -S chr4 -X stuff "./subset_vcfs.sh 4;exit\n"
screen -dmS chr5; screen -S chr5 -X stuff "./subset_vcfs.sh 5;exit\n"
screen -dmS chr6; screen -S chr6 -X stuff "./subset_vcfs.sh 6;exit\n"
screen -dmS chr7; screen -S chr7 -X stuff "./subset_vcfs.sh 7;exit\n"
screen -dmS chr8; screen -S chr8 -X stuff "./subset_vcfs.sh 8;exit\n"
screen -dmS chr9; screen -S chr9 -X stuff "./subset_vcfs.sh 9;exit\n"
screen -dmS chr10; screen -S chr10 -X stuff "./subset_vcfs.sh 10;exit\n"
screen -dmS chr11; screen -S chr11 -X stuff "./subset_vcfs.sh 11;exit\n"
screen -dmS chr12; screen -S chr12 -X stuff "./subset_vcfs.sh 12;exit\n"
screen -dmS chr13; screen -S chr13 -X stuff "./subset_vcfs.sh 13;exit\n"
screen -dmS chr14; screen -S chr14 -X stuff "./subset_vcfs.sh 14;exit\n"
scree

## 2. Compare different models
1. Models to try
    * OLS regression
    * Lasso regression
    * Ridge regression
    * Elastic-net regression
2. Use unrelated set
    * Primus max unrelated set (3rd degree relatives): N=1608
        * ```/data100t1/home/wanying/CCHC/lipidomics/input_docs/primus/primus_rel_3/plink.genome_maximum_independent_set_PRIMUS```
    * This is about 70% of all lipidomic samples with genotype data (perfect!)

### 2.1 Extract SNPs based on GWAS pval

In [45]:
import sys
sys.path.append('/data100t1/home/wanying/lab_code/utils/')
from extract_snps_for_large_vcf_v2_xopen import find_variants
from get_dosage_from_vcf import get_dosage

from multiprocessing import Pool


In [26]:
# Try result of the simple model: trait ~ sex + age + snp + PC1-5 + grm
result_dir = '/data100t1/home/wanying/CCHC/lipidomics/output/lip_species_GWAS_noadj_BMI_AGE2_snps_pval_1e-5/'
output_dir = '/data100t1/home/wanying/CCHC/lipidomics/output/prediction_models/elastic_net/'

# Load SNPs with pval<1e-5
for fn in os.listdir(result_dir):
    if fn.endswith('.txt'):
        lipid = fn.split('_suggestive_sig_SNPs.txt')[0]
        df_snps = pd.read_csv(result_dir+fn, sep='\t')
        # print(fn, lipid)
        # display(df_snps.head())
        
        # Extract SNPs from VCF
        
        break



PI-18:1_18:2-_suggestive_sig_SNPs.txt PI-18:1_18:2-


In [27]:
input_dir = '/data100t1/home/wanying/CCHC/lipidomics/input_docs/lipidomic_sample_vcfs/training_max_unrelated_sampels_3rd_degree/'
output_dir = '/data100t1/home/wanying/CCHC/lipidomics/output/prediction_models/elastic_net/'

for chr_num, df in df_snps.groupby('CHR'):
    input_fn = f'max_unrelated_set_chr{chr_num}.vcf.gz'
    output_fn = f'{lipid}_chr{chr_num}.vcf'
    find_variants(lst_pos = df_snps['POS'],
                  output_fn = output_dir+output_fn,
                  input_fn = input_dir+input_fn,
                  input_col_name = 'POS',
                  threads = 8)

In [None]:
# Validation set
input_dir = '/data100t1/home/wanying/CCHC/lipidomics/input_docs/lipidomic_sample_vcfs/validation/'
output_dir = '/data100t1/home/wanying/CCHC/lipidomics/output/prediction_models/elastic_net/validation/'

for chr_num, df in df_snps.groupby('CHR'):
    print(f'# Processing chr{chr_num}')
    input_fn = f'validataion_set_chr{chr_num}.vcf.gz'
    output_fn = f'validation_{lipid}_chr{chr_num}.vcf'
    # find_variants(lst_pos = df_snps['POS'],
    #               output_fn = output_dir+output_fn,
    #               input_fn = input_dir+input_fn,
    #               input_col_name = 'POS',
    #               threads = 8)
    with Pool(30) as p:
        p.starmap(find_variants,
                  [(df_snps['POS'], output_dir+output_fn, input_dir+input_fn, 'POS', 8)])

In [38]:
%%bash

# Training data: Run lipid species with the highest heritability
result_dir=/data100t1/home/wanying/CCHC/lipidomics/output/lip_species_GWAS_noadj_BMI_AGE2_snps_pval_1e-5/
input_dir=/data100t1/home/wanying/CCHC/lipidomics/input_docs/lipidomic_sample_vcfs/training_max_unrelated_sampels_3rd_degree/
output_dir=/data100t1/home/wanying/CCHC/lipidomics/output/prediction_models/elastic_net/training/

python 3-1_extract_snps.py \
--input_dir ${input_dir} \
--output_dir ${output_dir} \
--result_dir ${result_dir} \
--gwas_summary CE-20:4-_suggestive_sig_SNPs.txt


# Validation: Run lipid species with the highest heritability
# for fn in *; do fnew=`echo validataion_set_chr5.vcf.gz | sed s/validataion/validation/g`; mv ${fn} ${fnew}; done
result_dir=/data100t1/home/wanying/CCHC/lipidomics/output/lip_species_GWAS_noadj_BMI_AGE2_snps_pval_1e-5/
input_dir=/data100t1/home/wanying/CCHC/lipidomics/input_docs/lipidomic_sample_vcfs/validation/
output_dir=/data100t1/home/wanying/CCHC/lipidomics/output/prediction_models/elastic_net/validation/

python 3-1_extract_snps.py \
--input_dir ${input_dir} \
--output_dir ${output_dir} \
--result_dir ${result_dir} \
--gwas_summary CE-20:4-_suggestive_sig_SNPs.txt \
--type validation

Process is terminated.


In [183]:
# Merge extracted vcfs and get dosage

lipid = 'CE-20:4-' #training_CE-20:4-_chr1.vcf
data_type = 'training'
input_dir = f'/data100t1/home/wanying/CCHC/lipidomics/output/prediction_models/elastic_net/{data_type}/'
output_fn = f'{data_type}_{lipid}_merged.vcf'

print(f'# {lipid}')
fh_output = open(input_dir+output_fn, 'w')
# Loop through all 22 chromosomes
for i in range(1, 23):
    fn = f'{data_type}_CE-20:4-_chr{i}.vcf'
    try:
        if i == 1: # Write header of the first file only
            with open(input_dir+fn) as fh:
                line = fh.readline().strip()
                while line != '':
                    fh_output.write(line+'\n')
                    line = fh.readline().strip()
        else:
            with open(input_dir+fn) as fh:
                line = fh.readline() # Skip the headerline
                line = fh.readline().strip()
                while line != '':
                    fh_output.write(line+'\n')
                    line = fh.readline().strip()
        print(f'# - chr{i} processed: {fn}')
    except:
        print(f'# - chr{i} not found: {fn}')

get_dosage(vcf_fn = input_dir+output_fn)               

# CE-20:4-
# - chr1 processed: training_CE-20:4-_chr1.vcf
# - chr2 processed: training_CE-20:4-_chr2.vcf
# - chr3 processed: training_CE-20:4-_chr3.vcf
# - chr4 processed: training_CE-20:4-_chr4.vcf
# - chr5 processed: training_CE-20:4-_chr5.vcf
# - chr6 processed: training_CE-20:4-_chr6.vcf
# - chr7 processed: training_CE-20:4-_chr7.vcf
# - chr8 processed: training_CE-20:4-_chr8.vcf
# - chr9 processed: training_CE-20:4-_chr9.vcf
# - chr10 processed: training_CE-20:4-_chr10.vcf
# - chr11 processed: training_CE-20:4-_chr11.vcf
# - chr12 processed: training_CE-20:4-_chr12.vcf
# - chr13 processed: training_CE-20:4-_chr13.vcf
# - chr14 processed: training_CE-20:4-_chr14.vcf
# - chr15 processed: training_CE-20:4-_chr15.vcf
# - chr16 processed: training_CE-20:4-_chr16.vcf
# - chr17 processed: training_CE-20:4-_chr17.vcf
# - chr18 processed: training_CE-20:4-_chr18.vcf
# - chr19 processed: training_CE-20:4-_chr19.vcf
# - chr20 processed: training_CE-20:4-_chr20.vcf
# - chr21 not found: traini

### 2.2 Elastic-net

In [129]:
df_lipid

Sample ID,lipid,HD0280_HA0023,BD2180_BD6180,BD2179_BD6179,BD2287_BD6287,HD0119_HD4119,HD0107_HD4107,LD0233_LA0022,BD3539_BA0523,LD0175_LD4175,...,HD0275_HA0018,BD1533_BD5533,BD2455_BD6455,LD0144_LD4144,BD2188_BD6188,BD2606_BD6606,LD0082_LD4082,BD2833_BD6833,BD3346_BA0352,HD0145_HD4145
0,Sph(d18:1),157.839923,263.122091,169.087489,171.992473,132.140002,140.338830,90.215133,470.804867,652.526550,...,273.145074,121.444814,186.087185,336.087726,127.326968,127.083828,632.444120,128.717520,304.806129,503.020867
1,Sph(d18:2),38.057042,41.226980,36.753953,32.771967,27.904479,40.144444,23.745802,103.808167,100.265888,...,53.986133,30.328217,40.521517,50.986497,36.226206,25.625813,79.372982,37.149054,57.534402,70.629189
2,S1P(d16:1),129.048022,127.595188,119.406130,86.553900,93.834349,140.296858,88.280625,109.830013,84.220941,...,65.755553,124.726406,145.453005,61.373302,143.826046,80.213625,104.652760,108.464876,104.190869,119.446256
3,S1P(d18:0),129.355676,158.003578,190.428130,154.601178,126.894236,132.281778,67.173566,510.893403,242.122785,...,159.889000,188.297466,165.526335,137.284780,152.314033,204.771540,181.381054,167.388456,138.916134,265.700549
4,S1P(d18:1),1385.577691,1281.638948,1475.482007,1206.860945,1066.719195,1618.954754,465.880094,3219.669200,1879.642643,...,1270.746682,1599.296006,1506.776150,1142.908858,1133.101997,1260.876298,1632.869747,1275.891640,1498.270059,2035.831872
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
825,LPC(20:4) [+OH],45.860720,57.090893,35.615850,121.866685,35.452389,48.912804,6.706640,166.893123,47.863177,...,48.239504,9.814752,69.174927,26.947807,14.141666,31.865821,132.663735,76.734582,17.204772,130.261278
826,LPC(22:6) [+OH],16.668964,21.328238,12.148890,43.793932,13.208831,10.854816,2.721076,23.974130,20.774097,...,11.931727,3.408203,18.042996,8.689440,14.075836,8.480297,52.439842,21.261516,5.241465,45.025341
827,PC(34:2) [+OH],16.584685,10.968600,9.050296,13.775098,10.801005,12.541043,7.582815,13.052273,11.514862,...,51.455901,11.104793,12.857234,15.708312,17.821288,10.334228,11.291571,13.356824,14.649446,28.002036
828,PC(36:4) [+OH],127.326292,106.593407,84.277667,55.437042,102.441169,67.763868,104.611699,204.542939,82.851991,...,73.747121,88.404576,104.186252,98.713650,91.416678,89.426125,60.770283,139.434843,68.742367,84.684274


In [184]:
# Load data
fn_dosage = output_fn + '.dosage'
df_dosage = pd.read_csv(input_dir+fn_dosage, sep='\t')
print('#Load dosage from merged vcf', len(df_dosage))
display(df_dosage.head(2))

# Map lipidomic IDs (LABID) to genotype IDs
fn_id_mapping = '/data100t1/home/wanying/CCHC/doc/samples_IDs/202211_merged_RNA_lipid_protein_genotype_mapping_and_availability.txt'
df_id_mapping = pd.read_csv(fn_id_mapping, sep='\t')
df_id_mapping.dropna(subset=['lipidomic', 'genotype_ID'], inplace=True)
print('#Load id mapping', len(df_id_mapping))
display(df_id_mapping.head(2))

# Load lipid traits
fn_lipid = '/data100t1/home/wanying/CCHC/lipidomics/input_docs/lipidomic_measures/lipid_species.txt'
df_lipid = pd.read_csv(fn_lipid, sep='\t').drop(columns='MS Label').drop_duplicates(subset='Sample ID')
df_lipid = df_lipid.set_index(keys='Sample ID').T.reset_index().rename(columns={'index':'lipid'})
dict_id_mapping = df_id_mapping[['lipidomic', 'genotype_ID']].set_index(keys='lipidomic').to_dict()
df_lipid.rename(columns=dict_id_mapping['genotype_ID'], inplace=True)
# There are duplicate sample IDs after mapped to genotype IDs
# Remove duplicate IDs
df_lipid = df_lipid.T.reset_index().drop_duplicates(subset='Sample ID').set_index(keys='Sample ID').T
# Change order of samples to match dosage file
df_lipid = df_lipid[['lipid'] + list(df_dosage.columns[9:])].copy()
print('#Load lipid traits', len(df_lipid))
display(df_lipid.head(2))




#Load dosage from merged vcf 9328


Unnamed: 0,#CHROM,POS,ID,REF,ALT,QUAL,FILTER,INFO,FORMAT,HD0280_HA0023,...,HD0275_HA0018,BD1533_BD5533,BD2455_BD6455,LD0144_LD4144,BD2188_BD6188,BD2606_BD6606,LD0082_LD4082,BD2833_BD6833,BD3346_BA0352,HD0145_HD4145
0,chr1,4620446,chr1:4620446:C:T,C,T,.,PASS,AF=0.0001;MAF=0.0001;R2=1;IMPUTED;AC=0;AN=3214,GT:DS:HDS:GP,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,chr1,29003121,chr1:29003121:G:A,G,A,.,PASS,AF=0.00231;MAF=0.00231;R2=0.87398;IMPUTED;AC=5...,GT:DS:HDS:GP,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#Load id mapping 2365


Unnamed: 0,RRID,LABID,genotype_ID,RNAseq,lipidomic,proteomic
9,BD0009,10Y0308,BD0009_BD4009,10Y0308,10Y0308,
11,BD0010,10Y0021,BD0010_BD4010,10Y0021,10Y0021,


#Load lipid traits 830


Sample ID,lipid,HD0280_HA0023,BD2180_BD6180,BD2179_BD6179,BD2287_BD6287,HD0119_HD4119,HD0107_HD4107,LD0233_LA0022,BD3539_BA0523,LD0175_LD4175,...,HD0275_HA0018,BD1533_BD5533,BD2455_BD6455,LD0144_LD4144,BD2188_BD6188,BD2606_BD6606,LD0082_LD4082,BD2833_BD6833,BD3346_BA0352,HD0145_HD4145
0,Sph(d18:1),157.839923,263.122091,169.087489,171.992473,132.140002,140.33883,90.215133,470.804867,652.52655,...,273.145074,121.444814,186.087185,336.087726,127.326968,127.083828,632.44412,128.71752,304.806129,503.020867
1,Sph(d18:2),38.057042,41.22698,36.753953,32.771967,27.904479,40.144444,23.745802,103.808167,100.265888,...,53.986133,30.328217,40.521517,50.986497,36.226206,25.625813,79.372982,37.149054,57.534402,70.629189


In [233]:
# import warnings
# warnings.filterwarnings("ignore")

# Fit model
lip = 'Sph(d18:1)'
X, y = df_dosage.iloc[:, 9:].values, df_lipid[df_lipid['lipid']==lip].iloc[:, 1:].values
regr_eln = ElasticNetCV(cv=5, random_state=0, max_iter=1e6, n_jobs=32)
regr_eln.fit(X.reshape(1607, -1), y.reshape(-1, 1)) # X.reshape(1607, -1), y.reshape(-1, 1)

print(regr_eln.alpha_)
print(regr_eln.intercept_)
# print(regr.predict([[0, 0]]))

  return f(*args, **kwargs)


22.180399857720683
238.42521341986784


### 2.3 Lasso

In [None]:
regr_lasso = LassoCV(cv=5, random_state=0, max_iter=1e6, n_jobs=64) # LassoCV(cv=5, random_state=0)
regr_lasso.fit(X.reshape(-1, len(df_dosage)), y.reshape(df_lipid.shape[1]-1))

print(regr_lasso.alpha_)
print(regr_lasso.intercept_)

In [230]:
from sklearn import linear_model
reg = linear_model.Lasso(alpha=0.5, max_iter=1000000)
reg.fit(X.reshape(1607, -1), y.reshape(-1, 1))


Lasso(alpha=0.5, max_iter=1000000)

In [225]:
for i in reg.coef_:
    print(i)

-0.0
-2.1310495053844747
7.022481147922106
0.0
0.0
-0.0
-0.0
0.0
0.0
0.0
-0.0
0.0
-0.0
0.0
-0.0
0.0
5.537004432419987
0.0
0.0
-0.0
-7.212200798896612
-0.0
-0.0
-0.0
0.0
0.0
-0.0
-0.0
3.246549205846249
-0.0
-0.0
-0.0
-0.0
-0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
2.0587868071788713
0.0
-0.0
0.0
-0.0
0.0
0.0
0.0
-0.0
-0.0
0.0
-0.0
0.0
-0.0
0.0
0.0
0.0
-0.0
0.0
0.0
5.966049988186495
-12.473554749347661
-0.0
-0.0
0.0
0.0
0.0
6.350220224705828
-0.0
12.604419021311806
-0.0
0.0
0.0
0.0
-0.0
-0.0
0.0
-0.0
-0.0
0.0
-1.3381066093837701
0.0
5.89780657224057
0.0
0.0
8.87806836399264
0.0
-0.0
1.99224660816587
0.0
0.0
0.0
0.0
0.0
0.0
0.0
-0.0
-0.0
-0.0
-0.0
-0.0
-0.0
-0.0
0.0
-0.0
-0.0
7.595671620764921
-0.0
0.0
-0.0
0.0
-0.0
-0.0
-13.338926864087428
-0.0
-0.0
0.0
-0.0
0.0
0.0
-0.0
-0.0
-0.0
-2.6919376849356236
-0.0
-0.0
-0.0
0.0
-0.0
-0.0
10.902340732129327
-0.0
-0.0
0.0
-0.0
0.0
0.594343230028635
-0.0
-0.0
-0.0
-0.0
0.0
0.0
-0.0
0.0
0.0
-0.0
-5.853315153387324
0.0
-0.0
0.0
-19.806747786575198
0.0
-1.12

### 2.3 Ridge

In [None]:
regr_ridge = RidgeCV(cv=5)
# clf = RidgeCV(alphas=[1e-3, 1e-2, 1e-1, 1]).fit(X, y)
regr_ridge.fit(X.reshape(-1, len(df_dosage)), y.reshape(df_lipid.shape[1]-1))

print(regr_ridge.alpha_)
print(regr_ridge.intercept_)

## 3. (2023/08/20) Redo model trainig with tabix

### 3.1 Create a genralized code to do elastic net, ridge, lasso and OLS regression

In [91]:
# Print progress par in console
# - progress: current progress (number of SNPs processed)
# - total: total number of SNPs needs to be processed
def progress_bar(progress, total):
    percent = 100 * (progress/total)
    bar = '=' * int(percent) + '-' * int(100 - percent)
    print(f'|{bar}| {percent:.2f}%', end='\r')

def get_doasge(dosage_fn, lst_snp_pos):
    '''
    Use tabix to find dosage of SNPs from a single chromosome
    Param:
     - dosage_fn: name of dosage file to be checked against (single chromosome)
     - lst_snp_pos: start and end position to search for a list of SNP. Such as 1:10000-10001 or chr1:10000-10001.
                Format needs to match chr/pos format in the dosage file
    Return:
     - sample_ids: IDs of genoytpe samples
     - snp_lst: a list of SNPs used in model (ie. SNPs that are found in dosage files)
     - dosage_matrix: dosage of given SNPs as a numpy array. Fill with NA if a SNP is not found
    '''

    # Get genotype sample IDs from dosage file
    with gzip.open(dosage_fn, 'rt') as fh:
        line = fh.readline().strip() # Take sample IDs from header line
        tmp = line.split()
        indx_dosage = tmp.index('FORMAT') + 1 # Get index of sample IDs and dosage values
        sample_ids = tmp[indx_dosage:] # Genotype IDs

    snp_lst = []
    dosage_matrix = []
    count = 0  # Track number of SNPs checked
    for snp_pos in lst_snp_pos:
        tabix_cmd = f'tabix {dosage_fn} {snp_pos}'
        return_vals = subprocess.run(tabix_cmd.split(), capture_output=True, text=True).stdout.strip().split('\n')
        
        # Drop multiallelic SNPs if args.multiallelic is False
        # If more than one SNPs were found in VCF, ignore this multiallelic site
        if not args.multiallelic:
            if len(return_vals)>1: continue
        
        try:
            chr_num, pos, snp_id, ref, alt, _, _, _, _, dosage = return_vals[0].split(maxsplit=9)
        except:
            # If multiallelic sites have been already removed from dosage files, tabix call will return empty
            continue

        dosage_matrix.append([float(x) for x in dosage.split()])
        snp_lst.append(snp_id)
        count += 1

        if count%20==0:
            progress_bar(progress=count, total=len(lst_snp_pos))
    print(f'\n\t# {count} SNPs processed')

    return sample_ids, snp_lst, np.array(dosage_matrix).reshape(-1, len(sample_ids))

# Load dosage of all filtered SNPs by given p value threshold from GWAS
def load_all_dosage(gwas_snp_fn: str,
                    gwas_snp_dir: str,
                    dosage_dir: str,
                    dosage_fn: str,
                    multiallelic):
    '''
    Get dosage of SNPs from single-chrosmosome dosage files of a given lipid
    Params:
        - gwas_snp_dir: directory to GWAS SNPs (already filtered by p value threshold)
        - gwas_snp_fn: file name of GWAS SNPs (already filtered by p value threshold)
        - dosage_dir: Directory to subsetted dosage file
        - dosage_fn: file name of subset dosage files (by chromosome).
                    Replace chromosome number with '*', such as 'species_chr*.vcf.gz.dosage'
        -multiallelic: drop multiallelic sites if False
    Return:
        - start_time: start time of loading dosage
        - snp_lst: a list of snps loaded
        - dosage_all: A numpy array of dosage. Each row is a SNP, each column is a subject
    '''
    print('# Processing lipid:', args.lip_name)

    # print(f'# Load GWAS SNPs for current lipid')
    df_gwas_snp = pd.read_csv(os.path.join(gwas_snp_dir, gwas_snp_fn), sep='\t').sort_values(by=['CHR', 'POS'])
    # Create regions to lookup usning tabix
    df_gwas_snp['REGION'] = 'chr' + df_gwas_snp['CHR'].astype('str') + ':' + df_gwas_snp['POS'].astype('str') + '-' + df_gwas_snp['POS'].astype('str')

    print('\n# Get dosage of GWAS SNPs to include in regression model')
    print('# - Checking by chromosome:')

    dosage_all = '' # A numpy array to store dosage from all chromosome
    start_time = time.time() # Time execution time
    all_snp_lst = [] # Track what SNPs have been loaded
    for chr_num, df in df_gwas_snp.groupby(by='CHR'):
        if chr_num==22:
        
            # dosage_fn = f'species_chr{chr_num}.vcf.gz.dosage'
            print(f'#  chr{chr_num}')
            sample_ids, snp_lst, dosage_matrix = get_doasge(os.path.join(dosage_dir, dosage_fn.replace('*', str(chr_num))),
                                                            list(df['REGION']))
            # lst_df_dosage.append(pd.DataFrame(data=dosage_matrix, columns=sample_ids, index=df['POS']))
            all_snp_lst += snp_lst
            if len(dosage_all) == 0: # if dosage array is empty
                dosage_all = dosage_matrix
            else:
                dosage_all = np.append(dosage_all, dosage_matrix, axis=0)


            break
        
        
        
    end_time = time.time()
    print(f'# - Checking finished in {(end_time-start_time):.4f}s')
    print('-' * 50)
    return start_time, all_snp_lst, dosage_all.astype('float64')

In [71]:
parser = argparse.ArgumentParser(description='Fit regression model of choice (elastic net, ridge, lasso or OLS)')
parser.add_argument('-o', '--output_prefix', type=str,
                           help='Output file to save alpha, l1_ratio and coefficients of chosen model')
parser.add_argument('--output_dir', type=str, help='Output directory. Default is current directory', default='.')
parser.add_argument('--dosage_dir', type=str, help='Derictory to dosage files',
                    default='/data100t1/home/wanying/CCHC/lipidomics/prediction_models/input_docs/subset_vcfs/train')
parser.add_argument('--dosage_fn', type=str, help='File name format of dosage files. Use * to replace chromosome number',
                    default='species_chr*.vcf.gz.dosage')
parser.add_argument('--gwas_snp_dir', type=str, help='Directory to filtered GWAS SNPs (eg. GWAs SNPs with pval<1e-3)',
                    default='/data100t1/home/wanying/CCHC/lipidomics/output/traininig_set_lipid_species_GWAS/adj_for_sex_age_pval_1e-3')
parser.add_argument('--gwas_snp_fn', type=str, help='File name of the filtered GWAS SNPs (eg. GWAs SNPs with pval<1e-3)',
                    default='AC-10:0-_SNPs_pval_0.001.txt')
parser.add_argument('--lip_name', type=str,
                    help='Name of the lipid to be processed')
parser.add_argument('--n_alphas', type=int, default=100,
                    help='Define how many alphas to test in CV. Dafault is 10. JTI used 100 as defined in R glmnet()')
parser.add_argument('--multiallelic', type=str, default='False',
                    help='If false, multiallelic SNPs will be removed from model fitting')
parser.add_argument('--train', type=str, default='True',
                    help='If true, will not fill with NaN if a SNP is not found. Missing values will cause errors')
parser.add_argument('--reg_type', type=str, default='elastic_net', choices=['elastic_net', 'ridge', 'lasso', 'ols'],
                    help="Type of regression. Choose from: 'elastic_net', 'ridge', 'lasso' and 'ols'")
parser.add_argument('--lipidomis_fn', type=str,
                    default='/data100t1/home/wanying/CCHC/lipidomics/prediction_models/input_docs/lipid_traits_residuals/train/lipid_species_residuals_adj_for_sex_age_pc1-5.txt.reformatted',
                    help='Path and name of the lipidomics data to be used in training. Assume values are already transformed or normalized')



_StoreAction(option_strings=['--lipidomis_fn'], dest='lipidomis_fn', nargs=None, const=None, default='/data100t1/home/wanying/CCHC/lipidomics/prediction_models/input_docs/lipid_traits_residuals/train/lipid_species_residuals_adj_for_sex_age_pc1-5.txt.reformatted', type=<class 'str'>, choices=None, help='Path and name of the lipidomics data to be used in training. Assume values are already transformed or normalized', metavar=None)

In [123]:
terminal_cmds = '''
--output_prefix output.txt \
--output_dir /data100t1/home/wanying/CCHC/lipidomics/prediction_models/elastic_net/training/model_params \
--dosage_dir /data100t1/home/wanying/CCHC/lipidomics/prediction_models/input_docs/subset_vcfs/train/bgziped_dosage \
--dosage_fn species_chr*.vcf.dosage.gz \
--gwas_snp_dir /data100t1/home/wanying/CCHC/lipidomics/output/traininig_set_lipid_species_GWAS/adj_for_sex_age_pval_1e-3 \
--gwas_snp_fn AC-10:0-_SNPs_pval_0.001.txt \
--lip_name AC(10:0) \
--reg_type elastic_net \
--n_alphas 100 \
--multiallelic False \
--train True
'''

args = parser.parse_args(terminal_cmds.split())


args.output_prefix = f"{args.output_prefix}.{datetime.datetime.now().strftime('%Y%m%d_%H:%M:%S')}"
# Check if files exist
for i in range(1, 22):
    if not os.path.isfile(os.path.join(args.dosage_dir, args.dosage_fn.replace('*',str(i)))):
        print('# ERROR: Dosage file not found:', os.path.join(args.dosage_dir, args.dosage_fn))
        # exit()
if not os.path.isfile(os.path.join(args.gwas_snp_dir, args.gwas_snp_fn)):
    print('# ERROR: filtered GWAS result not found:', os.path.join(args.gaws_snp_dir, args.gwas_snp_fn))
#     exit()

if args.multiallelic.upper()[0]=='F' or args.multiallelic=='0':
    args.multiallelic = False
else: # Do not drop multiallelic sites if True
    args.multiallelic = True

if args.train.upper()[0]=='F' or args.train=='0':
    args.train = False
else: # Do not fill missing values with NA
    args.train = True

print('# Run starts:', datetime.datetime.now().strftime('%Y-%m-%d'))
print('# Arguments used:')
for arg in vars(args):
    print(f'# - {arg}:', getattr(args, arg))

print('# Output file is', f'{args.output_dir}/{args.output_prefix}')
print(f'# Cross validation on {args.n_alphas} alphas')

# Run starts: 2023-08-21
# Arguments used:
# - output_prefix: output.txt.20230821_12:07:15
# - output_dir: /data100t1/home/wanying/CCHC/lipidomics/prediction_models/elastic_net/training/model_params
# - dosage_dir: /data100t1/home/wanying/CCHC/lipidomics/prediction_models/input_docs/subset_vcfs/train/bgziped_dosage
# - dosage_fn: species_chr*.vcf.dosage.gz
# - gwas_snp_dir: /data100t1/home/wanying/CCHC/lipidomics/output/traininig_set_lipid_species_GWAS/adj_for_sex_age_pval_1e-3
# - gwas_snp_fn: AC-10:0-_SNPs_pval_0.001.txt
# - lip_name: AC(10:0)
# - n_alphas: 100
# - multiallelic: False
# - train: True
# - reg_type: elastic_net
# - lipid_name: 
# - lipidomis_fn: /data100t1/home/wanying/CCHC/lipidomics/prediction_models/input_docs/lipid_traits_residuals/train/lipid_species_residuals_adj_for_sex_age_pc1-5.txt.reformatted
# Output file is /data100t1/home/wanying/CCHC/lipidomics/prediction_models/elastic_net/training/model_params/output.txt.20230821_12:07:15
# Cross validation on 100 alpha

In [26]:
print('# Load lipidomic data (lipid species)')
df_lipid = pd.read_csv(args.lipidomis_fn, sep='\t')
print(f"# - data loaded from {args.lipidomis_fn}: shape {df_lipid.shape}")

# Re-order lipidomics data so that sample IDs match the order in genotype file
fn_id_mapping = '/data100t1/home/wanying/CCHC/doc/samples_IDs/202211_merged_RNA_lipid_protein_genotype_mapping_and_availability.txt'
df_id_mapping = pd.read_csv(fn_id_mapping,
                            sep='\t').dropna(subset=['genotype_ID',
                                                     'lipidomic']).drop_duplicates(subset='lipidomic')[['LABID', 'genotype_ID']]

print(f'\n# Load genotype IDs for matching (only need to read the first line of dosage file)')
fn_genotype = os.path.join(args.dosage_dir, args.dosage_fn.replace('*', '22'))
with open(fn_genotype) as fh:
    df_genotype_id = pd.DataFrame(fh.readline().strip().split()[9:], columns=['genotype_ID'])

print(f'# - Organize sample IDs so that their orders match in lipidomics data and dosage file')
df_lipid = df_genotype_id.merge(df_id_mapping.merge(df_lipid.drop_duplicates(subset='Sample ID'),
                                                    left_on='LABID',
                                                    right_on='Sample ID'), on='genotype_ID')
print(f'# - Final processed lipidomics data: {len(df_lipid)}')

# Load lipidomic data (lipid species)
# - data loaded from /data100t1/home/wanying/CCHC/lipidomics/prediction_models/input_docs/lipid_traits_residuals/train/lipid_species_residuals_adj_for_sex_age_pc1-5.txt.reformatted: shape (1607, 831)

# Load genotype IDs for matching (only need to read the first line of dosage file)
# - Organize sample IDs so that their orders match in lipidomics data and dosage file
# - Final processed lipidomics data: 1607


In [92]:
print(f'\n# Load filtered GWAS SNPs of current lipid: {args.lip_name}')
load_dosage_start_time, snp_lst, dosage_all = load_all_dosage(gwas_snp_dir=args.gwas_snp_dir,
                                                              gwas_snp_fn=args.gwas_snp_fn,
                                                              dosage_dir=args.dosage_dir,
                                                              dosage_fn=args.dosage_fn,
                                                              multiallelic=args.multiallelic)


# Load filtered GWAS SNPs of current lipid: AC-10:0-
# Processing lipid: AC-10:0-

# Get dosage of GWAS SNPs to include in regression model
# - Checking by chromosome:
#  chr22
	# 871 SNPs processed
# - Checking finished in 23.3749s
--------------------------------------------------


In [120]:
dosage_all.shape

(871, 1607)

In [139]:
print(f'# - Number of SNPs loaded: {len(snp_lst)}')

args.reg_type = 'lasso'

print(f'\n# Run {args.reg_type} regression')
# lipid trait, already residuals and looks normal, so no need to INV
y = df_lipid[args.lip_name]
# y = inverse_normal_transformation(df_lipid[lip])
# print(y.shape)

start_time = time.time()

if args.reg_type == 'elastic_net':
    # Notes from sklearn docs:
    # - l1_ratio is the alpha in R glmnet
    # - alpha is the lambda in R gmlnet
    # Since PrediXcan used glmnet with alpha=0.5,and lambda selected by 10 fold cv,
    # The corresponding parameter in sklearn.ElasticNetCV() are:
    # - l1_ratio=0.5
    # - n_alphas=100, no user supllied selections for alpha, start with n_alphas=10 to save time
    # - In R glmnet, when nobs > nvars, the default lambda.min.ratio is 0.0001
    # - 10 fold cv
    regr = ElasticNetCV(cv=10,
                        n_alphas=args.n_alphas,
                        random_state=0,
                        n_jobs=8,
                        l1_ratio=0.5)  # Default l1 ratio=0.5
elif args.reg_type == 'ridge':
    # regr = RidgeCV(cv=10,
    #                n_alphas=args.n_alphas,
    #                random_state=0,
    #                n_jobs=8)
    # TODO: might need this later
    regr = ElasticNetCV(cv=10,
                        n_alphas=args.n_alphas,
                        random_state=0,
                        n_jobs=8,
                        l1_ratio=0)  # Ridge l1 ratio=0
    pass
elif args.reg_type == 'lasso':
    # regr = LassoCV(cv=10,
    #                n_alphas=args.n_alphas,
    #                random_state=0,
    #                n_jobs=8)
    regr = ElasticNetCV(cv=10,
                        n_alphas=args.n_alphas,
                        random_state=0,
                        n_jobs=8,
                        l1_ratio=1)  # lasso l1 ratio=1
elif args.reg_type == 'ols':
    # TODO: might need this later
    pass

X = dosage_all.T
regr.fit(X, y)

end_time = time.time()
print(f'# - Model fitting finised in {(end_time - start_time):.4f}s')

# - Number of SNPs loaded: 871

# Run lasso regression
# - Model fitting finised in 75.7735s


- Run EN model
    - Model fitting finised in 64.6335s, Score=0.3897
-  Run lasso regression (enCV)
    - Model fitting finised in 34.3832s, Score=0.3845
- Run lasso regression (lassoCV)
    - Model fitting finised in 53.4908s, Score=0.3845

In [137]:
regr.score(X, y)

0.3897001637915396

In [131]:
dir(RidgeCV)

['__abstractmethods__',
 '__annotations__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_abc_impl',
 '_check_feature_names',
 '_check_n_features',
 '_decision_function',
 '_estimator_type',
 '_get_param_names',
 '_get_tags',
 '_more_tags',
 '_parameter_constraints',
 '_repr_html_',
 '_repr_html_inner',
 '_repr_mimebundle_',
 '_set_intercept',
 '_validate_data',
 '_validate_params',
 'fit',
 'get_params',
 'predict',
 'score',
 'set_params']

### 3.2 Create commands to run in terminal

```
OMP_NUM_THREADS=1 python 01_elastic_net_sklearn_model_txt_file_wtih_tabix.py \
--output_prefix output.txt \
--output_dir /data100t1/home/wanying/CCHC/lipidomics/prediction_models/lasso/training/model_params \
--dosage_dir /data100t1/home/wanying/CCHC/lipidomics/prediction_models/input_docs/subset_vcfs/train/bgziped_dosage \
--dosage_fn species_chr*.vcf.dosage.gz \
--gwas_snp_dir /data100t1/home/wanying/CCHC/lipidomics/output/traininig_set_lipid_species_GWAS/adj_for_sex_age_pval_1e-3 \
--gwas_snp_fn AC-10:0-_SNPs_pval_0.001.txt \
--lip_name "AC(10:0)" \
--reg_type lasso \
--n_alphas 100 \
--multiallelic False \
--train True
```

In [144]:
'''
lipid_name = '"AC(10:0)"'
gwas_snp_fn = 'AC-10:0-_SNPs_pval_0.001.txt'
output_prefix = 'AC-10:0-'

cmd = f'OMP_NUM_THREADS=1 python 01_elastic_net_sklearn_model_txt_file_wtih_tabix.py \
--output_prefix {output_prefix} \
--output_dir /data100t1/home/wanying/CCHC/lipidomics/prediction_models/lasso/training/model_params \
--dosage_dir /data100t1/home/wanying/CCHC/lipidomics/prediction_models/input_docs/subset_vcfs/train/bgziped_dosage \
--dosage_fn species_chr*.vcf.dosage.gz \
--gwas_snp_dir /data100t1/home/wanying/CCHC/lipidomics/output/traininig_set_lipid_species_GWAS/adj_for_sex_age_pval_1e-3 \
--gwas_snp_fn {gwas_snp_fn} \
--lip_name {lipid_name} \
--reg_type lasso \
--n_alphas 100 \
--multiallelic False \
--train True'
'''

# Load lipid species names:
fn_lipid = '/data100t1/home/wanying/CCHC/lipidomics/prediction_models/input_docs/lipid_traits_residuals/train/lipid_species_list.txt'
lst_lipid = []
with open(fn_lipid) as fh:
    line = fh.readline().strip()
    while line != '':
        lst_lipid.append(line)
        line = fh.readline().strip()

fh_output = open('lasso_train_slurm_cmds.sh', 'w')
for lipid_name in lst_lipid:
    output_prefix = lipid_name.replace('/', '-').replace('(','-').replace(')','-').replace(' ','_')
    gwas_snp_fn = f'{output_prefix}_SNPs_pval_0.001.txt'
    lipid_name = "'" + lipid_name + "'"
    
    # cmd = f'screen -dmS {output_prefix}; screen -S {output_prefix} -X stuff '
    # cmd += '"' + f'OMP_NUM_THREADS=1 python /data100t1/home/wanying/CCHC/lipidomics/prediction_models/code/01_elastic_net_sklearn_model_txt_file_wtih_tabix.py \
    # --output_prefix {output_prefix} \
    # --output_dir /data100t1/home/wanying/CCHC/lipidomics/prediction_models/lasso/training/model_params \
    # --dosage_dir /data100t1/home/wanying/CCHC/lipidomics/prediction_models/input_docs/subset_vcfs/train/bgziped_dosage \
    # --dosage_fn species_chr*.vcf.dosage.gz \
    # --gwas_snp_dir /data100t1/home/wanying/CCHC/lipidomics/output/traininig_set_lipid_species_GWAS/adj_for_sex_age_pval_1e-3 \
    # --gwas_snp_fn {gwas_snp_fn} \
    # --lip_name {lipid_name} \
    # --reg_type lasso \
    # --n_alphas 100 \
    # --multiallelic False \
    # --train True\\n' + '"'
    
    cmd = f'OMP_NUM_THREADS=8 python /data100t1/home/wanying/CCHC/lipidomics/prediction_models/code/01_elastic_net_sklearn_model_txt_file_wtih_tabix.py \
    --output_prefix {output_prefix} \
    --output_dir /data100t1/home/wanying/CCHC/lipidomics/prediction_models/lasso/training/model_params \
    --dosage_dir /data100t1/home/wanying/CCHC/lipidomics/prediction_models/input_docs/subset_vcfs/train/bgziped_dosage \
    --dosage_fn species_chr*.vcf.dosage.gz \
    --gwas_snp_dir /data100t1/home/wanying/CCHC/lipidomics/output/traininig_set_lipid_species_GWAS/adj_for_sex_age_pval_1e-3 \
    --gwas_snp_fn {gwas_snp_fn} \
    --lip_name {lipid_name} \
    --reg_type lasso \
    --n_alphas 100 \
    --multiallelic False \
    --train True'
    
    # print('\n'+cmd)
    fh_output.write(cmd+'\n')
fh_output.close()