In [1]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import ElasticNetCV
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import time
import argparse
import warnings
import sys
sys.path.append('/data100t1/home/wanying/lab_code/utils')
from rank_based_inverse_normal_transformation import inverse_normal_transformation
import datetime
print('Last run:', datetime.datetime.now().strftime('%Y-%m-%d'))
warnings.filterwarnings(action='ignore')

Last run: 2023-04-25


In [7]:
# ---------------------- Help functions ----------------------
def get_doasge(dosage_fn, lst_snps):
    '''
    Param:
     - dosage_fn: name of dosage file to be check against (single chromosome only)
     - lst_snps: a list of SNP positions to be searched for (single chromosome only)
    Return:
     - sample_ids: smaple IDs
     - dosage_matrix: dosage of given SNPs as a numpy array. Fill with NA if a SNP is not found
    '''
    
    with open(dosage_fn) as fh:
        line = fh.readline().strip() # Take sample IDs from header line
        tmp = line.split()
        indx_dosage = tmp.index('FORMAT') + 1 # Get index of sample IDs and dosage values
        indx_pos = tmp.index('POS') # Index of SNP position
        sample_ids = tmp[indx_dosage:] # Genotype IDs
        dosage_matrix = [] # Store dosage values in a numpy matrix. Lines are SNPs, columns are individuals
        
        line = fh.readline().strip()
        snp_pos = lst_snps.pop(0) # Check from the first element
        count = 0
        print('\t', end='')
        while line != '':
            # Scan through dosage file to get dosage of GWAS snps
            tmp = line.split()
            cur_pos = tmp[indx_pos]
            
            if float(cur_pos) == float(snp_pos): # Find a match
                dosage = tmp[indx_dosage:]
                dosage_matrix += dosage
                if len(lst_snps) > 0:
                    snp_pos = lst_snps.pop(0)
                else:
                    break
                line = fh.readline().strip()
                count += 1
            elif float(cur_pos) > float(snp_pos):
                # print(dosage_fn, cur_pos) # For testing !!!!
                dosage = [np.nan] * len(sample_ids) # SNP not found in dosage file, fill dosage with NAs
                dosage_matrix += dosage
                if len(lst_snps) > 0:
                    # If current position in dosage file is already larger than SNP pos
                    # Does not need to read in the next line
                    snp_pos = lst_snps.pop(0) # Check next SNP
                else:
                    # Does not need to continue reading dosage file when the SNP list is empty
                    break
            else:
                # Keep reading in the next line if SNP pos is smaller than current pos
                line = fh.readline().strip()
                count += 1
            
            if count%1000000==0:
                print(f'{count} lines processed', flush=True)
                print('\t', end='')
            elif count%20000==0:
                print('.', end='', flush=True)
    print(f'{count} lines processed')            
    return sample_ids, np.array(dosage_matrix).reshape(-1, len(sample_ids))

# Load dosage of all SNPs with p val<10-3 from GWAS
def load_all_dosage(gwas_snp_fn: str,
                    gwas_snp_dir: str='',
                    dosage_dir: str='/data100t1/home/wanying/CCHC/lipidomics/prediction_models/input_docs/subset_vcfs/train',
                    dosage_fn: str='species_chr*.vcf.gz.dosage'):
    '''
    Get doage of all SNPs (GWAS pval<1e-3) from single-chrosmosome dosage files of a given lipid
    Params:
        - gwas_snp_dir: directory to GWAS SNPs
        - gwas_snp_fn: file name of GWAS SNPs
        - dosage_dir: Subsetted dosage file: species_chr*.vcf.gz.dosage
        - dosage_fn: file name of subset dosage files (by chromosome).
                    Replace chromosome number with '*', such as 'species_chr*.vcf.gz.dosage'
    Return:
        - df_gwas_snp: a dataframe of GWAS SNPs
        - dosage_all: A numpy array of doage. Each row is a SNP, each column is a subject
    '''
    # Check if file exists
    if gwas_snp_dir.endswith('/'): gwas_snp_dir = gwas_snp_dir[:-1] # Remove last slash
    if not os.path.isfile(f'{gwas_snp_dir}/{gwas_snp_fn}'):
        print(f'# ERROR: GWAS SNP file not find: {gwas_snp_dir}/{gwas_snp_fn}\n# END')
        exit()
        
    lip_name = gwas_snp_fn.split('_')[0]
    print('# Processing lipid:', lip_name)

    # print(f'# Load GWAS SNPs for current lipid')
    df_gwas_snp = pd.read_csv(f'{gwas_snp_dir}/{gwas_snp_fn}', sep='\t').sort_values(by=['CHR', 'POS'])
    # print(f'# - Number of SNPs loaded: {len(df_gwas_snp)}')

    print('\n# Get dosage of GWAS SNPs to include in regression models')
    print('# - Checking by chromosome:')

    dosage_all = '' # A numpy array to store dosage from all chromosome
    start_time = datetime.datetime.now() # Time execution time
    for chr_num, df in df_gwas_snp.groupby(by='CHR'):
        # dosage_fn = f'species_chr{chr_num}.vcf.gz.dosage'
        print(f'#  chr{chr_num}')
        sample_ids, dosage_matrix = get_doasge(f"{dosage_dir}/{dosage_fn.replace('*', str(chr_num))}", list(df['POS']))
        # lst_df_dosage.append(pd.DataFrame(data=dosage_matrix, columns=sample_ids, index=df['POS']))
        if len(dosage_all) == 0: # if dosage array is empty
            dosage_all = dosage_matrix
        else:
            dosage_all = np.append(dosage_all, dosage_matrix, axis=0)
        # break
    end_time = datetime.datetime.now()
    print(f'# - Checking finished in {(end_time-start_time).total_seconds()}s')
    print('-' * 50)
    return df_gwas_snp, dosage_all.astype('float64')

# ---------------------- End of help functions ----------------------

# ################# Load lipidomic data #################
print('# Load lipidomic data (lipid species)')
fn_lipid = '/data100t1/home/wanying/CCHC/lipidomics/input_docs/lipidomic_measures/lipid_species.txt'
df_lipid = pd.read_csv(fn_lipid, sep='\t')
print(f"# - data loaded from {fn_lipid.split('/')[-1]}: shape {df_lipid.shape}")

# Re-order lipidomic data so that sample IDs match the order in genotype file
fn_id_mapping = '/data100t1/home/wanying/CCHC/doc/samples_IDs/202211_merged_RNA_lipid_protein_genotype_mapping_and_availability.txt'
df_id_mapping = pd.read_csv(fn_id_mapping,
                            sep='\t').dropna(subset=['genotype_ID',
                                                     'lipidomic']).drop_duplicates(subset='genotype_ID')[['LABID', 'genotype_ID']]

print(f'\n# Load genotype IDs for matching (only need to read the first line of dosage file)')
dosage_dir = '/data100t1/home/wanying/CCHC/lipidomics/prediction_models/input_docs/subset_vcfs/train'
fn_genotype = f'{dosage_dir}/species_chr22.vcf.gz.dosage'
with open(fn_genotype) as fh:
    df_genotype_id = pd.DataFrame(fh.readline().strip().split()[9:], columns=['genotype_ID'])

print(f'# - Organize sample IDs so that their orders match in lipidomics data and dosage file')
df_lipid = df_genotype_id.merge(df_id_mapping.merge(df_lipid.drop_duplicates(subset='Sample ID'),
                                                    left_on='LABID',
                                                    right_on='Sample ID'), on='genotype_ID')
print(f'# - Final processed lipidomic data: {len(df_lipid)}')


# ################# Load GWAS snps of each lipid and run regression #################
# dosage_all: each row contains doages of a single SNP across all individuals
# !! Lip species PI(15-MHDA_20:4)\PI(17:0_20:4) is missing
gwas_snp_dir = '/data100t1/home/wanying/CCHC/lipidomics/output/lip_species_GWAS_snps_pval_1e-3' # GWAS SNPs with p value<1e-3
output_file = f"{datetime.datetime.now().strftime('%Y%m%d-%M:%S')}_lip_species_elasticnet_params.txt" # Save coefficients, alpha and l1 ratios of selected model for each lipid
output_fh = open(output_file, 'w')
output_fh.write('lipid\talpha\tl1_ratio\tcoefficients\n') # write header line

count = 0 
for lip in df_lipid.columns[4:]:
    gwas_snp_fn = f"{lip.replace('(', '-').replace(')', '-').replace(' ', '_').replace('/', '-')}_SNPs_pval_0.001.txt"
    if os.path.isfile(f'{gwas_snp_dir}/{gwas_snp_fn}'):
        lip_name = gwas_snp_fn.split('_')[0] # Modified lipid name
        # Get SNPs and dosage
        print(f'\n# Load GWAS SNPs for current lipid: {lip_name}')
        df_gwas_snp,dosage_all = load_all_dosage(gwas_snp_dir = gwas_snp_dir,
                                                 gwas_snp_fn = gwas_snp_fn,
                                                 dosage_dir = '/data100t1/home/wanying/CCHC/lipidomics/prediction_models/input_docs/subset_vcfs/train',
                                                 dosage_fn = 'species_chr*.vcf.gz.dosage')
        print(f'# - Number of SNPs loaded: {len(df_gwas_snp)}')
        
        print('# Run Elastic net regression')
        # lipid level, INVed
        y = inverse_normal_transformation(df_lipid[lip])
        # print(y.shape)

        start_time = time.time()
        # regr = ElasticNet(alpha=0.5, max_iter=10000, random_state=0)
        # regr = ElasticNet(alpha=0.5, random_state=0)
        # alphas = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]
        alphas = [0.25, 0.5, 0.75, 1]
        # l1_ratio = [0.1, 0.5, 0.7, 0.9, 0.95, 0.99, 1] # It is recommanded to put more values close to 1 (i.e. Lasso) and less close to 0 (i.e. Ridge)
        # regr = ElasticNetCV(cv=10, random_state=0, n_jobs=32, alphas=alphas, l1_ratio=l1_ratio)
        regr = ElasticNetCV(cv=10, random_state=0, n_jobs=32, alphas=alphas) # Try 11 ratio=0.5 first
        regr.fit(dosage_all.T, y)

        end_time = time.time()
        print(f'# - Model fitting finised in {(end_time - start_time):.4f}s')
        output_fh.write(f"{lip}\t{regr.alpha_}\t{regr.l1_ratio_}\t{','.join(str(x) for x in regr.coef_)}\n")
        # alpha\tl1_ratio\tcoefficients\n'
        # break
    else:
        print(f'# - Warning: {lip} not found')
    count += 1
    print(f'# #################### {count} lipid processed ####################')
output_fh.close()


# Load lipidomic data (lipid species)
# - data loaded from lipid_species.txt: shape (2499, 832)

# Load genotype IDs for matching (only need to read the first line of dosage file)
# - Organize sample IDs so that their orders match in lipidomics data and dosage file
# - Final processed lipidomic data: 1607

# Load GWAS SNPs for current lipid: Sph-d18:1-
# Processing lipid: Sph-d18:1-

# Get dosage of GWAS SNPs to include in regression models
# - Checking by chromosome:
#  chr1
	.........

KeyboardInterrupt: 

In [3]:
from sklearn.datasets import make_regression

X, y = make_regression(n_features=2, random_state=0)
regr = ElasticNet(random_state=0)
regr.fit(X, y)
ElasticNet(random_state=0)
print(regr.coef_)
print(regr.intercept_)
print(regr.predict([[0, 0]]))

[18.83816048 64.55968825]
1.4512607561654032
[1.45126076]


In [148]:
alphas = [0, 0.25, 0.5, 0.75, 1]
# regr = ElasticNetCV(cv=10, random_state=0, alphas=alphas, max_iter=10000000, tol=0.001)
regr3 = ElasticNetCV(cv=10, random_state=0, alphas=[0.1, 0.15, 0.1994727942696716, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1])
regr3.fit(X, y)
print(regr3.coef_)
print('Alpha:', regr3.alpha_)
print('L1 ratio:', regr3.l1_ratio_)
print('Intercept:', regr3.intercept_)
print(regr3.coef_, regr3.alphas_)
print(regr3.predict([[0,0]]))

[29.05783225 95.72681905]
Alpha: 0.01
L1 ratio: 0.5
Intercept: 0.02195782605835639
[29.05783225 95.72681905] [1.         0.9        0.8        0.7        0.6        0.5
 0.4        0.3        0.2        0.19947279 0.15       0.1
 0.01      ]
[0.02195783]


In [136]:
alphas = [0, 0.25, 0.5, 0.75, 1]
# regr = ElasticNetCV(cv=10, random_state=0, alphas=alphas, max_iter=10000000, tol=0.001)
regr1 = ElasticNetCV(cv=10, random_state=0, eps=1, n_alphas=10)
regr1.fit(X, y)
print(regr1.coef_)
print('Alpha:', regr1.alpha_)
print('L1 ratio:', regr1.l1_ratio_)
print('Intercept:', regr1.intercept_)
print(regr1.coef_, regr1.alphas_)
print(regr1.predict([[0,0]]))

[0.00000000e+00 1.80489686e-16]
Alpha: 199.4727942696716
L1 ratio: 0.5
Intercept: 4.0791205793431775
[0.00000000e+00 1.80489686e-16] [199.47279427 199.47279427 199.47279427 199.47279427 199.47279427
 199.47279427 199.47279427 199.47279427 199.47279427 199.47279427]
[4.07912058]


In [12]:
# Test nested CV (as used in predixcan)
from sklearn.datasets import make_regression

X, y = make_regression(n_features=2, random_state=0)
regr = ElasticNet(random_state=0)
regr.fit(X, y)
# ElasticNet(random_state=0)
print(regr.coef_)
print(regr.intercept_)
print(regr.predict([[0, 0], [1, 2]]))
print(regr.score(X,y))

[18.83816048 64.55968825]
1.4512607561654032
[  1.45126076 149.40879773]
0.8904453086976037


In [None]:
# Define model
model = ElasticNet(l1_ratio=0.5, random_state=0)
# Define evaluation method

# Define grid search

# Define


In [8]:
from sklearn.metrics import r2_score
r2_score(y, regr.predict(X))

0.8904453086976037

In [17]:
# cmd = 'python 01_elastic_net_sklearn_model.py --output lipid_species_l1_0.5_200-299.txt --output_dir /data100t1/home/wanying/CCHC/lipidomics/prediction_models/elastic_net/training/model_params --lipid_range 200'

for start in range(0, 829, 50):
    # print(start)
    window = 50
    cmd = f'python 01_elastic_net_sklearn_model.py --output lipid_species_l1_0.5_{start}-{start+window-1}.txt --output_dir /data100t1/home/wanying/CCHC/lipidomics/prediction_models/elastic_net/training/model_params --lipid_range {start} --range_window {window}'
    print(cmd)
    
    

python 01_elastic_net_sklearn_model.py --output lipid_species_l1_0.5_0-49.txt --output_dir /data100t1/home/wanying/CCHC/lipidomics/prediction_models/elastic_net/training/model_params --lipid_range 0 --range_window 50
python 01_elastic_net_sklearn_model.py --output lipid_species_l1_0.5_50-99.txt --output_dir /data100t1/home/wanying/CCHC/lipidomics/prediction_models/elastic_net/training/model_params --lipid_range 50 --range_window 50
python 01_elastic_net_sklearn_model.py --output lipid_species_l1_0.5_100-149.txt --output_dir /data100t1/home/wanying/CCHC/lipidomics/prediction_models/elastic_net/training/model_params --lipid_range 100 --range_window 50
python 01_elastic_net_sklearn_model.py --output lipid_species_l1_0.5_150-199.txt --output_dir /data100t1/home/wanying/CCHC/lipidomics/prediction_models/elastic_net/training/model_params --lipid_range 150 --range_window 50
python 01_elastic_net_sklearn_model.py --output lipid_species_l1_0.5_200-249.txt --output_dir /data100t1/home/wanying/CC

In [75]:

############ Testing run #############



from sklearn.linear_model import ElasticNetCV
import pandas as pd
import numpy as np
import os
import time
import argparse
import sys
sys.path.append('/data100t1/home/wanying/lab_code/utils')
from rank_based_inverse_normal_transformation import inverse_normal_transformation
import warnings
import datetime
warnings.filterwarnings(action='ignore')

'''
Example call:
# python 01_elastic_net_sklearn_model.py --output lipid_species_l1_0.5_500-599.txt --output_dir /data100t1/home/wanying/CCHC/lipidomics/prediction_models/elastic_net/training/model_params --lipid_range 500
python 01_elastic_net_sklearn_model.py --output lipid_species_l1_0.5_100-199.txt \
--output_dir /data100t1/home/wanying/CCHC/lipidomics/prediction_models/elastic_net/training/model_params \
--lipid_range 100 \
--range_window 100
'''
# ---------------------- Help functions ----------------------
def get_doasge(dosage_fn, lst_snps):
    '''
    Param:
     - dosage_fn: name of dosage file to be check against (single chromosome only)
     - lst_snps: a list of SNP positions to be searched for (single chromosome only)
    Return:
     - sample_ids: smaple IDs
     - dosage_matrix: dosage of given SNPs as a numpy array. Fill with NA if a SNP is not found
    '''
    
    with open(dosage_fn) as fh:
        line = fh.readline().strip() # Take sample IDs from header line
        tmp = line.split()
        indx_dosage = tmp.index('FORMAT') + 1 # Get index of sample IDs and dosage values
        indx_pos = tmp.index('POS') # Index of SNP position
        sample_ids = tmp[indx_dosage:] # Genotype IDs
        dosage_matrix = [] # Store dosage values in a numpy matrix. Lines are SNPs, columns are individuals
        
        line = fh.readline().strip()
        snp_pos = lst_snps.pop(0) # Check from the first element
        count = 0
        print('\t', end='')
        while line != '':
            # Scan through dosage file to get dosage of GWAS snps
            tmp = line.split()
            cur_pos = tmp[indx_pos]
            
            if float(cur_pos) == float(snp_pos): # Find a match
                dosage = tmp[indx_dosage:]
                dosage_matrix += dosage
                if len(lst_snps) > 0:
                    snp_pos = lst_snps.pop(0)
                else:
                    break
                line = fh.readline().strip()
                count += 1
            elif float(cur_pos) > float(snp_pos):
                # print(dosage_fn, cur_pos) # For testing !!!!
                dosage = [np.nan] * len(sample_ids) # SNP not found in dosage file, fill dosage with NAs
                dosage_matrix += dosage
                if len(lst_snps) > 0:
                    # If current position in dosage file is already larger than SNP pos
                    # Does not need to read in the next line
                    snp_pos = lst_snps.pop(0) # Check next SNP
                else:
                    # Does not need to continue reading dosage file when the SNP list is empty
                    break
            else:
                # Keep reading in the next line if SNP pos is smaller than current pos
                line = fh.readline().strip()
                count += 1
            
            if count%1000000==0:
                print(f'{count} lines processed', flush=True)
                print('\t', end='')
            elif count%20000==0:
                print('.', end='', flush=True)
    print(f'{count} lines processed')            
    return sample_ids, np.array(dosage_matrix).reshape(-1, len(sample_ids))

# Load dosage of all SNPs with p val<10-3 from GWAS
def load_all_dosage(gwas_snp_fn: str,
                    gwas_snp_dir: str='',
                    dosage_dir: str='/data100t1/home/wanying/CCHC/lipidomics/prediction_models/input_docs/subset_vcfs/train',
                    dosage_fn: str='species_chr*.vcf.gz.dosage'):
    '''
    Get doage of all SNPs (GWAS pval<1e-3) from single-chrosmosome dosage files of a given lipid
    Params:
        - gwas_snp_dir: directory to GWAS SNPs
        - gwas_snp_fn: file name of GWAS SNPs
        - dosage_dir: Subsetted dosage file: species_chr*.vcf.gz.dosage
        - dosage_fn: file name of subset dosage files (by chromosome).
                    Replace chromosome number with '*', such as 'species_chr*.vcf.gz.dosage'
    Return:
        - start_time: start time of loading dosage
        - df_gwas_snp: a dataframe of GWAS SNPs
        - dosage_all: A numpy array of doage. Each row is a SNP, each column is a subject
    '''
    # Check if file exists
    if gwas_snp_dir.endswith('/'): gwas_snp_dir = gwas_snp_dir[:-1] # Remove last slash
    if not os.path.isfile(f'{gwas_snp_dir}/{gwas_snp_fn}'):
        print(f'# ERROR: GWAS SNP file not find: {gwas_snp_dir}/{gwas_snp_fn}\n# END')
        exit()
        
    lip_name = gwas_snp_fn.split('_')[0]
    print('# Processing lipid:', lip_name)

    # print(f'# Load GWAS SNPs for current lipid')
    df_gwas_snp = pd.read_csv(f'{gwas_snp_dir}/{gwas_snp_fn}', sep='\t').sort_values(by=['CHR', 'POS'])
    # print(f'# - Number of SNPs loaded: {len(df_gwas_snp)}')

    print('\n# Get dosage of GWAS SNPs to include in regression models')
    print('# - Checking by chromosome:')

    dosage_all = '' # A numpy array to store dosage from all chromosome
    start_time = time.time() # Time execution time
    for chr_num, df in df_gwas_snp.groupby(by='CHR'):
        # dosage_fn = f'species_chr{chr_num}.vcf.gz.dosage'
        print(f'#  chr{chr_num}')
        sample_ids, dosage_matrix = get_doasge(f"{dosage_dir}/{dosage_fn.replace('*', str(chr_num))}", list(df['POS']))
        # lst_df_dosage.append(pd.DataFrame(data=dosage_matrix, columns=sample_ids, index=df['POS']))
        if len(dosage_all) == 0: # if dosage array is empty
            dosage_all = dosage_matrix
        else:
            dosage_all = np.append(dosage_all, dosage_matrix, axis=0)
        ########## For testing purpose ##########
        break
        ####################
    end_time = time.time()
    print(f'# - Checking finished in {(start_time-end_time):.4f}s')
    print('-' * 50)
    return start_time, df_gwas_snp, dosage_all.astype('float64')

# ---------------------- End of help functions ----------------------


# ################# Process args #################
parser = argparse.ArgumentParser(description='Fit elastic net regression with 10 fold cross-validation')
parser.add_argument('-o', '--output', type=str,
                           help='Output file to  save alpha, l1_ratio and coefficients of chosen model')
parser.add_argument('--output_dir', type=str, help='Output directory. Defualt is current directory', default='.')
parser.add_argument('--lipid_range', type=int, default=-1,
                    help='Define a subset of lipids to run. Default -1 ie. run all lipids')
parser.add_argument('--range_window', type=int, default=100,
                    help='Define a window of lipids to run. Default is 100, will run from lipid_range to lipid_range+range_window-1')
# args = parser.parse_args()
args = parser.parse_args('--output lipid_species_l1_0.5_100-149.txt --output_dir /data100t1/home/wanying/CCHC/lipidomics/prediction_models/elastic_net/training/model_params --lipid_range 100 --range_window 50'.split())
args.output = f"{args.output}.{datetime.datetime.now().strftime('%Y%m%d_%H:%M:%S')}"
if args.output_dir.endswith('/'): args.output_dir = args.output_dir[:-1]
print('# Run starts:', datetime.datetime.now().strftime('%Y-%m-%d'))
print('# Output file is', f'{args.output_dir}/{args.output}')
if args.lipid_range==0:
    print('# Run all lipids')
else:
    print(f'# Run lipids from index {args.lipid_range} to {args.lipid_range + args.range_window - 1}')

# ################# Load lipidomic data #################
print('# Load lipidomic data (lipid species)')
# fn_lipid = '/data100t1/home/wanying/CCHC/lipidomics/input_docs/lipidomic_measures/lipid_species.txt'
fn_lipid = '/data100t1/home/wanying/CCHC/lipidomics/prediction_models/input_docs/lipid_traits_residuals/train/lipid_species_residuals_adj_for_sex_age_pc1-5.txt.reformatted'
df_lipid = pd.read_csv(fn_lipid, sep='\t')
print(f"# - data loaded from {fn_lipid.split('/')[-1]}: shape {df_lipid.shape}")

# Re-order lipidomic data so that sample IDs match the order in genotype file
fn_id_mapping = '/data100t1/home/wanying/CCHC/doc/samples_IDs/202211_merged_RNA_lipid_protein_genotype_mapping_and_availability.txt'
df_id_mapping = pd.read_csv(fn_id_mapping,
                            sep='\t').dropna(subset=['genotype_ID',
                                                     'lipidomic']).drop_duplicates(subset='lipidomic')[['LABID', 'genotype_ID']]

print(f'\n# Load genotype IDs for matching (only need to read the first line of dosage file)')
dosage_dir = '/data100t1/home/wanying/CCHC/lipidomics/prediction_models/input_docs/subset_vcfs/train'
fn_genotype = f'{dosage_dir}/species_chr22.vcf.gz.dosage'
with open(fn_genotype) as fh:
    df_genotype_id = pd.DataFrame(fh.readline().strip().split()[9:], columns=['genotype_ID'])

print(f'# - Organize sample IDs so that their orders match in lipidomics data and dosage file')
df_lipid = df_genotype_id.merge(df_id_mapping.merge(df_lipid.drop_duplicates(subset='Sample ID'),
                                                    left_on='LABID',
                                                    right_on='Sample ID'), on='genotype_ID')
print(f'# - Final processed lipidomic data: {len(df_lipid)}')


# ################# Load GWAS snps of each lipid and run regression #################
# dosage_all: each row contains doages of a single SNP across all individuals
# !! Lip species PI(15-MHDA_20:4)\PI(17:0_20:4) is missing
gwas_snp_dir = '/data100t1/home/wanying/CCHC/lipidomics/output/lip_species_GWAS_snps_pval_1e-3' # GWAS SNPs with p value<1e-3
# Save coefficients, alpha and l1 ratios of selected model for each lipid
output_fh = open(f'{args.output_dir}/{args.output}', 'w')
output_fh.write('lipid\talpha\tl1_ratio\tbest_r2\tcoefficients\n') # write header line
output_fh_lip_pred = open(f'{args.output_dir}/{args.output}.pred', 'w') # Save predicted values of each lipid using best fitted model
output_fh_lip_pred.write('Lipid'+'\t'+'\t'.join([val for val in df_lipid['Sample ID']])+'\n') # write header line

count = 0
# get list of lipids to be fitted
if args.lipid_range != -1:
    if args.lipid_range>=len(df_lipid.columns[4:]):
        print(f'# ERROR: lipid range {args.lipid_range} is out of range')
        exit()
    else:
        try:
            lst_lipids = df_lipid.columns[4+args.lipid_range : 4+args.lipid_range+args.range_window]
        except:
            # If window is too large, just run to the end of lipid list
            lst_lipids = df_lipid.columns[4 + args.lipid_range:]
else:
    lst_lipids = df_lipid.columns[4:]

for lip in lst_lipids:
    gwas_snp_fn = f"{lip.replace('(', '-').replace(')', '-').replace(' ', '_').replace('/', '-')}_SNPs_pval_0.001.txt"
    if os.path.isfile(f'{gwas_snp_dir}/{gwas_snp_fn}'):
        lip_name = gwas_snp_fn.split('_')[0] # Modified lipid name
        # Get SNPs and dosage
        print(f'\n# Load GWAS SNPs for current lipid: {lip_name}')
        load_dosage_start_time, df_gwas_snp,dosage_all = load_all_dosage(gwas_snp_dir = gwas_snp_dir,
                                                                         gwas_snp_fn = gwas_snp_fn,
                                                                         dosage_dir = '/data100t1/home/wanying/CCHC/lipidomics/prediction_models/input_docs/subset_vcfs/train',
                                                                         dosage_fn = 'species_chr*.vcf.gz.dosage')
        print(f'# - Number of SNPs loaded: {len(df_gwas_snp)}')
        
        print('# Run Elastic net regression')
        # lipid level, INVed
        y = inverse_normal_transformation(df_lipid[lip])
        # print(y.shape)

        start_time = time.time()
        # Notes from sklearn docs:
        # - l1_ratio is the alpha in R glmnet
        # - alpha is the lambda in R gmlnet
        # Since PrediXcan used glmnet with alpha=0.5,and lambda selected by 10 fold cv,
        # The corresponding parameter in sklearn.ElasticNetCV() are:
        # - l1_ratio=0.5
        # - n_alphas=100, no user supllied selections for alpha
        # - In R glmnet, when nobs > nvars, the default lambda.min.ratio is 0.0001
        # - 10 fold cv
        regr = ElasticNetCV(cv=10,
                            random_state=0,
                            n_jobs=32,
                            l1_ratio=0.5) # Default l1 ratio=0.5
        regr.fit(dosage_all.T, y)

        end_time = time.time()
        print(f'# - Model fitting finised in {(end_time - start_time):.4f}s')
        
        # Also output predicted values and best R2
        output_fh.write(f"{lip}\t{regr.alpha_}\t{regr.l1_ratio_}\t{regr.score(X, y)}\t{','.join([str(x) for x in regr.coef_])}\n")
        output_fh_lip_pred.write(lip+'\t'+'\t'.join([str(val) for val in regr.pred(X)])+'\n')
        print(f'# Total running time of current lipid: {(end_time - load_dosage_start_time):.4f}s')
        # break
    else:
        print(f'# - Warning: {lip} not found')
    count += 1
    print(f'# #################### {count} lipid processed ####################')
output_fh.close()
output_fh_lip_pred.close()



# Run starts: 2023-04-23
# Output file is /data100t1/home/wanying/CCHC/lipidomics/prediction_models/elastic_net/training/model_params/lipid_species_l1_0.5_100-149.txt.20230423_21:15:49
# Run lipids from index 100 to 149
# Load lipidomic data (lipid species)
# - data loaded from lipid_species_residuals_adj_for_sex_age_pc1-5.txt.reformatted: shape (1607, 831)

# Load genotype IDs for matching (only need to read the first line of dosage file)
# - Organize sample IDs so that their orders match in lipidomics data and dosage file
# - Final processed lipidomic data: 1607

# Load GWAS SNPs for current lipid: GM3-d18:1-24:1-
# Processing lipid: GM3-d18:1-24:1-

# Get dosage of GWAS SNPs to include in regression models
# - Checking by chromosome:
#  chr1
	......................................777198 lines processed
# - Checking finished in -22.2336s
--------------------------------------------------
# - Number of SNPs loaded: 28090
# Run Elastic net regression
# - Model fitting finised in 1183.7

ValueError: matmul: Input operand 1 has a mismatch in its core dimension 0, with gufunc signature (n?,k),(k,m?)->(n?,m?) (size 2352 is different from 2)

# Check model coefficients

In [18]:
file_dir = '/data100t1/home/wanying/CCHC/lipidomics/prediction_models/elastic_net/training/model_params/'
for fn in os.listdir(file_dir):
    if ('20230424' in fn) and ('.pred' not in fn):
        print(fn, end='')
        df = pd.read_csv(file_dir + fn, sep='\t')
        coeffs = df['coefficients']
        dict_counts = {0:0, -1:0}
        for val in coeffs:
            count = 0
            for coeff in val.split(','):
                if float(coeff) != 0:
                    count += 1
            # print(count)
            if count == 0:
                dict_counts[0] += 1
            else:
                dict_counts[-1] += 1
        print(f'########### All 0s: {dict_counts[0]}; Other: {dict_counts[-1]}')
    # break

lipid_species_l1_0.5_650-699.txt.20230424_00:11:57########### All 0s: 0; Other: 50
lipid_species_l1_0.5_562-599.txt.20230424_13:00:54########### All 0s: 0; Other: 38
lipid_species_l1_0.5_261-299.txt.20230424_12:48:55########### All 0s: 0; Other: 0
lipid_species_l1_0.5_100-149.txt.20230424_00:09:03########### All 0s: 0; Other: 11
lipid_species_l1_0.5_5-104_100_alpha_CV.txt.20230424_09:03:41########### All 0s: 0; Other: 15
lipid_species_l1_0.5_200-249.txt.20230424_00:09:28########### All 0s: 0; Other: 11
lipid_species_l1_0.5_600-649.txt.20230424_00:11:48########### All 0s: 0; Other: 32
lipid_species_l1_0.5_100_alpha_CV.txt.20230424_00:11:16########### All 0s: 0; Other: 5
lipid_species_l1_0.5_412-449.txt.20230424_12:55:00########### All 0s: 0; Other: 0
lipid_species_l1_0.5_512-549.txt.20230424_13:01:00########### All 0s: 0; Other: 37
lipid_species_l1_0.5_562-599.txt.20230424_12:58:59########### All 0s: 0; Other: 0
lipid_species_l1_0.5_360-399.txt.20230424_13:02:16########### All 0s: 0; Ot

EmptyDataError: No columns to parse from file

In [57]:
# Fit models with 100 alphas in CV
'''
OMP_NUM_THREADS=1 python 01_elastic_net_sklearn_model.py --output lipid_species_l1_0.5_5-104_100_alpha_CV.txt \
--output_dir /data100t1/home/wanying/CCHC/lipidomics/prediction_models/elastic_net/training/model_params/100alphas \
--lipid_range 5 \
--range_window 100 \
--n_alphas 100
'''
window = 45
code_dir = '/data100t1/home/wanying/CCHC/lipidomics/prediction_models/code'
count = 0
for i in range(33, 830, window):
    cmd = f'OMP_NUM_THREADS=1 python 01_elastic_net_sklearn_model.py --output lipid_species_l1_0.5_{i}-{i+window-1}_100alphaCV.txt --output_dir /data100t1/home/wanying/CCHC/lipidomics/prediction_models/elastic_net/training/model_params/100alphas --lipid_range {i} --range_window {window} --n_alphas 100'
    screen_session = f'100alphas_{i}-{i+window-1}'
    count += 1
    print(f'#################### CMD {count} ####################')
    print(f'cd {code_dir}; screen -S {screen_session} -L -Logfile {screen_session}.log')
    print(cmd+'\n')

#################### CMD 1 ####################
cd /data100t1/home/wanying/CCHC/lipidomics/prediction_models/code; screen -S 100alphas_33-77 -L -Logfile 100alphas_33-77.log
OMP_NUM_THREADS=1 python 01_elastic_net_sklearn_model.py --output lipid_species_l1_0.5_33-77_100alphaCV.txt --output_dir /data100t1/home/wanying/CCHC/lipidomics/prediction_models/elastic_net/training/model_params/100alphas --lipid_range 33 --range_window 45 --n_alphas 100

#################### CMD 2 ####################
cd /data100t1/home/wanying/CCHC/lipidomics/prediction_models/code; screen -S 100alphas_78-122 -L -Logfile 100alphas_78-122.log
OMP_NUM_THREADS=1 python 01_elastic_net_sklearn_model.py --output lipid_species_l1_0.5_78-122_100alphaCV.txt --output_dir /data100t1/home/wanying/CCHC/lipidomics/prediction_models/elastic_net/training/model_params/100alphas --lipid_range 78 --range_window 45 --n_alphas 100

#################### CMD 3 ####################
cd /data100t1/home/wanying/CCHC/lipidomics/prediction_m

vgipiper
#################### CMD 1 ####################
cd /data100t1/home/wanying/CCHC/lipidomics/prediction_models/code; screen -S 100alphas_33-77 -L -Logfile 100alphas_33-77.log
OMP_NUM_THREADS=1 python 01_elastic_net_sklearn_model.py --output lipid_species_l1_0.5_33-77_100alphaCV.txt --output_dir /data100t1/home/wanying/CCHC/lipidomics/prediction_models/elastic_net/training/model_params/100alphas --lipid_range 33 --range_window 45 --n_alphas 100

#################### CMD 2 ####################
cd /data100t1/home/wanying/CCHC/lipidomics/prediction_models/code; screen -S 100alphas_78-122 -L -Logfile 100alphas_78-122.log
OMP_NUM_THREADS=1 python 01_elastic_net_sklearn_model.py --output lipid_species_l1_0.5_78-122_100alphaCV.txt --output_dir /data100t1/home/wanying/CCHC/lipidomics/prediction_models/elastic_net/training/model_params/100alphas --lipid_range 78 --range_window 45 --n_alphas 100




vgi02
#################### CMD 3 ####################
cd /data100t1/home/wanying/CCHC/lipidomics/prediction_models/code; screen -S 100alphas_123-167 -L -Logfile 100alphas_123-167.log
OMP_NUM_THREADS=1 python 01_elastic_net_sklearn_model.py --output lipid_species_l1_0.5_123-167_100alphaCV.txt --output_dir /data100t1/home/wanying/CCHC/lipidomics/prediction_models/elastic_net/training/model_params/100alphas --lipid_range 123 --range_window 45 --n_alphas 100

#################### CMD 4 ####################
cd /data100t1/home/wanying/CCHC/lipidomics/prediction_models/code; screen -S 100alphas_168-212 -L -Logfile 100alphas_168-212.log
OMP_NUM_THREADS=1 python 01_elastic_net_sklearn_model.py --output lipid_species_l1_0.5_168-212_100alphaCV.txt --output_dir /data100t1/home/wanying/CCHC/lipidomics/prediction_models/elastic_net/training/model_params/100alphas --lipid_range 168 --range_window 45 --n_alphas 100

#################### CMD 5 ####################
cd /data100t1/home/wanying/CCHC/lipidomics/prediction_models/code; screen -S 100alphas_213-257 -L -Logfile 100alphas_213-257.log
OMP_NUM_THREADS=1 python 01_elastic_net_sklearn_model.py --output lipid_species_l1_0.5_213-257_100alphaCV.txt --output_dir /data100t1/home/wanying/CCHC/lipidomics/prediction_models/elastic_net/training/model_params/100alphas --lipid_range 213 --range_window 45 --n_alphas 100

#################### CMD 6 ####################
cd /data100t1/home/wanying/CCHC/lipidomics/prediction_models/code; screen -S 100alphas_258-302 -L -Logfile 100alphas_258-302.log
OMP_NUM_THREADS=1 python 01_elastic_net_sklearn_model.py --output lipid_species_l1_0.5_258-302_100alphaCV.txt --output_dir /data100t1/home/wanying/CCHC/lipidomics/prediction_models/elastic_net/training/model_params/100alphas --lipid_range 258 --range_window 45 --n_alphas 100

#################### CMD 7 ####################
cd /data100t1/home/wanying/CCHC/lipidomics/prediction_models/code; screen -S 100alphas_303-347 -L -Logfile 100alphas_303-347.log
OMP_NUM_THREADS=1 python 01_elastic_net_sklearn_model.py --output lipid_species_l1_0.5_303-347_100alphaCV.txt --output_dir /data100t1/home/wanying/CCHC/lipidomics/prediction_models/elastic_net/training/model_params/100alphas --lipid_range 303 --range_window 45 --n_alphas 100

#################### CMD 8 ####################
cd /data100t1/home/wanying/CCHC/lipidomics/prediction_models/code; screen -S 100alphas_348-392 -L -Logfile 100alphas_348-392.log
OMP_NUM_THREADS=1 python 01_elastic_net_sklearn_model.py --output lipid_species_l1_0.5_348-392_100alphaCV.txt --output_dir /data100t1/home/wanying/CCHC/lipidomics/prediction_models/elastic_net/training/model_params/100alphas --lipid_range 348 --range_window 45 --n_alphas 100

#################### CMD 9 ####################
cd /data100t1/home/wanying/CCHC/lipidomics/prediction_models/code; screen -S 100alphas_393-437 -L -Logfile 100alphas_393-437.log
OMP_NUM_THREADS=1 python 01_elastic_net_sklearn_model.py --output lipid_species_l1_0.5_393-437_100alphaCV.txt --output_dir /data100t1/home/wanying/CCHC/lipidomics/prediction_models/elastic_net/training/model_params/100alphas --lipid_range 393 --range_window 45 --n_alphas 100

#################### CMD 10 ####################
cd /data100t1/home/wanying/CCHC/lipidomics/prediction_models/code; screen -S 100alphas_438-482 -L -Logfile 100alphas_438-482.log
OMP_NUM_THREADS=1 python 01_elastic_net_sklearn_model.py --output lipid_species_l1_0.5_438-482_100alphaCV.txt --output_dir /data100t1/home/wanying/CCHC/lipidomics/prediction_models/elastic_net/training/model_params/100alphas --lipid_range 438 --range_window 45 --n_alphas 100

#################### CMD 11 ####################
cd /data100t1/home/wanying/CCHC/lipidomics/prediction_models/code; screen -S 100alphas_483-527 -L -Logfile 100alphas_483-527.log
OMP_NUM_THREADS=1 python 01_elastic_net_sklearn_model.py --output lipid_species_l1_0.5_483-527_100alphaCV.txt --output_dir /data100t1/home/wanying/CCHC/lipidomics/prediction_models/elastic_net/training/model_params/100alphas --lipid_range 483 --range_window 45 --n_alphas 100

#################### CMD 12 ####################
cd /data100t1/home/wanying/CCHC/lipidomics/prediction_models/code; screen -S 100alphas_528-572 -L -Logfile 100alphas_528-572.log
OMP_NUM_THREADS=1 python 01_elastic_net_sklearn_model.py --output lipid_species_l1_0.5_528-572_100alphaCV.txt --output_dir /data100t1/home/wanying/CCHC/lipidomics/prediction_models/elastic_net/training/model_params/100alphas --lipid_range 528 --range_window 45 --n_alphas 100





vgipiper03
#################### CMD 13 ####################
cd /data100t1/home/wanying/CCHC/lipidomics/prediction_models/code; screen -S 100alphas_573-617 -L -Logfile 100alphas_573-617.log
OMP_NUM_THREADS=1 python 01_elastic_net_sklearn_model.py --output lipid_species_l1_0.5_573-617_100alphaCV.txt --output_dir /data100t1/home/wanying/CCHC/lipidomics/prediction_models/elastic_net/training/model_params/100alphas --lipid_range 573 --range_window 45 --n_alphas 100

#################### CMD 14 ####################
cd /data100t1/home/wanying/CCHC/lipidomics/prediction_models/code; screen -S 100alphas_618-662 -L -Logfile 100alphas_618-662.log
OMP_NUM_THREADS=1 python 01_elastic_net_sklearn_model.py --output lipid_species_l1_0.5_618-662_100alphaCV.txt --output_dir /data100t1/home/wanying/CCHC/lipidomics/prediction_models/elastic_net/training/model_params/100alphas --lipid_range 618 --range_window 45 --n_alphas 100





vgipiper05
#################### CMD 15 ####################
cd /data100t1/home/wanying/CCHC/lipidomics/prediction_models/code; screen -S 100alphas_663-707 -L -Logfile 100alphas_663-707.log
OMP_NUM_THREADS=1 python 01_elastic_net_sklearn_model.py --output lipid_species_l1_0.5_663-707_100alphaCV.txt --output_dir /data100t1/home/wanying/CCHC/lipidomics/prediction_models/elastic_net/training/model_params/100alphas --lipid_range 663 --range_window 45 --n_alphas 100

#################### CMD 16 ####################
cd /data100t1/home/wanying/CCHC/lipidomics/prediction_models/code; screen -S 100alphas_708-752 -L -Logfile 100alphas_708-752.log
OMP_NUM_THREADS=1 python 01_elastic_net_sklearn_model.py --output lipid_species_l1_0.5_708-752_100alphaCV.txt --output_dir /data100t1/home/wanying/CCHC/lipidomics/prediction_models/elastic_net/training/model_params/100alphas --lipid_range 708 --range_window 45 --n_alphas 100

#################### CMD 17 ####################
cd /data100t1/home/wanying/CCHC/lipidomics/prediction_models/code; screen -S 100alphas_753-797 -L -Logfile 100alphas_753-797.log
OMP_NUM_THREADS=1 python 01_elastic_net_sklearn_model.py --output lipid_species_l1_0.5_753-797_100alphaCV.txt --output_dir /data100t1/home/wanying/CCHC/lipidomics/prediction_models/elastic_net/training/model_params/100alphas --lipid_range 753 --range_window 45 --n_alphas 100

#################### CMD 18 ####################
cd /data100t1/home/wanying/CCHC/lipidomics/prediction_models/code; screen -S 100alphas_798-842 -L -Logfile 100alphas_798-842.log
OMP_NUM_THREADS=1 python 01_elastic_net_sklearn_model.py --output lipid_species_l1_0.5_798-842_100alphaCV.txt --output_dir /data100t1/home/wanying/CCHC/lipidomics/prediction_models/elastic_net/training/model_params/100alphas --lipid_range 798 --range_window 45 --n_alphas 100

