In [1]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import ElasticNetCV
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import time
import warnings
import sys
sys.path.append('/data100t1/home/wanying/lab_code/utils')
from rank_based_inverse_normal_transformation import inverse_normal_transformation
import datetime
print('Last run:', datetime.datetime.now().strftime('%Y-%m-%d'))
warnings.filterwarnings(action='ignore')

Last run: 2023-04-17


In [27]:
# ---------------------- Help functions ----------------------
def get_doasge(dosage_fn, lst_snps):
    '''
    Collect dosage values for a list of SNP positions from one single-chromosome dosage file.

    The dosage file and the SNP list are walked in parallel (a merge of two
    position-sorted sequences), so both must be sorted by ascending position.

    Param:
     - dosage_fn: name of dosage file to be checked against (single chromosome only).
                  The header line must contain 'POS' and 'FORMAT' fields; every
                  field after 'FORMAT' is treated as one sample.
     - lst_snps: a list of SNP positions to be searched for (single chromosome only,
                 ascending). The list is NOT modified by this function.
    Return:
     - sample_ids: sample IDs (header fields after 'FORMAT')
     - dosage_matrix: numpy array with exactly one row per entry of lst_snps (same order)
                      and one column per sample. Rows of SNPs that are not found in the
                      file are filled with NaN. Values read from the file are not
                      converted here, so the array may have a string dtype; cast with
                      .astype('float64') before numeric use.
    '''
    # Work on a private copy so the caller's list survives the call
    snp_positions = [float(pos) for pos in lst_snps]
    n_snps = len(snp_positions)

    with open(dosage_fn) as fh:
        header = fh.readline().strip().split()  # Take sample IDs from header line
        indx_dosage = header.index('FORMAT') + 1  # First column holding sample IDs / dosage values
        indx_pos = header.index('POS')  # Index of SNP position
        sample_ids = header[indx_dosage:]  # Genotype IDs
        missing_row = [np.nan] * len(sample_ids)  # Placeholder for a SNP absent from the file
        dosage_matrix = []  # Flat list of values, reshaped to (n_snps, n_samples) on return

        snp_idx = 0  # Index of the SNP currently being searched for
        count = 0    # Number of data lines consumed from the dosage file
        print('\t', end='')
        # Nothing to search for: skip scanning the file entirely
        line = fh.readline().strip() if n_snps > 0 else ''
        while snp_idx < n_snps and line != '':
            fields = line.split()
            cur_pos = float(fields[indx_pos])
            snp_pos = snp_positions[snp_idx]

            if cur_pos > snp_pos:
                # File is already past this SNP: it is missing. Keep the current
                # line, it may still match the next SNP in the list.
                dosage_matrix += missing_row
                snp_idx += 1
                continue

            if cur_pos == snp_pos:  # Find a match
                dosage_matrix += fields[indx_dosage:]
                snp_idx += 1

            # Matched, or file position still smaller than the SNP: advance the file
            line = fh.readline().strip()
            count += 1
            # Report progress only when a new line was consumed
            if count % 1000000 == 0:
                print(f'{count} lines processed', flush=True)
                print('\t', end='')
            elif count % 20000 == 0:
                print('.', end='', flush=True)

        # File ended before every SNP was checked: the remaining SNPs are missing too
        dosage_matrix += missing_row * (n_snps - snp_idx)

    print(f'{count} lines processed')
    return sample_ids, np.array(dosage_matrix).reshape(-1, len(sample_ids))

# Load dosage of all SNPs with GWAS p-value < 1e-3
def load_all_dosage(gwas_snp_fn: str,
                    gwas_snp_dir: str='',
                    dosage_dir: str='/data100t1/home/wanying/CCHC/lipidomics/prediction_models/input_docs/subset_vcfs/train',
                    dosage_fn: str='species_chr*.vcf.gz.dosage'):
    '''
    Get dosage of all SNPs (GWAS pval<1e-3) of a given lipid from single-chromosome dosage files
    Params:
        - gwas_snp_fn: file name of GWAS SNPs (tab-delimited, must contain 'CHR' and 'POS' columns)
        - gwas_snp_dir: directory of GWAS SNPs. An empty string means the current directory
        - dosage_dir: directory of subsetted dosage files: species_chr*.vcf.gz.dosage
        - dosage_fn: file name of subset dosage files (by chromosome).
                    Replace chromosome number with '*', such as 'species_chr*.vcf.gz.dosage'
    Return:
        - df_gwas_snp: a dataframe of GWAS SNPs, sorted by CHR and POS
        - dosage_all: A float64 numpy array of dosage. Each row is a SNP, each column is a subject.
                      Chromosome blocks are stacked in the order produced by grouping on CHR.
                      Shape is (0, 0) when the GWAS file lists no SNPs.
    Raises:
        - FileNotFoundError: if the GWAS SNP file does not exist (previously the
          function called exit(), which terminates the notebook kernel)
    '''
    # os.path.join copes with a trailing slash and with the empty default directory
    gwas_snp_path = os.path.join(gwas_snp_dir, gwas_snp_fn)
    if not os.path.isfile(gwas_snp_path):
        raise FileNotFoundError(f'# ERROR: GWAS SNP file not found: {gwas_snp_path}')

    lip_name = gwas_snp_fn.split('_')[0]
    print('# Processing lipid:', lip_name)

    # Sort so that SNP positions are ascending within each chromosome, as get_doasge requires
    df_gwas_snp = pd.read_csv(gwas_snp_path, sep='\t').sort_values(by=['CHR', 'POS'])

    print('\n# Get dosage of GWAS SNPs to include in regression models')
    print('# - Checking by chromosome:')

    dosage_chunks = []  # One float64 array per chromosome, concatenated once at the end
    start_time = datetime.datetime.now()  # Time execution time
    for chr_num, df in df_gwas_snp.groupby(by='CHR'):
        print(f'#  chr{chr_num}')
        chr_dosage_path = os.path.join(dosage_dir, dosage_fn.replace('*', str(chr_num)))
        _, dosage_matrix = get_doasge(chr_dosage_path, list(df['POS']))
        # Cast per chromosome: get_doasge may return text values, and an all-missing
        # chromosome comes back as floats; a common dtype keeps concatenation safe
        dosage_chunks.append(dosage_matrix.astype('float64'))

    if dosage_chunks:
        dosage_all = np.concatenate(dosage_chunks, axis=0)
    else:
        # No SNPs at all: return an empty float array instead of failing
        dosage_all = np.empty((0, 0), dtype='float64')

    end_time = datetime.datetime.now()
    print(f'# - Checking finished in {(end_time-start_time).total_seconds()}s')
    print('-' * 50)
    return df_gwas_snp, dosage_all

# ---------------------- End of help functions ----------------------

# ################# Load lipidomic data #################
# Read the raw lipid measurements, then re-order/subset the rows so that they follow
# the sample order of the dosage files (needed to line up X and y in the regression).
print('# Load lipidomic data (lipid species)')
fn_lipid = '/data100t1/home/wanying/CCHC/lipidomics/input_docs/lipidomic_measures/lipid_species.txt'
# Keep the raw table under its own name so re-running later steps never sees a half-processed frame
df_lipid_raw = pd.read_csv(fn_lipid, sep='\t')
print(f"# - data loaded from {os.path.basename(fn_lipid)}: shape {df_lipid_raw.shape}")

# Re-order lipidomic data so that sample IDs match the order in genotype file
# Mapping LABID <-> genotype_ID; only samples having both a genotype ID and lipidomic data are kept
fn_id_mapping = '/data100t1/home/wanying/CCHC/doc/samples_IDs/202211_merged_RNA_lipid_protein_genotype_mapping_and_availability.txt'
df_id_mapping = (
    pd.read_csv(fn_id_mapping, sep='\t')
    .dropna(subset=['genotype_ID', 'lipidomic'])
    .drop_duplicates(subset='genotype_ID')[['LABID', 'genotype_ID']]
)

print(f'\n# Load genotype IDs for matching (only need to read the first line of dosage file)')
dosage_dir = '/data100t1/home/wanying/CCHC/lipidomics/prediction_models/input_docs/subset_vcfs/train'
fn_genotype = f'{dosage_dir}/species_chr22.vcf.gz.dosage'
with open(fn_genotype) as fh:
    header_fields = fh.readline().strip().split()
# Sample IDs are every field after 'FORMAT' (same rule get_doasge applies),
# rather than a hard-coded column offset
indx_samples = header_fields.index('FORMAT') + 1
df_genotype_id = pd.DataFrame(header_fields[indx_samples:], columns=['genotype_ID'])

print(f'# - Organize sample IDs so that their orders match in lipidomics data and dosage file')
# Merging with the genotype IDs on the left drives the row order of the result
df_lipid_mapped = df_id_mapping.merge(df_lipid_raw.drop_duplicates(subset='Sample ID'),
                                      left_on='LABID',
                                      right_on='Sample ID')
df_lipid = df_genotype_id.merge(df_lipid_mapped, on='genotype_ID')
print(f'# - Final processed lipidomic data: {len(df_lipid)}')


# ################# Load GWAS snps of each lipid and run regression #################
# For every lipid with a GWAS SNP file: load the dosage of its SNPs, fit a cross-validated
# elastic net (lipid level ~ dosage) and append the selected parameters to a text file.
# dosage_all: each row contains dosages of a single SNP across all individuals
# !! Lip species PI(15-MHDA_20:4)\PI(17:0_20:4) is missing
gwas_snp_dir = '/data100t1/home/wanying/CCHC/lipidomics/output/lip_species_GWAS_snps_pval_1e-3' # GWAS SNPs with p value<1e-3
gwas_snp_suffix = '_SNPs_pval_0.001.txt'
# Save coefficients, alpha and l1 ratios of selected model for each lipid.
# Timestamp is date-hourminutesecond: includes the hour and avoids ':' in a file name
output_file = f"{datetime.datetime.now().strftime('%Y%m%d-%H%M%S')}_lip_species_elasticnet_params.txt"

# Candidate penalty strengths for cross validation; l1_ratio is left at the estimator default.
# It is recommended to put more l1_ratio values close to 1 (i.e. Lasso) and fewer close to 0
# (i.e. Ridge) if a grid such as [0.1, 0.5, 0.7, 0.9, 0.95, 0.99, 1] is tried later.
alphas = [0.25, 0.5, 0.75, 1]

count = 0
# 'with' guarantees the file is closed even if the (long) loop is interrupted
with open(output_file, 'w') as output_fh:
    output_fh.write('lipid\talpha\tl1_ratio\tcoefficients\n') # write header line

    # NOTE(review): the first four columns are assumed to be ID/metadata columns
    # left over from the merge - TODO confirm against the lipid table
    for lip in df_lipid.columns[4:]:
        # Modified lipid name as used in the GWAS SNP file names
        lip_name = lip.replace('(', '-').replace(')', '-').replace(' ', '_').replace('/', '-')
        gwas_snp_fn = f'{lip_name}{gwas_snp_suffix}'
        if os.path.isfile(f'{gwas_snp_dir}/{gwas_snp_fn}'):
            # Get SNPs and dosage
            print(f'\n# Load GWAS SNPs for current lipid: {lip_name}')
            df_gwas_snp, dosage_all = load_all_dosage(gwas_snp_dir = gwas_snp_dir,
                                                      gwas_snp_fn = gwas_snp_fn,
                                                      dosage_dir = dosage_dir,
                                                      dosage_fn = 'species_chr*.vcf.gz.dosage')
            print(f'# - Number of SNPs loaded: {len(df_gwas_snp)}')

            print('# Run Elastic net regression')
            # lipid level
            y = df_lipid[lip]

            # NOTE(review): SNPs missing from the dosage files come back as NaN rows and
            # scikit-learn estimators reject NaN input by default - decide whether to
            # impute or drop them before fitting
            start_time = time.time()
            regr = ElasticNetCV(cv=10, random_state=0, n_jobs=32, alphas=alphas) # Try l1_ratio=0.5 (default) first
            regr.fit(dosage_all.T, y) # Transpose: samples in rows, SNPs in columns

            end_time = time.time()
            print(f'# - Model fitting finised in {(end_time - start_time):.4f}s')
            output_fh.write(f"{lip}\t{regr.alpha_}\t{regr.l1_ratio_}\t{','.join(str(x) for x in regr.coef_)}\n")
            output_fh.flush() # Keep finished lipids on disk if a later fit is interrupted
        else:
            print(f'# - Warning: {lip} not found')
        count += 1
        print(f'# #################### {count} lipid processed ####################')


# Load lipidomic data (lipid species)
# - data loaded from lipid_species.txt: shape (2499, 832)

# Load genotype IDs for matching (only need to read the first line of dosage file)
# - Organize sample IDs so that their orders match in lipidomics data and dosage file
# - Final processed lipidomic data: 1607

# Load GWAS SNPs for current lipid: Sph-d18:1-
# Processing lipid: Sph-d18:1-

# Get dosage of GWAS SNPs to include in regression models
# - Checking by chromosome:
# chr1
	............

KeyboardInterrupt: 

In [24]:
from sklearn.datasets import make_regression

# Toy regression problem with two features, used to sanity-check ElasticNet usage.
# X and y are reused by the ElasticNetCV experiments in the following cells.
X, y = make_regression(n_features=2, random_state=0)
regr = ElasticNet(random_state=0)
regr.fit(X, y)
# (A pasted REPL echo `ElasticNet(random_state=0)` used to sit here; it only built and
# discarded an unused estimator, so it was removed.)
print(regr.coef_)
print(regr.intercept_)
print(regr.predict([[0, 0]]))

[18.83816048 64.55968825]
1.4512607561654032
[1.45126076]


In [25]:
X.dtype

dtype('float64')

In [148]:
# Reference grid only: this list is NOT passed to the estimator below
alphas = [0, 0.25, 0.5, 0.75, 1]
# Experiment: 10-fold CV over an explicit list of alpha values (no l1_ratio grid given)
regr3 = ElasticNetCV(
    cv=10,
    random_state=0,
    alphas=[0.1, 0.15, 0.1994727942696716, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1],
).fit(X, y)  # fit() returns the fitted estimator itself

# Selected model
print(regr3.coef_)
print('Alpha:', regr3.alpha_)
print('L1 ratio:', regr3.l1_ratio_)
print('Intercept:', regr3.intercept_)
# Coefficients next to the alpha grid that was searched
print(regr3.coef_, regr3.alphas_)
# Prediction at the origin, to compare with the intercept
print(regr3.predict([[0,0]]))

[29.05783225 95.72681905]
Alpha: 0.01
L1 ratio: 0.5
Intercept: 0.02195782605835639
[29.05783225 95.72681905] [1.         0.9        0.8        0.7        0.6        0.5
 0.4        0.3        0.2        0.19947279 0.15       0.1
 0.01      ]
[0.02195783]


In [136]:
# Reference grid only: this list is NOT passed to the estimator below
alphas = [0, 0.25, 0.5, 0.75, 1]
# Experiment: let ElasticNetCV build its own alpha path (10 values, eps=1) with 10-fold CV
regr1 = ElasticNetCV(
    cv=10,
    random_state=0,
    eps=1,
    n_alphas=10,
).fit(X, y)  # fit() returns the fitted estimator itself

# Selected model
print(regr1.coef_)
print('Alpha:', regr1.alpha_)
print('L1 ratio:', regr1.l1_ratio_)
print('Intercept:', regr1.intercept_)
# Coefficients next to the alpha grid that was searched
print(regr1.coef_, regr1.alphas_)
# Prediction at the origin, to compare with the intercept
print(regr1.predict([[0,0]]))

[0.00000000e+00 1.80489686e-16]
Alpha: 199.4727942696716
L1 ratio: 0.5
Intercept: 4.0791205793431775
[0.00000000e+00 1.80489686e-16] [199.47279427 199.47279427 199.47279427 199.47279427 199.47279427
 199.47279427 199.47279427 199.47279427 199.47279427 199.47279427]
[4.07912058]


In [150]:
# Reference grid only: this list is NOT passed to the estimator below
alphas = [0, 0.25, 0.5, 0.75, 1]
# Experiment: 10-fold CV over a grid of l1_ratio values, alpha path chosen automatically
regr2 = ElasticNetCV(
    cv=10,
    random_state=0,
    l1_ratio=[.1, .5, .7, .9, .95, .99, 1],
).fit(X, y)  # fit() returns the fitted estimator itself

# Selected model
print(regr2.coef_)
print('Alpha:', regr2.alpha_)
print('L1 ratio:', regr2.l1_ratio_)
print('Intercept:', regr2.intercept_)
# Coefficients next to the alpha grids that were searched
print(regr2.coef_, regr2.alphas_)
# Prediction at the origin, to compare with the intercept
print(regr2.predict([[0,0]]))

[29.11573363 96.09577862]
Alpha: 0.09973639713483579
L1 ratio: 1.0
Intercept: 0.014044090462122405
[29.11573363 96.09577862] [[9.97363971e+02 9.30144978e+02 8.67456319e+02 8.08992667e+02
  7.54469269e+02 7.03620565e+02 6.56198894e+02 6.11973285e+02
  5.70728333e+02 5.32263154e+02 4.96390399e+02 4.62935347e+02
  4.31735054e+02 4.02637557e+02 3.75501133e+02 3.50193613e+02
  3.26591736e+02 3.04580546e+02 2.84052836e+02 2.64908626e+02
  2.47054671e+02 2.30404013e+02 2.14875554e+02 2.00393661e+02
  1.86887799e+02 1.74292187e+02 1.62545476e+02 1.51590455e+02
  1.41373766e+02 1.31845647e+02 1.22959692e+02 1.14672620e+02
  1.06944069e+02 9.97363971e+01 9.30144978e+01 8.67456319e+01
  8.08992667e+01 7.54469269e+01 7.03620565e+01 6.56198894e+01
  6.11973285e+01 5.70728333e+01 5.32263154e+01 4.96390399e+01
  4.62935347e+01 4.31735054e+01 4.02637557e+01 3.75501133e+01
  3.50193613e+01 3.26591736e+01 3.04580546e+01 2.84052836e+01
  2.64908626e+01 2.47054671e+01 2.30404013e+01 2.14875554e+01
  2.003

In [5]:
# NOTE(review): leftover interactive lookup. `f` is not defined in any cell visible here
# (the recorded help text shows it was an _io.TextIOWrapper), so this cell fails on a
# fresh kernel - define `f` first or delete the cell before sharing.
help(f.flush)

Help on built-in function flush:

flush() method of _io.TextIOWrapper instance
    Flush write buffers, if applicable.
    
    This is not implemented for read-only and non-blocking streams.

