eQTL mapping with tensorQTL. Try this if FastQTL does not work for you.

In [None]:
# Uncomment and run once if tensorQTL is not installed yet.
# %pip (rather than a shell "! pip3") installs into the environment this kernel runs in.
# %pip install tensorqtl

In [97]:
import pandas as pd
import numpy as np
from io import StringIO
import subprocess
import tensorqtl
from tensorqtl import pgen, cis, trans, post

# 0. Test section (ignore)

In [9]:
# Smoke test: open the example PLINK pgen fileset with tensorQTL's reader
# and preview the two tables it provides.
plink_prefix_path = '/data100t1/home/wanying/CCHC/eQTL_gtex_pipeline/code/example_data/cchc_chr8_example'
pgr = pgen.PgenReader(plink_prefix_path)

# Genotype table (variants as rows, samples as columns) and the per-variant
# table (chrom / pos / index) kept by the reader
genotype_df = pgr.load_genotypes()
variant_df = pgr.variant_df

# Show the first three rows of each table, genotypes first
display(genotype_df.head(3), variant_df.head(3))


Unnamed: 0,HD0280_HA0023,BD2180_BD6180,BD2179_BD6179,BD2288_BD6288,HD0119_HD4119,HD0107_HD4107,LD0213_LA0013,BD3539_BA0523,LD0175_LD4175,BD2527_BD6527,...,HD0275_HA0018,BD1533_BD5533,BD2455_BD6455,LD0144_LD4144,BD2188_BD6188,BD2606_BD6606,LD0082_LD4082,BD2833_BD6833,BD3346_BA0352,HD0145_HD4145
chr7:12795:C:T,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
chr7:20992:C:G,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
chr7:21264:C:T,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Unnamed: 0_level_0,chrom,pos,index
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
chr7:12795:C:T,7,12795,0
chr7:20992:C:G,7,20992,1
chr7:21264:C:T,7,21264,2


In [36]:
# Get genotypes by bcftools (for fun): one CSV row per variant holding the
# variant ID, chromosome and position, followed by one GT field per sample.
vcf_fn = '/data100t1/home/wanying/CCHC/eQTL_gtex_pipeline/code/example_data/cchc_chr8_example.vcf.gz'

# Build the command as an explicit argument list instead of splitting a string
# on whitespace, so a VCF path that contains spaces still reaches bcftools as a
# single argument. The format string ends in a literal backslash-n, which
# bcftools itself expands to a newline.
cmd = ['bcftools', 'query', '-f', '%ID,%CHROM,%POS[,%GT]\\n', vcf_fn]

# check=True raises CalledProcessError when bcftools exits with a non-zero status
result = subprocess.run(cmd, capture_output=True, text=True, check=True).stdout

# Parse the captured text as a header-less CSV (columns are numbered 0..N)
gp_data = StringIO(result)
df = pd.read_csv(gp_data, header=None)
df.head(2)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1600,1601,1602,1603,1604,1605,1606,1607,1608,1609
0,chr7:12795:C:T,chr7,12795,0|0,0|0,0|0,0|0,0|0,0|0,0|0,...,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0
1,chr7:20992:C:G,chr7,20992,0|0,0|0,0|0,0|0,0|0,0|0,0|0,...,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0,0|0


# 1. Run tensorQTL
Test run on chr10

## 1.1 Load data

In [108]:
# define paths to data
plink_prefix_path = '/data100t1/home/wanying/CCHC/CHARGE_GWAS/GEM_genotype_files/CCHC_chr10'
expression_bed = '/data100t1/home/wanying/CCHC/eQTL_gtex_pipeline/input/chr_col_fixed_tmm/CCHC_batch1_2_3_4_TMM.chr10.chr_fixed.bed.gz'
covariates_file = '/data100t1/home/wanying/CCHC/eQTL_gtex_pipeline/supporting_files/combined_covariates_peers_from_different_runs/redo_combined_covar_sex_age_pc1-5.60_peers.combined_covariates.txt'
# prefix is used in the names of the output files written by the mapping step
prefix = 'test_run_chr10'

# load phenotypes and covariates
phenotype_df, phenotype_pos_df = tensorqtl.read_phenotype_bed(expression_bed)
# The covariates table is transposed after reading
# (presumably covariates x samples on disk, samples x covariates afterwards -- confirm)
covariates_df = pd.read_csv(covariates_file, sep='\t', index_col=0).T

# Fail early, before the expensive genotype load and mapping run, if the
# phenotype columns and the covariate rows do not list the same samples in
# the same order.
assert phenotype_df.columns.equals(covariates_df.index), \
    'Sample IDs (or their order) differ between the expression BED and the covariates file'

# PLINK reader for genotypes
pgr = pgen.PgenReader(plink_prefix_path)
genotype_df = pgr.load_genotypes()
variant_df = pgr.variant_df

# Every phenotyped sample must also be present among the genotype columns
missing_samples = phenotype_df.columns.difference(genotype_df.columns)
assert missing_samples.empty, \
    f'{len(missing_samples)} phenotype samples are missing from the genotype data'

## 1.2 Nominal run in cis region

In [111]:
# Map all cis-associations for all genes in phenotype_df.
# Results are written to parquet files in output_dir, one per chromosome
# (e.g. <prefix>.cis_qtl_pairs.10.parquet), and are read back from there in the
# next step. The call is therefore not assigned to a variable.
# NOTE(review): map_nominal is believed to return nothing for this kind of run;
# confirm against the installed tensorQTL version.
output_dir = '/data100t1/home/wanying/CCHC/eQTL_gtex_pipeline/output/tensorQTL_runs'
cis.map_nominal(genotype_df, variant_df,
                phenotype_df, phenotype_pos_df,
                prefix, covariates_df=covariates_df,
                output_dir=output_dir)

# # Only run genes on one chromosome
# cis.map_nominal(genotype_df, variant_df,
#                 phenotype_df.loc[phenotype_pos_df['chr'] == 'chr18'],
#                 phenotype_pos_df.loc[phenotype_pos_df['chr'] == 'chr18'],
#                 prefix, covariates_df=covariates_df)

cis-QTL mapping: nominal associations for all variant-phenotype pairs
  * 1288 samples
  * 1002 phenotypes
  * 67 covariates
  * 3315380 variants
  * cis-window: ±1,000,000
  * checking phenotypes: 1002/1002
  * Computing associations
    Mapping chromosome 10
    processing phenotype 1002/1002
    time elapsed: 19.11 min
    * writing output
done.


In [120]:
# Read the nominal cis-QTL pairs written for chromosome 10 back into a DataFrame
output_dir = '/data100t1/home/wanying/CCHC/eQTL_gtex_pipeline/output/tensorQTL_runs'
pairs_path = f'{output_dir}/{prefix}.cis_qtl_pairs.10.parquet'
pairs_df = pd.read_parquet(pairs_path)

# Preview the first rows
pairs_df.head()


Unnamed: 0,phenotype_id,variant_id,start_distance,af,ma_samples,ma_count,pval_nominal,slope,slope_se
0,ENSG00000261456.5,chr10:10537:C:A,-63626,0.0,0,0,,,
1,ENSG00000261456.5,chr10:10537:C:T,-63626,0.0,0,0,,,
2,ENSG00000261456.5,chr10:10550:G:T,-63613,0.0,0,0,,,
3,ENSG00000261456.5,chr10:10552:TCAG:T,-63611,0.0,0,0,,,
4,ENSG00000261456.5,chr10:10554:A:G,-63609,0.0,0,0,,,
