In [None]:
## Test only the DA D52 untreated eQTL (n=1,024, FDR<5%)
# Run one instance of this notebook per chromosome (see `chrom` below).

In [220]:
import numpy as np
import pandas as pd
import xarray as xr
from limix.qc import quantile_gaussianize
from numpy import ones
from numpy.linalg import cholesky
from pandas_plink import read_plink1_bin
from struct_lmm2 import StructLMM2

In [51]:
# Project directory; the sample mapping, phenotype means, PCs and eQTL list read below are all paths relative to it.
mydir = "/hps/nobackup/stegle/users/acuomo/all_scripts/struct_LMM2/sc_neuroseq/"

In [52]:
# Mapping from each phenotype sample (phenotype_sample_id) to the donor it
# belongs to (genotype_individual_id). Both id columns are forced to str.
sample_mapping_file = mydir+"da_d52_untreated_pseudocells_20_harmony_PCs_atleast10cells_10nn_sample_mapping.csv"
id_dtypes = {"genotype_individual_id": str, "phenotype_sample_id": str}
sample_mapping = pd.read_csv(sample_mapping_file, dtype=id_dtypes, index_col=0)

In [53]:
# Peek at the first rows of the sample mapping (donor id vs. phenotype sample id).
sample_mapping.head()

Unnamed: 0,genotype_individual_id,phenotype_sample_id
1,HPSI0513i-veve_2,HPSI0513i-veve_2-cluster1
2,HPSI0513i-veve_2,HPSI0513i-veve_2-cluster2
3,HPSI0513i-veve_2,HPSI0513i-veve_2-cluster3
4,HPSI0513i-veve_2,HPSI0513i-veve_2-cluster4
5,HPSI0513i-veve_2,HPSI0513i-veve_2-cluster5


In [54]:
# Sorted array of the distinct donor ids present in the sample mapping.
donor_column = sample_mapping["genotype_individual_id"]
donors = donor_column.unique()
donors.sort()  # in-place sort of the array returned by .unique()
print("Number of unique donors: {}".format(donors.size))

Number of unique donors: 173


In [55]:
# Kinship (relatedness) matrix: tab-separated, first column holds the row labels.
kinship_file = "/hps/nobackup/hipsci/scratch/genotypes/imputed/REL-2018-01/Full_Filtered_Plink-f/hipsci.wec.gtarray.HumanCoreExome.imputed_phased.20170327.genotypes.norm.renamed.recode.filtered.rel"
K = pd.read_csv(kinship_file, sep="\t", index_col=0)
# Row and column labels must agree position by position.
labels_match = K.columns == K.index
assert labels_match.all()
K.shape

(1953, 1953)

In [56]:
# Wrap the kinship matrix as a labelled array, sort both axes by sample id,
# then keep only donors that also have a kinship entry.
# NOTE: `K` is rebound from a DataFrame to a DataArray, so this cell cannot be re-run on its own.
kinship_coords = {"sample_0": K.columns, "sample_1": K.index}
K = xr.DataArray(K.values, dims=["sample_0", "sample_1"], coords=kinship_coords)
K = K.sortby("sample_0").sortby("sample_1")
kinship_ids = set(K.sample_0.values)
donors = sorted(kinship_ids.intersection(donors))
print("Number of donors after kinship intersection: {}".format(len(donors)))

Number of donors after kinship intersection: 173


In [58]:
# Restrict the kinship matrix to the retained donors on both axes, in `donors` order.
K = K.sel(sample_0=donors, sample_1=donors)
# Both coordinate axes must now match `donors` element for element.
assert all(K.sample_0 == donors)
assert all(K.sample_1 == donors)
K.shape

(173, 173)

In [59]:
# Cholesky factor of the donor-level kinship matrix, labelled by donor id on
# the row axis. `K` is dropped afterwards since only the factor is used below.
kinship_donor_ids = K.sample_0.values
L_kinship = xr.DataArray(
    cholesky(K.values),
    dims=["sample", "col"],
    coords={"sample": kinship_donor_ids},
)
assert all(L_kinship.sample.values == kinship_donor_ids)
del K

# Keep only mapping rows whose donor survived the kinship intersection.
print("Sample mapping number of rows BEFORE intersection: {}".format(sample_mapping.shape[0]))
has_kinship = sample_mapping["genotype_individual_id"].isin(donors)
sample_mapping = sample_mapping.loc[has_kinship]
print("Sample mapping number of rows AFTER intersection: {}".format(sample_mapping.shape[0]))

Sample mapping number of rows BEFORE intersection: 1201
Sample mapping number of rows AFTER intersection: 1201


In [61]:
# Expand the donor-level factor to one row per mapping row by selecting each
# row's donor id (donor ids repeat, so rows of L_kinship are repeated).
donor_per_row = sample_mapping["genotype_individual_id"].values
L_expanded = L_kinship.sel(sample=donor_per_row)
assert all(L_expanded.sample.values == donor_per_row)
L_expanded.shape

(1201, 173)

In [196]:
# Genotypes in PLINK1 binary format. Only the .bed path is given; the companion
# files are presumably located next to it by read_plink1_bin (verify against pandas-plink docs).
plink_file = "/hps/nobackup/hipsci/scratch/genotypes/imputed/REL-2018-01/Full_Plink/hipsci.wec.gtarray.HumanCoreExome.imputed_phased.20180102.genotypes.norm.renamed.recode.vcf.gz.bed"
G = read_plink1_bin(plink_file)

Mapping files: 100%|██████████| 3/3 [01:17<00:00, 25.78s/it]


In [73]:
# Expression matrix: rows are traits (genes), columns are phenotype samples.
phenotype_file = mydir+"da_d52_untreated_pseudocells_20_harmony_PCs_atleast10cells_10nn_means.csv"
phenotype = pd.read_csv(phenotype_file, index_col=0)
shape_before = phenotype.shape
print("Phenotype shape BEFORE selection: {}".format(shape_before))

Phenotype shape BEFORE selection: (32738, 1201)


In [74]:
# Label the expression matrix (trait x cell) and reorder its columns to follow
# the row order of `sample_mapping`.
# NOTE: `phenotype` is rebound from a DataFrame to a DataArray here.
phenotype = xr.DataArray(
    phenotype.values,
    dims=["trait", "cell"],
    coords={"trait": phenotype.index.values, "cell": phenotype.columns.values},
)
phenotype_cell_ids = sample_mapping["phenotype_sample_id"].values
phenotype = phenotype.sel(cell=phenotype_cell_ids)
print("Phenotype shape AFTER selection: {}".format(phenotype.shape))
assert all(phenotype.cell.values == phenotype_cell_ids)

Phenotype shape AFTER selection: (32738, 1201)


In [78]:
# Principal components per phenotype sample (rows: samples, columns: PC1..PC500).
# The leading columns are later passed to StructLMM2 (see `n_envs`).
E_file = mydir+"da_d52_untreated_pseudocells_20_harmony_PCs_atleast10cells_10nn_mean_500pcs.csv"
E = pd.read_csv(E_file, index_col=0)
E.head(2)

Unnamed: 0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10,...,PC491,PC492,PC493,PC494,PC495,PC496,PC497,PC498,PC499,PC500
HPSI0513i-veve_2-cluster1,-1.077326,-0.627027,0.46002,-0.303477,1.081841,-0.743598,0.281105,-0.092173,0.309893,-0.142919,...,-0.228331,0.108394,0.094212,0.072755,0.072265,-0.153802,-0.020364,-0.05128,0.031763,0.030025
HPSI0513i-veve_2-cluster2,1.261613,0.064461,-0.574645,-0.210068,-0.119665,-0.513709,0.141375,0.237369,0.234771,0.832405,...,0.183989,-0.15442,0.125119,-0.253406,0.037648,-0.242901,-0.098564,-0.053207,-0.002465,-0.301348


In [79]:
# Label the PC matrix (cell x pc) and reorder its rows to follow the row order
# of `sample_mapping`, matching the phenotype columns.
# NOTE: `E` is rebound from a DataFrame to a DataArray here.
E = xr.DataArray(
    E.values,
    dims=["cell", "pc"],
    coords={"cell": E.index.values, "pc": E.columns.values},
)
env_cell_ids = sample_mapping["phenotype_sample_id"].values
E = E.sel(cell=env_cell_ids)
assert all(E.cell.values == env_cell_ids)

In [82]:
# Gene-level annotation (feature_id, chromosome, start, end), tab-separated.
# NOTE(review): `annotation_df` is not referenced by any later cell visible here.
gene_anno_file = "/nfs/leia/research/stegle/dseaton/hipsci/singlecell_neuroseq/data/metadata/gene_annotation/Homo_sapiens.GRCh37.82.Limix_annotation_gene_level.txt"
annotation_df = pd.read_csv(gene_anno_file, sep="\t")
annotation_df.head()

Unnamed: 0,feature_id,chromosome,start,end
0,ENSG00000223972,1,11869,14412
1,ENSG00000227232,1,14363,29806
2,ENSG00000243485,1,29554,31109
3,ENSG00000237613,1,34554,36081
4,ENSG00000268020,1,52473,54936


In [229]:
# Chromosome analysed in this run; used below to subset both the genotypes and the eQTL list.
chrom = 1

In [230]:
# Keep only variants on the selected chromosome (`G.chrom` is compared as a string).
G_sel = G.where(G.chrom == str(chrom), drop=True)
# Select one genotype row per mapping row via its donor id, so rows line up with L_expanded.
G_exp = G_sel.sel(sample=sample_mapping["genotype_individual_id"].values)
assert all(L_expanded.sample.values == G_exp.sample.values)

  return self.array[key]


In [234]:
# eQTL list to be tested (file name indicates FDR 5%); one row per gene/SNP pair.
neuro_eqtl_file = mydir+"da_d52_none_eqtl_fdr5pct.csv"
neuro_eqtl = pd.read_csv(neuro_eqtl_file, index_col = 0)
# The chromosome is the integer prefix of snp_id, i.e. everything before the first "_".
neuro_eqtl["chrom"] = [int(snp_id[:snp_id.find("_")]) for snp_id in neuro_eqtl["snp_id"]]
# Distinct genes with an eQTL on the chromosome analysed in this run.
on_chrom = neuro_eqtl['chrom'] == int(chrom)
genes = neuro_eqtl[on_chrom]['feature_id'].unique()
genes

array(['ENSG00000116138', 'ENSG00000171729', 'ENSG00000117533',
       'ENSG00000143156', 'ENSG00000143569', 'ENSG00000117305',
       'ENSG00000179163', 'ENSG00000188529', 'ENSG00000189266',
       'ENSG00000218510', 'ENSG00000043514', 'ENSG00000168653',
       'ENSG00000116641', 'ENSG00000142700', 'ENSG00000169213',
       'ENSG00000185104', 'ENSG00000116857', 'ENSG00000198892',
       'ENSG00000143374', 'ENSG00000163125', 'ENSG00000198756',
       'ENSG00000049247', 'ENSG00000049249', 'ENSG00000116288',
       'ENSG00000130764', 'ENSG00000198912', 'ENSG00000120333',
       'ENSG00000116678', 'ENSG00000142856', 'ENSG00000213625',
       'ENSG00000117569', 'ENSG00000178104', 'ENSG00000215790',
       'ENSG00000177614', 'ENSG00000117640', 'ENSG00000142684',
       'ENSG00000142733', 'ENSG00000176092', 'ENSG00000183726',
       'ENSG00000188672', 'ENSG00000117450', 'ENSG00000117461',
       'ENSG00000132781', 'ENSG00000197429', 'ENSG00000236624',
       'ENSG00000116791', 'ENSG000001379

In [232]:
# Load the HGNC symbol -> Ensembl gene id table, indexed by HGNC symbol.
mapping_df = pd.read_csv(
    '/nfs/leia/research/stegle/dseaton/genomes/hg19/annotation/geneid_mappings/hgnc_symbol2ensembl_gene_id.txt',
    sep='\t',
).set_index('hgnc_symbol')
mapping_df.head()

Unnamed: 0_level_0,ensembl_gene_id
hgnc_symbol,Unnamed: 1_level_1
SLC25A26,ENSG00000261657
,ENSG00000223116
HMGA1P6,ENSG00000233440
RNY3P4,ENSG00000207157
LINC00362,ENSG00000229483


In [235]:
# Take the first gene on this chromosome and translate its Ensembl id to the
# HGNC symbol used to look up the trait in `phenotype`.
gene = genes[0]
symbol_rows = mapping_df.query("ensembl_gene_id==\"{}\"".format(gene))
# First matching symbol; raises IndexError if the gene has no mapping row.
trait_name = symbol_rows.index[0]
trait_name

'DNAJC16'

In [236]:
# Single-gene dry run of the interaction model for `gene` / `trait_name`.
#
# `M` and `n_envs` are defined here so that the cell runs on a fresh kernel:
# in file order they are otherwise only assigned in later cells, which made
# this cell depend on leftover kernel state. The values mirror those later
# definitions (all-ones column with one row per phenotype column; 10 PCs).
# NOTE(review): confirm 10 is the number of environments intended for this dry run.
n_envs = 10
M = ones((phenotype.shape[1], 1))

# Expression of the selected trait across cells, rank-transformed to normal scores.
y = phenotype.sel(trait=trait_name)
y.values = quantile_gaussianize(y.values)

# Lead eQTL SNP ids reported for this gene.
leads = neuro_eqtl[neuro_eqtl['feature_id']==gene]['snp_id'].unique()

# NOTE: this overwrites the data held by `E` (same effect as the original cell).
E.values = quantile_gaussianize(E.values)

# Model inputs: phenotype, covariates M, first `n_envs` PC columns, expanded kinship factor.
slmm2 = StructLMM2(y.values, M, E.values[:,0:n_envs], L_expanded.values)

# Genotype columns restricted to the lead SNPs of this gene.
G_tmp = G_exp[:,G_exp['snp'].isin(leads)]

In [237]:
# Only variants with non-zero genotype variance are tested; the rest keep a NaN p-value.
genotype_matrix = G_tmp.values
ok = genotype_matrix.var(axis=0) > 0.0
n_variants = G_tmp.shape[1]
pvals = np.full(n_variants, np.nan)

In [238]:
# Run the interaction scan on the testable variants and tabulate one row per variant.
perm = None  # forwarded unchanged as the second argument of scan_interaction
testable = G_tmp[:, ok]
pvals[ok] = slmm2.scan_interaction(testable, perm)
pv_columns = {
    "chrom": G_tmp.chrom.values,
    "pv": pvals,
    "variant": G_tmp.snp.values,
}
pv = pd.DataFrame(pv_columns)
pv.head()

Elapsed: 1.6896357536315918
Elapsed: 0.8039572238922119
Elapsed: 1.056905746459961
Elapsed: 1.3192522525787354
Elapsed: 1.3285138607025146
Elapsed: 0.5340051651000977
Elapsed: 0.4274172782897949
Elapsed: 0.7832086086273193
Elapsed: 0.8753387928009033
Elapsed: 0.6392817497253418
Elapsed: 0.6131033897399902
Elapsed: 0.6226623058319092


Unnamed: 0,chrom,pv,variant
0,1,0.26287,1_15815579_G_A


In [139]:
# Covariate matrix passed to StructLMM2 as `M`: a single all-ones column with
# one row per phenotype column (i.e. per cell).
n_samples = phenotype.shape[1]
M = ones((n_samples, 1))
n_samples

Running for gene MIR1302-10


1201

In [239]:
# Interaction scan for every gene with an eQTL on this chromosome, testing only
# that gene's lead SNP(s).
#
# Results for all genes are collected in `pv_all`. Previously `pv` was
# overwritten on each iteration, so only the last gene's p-values survived the
# loop. `pv` is still left holding the last gene's table for compatibility
# with cells that read it afterwards.
n_envs = 10  # number of leading PC columns of E passed to the model
perm = None  # forwarded unchanged as the second argument of scan_interaction

# The transform of E does not depend on the gene, so it is applied once here
# instead of once per iteration. NOTE: this overwrites the data held by `E`.
E.values = quantile_gaussianize(E.values)

per_gene_tables = []
for gene in genes:
    # HGNC symbol for this Ensembl id; raises IndexError if the gene has no mapping row.
    trait_name = mapping_df.query("ensembl_gene_id==\"{}\"".format(gene)).index[0]
    y = phenotype.sel(trait=trait_name)
    leads = neuro_eqtl[neuro_eqtl['feature_id']==gene]['snp_id'].unique()
    y.values = quantile_gaussianize(y.values)
    slmm2 = StructLMM2(y.values, M, E.values[:,0:n_envs], L_expanded.values)
    G_tmp = G_exp[:,G_exp['snp'].isin(leads)]
    # Variants with zero genotype variance are not tested and keep a NaN p-value.
    ok = np.var(G_tmp.values, axis=0) > 0.0
    pvals = np.full(G_tmp.shape[1], np.nan)
    pvals[ok] = slmm2.scan_interaction(G_tmp[:, ok], perm)
    pv = pd.DataFrame({"chrom":G_tmp.chrom.values,
                    "pv":pvals,
                    "variant":G_tmp.snp.values})
    # Tag each row with its gene so the per-gene tables can be stacked.
    per_gene_tables.append(pv.assign(feature_id=gene, trait=trait_name))

# One row per (gene, lead variant); an empty, correctly labelled table if no genes were scanned.
if per_gene_tables:
    pv_all = pd.concat(per_gene_tables, ignore_index=True)
else:
    pv_all = pd.DataFrame(columns=["chrom", "pv", "variant", "feature_id", "trait"])

Elapsed: 1.0260446071624756
Elapsed: 0.5815634727478027
Elapsed: 1.0604355335235596
Elapsed: 1.8170783519744873
Elapsed: 0.4938688278198242
Elapsed: 1.548565149307251
Elapsed: 1.4679741859436035
Elapsed: 0.6404452323913574
Elapsed: 0.6374661922454834
Elapsed: 1.3243319988250732
Elapsed: 0.8788602352142334
Elapsed: 0.8798308372497559
Elapsed: 1.4465348720550537
Elapsed: 1.9050266742706299
Elapsed: 0.9164495468139648
Elapsed: 0.8703644275665283
Elapsed: 0.9924607276916504
Elapsed: 0.9794554710388184
Elapsed: 0.7588181495666504
Elapsed: 0.2652573585510254
Elapsed: 0.767737865447998
Elapsed: 1.1496331691741943
Elapsed: 0.6642227172851562
Elapsed: 0.665844202041626
Elapsed: 2.7786293029785156
Elapsed: 0.9748086929321289
Elapsed: 1.023348331451416
Elapsed: 0.6731655597686768
Elapsed: 1.4528288841247559
Elapsed: 0.806574821472168
Elapsed: 0.7060031890869141
Elapsed: 1.0291366577148438
Elapsed: 0.6241958141326904
Elapsed: 1.154214859008789
Elapsed: 1.7239100933074951
Elapsed: 1.740671873092651

Elapsed: 0.5540542602539062
Elapsed: 1.1036970615386963
Elapsed: 0.7876269817352295
Elapsed: 0.46523451805114746
Elapsed: 0.2573516368865967
Elapsed: 0.25873637199401855
Elapsed: 0.7468452453613281
Elapsed: 0.09299683570861816
Elapsed: 0.18518710136413574
Elapsed: 0.8920495510101318
Elapsed: 0.35134458541870117
Elapsed: 0.3747074604034424
Elapsed: 0.14770841598510742
Elapsed: 0.3059110641479492
Elapsed: 0.3545539379119873
Elapsed: 1.422365427017212
Elapsed: 0.7377445697784424
Elapsed: 0.7394578456878662
Elapsed: 2.734727144241333
Elapsed: 0.3180251121520996
Elapsed: 0.2737598419189453
Elapsed: 0.21634197235107422
Elapsed: 0.341477632522583
Elapsed: 0.46608662605285645
Elapsed: 0.6636557579040527
Elapsed: 0.24991536140441895
Elapsed: 0.574446439743042
Elapsed: 1.0599260330200195
Elapsed: 0.7752571105957031
Elapsed: 0.7789618968963623
Elapsed: 1.4929494857788086
Elapsed: 0.5935049057006836
Elapsed: 0.1619572639465332
Elapsed: 0.2833273410797119
Elapsed: 0.3014342784881592
Elapsed: 0.3631

Elapsed: 0.2689511775970459
Elapsed: 0.5002195835113525
Elapsed: 0.3259866237640381
Elapsed: 0.3268578052520752
Elapsed: 0.909050464630127
Elapsed: 0.22762322425842285
Elapsed: 0.2102189064025879
Elapsed: 0.08527278900146484
Elapsed: 0.28380870819091797
Elapsed: 0.18944215774536133
Elapsed: 0.44845128059387207
Elapsed: 0.8145809173583984
Elapsed: 0.511167049407959
Elapsed: 0.44123411178588867
Elapsed: 0.19645404815673828
Elapsed: 0.1970047950744629
Elapsed: 0.3731844425201416
Elapsed: 0.13718891143798828
Elapsed: 0.11614322662353516
Elapsed: 0.3558614253997803
Elapsed: 0.5045311450958252
Elapsed: 0.2037982940673828
Elapsed: 0.12042069435119629
Elapsed: 0.1933596134185791
Elapsed: 0.12285327911376953
Elapsed: 0.3002183437347412
Elapsed: 0.23147034645080566
Elapsed: 0.23736977577209473
Elapsed: 0.589069128036499
Elapsed: 0.3982245922088623
Elapsed: 0.24475359916687012
Elapsed: 0.23039698600769043
Elapsed: 0.17076611518859863
Elapsed: 0.3413534164428711
Elapsed: 0.5133438110351562
Elapsed

Elapsed: 0.2734224796295166
Elapsed: 0.1444387435913086
Elapsed: 0.16685700416564941
Elapsed: 0.1821143627166748
Elapsed: 0.5344133377075195
Elapsed: 0.1647655963897705
Elapsed: 0.2339780330657959
Elapsed: 0.2522697448730469
Elapsed: 0.15937471389770508
Elapsed: 0.3756692409515381
Elapsed: 0.12525153160095215
Elapsed: 0.2333667278289795
Elapsed: 0.18136000633239746
Elapsed: 0.1003727912902832
Elapsed: 0.12952852249145508
Elapsed: 0.13037896156311035
Elapsed: 0.7755169868469238
Elapsed: 0.2908790111541748
Elapsed: 0.07504439353942871
Elapsed: 0.07486081123352051
Elapsed: 0.15055203437805176
Elapsed: 0.12614035606384277
Elapsed: 0.22052288055419922
Elapsed: 0.31055402755737305
Elapsed: 0.21818041801452637
Elapsed: 0.18263816833496094
Elapsed: 0.19761276245117188
Elapsed: 0.20909643173217773
Elapsed: 0.14223718643188477
Elapsed: 0.13318157196044922
Elapsed: 0.07433724403381348
Elapsed: 0.08362603187561035
Elapsed: 0.07076597213745117
Elapsed: 0.10560369491577148
Elapsed: 0.154336452484130

In [240]:
# `pv` holds the table built in the last loop iteration, i.e. the last gene in `genes`.
pv.head()

Unnamed: 0,chrom,pv,variant
0,1,0.203726,1_111544137_A_C
