In [1]:
# Test only the DA D52 untreated eQTL (n=1,024, FDR<5%)
# that also showed GxE effects (n=134) - this can probably be done in a single run.

In [2]:
import numpy as np
import pandas as pd
from pandas_plink import read_plink1_bin
import xarray as xr
from numpy.linalg import cholesky
from numpy import ones
from struct_lmm2 import StructLMM2
from limix.qc import quantile_gaussianize

In [3]:
# Base directory for this analysis; the sample mapping, phenotype, PC and
# eQTL file names used below are appended to it.
# NOTE(review): absolute, cluster-specific path - the notebook will not run elsewhere as is.
mydir = "/hps/nobackup/stegle/users/acuomo/all_scripts/struct_LMM2/sc_neuroseq/"

In [4]:
# Table linking each pseudocell (phenotype_sample_id) to the donor it comes
# from (genotype_individual_id). Both id columns are read as strings.
sample_mapping_file = mydir + "da_d52_untreated_pseudocells_20_harmony_PCs_atleast10cells_10nn_sample_mapping.csv"
sample_mapping = pd.read_csv(
    sample_mapping_file,
    index_col=0,
    dtype={"genotype_individual_id": str, "phenotype_sample_id": str},
)

In [5]:
# Preview the first rows of the donor <-> pseudocell mapping.
sample_mapping.head()

Unnamed: 0,genotype_individual_id,phenotype_sample_id
1,HPSI0513i-veve_2,HPSI0513i-veve_2-cluster1
2,HPSI0513i-veve_2,HPSI0513i-veve_2-cluster2
3,HPSI0513i-veve_2,HPSI0513i-veve_2-cluster3
4,HPSI0513i-veve_2,HPSI0513i-veve_2-cluster4
5,HPSI0513i-veve_2,HPSI0513i-veve_2-cluster5


In [6]:
# Sorted array of the distinct donor ids that appear in the sample mapping.
donors = np.sort(sample_mapping["genotype_individual_id"].unique())
print("Number of unique donors: {}".format(len(donors)))

Number of unique donors: 173


In [7]:
# Sample-by-sample kinship matrix stored as a tab-separated table whose first
# column holds the row labels.
kinship_file = "/hps/nobackup/hipsci/scratch/genotypes/imputed/REL-2018-01/Full_Filtered_Plink-f/hipsci.wec.gtarray.HumanCoreExome.imputed_phased.20170327.genotypes.norm.renamed.recode.filtered.rel"
K = pd.read_csv(kinship_file, sep="\t", index_col=0)
# Rows and columns must list the same samples in the same order.
assert (K.columns == K.index).all()
K.shape

(1953, 1953)

In [8]:
# Wrap the kinship matrix in a labelled DataArray and order both axes by
# sample id.
K = (
    xr.DataArray(
        K.values,
        dims=["sample_0", "sample_1"],
        coords={"sample_0": K.columns, "sample_1": K.index},
    )
    .sortby("sample_0")
    .sortby("sample_1")
)
# Keep only donors that have a kinship entry; `donors` becomes a sorted list.
donors = sorted(set(K.sample_0.values).intersection(donors))
print("Number of donors after kinship intersection: {}".format(len(donors)))

Number of donors after kinship intersection: 173


In [9]:
# Restrict the kinship matrix to the retained donors (same order on both axes)
# and confirm that the labels come back in exactly that order.
K = K.sel(sample_0=donors, sample_1=donors)
assert all(K["sample_0"] == donors)
assert all(K["sample_1"] == donors)
K.shape

(173, 173)

In [10]:
# Cholesky factor of the donor kinship matrix, labelled by donor on the row
# axis. numpy's cholesky raises if K is not positive definite.
L_kinship = xr.DataArray(
    cholesky(K.values),
    dims=["sample", "col"],
    coords={"sample": K.sample_0.values},
)
assert all(L_kinship.sample.values == K.sample_0.values)
# K is no longer needed once its factor has been computed.
del K

# Drop pseudocells whose donor has no kinship entry, reporting the row count
# before and after.
print("Sample mapping number of rows BEFORE intersection: {}".format(len(sample_mapping)))
sample_mapping = sample_mapping.loc[sample_mapping["genotype_individual_id"].isin(donors)]
print("Sample mapping number of rows AFTER intersection: {}".format(len(sample_mapping)))

Sample mapping number of rows BEFORE intersection: 1201
Sample mapping number of rows AFTER intersection: 1201


In [11]:
# Repeat each donor's row of the kinship factor once per pseudocell, following
# the row order of sample_mapping.
L_expanded = L_kinship.sel(sample=sample_mapping["genotype_individual_id"].to_numpy())
assert all(
    L_expanded.sample.values == sample_mapping["genotype_individual_id"].to_numpy()
)
L_expanded.shape

(1201, 173)

In [12]:
# Genotypes in PLINK1 binary format, loaded with pandas_plink.read_plink1_bin.
# Later cells index G by its `sample` and `snp` coordinates.
# NOTE(review): absolute, cluster-specific path.
plink_file = "/hps/nobackup/hipsci/scratch/genotypes/imputed/REL-2018-01/Full_Plink/hipsci.wec.gtarray.HumanCoreExome.imputed_phased.20180102.genotypes.norm.renamed.recode.vcf.gz.bed"
G = read_plink1_bin(plink_file)

Mapping files: 100%|██████████| 3/3 [01:14<00:00, 24.83s/it]


In [13]:
# Expression table read with the first column as index; the next cell treats
# rows as traits and columns as pseudocells.
phenotype_file = mydir + "da_d52_untreated_pseudocells_20_harmony_PCs_atleast10cells_10nn_means.csv"
phenotype = pd.read_csv(phenotype_file, index_col=0)
print("Phenotype shape BEFORE selection: {}".format(phenotype.shape))

Phenotype shape BEFORE selection: (32738, 1201)


In [14]:
# Convert to a labelled (trait x cell) DataArray and reorder / subset the
# cells so they follow the rows of sample_mapping.
phenotype = xr.DataArray(
    phenotype.values,
    dims=["trait", "cell"],
    coords={
        "trait": phenotype.index.values,
        "cell": phenotype.columns.values,
    },
).sel(cell=sample_mapping["phenotype_sample_id"].values)
print("Phenotype shape AFTER selection: {}".format(phenotype.shape))
assert all(phenotype.cell.values == sample_mapping["phenotype_sample_id"].values)

Phenotype shape AFTER selection: (32738, 1201)


In [15]:
# Environment matrix: one row per pseudocell, one column per principal
# component (PC1 ... PC500 in the preview below).
E_file = mydir+"da_d52_untreated_pseudocells_20_harmony_PCs_atleast10cells_10nn_mean_500pcs.csv"
E = pd.read_csv(E_file, index_col=0)
E.head(2)

Unnamed: 0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10,...,PC491,PC492,PC493,PC494,PC495,PC496,PC497,PC498,PC499,PC500
HPSI0513i-veve_2-cluster1,-1.077326,-0.627027,0.46002,-0.303477,1.081841,-0.743598,0.281105,-0.092173,0.309893,-0.142919,...,-0.228331,0.108394,0.094212,0.072755,0.072265,-0.153802,-0.020364,-0.05128,0.031763,0.030025
HPSI0513i-veve_2-cluster2,1.261613,0.064461,-0.574645,-0.210068,-0.119665,-0.513709,0.141375,0.237369,0.234771,0.832405,...,0.183989,-0.15442,0.125119,-0.253406,0.037648,-0.242901,-0.098564,-0.053207,-0.002465,-0.301348


In [16]:
# Convert to a labelled (cell x pc) DataArray and align the cells with the
# rows of sample_mapping.
E = xr.DataArray(
    E.values,
    dims=["cell", "pc"],
    coords={"cell": E.index.values, "pc": E.columns.values},
).sel(cell=sample_mapping["phenotype_sample_id"].values)
assert all(E.cell.values == sample_mapping["phenotype_sample_id"].values)

In [17]:
# Gene-level annotation table (feature_id, chromosome, start, end), tab-separated.
# NOTE(review): annotation_df is not used by any later cell shown here.
gene_anno_file = "/nfs/leia/research/stegle/dseaton/hipsci/singlecell_neuroseq/data/metadata/gene_annotation/Homo_sapiens.GRCh37.82.Limix_annotation_gene_level.txt"
annotation_df = pd.read_csv(gene_anno_file, sep="\t")
annotation_df.head()

Unnamed: 0,feature_id,chromosome,start,end
0,ENSG00000223972,1,11869,14412
1,ENSG00000227232,1,14363,29806
2,ENSG00000243485,1,29554,31109
3,ENSG00000237613,1,34554,36081
4,ENSG00000268020,1,52473,54936


In [18]:
# eQTL results table; one row per (feature_id, snp_id) pair.
neuro_eqtl_file = mydir+"da_d52_none_eqtl_fdr5pct.csv"
neuro_eqtl = pd.read_csv(neuro_eqtl_file, index_col=0)

# The chromosome is the part of snp_id before the first underscore
# (ids look like "9_94639418_T_A"). Splitting on the underscore, rather than
# slicing up to str.find("_"), avoids silently dropping the last character
# when an id has no underscore (find returns -1 in that case).
neuro_eqtl["chrom"] = (
    neuro_eqtl["snp_id"].str.split("_", n=1).str[0].astype(int)
)

# Distinct genes with an eQTL; the count is shown as the cell output.
genes = neuro_eqtl['feature_id'].unique()
len(genes)

1024

In [19]:
# Distinct SNP ids across all eQTL rows; the count is the cell output.
leads = pd.unique(neuro_eqtl["snp_id"])
len(leads)

1018

In [20]:
# Keep only the genotype columns whose `snp` id is one of the eQTL SNPs in `leads`.
G_sel = G[:,G['snp'].isin(leads)]
G_sel.shape

(1953, 1018)

In [21]:
# Repeat each donor's genotype row once per pseudocell (same row order as
# sample_mapping), and check it lines up with the expanded kinship factor.
G_exp = G_sel.sel(sample=sample_mapping["genotype_individual_id"].to_numpy())
assert all(G_exp.sample.values == L_expanded.sample.values)

  return self.array[key]


In [22]:
# Lookup table from HGNC symbol (index) to Ensembl gene id (column).
mapping_df = pd.read_csv(
    '/nfs/leia/research/stegle/dseaton/genomes/hg19/annotation/geneid_mappings/hgnc_symbol2ensembl_gene_id.txt',
    sep='\t',
).set_index('hgnc_symbol')
mapping_df.head()

Unnamed: 0_level_0,ensembl_gene_id
hgnc_symbol,Unnamed: 1_level_1
SLC25A26,ENSG00000261657
,ENSG00000223116
HMGA1P6,ENSG00000233440
RNY3P4,ENSG00000207157
LINC00362,ENSG00000229483


In [23]:
# Use the first eQTL gene as the example for the remaining cells.
gene = genes[0]
gene

'ENSG00000090054'

In [24]:
# HGNC symbol of the selected gene: the first index entry of mapping_df whose
# ensembl_gene_id equals `gene` (IndexError if there is no match).
trait_name = mapping_df.index[mapping_df["ensembl_gene_id"] == gene][0]
trait_name

'SPTLC1'

In [25]:
# Number of pseudocells (second axis of phenotype) and an all-ones column of
# that length, passed to the model as M.
n_samples = phenotype.shape[1]
M = np.ones((n_samples, 1))
n_samples

1201

In [27]:
# Genotypes of the eQTL SNP(s) reported for the selected gene.
evars = neuro_eqtl.loc[neuro_eqtl["feature_id"] == gene, "snp_id"].unique()
G_tmp = G_exp[:, G_exp["snp"].isin(evars)]

# Expression of the selected gene, passed through limix's quantile_gaussianize.
y = phenotype.sel(trait=trait_name)
y.values = quantile_gaussianize(y.values)

# NOTE: this overwrites the values of the global E in place.
E.values = quantile_gaussianize(E.values)

# Only the first n_envs columns of E are given to the model.
n_envs = 10
slmm2 = StructLMM2(y.values, M, E.values[:, :n_envs], L_expanded.values)

In [30]:
# Boolean mask over the columns of G_tmp: True for SNPs whose genotype varies
# across pseudocells. Columns whose variance is NaN compare False and are
# therefore excluded as well.
# The former `betas = np.full((n_samples, ...), np.nan)` pre-allocation was
# removed: it was overwritten by the next cell before being read, and its row
# count (n_samples) did not match the 2 * n_samples rows that
# predict_interaction returned in the recorded run.
ok = np.var(G_tmp.values, axis=0) > 0.0

In [31]:
# Predict effects only for SNPs that passed the variance filter, then scatter
# the result into a NaN-filled array with one column per SNP in G_tmp.
# This keeps column j of `betas` aligned with G_tmp.snp[j], so later cells
# that label betas[:, j] with G_tmp.snp.values[j] stay correct even when some
# SNPs were filtered out (their columns remain NaN).
betas_ok = np.asarray(slmm2.predict_interaction(G_tmp[:, ok]))
betas = np.full((betas_ok.shape[0], G_tmp.shape[1]), np.nan)
betas[:, ok] = betas_ok
betas

array([[-0.09028429],
       [-0.14016259],
       [-0.05583952],
       ...,
       [-0.1371613 ],
       [-0.11080672],
       [-0.33967526]])

In [32]:
# Inspect the shape of the predictions; the recorded run shows 2402 rows,
# i.e. twice n_samples (1201).
betas.shape

(2402, 1)

In [42]:
# Check that the pseudocell ids repeated twice have as many entries as betas has rows.
np.tile(y.cell.values,2).shape

(2402,)

In [46]:
# SNP id(s) selected for this gene.
G_tmp.snp.values

array(['9_94639418_T_A'], dtype='<U16')

In [54]:
# Table pairing the first column of betas (named after the first SNP in G_tmp)
# with a pseudocell id per row.
# NOTE(review): tiling the ids twice assumes the rows of betas are two
# consecutive passes over the pseudocells in the order of y.cell - confirm
# against predict_interaction.
beta = pd.DataFrame(
    {
        str(G_tmp.snp.values[0]): betas[:, 0],
        "pseudocell": np.tile(y.cell.values, 2),
    }
)
beta.head()

Unnamed: 0,9_94639418_T_A,pseudocell
0,-0.090284,HPSI0513i-veve_2-cluster1
1,-0.140163,HPSI0513i-veve_2-cluster2
2,-0.05584,HPSI0513i-veve_2-cluster3
3,-0.195948,HPSI0513i-veve_2-cluster4
4,-0.068237,HPSI0513i-veve_2-cluster5
