In [1]:
import numpy as np
from numpy import ones
from numpy_sugar import ddot
import os
import sys
import pandas as pd
from pandas_plink import read_plink1_bin
from numpy.linalg import cholesky
from numpy_sugar.linalg import economic_svd
import xarray as xr
from struct_lmm2 import StructLMM2
from limix.qc import quantile_gaussianize

In [2]:
# Chromosome whose SNPs and eQTL genes are analysed in this run.
chrom = 1
# Not referenced by any cell visible in this part of the notebook --
# presumably a permutation placeholder; TODO confirm before relying on it.
perm = None

In [3]:
## Cell -> donor lookup table. Besides mapping cells to donors, this file only
## includes donors for which we have single-cell data (a subset of all HipSci donors).
sample_mapping_file = "/hps/nobackup/stegle/users/acuomo/all_scripts/struct_LMM2/sc_neuroseq/input_files/sample_mapping_file.csv"
sample_mapping = pd.read_csv(
    sample_mapping_file,
    # Force both id columns to be read as strings.
    dtype={"genotype_individual_id": str, "phenotype_sample_id": str},
)
sample_mapping.head()

Unnamed: 0,genotype_individual_id,phenotype_sample_id
0,HPSI0714i-iudw_1,iudw_1-d30-cluster0
1,HPSI0714i-iudw_1,iudw_1-d30-cluster1
2,HPSI0714i-iudw_1,iudw_1-d30-cluster2
3,HPSI0714i-iudw_1,iudw_1-d30-cluster3
4,HPSI0714i-iudw_1,iudw_1-d30-cluster4


In [4]:
## Sorted array of the distinct donor ids present in the sample mapping
donors = np.sort(sample_mapping["genotype_individual_id"].unique())
print(f"Number of unique donors: {len(donors)}")

Number of unique donors: 188


In [5]:
## read in genotype file
# Only the .bed path is passed to read_plink1_bin; the companion .bim/.fam
# files are presumably picked up from the same location -- TODO confirm.
plink_file = "/hps/nobackup/hipsci/scratch/genotypes/imputed/2017-03-27/Full_Filtered_SNPs_Plink/hipsci.wec.gtarray.HumanCoreExome.imputed_phased.20170327.genotypes.norm.renamed.bed"
# Slow step: the recorded output of this cell shows ~6 minutes for "Mapping files".
G = read_plink1_bin(plink_file)

Mapping files: 100%|██████████| 3/3 [06:19<00:00, 126.41s/it]


In [6]:
## Load the GRM kinship matrix (tab-separated; the first column holds the row labels)
kinship_file = "/hps/nobackup/hipsci/scratch/genotypes/imputed/2017-03-27/Full_Filtered_SNPs_Plink-F/hipsci.wec.gtarray.HumanCoreExome.imputed_phased.20170327.genotypes.norm.renamed.kinship"
K = pd.read_csv(kinship_file, sep="\t", index_col=0)
# Row and column labels must agree position by position before they become coordinates.
assert (K.columns == K.index).all()
K = xr.DataArray(
    K.values,
    dims=["sample_0", "sample_1"],
    coords={"sample_0": K.columns, "sample_1": K.index},
)
# Order both axes by sample id.
K = K.sortby("sample_0")
K = K.sortby("sample_1")
# Keep only donors present in both the kinship matrix and the sample mapping.
donors = sorted(set(K.sample_0.values).intersection(donors))
print(f"Number of donors after kinship intersection: {len(donors)}")

Number of donors after kinship intersection: 171


In [7]:
## Restrict both kinship axes to the retained donors, in the order given by `donors`
K = K.sel(sample_0=donors, sample_1=donors)
for axis_name in ("sample_0", "sample_1"):
    # Each axis must now list exactly the retained donors, in order.
    assert all(K[axis_name] == donors)

In [8]:
## Cholesky factor of the kinship matrix, i.e. K = L @ L.T, with donor ids kept on the rows
donor_labels = K.sample_0.values
L_kinship = xr.DataArray(
    cholesky(K.values),
    dims=["sample", "col"],
    coords={"sample": donor_labels},
)
assert all(L_kinship.sample.values == donor_labels)
# Drop the name K; the remaining cells shown here work with L_kinship only.
del K

In [9]:
# Report the row count, drop cells whose donor is not in `donors`, report again.
print(f"Sample mapping number of rows BEFORE intersection: {len(sample_mapping)}")
donor_is_kept = sample_mapping["genotype_individual_id"].isin(donors)
sample_mapping = sample_mapping[donor_is_kept]
print(f"Sample mapping number of rows AFTER intersection: {len(sample_mapping)}")

Sample mapping number of rows BEFORE intersection: 8479
Sample mapping number of rows AFTER intersection: 7691


In [10]:
# Expand from donors to cells: select one row of L_kinship per row of the
# sample mapping, using that row's donor id.
donor_of_cell = sample_mapping["genotype_individual_id"].values
L_expanded = L_kinship.sel(sample=donor_of_cell)
assert all(L_expanded.sample.values == donor_of_cell)

In [11]:
# Environments: meta-cells by PCs (15)
E_file = "/hps/nobackup/stegle/users/acuomo/all_scripts/struct_LMM2/sc_neuroseq/input_files/15PCs.csv"
E = pd.read_csv(E_file, index_col=0)
E = xr.DataArray(
    E.values,
    dims=["cell", "pc"],
    coords={"cell": E.index.values, "pc": E.columns.values},
)
# Put the rows in the same order as the sample mapping.
mapped_cell_ids = sample_mapping["phenotype_sample_id"].values
E = E.sel(cell=mapped_cell_ids)
assert all(E.cell.values == mapped_cell_ids)

In [12]:
# Keep only the variants located on the chromosome under analysis ...
on_chrom = G.chrom == str(chrom)
G_sel = G.where(on_chrom, drop=True)
# ... then select one genotype row per row of the sample mapping (by donor id),
# so the sample axis lines up with L_expanded.
G_exp = G_sel.sel(sample=sample_mapping["genotype_individual_id"].values)
assert all(L_expanded.sample.values == G_exp.sample.values)

  return self.array[key]
    >>> with dask.config.set(**{'array.slicing.split_large_chunks': False}):
    ...     array[indexer]

To avoid creating the large chunks, set the option
    >>> with dask.config.set(**{'array.slicing.split_large_chunks': True}):
    ...     array[indexer]
  return self.array[key]


In [13]:
# Number of environment PCs. In the cells shown, this value is only used to
# build the output path (the "PCA{n_factors}" folder); it does NOT truncate E,
# because the slicing line below is commented out.
n_factors = 15
# E = E.values[:,0:n_factors]

In [14]:
# Decompose E via an economic SVD (gives the eigendecomposition of EEt);
# `us` holds U with each column scaled by the matching entry of S.
U, S, _ = economic_svd(E)
us = U * S
# Decomposition of K*EEt: one ddot(column of us, L_expanded) term per column.
n_components = us.shape[1]
Ls = [ddot(us[:, k], L_expanded) for k in range(n_components)]

In [15]:
# Phenotype (meta-cell gene expression)
# NOTE(review): read_pickle executes pickle deserialisation; only load files
# from a trusted source.
phenotype_file = "/hps/nobackup/stegle/users/acuomo/all_scripts/struct_LMM2/sc_neuroseq/input_files/phenotype.csv.pkl"
phenotype = pd.read_pickle(phenotype_file)

In [16]:
# Wrap the phenotype table as a labelled (trait x cell) array and keep only the
# cells listed in the sample mapping, in that order.
print(f"Phenotype shape BEFORE selection: {phenotype.shape}")
phenotype = xr.DataArray(
    phenotype.values,
    dims=["trait", "cell"],
    coords={"trait": phenotype.index.values, "cell": phenotype.columns.values},
)
kept_cell_ids = sample_mapping["phenotype_sample_id"].values
phenotype = phenotype.sel(cell=kept_cell_ids)
print(f"Phenotype shape AFTER selection: {phenotype.shape}")
assert all(phenotype.cell.values == kept_cell_ids)

Phenotype shape BEFORE selection: (32738, 8479)
Phenotype shape AFTER selection: (32738, 7691)


In [17]:
# Restrict the analysis to specific gene-SNP pairs:
# eQTL from neuroseq DA (day30 + day52 + day52 ROT treated).
# TODO: consider filtering further (significant only).
neuro_eqtl_file = "/hps/nobackup/stegle/users/acuomo/all_scripts/struct_LMM2/sc_neuroseq/input_files/DA_eqtl_allconditions_FDR5pct.csv"
neuro_eqtl = pd.read_csv(neuro_eqtl_file)
# The chromosome is parsed as the integer prefix of snp_id, up to the first "_".
neuro_eqtl["chrom"] = [
    int(snp_id[:snp_id.find("_")]) for snp_id in neuro_eqtl["snp_id"]
]
# Distinct genes with at least one eQTL on the chromosome under analysis.
on_this_chrom = neuro_eqtl['chrom'] == int(chrom)
genes = neuro_eqtl.loc[on_this_chrom, 'feature'].unique()

In [18]:
# Covariate matrix passed to the model: a single column of ones, with one row
# per entry along the phenotype's "cell" dimension.
n_samples = phenotype.sizes["cell"]
M = ones((n_samples, 1))

In [19]:
# Position within `genes` of the single trait analysed interactively below.
i = 109
trait_name = genes[i]
trait_name

'ENSG00000117245'

In [20]:
# Output location for the selected trait: <folder>/PCA<n_factors>/<trait_name>.tsv
folder = "/hps/nobackup/stegle/users/acuomo/all_scripts/struct_LMM2/sc_neuroseq/PCA/genetic_effects"
outfilename = f"{folder}/PCA{n_factors}/{trait_name}.tsv"
print(outfilename)

/hps/nobackup/stegle/users/acuomo/all_scripts/struct_LMM2/sc_neuroseq/PCA/genetic_effects/PCA15/ENSG00000117245.tsv


In [21]:
# Expression values of the selected trait, converted to a plain NumPy array.
y = np.asarray(phenotype.sel(trait=trait_name))

# Optional normalisation steps, currently disabled:
# y = quantile_gaussianize(y)
# E = quantile_gaussianize(E)

In [22]:
# null model
# Arguments as passed here: y (expression of the selected trait), M (column of
# ones), E (environment PCs) and Ls (the ddot(us[:, i], L_expanded) terms).
# What the constructor computes is defined in struct_lmm2 and not visible here.
slmm2 = StructLMM2(y, M, E, Ls)

In [23]:
# SNP ids listed in the eQTL table for the selected trait ...
is_trait_row = neuro_eqtl['feature'] == trait_name
leads = neuro_eqtl.loc[is_trait_row, 'snp_id'].unique()
# ... and the genotype columns whose `snp` label is one of them.
is_lead_variant = G_exp['snp'].isin(leads)
G_tmp = G_exp[:, is_lead_variant]

In [24]:
# predict in-sample effect size
# The layout of the returned object is defined by struct_lmm2; the next cell
# slices it as a 2-D array (rows, columns) -- NOTE(review): confirm the row
# and column meaning against the library.
b = slmm2.predict_interaction(G_tmp)

In [25]:
# Keep the first y.shape[0] rows of the prediction (one per phenotype value).
beta_star = b[:y.shape[0], :]

In [26]:
# Label the effect sizes: rows by phenotype cell id, columns by variant id.
snps = G_tmp["variant"].values
cells = phenotype["cell"].values
betas_df = pd.DataFrame(beta_star, index=cells, columns=snps)
betas_df.head()

Unnamed: 0,1_1_21041116_T_C
iudw_1-d30-cluster0,1.7e-05
iudw_1-d30-cluster1,-0.013735
iudw_1-d30-cluster2,0.016429
iudw_1-d30-cluster3,-0.014477
iudw_1-d30-cluster4,-0.009633


In [27]:
# Make sure the PCA{n_factors} output folder exists before writing: to_csv does
# not create missing directories, so a new n_factors value (or a fresh output
# tree) would otherwise fail here.
os.makedirs(os.path.dirname(outfilename), exist_ok=True)
# Write the per-cell effect sizes for this trait as a tab-separated file.
betas_df.to_csv(outfilename, sep='\t')

In [None]:
# Repeat the single-trait analysis for every eQTL gene on this chromosome and
# write one TSV of effect sizes per gene. Genes whose output file already
# exists are skipped, so the cell can be re-run to resume an interrupted job.

# Loop-invariant values, computed once instead of on every iteration.
folder = "/hps/nobackup/stegle/users/acuomo/all_scripts/struct_LMM2/sc_neuroseq/PCA/genetic_effects"
out_dir = f"{folder}/PCA{n_factors}"
# to_csv does not create missing directories; create the output folder up front.
os.makedirs(out_dir, exist_ok=True)
cells = phenotype["cell"].values

for trait_name in genes:

    outfilename = f"{out_dir}/{trait_name}.tsv"
    if os.path.exists(outfilename):
        print("File already exists, skipping gene")
        continue

    # Expression of this gene as a plain NumPy array.
    y = phenotype.sel(trait=trait_name)
    y = np.asarray(y)

    # null model
    slmm2 = StructLMM2(y, M, E, Ls)
    # Genotype columns whose `snp` label is listed for this gene in the eQTL table.
    leads = neuro_eqtl[neuro_eqtl['feature']==trait_name]['snp_id'].unique()
    G_tmp = G_exp[:,G_exp['snp'].isin(leads)]

    # predict in-sample effect size
    b = slmm2.predict_interaction(G_tmp)
    # Keep the first y.shape[0] rows of the prediction.
    beta_star = b[0:y.shape[0],:]
    snps = G_tmp["variant"].values
    betas_df = pd.DataFrame(data = beta_star, columns = snps, index = cells)
    betas_df.to_csv(outfilename, sep='\t')


File already exists, skipping gene
File already exists, skipping gene
File already exists, skipping gene
File already exists, skipping gene
File already exists, skipping gene
File already exists, skipping gene
File already exists, skipping gene
File already exists, skipping gene
File already exists, skipping gene
File already exists, skipping gene
File already exists, skipping gene
File already exists, skipping gene
File already exists, skipping gene
File already exists, skipping gene
File already exists, skipping gene
File already exists, skipping gene
File already exists, skipping gene
File already exists, skipping gene
File already exists, skipping gene
File already exists, skipping gene
File already exists, skipping gene
File already exists, skipping gene
File already exists, skipping gene
File already exists, skipping gene
File already exists, skipping gene
File already exists, skipping gene
File already exists, skipping gene
File already exists, skipping gene
File already exists,