In [2]:
import numpy as np
from numpy import ones
from numpy_sugar import ddot
import os
import sys
import pandas as pd
from pandas_plink import read_plink1_bin
from numpy.linalg import cholesky
from numpy_sugar.linalg import economic_svd
import xarray as xr
from limix.qc import quantile_gaussianize

In [3]:
from cellregmap import CellRegMap

In [4]:
# position, along the phenotype "trait" coordinate, of the single trait analysed in this notebook
i = 10

In [5]:
input_files_dir = "/hps/nobackup/stegle/users/acuomo/all_scripts/struct_LMM2/sc_endodiff/new/input_files/"

## cell-to-donor mapping; it only includes donors for which we have single-cell data
## (a subset of all HipSci donors)
sample_mapping_file = input_files_dir + "sample_mapping_file.csv"
sample_mapping = pd.read_csv(
    sample_mapping_file,
    # read both identifier columns as strings
    dtype={"genotype_individual_id": str, "phenotype_sample_id": str},
)

## unique donor identifiers, in sorted order
donors = np.sort(sample_mapping["genotype_individual_id"].unique())
print("Number of unique donors: {}".format(len(donors)))

Number of unique donors: 126


In [6]:
## load the donor-by-donor kinship matrix (GRM)
kinship_file = "/hps/nobackup/hipsci/scratch/genotypes/imputed/2017-03-27/Full_Filtered_SNPs_Plink-F/hipsci.wec.gtarray.HumanCoreExome.imputed_phased.20170327.genotypes.norm.renamed.kinship"
K = pd.read_csv(kinship_file, sep="\t", index_col=0)
# row and column labels must be identical and in the same order
assert (K.columns == K.index).all()
K = xr.DataArray(
    K.values,
    dims=["sample_0", "sample_1"],
    coords={"sample_0": K.columns, "sample_1": K.index},
)
K = K.sortby("sample_0").sortby("sample_1")
# keep the donors that appear both in the kinship matrix and in the sample mapping
donors = sorted(set(K.sample_0.values).intersection(donors))
print("Number of donors after kinship intersection: {}".format(len(donors)))

Number of donors after kinship intersection: 125


In [7]:
## restrict the kinship matrix to the retained donors (same order on both axes)
K = K.sel(sample_0=donors, sample_1=donors)
assert np.array_equal(K.sample_0.values, donors)
assert np.array_equal(K.sample_1.values, donors)

## Cholesky factor of the kinship, i.e. K = L @ L.T
L_kinship = xr.DataArray(
    cholesky(K.values),
    dims=["sample", "col"],
    coords={"sample": K.sample_0.values},
)
assert np.array_equal(L_kinship.sample.values, K.sample_0.values)

In [8]:
# the full kinship matrix is dropped here; only its Cholesky factor is used below
del K
print("Sample mapping number of rows BEFORE intersection: {}".format(len(sample_mapping)))
# keep only the cells whose donor is in the retained donor list
sample_mapping = sample_mapping.loc[sample_mapping["genotype_individual_id"].isin(donors)]
print("Sample mapping number of rows AFTER intersection: {}".format(len(sample_mapping)))

# expand from donors to cells: one row of the factor per cell, taken from that cell's donor
L_expanded = L_kinship.sel(sample=sample_mapping["genotype_individual_id"].values)
assert np.array_equal(L_expanded.sample.values, sample_mapping["genotype_individual_id"].values)

Sample mapping number of rows BEFORE intersection: 34256
Sample mapping number of rows AFTER intersection: 33964


In [9]:
# single-cell expression phenotypes, laid out as traits x cells
phenotype_file = input_files_dir + "phenotype.csv.pkl"
# NOTE: unpickling can execute arbitrary code; only load pickles from a trusted source
phenotype = pd.read_pickle(phenotype_file)
print("Phenotype shape BEFORE selection: {}".format(phenotype.shape))
# wrap in a labelled array, then keep the cells listed in the sample mapping (in that order)
phenotype = xr.DataArray(
    phenotype.values,
    dims=["trait", "cell"],
    coords={"trait": phenotype.index.values, "cell": phenotype.columns.values},
).sel(cell=sample_mapping["phenotype_sample_id"].values)
print("Phenotype shape AFTER selection: {}".format(phenotype.shape))
assert np.array_equal(phenotype.cell.values, sample_mapping["phenotype_sample_id"].values)

Phenotype shape BEFORE selection: (11231, 34256)
Phenotype shape AFTER selection: (11231, 33964)


In [10]:
## read in genotype file
# Only the .bed path is passed; the "Mapping files ... 3/3" progress output suggests the
# companion PLINK files next to it are picked up as well — TODO confirm.
plink_file = "/hps/nobackup/hipsci/scratch/genotypes/imputed/2017-03-27/Full_Filtered_SNPs_Plink/hipsci.wec.gtarray.HumanCoreExome.imputed_phased.20170327.genotypes.norm.renamed.bed"
# G carries per-variant `chrom`, `pos` and `snp` coordinates and a `sample` coordinate,
# all of which are used further down in this notebook.
G = read_plink1_bin(plink_file)

Mapping files: 100%|██████████| 3/3 [06:25<00:00, 128.51s/it]


In [11]:
# gene annotation file
# Tab-separated table indexed by `feature_id` (first column), with `chromosome`,
# `start` and `end` columns for each gene.
annotation_file = "/hps/nobackup/hipsci/scratch/processed_data/rna_seq/annotationFiles/Ensembl_75_Limix_Annotation_FC_Gene.txt"
anno_df = pd.read_csv(annotation_file, sep="\t", index_col=0)
# quick look at the first few annotated genes
anno_df.head()

Unnamed: 0_level_0,chromosome,start,end
feature_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ENSG00000271782,1,50902700,50902978
ENSG00000232753,1,103817769,103828355
ENSG00000225767,1,50927141,50936822
ENSG00000202140,1,50965430,50965529
ENSG00000207194,1,51048076,51048183


In [12]:
def cis_snp_selection(feature_id, annotation_df, G, window_size):
    """Select the variants of ``G`` that lie in the cis window around a feature.

    Parameters
    ----------
    feature_id : str
        Identifier looked up in ``annotation_df`` under the name ``feature_id``
        (an index level or a column).
    annotation_df : pandas.DataFrame
        Annotation table providing ``chromosome``, ``start`` and ``end`` per feature.
    G : genotype container
        Must expose per-variant ``chrom`` and ``pos`` and a ``where(cond, drop=True)``
        method (e.g. an ``xarray.DataArray``).
    window_size : int
        Distance added on either side of the feature; both window edges are exclusive.

    Returns
    -------
    The result of ``G.where(cond, drop=True)``, where ``cond`` keeps variants on the
    feature's chromosome with ``lowest - window_size < pos < highest + window_size``.

    Raises
    ------
    ValueError
        If ``feature_id`` matches no row, or more than one row, of ``annotation_df``.
    """
    # Bind the id as a query variable instead of pasting it into the expression,
    # so identifiers containing quotes cannot break (or alter) the query.
    matches = annotation_df.query("feature_id == @feature_id")
    if len(matches) != 1:
        # Fail with an explicit message rather than an unrelated error further down.
        raise ValueError(
            "expected exactly one annotation row for feature_id {!r}, found {}".format(
                feature_id, len(matches)
            )
        )
    feature = matches.iloc[0]
    # variant chromosomes are compared as strings
    chrom = str(feature["chromosome"])
    # make robust to features specified back-to-front (start > end)
    lowest = min(feature["start"], feature["end"])
    highest = max(feature["start"], feature["end"])
    # keep variants on the same chromosome that fall strictly inside the window
    in_window = (
        (G.chrom == chrom)
        & (G.pos > (lowest - window_size))
        & (G.pos < (highest + window_size))
    )
    return G.where(in_window, drop=True)

In [13]:
# cis window size: passed to cis_snp_selection as `window_size`, i.e. the distance added
# on each side of the gene (presumably in base pairs, the unit of the variant positions — confirm)
w = 100000

In [14]:
import re
# name of the i-th trait, e.g. 'ENSG00000001617_SEMA3F'
trait_name = phenotype["trait"].values[i]
# drop everything from the first underscore onwards, leaving the ENSG identifier
# that is used to look the gene up in the annotation table
feature = re.sub("_.*", "", trait_name)

In [15]:
trait_name

'ENSG00000001617_SEMA3F'

In [16]:
feature

'ENSG00000001617'

In [17]:
# genotypes restricted to the variants within the cis window (size w) around the selected gene
G_sel = cis_snp_selection(feature, anno_df, G, w)

In [18]:
G_sel

Unnamed: 0,Array,Chunk
Bytes,4.06 MB,2.58 MB
Shape,"(1610, 631)","(1024, 631)"
Count,102205 Tasks,2 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 4.06 MB 2.58 MB Shape (1610, 631) (1024, 631) Count 102205 Tasks 2 Chunks Type float32 numpy.ndarray",631  1610,

Unnamed: 0,Array,Chunk
Bytes,4.06 MB,2.58 MB
Shape,"(1610, 631)","(1024, 631)"
Count,102205 Tasks,2 Chunks
Type,float32,numpy.ndarray


In [19]:
# expand out genotypes from donors to cells: each cell gets the genotype row of its donor
G_exp = G_sel.sel(sample=sample_mapping["genotype_individual_id"].values)
# rows must line up with the expanded kinship factor
assert all(L_expanded.sample.values == G_exp.sample.values)

  return self.array[key]


In [20]:
# cell environments: cells x MOFA factors (20)
E_file = "/hps/nobackup/stegle/users/acuomo/all_scripts/struct_LMM2/sc_endodiff/debug_May2021/mofa_logcounts_model_factors.csv"
E = pd.read_csv(E_file, index_col=0)
E = xr.DataArray(
    E.values,
    dims=["cell", "pc"],
    coords={"cell": E.index.values, "pc": E.columns.values},
)
# select the cells listed in the sample mapping, in that order
E = E.sel(cell=sample_mapping["phenotype_sample_id"].values)
assert np.array_equal(E.cell.values, sample_mapping["phenotype_sample_id"].values)

In [21]:
# decomposition of EEt through the economic SVD of E (the third factor is not needed)
U, S, Vt = economic_svd(E)
del Vt
us = U * S
# decomposition of K*EEt: one term per column of U*S, combined with the expanded kinship factor
Ls = [ddot(us[:, k], L_expanded) for k in range(us.shape[1])]
del us

In [22]:
# number of cells (second axis of the traits x cells phenotype array)
n_samples = phenotype.sizes["cell"]
# single column of ones, one row per cell
M = np.ones((n_samples, 1))
# quantile-gaussianize the environments
E = quantile_gaussianize(E)

In [23]:
# expression of the selected trait across cells, quantile-gaussianized
y = quantile_gaussianize(phenotype.sel(trait=trait_name))
# genotypes to scan: the cis variants, already expanded to one row per cell
G_tmp = G_exp

In [24]:
import time
# time the model construction (wall clock)
start_time = time.time()
# only the first 10 columns of E are passed here, while Ls was built from all columns of E
crm = CellRegMap(y.values, M, E.values[:,0:10], Ls)
print("--- %s seconds ---" % (time.time() - start_time))

--- 87.79842710494995 seconds ---


In [25]:
G_tmp.shape

(33964, 631)

In [None]:
start_time = time.time()
# Hand the scan an in-memory NumPy array instead of the lazy, dask-backed DataArray:
# the genotypes are then loaded once rather than being pulled through the (very large)
# dask task graph while the scan iterates over the variants.
# NOTE(review): assumes scan_interaction accesses G variant by variant without converting
# it up front — confirm against the installed CellRegMap version. G_tmp itself is left
# untouched because its `chrom` / `snp` coordinates are needed afterwards.
pvals = crm.scan_interaction(G_tmp.values)[0]
print("--- %s seconds ---" % (time.time() - start_time))

  5%|▌         | 33/631 [2:39:35<47:13:41, 284.32s/it]

In [50]:
%tb

SystemExit: No Eigenvalue is bigger than 0!!

In [None]:
# one row per scanned variant: chromosome, interaction p-value and SNP identifier
pv = pd.DataFrame(
    {
        "chrom": G_tmp.chrom.values,
        "pv": pvals,
        "variant": G_tmp.snp.values,
    }
)
pv.head()