In [126]:
import numpy as np
from numpy import ones
from numpy_sugar import ddot
import os
import sys
import pandas as pd
from pandas_plink import read_plink1_bin
from numpy.linalg import cholesky
from numpy_sugar.linalg import economic_svd
import xarray as xr
from limix.qc import quantile_gaussianize

In [127]:
from cellregmap import CellRegMap

In [129]:
# Chromosome of interest.
# NOTE(review): no active code in the visible notebook reads this variable
# (cis_snp_selection derives the chromosome from the annotation table) — confirm it is still needed.
chrom = 1

In [130]:
input_files_dir = "/hps/nobackup/stegle/users/acuomo/all_scripts/struct_LMM2/sc_endodiff/new/input_files/"

## Cell-to-donor mapping table. It only includes donors for which we have
## single-cell data (a subset of all HipSci donors).
sample_mapping_file = f"{input_files_dir}sample_mapping_file.csv"
sample_mapping = pd.read_csv(
    sample_mapping_file,
    dtype={"genotype_individual_id": str, "phenotype_sample_id": str},
)

## sorted array of the distinct donor ids
donors = np.sort(sample_mapping["genotype_individual_id"].unique())
print(f"Number of unique donors: {len(donors)}")

Number of unique donors: 126


In [131]:
## read in genotype file
# Only the .bed path is passed; the recorded output for this cell reports three
# files being mapped, so the companion files are presumably located from the
# same prefix (confirm against the pandas_plink documentation).
plink_file = "/hps/nobackup/hipsci/scratch/genotypes/imputed/2017-03-27/Full_Filtered_SNPs_Plink/hipsci.wec.gtarray.HumanCoreExome.imputed_phased.20170327.genotypes.norm.renamed.bed"
G = read_plink1_bin(plink_file)

Mapping files: 100%|██████████| 3/3 [06:03<00:00, 121.18s/it]


In [132]:
## read in GRM kinship matrix
kinship_file = "/hps/nobackup/hipsci/scratch/genotypes/imputed/2017-03-27/Full_Filtered_SNPs_Plink-F/hipsci.wec.gtarray.HumanCoreExome.imputed_phased.20170327.genotypes.norm.renamed.kinship"
K = pd.read_csv(kinship_file, index_col=0, sep="\t")
# rows and columns must carry the same sample labels, in the same order
assert (K.columns == K.index).all()
K = xr.DataArray(
    K.values,
    dims=["sample_0", "sample_1"],
    coords={"sample_0": K.columns, "sample_1": K.index},
)
K = K.sortby("sample_0").sortby("sample_1")
# keep only donors that are present in both the mapping and the kinship matrix
donors = sorted(set(K.sample_0.values).intersection(donors))
print(f"Number of donors after kinship intersection: {len(donors)}")

Number of donors after kinship intersection: 125


In [133]:
## subset to relevant donors
# Select rows and columns in the order given by `donors`; the asserts confirm
# that both axes follow that order.
K = K.sel(sample_0=donors, sample_1=donors)
assert all(K.sample_0 == donors)
assert all(K.sample_1 == donors)

## and decompose such as K = L @ L.T
L_kinship = cholesky(K.values)
# Re-attach donor labels to the rows of the factor; the "col" dimension is left unlabelled.
L_kinship = xr.DataArray(L_kinship, dims=["sample", "col"], coords={"sample": K.sample_0.values})
assert all(L_kinship.sample.values == K.sample_0.values)

In [134]:
del K
print(f"Sample mapping number of rows BEFORE intersection: {len(sample_mapping)}")
# drop cells whose donor is not in the retained donor list
sample_mapping = sample_mapping.loc[sample_mapping["genotype_individual_id"].isin(donors)]
print(f"Sample mapping number of rows AFTER intersection: {len(sample_mapping)}")

# expand from donors to cells: one row of the kinship factor per cell, in sample_mapping order
L_expanded = L_kinship.sel(sample=sample_mapping["genotype_individual_id"].to_numpy())
assert (L_expanded.sample.values == sample_mapping["genotype_individual_id"].to_numpy()).all()

Sample mapping number of rows BEFORE intersection: 34256
Sample mapping number of rows AFTER intersection: 33964


In [135]:
# environments
# cells by MOFA factors (20)
E_file = "/hps/nobackup/stegle/users/acuomo/all_scripts/struct_LMM2/sc_endodiff/debug_May2021/mofa_logcounts_model_factors.csv"
E = pd.read_csv(E_file, index_col=0)
E = xr.DataArray(
    E.to_numpy(),
    dims=["cell", "pc"],
    coords={"cell": E.index.values, "pc": E.columns.values},
)
# reorder / subset cells so that they line up with sample_mapping
E = E.sel(cell=sample_mapping["phenotype_sample_id"].to_numpy())
assert (E.cell.values == sample_mapping["phenotype_sample_id"].to_numpy()).all()

In [136]:
# Phenotype
phenotype_file = f"{input_files_dir}phenotype.csv.pkl"
# NOTE(review): unpickling can execute arbitrary code — only load this file from a trusted location.
phenotype = pd.read_pickle(phenotype_file)
print(f"Phenotype shape BEFORE selection: {phenotype.shape}")
phenotype = xr.DataArray(
    phenotype.values,
    dims=["trait", "cell"],
    coords={"trait": phenotype.index.values, "cell": phenotype.columns.values},
)
# keep the cells listed in sample_mapping, in that order
phenotype = phenotype.sel(cell=sample_mapping["phenotype_sample_id"].values)
print(f"Phenotype shape AFTER selection: {phenotype.shape}")
assert all(phenotype.cell.values == sample_mapping["phenotype_sample_id"].values)

Phenotype shape BEFORE selection: (11231, 34256)
Phenotype shape AFTER selection: (11231, 33964)


In [137]:
# Display the trait labels (taken from the row index of the phenotype table).
phenotype["trait"].values

array(['ENSG00000000003_TSPAN6', 'ENSG00000000419_DPM1',
       'ENSG00000000457_SCYL3', ..., 'ENSG00000272047_GTF2H5',
       'ENSG00000272325_NUDT3', 'ENSG00000272398_CD24'], dtype=object)

In [138]:
# Tab-separated gene annotation; the first column becomes the index. The
# displayed head shows a `feature_id` index with chromosome / start / end columns.
annotation_file = "/hps/nobackup/hipsci/scratch/processed_data/rna_seq/annotationFiles/Ensembl_75_Limix_Annotation_FC_Gene.txt"
anno_df = pd.read_csv(annotation_file, sep="\t", index_col=0)
anno_df.head()

Unnamed: 0_level_0,chromosome,start,end
feature_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ENSG00000271782,1,50902700,50902978
ENSG00000232753,1,103817769,103828355
ENSG00000225767,1,50927141,50936822
ENSG00000202140,1,50965430,50965529
ENSG00000207194,1,51048076,51048183


In [139]:
# All trait labels, then pick a single gene (second entry) to work with.
genes = phenotype["trait"].values
gene = genes[1]
gene

'ENSG00000000419_DPM1'

In [140]:
import re
# Drop everything from the first underscore onwards, e.g.
# 'ENSG00000000419_DPM1' -> 'ENSG00000000419', to match the annotation's feature_id.
feature = re.sub("_.*", "", gene)
feature

'ENSG00000000419'

In [141]:
def cis_snp_selection(feature_id, annotation_df, G, window_size):
    """Select the variants of ``G`` that fall in the cis window of one feature.

    Parameters
    ----------
    feature_id : str
        Identifier looked up in ``annotation_df`` with ``feature_id == "<id>"``.
    annotation_df : pandas.DataFrame
        Annotation table that can be queried on ``feature_id`` and provides
        ``chromosome``, ``start`` and ``end`` columns.
    G : object exposing ``chrom``, ``pos`` and ``where(cond, drop=True)``
        Genotypes (an xarray.DataArray in this notebook).
    window_size : int
        Distance added on each side of the feature; both bounds are exclusive.

    Returns
    -------
    The result of ``G.where(mask, drop=True)`` for variants on the feature's
    chromosome with ``lowest - window_size < pos < highest + window_size``.

    Raises
    ------
    ValueError
        If ``feature_id`` matches no row, or more than one row, of
        ``annotation_df``.
    """
    matches = annotation_df.query("feature_id==\"{}\"".format(feature_id))
    if len(matches) != 1:
        # Without this check the lookup fails later with an opaque pandas
        # "truth value of a Series is ambiguous" error.
        raise ValueError(
            "Expected exactly one annotation row for feature '{}', found {}".format(
                feature_id, len(matches)
            )
        )
    feature = matches.iloc[0]

    # G.chrom is compared against a string, so normalise the annotation value
    chrom = str(feature["chromosome"])
    # make robust to features specified back-to-front
    lowest = min(feature["start"], feature["end"])
    highest = max(feature["start"], feature["end"])

    in_cis_window = (
        (G.chrom == chrom)
        & (G.pos > (lowest - window_size))
        & (G.pos < (highest + window_size))
    )
    return G.where(in_cis_window, drop=True)

In [142]:
# # debug cis_snp_selection function

# feature_id = feature
# annotation_df = anno_df
# G = G
# window_size = 100000

# feature = annotation_df.query("feature_id==\"{}\"".format(feature_id)).squeeze()
# feature

# chrom = str(feature['chromosome'])
# chrom

# start = feature['start']
# start

# end = feature['end']
# end

# lowest = min([start,end])
# lowest

# highest = max([start,end])
# highest

In [143]:
# lowest-window_size

# highest+window_size

# chrom
# G.where((G.chrom == chrom),drop=True)

# G_sel = G.where((G.chrom == chrom) & (G.pos > (lowest-window_size)) & (G.pos < (highest+window_size)), drop=True)
# G_sel

In [144]:
# G_sel = G.where(G.chrom == str(chrom), drop=True)
# Variants around `feature`, with window_size=100000
# (presumably base pairs on each side of the gene — TODO confirm units).
G_sel = cis_snp_selection(feature, anno_df, G, 100000)

In [145]:
# Display the selected genotype subset (recorded output reports shape (1610, 574)).
G_sel

Unnamed: 0,Array,Chunk
Bytes,3.70 MB,1.43 MB
Shape,"(1610, 574)","(1024, 348)"
Count,102213 Tasks,4 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 3.70 MB 1.43 MB Shape (1610, 574) (1024, 348) Count 102213 Tasks 4 Chunks Type float32 numpy.ndarray",574  1610,

Unnamed: 0,Array,Chunk
Bytes,3.70 MB,1.43 MB
Shape,"(1610, 574)","(1024, 348)"
Count,102213 Tasks,4 Chunks
Type,float32,numpy.ndarray


In [146]:
# Expand genotypes from donors to cells: one genotype row per row of
# sample_mapping, selected by the cell's donor id.
G_exp = G_sel.sel(sample=sample_mapping["genotype_individual_id"].values)
# Rows must line up with the expanded kinship factor.
assert all(L_expanded.sample.values == G_exp.sample.values)

  return self.array[key]


In [147]:
# Decompose the environment matrix; the third returned value is unused and dropped.
# NOTE(review): this uses E as loaded, i.e. before the quantile_gaussianize applied in a later cell.
[U, S, _] = economic_svd(E)
del _
# presumably scales each column of U by its singular value — TODO confirm shapes of U and S
us = U * S
# One matrix per column of `us`, each combining that column with L_expanded via ddot
# (presumably diag(us[:, i]) @ L_expanded — confirm against numpy_sugar).
Ls = [ddot(us[:,i], L_expanded) for i in range(us.shape[1])]
del us

In [148]:
# Number of cells (second dimension of the trait-by-cell phenotype array).
n_samples = phenotype.shape[1]
# Single all-ones column, later passed to CellRegMap as W.
M = ones((n_samples, 1))
# Overwrites E with its quantile-gaussianized version; Ls above was built from E before this step.
E = quantile_gaussianize(E)

In [149]:
# Test the gene whose cis window was used to build G_sel / G_exp.
# This cell previously used genes[0], while the SNPs were selected around
# `gene` (genes[1]); phenotype, genotypes and the output file name then
# referred to different genes.
trait_name = gene
y = phenotype.sel(trait=trait_name)
# # alternative: restrict to lead SNPs for a given gene
# leads = endo_eqtl[endo_eqtl['feature']==trait_name]['snp_id'].unique()
# G_tmp = G_exp[:,G_exp['snp'].isin(leads)]
G_tmp = G_exp
y = quantile_gaussianize(y)

In [116]:
# ?CellRegMap.scan_assoc0

In [117]:
# crm = CellRegMap(y=y.values, W=M, E=E.values[:,0:10], G=G_tmp)

In [120]:
# pvals = crm.scan_assoc0(G_tmp)

In [None]:
# pv = pd.DataFrame({"chrom":G_tmp.chrom.values,
#                "pv":pvals,
#                "variant":G_tmp.snp.values})
# pv.head()

In [150]:
# Build the model with the phenotype, the all-ones covariate column and the
# first 10 columns of E.
# NOTE(review): Ls was built from every column of the SVD of E, not only the first 10 — confirm this is intended.
crm0 = CellRegMap(y=y.values, W=M, E=E.values[:,0:10], Ls=Ls)

In [None]:
# Run the association scan over the variants in G_tmp.
# NOTE(review): the progress output recorded for this cell shows ~267 s per
# variant (12/574 after ~53 min), so the run had not finished; consider
# trying a small subset of variants first.
pvals0 = crm0.scan_assoc0(G_tmp)
pvals0

  2%|▏         | 12/574 [52:43<41:38:34, 266.75s/it]

In [None]:
# Collect results per variant.
# assumes pvals0 holds one value per variant of G_tmp, in the same order — TODO confirm
pv = pd.DataFrame({"chrom":G_tmp.chrom.values,
               "pv":pvals0,
               "variant":G_tmp.snp.values})
pv.head()

In [None]:
# Output location: one TSV per trait, named after trait_name.
folder = "/hps/nobackup/stegle/users/acuomo/all_scripts/struct_LMM2/sc_endodiff/debug_May2021/REVISION/CellRegMap_Gtest/MOFA/"
outfilename = f"{folder}{trait_name}_all.tsv"
outfilename

In [None]:
# Write the results as tab-separated text (the target folder presumably has to exist already).
pv.to_csv(outfilename, sep='\t')