In [1]:
import re
import time
import pandas as pd
import xarray as xr
from numpy import ones
from numpy.linalg import cholesky
from pandas_plink import read_plink1_bin
from limix.qc import quantile_gaussianize

In [2]:
import cellregmap 
cellregmap 

<module 'cellregmap' from '/hps/nobackup2/stegle/users/acuomo/git_repos/CellRegMap/cellregmap/__init__.py'>

In [3]:
from cellregmap import run_interaction

In [33]:
# Root folder for this revision analysis.
# NOTE: other cells build paths by plain string concatenation with this value,
# so the trailing "/" is significant.
revision_folder = "/hps/nobackup2/stegle/users/acuomo/all_scripts/struct_LMM2/sc_endodiff/debug_May2021/REVISION/"

In [5]:
# run 10 tests per job

In [8]:
# Folder holding the input files read below (sample mapping, phenotype pickle).
# File names are appended by string concatenation, so keep the trailing "/".
input_files_dir = "/hps/nobackup2/stegle/users/acuomo/all_scripts/struct_LMM2/sc_endodiff/new/input_files/"

In [9]:
## Sample mapping file: links each cell to the donor it was derived from.
## Only donors with single-cell data are listed (a subset of all HipSci donors).
sample_mapping_file = f"{input_files_dir}sample_mapping_file.csv"
## read both ID columns as strings so they can be used directly as labels later on
sample_mapping = pd.read_csv(
    sample_mapping_file,
    dtype={"genotype_individual_id": str, "phenotype_sample_id": str},
)

In [10]:
## Column guide:
##   genotype_individual_id -> donor IDs, as found in the genotype matrix (G) and GRM covariance (K)
##   phenotype_sample_id    -> cell IDs, as found in the scRNA-seq phenotype vector (y)
##                             and cell context covariance (C)
sample_mapping.head(n=5)

Unnamed: 0,genotype_individual_id,phenotype_sample_id
0,HPSI0114i-joxm_1,21843_1#10
1,HPSI0314i-fafq_1,21843_1#100
2,HPSI0314i-fafq_1,21843_1#101
3,HPSI1013i-wuye_2,21843_1#102
4,HPSI0114i-joxm_1,21843_1#103


In [11]:
## Unique donor IDs present in the sample mapping, in sorted order
donor_column = sample_mapping["genotype_individual_id"]
donors = donor_column.unique()
donors.sort()  # in-place sort of the array returned by unique()
print(f"Number of unique donors: {len(donors)}")

Number of unique donors: 126


In [12]:
############################################
################ Kinship matrix ############
############################################

In [23]:
## Load the GRM (genotype relationship matrix, a.k.a. kinship matrix) from a tab-separated file
kinship_folder = "/hps/nobackup2/stegle/users/acuomo/hipsci_genotype_files/"
kinship_file = f"{kinship_folder}hipsci.wec.gtarray.HumanCoreExome.imputed_phased.20170327.genotypes.norm.renamed.kinship"
K = pd.read_csv(kinship_file, index_col=0, sep="\t")
## row and column labels must line up (donors x donors); this checks labels, not values
assert (K.columns == K.index).all()

In [24]:
## Wrap the kinship DataFrame in a labelled DataArray and order both axes by donor ID.
## NOTE: K is rebound from a DataFrame to a DataArray here, so this cell cannot be
## re-run on its own without re-reading the kinship file first.
K = xr.DataArray(
    K.values,
    dims=["sample_0", "sample_1"],
    coords={"sample_0": K.columns, "sample_1": K.index},
)
K = K.sortby("sample_0").sortby("sample_1")
## keep only donors present in both the kinship matrix and the sample mapping
kinship_donors = set(K.sample_0.values)
donors = sorted(kinship_donors.intersection(donors))
print(f"Number of donors after kinship intersection: {len(donors)}")

Number of donors after kinship intersection: 125


In [25]:
## Restrict both axes of K to the retained donors (in that order) and verify the labels
K = K.sel({"sample_0": donors, "sample_1": donors})
for axis_name in ("sample_0", "sample_1"):
    assert all(K[axis_name] == donors)

In [26]:
## Cholesky factor of K, so that K = hK @ hK.T; rows stay labelled by donor ID
donor_labels = K.sample_0.values
hK = xr.DataArray(
    cholesky(K.values),
    dims=["sample", "col"],
    coords={"sample": donor_labels},
)
assert (hK.sample.values == donor_labels).all()

In [27]:
## K is no longer needed once its factor hK exists
del K
print(f"Sample mapping number of rows BEFORE intersection: {len(sample_mapping)}")
## keep only the cells whose donor is among the retained (kinship-matched) donors
in_kinship = sample_mapping["genotype_individual_id"].isin(donors)
sample_mapping = sample_mapping.loc[in_kinship]
print(f"Sample mapping number of rows AFTER intersection: {len(sample_mapping)}")

Sample mapping number of rows BEFORE intersection: 33964
Sample mapping number of rows AFTER intersection: 33964


In [28]:
############################################
##### expand from donors to cells ##########

In [29]:
## Expand hK from one row per donor to one row per cell:
## selecting with the per-cell donor IDs repeats each donor's row once for each of its cells
cell_donor_ids = sample_mapping["genotype_individual_id"].values
hK_expanded = hK.sel(sample=cell_donor_ids)
assert (hK_expanded.sample.values == cell_donor_ids).all()

In [30]:
######################################
############### Genotypes ############
######################################

In [31]:
## Load the genotypes from the plink .bed file
plink_folder = "/hps/nobackup2/stegle/users/acuomo/hipsci_genotype_files/"
plink_file = f"{plink_folder}hipsci.wec.gtarray.HumanCoreExome.imputed_phased.20170327.genotypes.norm.renamed.bed"
G = read_plink1_bin(plink_file)

Mapping files: 100%|██████████| 3/3 [05:54<00:00, 118.29s/it]


In [20]:
#############################
###### SNP selection

In [34]:
# Feature-variant filter file: one row per (feature, snpID) pair to test, plus its chromosome
# (columns as displayed below: feature, snpID, chrom)
fvf_filename = f"{revision_folder}/CRM_interaction_chr21/fvf.csv"
fvf = pd.read_csv(fvf_filename, index_col=0)
fvf.head(n=20)

Unnamed: 0,feature,snpID,chrom
1,ENSG00000256073_C21orf119,21_33666631_T_C,21
2,ENSG00000256073_C21orf119,21_33667566_G_A,21
3,ENSG00000256073_C21orf119,21_33667866_C_T,21
4,ENSG00000256073_C21orf119,21_33668024_T_C,21
5,ENSG00000256073_C21orf119,21_33668131_A_G,21
6,ENSG00000256073_C21orf119,21_33668493_A_C,21
7,ENSG00000256073_C21orf119,21_33668891_A_G,21
8,ENSG00000256073_C21orf119,21_33669542_A_G,21
9,ENSG00000256073_C21orf119,21_33670219_C_T,21
10,ENSG00000256073_C21orf119,21_33670344_C_T,21


In [578]:
# position, within the unique feature list, of the gene handled in this run
i = 61
feature_column = fvf["feature"]
genes = feature_column.unique()

In [579]:
# gene analysed in this run: the i-th entry of the unique feature list
gene = genes[i]

In [580]:
# all filter-file rows (candidate SNPs) belonging to the chosen gene
is_selected_gene = fvf["feature"] == gene
fvf_gene = fvf.loc[is_selected_gene]

In [581]:
# number of candidate SNP rows for this gene
n = len(fvf_gene)
n

646

In [582]:
# for i in range(0,n,10):
#     print(i)

In [614]:
# start row (within fvf_gene) of the SNP batch tested in this run;
# j is also embedded in the output file name
j = 30

In [615]:
# batch of (up to) 10 consecutive SNP rows starting at row j
batch_end = j + 10
fvf_sel = fvf_gene.iloc[j:batch_end]
fvf_sel

Unnamed: 0,feature,snpID,chrom
39346,ENSG00000159256_MORC3,21_37609571_T_C,21
39347,ENSG00000159256_MORC3,21_37609954_G_A,21
39348,ENSG00000159256_MORC3,21_37610551_C_G,21
39349,ENSG00000159256_MORC3,21_37611563_G_C,21
39350,ENSG00000159256_MORC3,21_37611577_G_T,21
39351,ENSG00000159256_MORC3,21_37611892_G_A,21
39352,ENSG00000159256_MORC3,21_37612407_C_G,21
39353,ENSG00000159256_MORC3,21_37612564_G_A,21
39354,ENSG00000159256_MORC3,21_37612673_G_A,21
39355,ENSG00000159256_MORC3,21_37612696_T_C,21


In [616]:
# features present in the selected SNP batch
# NOTE: this rebinds `genes`, which an earlier cell filled with every unique feature in
# fvf and indexed with `i`; re-running that earlier `genes[i]` cell after this one
# would index this batch-level array instead.
genes = fvf_sel['feature'].unique()

In [617]:
genes

array(['ENSG00000159256_MORC3'], dtype=object)

In [618]:
# (1) gene name (feature_id): first feature in the batch
gene_name = genes[0]
# strip everything from the first underscore onwards to get the short trait name
# used in the output file name
trait_name = re.sub(pattern="_.*", repl="", string=gene_name)
trait_name

'ENSG00000159256'

In [619]:
# unique SNP IDs to test for this gene within the batch
rows_for_gene = fvf_sel["feature"] == gene_name
leads = fvf_sel.loc[rows_for_gene, "snpID"].unique()

In [620]:
# keep only the genotype columns whose SNP ID is in the batch
snp_in_batch = G["snp"].isin(leads)
G_sel = G[:, snp_in_batch]

In [621]:
G_sel.shape

(1610, 10)

In [622]:
############################################
##### expand from donors to cells ##########

In [623]:
# expand genotypes from donors to cells (selecting the relevant donors in the same step):
# indexing by the per-cell donor IDs yields one genotype row per cell
cell_donor_ids = sample_mapping["genotype_individual_id"].values
G_expanded = G_sel.sel(sample=cell_donor_ids)
# rows must be in the same per-cell order as the expanded kinship factor
assert (hK_expanded.sample.values == G_expanded.sample.values).all()

  return self.array[key]


In [624]:
G_expanded.shape

(33964, 10)

In [625]:
######################################
############## Phenotypes ############
######################################

In [626]:
# Phenotype (single-cell expression), stored as a pickled DataFrame (traits x cells)
phenotype_file = f"{input_files_dir}phenotype.csv.pkl"
phenotype = pd.read_pickle(phenotype_file)
print(f"Phenotype shape BEFORE selection: {phenotype.shape}")
# wrap as a labelled array so cells can be selected, and ordered, by ID
phenotype = xr.DataArray(
    phenotype.values,
    dims=["trait", "cell"],
    coords={"trait": phenotype.index.values, "cell": phenotype.columns.values},
)
# keep the cells listed in the sample mapping, in sample-mapping order
cell_ids = sample_mapping["phenotype_sample_id"].values
phenotype = phenotype.sel(cell=cell_ids)
print(f"Phenotype shape AFTER selection: {phenotype.shape}")
assert (phenotype.cell.values == cell_ids).all()

Phenotype shape BEFORE selection: (11231, 34256)
Phenotype shape AFTER selection: (11231, 33964)


In [627]:
# expression of the selected gene across cells, quantile-normalised
y = quantile_gaussianize(phenotype.sel(trait=gene_name))

In [628]:
######################################
########## Cell contexts #############
######################################

In [629]:
# Cell contexts: cells by MOFA factors (20)
C_file = "/hps/nobackup2/stegle/users/acuomo/all_scripts/struct_LMM2/sc_endodiff/debug_May2021/mofa_logcounts_model_factors.csv"
C = pd.read_csv(C_file, index_col=0)
C = xr.DataArray(
    C.values,
    dims=["cell", "pc"],
    coords={"cell": C.index.values, "pc": C.columns.values},
)
# align rows with the sample mapping (same cells, same order)
cell_ids = sample_mapping["phenotype_sample_id"].values
C = C.sel(cell=cell_ids)
assert (C.cell.values == cell_ids).all()

In [630]:
# quantile normalise cell contexts
# The result is stored under a new name (C_gauss); C is still referenced by later cells.
C_gauss = quantile_gaussianize(C)

In [631]:
######################################
############ Covariates ##############
######################################

In [632]:
# Covariates: a single all-ones column (one row per cell)
n_cells = phenotype.sizes["cell"]
W = ones(shape=(n_cells, 1))

In [633]:
# turn y into a plain (n_cells, 1) column array
# NOTE: y is rebound from a labelled array to a numpy array here, so this cell
# cannot be re-run without re-running the cell that creates y first.
n_obs = y.shape[0]
y = y.values.reshape((n_obs, 1))
y.shape

(33964, 1)

In [634]:
W

array([[1.],
       [1.],
       [1.],
       ...,
       [1.],
       [1.],
       [1.]])

In [635]:
# quick look at the first 10 context columns of C (the non-gaussianized matrix).
# Only the last expression of a cell is displayed, so the .shape line produces no output.
C.values[:,0:10].shape
C.values[:,0:10]

array([[-0.59770402,  1.0674358 ,  0.0564961 , ...,  0.01520844,
         0.29224277, -0.63273743],
       [ 0.23328472,  2.13788372,  0.64681148, ..., -0.37349156,
         1.78959325, -0.65923266],
       [-0.05471216,  1.78603081, -0.09962504, ...,  0.07594282,
         0.87996128,  0.6806354 ],
       ...,
       [-0.80078862,  0.93532776, -1.06457988, ..., -0.26132522,
         0.29988706, -0.34527378],
       [-0.65763778,  1.30469584, -0.52890777, ...,  0.07814063,
         0.53126179, -0.13242109],
       [-0.78633873,  1.4659787 , -0.61704333, ...,  0.28598122,
         0.43912266,  0.24403696]])

In [636]:
# Pull the expanded genotypes into a plain numpy array, timing the step
# (this took several minutes in the recorded run).
start_time = time.time()
GG = G_expanded.values
elapsed = time.time() - start_time
print("--- %s seconds ---" % elapsed)

--- 236.69298768043518 seconds ---


In [637]:
GG.shape

(33964, 10)

In [638]:
GG

array([[2., 0., 2., ..., 0., 0., 2.],
       [1., 0., 1., ..., 0., 0., 1.],
       [1., 0., 1., ..., 0., 0., 1.],
       ...,
       [2., 0., 2., ..., 0., 0., 2.],
       [0., 1., 2., ..., 1., 1., 1.],
       [0., 1., 2., ..., 1., 1., 1.]], dtype=float32)

In [None]:
# Run the CellRegMap interaction test for the SNPs in this batch and time it.
# Context inputs are column slices of the MOFA factor matrices:
#   E, E1 -> first 10 columns of the gaussianized contexts (C_gauss)
#   E2    -> first 20 columns of C
# NOTE(review): E2 is sliced from C, not C_gauss, so unlike E/E1 it is not
# quantile-normalised — confirm this is intentional.
start_time = time.time()
top10_gauss = C_gauss.values[:, 0:10]
top20 = C.values[:, 0:20]
crm_output = run_interaction(
    y,
    W=W,
    E=top10_gauss,
    E1=top10_gauss,
    E2=top20,
    G=GG,
    hK=hK_expanded,
)
# only the first element of the returned value is kept; it is stored as "pv" downstream
pvals = crm_output[0]
elapsed = time.time() - start_time
print("--- %s seconds ---" % elapsed)

 70%|███████   | 7/10 [36:01<15:26, 308.68s/it]

In [None]:
# Results table: one row per tested variant, with its chromosome and p-value
result_columns = {
    "chrom": G_expanded.chrom.values,
    "pv": pvals,
    "variant": G_expanded.snp.values,
}
pv = pd.DataFrame(result_columns)
pv.head()

In [None]:
# Results folder and per-batch output file (one file per trait and SNP-batch offset j).
# The SNP list for this run is read from the CRM_interaction_chr21 folder and every
# tested variant is on chromosome 21, so results must go under the matching chr21
# folder; writing them to CRM_interaction_chr22 would mix chromosomes.
folder = revision_folder+"CRM_interaction_chr21/results/"
outfilename = f"{folder}{trait_name}_{j}.tsv"

In [None]:
outfilename

In [None]:
import os
# Report whether a results file for this trait/batch is already present.
# NOTE: despite the message, nothing here stops execution — this cell only prints.
if os.path.exists(outfilename):
    print("File already exists, exiting")

In [None]:
# Write the per-variant p-values for this batch.
# mode="x" (exclusive creation) refuses to overwrite an existing results file, so a
# batch that already has output is skipped rather than silently clobbered.
# NOTE(review): the file name ends in .tsv but to_csv's default separator is a comma;
# left unchanged because readers of these files may rely on it — confirm.
try:
    pv.to_csv(outfilename, mode="x")
except FileExistsError:
    print("File already exists, not overwriting: {}".format(outfilename))