In [1]:
# import re
# import os
# import sys
# import time
import pandas as pd
import xarray as xr
from numpy import ones
from numpy.linalg import cholesky
from pandas_plink import read_plink1_bin
from limix.qc import quantile_gaussianize

In [2]:
from cellregmap import run_interaction

In [3]:
chrom = 22

In [4]:
## input files folder
input_files_dir = "/hps/nobackup/stegle/users/acuomo/all_scripts/struct_LMM2/sc_neuroseq/May2021/input_files/"

In [5]:
############################################
########## Sample mapping file #############
############################################

In [6]:
## this file will map pseudocells to donors, it will also only including donors we have single cell data (a subset of all of HipSci donors)
sample_mapping_file = input_files_dir+"sample_mapping_file.csv"
sample_mapping = pd.read_csv(sample_mapping_file, dtype={"genotype_individual_id": str, "phenotype_sample_id": str})
sample_mapping.head()

Unnamed: 0,genotype_individual_id,phenotype_sample_id
0,HPSI0714i-iudw_1,HPSI0714i-iudw_1--DA--d30--0
1,HPSI0714i-iudw_1,HPSI0714i-iudw_1--DA--d30--1
2,HPSI0714i-iudw_1,HPSI0714i-iudw_1--DA--d30--2
3,HPSI0714i-iudw_1,HPSI0714i-iudw_1--DA--d30--3
4,HPSI0714i-iudw_1,HPSI0714i-iudw_1--DA--d30--4


In [7]:
## extract unique individuals
donors = sample_mapping["genotype_individual_id"].unique()
donors.sort()
print("Number of unique donors: {}".format(len(donors)))

Number of unique donors: 191


In [8]:
############################################
############# Kinship matrix ###############
############################################

In [9]:
## read in GRM (genotype relationship matrix; kinship matrix)
kinship_folder="/hps/nobackup/hipsci/scratch/genotypes/imputed/2017-03-27/Full_Filtered_SNPs_Plink-F/"
kinship_file=kinship_folder+"hipsci.wec.gtarray.HumanCoreExome.imputed_phased.20170327.genotypes.norm.renamed.kinship"
K = pd.read_csv(kinship_file, sep="\t", index_col=0)
assert all(K.columns == K.index) #symmetric matrix, donors x donors

In [10]:
K = xr.DataArray(K.values, dims=["sample_0", "sample_1"], coords={"sample_0": K.columns, "sample_1": K.index})
K = K.sortby("sample_0").sortby("sample_1")
donors = sorted(set(list(K.sample_0.values)).intersection(donors))
print("Number of donors after kinship intersection: {}".format(len(donors)))

Number of donors after kinship intersection: 173


In [11]:
## subset to relevant donors
K = K.sel(sample_0=donors, sample_1=donors)
assert all(K.sample_0 == donors)
assert all(K.sample_1 == donors)

In [12]:
## and decompose such as K = hK @ hK.T (using Cholesky decomposition)
hK = cholesky(K.values)
hK = xr.DataArray(hK, dims=["sample", "col"], coords={"sample": K.sample_0.values})
assert all(hK.sample.values == K.sample_0.values)
del K

In [13]:
print("Sample mapping number of rows BEFORE intersection: {}".format(sample_mapping.shape[0]))
## subsample sample mapping file to donors in the kinship matrix
sample_mapping = sample_mapping[sample_mapping["genotype_individual_id"].isin(donors)]
print("Sample mapping number of rows AFTER intersection: {}".format(sample_mapping.shape[0]))

Sample mapping number of rows BEFORE intersection: 9219
Sample mapping number of rows AFTER intersection: 8352


In [14]:
############################################
##### expand from donors to cells ##########

## use sel from xarray to expand hK (using the sample mapping file)
hK_expanded = hK.sel(sample=sample_mapping["genotype_individual_id"].values)
assert all(hK_expanded.sample.values == sample_mapping["genotype_individual_id"].values)

In [15]:
hK_expanded.shape

(8352, 173)

In [16]:
#####################################
############ Genotypes ##############
#####################################

In [17]:
## read in genotype file (plink format)
plink_folder = "/hps/nobackup/hipsci/scratch/genotypes/imputed/2017-03-27/Full_Filtered_SNPs_Plink/"
plink_file = plink_folder+"hipsci.wec.gtarray.HumanCoreExome.imputed_phased.20170327.genotypes.norm.renamed.bed"
G = read_plink1_bin(plink_file)

Mapping files: 100%|██████████| 3/3 [05:49<00:00, 116.48s/it]


In [18]:
G.shape

(1610, 10464962)

In [19]:
######################################
########## Cell contexts #############
######################################

In [20]:
# cells by MOFA factors (20)
C_file = input_files_dir+"MOFA_20.csv"
C = pd.read_csv(C_file, index_col = 0)
C.head()

Unnamed: 0,MOFA1,MOFA2,MOFA3,MOFA4,MOFA5,MOFA6,MOFA7,MOFA8,MOFA9,MOFA10,MOFA11,MOFA12,MOFA13,MOFA14,MOFA15,MOFA16,MOFA17,MOFA18,MOFA19,MOFA20
HPSI0714i-iudw_1--DA--d30--0,1.666456,0.589292,-1.218032,0.533271,-0.114718,-0.70045,-0.872012,-0.722233,0.145421,0.255173,-0.153703,0.046562,0.933561,-0.303333,-1.420607,0.376046,-1.375035,-1.321022,0.036285,-0.523784
HPSI0714i-iudw_1--DA--d30--1,1.793275,-1.089708,0.516194,0.303165,-0.370706,-0.636298,1.13593,0.246662,-0.323239,-0.32895,0.297498,0.106488,1.434802,0.173501,1.149061,-0.520607,0.887738,-0.130147,0.941842,0.661152
HPSI0714i-iudw_1--DA--d30--2,1.460173,-0.920466,0.433373,0.838284,0.041792,-0.349312,-0.56651,0.033184,-0.294749,1.647099,-0.244315,1.193586,0.23443,1.36761,-1.354624,-0.435662,-0.001934,0.004343,-0.109958,-0.029995
HPSI0714i-iudw_1--DA--d30--3,2.135217,0.007668,-0.340678,0.169422,-0.487267,-0.947462,0.234383,-0.442354,0.556301,0.434645,0.031946,0.581785,0.673075,0.188795,-1.24652,1.037264,-0.689319,-0.84401,1.148122,0.371534
HPSI0714i-iudw_1--DA--d30--4,-0.299618,-1.471801,-0.559841,-0.287968,-0.071475,-0.094412,-0.972754,0.581462,-1.232076,-0.334227,-0.414754,1.178597,0.021825,-0.07058,-0.086396,0.656685,0.00725,-0.023221,-1.766547,-1.082093


In [21]:
C = xr.DataArray(C.values, dims=["cell", "pc"], coords={"cell": C.index.values, "pc": C.columns.values})
C = C.sel(cell=sample_mapping["phenotype_sample_id"].values)
assert all(C.cell.values == sample_mapping["phenotype_sample_id"].values)

In [22]:
# quantile normalise cell contexts
C_gauss = quantile_gaussianize(C)

In [23]:
#####################################
############ Phenotypes #############
#####################################

In [24]:
# Phenotype (meta-cell gene expression)
phenotype_file = "/hps/nobackup/stegle/users/acuomo/all_scripts/struct_LMM2/sc_neuroseq/input_files/DA_all_conditions/phenotype.csv.pkl"
phenotype = pd.read_pickle(phenotype_file)

In [25]:
print("Phenotype shape BEFORE selection: {}".format(phenotype.shape))
phenotype = xr.DataArray(phenotype.values, dims=["trait", "cell"], coords={"trait": phenotype.index.values, "cell": phenotype.columns.values})
phenotype = phenotype.sel(cell=sample_mapping["phenotype_sample_id"].values)
print("Phenotype shape AFTER selection: {}".format(phenotype.shape))
assert all(phenotype.cell.values == sample_mapping["phenotype_sample_id"].values)

Phenotype shape BEFORE selection: (32738, 9982)
Phenotype shape AFTER selection: (32738, 8352)


In [26]:
#####################################
############ Filter file ############
#####################################

In [27]:
# Filter on specific gene-SNP pairs
# eQTL from neuroseq DA (day30 + day52 + day52 ROT treated)
neuro_eqtl_file = "/hps/nobackup/stegle/users/acuomo/all_scripts/struct_LMM2/sc_neuroseq/input_files/DA_all_conditions/DA_eqtl_allconditions_FDR5pct.csv" # consider filter further (significant only)
neuro_eqtl = pd.read_csv(neuro_eqtl_file)
neuro_eqtl["chrom"] = [int(i[:i.find("_")]) for i in neuro_eqtl["snp_id"]]
genes = neuro_eqtl[neuro_eqtl['chrom']==int(chrom)]['feature'].unique()

In [28]:
len(genes)

48

In [29]:
i=0
trait_name = genes[i]
trait_name

'MMP11'

In [30]:
leads = neuro_eqtl[neuro_eqtl['feature']==trait_name]['snp_id'].unique()
G_sel = G[:,G['snp'].isin(leads)]

In [31]:
############################################
##### expand from donors to cells ##########

# expand out genotypes from cells to donors (and select relevant donors in the same step)
G_expanded = G_sel.sel(sample=sample_mapping["genotype_individual_id"].values)
assert all(hK_expanded.sample.values == G_expanded.sample.values)

print(G_expanded.shape)

(8352, 1)


  return self.array[key]


In [32]:
# select gene
y = phenotype.sel(trait=trait_name)
# quantile normalise
y = quantile_gaussianize(y)
# reshape
y = y.values.reshape(y.shape[0],1)
print(y.shape)

(8352, 1)


In [33]:
######################################
############ Covariates ##############
######################################

# just an intercept in this case
n_cells = phenotype.shape[1]
W = ones((n_cells, 1))

In [34]:
# unpack G
GG = G_expanded.values
print("Running for gene {}".format(trait_name))

Running for gene MMP11


In [35]:
# run association test using CellRegMap
pvals = run_interaction(y=y, W=W, E=C_gauss.values[:,0:10], E1=C_gauss.values[:,0:10], E2=C.values[:,0:20], G=GG, hK=hK_expanded)[0]

100%|██████████| 1/1 [01:35<00:00, 95.95s/it]


In [36]:
pvals

array([0.55012854])

In [37]:
pv = pd.DataFrame({"chrom":G_expanded.chrom.values,
               "pv":pvals,
               "variant":G_expanded.snp.values})
pv.head()

Unnamed: 0,chrom,pv,variant
0,22,0.550129,22_24158149_A_G


In [47]:
# pv.to_csv(outfilename, sep='\t')