In [135]:
import re
import os
# import sys
# import time
import pandas as pd
import xarray as xr
from numpy import ones
from numpy.linalg import cholesky
from pandas_plink import read_plink1_bin
from limix.qc import quantile_gaussianize

In [136]:
from cellregmap import run_interaction

In [137]:
## input files folder
## (base directory from which the sample mapping file is read)
input_files_dir = "/hps/nobackup2/stegle/users/acuomo/all_scripts/struct_LMM2/sc_neuroseq/May2021/input_files/"

In [138]:
############################################
########## Sample mapping file #############
############################################

In [139]:
## Mapping from pseudocells to donors. It only includes donors for which we
## have single-cell data (a subset of all HipSci donors).
sample_mapping_file = os.path.join(input_files_dir, "sample_mapping_file.csv")
sample_mapping = pd.read_csv(
    sample_mapping_file,
    # force both ID columns to be read as strings
    dtype=dict(genotype_individual_id=str, phenotype_sample_id=str),
)
sample_mapping.head()

Unnamed: 0,genotype_individual_id,phenotype_sample_id
0,HPSI0714i-iudw_1,HPSI0714i-iudw_1--DA--d30--0
1,HPSI0714i-iudw_1,HPSI0714i-iudw_1--DA--d30--1
2,HPSI0714i-iudw_1,HPSI0714i-iudw_1--DA--d30--2
3,HPSI0714i-iudw_1,HPSI0714i-iudw_1--DA--d30--3
4,HPSI0714i-iudw_1,HPSI0714i-iudw_1--DA--d30--4


In [140]:
## distinct donor IDs found in the sample mapping, sorted in place
donors = sample_mapping["genotype_individual_id"].unique()
donors.sort()
print("Number of unique donors: {}".format(donors.shape[0]))

Number of unique donors: 191


In [141]:
############################################
############# Kinship matrix ###############
############################################

In [142]:
## Load the GRM (genotype relationship matrix, i.e. the kinship matrix)
kinship_folder = "/hps/nobackup2/stegle/users/acuomo/hipsci_genotype_files/"
kinship_file = os.path.join(
    kinship_folder,
    "hipsci.wec.gtarray.HumanCoreExome.imputed_phased.20170327.genotypes.norm.renamed.kinship",
)
K = pd.read_csv(kinship_file, sep="\t", index_col=0)
## donors x donors: row and column labels must agree element-wise, in the same order
## (this checks the labels only, not that the values are symmetric)
assert (K.columns == K.index).all()

In [143]:
## Wrap the kinship values in a labelled array, with both axes sorted by donor ID
K = xr.DataArray(
    K.values,
    dims=["sample_0", "sample_1"],
    coords=dict(sample_0=K.columns, sample_1=K.index),
)
K = K.sortby("sample_0")
K = K.sortby("sample_1")
## keep only donors present in both the kinship matrix and the sample mapping
donors = sorted(set(K.sample_0.values).intersection(donors))
print("Number of donors after kinship intersection: {}".format(len(donors)))

Number of donors after kinship intersection: 173


In [144]:
## restrict both axes of K to the retained donors, in the order given by `donors`
K = K.sel({"sample_0": donors, "sample_1": donors})
assert all(K["sample_0"] == donors)
assert all(K["sample_1"] == donors)

In [145]:
## Cholesky factor of the kinship matrix, so that K = hK @ hK.T
hK = xr.DataArray(
    cholesky(K.values),
    dims=["sample", "col"],
    coords={"sample": K.sample_0.values},
)
assert all(hK.sample.values == K.sample_0.values)
## only the factor is used from here on
del K

In [146]:
print("Sample mapping number of rows BEFORE intersection: {}".format(len(sample_mapping)))
## keep only the rows whose donor is among the donors retained from the kinship matrix
in_kinship = sample_mapping["genotype_individual_id"].isin(donors)
sample_mapping = sample_mapping[in_kinship]
print("Sample mapping number of rows AFTER intersection: {}".format(len(sample_mapping)))

Sample mapping number of rows BEFORE intersection: 9219
Sample mapping number of rows AFTER intersection: 8352


In [147]:
############################################
##### expand from donors to cells ##########

## label-based selection with xarray: pick the donor row of hK for every row of
## the sample mapping, so hK_expanded follows the sample-mapping order
cell_donors = sample_mapping["genotype_individual_id"].values
hK_expanded = hK.sel(sample=cell_donors)
assert all(hK_expanded.sample.values == cell_donors)

In [148]:
## sanity check: dimensions of the expanded Cholesky factor (rows follow the sample mapping)
hK_expanded.shape

(8352, 173)

In [149]:
#####################################
############ Genotypes ##############
#####################################

In [150]:
## Load the genotypes (plink format); only the .bed path is passed to the reader
plink_folder = "/hps/nobackup2/stegle/users/acuomo/hipsci_genotype_files/"
plink_file = os.path.join(
    plink_folder,
    "hipsci.wec.gtarray.HumanCoreExome.imputed_phased.20170327.genotypes.norm.renamed.bed",
)
G = read_plink1_bin(plink_file)

Mapping files: 100%|██████████| 3/3 [06:16<00:00, 125.65s/it]


In [151]:
## sanity check: dimensions of the genotype array loaded from the plink files
G.shape

(1610, 10464962)

In [152]:
######################################
########## Cell contexts #############
######################################

In [153]:
# cells by dummies (18 clusters)
# first CSV column (cell IDs) is used as the row index; remaining columns are the cluster indicators
C_file = "/hps/nobackup2/stegle/users/acuomo/all_scripts/struct_LMM2/sc_neuroseq/May2021/REVISION/C_discrete_18clusters.csv"
C = pd.read_csv(C_file, index_col = 0)
C.head()

Unnamed: 0,cluster_0,cluster_1,cluster_2,cluster_3,cluster_4,cluster_5,cluster_6,cluster_7,cluster_8,cluster_9,cluster_10,cluster_11,cluster_12,cluster_13,cluster_14,cluster_15,cluster_16,cluster_17
HPSI0714i-iudw_1--DA--d30--0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
HPSI0714i-iudw_1--DA--d30--1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
HPSI0714i-iudw_1--DA--d30--2,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
HPSI0714i-iudw_1--DA--d30--3,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
HPSI0714i-iudw_1--DA--d30--4,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [154]:
## Wrap the contexts in a labelled array and put the cells in sample-mapping order
C = xr.DataArray(
    C.values,
    dims=["cell", "pc"],
    coords=dict(cell=C.index.values, pc=C.columns.values),
)
mapped_cells = sample_mapping["phenotype_sample_id"].values
C = C.sel(cell=mapped_cells)
assert all(C.cell.values == mapped_cells)

In [155]:
# cell contexts are used as-is: quantile normalisation is currently disabled,
# so C_gauss is just another name for C (the disabled call is kept below)
C_gauss = C
# C_gauss = quantile_gaussianize(C)

In [156]:
#####################################
############ Phenotypes #############
#####################################

In [157]:
# Phenotype (meta-cell gene expression)
# NOTE: unpickling can execute arbitrary code - only load pickle files from a trusted source
phenotype_file = "/hps/nobackup2/stegle/users/acuomo/all_scripts/struct_LMM2/sc_neuroseq/input_files/DA_all_conditions/phenotype.csv.pkl"
phenotype = pd.read_pickle(phenotype_file)

In [158]:
print("Phenotype shape BEFORE selection: {}".format(phenotype.shape))
## labelled (trait x cell) array, restricted to the cells in the sample mapping
## and arranged in the same order
phenotype = xr.DataArray(
    phenotype.values,
    dims=["trait", "cell"],
    coords=dict(trait=phenotype.index.values, cell=phenotype.columns.values),
)
cells_kept = sample_mapping["phenotype_sample_id"].values
phenotype = phenotype.sel(cell=cells_kept)
print("Phenotype shape AFTER selection: {}".format(phenotype.shape))
assert all(phenotype.cell.values == cells_kept)

Phenotype shape BEFORE selection: (32738, 9982)
Phenotype shape AFTER selection: (32738, 8352)


In [159]:
#####################################
############ Filter file ############
#####################################

In [161]:
## chromosome used to select the eQTL genes tested in this run
chrom = 19

In [162]:
# Filter on specific gene-SNP pairs
# eQTL from neuroseq DA (day30 + day52 + day52 ROT treated)
neuro_eqtl_file = "/hps/nobackup2/stegle/users/acuomo/all_scripts/struct_LMM2/sc_neuroseq/input_files/DA_all_conditions/DA_eqtl_allconditions_FDR5pct.csv" # consider filter further (significant only)
neuro_eqtl = pd.read_csv(neuro_eqtl_file)
# The chromosome is the part of snp_id before the first underscore. Splitting is
# used instead of slicing at str.find("_"): find returns -1 when there is no
# underscore, which would silently drop the last character of the ID.
neuro_eqtl["chrom"] = neuro_eqtl["snp_id"].str.split("_", n=1).str[0].astype(int)
# genes with at least one eQTL on the selected chromosome
genes = neuro_eqtl.loc[neuro_eqtl["chrom"] == int(chrom), "feature"].unique()

In [163]:
## number of genes selected for this chromosome
len(genes)

101

In [164]:
## output folder: one results file per gene is written here
outdir = "/hps/nobackup2/stegle/users/acuomo/all_scripts/struct_LMM2/sc_neuroseq/May2021/REVISION/CRM_interaction_discrete_contexts/18clusters/"

In [165]:
## display the selected genes
genes

array(['ETFB', 'MED29', 'HNRNPL', 'PPP1R14A', 'MZF1', 'SF3A2', 'AMH',
       'LSM7', 'STXBP2', 'MARCH2', 'KANK3', 'RAB11B-AS1', 'UBL5',
       'C19orf82', 'MAST3', 'PGPEP1', 'NOVA2', 'PRKD2', 'SEPW1',
       'SLC25A23', 'ALKBH7', 'ZNF557', 'DUS3L', 'LONP1', 'PDCD5', 'PEPD',
       'GEMIN7', 'CEACAM21', 'TMEM91', 'TTC9B', 'ZNF880', 'RPS11', 'MRI1',
       'C19orf53', 'GIPC1', 'FARSA', 'WDR18', 'DAZAP1', 'BSG', 'ZNF730',
       'ZNF682', 'UBA52', 'POLR2I', 'THAP8', 'LINC00665', 'ZNF738',
       'ZNF429', 'COX6B1', 'ZNF30', 'NAT14', 'EPS8L1', 'TMEM190', 'ZFR2',
       'UBXN6', 'MIR7-3HG', 'ZNF324', 'ZNF419', 'ZNF773', 'ZNF584',
       'ZNF552', 'ZNF772', 'ZNF544', 'ZNF587', 'RPS9', 'NDUFA3', 'XRCC1',
       'ETHE1', 'ZNF793', 'ZNF781', 'ZNF420', 'CALR', 'CD320', 'RPS28',
       'VSTM2B', 'LINC00662', 'XAB2', 'GTF2F1', 'GPR108', 'FBXO27',
       'TPGS1', 'ZNF77', 'CRLF1', 'COPE', 'RPL28', 'C19orf10', 'SIRT6',
       'MRPL54', 'CPT1C', 'RAB8A', 'MRPL34', 'KLK10', 'SPTBN4', 'DHDH',
       'P

In [174]:
## Run the interaction test for every selected gene against the SNPs listed for
## that gene in the eQTL table, writing one results file per gene. Genes whose
## output file already exists are skipped, so the loop can be re-run after an
## interruption without redoing finished genes.

## Loop-invariant inputs, built once instead of once per gene
n_cells = phenotype.shape[1]
E_tested = C_gauss.values[:, 0:10]   # contexts passed as both E and E1
E_background = C.values[:, 0:20]     # contexts passed as E2
## NOTE(review): C has 18 cluster columns, so 0:10 keeps only the first ten
## clusters while 0:20 keeps all of them - confirm this is intended.

for trait_name in genes:
    ## the phenotype lookup and the output file name use "." where the eQTL table has "-"
    gene_name = re.sub("-", ".", trait_name)
    outfilename = outdir + gene_name + ".tsv"
    if os.path.exists(outfilename):
        print("Output for gene {} already exists, skipping".format(gene_name))
        continue
    ## SNPs listed for this gene in the eQTL table (matched on the original gene name)
    leads = neuro_eqtl[neuro_eqtl['feature'] == trait_name]['snp_id'].unique()
    G_sel = G[:, G['snp'].isin(leads)]
    ## expand genotypes from donors to cells, following the sample-mapping order
    G_expanded = G_sel.sel(sample=sample_mapping["genotype_individual_id"].values)
    assert all(hK_expanded.sample.values == G_expanded.sample.values)
    ## expression of this gene, quantile-gaussianized, as a column vector
    y = phenotype.sel(trait=gene_name)
    y = quantile_gaussianize(y)
    y = y.values.reshape(y.shape[0], 1)
    W = ones((n_cells, 1))  # single column of ones (presumably the intercept covariate)
    GG = G_expanded.values
    print("Running for gene {}".format(gene_name))
    ## only the first element returned by run_interaction is kept, as the p-values
    pvals = run_interaction(y=y, W=W, E=E_tested, E1=E_tested, E2=E_background, G=GG, hK=hK_expanded)[0]
    pv = pd.DataFrame({"chrom": G_expanded.chrom.values,
                       "pv": pvals,
                       "variant": G_expanded.snp.values})
    ## NOTE(review): to_csv defaults to comma separators although the file is named .tsv;
    ## left unchanged so existing readers of these files keep working
    pv.to_csv(outfilename)

File already exists, exiting


  return self.array[key]


Running for gene KLK10


100%|██████████| 1/1 [01:31<00:00, 91.64s/it]
