In [1]:
import re
import os
# import sys
# import time
import pandas as pd
import xarray as xr
from numpy import ones
from numpy.linalg import cholesky
from pandas_plink import read_plink1_bin
from limix.qc import quantile_gaussianize

In [2]:
from cellregmap import run_interaction

In [42]:
## Folder holding the input files for this analysis (the sample mapping file is read from here).
## NOTE(review): absolute, cluster-specific path - adjust it when running elsewhere.
input_files_dir = "/hps/nobackup2/stegle/users/acuomo/all_scripts/struct_LMM2/sc_neuroseq/May2021/input_files/"

In [43]:
############################################
########## Sample mapping file #############
############################################

In [44]:
## The sample mapping file links every pseudocell (phenotype_sample_id) to its
## donor (genotype_individual_id). It only includes donors for which we have
## single-cell data (a subset of all HipSci donors).
sample_mapping_file = "{}sample_mapping_file.csv".format(input_files_dir)
sample_mapping = pd.read_csv(
    sample_mapping_file,
    dtype={"genotype_individual_id": str, "phenotype_sample_id": str},  # keep both ids as text
)
sample_mapping.head()

Unnamed: 0,genotype_individual_id,phenotype_sample_id
0,HPSI0714i-iudw_1,HPSI0714i-iudw_1--DA--d30--0
1,HPSI0714i-iudw_1,HPSI0714i-iudw_1--DA--d30--1
2,HPSI0714i-iudw_1,HPSI0714i-iudw_1--DA--d30--2
3,HPSI0714i-iudw_1,HPSI0714i-iudw_1--DA--d30--3
4,HPSI0714i-iudw_1,HPSI0714i-iudw_1--DA--d30--4


In [45]:
## Sorted array of the distinct donor ids present in the sample mapping
donor_id_column = sample_mapping["genotype_individual_id"]
donors = donor_id_column.unique()
donors.sort()  # in-place sort of the array returned by unique()
print("Number of unique donors: {}".format(donors.size))

Number of unique donors: 191


In [46]:
############################################
############# Kinship matrix ###############
############################################

In [47]:
## Load the GRM (genotype relationship matrix, i.e. the kinship matrix) as a
## donors x donors table; the first column of the file holds the row labels.
kinship_folder = "/hps/nobackup2/stegle/users/acuomo/hipsci_genotype_files/"
kinship_file = "{}hipsci.wec.gtarray.HumanCoreExome.imputed_phased.20170327.genotypes.norm.renamed.kinship".format(kinship_folder)
K = pd.read_csv(kinship_file, index_col=0, sep="\t")
## Rows and columns must list the same donors in the same order.
## (This checks the labels only, not that the values are symmetric.)
assert (K.columns == K.index).all()

In [48]:
## Wrap the kinship table in a labelled array and order both axes by donor id
K = xr.DataArray(
    K.values,
    dims=["sample_0", "sample_1"],
    coords={"sample_0": K.columns, "sample_1": K.index},
)
K = K.sortby("sample_0").sortby("sample_1")
## Keep only the donors present in both the kinship matrix and the sample mapping
donors = sorted(set(K.sample_0.values).intersection(donors))
print("Number of donors after kinship intersection: {}".format(len(donors)))

Number of donors after kinship intersection: 173


In [49]:
## Restrict both axes of K to the retained donors, in the order given by `donors`
K = K.sel(sample_0=donors, sample_1=donors)
for axis_name in ("sample_0", "sample_1"):
    assert all(K[axis_name] == donors)

In [50]:
## Cholesky factor of the kinship matrix, i.e. hK such that K = hK @ hK.T,
## labelled by donor along its rows.
hK = xr.DataArray(
    cholesky(K.values),
    dims=["sample", "col"],
    coords={"sample": K.sample_0.values},
)
assert (hK.sample.values == K.sample_0.values).all()
del K  # only the factor is used from here on

In [51]:
n_rows_before = sample_mapping.shape[0]
print("Sample mapping number of rows BEFORE intersection: {}".format(n_rows_before))
## Keep only the cells whose donor is present in the kinship matrix
donor_in_kinship = sample_mapping["genotype_individual_id"].isin(donors)
sample_mapping = sample_mapping.loc[donor_in_kinship]
print("Sample mapping number of rows AFTER intersection: {}".format(len(sample_mapping)))

Sample mapping number of rows BEFORE intersection: 9219
Sample mapping number of rows AFTER intersection: 8352


In [52]:
############################################
##### expand from donors to cells ##########

## Selecting hK by the donor id of every cell (from the sample mapping) yields
## one row per cell: each donor's row is repeated for all of that donor's cells.
donor_of_each_cell = sample_mapping["genotype_individual_id"].values
hK_expanded = hK.sel(sample=donor_of_each_cell)
assert all(hK_expanded.sample.values == donor_of_each_cell)

In [53]:
# Sanity check: one row per cell in sample_mapping, one column per column of the Cholesky factor
hK_expanded.shape

(8352, 173)

In [54]:
#####################################
############ Genotypes ##############
#####################################

In [55]:
## Load the genotypes from the plink bed file.
## NOTE: mapping the plink files is slow (several minutes in the recorded run).
plink_folder = "/hps/nobackup2/stegle/users/acuomo/hipsci_genotype_files/"
plink_file = "{}hipsci.wec.gtarray.HumanCoreExome.imputed_phased.20170327.genotypes.norm.renamed.bed".format(plink_folder)
G = read_plink1_bin(plink_file)

Mapping files: 100%|██████████| 3/3 [06:45<00:00, 135.27s/it]


In [56]:
# Genotype array dimensions: samples along the first axis, variants along the second
G.shape

(1610, 10464962)

In [57]:
######################################
########## Cell contexts #############
######################################

In [58]:
# cells by dummies (day 30 - day 52 - day 52ROT)
# C_file = "/hps/nobackup/stegle/users/acuomo/all_scripts/struct_LMM2/sc_neuroseq/May2021/REVISION/C_discrete.csv"

In [59]:
# Cell contexts: cells by dummy (0/1) columns, one column per cluster (9 clusters)
# NOTE(review): absolute, cluster-specific path - adjust it when running elsewhere.
C_file = "/hps/nobackup2/stegle/users/acuomo/all_scripts/struct_LMM2/sc_neuroseq/May2021/REVISION/C_discrete_9clusters.csv"

In [60]:
# Read the context table; the first column of the file (the cell id) becomes the index
C = pd.read_csv(C_file, index_col = 0)
C.head()

Unnamed: 0,cluster_0,cluster_1,cluster_2,cluster_3,cluster_4,cluster_5,cluster_6,cluster_7,cluster_8
HPSI0714i-iudw_1--DA--d30--0,1,0,0,0,0,0,0,0,0
HPSI0714i-iudw_1--DA--d30--1,1,0,0,0,0,0,0,0,0
HPSI0714i-iudw_1--DA--d30--2,1,0,0,0,0,0,0,0,0
HPSI0714i-iudw_1--DA--d30--3,1,0,0,0,0,0,0,0,0
HPSI0714i-iudw_1--DA--d30--4,1,0,0,0,0,0,0,0,0


In [61]:
## Wrap the context table in a labelled array (cells x context columns).
## The second dimension is called "pc" although it holds the cluster dummy columns here.
C = xr.DataArray(
    C.values,
    dims=["cell", "pc"],
    coords={"cell": C.index.values, "pc": C.columns.values},
)
## Keep the cells listed in the sample mapping, in the sample mapping's order
mapped_cell_ids = sample_mapping["phenotype_sample_id"].values
C = C.sel(cell=mapped_cell_ids)
assert all(C.cell.values == mapped_cell_ids)

In [62]:
# Cell contexts are used as-is: quantile normalisation is currently disabled
# (see the commented-out call below), so C_gauss is just another name for C.
C_gauss = C
# C_gauss = quantile_gaussianize(C)

In [63]:
#####################################
############ Phenotypes #############
#####################################

In [64]:
# Phenotype (meta-cell gene expression), stored as a pickled pandas object
# with traits (genes) on the index and cells on the columns.
# NOTE(review): pd.read_pickle can execute arbitrary code - only load files from a trusted source.
phenotype_file = "/hps/nobackup2/stegle/users/acuomo/all_scripts/struct_LMM2/sc_neuroseq/input_files/DA_all_conditions/phenotype.csv.pkl"
phenotype = pd.read_pickle(phenotype_file)

In [65]:
print("Phenotype shape BEFORE selection: {}".format(phenotype.shape))
## Wrap the expression table in a labelled array (traits x cells)
phenotype = xr.DataArray(
    phenotype.values,
    dims=["trait", "cell"],
    coords={"trait": phenotype.index.values, "cell": phenotype.columns.values},
)
## Keep the cells listed in the sample mapping, in the sample mapping's order
cells_in_mapping = sample_mapping["phenotype_sample_id"].values
phenotype = phenotype.sel(cell=cells_in_mapping)
print("Phenotype shape AFTER selection: {}".format(phenotype.shape))
assert all(phenotype.cell.values == cells_in_mapping)

Phenotype shape BEFORE selection: (32738, 9982)
Phenotype shape AFTER selection: (32738, 8352)


In [66]:
#####################################
############ Filter file ############
#####################################

In [78]:
# Chromosome processed in this run; used below to select the eQTL genes to test
chrom = 17

In [79]:
# Filter on specific gene-SNP pairs
# eQTL from neuroseq DA (day30 + day52 + day52 ROT treated)
neuro_eqtl_file = "/hps/nobackup2/stegle/users/acuomo/all_scripts/struct_LMM2/sc_neuroseq/input_files/DA_all_conditions/DA_eqtl_allconditions_FDR5pct.csv" # consider filter further (significant only)
neuro_eqtl = pd.read_csv(neuro_eqtl_file)
# The chromosome is the text before the first "_" of snp_id.
# Splitting (rather than slicing at str.find) means an id without any "_" is
# parsed as a whole or fails loudly, instead of silently losing its last character.
neuro_eqtl["chrom"] = neuro_eqtl["snp_id"].str.split("_", n=1).str[0].astype(int)
# Genes with at least one eQTL SNP on the chromosome of interest
genes = neuro_eqtl.loc[neuro_eqtl["chrom"] == int(chrom), "feature"].unique()

In [80]:
# Number of genes selected for this chromosome
len(genes)

80

In [81]:
# Folder receiving the results: one "<gene>.tsv" file per tested gene
# NOTE(review): absolute, cluster-specific path - adjust it when running elsewhere.
outdir = "/hps/nobackup2/stegle/users/acuomo/all_scripts/struct_LMM2/sc_neuroseq/May2021/REVISION/CRM_interaction_discrete_contexts/9clusters/"

In [84]:
# Display the selected gene names (as they appear in the eQTL table)
genes

array(['SPATA20', 'ATP5G1', 'ZNF652', 'PRKCA', 'NEK8', 'USP22', 'MED31',
       'EIF5A', 'C17orf49', 'CCDC40', 'SRR', 'NDEL1', 'VAMP2', 'HEXDC',
       'ARL16', 'DYNLL2', 'C17orf75', 'TEFM', 'AKAP10', 'SPECC1', 'MXRA7',
       'TEN1', 'LLGL1', 'MPRIP-AS1', 'SMCR5', 'TMEM97', 'LYRM9', 'MED24',
       'TUBG2', 'SRCIN1', 'ZNHIT3', 'DDX52', 'ZSWIM7', 'ZNF286A', 'NSF',
       'MRPL10', 'ARL17A', 'ARL17B', 'LRRC37A2', 'ELAC2', 'VMO1',
       'TAX1BP3', 'MRPS7', 'FTSJ3', 'DDX42', 'MAP3K3', 'COX11', 'KANSL1',
       'CRHR1', 'KANSL1-AS1', 'WNT3', 'KPNB1', 'TRIM47', 'HID1',
       'SLC16A3', 'DBF4B', 'DCAKD', 'MAPT-AS1', 'C17orf89', 'EPN3',
       'YWHAE', 'DHRS13', 'DHRS11', 'C17orf50', 'SRSF2', 'ALKBH5',
       'GUCY2D', 'RPL23A', 'KRT19', 'UTP18', 'TOB1', 'KCNJ16', 'FAM211A',
       'TRPV2', 'LINC00672', 'RAD51C', 'COIL', 'TEX2', 'FN3K', 'SLC25A39'],
      dtype=object)

In [None]:
## Run the CellRegMap interaction test gene by gene, writing one result file
## per gene. Genes whose result file already exists are skipped, so this cell
## can be re-run to resume an interrupted job.
os.makedirs(outdir, exist_ok=True)  # make sure the results folder exists

## Fixed-effect covariates: intercept only. Identical for every gene, so built once.
n_cells = phenotype.shape[1]
W = ones((n_cells, 1))

for trait_name in genes:
    ## The eQTL table uses "-" in gene names; the phenotype labels and the
    ## output file names use "." instead.
    gene_name = trait_name.replace("-", ".")
    outfilename = outdir + str(gene_name) + ".tsv"
    if os.path.exists(outfilename):
        print("File already exists for gene {}, skipping".format(gene_name))
        continue
    ## SNPs to test: every eQTL SNP reported for this gene
    leads = neuro_eqtl[neuro_eqtl['feature']==trait_name]['snp_id'].unique()
    G_sel = G[:,G['snp'].isin(leads)]
    if G_sel.shape[1] == 0:
        ## none of the gene's SNPs is present in the genotype file: nothing to test
        print("No lead SNPs found in genotypes for gene {}, skipping".format(gene_name))
        continue
    ## expand genotypes from donors to cells, in the same order as hK_expanded
    G_expanded = G_sel.sel(sample=sample_mapping["genotype_individual_id"].values)
    assert all(hK_expanded.sample.values == G_expanded.sample.values)
    ## expression of this gene across cells, quantile-gaussianized, as a column vector
    y = phenotype.sel(trait=gene_name)
    y = quantile_gaussianize(y)
    y = y.values.reshape(y.shape[0],1)
    GG = G_expanded.values
    print("Running for gene {}".format(gene_name))
    ## The 0:10 / 0:20 slices are upper bounds on the number of context columns;
    ## with the 9-cluster context table they select every column.
    pvals = run_interaction(y=y, W=W, E=C_gauss.values[:,0:10], E1=C_gauss.values[:,0:10], E2=C.values[:,0:20], G=GG, hK=hK_expanded)[0]
    pv = pd.DataFrame({"chrom":G_expanded.chrom.values,
           "pv":pvals,
           "variant":G_expanded.snp.values})
    ## NOTE(review): the file is named .tsv but written comma-separated; kept
    ## as-is so new results stay consistent with files already in outdir.
    ## Write to a temporary file and move it into place, so that an interrupted
    ## run never leaves a truncated result that the existence check would skip.
    tmpfilename = outfilename + ".tmp"
    pv.to_csv(tmpfilename)
    os.replace(tmpfilename, outfilename)

File already exists, exiting
File already exists, exiting
File already exists, exiting
File already exists, exiting
File already exists, exiting
File already exists, exiting
File already exists, exiting
File already exists, exiting
File already exists, exiting
File already exists, exiting
File already exists, exiting
File already exists, exiting
File already exists, exiting
File already exists, exiting
File already exists, exiting
File already exists, exiting
File already exists, exiting
File already exists, exiting
File already exists, exiting
File already exists, exiting
File already exists, exiting
File already exists, exiting
File already exists, exiting
File already exists, exiting
File already exists, exiting
File already exists, exiting
File already exists, exiting
File already exists, exiting
File already exists, exiting
File already exists, exiting
File already exists, exiting
File already exists, exiting
File already exists, exiting
File already exists, exiting
File already e

  return self.array[key]


Running for gene KANSL1


100%|██████████| 3/3 [02:17<00:00, 45.95s/it]
  return self.array[key]


Running for gene CRHR1


100%|██████████| 2/2 [01:31<00:00, 45.67s/it]
  return self.array[key]


Running for gene KANSL1.AS1


100%|██████████| 2/2 [01:27<00:00, 43.50s/it]
  return self.array[key]


Running for gene WNT3


100%|██████████| 1/1 [00:39<00:00, 39.99s/it]
  return self.array[key]


Running for gene KPNB1


100%|██████████| 1/1 [00:40<00:00, 40.96s/it]
  return self.array[key]


Running for gene TRIM47


100%|██████████| 1/1 [00:43<00:00, 43.02s/it]
  return self.array[key]


Running for gene HID1


100%|██████████| 1/1 [00:41<00:00, 41.41s/it]
  return self.array[key]


Running for gene SLC16A3


100%|██████████| 2/2 [01:22<00:00, 41.25s/it]
  return self.array[key]


Running for gene DBF4B


100%|██████████| 1/1 [00:49<00:00, 49.20s/it]
  return self.array[key]


Running for gene DCAKD


100%|██████████| 1/1 [00:48<00:00, 48.80s/it]
  return self.array[key]


Running for gene MAPT.AS1


100%|██████████| 1/1 [00:47<00:00, 47.95s/it]
  return self.array[key]


Running for gene C17orf89


100%|██████████| 2/2 [01:29<00:00, 44.73s/it]
  return self.array[key]


Running for gene EPN3


100%|██████████| 1/1 [00:46<00:00, 46.99s/it]
  return self.array[key]


Running for gene YWHAE


100%|██████████| 2/2 [01:18<00:00, 39.44s/it]
  return self.array[key]


Running for gene DHRS13


100%|██████████| 1/1 [00:44<00:00, 44.85s/it]
  return self.array[key]


Running for gene DHRS11


100%|██████████| 1/1 [00:46<00:00, 46.52s/it]
  return self.array[key]


Running for gene C17orf50


100%|██████████| 2/2 [01:32<00:00, 46.24s/it]
  return self.array[key]


Running for gene SRSF2


100%|██████████| 1/1 [00:47<00:00, 47.59s/it]
  return self.array[key]


Running for gene ALKBH5


100%|██████████| 1/1 [00:45<00:00, 45.02s/it]
  return self.array[key]


Running for gene GUCY2D


100%|██████████| 2/2 [01:33<00:00, 46.85s/it]
  return self.array[key]


Running for gene RPL23A


100%|██████████| 2/2 [01:17<00:00, 38.53s/it]
  return self.array[key]


Running for gene KRT19


  0%|          | 0/1 [00:00<?, ?it/s]