In [157]:
import re
import os
import pandas as pd
import xarray as xr
from numpy import ones
from numpy.linalg import cholesky
from pandas_plink import read_plink1_bin
from limix.qc import quantile_gaussianize

In [5]:
from cellregmap import run_interaction

In [211]:
# Chromosome to analyse; used further down to pick the eQTL genes whose SNPs lie on it.
chrom = 22

In [7]:
## input files folder (absolute path, with trailing slash because file names are appended to it directly)
input_files_dir = "/hps/nobackup/stegle/users/acuomo/all_scripts/struct_LMM2/sc_neuroseq/May2021/input_files/"

In [8]:
############################################
########## Sample mapping file #############
############################################

In [9]:
## This file maps pseudocells (phenotype_sample_id) to donors (genotype_individual_id).
## It only includes donors for which we have single-cell data (a subset of all HipSci donors).
sample_mapping_file = f"{input_files_dir}sample_mapping_file.csv"
sample_mapping = pd.read_csv(
    sample_mapping_file,
    # force both ID columns to be read as strings
    dtype={"genotype_individual_id": str, "phenotype_sample_id": str},
)
sample_mapping.head()

Unnamed: 0,genotype_individual_id,phenotype_sample_id
0,HPSI0714i-iudw_1,HPSI0714i-iudw_1--DA--d30--0
1,HPSI0714i-iudw_1,HPSI0714i-iudw_1--DA--d30--1
2,HPSI0714i-iudw_1,HPSI0714i-iudw_1--DA--d30--2
3,HPSI0714i-iudw_1,HPSI0714i-iudw_1--DA--d30--3
4,HPSI0714i-iudw_1,HPSI0714i-iudw_1--DA--d30--4


In [10]:
## distinct donor IDs found in the sample mapping, sorted
donors = sample_mapping["genotype_individual_id"].unique()
donors.sort()  # in-place sort of the array returned by unique()
print(f"Number of unique donors: {len(donors)}")

Number of unique donors: 191


In [11]:
############################################
############# Kinship matrix ###############
############################################

In [12]:
## load the GRM (genotype relationship matrix, i.e. the kinship matrix): a donors x donors table
kinship_folder = "/hps/nobackup/hipsci/scratch/genotypes/imputed/2017-03-27/Full_Filtered_SNPs_Plink-F/"
kinship_file = f"{kinship_folder}hipsci.wec.gtarray.HumanCoreExome.imputed_phased.20170327.genotypes.norm.renamed.kinship"
K = pd.read_csv(kinship_file, sep="\t", index_col=0)
# sanity check: row labels and column labels list the same donors in the same order
assert (K.columns == K.index).all()

In [13]:
# wrap the kinship table in a labelled array so donors can be selected by ID
K = xr.DataArray(
    K.values,
    dims=["sample_0", "sample_1"],
    coords={"sample_0": K.columns, "sample_1": K.index},
)
# order both axes by donor ID
K = K.sortby("sample_0")
K = K.sortby("sample_1")
# keep only donors that appear both in the kinship matrix and in the sample mapping
donors = sorted(set(K.sample_0.values).intersection(donors))
print(f"Number of donors after kinship intersection: {len(donors)}")

Number of donors after kinship intersection: 173


In [14]:
## restrict both axes of the kinship matrix to the retained donors, in `donors` order
K = K.sel({"sample_0": donors, "sample_1": donors})
assert (K.sample_0.values == donors).all()
assert (K.sample_1.values == donors).all()

In [15]:
## Cholesky factor of the kinship matrix, i.e. hK such that K = hK @ hK.T
hK = xr.DataArray(
    cholesky(K.values),
    dims=["sample", "col"],
    coords={"sample": K.sample_0.values},  # rows keep the donor labels of K
)
assert (hK.sample.values == K.sample_0.values).all()
# K itself is not needed once its factor is available
del K

In [16]:
print(f"Sample mapping number of rows BEFORE intersection: {len(sample_mapping)}")
## keep only the rows of the sample mapping whose donor is in the kinship matrix
sample_mapping = sample_mapping.loc[sample_mapping["genotype_individual_id"].isin(donors)]
print(f"Sample mapping number of rows AFTER intersection: {len(sample_mapping)}")

Sample mapping number of rows BEFORE intersection: 9219
Sample mapping number of rows AFTER intersection: 8352


In [17]:
############################################
##### expand from donors to cells ##########

## Select rows of hK by the donor ID of every cell (taken from the sample mapping file),
## so that a donor's row is repeated once for each of its cells.
hK_expanded = hK.sel({"sample": sample_mapping["genotype_individual_id"].values})
assert (hK_expanded.sample.values == sample_mapping["genotype_individual_id"].values).all()

In [18]:
# one row per cell in the filtered sample mapping, one column per donor in the kinship matrix
hK_expanded.shape

(8352, 173)

In [19]:
#####################################
############ Genotypes ##############
#####################################

In [20]:
## load the genotypes (plink format); only the path of the .bed file is passed to the reader
plink_folder = "/hps/nobackup/hipsci/scratch/genotypes/imputed/2017-03-27/Full_Filtered_SNPs_Plink/"
plink_file = f"{plink_folder}hipsci.wec.gtarray.HumanCoreExome.imputed_phased.20170327.genotypes.norm.renamed.bed"
G = read_plink1_bin(plink_file)


Mapping files:   0%|          | 0/3 [00:00<?, ?it/s][A
Mapping files:  33%|███▎      | 1/3 [00:15<00:31, 15.74s/it][A
Mapping files:  67%|██████▋   | 2/3 [00:16<00:11, 11.34s/it][A
Mapping files: 100%|██████████| 3/3 [05:55<00:00, 118.61s/it][A


In [21]:
# dimensions of the genotype array as loaded (samples x variants), before any donor/SNP selection
G.shape

(1610, 10464962)

In [22]:
######################################
########## Cell contexts #############
######################################

In [43]:
# cells by dummies (day 30 - day 52 - day 52ROT)
# The first CSV column (cell IDs) becomes the index; the remaining columns are the condition dummies.
C_file = "/hps/nobackup/stegle/users/acuomo/all_scripts/struct_LMM2/sc_neuroseq/May2021/REVISION/C_discrete.csv"
C = pd.read_csv(C_file, index_col = 0)
C.head()

Unnamed: 0,condition_d30,condition_d52_tr,condition_d52_untr
HPSI0714i-iudw_1--DA--d30--0,1,0,0
HPSI0714i-iudw_1--DA--d30--1,1,0,0
HPSI0714i-iudw_1--DA--d30--2,1,0,0
HPSI0714i-iudw_1--DA--d30--3,1,0,0
HPSI0714i-iudw_1--DA--d30--4,1,0,0


In [44]:
# labelled cells x contexts array (the context dimension is named "pc")
C = xr.DataArray(
    C.values,
    dims=["cell", "pc"],
    coords={"cell": C.index.values, "pc": C.columns.values},
)
# keep the cells listed in the sample mapping, in the same order
C = C.sel({"cell": sample_mapping["phenotype_sample_id"].values})
assert (C.cell.values == sample_mapping["phenotype_sample_id"].values).all()

In [27]:
# Cell contexts are used as-is here: C_gauss is just another name for C.
# The quantile gaussianization step is disabled (kept below for reference);
# NOTE(review): presumably because C holds 0/1 dummy columns - confirm.
C_gauss = C
# C_gauss = quantile_gaussianize(C)

In [28]:
#####################################
############ Phenotypes #############
#####################################

In [29]:
# Phenotype (meta-cell gene expression), stored as a pickled pandas object.
# NOTE(review): unpickling can execute arbitrary code - only load this file from a trusted location.
phenotype_file = "/hps/nobackup/stegle/users/acuomo/all_scripts/struct_LMM2/sc_neuroseq/input_files/DA_all_conditions/phenotype.csv.pkl"
phenotype = pd.read_pickle(phenotype_file)

In [30]:
print(f"Phenotype shape BEFORE selection: {phenotype.shape}")
# labelled traits x cells array: table rows become traits, table columns become cells
phenotype = xr.DataArray(
    phenotype.values,
    dims=["trait", "cell"],
    coords={"trait": phenotype.index.values, "cell": phenotype.columns.values},
)
# keep the cells listed in the sample mapping, in the same order
phenotype = phenotype.sel({"cell": sample_mapping["phenotype_sample_id"].values})
print(f"Phenotype shape AFTER selection: {phenotype.shape}")
assert (phenotype.cell.values == sample_mapping["phenotype_sample_id"].values).all()

Phenotype shape BEFORE selection: (32738, 9982)
Phenotype shape AFTER selection: (32738, 8352)


In [31]:
#####################################
############ Filter file ############
#####################################

In [212]:
# Filter on specific gene-SNP pairs
# eQTL from neuroseq DA (day30 + day52 + day52 ROT treated)
neuro_eqtl_file = "/hps/nobackup/stegle/users/acuomo/all_scripts/struct_LMM2/sc_neuroseq/input_files/DA_all_conditions/DA_eqtl_allconditions_FDR5pct.csv" # consider filter further (significant only)
neuro_eqtl = pd.read_csv(neuro_eqtl_file)
# The chromosome is the part of snp_id before the first "_".
# split() is used instead of slicing at str.find("_"): find() returns -1 when the ID
# contains no underscore, and slicing with -1 would silently drop the last character.
neuro_eqtl["chrom"] = [int(snp_id.split("_", 1)[0]) for snp_id in neuro_eqtl["snp_id"]]
# genes (features) with at least one eQTL SNP on the selected chromosome
genes = neuro_eqtl[neuro_eqtl['chrom']==int(chrom)]['feature'].unique()

In [213]:
# number of genes selected for the chosen chromosome
len(genes)

43

In [214]:
# output folder (with trailing slash): one "<gene>.tsv" result file per gene is written here
outdir = "/hps/nobackup/stegle/users/acuomo/all_scripts/struct_LMM2/sc_neuroseq/May2021/REVISION/CRM_int/"

In [215]:
# display the selected gene names
genes

array(['TTC5', 'GPATCH2L', 'LIN52', 'SPATA7', 'RPL36AL', 'ARF6',
       'TXNDC16', 'NIN', 'ABHD12B', 'ERO1L', 'PPP2R5C', 'MOK', 'APOPT1',
       'DTD2', 'NFATC4', 'DHRS4', 'DHRS1', 'DHRS4L2', 'CHMP4A', 'DAAM1',
       'KTN1', 'CHURC1', 'IFI27L2', 'IFI27L1', 'NUDT14', 'ELMSAN1',
       'RBM23', 'PSMB5', 'C14orf79', 'SIVA1', 'ADSSL1', 'ACYP1', 'RAB15',
       'SDR39U1', 'CHGA', 'COX16', 'PTGR2', 'VCPKMT', 'ATP5S', 'LGALS3',
       'GSKIP', 'ATXN3', 'TC2N'], dtype=object)

In [None]:
# Run the CellRegMap interaction test for every selected gene and write one result
# file per gene. Genes whose result file already exists are skipped, so the cell can
# be re-run to resume an interrupted job.

# ---- inputs that do not depend on the gene: built once, not once per iteration ----
# donor ID of every cell, used to expand donor-level genotypes to cell level
cell_donor_ids = sample_mapping["genotype_individual_id"].values
n_cells = phenotype.shape[1]
# covariates: a single column of ones (presumably the intercept - confirm against run_interaction docs)
W = ones((n_cells, 1))
# Context matrices. The 0:10 / 0:20 slices only cap the number of context columns;
# they select every column when C has fewer columns than the cap.
E_contexts = C_gauss.values[:, 0:10]
E2_contexts = C.values[:, 0:20]

for trait_name in genes:
    # the file name uses the gene name as it appears in the eQTL table
    outfilename = outdir + str(trait_name) + ".tsv"
    if os.path.exists(outfilename):
        # only this gene is skipped; the loop carries on with the next one
        print("File already exists, skipping gene {}".format(trait_name))
        continue
    # SNPs reported as eQTL for this gene
    leads = neuro_eqtl[neuro_eqtl['feature']==trait_name]['snp_id'].unique()
    G_sel = G[:, G['snp'].isin(leads)]
    # expand genotypes from donors to cells (one row per cell)
    G_expanded = G_sel.sel(sample=cell_donor_ids)
    assert all(hK_expanded.sample.values == G_expanded.sample.values)
    # the phenotype table is looked up with "." in place of "-" in gene names
    trait_name = trait_name.replace("-", ".")
    y = phenotype.sel(trait=trait_name)
    y = quantile_gaussianize(y)
    # column vector, cells x 1
    y = y.values.reshape(y.shape[0], 1)
    GG = G_expanded.values
    print("Running for gene {}".format(trait_name))
    # the first element of the returned value holds the p-values (one per SNP)
    pvals = run_interaction(y=y, W=W, E=E_contexts, E1=E_contexts, E2=E2_contexts, G=GG, hK=hK_expanded)[0]
    pv = pd.DataFrame({"chrom": G_expanded.chrom.values,
                       "pv": pvals,
                       "variant": G_expanded.snp.values})
    # NOTE(review): despite the ".tsv" extension, to_csv defaults write a comma-separated
    # file with the index as first column. Left unchanged so existing readers keep working.
    pv.to_csv(outfilename)

  return self.array[key]


Running for gene TTC5


100%|██████████| 1/1 [00:12<00:00, 12.75s/it]
  return self.array[key]


Running for gene GPATCH2L


100%|██████████| 1/1 [00:12<00:00, 12.73s/it]
  return self.array[key]


Running for gene LIN52


100%|██████████| 2/2 [00:27<00:00, 13.59s/it]
  return self.array[key]


Running for gene SPATA7


100%|██████████| 1/1 [00:12<00:00, 12.56s/it]
  return self.array[key]


Running for gene RPL36AL


100%|██████████| 2/2 [00:23<00:00, 11.81s/it]
  return self.array[key]


Running for gene ARF6


100%|██████████| 2/2 [00:25<00:00, 12.63s/it]
  return self.array[key]


Running for gene TXNDC16


100%|██████████| 1/1 [00:12<00:00, 12.94s/it]
  return self.array[key]


Running for gene NIN


100%|██████████| 1/1 [00:12<00:00, 12.26s/it]
  return self.array[key]


Running for gene ABHD12B


100%|██████████| 1/1 [00:13<00:00, 13.09s/it]
  return self.array[key]
