In [1]:
import os
import re
import time
import pandas as pd
import xarray as xr
from numpy import ones
from numpy.linalg import cholesky
from pandas_plink import read_plink1_bin
from limix.qc import quantile_gaussianize

In [2]:
import cellregmap 
cellregmap 

<module 'cellregmap' from '/hps/nobackup/stegle/users/acuomo/git_repos/CellRegMap/cellregmap/__init__.py'>

In [3]:
from cellregmap import run_interaction

In [4]:
## directory holding the pre-processed input files read below
## (sample mapping, phenotype pickle, eQTL filter list)
input_files_dir = "/hps/nobackup/stegle/users/acuomo/all_scripts/struct_LMM2/sc_endodiff/new/input_files/"

In [5]:
## sample mapping file: links every cell to the donor it comes from
## it only includes donors we have single-cell data for (a subset of all HipSci donors)
## both ID columns are forced to str so they can be used as labels
sample_mapping_file = os.path.join(input_files_dir, "sample_mapping_file.csv")
sample_mapping = pd.read_csv(
    sample_mapping_file,
    dtype={"genotype_individual_id": str, "phenotype_sample_id": str},
)

In [6]:
## genotype_individual_id are donor IDs, as found in the genotype matrix (G) and GRM covariance (K)
## phenotype_sample_id are cell IDs, as found in the scRNA-seq phenotype vector (y) and cell context covariance (C)
## several cells map to the same donor, so donor IDs repeat across rows
sample_mapping.head()

Unnamed: 0,genotype_individual_id,phenotype_sample_id
0,HPSI0114i-joxm_1,21843_1#10
1,HPSI0314i-fafq_1,21843_1#100
2,HPSI0314i-fafq_1,21843_1#101
3,HPSI1013i-wuye_2,21843_1#102
4,HPSI0114i-joxm_1,21843_1#103


In [7]:
## unique donor IDs found in the sample mapping, in sorted order
donor_column = sample_mapping["genotype_individual_id"]
donors = donor_column.unique()
donors.sort()  # in-place sort of the array returned by unique()
n_unique_donors = len(donors)
print("Number of unique donors: {}".format(n_unique_donors))

Number of unique donors: 126


In [8]:
############################################
################ Kinship matrix ############
############################################

In [9]:
## load the GRM (genotype relationship matrix, a.k.a. kinship matrix) as a donors x donors table
kinship_folder = "/hps/nobackup/hipsci/scratch/genotypes/imputed/2017-03-27/Full_Filtered_SNPs_Plink-F/"
kinship_file = os.path.join(kinship_folder, "hipsci.wec.gtarray.HumanCoreExome.imputed_phased.20170327.genotypes.norm.renamed.kinship")
K = pd.read_csv(kinship_file, sep="\t", index_col=0)
## row and column labels must be identical and in the same order
## (this checks the labels only, not that the values are symmetric)
assert (K.columns == K.index).all()

In [10]:
## wrap the kinship table in a labelled DataArray:
## axis 0 (rows) is labelled with the row index, axis 1 (columns) with the column labels
K = xr.DataArray(K.values, dims=["sample_0", "sample_1"], coords={"sample_0": K.index, "sample_1": K.columns})
## order both axes by donor ID
K = K.sortby("sample_0").sortby("sample_1")
## keep only donors present in both the kinship matrix and the current donor list, sorted
donors = sorted(set(K.sample_0.values).intersection(donors))
print("Number of donors after kinship intersection: {}".format(len(donors)))

Number of donors after kinship intersection: 125


In [11]:
## restrict both axes of K to the retained donors, in `donors` order
K = K.sel(sample_0=donors, sample_1=donors)
## both axes must now list exactly those donors, in that order
for axis_name in ("sample_0", "sample_1"):
    assert all(K[axis_name] == donors)

In [12]:
## Cholesky factor of K, i.e. hK such that K = hK @ hK.T
## rows keep the donor labels of K; the second axis ("col") is unlabelled
donor_labels = K.sample_0.values
hK = xr.DataArray(cholesky(K.values), dims=["sample", "col"], coords={"sample": donor_labels})
assert all(hK.sample.values == donor_labels)

In [13]:
## K is dropped here; only its factor hK is used from this point on
del K
n_rows_before = sample_mapping.shape[0]
print("Sample mapping number of rows BEFORE intersection: {}".format(n_rows_before))
## keep only the cells whose donor is in the retained donor list
donor_is_retained = sample_mapping["genotype_individual_id"].isin(donors)
sample_mapping = sample_mapping[donor_is_retained]
n_rows_after = sample_mapping.shape[0]
print("Sample mapping number of rows AFTER intersection: {}".format(n_rows_after))

Sample mapping number of rows BEFORE intersection: 34256
Sample mapping number of rows AFTER intersection: 33964


In [14]:
############################################
##### expand from donors to cells ##########

In [15]:
## expand hK from one row per donor to one row per cell:
## selecting by the donor ID of every cell repeats each donor's row once per cell
cell_donor_ids = sample_mapping["genotype_individual_id"].values
hK_expanded = hK.sel(sample=cell_donor_ids)
assert all(hK_expanded.sample.values == cell_donor_ids)

In [16]:
######################################
############### Genotypes ############
######################################

In [17]:
## load the genotypes (plink1 binary format) starting from the .bed file path
plink_folder = "/hps/nobackup/hipsci/scratch/genotypes/imputed/2017-03-27/Full_Filtered_SNPs_Plink/"
plink_file = os.path.join(plink_folder, "hipsci.wec.gtarray.HumanCoreExome.imputed_phased.20170327.genotypes.norm.renamed.bed")
G = read_plink1_bin(plink_file)

Mapping files: 100%|██████████| 3/3 [05:57<00:00, 119.10s/it]


In [18]:
######################################
########## Cell contexts #############
######################################

In [19]:
# cell contexts: cells by PCs (20)
# the file lives in the shared input directory, so build the path from input_files_dir
# instead of repeating the absolute path
C_file = input_files_dir+"20PCs.csv"
C = pd.read_csv(C_file, index_col = 0)
C = xr.DataArray(C.values, dims=["cell", "pc"], coords={"cell": C.index.values, "pc": C.columns.values})
# reorder / subset cells to match the rows of the sample mapping
C = C.sel(cell=sample_mapping["phenotype_sample_id"].values)
assert all(C.cell.values == sample_mapping["phenotype_sample_id"].values)

In [20]:
# quantile normalise cell contexts
# NOTE(review): presumably applied to each PC separately -- confirm the default axis of quantile_gaussianize
# the un-normalised C is kept as well; both are used further down
C_gauss = quantile_gaussianize(C)

In [21]:
#####################################
############ Phenotypes #############
#####################################

In [22]:
# Phenotype (single-cell expression), stored as a pickled table of traits (rows) by cells (columns)
# NOTE: read_pickle can execute arbitrary code; only use it on files you produced yourself
phenotype_file = os.path.join(input_files_dir, "phenotype.csv.pkl")
phenotype = pd.read_pickle(phenotype_file)
print("Phenotype shape BEFORE selection: {}".format(phenotype.shape))
phenotype = xr.DataArray(
    phenotype.values,
    dims=["trait", "cell"],
    coords={"trait": phenotype.index.values, "cell": phenotype.columns.values},
)
# reorder / subset cells to match the rows of the sample mapping
mapped_cell_ids = sample_mapping["phenotype_sample_id"].values
phenotype = phenotype.sel(cell=mapped_cell_ids)
print("Phenotype shape AFTER selection: {}".format(phenotype.shape))
assert all(phenotype.cell.values == mapped_cell_ids)

Phenotype shape BEFORE selection: (11231, 34256)
Phenotype shape AFTER selection: (11231, 33964)


In [23]:
#####################################
############ Filter file ############
#####################################

In [24]:
# filter file (columns: snp_id, feature, stage)
endo_eqtl_file = input_files_dir+"endodiff_eqtl_allconditions_FDR10pct.csv"
endo_eqtl = pd.read_csv(endo_eqtl_file, index_col = False)
# chromosome = the part of snp_id before the first "_" (e.g. "5_149826526_C_T" -> 5)
# splitting avoids str.find returning -1 (and silently dropping the last character)
# when an ID contains no underscore
endo_eqtl["chrom"] = endo_eqtl["snp_id"].str.split("_", n=1).str[0].astype("int64")
endo_eqtl.head(2)

Unnamed: 0,snp_id,feature,stage,chrom
0,5_149826526_C_T,ENSG00000164587_RPS14,ips,5
1,11_57283988_C_T,ENSG00000134809_TIMM10,ips,11


In [70]:
# chromosome to analyse in this run
chrom = 11
# eGenes that have at least one entry in the filter file on that chromosome
is_on_chrom = endo_eqtl["chrom"] == int(chrom)
genes = endo_eqtl.loc[is_on_chrom, "feature"].unique()

In [71]:
# number of eGenes selected for this chromosome
len(genes)

178

In [72]:
# output directory: one "<gene>.tsv" results file per gene is written here
outdir = "/hps/nobackup/stegle/users/acuomo/all_scripts/struct_LMM2/sc_endodiff/debug_May2021/REVISION/CRM_int_PCA/"

In [77]:
# inspect the selected eGenes (names have the form "<Ensembl gene ID>_<gene symbol>")
genes

array(['ENSG00000134809_TIMM10', 'ENSG00000196655_TRAPPC4',
       'ENSG00000142089_IFITM3', 'ENSG00000171202_TMEM126A',
       'ENSG00000166441_RPL27A', 'ENSG00000085063_CD59',
       'ENSG00000166435_XRRA1', 'ENSG00000149089_APIP',
       'ENSG00000182919_C11orf54', 'ENSG00000166452_AKIP1',
       'ENSG00000177042_TMEM80', 'ENSG00000223756_TSSC2',
       'ENSG00000177951_BET1L', 'ENSG00000189398_OR7E12P',
       'ENSG00000110218_PANX1', 'ENSG00000173113_TRMT112',
       'ENSG00000159063_ALG8', 'ENSG00000172500_FIBP',
       'ENSG00000166402_TUB', 'ENSG00000134910_STT3A',
       'ENSG00000185885_IFITM1', 'ENSG00000162174_ASRGL1',
       'ENSG00000085733_CTTN', 'ENSG00000171067_C11orf24',
       'ENSG00000149328_GLB1L2', 'ENSG00000172922_RNASEH2C',
       'ENSG00000167311_ART5', 'ENSG00000166261_ZNF202',
       'ENSG00000254772_EEF1G', 'ENSG00000021300_PLEKHB1',
       'ENSG00000175634_RPS6KB2', 'ENSG00000185201_IFITM2',
       'ENSG00000175575_PAAF1', 'ENSG00000158483_FAM86C1',
      

In [78]:
## Interaction test for every eGene in `genes`; one results file per gene is written to outdir.
## Genes whose results file already exists are skipped, so re-running resumes an interrupted run.
os.makedirs(outdir, exist_ok=True)  # make sure results can be written before fitting any model

## inputs that do not change across genes are built once
cell_donor_ids = sample_mapping["genotype_individual_id"].values  # donor ID for every row of sample_mapping
n_cells = phenotype.shape[1]
W = ones((n_cells, 1))  # single column of ones, passed as W to run_interaction
E_contexts = C_gauss.values[:,0:10]  # first 10 columns of the gaussianized contexts, used for both E and E1
## NOTE(review): E2 uses the first 20 columns of the raw contexts (C), not C_gauss as E/E1 do -- confirm this is intended
E2_contexts = C.values[:,0:20]

for gene_name in genes:
    outfilename = outdir + str(gene_name) + ".tsv"
    if os.path.exists(outfilename):
        print("File already exists, skipping: {}".format(outfilename))
        continue
    ## SNPs listed for this gene in the filter file
    leads = endo_eqtl[endo_eqtl['feature']==gene_name]['snp_id'].unique()
    G_sel = G[:,G['snp'].isin(leads)]
    ## expand genotypes from one row per donor to one row per cell
    G_expanded = G_sel.sel(sample=cell_donor_ids)
    assert all(hK_expanded.sample.values == G_expanded.sample.values)
    ## expression of this gene across cells, gaussianized, as a column vector
    y = phenotype.sel(trait=gene_name)
    y = quantile_gaussianize(y)
    y = y.values.reshape(y.shape[0],1)
    GG = G_expanded.values
    print("Running for gene {}".format(gene_name))
    ## element [0] of the value returned by run_interaction is stored as the per-variant p-values
    pvals = run_interaction(y=y, W=W, E=E_contexts, E1=E_contexts, E2=E2_contexts, G=GG, hK=hK_expanded)[0]
    pv = pd.DataFrame({"chrom":G_expanded.chrom.values,
           "pv":pvals,
           "variant":G_expanded.snp.values})
    ## NOTE(review): written comma-separated (to_csv default) despite the .tsv extension;
    ## left unchanged so new files stay consistent with the ones already on disk
    pv.to_csv(outfilename)

File already exists, exiting
File already exists, exiting
File already exists, exiting
File already exists, exiting
File already exists, exiting
File already exists, exiting
File already exists, exiting
File already exists, exiting
File already exists, exiting
File already exists, exiting
File already exists, exiting
File already exists, exiting
File already exists, exiting
File already exists, exiting
File already exists, exiting
File already exists, exiting
File already exists, exiting
File already exists, exiting
File already exists, exiting
File already exists, exiting
File already exists, exiting
File already exists, exiting
File already exists, exiting
File already exists, exiting
File already exists, exiting
File already exists, exiting
File already exists, exiting
File already exists, exiting
File already exists, exiting
File already exists, exiting
File already exists, exiting
File already exists, exiting
File already exists, exiting
File already exists, exiting
File already e