In [1]:
import re
import pandas as pd
import xarray as xr
from numpy import ones
from numpy.linalg import cholesky
from pandas_plink import read_plink1_bin
from limix.qc import quantile_gaussianize

In [2]:
import cellregmap 
cellregmap 

<module 'cellregmap' from '/hps/nobackup/stegle/users/acuomo/git_repos/CellRegMap/cellregmap/__init__.py'>

In [3]:
from cellregmap import run_association, run_association0

In [4]:
# Directory holding the input files used below (sample mapping, eQTL filter list, phenotype pickle).
input_files_dir = "/hps/nobackup/stegle/users/acuomo/all_scripts/struct_LMM2/sc_endodiff/new/input_files/"

In [5]:
## Sample mapping file: one row per cell, giving the donor that cell belongs to.
## Only donors we have single-cell data for are listed (a subset of all HipSci donors).
## Both ID columns are read as strings.
sample_mapping_file = input_files_dir + "sample_mapping_file.csv"
sample_mapping = pd.read_csv(
    sample_mapping_file,
    dtype={
        "genotype_individual_id": str,
        "phenotype_sample_id": str,
    },
)

In [6]:
## genotype_individual_id: donor IDs, as used in the genotype matrix (G) and the GRM / kinship covariance (K)
## phenotype_sample_id: cell IDs, as used in the scRNA-seq phenotype vector (y) and the cell context covariance (C)
sample_mapping.head()

Unnamed: 0,genotype_individual_id,phenotype_sample_id
0,HPSI0114i-joxm_1,21843_1#10
1,HPSI0314i-fafq_1,21843_1#100
2,HPSI0314i-fafq_1,21843_1#101
3,HPSI1013i-wuye_2,21843_1#102
4,HPSI0114i-joxm_1,21843_1#103


In [7]:
## Unique donor IDs found in the sample mapping, sorted in place
donors = sample_mapping.genotype_individual_id.unique()
donors.sort()
print("Number of unique donors: {}".format(len(donors)))

Number of unique donors: 126


In [8]:
############################################
################ Kinship matrix ############
############################################

In [9]:
## Load the GRM (genotype relationship matrix, i.e. kinship matrix): donors x donors
kinship_folder = "/hps/nobackup/hipsci/scratch/genotypes/imputed/2017-03-27/Full_Filtered_SNPs_Plink-F/"
kinship_file = kinship_folder + "hipsci.wec.gtarray.HumanCoreExome.imputed_phased.20170327.genotypes.norm.renamed.kinship"
K = pd.read_csv(kinship_file, index_col=0, sep="\t")
# Row and column labels must be the same donors in the same order.
# NOTE: this compares labels only; it does not check that the values are symmetric.
assert (K.columns == K.index).all()

In [10]:
# Wrap the kinship DataFrame in a labelled DataArray and order both axes by donor ID.
K = xr.DataArray(
    K.values,
    dims=("sample_0", "sample_1"),
    coords={"sample_0": K.columns, "sample_1": K.index},
)
K = K.sortby("sample_0")
K = K.sortby("sample_1")
# Keep only donors present in both the kinship matrix and the sample mapping (sorted list).
donors = sorted(set(K.sample_0.values).intersection(donors))
print("Number of donors after kinship intersection: {}".format(len(donors)))

Number of donors after kinship intersection: 125


In [11]:
## Restrict both axes of K to the retained donors, in `donors` order
K = K.sel({"sample_0": donors, "sample_1": donors})
assert all(K["sample_0"] == donors)
assert all(K["sample_1"] == donors)

In [12]:
## Cholesky factor of K, so that K = hK @ hK.T; rows stay labelled by donor
hK = xr.DataArray(
    cholesky(K.values),
    dims=("sample", "col"),
    coords={"sample": K.sample_0.values},
)
assert (hK.sample.values == K.sample_0.values).all()

In [13]:
# K is no longer needed once hK has been computed
del K
print("Sample mapping number of rows BEFORE intersection: {}".format(len(sample_mapping)))
## keep only the cells whose donor is in the kinship matrix
sample_mapping = sample_mapping.loc[sample_mapping["genotype_individual_id"].isin(donors)]
print("Sample mapping number of rows AFTER intersection: {}".format(len(sample_mapping)))

Sample mapping number of rows BEFORE intersection: 34256
Sample mapping number of rows AFTER intersection: 33964


In [14]:
############################################
##### expand from donors to cells ##########

In [15]:
## Expand hK from one row per donor to one row per cell:
## selecting by the (repeated) donor ID of every cell repeats the donor's row.
hK_expanded = hK.sel({"sample": sample_mapping["genotype_individual_id"].values})
assert (hK_expanded.sample.values == sample_mapping["genotype_individual_id"].values).all()

In [16]:
######################################
############### Genotypes ############
######################################

In [17]:
## read in genotype file (plink format)
## NOTE: the recorded run of this cell took close to 6 minutes (see the progress bar output).
plink_folder = "/hps/nobackup/hipsci/scratch/genotypes/imputed/2017-03-27/Full_Filtered_SNPs_Plink/"
plink_file = plink_folder+"hipsci.wec.gtarray.HumanCoreExome.imputed_phased.20170327.genotypes.norm.renamed.bed"
G = read_plink1_bin(plink_file)

Mapping files: 100%|██████████| 3/3 [05:49<00:00, 116.52s/it]


In [18]:
#############################
###### SNP selection

In [19]:
# filter file (columns: snp_id, feature, stage)
endo_eqtl_file = input_files_dir+"endodiff_eqtl_allconditions_FDR10pct.csv"
endo_eqtl = pd.read_csv(endo_eqtl_file, index_col = False)
# The chromosome is the part of snp_id before the first "_" (e.g. "5_149826526_C_T" -> 5).
# Splitting (rather than slicing up to str.find) means an ID without any "_" is parsed as a
# whole instead of silently losing its last character.
endo_eqtl["chrom"] = endo_eqtl["snp_id"].str.split("_", n=1).str[0].astype(int)
endo_eqtl.head(2)

Unnamed: 0,snp_id,feature,stage,chrom
0,5_149826526_C_T,ENSG00000164587_RPS14,ips,5
1,11_57283988_C_T,ENSG00000134809_TIMM10,ips,11


In [20]:
chrom = 22
# eGenes (unique features) that have an eQTL on that chromosome
genes = endo_eqtl.loc[endo_eqtl["chrom"] == int(chrom), "feature"].unique()

In [21]:
#########################################################
# cis window around a specific gene (discovery)

In [35]:
def cis_snp_selection(feature_id, annotation_df, G, window_size):
    """Select the variants of ``G`` that fall in the cis window of one feature.

    Parameters
    ----------
    feature_id : str
        Identifier looked up in ``annotation_df`` under the name ``feature_id``.
    annotation_df : pandas.DataFrame
        Annotation table with ``chromosome``, ``start`` and ``end`` columns and
        ``feature_id`` as index name (or column).
    G : object
        Genotypes exposing per-variant ``chrom`` and ``pos`` plus a
        ``where(cond, drop=True)`` method (in this notebook, the array returned
        by ``read_plink1_bin``).
    window_size : int
        Number of base pairs added on each side of the feature.

    Returns
    -------
    The result of ``G.where(mask, drop=True)``, where ``mask`` keeps variants on the
    feature's chromosome whose position is strictly between
    ``min(start, end) - window_size`` and ``max(start, end) + window_size``.

    Raises
    ------
    ValueError
        If ``feature_id`` does not match exactly one annotation row.
    """
    # `@feature_id` binds the Python value directly, so the ID is never spliced into the
    # expression text (an ID containing a quote cannot break the query).
    matches = annotation_df.query("feature_id == @feature_id")
    if len(matches) != 1:
        raise ValueError(
            "expected exactly one annotation row for feature_id {!r}, found {}".format(
                feature_id, len(matches)
            )
        )
    feature = matches.iloc[0]
    chrom = str(feature["chromosome"])
    # robust to features annotated back-to-front (start > end)
    lowest, highest = sorted((feature["start"], feature["end"]))
    in_cis_window = (
        (G.chrom == chrom)
        & (G.pos > (lowest - window_size))
        & (G.pos < (highest + window_size))
    )
    return G.where(in_cis_window, drop=True)

In [81]:
# (1) gene name; the feature ID is the part before the first underscore
gene_name = genes[0]
trait_name = re.sub(pattern="_.*", repl="", string=gene_name)
trait_name

'ENSG00000100058'

In [82]:
# (2) annotation linking gene to genomic position
# The first column of the file becomes the index (shown as `feature_id` in the output below);
# the remaining columns are chromosome, start and end.
annotation_file = "/hps/nobackup/hipsci/scratch/processed_data/rna_seq/annotationFiles/Ensembl_75_Limix_Annotation_FC_Gene.txt"
anno_df = pd.read_csv(annotation_file, sep="\t", index_col=0)
anno_df.head(2)

Unnamed: 0_level_0,chromosome,start,end
feature_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ENSG00000271782,1,50902700,50902978
ENSG00000232753,1,103817769,103828355


In [83]:
# (3) cis window size, passed as `window_size` to cis_snp_selection
w = 100_000

In [84]:
# Genotypes restricted to the cis window around the selected gene
G_sel = cis_snp_selection(
    feature_id=trait_name,
    annotation_df=anno_df,
    G=G,
    window_size=w,
)

In [85]:
# Shape of the cis-window selection (the recorded output is (1610, 1078)).
G_sel.shape

(1610, 1078)

In [86]:
############################################
##### expand from donors to cells ##########

In [87]:
# Expand genotypes from donors to cells (this also selects the relevant donors):
# one row per cell, taken from that cell's donor.
G_expanded = G_sel.sel({"sample": sample_mapping["genotype_individual_id"].values})
# rows must line up with the expanded kinship factor
assert (hK_expanded.sample.values == G_expanded.sample.values).all()

  return self.array[key]


In [88]:
# Shape after expansion; the first axis now has one entry per row of sample_mapping.
G_expanded.shape

(33964, 1078)

In [89]:
######################################
############## Phenotypes ############
######################################

In [90]:
# Phenotype (single-cell expression): traits x cells
phenotype_file = input_files_dir + "phenotype.csv.pkl"
phenotype = pd.read_pickle(phenotype_file)
print("Phenotype shape BEFORE selection: {}".format(phenotype.shape))
# Label both axes, then keep (and order) the cells listed in the sample mapping.
phenotype = xr.DataArray(
    phenotype.values,
    dims=("trait", "cell"),
    coords={"trait": phenotype.index.values, "cell": phenotype.columns.values},
).sel(cell=sample_mapping["phenotype_sample_id"].values)
print("Phenotype shape AFTER selection: {}".format(phenotype.shape))
assert (phenotype.cell.values == sample_mapping["phenotype_sample_id"].values).all()

Phenotype shape BEFORE selection: (11231, 34256)
Phenotype shape AFTER selection: (11231, 33964)


In [91]:
# expression of the selected gene across cells, quantile-normalised
y = quantile_gaussianize(phenotype.sel(trait=gene_name))

In [92]:
######################################
########## Cell contexts #############
######################################

In [93]:
# Cell contexts: cells x MOFA factors (20)
C_file = "/hps/nobackup/stegle/users/acuomo/all_scripts/struct_LMM2/sc_endodiff/debug_May2021/mofa_logcounts_model_factors.csv"
C = pd.read_csv(C_file, index_col=0)
# Label both axes, then keep (and order) the cells listed in the sample mapping.
C = xr.DataArray(
    C.values,
    dims=("cell", "pc"),
    coords={"cell": C.index.values, "pc": C.columns.values},
).sel(cell=sample_mapping["phenotype_sample_id"].values)
assert (C.cell.values == sample_mapping["phenotype_sample_id"].values).all()

In [94]:
# quantile normalise cell contexts
# NOTE: this overwrites C, so re-running the cell applies the transform to its own output.
C = quantile_gaussianize(C)

In [95]:
######################################
############ Covariates ##############
######################################

In [96]:
# Covariates: a single column of ones, one row per cell
n_cells = phenotype.sizes["cell"]
W = ones(shape=(n_cells, 1))

In [97]:
# Turn y into a plain (n_cells, 1) column array.
# getattr keeps the cell idempotent: on a second run y is already an ndarray (no `.values`),
# and reshaping it again leaves it unchanged instead of raising AttributeError.
y = getattr(y, "values", y).reshape(-1, 1)
y.shape

(33964, 1)

In [98]:
# Display the covariate matrix (a column of ones).
W

array([[1.],
       [1.],
       [1.],
       ...,
       [1.],
       [1.],
       [1.]])

In [99]:
# Inspect the first 10 context columns passed to the association tests below.
# NOTE: only the last expression of a cell is displayed, so the `.shape` line shows nothing.
C.values[:,0:10].shape
C.values[:,0:10]

array([[-0.22698835,  0.82917178,  0.23411244, ...,  0.07390445,
         0.30870685, -0.66988754],
       [ 0.11928532,  2.94980516,  0.8196344 , ..., -0.3224365 ,
         1.85295159, -0.70238707],
       [ 0.05970302,  1.86963924,  0.02166209, ...,  0.1331965 ,
         1.00717123,  0.75346536],
       ...,
       [-0.39533548,  0.72850657, -1.20369879, ..., -0.20673893,
         0.31816383, -0.3551869 ],
       [-0.26984302,  1.04975257, -0.57387367, ...,  0.13543057,
         0.58636113, -0.12188718],
       [-0.38132928,  1.2401305 , -0.68501989, ...,  0.33271513,
         0.47799852,  0.28748933]])

In [100]:
# Plain array of the expanded genotypes (cells x variants), as passed to run_association* below.
GG = G_expanded.values

In [101]:
# Shape of the genotype array (same as G_expanded.shape).
GG.shape

(33964, 1078)

In [102]:
# Display the (truncated) genotype array.
GG

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [103]:
# Output directory for the example CSV files written below.
revision_folder = "/hps/nobackup/stegle/users/acuomo/all_scripts/struct_LMM2/sc_endodiff/debug_May2021/REVISION/"

In [104]:
# Phenotype column as a one-column frame named after the gene
y_df = pd.DataFrame(data=y, columns=[str(gene_name)])
y_df.head(2)

Unnamed: 0,ENSG00000100058_CRYBB2P1
0,-0.312579
1,-1.686282


In [105]:
# Genotypes as a frame with one column per SNP ID
G_df = pd.DataFrame(data=GG, columns=G_expanded.snp.values)
G_df.head(2)

Unnamed: 0,22_25744472_C_T,22_25744915_T_C,22_25745441_G_A,22_25746593_C_T,22_25746647_C_T,22_25746766_C_G,22_25746838_C_G,22_25747123_G_A,22_25747160_G_A,22_25747716_A_G,...,22_26012979_G_C,22_26012999_A_G,22_26013350_C_T,22_26014158_G_A,22_26014175_C_T,22_26014831_C_T,22_26015423_A_G,22_26015480_T_C,22_26015591_C_T,22_26016746_A_G
0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,2.0,2.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [79]:
# C_df = pd.DataFrame(C.values[:,0:10], columns = ["MOFA"+str(i+1) for i in range(10)])
# C_df.head()

Unnamed: 0,MOFA1,MOFA2,MOFA3,MOFA4,MOFA5,MOFA6,MOFA7,MOFA8,MOFA9,MOFA10
0,-0.226988,0.829172,0.234112,1.343998,0.594003,0.309558,-0.434996,0.073904,0.308707,-0.669888
1,0.119285,2.949805,0.819634,1.203699,-0.361482,-0.07383,0.951731,-0.322436,1.852952,-0.702387
2,0.059703,1.869639,0.021662,0.55785,0.058151,0.199431,0.332403,0.133196,1.007171,0.753465
3,-0.328117,0.882065,0.402768,0.783509,1.675973,0.232444,-0.355737,-0.153478,0.104952,0.090492
4,-0.024615,1.308609,-0.053124,0.366056,0.676553,-0.250452,-0.389278,-0.172923,0.076347,-0.444426


In [106]:
# Write the example inputs for the selected gene.
# File names are built from trait_name (the gene ID) rather than a hard-coded ID, so the
# export cannot be mislabelled when a different gene is selected above.
y_df.to_csv(revision_folder + "example_y_{}.csv".format(trait_name))
G_df.to_csv(revision_folder + "example_G_{}.csv".format(trait_name))
# C_df.to_csv(revision_folder+"example_C.csv")

In [56]:
import time

In [57]:
# Time run_association0 on the first 10 contexts and first 10 variants.
# perf_counter() is a monotonic clock meant for measuring elapsed time; time.time() is
# wall-clock and can jump if the system clock is adjusted.
start_time = time.perf_counter()
pvals0 = run_association0(y, W, C.values[:,0:10], G=GG[:,0:10], hK=hK_expanded)[0]
print("--- %s seconds ---" % (time.perf_counter() - start_time))

--- 14.400843620300293 seconds ---


In [58]:
# Time run_association on the first 10 contexts and first 10 variants.
# perf_counter() is a monotonic clock meant for measuring elapsed time; time.time() is
# wall-clock and can jump if the system clock is adjusted.
start_time = time.perf_counter()
pvals = run_association(y, W, C.values[:,0:10], G=GG[:,0:10], hK=hK_expanded)[0]
print("--- %s seconds ---" % (time.perf_counter() - start_time))

100%|██████████| 10/10 [00:13<00:00,  1.36s/it]

--- 27.94093656539917 seconds ---





In [52]:
# P-values from run_association0 for the first 10 variants, with chromosome and SNP ID.
# NOTE(review): the recorded output lists chr19 variants and this cell's execution count
# precedes the cells above, so the output looks stale - re-run to confirm.
pv0 = pd.DataFrame(dict(
    chrom=G_expanded[:, :10].chrom.values,
    pv=pvals0,
    variant=G_expanded[:, :10].snp.values,
))
pv0.head()

Unnamed: 0,chrom,pv,variant
0,19,0.000113,19_57847904_C_T
1,19,0.392785,19_57847915_G_A
2,19,0.017736,19_57848002_C_T
3,19,0.000235,19_57848207_T_C
4,19,0.017736,19_57848474_G_T


In [53]:
# P-values from run_association for the first 10 variants, with chromosome and SNP ID.
# NOTE(review): the recorded output lists chr19 variants and this cell's execution count
# precedes the cells above, so the output looks stale - re-run to confirm.
pv = pd.DataFrame(dict(
    chrom=G_expanded[:, :10].chrom.values,
    pv=pvals,
    variant=G_expanded[:, :10].snp.values,
))
pv.head()

Unnamed: 0,chrom,pv,variant
0,19,7.4e-05,19_57847904_C_T
1,19,0.392059,19_57847915_G_A
2,19,0.016497,19_57848002_C_T
3,19,0.000166,19_57848207_T_C
4,19,0.016497,19_57848474_G_T
