In [1]:
import re
import os
import sys
import time
import pandas as pd
import xarray as xr
from numpy import ones, random
from numpy.linalg import cholesky
from pandas_plink import read_plink1_bin
from limix.qc import quantile_gaussianize

In [2]:
from cellregmap import run_interaction

In [3]:
# NOTE(review): interactive help lookup left over from development; it has no effect on the analysis
?run_interaction

In [64]:
# position in the `genes` array of the gene tested in this run
i = 2
# seeds the genotype permutation; also appears in the output file name and is the row offset used when picking SNPs
seed = 0

In [65]:
# root folder of the revision analysis; the filter file and the results folder are located under it
revision_folder = "/hps/nobackup/stegle/users/acuomo/all_scripts/struct_LMM2/sc_endodiff/debug_May2021/REVISION/"

In [66]:
# filter file listing the SNP-gene pairs to test (columns: feature, snpID, chrom)
fvf_filename = f"{revision_folder}/CRM_interaction_chr22/fvf.csv"
fvf = pd.read_csv(fvf_filename, index_col=0)
fvf.head()

Unnamed: 0,feature,snpID,chrom
1,ENSG00000100206_DMC1,22_38815543_G_T,22
2,ENSG00000100206_DMC1,22_38815767_C_T,22
3,ENSG00000100206_DMC1,22_38816059_T_C,22
4,ENSG00000100206_DMC1,22_38816351_G_A,22
5,ENSG00000100206_DMC1,22_38817047_A_G,22


In [67]:
# distinct genes present in the filter file
genes = fvf.loc[:, "feature"].unique()
genes

array(['ENSG00000100206_DMC1', 'ENSG00000182858_ALG12',
       'ENSG00000100243_CYB5R3', 'ENSG00000099974_DDTL',
       'ENSG00000128340_RAC2', 'ENSG00000240972_MIF',
       'ENSG00000100034_PPM1F', 'ENSG00000130638_ATXN10',
       'ENSG00000198355_PIM3', 'ENSG00000099904_ZDHHC8',
       'ENSG00000100029_PES1', 'ENSG00000133466_C1QTNF6',
       'ENSG00000186716_BCR', 'ENSG00000100263_RHBDD3',
       'ENSG00000099889_ARVCF', 'ENSG00000100151_PICK1',
       'ENSG00000099949_LZTR1', 'ENSG00000184674_GSTT1',
       'ENSG00000099958_DERL3', 'ENSG00000099977_DDT',
       'ENSG00000185252_ZNF74', 'ENSG00000100014_SPECC1L',
       'ENSG00000128191_DGCR8', 'ENSG00000100350_FOXRED2',
       'ENSG00000239713_APOBEC3G', 'ENSG00000128309_MPST',
       'ENSG00000100099_HPS4', 'ENSG00000100033_PRODH',
       'ENSG00000188130_MAPK12', 'ENSG00000184164_CRELD2',
       'ENSG00000100410_PHF5A', 'ENSG00000100056_DGCR14',
       'ENSG00000100354_TNRC6B', 'ENSG00000205853_RFPL3S',
       'ENSG00000100225_FB

In [68]:
# gene tested in this run; the trait name drops everything from the first underscore onwards
gene_name = genes[i]
trait_name = re.sub("_.*", "", gene_name)
print(gene_name, trait_name, sep="\n")

ENSG00000100243_CYB5R3
ENSG00000100243


In [69]:
# rows of the filter file that belong to the selected gene
fvf_gene = fvf.loc[fvf["feature"].eq(gene_name)]
fvf_gene

Unnamed: 0,feature,snpID,chrom
735,ENSG00000100243_CYB5R3,22_42914931_T_C,22
736,ENSG00000100243_CYB5R3,22_42914937_A_G,22
737,ENSG00000100243_CYB5R3,22_42914974_T_C,22
738,ENSG00000100243_CYB5R3,22_42915479_G_A,22
739,ENSG00000100243_CYB5R3,22_42916215_C_G,22
...,...,...,...
1416,ENSG00000100243_CYB5R3,22_43144054_G_C,22
1417,ENSG00000100243_CYB5R3,22_43144502_A_G,22
1418,ENSG00000100243_CYB5R3,22_43144771_G_C,22
1419,ENSG00000100243_CYB5R3,22_43145421_G_C,22


In [70]:
# results for this gene and seed are written to <folder><trait_name>_<seed>.tsv
folder = f"{revision_folder}CRM_interaction_chr22/results_permG/"
outfilename = folder + trait_name + "_" + str(seed) + ".tsv"
print(outfilename)

/hps/nobackup/stegle/users/acuomo/all_scripts/struct_LMM2/sc_endodiff/debug_May2021/REVISION/CRM_interaction_chr22/results_permG/ENSG00000100243_0.tsv


In [71]:
# Stop here when results for this gene/seed already exist, so the steps below
# are not repeated and the existing results file is not overwritten.
# sys.exit raises SystemExit: it halts "Run All" in a notebook and ends the
# process when the same code is run as a script.
if os.path.exists(outfilename):
    print("File already exists, exiting")
    sys.exit(0)

In [72]:
# input files directory (the sample mapping file and the phenotype pickle are read from here)
input_files_dir = "/hps/nobackup/stegle/users/acuomo/all_scripts/struct_LMM2/sc_endodiff/new/input_files/"

In [73]:
############################################
########## Sample mapping file #############
############################################

In [74]:
## cell -> donor mapping file
## it only includes donors we have single-cell data for (a subset of all of HipSci donors)
##   genotype_individual_id: donor IDs, as found in the genotype matrix (G) and GRM covariance (K)
##   phenotype_sample_id: cell IDs, as found in the scRNA-seq phenotype vector (y) and cell context covariance (C)
sample_mapping_file = f"{input_files_dir}sample_mapping_file.csv"
sample_mapping = pd.read_csv(
    sample_mapping_file,
    # read both ID columns as strings
    dtype={"genotype_individual_id": str, "phenotype_sample_id": str},
)
sample_mapping.head()

Unnamed: 0,genotype_individual_id,phenotype_sample_id
0,HPSI0114i-joxm_1,21843_1#10
1,HPSI0314i-fafq_1,21843_1#100
2,HPSI0314i-fafq_1,21843_1#101
3,HPSI1013i-wuye_2,21843_1#102
4,HPSI0114i-joxm_1,21843_1#103


In [75]:
## distinct donor IDs from the sample mapping, sorted in place
donors = sample_mapping.loc[:, "genotype_individual_id"].unique()
donors.sort()
print(f"Number of unique donors: {len(donors)}")

Number of unique donors: 126


In [76]:
############################################
############# Kinship matrix ###############
############################################

In [77]:
## read in GRM (genotype relationship matrix; kinship matrix), donors x donors
kinship_folder = "/hps/nobackup/hipsci/scratch/genotypes/imputed/2017-03-27/Full_Filtered_SNPs_Plink-F/"
kinship_file = f"{kinship_folder}hipsci.wec.gtarray.HumanCoreExome.imputed_phased.20170327.genotypes.norm.renamed.kinship"
K = pd.read_csv(kinship_file, sep="\t", index_col=0)
# row and column labels must list the same donors in the same order
# (this checks the labels only, not that the values are symmetric)
assert (K.columns == K.index).all()

In [78]:
## wrap the kinship matrix as a labelled array and order both axes by donor ID
K = xr.DataArray(
    K.values,
    dims=["sample_0", "sample_1"],
    coords={"sample_0": K.columns, "sample_1": K.index},
)
K = K.sortby("sample_0")
K = K.sortby("sample_1")
## keep only the donors that also have an entry in the kinship matrix
donors = sorted(set(K.sample_0.values).intersection(donors))
print(f"Number of donors after kinship intersection: {len(donors)}")

Number of donors after kinship intersection: 125


In [79]:
## subset both axes to the relevant donors, then check the labels of each axis
K = K.sel(sample_0=donors, sample_1=donors)
for axis_name in ("sample_0", "sample_1"):
    assert all(K[axis_name] == donors)

In [80]:
## Cholesky factor of the kinship matrix, such that K = hK @ hK.T
hK = xr.DataArray(
    cholesky(K.values),
    dims=["sample", "col"],
    coords={"sample": K.sample_0.values},
)
assert all(hK.sample.values == K.sample_0.values)

# drop the donor x donor matrix now that its factor is stored in hK
del K
print(f"Sample mapping number of rows BEFORE intersection: {sample_mapping.shape[0]}")
## keep only the cells whose donor is in the kinship matrix
sample_mapping = sample_mapping.loc[sample_mapping["genotype_individual_id"].isin(donors)]
print(f"Sample mapping number of rows AFTER intersection: {sample_mapping.shape[0]}")

Sample mapping number of rows BEFORE intersection: 34256
Sample mapping number of rows AFTER intersection: 33964


In [81]:
############################################
##### expand from donors to cells ##########

## one row of hK per cell, looked up by each cell's donor ID from the sample mapping file
cell_donor_ids = sample_mapping["genotype_individual_id"].values
hK_expanded = hK.sel(sample=cell_donor_ids)
assert all(hK_expanded.sample.values == cell_donor_ids)

In [82]:
#####################################
############ Genotypes ##############
#####################################

## read in genotype file (plink format); only the path of the .bed file is passed
plink_folder = "/hps/nobackup/hipsci/scratch/genotypes/imputed/2017-03-27/Full_Filtered_SNPs_Plink/"
plink_file = f"{plink_folder}hipsci.wec.gtarray.HumanCoreExome.imputed_phased.20170327.genotypes.norm.renamed.bed"
G = read_plink1_bin(plink_file)

Mapping files: 100%|██████████| 3/3 [06:12<00:00, 124.15s/it]


In [83]:
######################################
########## Cell contexts #############
######################################

# cells by MOFA factors (20)
C_file = "/hps/nobackup/stegle/users/acuomo/all_scripts/struct_LMM2/sc_endodiff/debug_May2021/mofa_logcounts_model_factors.csv"
C = pd.read_csv(C_file, index_col=0)
C = xr.DataArray(
    C.values,
    dims=["cell", "pc"],
    coords={"cell": C.index.values, "pc": C.columns.values},
)
# keep the cells listed in the sample mapping, in the sample mapping's order
C = C.sel(cell=sample_mapping["phenotype_sample_id"].values)
assert all(C.cell.values == sample_mapping["phenotype_sample_id"].values)

# quantile normalise cell contexts
C_gauss = quantile_gaussianize(C)

In [84]:
#####################################
############ Phenotypes #############
#####################################

# Phenotype (single-cell expression), traits x cells
phenotype_file = f"{input_files_dir}phenotype.csv.pkl"
phenotype = pd.read_pickle(phenotype_file)
print(f"Phenotype shape BEFORE selection: {phenotype.shape}")
phenotype = xr.DataArray(
    phenotype.values,
    dims=["trait", "cell"],
    coords={"trait": phenotype.index.values, "cell": phenotype.columns.values},
)
# keep the cells listed in the sample mapping, in the sample mapping's order
phenotype = phenotype.sel(cell=sample_mapping["phenotype_sample_id"].values)
print(f"Phenotype shape AFTER selection: {phenotype.shape}")
assert all(phenotype.cell.values == sample_mapping["phenotype_sample_id"].values)

Phenotype shape BEFORE selection: (11231, 34256)
Phenotype shape AFTER selection: (11231, 33964)


In [85]:
# up to two consecutive rows of this gene's SNP list, starting at row offset `seed`
fvf_sel = fvf_gene.iloc[seed : seed + 2]
fvf_sel

Unnamed: 0,feature,snpID,chrom
735,ENSG00000100243_CYB5R3,22_42914931_T_C,22
736,ENSG00000100243_CYB5R3,22_42914937_A_G,22


In [86]:
# SNP selection: keep the genotype columns whose SNP ID is listed for this gene
leads = fvf_sel.loc[fvf_sel["feature"] == gene_name, "snpID"].unique()
G_sel = G[:, G["snp"].isin(leads)]

In [87]:
#### to permute G, create shuffled index
# intended scheme:
#   step 1 - shuffle G across donors (prior to expanding)
#   step 2 - expand normally
# so that all cells from a given donor keep the same genotype, but it is the genotype of another donor
# what this cell builds: a seeded random permutation of the row positions of G_sel, labelled by sample ID,
# then one permuted position per cell, looked up through the sample mapping
# NOTE(review): the values in idx_G are row positions in the donor-level G_sel, whereas the genotype matrix
# later passed to run_interaction is the cell-level G_expanded - confirm how run_interaction applies idx_G
# NOTE(review): the permutation is over all rows of G_sel, which has not been subset to `donors` - confirm intended

rand = random.RandomState(int(seed))
idx = rand.permutation(G_sel.shape[0])
Idx = xr.DataArray(idx, dims=["sample"], coords = {"sample": G_sel.sample.values})
idx_G = Idx.sel(sample=sample_mapping["genotype_individual_id"].values)

In [88]:
############################################
##### expand from donors to cells ##########

# expand genotypes from donors to cells (and select relevant donors in the same step)
G_expanded = G_sel.sel(
    sample=sample_mapping["genotype_individual_id"].values,
)
# rows must line up with the expanded kinship factor
assert all(hK_expanded.sample.values == G_expanded.sample.values)

  return self.array[key]


In [89]:
# sanity check of the expanded genotype matrix (cells x selected SNPs)
print(G_expanded.shape)

(33964, 2)


In [90]:
# expression of the selected gene across cells
y = phenotype.sel(trait=gene_name)
# quantile normalise
y = quantile_gaussianize(y)
# turn into a single-column 2-D array
y = y.values.reshape(-1, 1)
print(y.shape)

(33964, 1)


In [91]:
######################################
############ Covariates ##############
######################################

# just an intercept in this case: a single column of ones, one row per cell
n_cells = phenotype.shape[1]
W = ones(shape=(n_cells, 1))

# unpack G into a plain array
GG = G_expanded.values

In [92]:
print(f"Running for gene {trait_name}")

# run association test using CellRegMap
#   E, E1: first 10 columns of the quantile-normalised cell contexts
#   E2:    first 20 columns of the cell contexts as loaded (C, not C_gauss)
# NOTE(review): confirm that E2 is meant to use the non-normalised contexts
crm_output = run_interaction(
    y=y,
    W=W,
    E=C_gauss.values[:, 0:10],
    E1=C_gauss.values[:, 0:10],
    E2=C.values[:, 0:20],
    G=GG,
    hK=hK_expanded,
    idx_G=idx_G,
)
# element 0 of the returned value is kept as the p-values
pvals = crm_output[0]

Running for gene ENSG00000100243


100%|██████████| 2/2 [10:43<00:00, 321.88s/it]


In [93]:
# results table: chromosome, p-value and variant ID, taken from G_expanded and pvals
pv = pd.DataFrame(
    dict(
        chrom=G_expanded.chrom.values,
        pv=pvals,
        variant=G_expanded.snp.values,
    )
)
pv.head()

Unnamed: 0,chrom,pv,variant
0,22,0.007616,22_42914931_T_C
1,22,0.007616,22_42914937_A_G


In [63]:
# NOTE(review): this cell repeats the results-table construction and carries execution count 63,
# out of sequence with the cells around it; its stored output lists different variants, so it
# comes from a different run - candidate for removal
pv = pd.DataFrame({"chrom":G_expanded.chrom.values,
               "pv":pvals,
               "variant":G_expanded.snp.values})
pv.head()

Unnamed: 0,chrom,pv,variant
0,22,0.002625,22_50195375_A_G
1,22,0.029408,22_50196343_G_A


In [33]:
# NOTE(review): this cell repeats the results-table construction and carries execution count 33,
# out of sequence with the cells around it; its stored output lists different variants, so it
# comes from a different run - candidate for removal
pv = pd.DataFrame({"chrom":G_expanded.chrom.values,
               "pv":pvals,
               "variant":G_expanded.snp.values})
pv.head()

Unnamed: 0,chrom,pv,variant
0,22,0.113902,22_38815543_G_T
1,22,0.02192,22_38815767_C_T


In [26]:
# write the results table to the output file
# NOTE(review): the file name ends in .tsv but to_csv is called without `sep`, so pandas' default
# comma separator is used - confirm what downstream readers expect before changing either
pv.to_csv(outfilename)


Mapping files:   0%|          | 0/3 [00:00<?, ?it/s][A

KeyboardInterrupt: 