In [9]:
import scanpy as sc
import pandas as pd
import xarray as xr

In [10]:
# Chromosome to analyse; it selects which per-chromosome expression file is read below.
chrom = 22

In [12]:
# Build the path to this chromosome's expression object and read it as AnnData.
my_file = f"/share/ScratchGeneral/anncuo/OneK1K/expression_objects/sce{chrom}.h5ad"
adata = sc.read(my_file)

# The raw matrix is stored sparse; densify it so it can be wrapped in labelled containers.
mat = adata.raw.X.todense()

# Transpose so rows follow adata.raw.var (the genes, per the notes further down)
# and columns follow adata.obs (one column per cell barcode).
mat_df = pd.DataFrame(
    data=mat.T,
    index=adata.raw.var.index,
    columns=adata.obs.index,
)

# Re-wrap as an xarray DataArray with named dimensions: "trait" (rows) x "cell" (columns).
phenotype = xr.DataArray(
    mat_df.values,
    dims=["trait", "cell"],
    coords={
        "trait": mat_df.index.values,
        "cell": mat_df.columns.values,
    },
)

In [13]:
# Display the expression DataArray built above (dims: "trait" x "cell").
phenotype

In [19]:
# Per-cell covariate table; the first CSV column (the cell barcode) becomes the row index.
cov_filename = "/share/ScratchGeneral/anncuo/OneK1K/covariates_new.csv"
cov_df = pd.read_csv(filepath_or_buffer=cov_filename, index_col=0)
# Peek at the first two rows to check the layout.
cov_df.head(n=2)

Unnamed: 0,orig.ident,nCount_RNA,nFeature_RNA,pool,individual,percent.mt,latent,nCount_SCT,nFeature_SCT,cell_type,...,sex,predicted.celltype,predicted.celltype.l3,predicted.celltype.l4,predicted.celltype.l5,predicted.celltype.l6,G2M.Score,S.Score,Phase,barcode
AAACCTGAGAATGTTG-1,onek1k,4754,1201,pool_1,691_692,1.451409,b1,3226,1186,CD4+ KLRB1+ T cell,...,1,CD4+ KLRB1+ T cell,CD4 TEM,CD4 T,T cell,Lymphoid,-0.024395,0.008655,S,AAACCTGAGAATGTTG-1
AAACCTGAGAGAACAG-1,onek1k,2459,896,pool_1,693_694,3.700691,b1,2710,896,XCL1- NK,...,2,XCL1- NK,NK,NK cell,NK cell,Lymphoid,-0.026514,-0.015411,G1,AAACCTGAGAGAACAG-1


In [20]:
# List the covariate columns available for each cell.
cov_df.columns

Index(['orig.ident', 'nCount_RNA', 'nFeature_RNA', 'pool', 'individual',
       'percent.mt', 'latent', 'nCount_SCT', 'nFeature_SCT', 'cell_type',
       'predicted.celltype.l2', 'predicted.celltype.l1', 'age', 'sex',
       'predicted.celltype', 'predicted.celltype.l3', 'predicted.celltype.l4',
       'predicted.celltype.l5', 'predicted.celltype.l6', 'G2M.Score',
       'S.Score', 'Phase', 'barcode'],
      dtype='object')

In [21]:
# Distinct level-2 predicted cell-type labels; note the output also contains NaN and 'Doublet'.
cov_df['predicted.celltype.l2'].unique()

array(['CD4 TEM', 'NK', 'CD4 Naive', 'CD8 TEM', 'B naive', 'CD8 TCM',
       'CD4 TCM', 'B intermediate', nan, 'CD8 Naive', 'NK_CD56bright',
       'B memory', 'CD16 Mono', 'MAIT', 'Treg', 'CD14 Mono', 'dnT',
       'Eryth', 'NK Proliferating', 'CD4 CTL', 'HSPC', 'Platelet',
       'Plasmablast', 'ILC', 'gdT', 'cDC2', 'pDC', 'ASDC',
       'CD4 Proliferating', 'CD8 Proliferating', 'cDC1', 'Doublet'],
      dtype=object)

In [24]:
#### Restrict the covariates to cells whose level-2 predicted label is 'B naive'
is_naive_Bcell = cov_df['predicted.celltype.l2'].eq('B naive')
cov_naive_Bcell = cov_df.loc[is_naive_Bcell]
# (number of selected cells, number of covariate columns)
cov_naive_Bcell.shape

(71374, 23)

In [25]:
# Preview the first rows of the naive B cell covariate subset.
cov_naive_Bcell.head()

Unnamed: 0,orig.ident,nCount_RNA,nFeature_RNA,pool,individual,percent.mt,latent,nCount_SCT,nFeature_SCT,cell_type,...,sex,predicted.celltype,predicted.celltype.l3,predicted.celltype.l4,predicted.celltype.l5,predicted.celltype.l6,G2M.Score,S.Score,Phase,barcode
AAACCTGCAACGATCT-1,onek1k,3847,884,pool_1,686_687,3.717182,b1,3085,883,TCL1A+ FCER2+ B cell,...,1,TCL1A+ FCER2+ B cell,B naive,B cell,B cell,Lymphoid,-0.026746,-0.000268,G1,AAACCTGCAACGATCT-1
AAACGGGCAGTTCATG-1,onek1k,3092,811,pool_1,682_683,4.010349,b1,2941,811,TCL1A+ FCER2+ B cell,...,1,TCL1A+ FCER2+ B cell,B naive,B cell,B cell,Lymphoid,-0.022354,0.018777,S,AAACGGGCAGTTCATG-1
AAACGGGGTCCCTACT-1,onek1k,3402,879,pool_1,692_693,2.704292,b1,3008,879,TCL1A+ FCER2+ B cell,...,2,TCL1A+ FCER2+ B cell,B naive,B cell,B cell,Lymphoid,0.01342,-0.004957,G2M,AAACGGGGTCCCTACT-1
AAACGGGGTTGCGCAC-1,onek1k,2194,544,pool_1,683_684,3.783045,b1,2714,544,TCL1A+ FCER2+ B cell,...,1,TCL1A+ FCER2+ B cell,B naive,B cell,B cell,Lymphoid,0.006212,0.010032,S,AAACGGGGTTGCGCAC-1
AAAGATGGTTATGCGT-1,onek1k,1608,603,pool_1,686_687,4.415423,b1,2530,607,TCL1A+ FCER2+ B cell,...,1,TCL1A+ FCER2+ B cell,B naive,B cell,B cell,Lymphoid,-0.011067,0.004024,S,AAAGATGGTTATGCGT-1


In [26]:
# Keep only the cells that belong to naive B cells, matching the "cell"
# coordinate against the barcodes recorded in the covariate subset.
naive_Bcell_barcodes = cov_naive_Bcell["barcode"].values
phenotype = phenotype.sel(cell=naive_Bcell_barcodes)

In [27]:
# still 600 genes, but "only" ~70k cells
# (the "cell" axis is now limited to the 71,374 naive B cell barcodes selected above)
phenotype

In [None]:
## We want to identify outlier donors
## Strategy 1:
### take sum / mean / median / variance across cells per donor
### split AGGREGATED expression of genes into quantiles (Z-scores?), and treat donors with outlier values as outlier donors

## Strategy 2:
### split SINGLE_CELL expression of genes in quantiles (Z-scores?), consider outlier values
### divide those counts by the total number of cells from that individual

In [None]:
## Marc's paper outlier detection procedure
# * Based on featureCount gene quantifications (log TPM), we considered autosomal protein-coding and long noncoding RNA genes. 
# * Cell lines from donors with predicted ancestry other than the European super-population were discarded
# * and we additionally limited the analysis to lines with paired-end RNA-seq data
# * Genes were filtered for minimal expression, defined as gene expression TPM > 0 in 50% or more in each study.
# * To adjust for transcriptome-wide confounders, PEER-correction was run on the filtered data (n = 50 PEER factors). 
# * The resulting residual expression profiles were scaled and centered (z-score normalization). 
# * Cell lines with an absolute expression z-score > 2 in > 100 genes were discarded from subsequent analyses. 
# * Finally, cell lines were retained if WGS data were available in addition to RNA-seq data

In [None]:
# * variants were filtered based on the variant quality score recalibration method using a tranche cutoff of 99%. 
# * The software vcfanno (v0.2.9) was used to annotate the WGS vcf file with MAF from gnomAD (v2.0.2) and 
# CADD score from CADD (v1.3). 
# * Variants were filtered on a per-sample level to retain variants with at least one alternate allele. 
# * Variants were then linked to genes using the bcftools (v1.11) window command, selecting a maximum distance of
# 10 kb based on the Ensembl 75 GTF reference. 
# * A separate file was produced for each cell line consisting of the following columns: 
# [cell line ID; gene ID; chromosome; position; gnomAD MAF; CADD (phred); and CADD (raw).]

In [None]:
# * We considered the subset of lines with both RNA-seq and WGS data available 
# * and focused on variants up to 10 kb upstream and downstream of the protein-coding and long noncoding RNA genes. 
# * Gene expression outliers for a given gene were defined as samples with a minimum gene expression z-score 
# (z-score < −2; underexpression outlier) or a maximum gene expression z-score (z-score > 2; overexpression outlier). 
# * Separate scores were computed for gene-level underexpression outliers and overexpression outliers. 
# * The reported enrichment score was calculated as the ratio of the proportion of outlier lines with variants across 
# several MAF/CADD bins compared to non-outlier lines. 
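# Specifically, enrichment here refers to the relative risk (RR):
# RR = (proportion of outlier lines carrying a variant) / (proportion of non-outlier lines carrying a variant)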

In [None]:
# * The analysis was performed separately for SNPs, indels and SVs, and across different 
# MAF bins (from common to rare) and CADD bins (progressively more deleterious variants). 
# * Genes were discarded if there was not at least 1 outlier and 1 non-outlier line matching MAF and CADD thresholds