## _In silico_ barnyard

**Inputs and Outputs**
- Inputs:
  - Harmonized and annotated short-read and long-read AnnData (raw, SCT)
- Outputs:
  - Figures
  - Tables of global and per-cluster DE and DS p-values for all genes.

In [None]:
import os
import sys

import matplotlib.pylab as plt
import numpy as np
import pandas as pd
from time import time
import logging
import pickle
from operator import itemgetter
import json, pprint
import tables 

import scanpy as sc
import anndata

from collections import defaultdict
from itertools import groupby
from operator import itemgetter
from typing import List, Dict, Union, Any

from time import time

# Font-size presets shared by every figure in this notebook.
SMALL_SIZE = 12
MEDIUM_SIZE = 14
BIGGER_SIZE = 16

# Apply the presets through matplotlib's rc mechanism, one rc group at a time.
for rc_group, rc_kwargs in (
    ('font', {'size': SMALL_SIZE}),            # default text size
    ('axes', {'titlesize': SMALL_SIZE}),       # axes title
    ('axes', {'labelsize': MEDIUM_SIZE}),      # x and y axis labels
    ('xtick', {'labelsize': SMALL_SIZE}),      # x tick labels
    ('ytick', {'labelsize': SMALL_SIZE}),      # y tick labels
    ('legend', {'fontsize': SMALL_SIZE}),      # legend text
    ('figure', {'titlesize': BIGGER_SIZE}),    # figure title
):
    plt.rc(rc_group, **rc_kwargs)

# Configure the root logger to pass INFO-and-above records.
logger = logging.getLogger()
logger.setLevel(logging.INFO)
# NOTE: despite its name, `log_info` emits at WARNING level, because it is
# bound to `logger.warning` (presumably so messages show up in notebook
# output without a handler being configured -- TODO confirm).
log_info = logger.warning

# Silence every Python warning for the rest of the session.
import warnings
warnings.filterwarnings("ignore")

# Scanpy figure defaults: 80 dpi on a white background.
sc.settings.set_figure_params(dpi=80, facecolor='white')

In [None]:
# Root directory from which the data paths below are built.
repo_root = '/home/jupyter/mb-ml-data-disk/MAS-seq-analysis'

# inputs
# `output_prefix` is embedded in the input file names and is also used in the
# names of the figures saved at the end of the notebook.
output_prefix = 'new_pipeline__revised_v2'
input_prefix = f'M132TS_both_{output_prefix}.harmonized.barnyard'
output_path = 'output/t-cell-vdj-cite-seq'

# NOTE: despite the "final_..." names, these two .h5ad paths are *read* by this
# notebook (they are passed to `sc.read`); nothing is written to them here.
# Both already include `repo_root`.
final_long_adata_raw_h5_path = os.path.join(repo_root, output_path, f'{input_prefix}.long.h5ad')
final_short_adata_raw_h5_path = os.path.join(repo_root, output_path, f'{input_prefix}.short.h5ad')

# constants
# Names of the `adata_long.var` columns that hold, for each transcript, the id
# and the name of its gene. An alternative pair of keys is kept commented out.
# GENE_IDS_KEY = 'gencode_overlap_gene_ids'
# GENE_NAMES_KEY = 'gencode_overlap_gene_names'

GENE_IDS_KEY = 'gene_ids'
GENE_NAMES_KEY = 'gene_names'

## Configuration

In [None]:
# Filtering thresholds used in the "Filtering" section.
# - `min_cells_per_transcript` is passed to `sc.pp.filter_genes(min_cells=...)`.
# - `min_cells_per_gene` is compared with the counts summed over all of a
#   gene's transcripts, i.e. it acts as a count threshold despite its name.
# With both at 0 the filters presumably keep everything; the trailing
# comments look like alternative values -- TODO confirm.
min_cells_per_transcript = 0 # 1
min_cells_per_gene = 0 # 50

# Name of the categorical `obs` column used to group cells.
group_cells_by_obs_key = 'leiden_crude'

## Preprocess

In [None]:
# Load the long-read AnnData from the .h5ad path configured above.
adata_long = sc.read(final_long_adata_raw_h5_path)

In [None]:
# Display a summary of the loaded long-read AnnData.
adata_long

In [None]:
# Sum of every entry of the long-read matrix, taken before any filtering;
# reused later to express the transcript threshold in TPM.
total_umis = adata_long.X.sum()
log_info(f'Total UMIs: {total_umis}')

## Filtering

In [None]:
# Remove lowly expressed genes, then transcripts detected in too few cells.
from collections import defaultdict

# Group the transcript (var) positions by the id of the gene they belong to.
gene_id_to_tx_indices_map = defaultdict(list)
for i, gid in enumerate(adata_long.var[GENE_IDS_KEY].values):
    gene_id_to_tx_indices_map[gid].append(i)

# Total count of each transcript, summed over all cells.
tx_counts_i = np.asarray(adata_long.X.sum(0)).flatten()

# Keep a gene when the counts summed over all of its transcripts reach the
# threshold. NOTE: despite its name, `min_cells_per_gene` is compared with a
# count total here, not with a number of cells.
included_gene_ids = [
    gid for gid, tx_indices in gene_id_to_tx_indices_map.items()
    if np.sum(tx_counts_i[tx_indices]) >= min_cells_per_gene]

# `Series.isin` works for any column dtype. The previous `.values.isin(...)`
# only worked when `.values` happened to be a pandas Categorical; a plain
# NumPy array has no `.isin` method.
keep_tx_mask = adata_long.var[GENE_IDS_KEY].isin(included_gene_ids).values
adata_long = adata_long[:, keep_tx_mask]

# remove transcripts that are detected in too few cells
sc.pp.filter_genes(adata_long, min_cells=min_cells_per_transcript)

# The same threshold expressed per million of the pre-filtering UMI total;
# it is only used in the log message below.
tpm_threshold = 1_000_000 * min_cells_per_transcript / total_umis

log_info(f'Removing isoforms with TPM < {tpm_threshold:.2f}')

In [None]:
# Display the long-read AnnData again, after filtering.
adata_long

In [None]:
# mapping from gene id to the positions of its transcripts (rebuilt because
# the filtering step changed the var axis)
from collections import defaultdict
gene_id_to_tx_indices_map = defaultdict(list)
for i, gid in enumerate(adata_long.var[GENE_IDS_KEY].values):
    gene_id_to_tx_indices_map[gid].append(i)

# useful auxiliary data structures
gene_ids = sorted(gene_id_to_tx_indices_map)
n_genes = len(gene_ids)
n_transcripts = adata_long.shape[1]
# When an id (or name) occurs more than once, the last occurrence wins.
gene_id_to_gene_name_map = dict(
    zip(adata_long.var[GENE_IDS_KEY], adata_long.var[GENE_NAMES_KEY]))
gene_name_to_gene_id_map = dict(
    zip(adata_long.var[GENE_NAMES_KEY], adata_long.var[GENE_IDS_KEY]))
gene_names = list(map(gene_id_to_gene_name_map.get, gene_ids))

# mapping from group id to the positional indices of the cells in that group
# Fetch the label column once and compare it as a whole, instead of
# re-reading `adata_long.obs[...]` for every (group, cell) pair.
group_labels = adata_long.obs[group_cells_by_obs_key].values
group_ids = group_labels.categories.values
group_id_to_obs_indices_map = defaultdict(list)
for group_id in group_ids:
    group_id_to_obs_indices_map[group_id] = np.flatnonzero(
        np.asarray(group_labels == group_id)).tolist()

In [None]:
# Import the submodule explicitly: a bare `import scipy` does not guarantee
# that `scipy.sparse` is loaded.
import scipy.sparse

# get gene expression from isoform expression
# Y_ij is a (n_transcripts x n_genes) 0/1 membership matrix: entry (i, j) is 1
# when transcript i belongs to gene j, so `X @ Y_ij` sums the transcript
# columns of each gene.
row_indices = []
col_indices = []
for j, gene_id in enumerate(gene_ids):
    tx_indices = gene_id_to_tx_indices_map[gene_id]
    row_indices.extend(tx_indices)
    col_indices.extend([j] * len(tx_indices))
values = [1] * len(row_indices)
Y_ij = scipy.sparse.coo_matrix(
    (values, (row_indices, col_indices)),
    shape=(n_transcripts, n_genes)).tocsr()
gex_X_nj = adata_long.X @ Y_ij

# Wrap the gene-level matrix in an AnnData whose var index holds gene names
# (in `gene_ids` order). The normalization / log1p calls below are commented
# out, so X holds the summed values as-is.
adata_long_gex = sc.AnnData(
    X=gex_X_nj,
    obs=adata_long.obs,
    var=pd.DataFrame(index=pd.Index(list(map(gene_id_to_gene_name_map.get, gene_ids)))))

# Different gene ids can share a name; make the var names unique.
adata_long_gex.var_names_make_unique()
# sc.pp.normalize_per_cell(adata_long_gex)
# sc.pp.log1p(adata_long_gex)

In [None]:
# Load the short-read AnnData. `final_short_adata_raw_h5_path` already starts
# with `repo_root`, so it is passed as-is (same as the long-read load).
# Joining `repo_root` onto it again only worked because `repo_root` is
# absolute; with a relative root the path would have been doubled.
adata_short = sc.read(final_short_adata_raw_h5_path)

In [None]:
# Display a summary of the loaded short-read AnnData.
adata_short

In [None]:
# Restrict both modalities to the gene names they share. The long-read
# gene-level object is re-indexed by the short-read var index, so both results
# list their genes in the same (short-read) order.
shared_gene_mask = adata_short.var.index.isin(adata_long_gex.var.index.values)
adata_short_final = adata_short[:, shared_gene_mask]
adata_long_final = adata_long_gex[:, adata_short_final.var.index]

In [None]:
# Display the short-read AnnData restricted to the shared genes.
adata_short_final

In [None]:
# Display the long-read gene-level AnnData restricted to the shared genes.
adata_long_final

## Discovery

In [None]:
# Which modality drives marker-gene discovery; used as a key ('short' or
# 'long') to pick the corresponding AnnData in the cells below.
discovery_adata_name = 'short'

In [None]:
def get_grouped_expression(adata, group_cells_by_obs_key):
    """Sum expression over the cells of each group (one "metacell" per group).

    Args:
        adata: AnnData-like object exposing ``obs`` (a DataFrame), ``X`` (a
            2-D dense or scipy-sparse matrix, cells x genes) and ``shape``.
        group_cells_by_obs_key: name of a categorical column of ``adata.obs``.

    Returns:
        Integer array of shape ``(n_groups, n_genes)``. Row ``k`` is the
        column-wise sum of ``X`` over the cells labelled with the ``k``-th
        category of the column. A category without cells yields an all-zero
        row. Totals are stored in an integer array, so any fractional part
        is truncated.
    """
    group_labels = adata.obs[group_cells_by_obs_key].values
    # Integer category code of every cell; cells with a missing label have
    # code -1 and therefore fall into no group.
    group_codes_n = np.asarray(group_labels.codes)
    n_groups = len(group_labels.categories)
    n_genes = adata.shape[1]

    # `np.int` was removed in NumPy 1.24; the builtin `int` is the dtype it
    # used to alias.
    group_expr_gi = np.zeros((n_groups, n_genes), dtype=int)
    for i_group in range(n_groups):
        # Positional indices of the cells belonging to this category.
        obs_indices = np.flatnonzero(group_codes_n == i_group)
        group_expr_gi[i_group, :] = np.asarray(
            adata.X[obs_indices, :].sum(0)).flatten()

    return group_expr_gi

In [None]:
# Pick the AnnData that drives marker discovery (KeyError on an unknown name).
adata_by_name = {'short': adata_short_final, 'long': adata_long_final}
discovery_adata = adata_by_name[discovery_adata_name]

# One summed profile per group, scaled so that every row sums to 1.
metacell_mg = get_grouped_expression(discovery_adata, group_cells_by_obs_key)
normed_metacell_mg = metacell_mg / metacell_mg.sum(axis=-1, keepdims=True)
lo_expr_threshold = 1e-5
hi_expr_threshold = 1e-4

# Threshold all groups at once; the loop below only combines rows.
is_lo_mg = normed_metacell_mg < lo_expr_threshold
is_hi_mg = normed_metacell_mg > hi_expr_threshold
discovery_gene_names = discovery_adata.var.index.values

# For every ordered pair of the first two groups, collect the genes that are
# below the low threshold in group_a and above the high threshold in group_b,
# as (group_a, group_b, gene position, gene name) tuples.
barnyard_gene_indices_list = []
for group_a in range(2):
    for group_b in range(2):
        barnyard_mask_g = is_lo_mg[group_a, :] & is_hi_mg[group_b, :]
        barnyard_gene_indices = np.flatnonzero(barnyard_mask_g)
        barnyard_gene_indices_list.extend(
            (group_a, group_b, gene_idx, discovery_gene_names[gene_idx])
            for gene_idx in barnyard_gene_indices)
        print(f'{group_a}, {group_b}: {barnyard_mask_g.sum()}')

In [None]:
# Split the candidates by direction in a single pass:
#   low in group 0, high in group 1 -> putative tumor genes
#   low in group 1, high in group 0 -> putative immune genes
putative_tumor_gene_indices, putative_tumor_gene_names = [], []
putative_immune_gene_indices, putative_immune_gene_names = [], []
for lo_group, hi_group, gene_idx, gene_name in barnyard_gene_indices_list:
    if (lo_group, hi_group) == (0, 1):
        putative_tumor_gene_indices.append(gene_idx)
        putative_tumor_gene_names.append(gene_name)
    elif (lo_group, hi_group) == (1, 0):
        putative_immune_gene_indices.append(gene_idx)
        putative_immune_gene_names.append(gene_name)

# Per-cell total of the opposite population's candidate genes. Going by the
# variable names, cells labelled '0' are treated as immune and '1' as tumor.
immune_cells = discovery_adata[discovery_adata.obs['leiden_crude'] == '0']
tumor_cells = discovery_adata[discovery_adata.obs['leiden_crude'] == '1']

putative_tumor_gene_expr_in_immune_n = np.asarray(
    immune_cells[:, putative_tumor_gene_indices].X.sum(-1)).flatten()
putative_immune_gene_expr_in_tumor_n = np.asarray(
    tumor_cells[:, putative_immune_gene_indices].X.sum(-1)).flatten()

In [None]:
# Histogram of the per-cell totals of putative immune genes in the '1' (tumor) cells.
plt.hist(putative_immune_gene_expr_in_tumor_n, bins=100);

In [None]:
# Histogram of the per-cell totals of putative tumor genes in the '0' (immune) cells.
plt.hist(putative_tumor_gene_expr_in_immune_n, bins=100);

In [None]:
# A cell counts as "pure" when its total over the opposite population's
# putative marker genes is below this threshold.
cell_purification_threshold = 1.

# Positions, within discovery_adata, of the cells of each crude cluster.
immune_obs_indices = np.flatnonzero((discovery_adata.obs['leiden_crude'] == '0').values)
tumor_obs_indices = np.flatnonzero((discovery_adata.obs['leiden_crude'] == '1').values)

pure_immune_cells_mask = putative_tumor_gene_expr_in_immune_n < cell_purification_threshold
pure_tumor_cells_mask = putative_immune_gene_expr_in_tumor_n < cell_purification_threshold

# The masks are aligned with the per-cluster positions computed above.
pure_immune_cell_indices = immune_obs_indices[pure_immune_cells_mask]
pure_tumor_cell_indices = tumor_obs_indices[pure_tumor_cells_mask]

# Pure immune cells first, then pure tumor cells.
pure_both_indices = [*pure_immune_cell_indices.tolist(), *pure_tumor_cell_indices.tolist()]

In [None]:
# Keep only the purified cells in both modalities. The same positional indices
# are applied to both objects, in `pure_both_indices` order (pure immune
# cells first, then pure tumor cells).
# NOTE(review): this presumes both objects list their cells in the same order -- confirm.
adata_short_final_pure = adata_short_final[pure_both_indices]
adata_long_final_pure = adata_long_final[pure_both_indices]

In [None]:
# Second discovery pass, restricted to the purified cells.
discovery_adata = {
    'short': adata_short_final_pure,
    'long': adata_long_final_pure}[discovery_adata_name]

# Build the metacell from the AnnData selected above. It used to be hard-wired
# to the short-read object, which silently ignored `discovery_adata_name`
# while the gene names below were still taken from `discovery_adata`.
metacell_mg = get_grouped_expression(discovery_adata, group_cells_by_obs_key)
# Scale every group profile so that it sums to 1.
normed_metacell_mg = metacell_mg / np.sum(metacell_mg, -1, keepdims=True)
# Stricter low threshold than in the first pass (1e-6 instead of 1e-5).
lo_expr_threshold = 1e-6
hi_expr_threshold = 1e-4

# For every ordered pair of the first two groups, collect the genes that are
# below the low threshold in group_a and above the high threshold in group_b,
# as (group_a, group_b, gene position, gene name) tuples.
barnyard_gene_indices_list = []
for group_a in range(2):
    for group_b in range(2):
        lo_in_a = normed_metacell_mg[group_a, :] < lo_expr_threshold
        hi_in_b = normed_metacell_mg[group_b, :] > hi_expr_threshold
        barnyard_mask_g = lo_in_a & hi_in_b
        barnyard_gene_indices = np.where(barnyard_mask_g)[0]
        for idx in barnyard_gene_indices:
            barnyard_gene_indices_list.append((group_a, group_b, idx, discovery_adata.var.index.values[idx]))
        print(f'{group_a}, {group_b}: {barnyard_mask_g.sum()}')

In [None]:
# Split the second-pass candidates by direction in a single pass:
#   low in group 0, high in group 1 -> final tumor genes
#   low in group 1, high in group 0 -> final immune genes
final_tumor_gene_indices, final_tumor_gene_names = [], []
final_immune_gene_indices, final_immune_gene_names = [], []
for lo_group, hi_group, gene_idx, gene_name in barnyard_gene_indices_list:
    if (lo_group, hi_group) == (0, 1):
        final_tumor_gene_indices.append(gene_idx)
        final_tumor_gene_names.append(gene_name)
    elif (lo_group, hi_group) == (1, 0):
        final_immune_gene_indices.append(gene_idx)
        final_immune_gene_names.append(gene_name)

In [None]:
# Barnyard scatter plot for the short-read data: every purified cell is placed
# at (total of immune-specific genes, total of tumor-specific genes).
fig, ax = plt.subplots(figsize=(5, 4))

adata = adata_short_final_pure.copy()

# A cell is flagged as an outlier when the other population's marker total
# exceeds `contamination_threshold` times its own marker total, and its own
# marker total is above `min_counts`.
contamination_threshold = 0.1
min_counts = 50

# Subset once per crude cluster; going by the variable names, '0' is treated
# as immune and '1' as tumor.
immune_adata = adata[adata.obs['leiden_crude'] == '0']
tumor_adata = adata[adata.obs['leiden_crude'] == '1']

final_tumor_gene_expr_in_immune_n = np.asarray(
    immune_adata[:, final_tumor_gene_indices].X.sum(-1)).flatten()
final_immune_gene_expr_in_tumor_n = np.asarray(
    tumor_adata[:, final_immune_gene_indices].X.sum(-1)).flatten()
final_tumor_gene_expr_in_tumor_n = np.asarray(
    tumor_adata[:, final_tumor_gene_indices].X.sum(-1)).flatten()
final_immune_gene_expr_in_immune_n = np.asarray(
    immune_adata[:, final_immune_gene_indices].X.sum(-1)).flatten()

outlier_tumor_cells_n = final_immune_gene_expr_in_tumor_n > (contamination_threshold * final_tumor_gene_expr_in_tumor_n)
outlier_tumor_cells_n = outlier_tumor_cells_n & (final_tumor_gene_expr_in_tumor_n > min_counts)
outlier_immune_cells_n = final_tumor_gene_expr_in_immune_n > (contamination_threshold * final_immune_gene_expr_in_immune_n)
outlier_immune_cells_n = outlier_immune_cells_n & (final_immune_gene_expr_in_immune_n > min_counts)

# other statistics
# The median is taken over `adata_short` as loaded (all cells, all genes),
# not over the purified subset that is plotted.
median_umi_per_cell = np.median(np.asarray(adata_short.X.sum(-1)).flat)
# Purity: share (in %) of a population's own marker counts among all marker
# counts observed in that population's cells.
tumor_purity = 100. * final_tumor_gene_expr_in_tumor_n.sum() / (final_immune_gene_expr_in_tumor_n.sum() + final_tumor_gene_expr_in_tumor_n.sum())
immune_purity = 100. * final_immune_gene_expr_in_immune_n.sum() / (final_tumor_gene_expr_in_immune_n.sum() + final_immune_gene_expr_in_immune_n.sum())

n_immune_cells = (adata.obs['leiden_crude'] == '0').sum()
n_tumor_cells = (adata.obs['leiden_crude'] == '1').sum()

# all points
ax.scatter(
    final_immune_gene_expr_in_immune_n,
    final_tumor_gene_expr_in_immune_n,
    s=1,
    label=f'Immune (N={n_immune_cells})')

ax.scatter(
    final_immune_gene_expr_in_tumor_n,
    final_tumor_gene_expr_in_tumor_n,
    s=1,
    label=f'Tumor (N={n_tumor_cells})')

# outliers
ax.scatter(
    final_immune_gene_expr_in_immune_n[outlier_immune_cells_n],
    final_tumor_gene_expr_in_immune_n[outlier_immune_cells_n],
    s=50,
    facecolor='none',
    edgecolor='red',
    marker='o',
    lw=0.5,
    label=f'Tumor in Immune > {int(100. * contamination_threshold)}% (N={outlier_immune_cells_n.sum()})')

ax.scatter(
    final_immune_gene_expr_in_tumor_n[outlier_tumor_cells_n],
    final_tumor_gene_expr_in_tumor_n[outlier_tumor_cells_n],
    s=50,
    facecolor='none',
    edgecolor='black',
    marker='o',
    lw=0.5,
    label=f'Immune in Tumor > {int(100. * contamination_threshold)}% (N={outlier_tumor_cells_n.sum()})')

# Empty, marker-less lines: they draw nothing and only add text rows to the
# legend. Drawn on `ax` explicitly instead of relying on pyplot's current axes.
ax.plot(
    [], [], ' ',
    label=f"Median UMIs per cell: {int(median_umi_per_cell)}")
ax.plot(
    [], [], ' ',
    label=f"Tumor purity: {tumor_purity:.1f}%")
ax.plot(
    [], [], ' ',
    label=f"Immune purity: {immune_purity:.1f}%")

# ax.set_xscale('log')
# ax.set_yscale('log')

ax.set_xlim((-40, 2000))
ax.set_ylim((-40, 2000))

ax.set_xlabel('Immune-specific total GEX')
ax.set_ylabel('Tumor-specific total GEX')

ax.set_title('Illumina')
ax.legend(fontsize=10)
ax.set_aspect('equal')
fig.tight_layout()

# Make sure the target directory exists; savefig does not create it.
figure_path = f'./output/M132TS__short_barnyard__{output_prefix}.pdf'
os.makedirs(os.path.dirname(figure_path), exist_ok=True)
fig.savefig(figure_path)

In [None]:
# Barnyard scatter plot for the long-read data: every purified cell is placed
# at (total of immune-specific genes, total of tumor-specific genes). The
# positional gene indices found during discovery are reused here.
# NOTE(review): this presumes `adata_long_final_pure` lists its genes in the
# same order as the discovery object -- confirm.
fig, ax = plt.subplots(figsize=(5, 4))

adata = adata_long_final_pure.copy()

# A cell is flagged as an outlier when the other population's marker total
# exceeds `contamination_threshold` times its own marker total, and its own
# marker total is above `min_counts`.
contamination_threshold = 0.1
min_counts = 100

# Subset once per crude cluster; going by the variable names, '0' is treated
# as immune and '1' as tumor.
immune_adata = adata[adata.obs['leiden_crude'] == '0']
tumor_adata = adata[adata.obs['leiden_crude'] == '1']

final_tumor_gene_expr_in_immune_n = np.asarray(
    immune_adata[:, final_tumor_gene_indices].X.sum(-1)).flatten()
final_immune_gene_expr_in_tumor_n = np.asarray(
    tumor_adata[:, final_immune_gene_indices].X.sum(-1)).flatten()
final_tumor_gene_expr_in_tumor_n = np.asarray(
    tumor_adata[:, final_tumor_gene_indices].X.sum(-1)).flatten()
final_immune_gene_expr_in_immune_n = np.asarray(
    immune_adata[:, final_immune_gene_indices].X.sum(-1)).flatten()

outlier_tumor_cells_n = final_immune_gene_expr_in_tumor_n > (contamination_threshold * final_tumor_gene_expr_in_tumor_n)
outlier_tumor_cells_n = outlier_tumor_cells_n & (final_tumor_gene_expr_in_tumor_n > min_counts)
outlier_immune_cells_n = final_tumor_gene_expr_in_immune_n > (contamination_threshold * final_immune_gene_expr_in_immune_n)
outlier_immune_cells_n = outlier_immune_cells_n & (final_immune_gene_expr_in_immune_n > min_counts)

# other statistics
# The median is taken over the transcript-level `adata_long` (all of its
# cells), not over the purified subset that is plotted.
median_umi_per_cell = np.median(np.asarray(adata_long.X.sum(-1)).flat)
# Purity: share (in %) of a population's own marker counts among all marker
# counts observed in that population's cells.
tumor_purity = 100. * final_tumor_gene_expr_in_tumor_n.sum() / (final_immune_gene_expr_in_tumor_n.sum() + final_tumor_gene_expr_in_tumor_n.sum())
immune_purity = 100. * final_immune_gene_expr_in_immune_n.sum() / (final_tumor_gene_expr_in_immune_n.sum() + final_immune_gene_expr_in_immune_n.sum())

n_immune_cells = (adata.obs['leiden_crude'] == '0').sum()
n_tumor_cells = (adata.obs['leiden_crude'] == '1').sum()

# all points
ax.scatter(
    final_immune_gene_expr_in_immune_n,
    final_tumor_gene_expr_in_immune_n,
    s=1,
    label=f'Immune (N={n_immune_cells})')

ax.scatter(
    final_immune_gene_expr_in_tumor_n,
    final_tumor_gene_expr_in_tumor_n,
    s=1,
    label=f'Tumor (N={n_tumor_cells})')

# outliers
ax.scatter(
    final_immune_gene_expr_in_immune_n[outlier_immune_cells_n],
    final_tumor_gene_expr_in_immune_n[outlier_immune_cells_n],
    s=50,
    facecolor='none',
    edgecolor='red',
    marker='o',
    lw=0.5,
    label=f'Tumor in Immune > {int(100. * contamination_threshold)}% (N={outlier_immune_cells_n.sum()})')

ax.scatter(
    final_immune_gene_expr_in_tumor_n[outlier_tumor_cells_n],
    final_tumor_gene_expr_in_tumor_n[outlier_tumor_cells_n],
    s=50,
    facecolor='none',
    edgecolor='black',
    marker='o',
    lw=0.5,
    label=f'Immune in Tumor > {int(100. * contamination_threshold)}% (N={outlier_tumor_cells_n.sum()})')

# Empty, marker-less lines: they draw nothing and only add text rows to the
# legend. Drawn on `ax` explicitly instead of relying on pyplot's current axes.
ax.plot(
    [], [], ' ',
    label=f"Median UMIs per cell: {int(median_umi_per_cell)}")
ax.plot(
    [], [], ' ',
    label=f"Tumor purity: {tumor_purity:.1f}%")
ax.plot(
    [], [], ' ',
    label=f"Immune purity: {immune_purity:.1f}%")

# ax.set_xscale('log')
# ax.set_yscale('log')

ax.set_xlim((-20, 1000))
ax.set_ylim((-20, 1000))

ax.set_xlabel('Immune-specific total GEX')
ax.set_ylabel('Tumor-specific total GEX')

ax.set_title(f'MAS-ISO-Seq ({output_prefix})')
ax.legend(fontsize=10)
# NOTE(review): unlike the short-read figure, no `ax.set_aspect('equal')` is
# applied here -- confirm whether that is intentional.

fig.tight_layout()

# Make sure the target directory exists; savefig does not create it.
figure_path = f'./output/M132TS__long_barnyard__{output_prefix}.pdf'
os.makedirs(os.path.dirname(figure_path), exist_ok=True)
fig.savefig(figure_path)