## Summarize barnyard experiments

In [None]:
import os
import sys
import matplotlib.pylab as plt
import yaml
import numpy as np

# Three font-size tiers shared by every figure in this notebook.
SMALL_SIZE = 12
MEDIUM_SIZE = 14
BIGGER_SIZE = 16

# (rc group, settings) pairs, applied in order through plt.rc.
for rc_group, rc_settings in (
    ('font', dict(size=SMALL_SIZE)),            # default text size
    ('axes', dict(titlesize=SMALL_SIZE)),       # axes title
    ('axes', dict(labelsize=MEDIUM_SIZE)),      # x and y axis labels
    ('xtick', dict(labelsize=SMALL_SIZE)),      # x tick labels
    ('ytick', dict(labelsize=SMALL_SIZE)),      # y tick labels
    ('legend', dict(fontsize=SMALL_SIZE)),      # legend entries
    ('figure', dict(titlesize=BIGGER_SIZE)),    # figure-level title
):
    plt.rc(rc_group, **rc_settings)

In [None]:
# NOTE(review): the name below is misspelled ('nalysis') and is not used anywhere
# in the visible notebook; it is kept unchanged in case other code refers to it.
barnyard_nalysis_root_path = './barnyard_analysis'
output_path = './barnyard_analysis'   # directory holding the per-run *.yaml metrics files
file_list = './filelist'              # text file with one input file name per line
input_prefix = 'M132TS_MAS_15x_'      # marker from which the run prefix is extracted

# output prefix -> merged metrics dict for that run
metrics_all = dict()
# output prefixes in file-list order; later cells use it to order bars / points
output_prefix_list = []

with open(file_list) as of:
    for filename in of:

        # Lines carry their trailing newline; drop it and skip blank lines so a
        # stray empty line does not turn into a bogus run name.
        filename = filename.strip()
        if not filename:
            continue

        # extract prefix: everything from `input_prefix` up to the first '.'
        prefix_start = filename.find(input_prefix)
        if prefix_start < 0:
            # find() returned -1; slicing with it would silently keep only the
            # last character of the line, so report and skip instead.
            print(f'Skipping {filename!r}: {input_prefix!r} not found')
            continue
        output_prefix = filename[prefix_start:].split('.')[0]
        print(f'Processing {output_prefix} ...')
        output_prefix_list.append(output_prefix)
        metrics_all[output_prefix] = dict()

        # load metrics: basic metrics first, then barnyard metrics. Merging is
        # done with update(), so a key present in both files takes the barnyard
        # value instead of raising TypeError as dict(**a, **b) would.
        for metrics_suffix in ('metrics.yaml', 'barnyard.metrics.yaml'):
            metrics_file = os.path.join(output_path, f'{output_prefix}.{metrics_suffix}')
            with open(metrics_file, 'r') as stream:
                try:
                    loaded_metrics = yaml.safe_load(stream)
                except yaml.YAMLError as exc:
                    # best effort: report the parse error and keep going
                    print(exc)
                    continue
            # safe_load returns None for an empty document; treat it as "no metrics"
            metrics_all[output_prefix].update(loaded_metrics or {})

In [None]:
# One entry per metric to summarise as a bar chart:
#   key    - metric name looked up in each run's metrics dict
#   better - 'higher' or 'lower': which direction counts as the best run
#   tol    - runs whose value is within this distance of the best are highlighted too
plot_manifest = [
    {'key': 'gex_concordance_r2', 'better': 'higher', 'tol': 1e-4},
    {'key': 'mean_umi_per_mutual_barcode_long', 'better': 'higher', 'tol': 1e-4},
    {'key': 'median_umi_per_mutual_barcode_long', 'better': 'higher', 'tol': 1e-4},
    {'key': 'pct_mutual_barcodes', 'better': 'higher', 'tol': 1e-4},
    {'key': 'pct_umis_in_empty_barcodes_long', 'better': 'lower', 'tol': 1e-4},
    {'key': 'group_0_purity__long', 'better': 'higher', 'tol': 1e-3},
    {'key': 'group_1_purity__long', 'better': 'higher', 'tol': 1e-3},
    # Disabled entries, kept for reference:
    # {'key': 'n_outlier_group_0_cells__long', 'better': 'lower', 'tol': 1e-3},
    # {'key': 'n_outlier_group_1_cells__long', 'better': 'lower', 'tol': 1e-3},
]

# Draw one bar chart per manifest entry: one bar per run (in output_prefix_list
# order), with the best run and every run within `tol` of it drawn in dark grey.
for m in plot_manifest:

    metrics_key = m['key']
    better = m['better']
    tol = m['tol']

    # A misspelled direction used to be silently treated as 'lower'.
    if better not in ('higher', 'lower'):
        raise ValueError(f"'better' must be 'higher' or 'lower', got {better!r} for {metrics_key}")

    fig, ax = plt.subplots()

    values = np.asarray([metrics_all[output_prefix][metrics_key] for output_prefix in output_prefix_list])
    best_idx = np.argmax(values) if better == 'higher' else np.argmin(values)
    # indices of all runs tied with the best one, up to the manifest tolerance
    best_indices = np.flatnonzero(np.abs(values - values[best_idx]) < tol)

    bar_positions = np.arange(len(output_prefix_list))
    ax.bar(bar_positions, values, color='royalblue')
    # over-draw the best runs on top of the blue bars
    ax.bar(bar_positions[best_indices], values[best_indices], color=(0.3, 0.3, 0.3))

    ax.set_xticks(bar_positions)
    ax.set_xticklabels(output_prefix_list, rotation=45, ha='right')
    ax.set_ylabel(metrics_key)
    ax.grid()

    # Zoom the y axis to mean +/- 6 std (never below 0). With zero spread the two
    # limits would coincide, and with NaN values they would be invalid; in both
    # cases `std > 0` is False and matplotlib's automatic limits are kept.
    mean, std = np.mean(values), np.std(values)
    if std > 0:
        ax.set_ylim((max(0, mean - 6 * std), mean + 6 * std))

    # write next to the metrics files instead of repeating the directory literal
    fig.savefig(os.path.join(output_path, f'summary__{metrics_key}.png'), bbox_inches='tight', dpi=200)

In [None]:
import colorcet as cc

# Metrics dict of every run, ordered like output_prefix_list.
run_metrics = [metrics_all[output_prefix] for output_prefix in output_prefix_list]

purity = [metrics['group_1_purity__long'] for metrics in run_metrics]
short_purity = [metrics['group_1_purity__short'] for metrics in run_metrics]
umi = [metrics['mean_umi_per_mutual_barcode_long'] for metrics in run_metrics]
short_umi = [metrics['mean_umi_per_mutual_barcode_short'] for metrics in run_metrics]

fig, ax = plt.subplots()

# One point per run, coloured by its position in the glasbey palette and
# labelled with the run prefix for the legend.
for i, (output_prefix, run_purity, run_umi) in enumerate(zip(output_prefix_list, purity, umi)):
    ax.scatter(run_purity, run_umi, color=cc.glasbey[i], label=output_prefix)

# Black diamond for short reads. Only the first run's short-read values are
# drawn -- presumably they are the same for every run (TODO confirm).
ax.scatter(short_purity[0], short_umi[0], color='black', marker='d', s=100, label='Short-reads')

ax.set_xlabel('Barnyard Purity')
ax.set_ylabel('Mean UMI per cell')
# Optional manual zoom, currently disabled:
# ax.set_xlim((99.6, 100.))
# ax.set_ylim((2700, 3500))
# Anchor the legend just outside the right edge of the axes.
ax.legend(bbox_to_anchor=(1.05, 1.0))

fig.savefig('./barnyard_analysis/summary__purity_vs_cell_umi.png', dpi=200, bbox_inches='tight')

In [None]:
import colorcet as cc

# Metrics dict of every run, ordered like output_prefix_list.
run_metrics = [metrics_all[output_prefix] for output_prefix in output_prefix_list]

purity = [metrics['group_1_purity__long'] for metrics in run_metrics]
umi = [metrics['mean_umi_per_mutual_barcode_long'] for metrics in run_metrics]
# The short-read values are still collected but are not drawn in this variant.
short_purity = [metrics['group_1_purity__short'] for metrics in run_metrics]
short_umi = [metrics['mean_umi_per_mutual_barcode_short'] for metrics in run_metrics]

fig, ax = plt.subplots()

# One point per run, coloured by its position in the glasbey palette and
# labelled with the run prefix for the legend.
for i, (output_prefix, run_purity, run_umi) in enumerate(zip(output_prefix_list, purity, umi)):
    ax.scatter(run_purity, run_umi, color=cc.glasbey[i], label=output_prefix)

# Short-read reference point deliberately left out here:
# ax.scatter(short_purity[0], short_umi[0], label='Short-reads', s=100, marker='d', color='black')

ax.set_xlabel('Barnyard Purity')
ax.set_ylabel('Mean UMI per cell')
# Fixed x range for this variant; y range left automatic.
ax.set_xlim((99.5, 100.))
# ax.set_ylim((2700, 3500))
# Anchor the legend just outside the right edge of the axes.
ax.legend(bbox_to_anchor=(1.05, 1.0))

fig.savefig('./barnyard_analysis/summary__purity_vs_cell_umi_no_short.png', dpi=200, bbox_inches='tight')

In [None]:
# estimate UMIs in background for short-reads data
import scanpy as sc

# Location of the 10x HDF5 matrix, expressed relative to the repository checkout.
repo_root = '/home/jupyter/mb-ml-data-disk/MAS-seq-analysis'
m132ts_h5_path = 'data/t-cell-vdj/M132TS/raw_feature_bc_matrix.h5'
# Both names refer to the same joined path.
h5_full_path = m132ts_h5_full_path = os.path.join(repo_root, m132ts_h5_path)

# gex_only=False: do not restrict the matrix to gene-expression features.
adata_short_full = sc.read_10x_h5(h5_full_path, gex_only=False)
# De-duplicate feature names in place.
adata_short_full.var_names_make_unique()
# strip the '-1' style suffix from barcodes
def remove_barcode_suffix(barcode: str, delimiter: str = '-') -> str:
    """Return the part of `barcode` that precedes the first `delimiter`.

    A barcode that does not contain `delimiter` is returned unchanged.
    """
    head, _, _ = barcode.partition(delimiter)
    return head
# Replace the obs index with the suffix-free barcodes, preserving row order.
adata_short_full.obs.index = np.asarray(
    [remove_barcode_suffix(barcode) for barcode in adata_short_full.obs.index.values]
)

In [None]:
# AnnData file whose obs index supplies the cell barcodes used below.
cell_short_h5_path = '/home/jupyter/mb-ml-data-disk/MAS-seq-analysis/output/t-cell-vdj-cite-seq/M132TS_both.h5ad'
adata_short_cell = sc.read(cell_short_h5_path)

# obs_names is AnnData's accessor for obs.index.
cell_barcodes = adata_short_cell.obs_names.values

In [None]:
# Dump the cell barcodes to a text file, one barcode per line.
with open('./M132TS_cell_barcodes.txt', 'w') as f:
    f.writelines(cb + '\n' for cb in cell_barcodes)

In [None]:
# Sum of the whole short-read matrix (all barcodes).
short_total_umis = adata_short_full.X.sum()
# Sum restricted to the rows selected by cell_barcodes.
short_cell_umis = adata_short_full[cell_barcodes].X.sum()

# Share of the total that falls outside the selected barcodes, as a percentage.
short_empty_umis = short_total_umis - short_cell_umis
short_pct_umis_in_empty_barcodes = 100. * short_empty_umis / short_total_umis

In [None]:
import colorcet as cc

# Scatter of the 'group_1_purity__long' metric against the percentage of UMIs in
# empty barcodes, one point per run, plus a short-read reference point.
purity = [metrics_all[output_prefix]['group_1_purity__long'] for output_prefix in output_prefix_list]
short_purity = [metrics_all[output_prefix]['group_1_purity__short'] for output_prefix in output_prefix_list]
bg_umi = [metrics_all[output_prefix]['pct_umis_in_empty_barcodes_long'] for output_prefix in output_prefix_list]

fig, ax = plt.subplots()

# One point per run, coloured by its position in the glasbey palette.
for i, output_prefix in enumerate(output_prefix_list):
    ax.scatter(purity[i], bg_umi[i], color=cc.glasbey[i], label=output_prefix)

# Short-read reference: the purity comes from the first run's metrics
# (presumably identical for every run -- TODO confirm); the y value is the
# percentage computed from the short-read matrices in an earlier cell.
ax.scatter(short_purity[0],
           short_pct_umis_in_empty_barcodes,
           label='Short-reads', s=100, marker='d', color='black')

ax.legend(bbox_to_anchor=(1.05, 1.0))
# ax.set_xlim((99.6, 100.))
ax.set_xlabel('Barnyard Purity')
# Both plotted quantities are percentages (0-100), not fractions, so the axis
# label says so.
ax.set_ylabel('Percentage of UMIs in non-cell BCs')

fig.savefig('./barnyard_analysis/summary__purity_vs_bg_umi.png', bbox_inches='tight', dpi=200)