In [None]:
import matplotlib.pylab as plt
import numpy as np
import pandas as pd
import os
import sys
from time import time
import logging
import pickle
from operator import itemgetter
import scanpy as sc
import argparse
import yaml

# Scanpy figure defaults: 80 dpi on a white background.
sc.settings.set_figure_params(dpi=80, facecolor='white')

import matplotlib as mpl
# Overwrite every matplotlib rcParam with the library default.
# NOTE(review): this presumably also reverts any rcParams that the scanpy call
# above changed -- confirm that is intended.
mpl.rcParams.update(mpl.rcParamsDefault)

# Font sizes (in points) shared by all figures in this notebook.
SMALL_SIZE, MEDIUM_SIZE, BIGGER_SIZE = 12, 14, 16

# (rc group, settings) pairs, applied in order.
for rc_group, rc_kwargs in (
    ('font', {'size': SMALL_SIZE}),            # default text size
    ('axes', {'titlesize': SMALL_SIZE}),       # axes title
    ('axes', {'labelsize': MEDIUM_SIZE}),      # x and y axis labels
    ('xtick', {'labelsize': SMALL_SIZE}),      # x tick labels
    ('ytick', {'labelsize': SMALL_SIZE}),      # y tick labels
    ('legend', {'fontsize': SMALL_SIZE}),      # legend text
    ('figure', {'titlesize': BIGGER_SIZE}),    # figure title
):
    plt.rc(rc_group, **rc_kwargs)

# Root logger at INFO level. `log_info` is bound to the WARNING-level method,
# so messages sent through it are emitted at WARNING severity.
logger = logging.getLogger()
logger.setLevel(logging.INFO)
# `Logger.warn` is a deprecated alias of `Logger.warning`; use the supported name.
log_info = logger.warning

import warnings
# Suppress every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings from the libraries used here.
warnings.filterwarnings("ignore")

## config

In [None]:
# Root of the analysis repository. Defaults to the original absolute location;
# set the MAS_SEQ_REPO_ROOT environment variable to run the notebook from a
# different checkout without editing this cell.
repo_root = os.environ.get('MAS_SEQ_REPO_ROOT', '/home/jupyter/mb-ml-data-disk/MAS-seq-analysis')

# Short-read molecule info (HDF5) and long-read reads-per-UMI table (TSV).
short_read_mol_info_h5_path = f'{repo_root}/data/t-cell-vdj/M132TS/molecule_info.h5'
long_read_reads_per_umi_tsv = f'{repo_root}/data/t-cell-vdj/long/quant/revised/final/reads_per_umi/M132TS_MAS_15x_overall_gene_tx_expression_count_matrix.tsv'


# inputs
input_prefix = 'M132TS_immune.final.harmonized'
output_path = 'output/t-cell-vdj-cite-seq'
# Long-read AnnData file whose obs index supplies the cell-barcode whitelist.
final_long_adata_raw_h5_path = os.path.join(repo_root, output_path, f'{input_prefix}.long.stringtie.final.raw.h5ad')

In [None]:
# Load the final long-read AnnData object and take the index of its `.obs`
# table as the list of cell barcodes (CBCs); later cells turn this list into
# the whitelist set used to filter molecules.
final_adata_long = sc.read(final_long_adata_raw_h5_path)
cbc_list = final_adata_long.obs.index.values

## short

In [None]:
import tables

In [None]:
# Read the per-molecule arrays from the short-read molecule-info HDF5 file.
# The file is opened in a `with` block so the handle is closed as soon as the
# arrays have been read, instead of staying open for the whole kernel session.
with tables.open_file(short_read_mol_info_h5_path) as mol_info:
    counts = mol_info.root.count.read()
    # Barcodes are stored as bytes; decode them to str.
    all_barcodes = [bc.decode() for bc in mol_info.root.barcodes.read()]
    barcode_idx = mol_info.root.barcode_idx.read()

# Expand the barcode indices into one barcode string per entry of `barcode_idx`.
# NOTE(review): assumes `barcode_idx` and `counts` are parallel per-molecule
# arrays -- confirm against the molecule_info.h5 schema.
barcodes = [all_barcodes[idx] for idx in barcode_idx]

In [None]:
# Keep only the read counts of molecules whose cell barcode is in the
# T cell whitelist (the barcodes of the long-read AnnData object).
cbc_set = set(cbc_list)
cbc_counts = [
    n_reads
    for mol_idx, n_reads in enumerate(counts)
    if barcodes[mol_idx] in cbc_set
]

In [None]:
# Quick look at the short-read reads-per-molecule distribution, using the
# first 10,000 whitelisted molecules. Bin edges sit at half-integers
# (0.5, 1.5, ..., 10.5) so each integer count 1..10 gets its own unit-width
# bar centred on that value; counts above 10 are not shown.
fig, ax = plt.subplots()
ax.hist(cbc_counts[:10000], bins=np.arange(1, 12) - 0.5)
ax.set_xticks(np.arange(1, 11))
ax.set_xlabel('Reads per Molecule')
ax.set_ylabel('Number of Molecules');

## long

In [None]:
# Collect the per-row read count (last column) of the long-read reads-per-UMI
# table for every row whose cell barcode (second column) is in the whitelist.
cbc_set = set(cbc_list)
long_cbc_counts = []
with open(long_read_reads_per_umi_tsv, 'r') as tsv_file:
    for raw_line in tsv_file:
        fields = raw_line.strip().split('\t')
        # Rows starting with 'Equivalence_Class' are header rows and are skipped.
        if fields[0] != 'Equivalence_Class' and fields[1] in cbc_set:
            long_cbc_counts.append(int(fields[-1]))

In [None]:
# Quick look at the long-read reads-per-molecule distribution, using the
# first 10,000 whitelisted rows. Bin edges sit at half-integers
# (0.5, 1.5, ..., 10.5) so each integer count 1..10 gets its own unit-width
# bar centred on that value; counts above 10 are not shown.
fig, ax = plt.subplots()
ax.hist(long_cbc_counts[:10000], bins=np.arange(1, 12) - 0.5)
ax.set_xticks(np.arange(1, 11))
ax.set_xlabel('Reads per Molecule')
ax.set_ylabel('Number of Molecules');

## final plot

In [None]:
max_mols = 1_000_000  # at most this many molecules are histogrammed per panel
cutoff = 10           # largest reads-per-molecule value shown on the x axis


def _plot_reads_per_molecule(ax, counts, title, xlabel, max_mols, cutoff):
    """Draw a normalized reads-per-molecule histogram on `ax`.

    Bin edges are placed at half-integers (0.5, 1.5, ..., cutoff + 0.5) so each
    integer read count 1..cutoff has its own unit-width bar centred on that
    value. With unit-width bins, `density=True` makes each bar height the
    fraction of the plotted molecules (those with 1..cutoff reads among the
    first `max_mols`) having that read count.

    Parameters
    ----------
    ax : matplotlib Axes to draw on.
    counts : sequence of per-molecule read counts.
    title : axes title.
    xlabel : x-axis label.
    max_mols : only the first `max_mols` entries of `counts` are histogrammed.
    cutoff : largest read count that gets a bar.

    Notes
    -----
    The mean shown in the legend is computed over all of `counts`, not only
    the histogrammed subset.
    """
    read_counts = np.arange(1, cutoff + 1)
    ax.hist(counts[:max_mols], bins=np.arange(1, cutoff + 2) - 0.5, density=True, color='gray',
            label=f'Mean Reads/Molecule = {np.mean(counts):.2f}')
    ax.set_ylabel('Fraction of Molecules')
    ax.set_xlabel(xlabel)
    ax.set_title(title)
    # Ticks sit on the integer read counts, i.e. at the bar centres.
    ax.set_xticks(read_counts)
    ax.set_xticklabels([str(x) for x in read_counts])
    ax.legend()
    ax.set_ylim((0, 1))


fig, axs = plt.subplots(ncols=2, figsize=(12, 4))

_plot_reads_per_molecule(
    axs[0], cbc_counts,
    title='Sample #2 (short-read)',
    xlabel='Reads per Molecule = (CBC, UMI, gene)',
    max_mols=max_mols, cutoff=cutoff)

_plot_reads_per_molecule(
    axs[1], long_cbc_counts,
    title='Sample #2 (MAS-ISO-seq)',
    xlabel='Reads per Molecule = (CBC, UMI, transcript eq. class)',
    max_mols=max_mols, cutoff=cutoff)

fig.tight_layout()

fig.savefig('./reads_per_umi.pdf')