## Extract UMI stats [M132TS Downsampling Analysis]

Extracts the total number of UMIs at three stages (raw, after subsetting to the immune component, and after all cell QC filtering) from the AnnData objects of one sample in the downsampling series.

In [2]:
%matplotlib inline

import os
import sys
import matplotlib.pylab as plt
import numpy as np
import pandas as pd
import logging
from operator import itemgetter

import scanpy as sc
import anndata

# Font sizes shared by the matplotlib rc overrides below.
SMALL_SIZE = 12
MEDIUM_SIZE = 14
BIGGER_SIZE = 16

# Apply the matplotlib rc overrides in one pass; each entry is
# (rc group, keyword settings) and is applied in the listed order.
for _rc_group, _rc_settings in (
    ('font', {'size': SMALL_SIZE}),          # default text sizes
    ('axes', {'titlesize': SMALL_SIZE}),     # axes title
    ('axes', {'labelsize': MEDIUM_SIZE}),    # x and y labels
    ('xtick', {'labelsize': SMALL_SIZE}),    # x tick labels
    ('ytick', {'labelsize': SMALL_SIZE}),    # y tick labels
    ('legend', {'fontsize': SMALL_SIZE}),    # legend text
    ('figure', {'titlesize': BIGGER_SIZE}),  # figure title
):
    plt.rc(_rc_group, **_rc_settings)

# Root logger, with its level set to INFO.
logger = logging.getLogger()
logger.setLevel(logging.INFO)
# NOTE(review): `log_info` is bound to `logger.warning`, so messages logged
# through it are emitted at WARNING level -- presumably so they show up in
# notebook output without configuring a handler; confirm before changing.
log_info = logger.warning

# Suppress all Python warnings for the rest of the session.
import warnings
warnings.filterwarnings("ignore")

# Scanpy figure defaults.
sc.settings.set_figure_params(dpi=80, facecolor='white')

In [3]:
# Load the sample metadata and pick the sample to process.
import yaml

# Mapping indexed by sample key, parsed from the YAML file in the working
# directory. The encoding is given explicitly so parsing does not depend on
# the platform's default locale.
with open('./downsampling_series_sample_metadata.yaml', 'r', encoding='utf-8') as f:
    sample_meta_dict = yaml.safe_load(f)

# True: use the hard-coded sample key below (interactive use).
# False: take the sample key from the first command-line argument.
notebook_mode = True

if not notebook_mode:
    # Exit with a usage message instead of a bare IndexError when the
    # sample key argument is missing.
    if len(sys.argv) < 2:
        sys.exit(f'usage: {sys.argv[0]} <sample_key>')
    sample_key = sys.argv[1]
else:
    sample_key = 'gencode_1m'

# Fail here, with the list of valid keys, rather than later with a bare
# KeyError when the metadata entry is first looked up. `or {}` covers an
# empty YAML file, for which `safe_load` returns None.
if sample_key not in (sample_meta_dict or {}):
    raise KeyError(
        f'Unknown sample key {sample_key!r}; '
        f'available keys: {list(sample_meta_dict or {})}')

log_info(f'Processing {sample_key} ...')

Processing gencode_1m ...


In [4]:
# Absolute root of the analysis repository.
repo_root = '/home/jupyter/mb-ml-data-disk/MAS-seq-analysis'

# Input locations.
long_tx_counts_root = 'data/t-cell-vdj/long/quant/ds'
short_h5_path = 'output/t-cell-vdj-cite-seq/M132TS_immune.h5ad'

# Output locations; the figure and misc directories live under output_root.
output_root = 'output/t-cell-vdj-cite-seq/ds'
fig_output_root = f'{output_root}/figures'
misc_output_root = f'{output_root}/misc'

# Prefix for output file names.
output_prefix = 'M132TS_immune'

# Column names holding gene IDs in the short and long AnnData objects
# (NOTE(review): judging by the constant names; their use is not visible here).
ADATA_SHORT_GENE_IDS_COL = 'gene_ids'
ADATA_LONG_GENE_IDS_COL = 'gencode_overlap_gene_ids'

In [5]:
# Sample-specific prefix used in the derived file names below.
output_prefix_full = f'{output_prefix}_{sample_key}'

# Directory that holds the harmonized and final AnnData files.
_sample_output_dir = os.path.join(repo_root, output_root)

# Raw long-read counts: the last path component is this sample's entry in
# the sample metadata.
raw_long_adata_h5_path = os.path.join(
    os.path.join(repo_root, long_tx_counts_root),
    sample_meta_dict[sample_key],
)

# Harmonized and final AnnData files for this sample.
harmonized_long_adata_h5_path = os.path.join(
    _sample_output_dir, f'{output_prefix_full}_harmonized_long.h5ad')
final_long_adata_h5_path = os.path.join(
    _sample_output_dir, f'{output_prefix_full}_final_long_raw.h5ad')

In [6]:
# Read the three files with scanpy, in order: raw, harmonized, final.
adata_long_raw, adata_long_harmonized, adata_long_final = [
    sc.read(h5_path)
    for h5_path in (
        raw_long_adata_h5_path,
        harmonized_long_adata_h5_path,
        final_long_adata_h5_path,
    )
]

In [7]:
# all UMIs
n_umi_raw = int(adata_long_raw.X.sum())

# subsetting to immune cells
n_umi_harmonized = int(adata_long_harmonized.X.sum())

# removing doublets and contamination clusters
n_umi_final = int(adata_long_final.X.sum())

In [8]:
# Write each UMI total to its own one-line text file in the misc output
# directory, named <output_prefix_full><suffix>.
_misc_output_dir = os.path.join(repo_root, misc_output_root)

# Create the directory if needed so a fresh checkout does not fail with
# FileNotFoundError on the first write.
os.makedirs(_misc_output_dir, exist_ok=True)

for _suffix, _n_umi in (
    ("_n_umi_raw.txt", n_umi_raw),
    ("_n_umi_immune.txt", n_umi_harmonized),
    ("_n_umi_final.txt", n_umi_final),
):
    with open(os.path.join(_misc_output_dir, output_prefix_full + _suffix), 'w') as f:
        f.write(str(_n_umi) + '\n')