## Analysis of short-read scRNA-seq (immune component)

This notebook is a second-pass analysis of the immune component of the sample.

**Inputs and Outputs**
- Inputs:
  - `raw_feature_bc_matrix.h5`: raw counts from the 10x Cell Ranger `count` pipeline
  - `M132TS_immune_barcodes.npy`: list of immune cell barcodes for the sample
- Outputs:
  - `M132TS_immune.h5ad`: an AnnData object containing filtered cells (immune)

In [None]:
%matplotlib inline

import matplotlib.pylab as plt

import numpy as np
import pandas as pd
import os
import sys
from time import time
import logging
import pickle

import scanpy as sc

# Font-size tiers shared by the matplotlib defaults configured below.
SMALL_SIZE, MEDIUM_SIZE, BIGGER_SIZE = 12, 14, 16

plt.rc('font', size=SMALL_SIZE)                               # default text size
plt.rc('axes', titlesize=SMALL_SIZE, labelsize=MEDIUM_SIZE)   # axes title; x and y labels
plt.rc('xtick', labelsize=SMALL_SIZE)                         # x tick labels
plt.rc('ytick', labelsize=SMALL_SIZE)                         # y tick labels
plt.rc('legend', fontsize=SMALL_SIZE)                         # legend text
plt.rc('figure', titlesize=BIGGER_SIZE)                       # figure title

# Use the root logger and set its threshold to INFO.
logger = logging.getLogger()
logger.setLevel(logging.INFO)
# NOTE(review): log_info is bound to logger.warning, so anything sent through
# it is emitted at WARNING level despite the name — confirm this is intended.
log_info = logger.warning

import warnings
# Suppress every Python warning for the rest of the session.
warnings.filterwarnings("ignore")

# scanpy figure defaults: 80 dpi with a white figure background.
sc.settings.set_figure_params(dpi=80, facecolor='white')

In [None]:
# Display the installed scanpy version.
sc.__version__

In [None]:
# Root of the local checkout; every path below is resolved against it.
repo_root = '/home/jupyter/mb-ml-data-disk/MAS-seq-analysis'

# Repo-relative locations of the two inputs and the output.
m132ts_h5_path = 'data/t-cell-vdj/M132TS/raw_feature_bc_matrix.h5'
barcode_path = 'output/t-cell-vdj-cite-seq/M132TS_immune_barcodes.npy'
m132ts_h5ad_out_path = 'output/t-cell-vdj-cite-seq/M132TS_immune.h5ad'

# Absolute counterparts, joined onto the repository root in one pass.
m132ts_h5_full_path, barcode_full_path, m132ts_h5ad_out_full_path = (
    os.path.join(repo_root, rel_path)
    for rel_path in (m132ts_h5_path, barcode_path, m132ts_h5ad_out_path)
)

# Sample-agnostic aliases used by the rest of the notebook.
h5_full_path, h5ad_out_full_path = m132ts_h5_full_path, m132ts_h5ad_out_full_path

In [None]:
# Load the 10x HDF5 matrix; gex_only=False keeps the non-gene-expression
# features (the antibody-capture features handled further down) as well.
adata = sc.read_10x_h5(h5_full_path, gex_only=False)
# De-duplicate variable names in place.
adata.var_names_make_unique()
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary Python
# objects from the file, so only point this at a trusted .npy file.
barcodes = np.load(barcode_full_path, allow_pickle=True)

In [None]:
# remove the '-1' suffix from barcodes
def remove_barcode_suffix(barcode: str, delimiter='-') -> str:
    """Return the part of ``barcode`` that precedes the first ``delimiter``.

    A barcode that does not contain ``delimiter`` is returned unchanged.
    """
    prefix, _sep, _rest = barcode.partition(delimiter)
    return prefix
# Replace the cell index with the suffix-free barcodes.
adata.obs.index = np.asarray(
    [remove_barcode_suffix(bc) for bc in adata.obs.index.values])

In [None]:
# subset adata to the specified barcodes
barcodes_set = set(barcodes)
# Vectorised membership test over the cell index; this replaces a Python-level
# `range(len(adata))` loop that re-read the index array on every iteration.
# The selected row positions are in ascending order, as before.
adata_barcode_keep_indices = np.flatnonzero(adata.obs.index.isin(barcodes_set))
adata = adata[adata_barcode_keep_indices]

In [None]:
# Display a summary of the barcode-filtered AnnData object.
adata

In [None]:
# Flag mitochondrial genes (names starting with 'MT-') and antibody-capture features.
adata.var['mt'] = adata.var_names.str.startswith('MT-')
adata.var['antibody_feature'] = (adata.var.feature_types == 'Antibody Capture')

# add log(1 + antibody counts) as 'obs' annotations for each cell
adata_ab = adata[:, adata.var['antibody_feature']]
ab_list = list(adata_ab.var.index.values)
# Densify the (cells x antibodies) sub-matrix once instead of slicing and
# densifying one sparse column per antibody. Column slices of the 2-D array
# stay 1-D even when there is a single cell, which `.squeeze()` did not ensure.
ab_log_counts = np.log1p(adata_ab.X.toarray())
for ab_idx, ab in enumerate(ab_list):
    adata.obs[ab] = ab_log_counts[:, ab_idx]

# remove antibody counts from variables
adata = adata[:, ~adata.var['antibody_feature']]

In [None]:
# Plot the 20 most highly expressed genes.
sc.pl.highest_expr_genes(adata, n_top=20)

In [None]:
# Compute QC metrics in place, including statistics for the 'mt' gene group;
# the percent_top and log1p variants are switched off.
sc.pp.calculate_qc_metrics(adata, qc_vars=['mt'], percent_top=None, log1p=False, inplace=True)

In [None]:
# Display the AnnData summary, now including the QC annotations.
adata

In [None]:
# Drop genes detected in fewer than 3 cells.
sc.pp.filter_genes(adata, min_cells=3)

In [None]:
# 'mt' was already annotated before the antibody features were split off; it
# is set again here on the gene-filtered object.
adata.var['mt'] = adata.var_names.str.startswith('MT-')  # annotate the group of mitochondrial genes as 'mt'
# Recompute the QC metrics now that the gene filter has been applied.
sc.pp.calculate_qc_metrics(adata, qc_vars=['mt'], percent_top=None, log1p=False, inplace=True)

In [None]:
# Violin plots of three per-cell QC annotations, one panel each.
sc.pl.violin(adata, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'],
             jitter=0.4, multi_panel=True)

In [None]:
# Scatter total counts against mitochondrial percentage, then against the
# number of detected genes.
sc.pl.scatter(adata, x='total_counts', y='pct_counts_mt')
sc.pl.scatter(adata, x='total_counts', y='n_genes_by_counts')

In [None]:
# Store the current state in .raw. This runs before normalize_total/log1p, so
# .raw holds the un-normalized counts for all genes that passed the filter.
adata.raw = adata

In [None]:
# Normalize each cell's counts to a total of 10,000 (target_sum=1e4).
sc.pp.normalize_total(adata, target_sum=1e4)

In [None]:
# Apply log(1 + x) to the normalized counts.
sc.pp.log1p(adata)

## A first look at clustering (based on log-normalized counts)

Note: Our final clustering and analysis will be based on Seurat's SCT counts.

In [None]:
# Flag highly variable genes using the mean and dispersion cutoffs given here.
sc.pp.highly_variable_genes(adata, min_mean=0.0125, max_mean=4, min_disp=0.5)

In [None]:
# Plot the diagnostics for the highly-variable-gene selection.
sc.pl.highly_variable_genes(adata)

In [None]:
# keep genes that are highly variable
# (adata.raw was assigned before this subsetting step)
adata = adata[:, adata.var.highly_variable]

In [None]:
# Scale each gene, clipping values that exceed 10 (max_value=10).
sc.pp.scale(adata, max_value=10)

In [None]:
# Run PCA with the ARPACK SVD solver.
sc.tl.pca(adata, svd_solver='arpack')

In [None]:
# PCA scatter plot colored by CST3.
sc.pl.pca(adata, color='CST3')

In [None]:
# Plot the variance ratio of the principal components on a log scale.
sc.pl.pca_variance_ratio(adata, log=True)

In [None]:
# Display the AnnData summary after PCA.
adata

In [None]:
# Build the neighborhood graph with 10 neighbors per cell on 50 principal components.
sc.pp.neighbors(adata, n_neighbors=10, n_pcs=50)

In [None]:
# Compute a t-SNE embedding with scanpy's default settings.
sc.tl.tsne(adata)

In [None]:
# Plot the t-SNE embedding without any coloring.
sc.pl.tsne(adata)

In [None]:
# t-SNE colored by four CD45-related keys.
# presumably these are the log1p antibody columns added to adata.obs earlier —
# verify the names against the antibody feature list.
sc.pl.tsne(adata, color=['CD45_TotalSeqC', 'CD45R_B220_TotalSeqC', 'CD45RA_TotalSeqC', 'CD45RO_TotalSeqC'])

In [None]:
# make a scatter plot of CD45 GEX vs. AB
# adata.raw was captured before normalize_total/log1p, so the PTPRC values read
# from it are un-normalized counts; the x-axis label states that explicitly
# (it previously claimed "log normalized").
CD45_gex = np.asarray(adata.raw.X[:, adata.raw.var.index.values == 'PTPRC'].todense()).flatten()
# The antibody column in adata.obs already holds log(1 + counts).
CD45_ab = adata.obs['CD45_TotalSeqC'].values.flatten()
plt.scatter(CD45_gex, CD45_ab, s=1, alpha=0.1)
plt.xlabel('CD45 GEX (raw counts)')
plt.ylabel('CD45 AB (log)')

In [None]:
# t-SNE colored by a panel of genes followed by three QC annotations.
# NOTE(review): scanpy plotting may read gene values from adata.raw when it is
# set; here .raw holds un-normalized counts — confirm which layer is shown.
sc.pl.tsne(
    adata,
    color=['PTPRC', 'CD63', 'LGALS3', 'SERPINE2', 'FTH1', 'S100A13', 'GNLY', 'NKG7', 'CD8A', 'CD3D',
            'pct_counts_mt', 'total_counts', 'n_genes_by_counts'])

In [None]:
# Leiden clustering at resolution 1.0.
sc.tl.leiden(adata, resolution=1.0)

In [None]:
# t-SNE colored by Leiden cluster label.
sc.pl.tsne(adata, color=['leiden'])

In [None]:
# Rank genes per Leiden cluster with a t-test.
# NOTE(review): rank_genes_groups may default to adata.raw when it is set, and
# .raw here was stored before normalize_total/log1p — confirm that testing on
# un-normalized counts is intended, or pass use_raw explicitly.
sc.tl.rank_genes_groups(adata, 'leiden', method='t-test')

In [None]:
# Plot the top 25 ranked genes per group with a shared y-axis.
sc.pl.rank_genes_groups(adata, n_genes=25, sharey=True)

In [None]:
# Display the AnnData summary after clustering and gene ranking.
adata

In [None]:
# Flatten the ranking results into one table with '<group>_<field>' columns,
# ordered group by group.
result = adata.uns['rank_genes_groups']
groups = result['names'].dtype.names
columns = {}
for group in groups:
    for key in ('names', 'pvals', 'pvals_adj'):
        columns[f'{group}_{key}'] = result[key][group]
result_df = pd.DataFrame(columns)

In [None]:
# Display the per-cluster ranking table.
result_df

In [None]:
# Write the processed AnnData object to the output .h5ad path.
adata.write(h5ad_out_full_path)