In [None]:
import os
import sys
# Report the interpreter and working directory so the environment of this run is visible.
print("Python version " + sys.version)  # separator added: label and version were fused together
print("Working directory: " + os.getcwd())  # was a bare mid-cell expression whose value was never shown
print(sys.executable)

In [None]:
import numpy as np
np.random.seed(123)
import pandas as pd
import scipy
import itertools

import umap
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

import scanpy as sc
import anndata as ad
import scvelo as scv
from tqdm.notebook import tqdm

from pathlib import Path

In [None]:
import scrublet as scr

In [None]:
# Scanpy session setup: set the logging verbosity level, print the version header,
# and apply the figure defaults used by the plots in this notebook.
sc.settings.verbosity = 1
sc.logging.print_header()
sc.settings.set_figure_params(
    dpi=80,
    facecolor="white",
)

In [None]:
# Turn the axes grid off globally (the original note attributes the grid to scvelo).
plt.rcParams.update({'axes.grid': False})

from cellbender_adata_fix import *

In [None]:
# Directory of the signature collection; it is passed to score_cell_cycle() further down.
# Original note: revised from Stefan's cell type signature.
signatures_path_ = '../cell_type_from_stefan/scrnaseq_signature_collection/'
from score_and_classify import *

In [None]:
# Root folder of the input datasets. When the CRC_DATA_FOLDER environment variable is set it
# overrides the original cluster path, so the notebook can run on another machine unchanged;
# without it the value is exactly the original hard-coded path.
data_folder = os.environ.get('CRC_DATA_FOLDER', '/fast/users/twei_m/work/crc/datasets')

In [None]:
# Root folder for re-processed outputs (the final joanito.write() call saves below it).
# When the CRC_NEW_DATA_FOLDER environment variable is set it overrides the original cluster
# path; without it the value is exactly the original hard-coded path.
new_data_folder = os.environ.get(
    'CRC_NEW_DATA_FOLDER',
    '/fast/users/twei_m/work/crc/datasets_new_preprocessing',
)

### refer to 20221118_icms_classifier.ipynb

#### Read the Joanito et al. data (assumed to have been processed only by CellRanger)

In [None]:
import h5py

In [None]:
from scipy.sparse import csr_matrix

In [None]:
# Peek at the HDF5 layout: the top-level keys, then the members of the 'matrix' group.
# The file is opened in a context manager so the handle is closed as soon as the listing
# is done; previously it was left open in the global `f`.
with h5py.File(Path(data_folder)/'icms/Epithelial_Count_matrix.h5', 'r') as f:
    print(list(f.keys()))
    print(list(f['matrix']))

In [None]:
def _collect_datasets(dsets: dict, group: h5py.Group):
    """Recursively copy every dataset found under ``group`` into ``dsets``.

    Each dataset is stored under its own (leaf) key, so datasets that share a
    name in different sub-groups overwrite one another; the last one visited wins.

    Parameters
    ----------
    dsets : dict
        Filled in place: leaf key -> full in-memory contents of the dataset.
    group : h5py.Group
        Group to walk; every member that is not a dataset is descended into.

    Returns
    -------
    None
        The result is delivered through ``dsets``.
    """
    for k, v in group.items():
        if isinstance(v, h5py.Dataset):
            # `[()]` reads the whole dataset for any dataspace, including scalar
            # datasets, for which the `[:]` slice raises.
            dsets[k] = v[()]
        else:
            _collect_datasets(dsets, v)

In [None]:
# Pull every dataset below the "matrix" group into memory; the file is only needed for that.
with h5py.File(Path(data_folder)/'icms/Epithelial_Count_matrix.h5', 'r') as f:
    dsets = {}
    _collect_datasets(dsets, f["matrix"])

# 'shape' is stored as (M, N); the (data, indices, indptr) triplet is read as CSR with the
# transposed shape (N, M).
# NOTE(review): presumably the file is 10x-style (features x barcodes, CSC) — confirm.
M, N = dsets['shape']
data = dsets['data']
matrix = csr_matrix((data, dsets['indices'], dsets['indptr']), shape=(N, M))

In [None]:
# Wrap the sparse matrix in an AnnData object; observation/variable names and the variable
# annotations come from the datasets collected out of the HDF5 file.
# NOTE(review): a statement right after this cell reassigns `joanito` from
# anndata_from_h5(...); confirm whether that line is active and which construction is intended.
obs_annotations = {'obs_names': dsets['barcodes'].astype(str)}
var_annotations = {
    'var_names': dsets['name'].astype(str),
    # 'gene_ids': dsets['id'].astype(str),
    'feature_types': dsets['feature_type'].astype(str),
    'genome': dsets['genome'].astype(str),
}
joanito = ad.AnnData(matrix, obs=obs_annotations, var=var_annotations)

joanito = anndata_from_h5(Path(data_folder)/'icms/Epithelial_Count_matrix.h5')

In [None]:
# Dimensions of the freshly loaded object, before metadata is attached or anything is filtered.
joanito.shape

In [None]:
# Per-cell metadata table; the first CSV column becomes the index.
metadata_csv = Path(data_folder) / 'icms' / 'Epithelial_metadata.csv'
metadata = pd.read_csv(metadata_csv, index_col=0)

In [None]:
# Number of metadata rows per value of the 'dataset' column.
metadata['dataset'].value_counts()

In [None]:
# Put the metadata rows in the same order as the cells in `joanito`.
# reindex() fills rows for labels that are absent from the metadata with NaN instead of raising.
metadata = metadata.reindex(index=joanito.obs_names)

In [None]:
# Replace the obs table with the metadata that was aligned to joanito's obs index above.
joanito.obs = metadata

In [None]:
# Distribution of the 'percent.mt' column that came with the metadata table.
joanito.obs['percent.mt'].hist();

In [None]:
# Histogram of the number of cells per 'sample.ID'.
joanito.obs['sample.ID'].value_counts().hist();

In [None]:
# Flag genes whose name starts with 'MT-' and compute QC metrics that include that gene set.
mito_prefix = 'MT-'
joanito.var['mt'] = joanito.var_names.str.startswith(mito_prefix)
sc.pp.calculate_qc_metrics(
    joanito,
    qc_vars=['mt'],
    percent_top=None,
    log1p=False,
    inplace=True,
)

In [None]:
# Violin plots of the QC columns; 'pct_counts_mt' (computed here) is shown next to
# 'percent.mt' (from the metadata) so the two can be compared.
qc_columns = ['n_genes_by_counts', 'total_counts', 'pct_counts_mt', 'percent.mt']
sc.pl.violin(joanito, qc_columns, jitter=0.4, multi_panel=True)

In [None]:
# Same per-sample cell-count histogram as plotted before the QC-metrics cell.
# NOTE(review): no cells appear to be removed in between, so this looks redundant — confirm.
joanito.obs['sample.ID'].value_counts().hist();

### Scrublet is not run because some samples have a very low number of cells

In [None]:
# Keep a copy of the current X in the 'counts' layer; X itself is normalised and
# log-transformed further down.
joanito.layers['counts'] = joanito.X.copy()

In [None]:
# Shape before the cell filters below are applied.
joanito.shape

In [None]:
# Lower-bound cell filters (thresholds tagged "uhlitz" in the original), applied in order:
# first a minimum of total counts, then a minimum of detected genes.
# The matching upper bounds (max_counts=50000, max_genes=5000) are commented out in the
# original and are not applied.
for qc_lower_bound in ({'min_counts': 1000}, {'min_genes': 500}):
    sc.pp.filter_cells(joanito, **qc_lower_bound)

In [None]:
# Shape after the min_counts / min_genes cell filters.
joanito.shape

In [None]:
# Drop genes that are not detected in at least one of the remaining cells.
sc.pp.filter_genes(joanito, min_cells=1)

In [None]:
# Shape after the gene filter.
joanito.shape

In [None]:
# Library-size normalisation followed by a log1p transform of X.
# `normalize_total` replaces the deprecated `normalize_per_cell`. With the default
# target_sum=None every cell is scaled to the median total count, the same default target
# `normalize_per_cell` used; its extra min_counts=1 cell filter is a no-op here because the
# cells were already filtered with min_counts=1000.
sc.pp.normalize_total(joanito)
sc.pp.log1p(joanito)

In [None]:
# Cell-cycle scoring through a project helper (presumably star-imported from
# score_and_classify), given the signature directory defined near the top.
# NOTE(review): what the helper writes into `joanito` is not visible here — check its source.
score_cell_cycle(joanito, signatures_path_)

In [None]:
# Cells per sample after filtering.
joanito.obs['sample.ID'].value_counts()

In [None]:
# Cells per patient after filtering; 'patient.ID' is the batch key used for gene selection below.
joanito.obs['patient.ID'].value_counts()

In [None]:
# Select 2000 highly variable genes, batch-aware per patient.
# cannot use sample ID as batch key because some only have one cell
sc.pp.highly_variable_genes(
    joanito,
    n_top_genes=2000,
    batch_key='patient.ID',
)

In [None]:
# PCA with 50 components, restricted to the highly variable genes, then the
# explained-variance plot on a linear scale.
# NOTE(review): newer scanpy releases deprecate `use_highly_variable` in favour of
# `mask_var` — confirm against the installed version before upgrading.
sc.tl.pca(
    joanito,
    n_comps=50,
    svd_solver='arpack',
    use_highly_variable=True,
)
sc.pl.pca_variance_ratio(joanito, log=False)

In [None]:
# Neighbourhood graph on the first 30 of the 50 computed PCs, then the UMAP embedding.
sc.pp.neighbors(joanito, n_neighbors=50, n_pcs=30)
sc.tl.umap(joanito)

# Louvain first, then Leiden, both at resolution 1, each stored under its own obs key.
for run_clustering, obs_key in ((sc.tl.louvain, 'louvain'), (sc.tl.leiden, 'leiden')):
    run_clustering(joanito, key_added=obs_key, resolution=1)

#### Clean up a bit for later concatenation

In [None]:
# Fix the category order of the sample-origin column.
# The reordered Series is assigned back to the column: the `inplace=` argument of
# reorder_categories was deprecated in pandas 1.3 and removed in pandas 2.0, and plain
# assignment works on every pandas version.
joanito.obs['sample.origin'] = joanito.obs['sample.origin'].cat.reorder_categories(
    ['Normal', 'Tumor', 'Tumor-2', 'LymphNode']
)

In [None]:
# Derive 'MS_status' from 'msi', relabelling the 'MSI-H' category as 'MSI'.
msi_relabel = {'MSI-H': 'MSI'}
joanito.obs['MS_status'] = joanito.obs['msi'].cat.rename_categories(msi_relabel)

In [None]:
# Counts per MS_status category; dropna=False also shows how many cells have no value.
joanito.obs['MS_status'].value_counts(dropna=False)

In [None]:
# Add dot-free aliases of the identifier columns (new column -> existing source column).
obs_aliases = {
    'sample': 'sample.ID',
    'patient': 'patient.ID',
    'sample_origin': 'sample.origin',
}
for alias, source_column in obs_aliases.items():
    joanito.obs[alias] = joanito.obs[source_column]

# Save the processed object. The target directory is created first so that a missing
# 'iCMS' folder cannot make the notebook fail at its very last step.
processed_path = Path(new_data_folder) / 'iCMS' / '202305_joanito_epi_mtx_processed.h5'
processed_path.parent.mkdir(parents=True, exist_ok=True)
joanito.write(processed_path)