## Clustering the Dinh dataset

In [None]:
import numpy as np
import pandas as pd
import scanpy as sc
import scanpy.external as sce
from matplotlib import pyplot as plt
import scvelo as scv

In [None]:
# Verbosity 3 is scanpy's "hints" level (0 errors, 1 warnings, 2 info, 3 hints).
sc.settings.verbosity = 3

# Log the versions of scanpy and its main dependencies for reproducibility.
sc.logging.print_header()

In [None]:
# The input path carries an .h5ad extension, so it is read with the AnnData
# HDF5 reader; the previous `sc.read_loom` call expects a Loom-layout file.
# NOTE(review): if this file is really a Loom file stored under an .h5ad name,
# rename the file and switch back to `sc.read_loom` instead.
dinh = sc.read_h5ad('/home/j87832lw/mounting/oviduct/data/1a/Dinh.h5ad')

In [None]:
# Make duplicated gene symbols unique before anything indexes by gene name.
dinh.var_names_make_unique()

# Doublet detection on the unfiltered matrix; scores and calls are stored in
# dinh.obs.
# NOTE(review): the doublet calls are not used to remove cells anywhere in
# this section - confirm whether that is intentional.
sce.pp.scrublet(dinh)

# Overview of the genes that take up the largest share of counts.
sc.pl.highest_expr_genes(dinh, n_top=20)

# Basic filtering: drop near-empty cells and genes seen in fewer than 3 cells.
sc.pp.filter_cells(dinh, min_genes=200)
sc.pp.filter_genes(dinh, min_cells=3)

# Flag genes whose symbol starts with 'MT-' so per-cell QC metrics for that
# gene set (e.g. pct_counts_mt) are computed alongside the global ones.
dinh.var['mt'] = dinh.var_names.str.startswith('MT-')
sc.pp.calculate_qc_metrics(
    dinh,
    qc_vars=['mt'],
    percent_top=None,
    log1p=False,
    inplace=True,
)

# Per-cell distributions of the three metrics used for thresholding below.
sc.pl.violin(
    dinh,
    ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'],
    jitter=0.4,
    multi_panel=True,
)

In [None]:
# Plot each QC metric against library size to eyeball sensible cut-offs.
for qc_metric in ('pct_counts_mt', 'n_genes_by_counts'):
    sc.pl.scatter(dinh, x='total_counts', y=qc_metric)

In [None]:
# Keep only cells that pass all three QC cut-offs. The thresholds are combined
# into a single boolean mask and the subset is copied, so `dinh` is a real
# AnnData object rather than a view of a view; the in-place normalisation that
# follows would otherwise have to modify a view.
passes_qc = (
    (dinh.obs['n_genes_by_counts'] < 5000)
    & (dinh.obs['total_counts'] < 250000)
    & (dinh.obs['pct_counts_mt'] < 5)
)
dinh = dinh[passes_qc.to_numpy(), :].copy()

# Displayed as the cell output: (cells, genes) remaining after filtering.
dinh.shape

In [None]:
# Normalise every cell to a total of 10,000 counts, then log1p-transform.
sc.pp.normalize_total(dinh, target_sum=1e4)
sc.pp.log1p(dinh)

# Flag highly variable genes (HVGs) and show the mean/dispersion diagnostic.
sc.pp.highly_variable_genes(dinh, min_mean=0.0125, max_mean=3, min_disp=0.5)
sc.pl.highly_variable_genes(dinh)

# Freeze the log-normalised, all-gene matrix in `.raw` before discarding the
# non-HVG genes. Without this, any gene outside the HVG set can no longer be
# looked up when colouring embeddings by gene name later on.
dinh.raw = dinh

# Restrict to HVGs. `.copy()` turns the slice into a standalone object so that
# `sc.pp.scale` does not have to modify a view in place.
dinh = dinh[:, dinh.var.highly_variable].copy()

# Scale each gene to unit variance and zero mean, clipping values above 10.
sc.pp.scale(dinh, max_value=10)

In [None]:
# Harmony corrects an existing PCA embedding: it reads dinh.obsm['X_pca'] by
# default. No PCA is computed anywhere earlier in this notebook, so compute it
# here on the scaled HVG matrix; otherwise the call below either fails with a
# missing key or silently reuses an embedding shipped with the input file.
sc.tl.pca(dinh, svd_solver='arpack')

# Batch-correct the principal components across sequencing runs ('Run' in obs).
sce.pp.harmony_integrate(dinh, 'Run')

# Overwrite X_pca with the corrected embedding so that every downstream step
# that defaults to X_pca uses the Harmony-adjusted components.
dinh.obsm['X_pca'] = dinh.obsm['X_pca_harmony']

# kNN graph on the first 8 corrected components, then UMAP and Leiden.
sc.pp.neighbors(dinh, n_neighbors=10, n_pcs=8)
sc.tl.umap(dinh)
sc.tl.leiden(dinh, resolution=0.5)

In [None]:
# Clusters next to the sample metadata, to check how well batches are mixed.
sc.pl.umap(dinh, color=['leiden', 'Patient', 'Run'])

# Genes inspected alongside the clusters. Each gene is listed once: the
# previous list repeated 'CD3E' and 'CD4', which drew identical panels twice.
marker_genes = [
    'JCHAIN', 'KLRC1', 'CD4', 'CD8A', 'CD3E', 'CD3G',
    'OVGP1', 'SNTN', 'CCL14', 'COL1A1', 'PTPRC', 'CAPS',
    'TRBC1', 'DCN', 'TRAC', 'TPSAB1', 'LYZ',
]
sc.pl.umap(dinh, color=['leiden', *marker_genes])

In [None]:
# Manual annotation: cell type -> Leiden cluster labels assigned to it.
cell_dict = {
    'T-cell': ['0', '1', '3', '8'],
    'Fibroblast': ['2'],
    'Ciliated': ['4', '5'],
    'Secretory': ['6'],
    'Monocytes': ['7'],
    'Endothelial': ['9'],
    'Smooth Muscle': ['10'],
    'Mast': ['11'],
    'Plasma/B-cell': ['12'],
}

# Invert the mapping to cluster label -> cell type so the whole column can be
# filled with a single lookup.
cluster_to_type = {}
for cell_type, clusters in cell_dict.items():
    for cluster in clusters:
        cluster_to_type[cluster] = cell_type

# Map every cell's Leiden label to its cell type. Clusters missing from
# `cell_dict` come out as NaN, as before. Building the column in one step
# avoids creating a float NaN column and then writing strings into it, which
# pandas treats as an incompatible-dtype assignment.
dinh.obs['Cell_Types'] = (
    dinh.obs['leiden'].astype(str).map(cluster_to_type).astype('category')
)

In [None]:
# Annotated cell types next to the raw Leiden clusters, labels drawn on the
# embedding. `save` also writes the figure to disk; the string is appended to
# scanpy's default file name for this plot type.
sc.pl.umap(
    dinh,
    color=['Cell_Types', 'leiden'],
    legend_loc='on data',
    legend_fontsize=6,
    save='dinh_labeled.pdf',
)

# Cell types only, with a larger legend placed in the right margin.
sc.pl.umap(
    dinh,
    color=['Cell_Types'],
    legend_loc='right margin',
    legend_fontsize=14,
    save='dinhumap.pdf',
)


In [None]:
# Persist the clustered and annotated object as an .h5ad file.
results_file = '/home/j87832lw/mounting/oviduct/data/1a/dinh_clustering_renamedleidengroups.h5ad'
dinh.write_h5ad(results_file)