## Clustering Ulrich dataset

In [None]:
import numpy as np
import pandas as pd
import scanpy as sc
import scanpy.external as sce
from matplotlib import pyplot as plt

In [None]:
# Set scanpy's logging verbosity to level 3, then print the scanpy header
# so the run's environment is recorded alongside the notebook output.
sc.settings.verbosity = 3
sc.logging.print_header()

In [None]:
# Load the combined Ulrich dataset from its .h5ad file.
ulrich = sc.read_h5ad(filename='/home/j87832lw/oviduct/Ulrich/ulrich_all.h5ad')

In [None]:
# Make gene names unique so var_names can be used as unambiguous keys.
ulrich.var_names_make_unique()

# Run scrublet doublet detection on this notebook's dataset. The call must
# target `ulrich`; no other AnnData object exists at this point.
sce.pp.scrublet(ulrich)

# Show the AnnData summary as the cell output.
ulrich

In [None]:
# Plot the 20 most highly expressed genes before any filtering is applied.
sc.pl.highest_expr_genes(ulrich, n_top=20)

# Drop cells with fewer than 200 detected genes, then genes seen in fewer
# than 3 cells.
sc.pp.filter_cells(ulrich, min_genes=200)
sc.pp.filter_genes(ulrich, min_cells=3)

# Flag genes whose name starts with 'MT-' and compute per-cell QC metrics,
# including the percentage of counts coming from those flagged genes.
ulrich.var['mt'] = ulrich.var_names.str.startswith('MT-')
sc.pp.calculate_qc_metrics(
    ulrich,
    qc_vars=['mt'],
    percent_top=None,
    log1p=False,
    inplace=True,
)

# Violin plots of the three QC metrics used for thresholding below.
sc.pl.violin(
    ulrich,
    ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'],
    jitter=0.4,
    multi_panel=True,
)

In [None]:
# Scatter each QC metric against total counts to choose filter thresholds.
for y_metric in ('pct_counts_mt', 'n_genes_by_counts'):
    sc.pl.scatter(ulrich, x='total_counts', y=y_metric)

In [None]:
# Keep only cells that pass all three QC thresholds: fewer than 5500 detected
# genes, fewer than 37500 total counts, and under 5% mitochondrial counts.
passes_qc = (
    (ulrich.obs.n_genes_by_counts < 5500)
    & (ulrich.obs.total_counts < 37500)
    & (ulrich.obs.pct_counts_mt < 5)
)
ulrich = ulrich[passes_qc, :]

# Show the (cells, genes) dimensions after filtering.
ulrich.shape

In [None]:
# Take an independent copy of the cells whose 'Source' is 'Surgical'.
is_surgical = ulrich.obs['Source'] == 'Surgical'
surgical_data = ulrich[is_surgical, :].copy()

# Normalise every cell to a total of 10,000 counts.
sc.pp.normalize_total(surgical_data, target_sum=1e4)

# Show the AnnData summary as the cell output.
surgical_data

In [None]:
# Log-transform the normalised counts.
sc.pp.log1p(surgical_data)

# Annotate highly variable genes using mean/dispersion cut-offs, then plot
# the selection.
sc.pp.highly_variable_genes(
    surgical_data,
    min_mean=0.0125,
    max_mean=3,
    min_disp=0.5,
)
sc.pl.highly_variable_genes(surgical_data)

In [None]:
# Keep the full log-normalised matrix in `.raw` before subsetting genes, so
# genes outside the highly variable set remain available afterwards.
surgical_data.raw = surgical_data

# Restrict to the highly variable genes. The explicit `.copy()` turns the
# slice into a standalone AnnData object: the in-place steps below would
# otherwise modify a view and depend on an implicit copy being made.
surgical_data = surgical_data[:, surgical_data.var.highly_variable].copy()

# Regress out total counts and mitochondrial percentage, then scale each gene
# with values clipped at 10.
sc.pp.regress_out(surgical_data, ['total_counts', 'pct_counts_mt'])
sc.pp.scale(surgical_data, max_value=10)

# PCA on the scaled matrix; colour the PCA plot by OVGP1 expression.
sc.tl.pca(surgical_data, svd_solver='arpack')
sc.pl.pca(surgical_data, color='OVGP1')

In [None]:
# Integrate across the 'Run' column with Harmony, then overwrite the PCA
# embedding with the Harmony-adjusted one so the neighbour graph below is
# built on the integrated representation.
sce.pp.harmony_integrate(surgical_data, key='Run')
surgical_data.obsm['X_pca'] = surgical_data.obsm['X_pca_harmony']

# Neighbour graph on the first 8 components, UMAP embedding, Leiden clusters.
sc.pp.neighbors(surgical_data, n_neighbors=10, n_pcs=8)
sc.tl.umap(surgical_data)
sc.tl.leiden(surgical_data, resolution=0.49)

# UMAP coloured by cluster, run, and four genes.
umap_colours = ['leiden', 'Run', 'OVGP1', 'SNTN', 'FOXJ1', 'PAX8']
sc.pl.umap(surgical_data, color=umap_colours)

In [None]:
# Manual annotation: cell type -> Leiden cluster labels assigned to it.
# NOTE(review): cluster '8' is listed under both 'Secretory' and
# 'Endothelial'. The later entry ('Endothelial') wins, as it did with the
# previous loop-based assignment -- confirm which label is intended.
cell_dict = {
    'Secretory': ['0', '12', '15', '8'],
    'T-cell': ['1', '6', '18'],
    'Fibroblast': ['2', '3', '4'],
    'Ciliated': ['11'],
    'Monocytes': ['9'],
    'Endothelial': ['7', '8', '17'],
    'Smooth Muscle': ['5', '10'],
    'Mast': ['13'],
    'Plasma/B-cell': ['14'],
    'Dendritic': ['16'],
}

# Invert to cluster label -> cell type; for a duplicated cluster the last
# cell type listed above takes precedence.
cluster_to_cell_type = {
    cluster: cell_type
    for cell_type, clusters in cell_dict.items()
    for cluster in clusters
}

# The Leiden clustering and UMAP were computed on `surgical_data`, so the
# annotation and the plot must use that object. Mapping in one step avoids
# writing strings into a float NaN column; clusters without an entry in
# `cell_dict` are left as NaN.
surgical_data.obs['Cell_Types'] = (
    surgical_data.obs['leiden']
    .astype(str)
    .map(cluster_to_cell_type)
    .astype('category')
)

sc.pl.umap(
    surgical_data,
    color=['Cell_Types'],
    legend_loc='right margin',
    legend_fontsize=14,
    legend_fontoutline=2,
    save='ulrichumap.png',
)

In [None]:
# Save the clustered object. `surgical_data` is the AnnData that carries the
# Harmony embedding, UMAP and Leiden clusters computed above; `ulrich` only
# holds the QC-filtered counts.
results_file = '/home/j87832lw/oviduct/Ulrich/write/ulrich_clustering_renamedleidengroups.h5ad'
surgical_data.write(results_file)