# YanLing single-cell analysis

In [None]:
import gzip
import itertools as itl
import os
import subprocess

from cellassign import assign_cats
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scanpy as sc
import scanpy.external as sce
import seaborn as sns
import triku as tk

In [None]:
# Colormap used to display gene expression on UMAPs

# Sample 'magma' at 80 evenly spaced positions and keep the first 65 of them;
# the first entry is swapped for light grey (presumably so that cells with no
# expression fade into the background -- confirm).
magma = [(0.88, 0.88, 0.88, 1), *map(plt.get_cmap('magma'), np.linspace(0, 1, 80)[1:65])]
magma = mpl.colors.LinearSegmentedColormap.from_list("", magma)

In [None]:
# Render every figure at 250 dpi
mpl.rc('figure', dpi=250)

In [None]:
def plot_batch_abundance(adatax, plot_status=False):
    """Bar-plot the size-corrected share of every batch within each Leiden cluster.

    For a batch ``s`` and a cluster ``c`` the plotted value is

        (N / (S * N_s)) * n(s, c) / n(c)

    where ``N`` is the total number of cells, ``S`` the number of batches,
    ``N_s`` the number of cells of batch ``s``, ``n(c)`` the size of the
    cluster and ``n(s, c)`` the number of cells of the batch in the cluster.
    A value of ``1 / S`` therefore means that the batch is represented in the
    cluster exactly in proportion to its overall size.

    Parameters
    ----------
    adatax
        Object whose ``.obs`` has the columns ``'batch'`` and ``'leiden'``.
    plot_status
        If True, draw a second panel in which the bars are grouped by the
        first character of the batch name instead of by the batch itself.

    Returns
    -------
    None. The figure is drawn with matplotlib / seaborn.
    """
    # Work on plain arrays: counting with boolean masks avoids building an
    # AnnData view for every (batch, cluster) combination.
    batch = np.asarray(adatax.obs['batch'])
    leiden = np.asarray(adatax.obs['leiden'])

    def cluster_key(label):
        # Numeric labels are ordered by value ('2' before '10'); anything else
        # is placed afterwards in alphabetical order.
        text = str(label)
        return (0, int(text), text) if text.isdecimal() else (1, 0, text)

    samples = sorted(dict.fromkeys(batch))
    clusters = sorted(dict.fromkeys(leiden), key=cluster_key)

    in_sample = {sample: batch == sample for sample in samples}
    in_cluster = {cluster: leiden == cluster for cluster in clusters}

    # Weight that compensates for batches having different numbers of cells
    correction_factor = {sample: len(batch) / (len(samples) * np.sum(in_sample[sample])) for sample in samples}

    rows = [
        (cluster,
         correction_factor[sample] * np.sum(in_cluster[cluster] & in_sample[sample]) / np.sum(in_cluster[cluster]),
         sample)
        for sample in samples
        for cluster in clusters
    ]
    df = pd.DataFrame(rows, columns=['x', 'y', 'hue'])

    if plot_status:
        # First character of the batch name, e.g. 'K' for KOD11 and 'W' for WT1
        df['hue_status'] = [i[0] for i in df['hue']]
        fig, axs = plt.subplots(1, 2, figsize=(16,4))
        sns.barplot(x='x', y='y', hue='hue', data=df, ax=axs[0])
        sns.barplot(x='x', y='y', hue='hue_status', data=df, ax=axs[1])
    else:
        fig, axs = plt.subplots(1, 1, figsize=(8,4))
        sns.barplot(x='x', y='y', hue='hue', data=df, ax=axs)

## Dataset processing

The reads were produced with the 10X v3 library preparation kit: read 1 carries a 16 bp cell barcode followed by a 12 bp UMI (28 bp in total), and read 2 carries the 91 bp cDNA read.
To process the files we first trim the reads to those lengths and then process them with `loompy fromfq`.

In [None]:
# Directory with the FASTQ files, located inside the current working directory
fastq_dir = f'{os.getcwd()}/fastq' 
# Machine-specific path that is handed to `loompy fromfq` further down
mouse_gencode_dir = "/media/seth/SETH_DATA/SETH_Alex/Programs/mouse_GRCm38_gencode.v31"

In [None]:
# Sample names; they are used as sub-directory names, file prefixes and batch labels
prefixes = ['KOD11', 'KOD12', 'WT1', 'WT2']

In [None]:
# Sample sheet passed to `loompy fromfq`: one row per sample with its
# chemistry and the number of cells to target.
df = pd.DataFrame({
    'name': prefixes,
    'technology': ['10xv3'] * len(prefixes),
    'targetnumcells': [1000] * len(prefixes),
})
# index=False is the documented way to leave the row index out of the file
df.to_csv(f'{fastq_dir}/metadata.tab', sep='\t', index=False)

In [None]:
def _trim_fastq(path_in, path_out, max_len):
    """Copy a gzipped FASTQ file, truncating sequence and quality lines.

    A FASTQ record spans four lines (header, sequence, separator, quality).
    The header and separator are copied unchanged; the sequence and quality
    lines are cut to their first ``max_len`` characters.

    Parameters
    ----------
    path_in
        Path of the gzipped FASTQ file to read.
    path_out
        Path of the gzipped FASTQ file to write (overwritten if present).
    max_len
        Number of bases / quality characters to keep per read.
    """
    # compresslevel=6 is the default level of the gzip command-line tool
    with gzip.open(path_in, 'rt') as fin, gzip.open(path_out, 'wt', compresslevel=6) as fout:
        # Iterating the file stops at EOF, so no empty line is appended
        for line_idx, line in enumerate(fin):
            line = line.rstrip('\n')
            if line_idx % 4 in (1, 3):  # sequence and quality lines
                line = line[:max_len]
            fout.write(line + '\n')


for filename_root in prefixes:
    sample_dir = f'{fastq_dir}/{filename_root}'
    trimmed = {}

    # R1 holds the 16 bp cell barcode + 12 bp UMI (28 bp); R2 holds the 91 bp read.
    # Each file is processed on its own, from its first to its last line.
    for read, max_len in (('R1', 28), ('R2', 91)):
        trimmed[read] = f'{sample_dir}/{filename_root}_L001_{read}_001.fastq.gz'
        _trim_fastq(f'{sample_dir}/long_{filename_root}_L001_{read}_001.fastq.gz', trimmed[read], max_len)

    # Argument list instead of a shell string (paths with spaces stay intact);
    # check=True raises if loompy fails instead of silently continuing.
    subprocess.run(
        ['loompy', 'fromfq', f'{filename_root}.loom', filename_root, mouse_gencode_dir,
         'metadata.tab', trimmed['R1'], trimmed['R2']],
        cwd=fastq_dir, check=True,
    )

## Load adatas

In [None]:
# Random seed passed as random_state to the scanpy calls below
seed = 0

In [None]:
# Read the loom file of every sample and make duplicated gene names unique
adata_KOD11 = sc.read(f"{fastq_dir}/KOD11.loom")
adata_KOD11.var_names_make_unique()

adata_KOD12 = sc.read(f"{fastq_dir}/KOD12.loom")
adata_KOD12.var_names_make_unique()

adata_WT1 = sc.read(f"{fastq_dir}/WT1.loom")
adata_WT1.var_names_make_unique()

adata_WT2 = sc.read(f"{fastq_dir}/WT2.loom")
adata_WT2.var_names_make_unique()

In [None]:
# Merge the four samples into one object; the sample of origin is stored in obs['batch'].
# NOTE(review): AnnData.concatenate is reported as deprecated in recent anndata
# releases in favour of anndata.concat -- confirm against the installed version.
adata_all = sc.AnnData.concatenate(adata_KOD11, adata_KOD12, adata_WT1, adata_WT2, batch_categories=['KOD11', 'KOD12', 'WT1', 'WT2'])

In [None]:
# Sample status = first letter of the batch name ('K' for KOD11/KOD12, 'W' for WT1/WT2)
adata_all.obs['status'] = [i[0] for i in adata_all.obs['batch']]

In [None]:
# One 'True'/'False' string column per sample, with a grey/red palette stored
# under the matching '<column>_colors' key in .uns (used when highlighting one sample on a UMAP)
for prefix in prefixes:
    adata_all.obs[f'is_{prefix}'] = (adata_all.obs['batch'] == prefix).astype(str)
    adata_all.uns[f'is_{prefix}_colors'] = ['#bcbcbc', '#bc0000']

In [None]:
# Display a summary of the merged object
adata_all

In [None]:
# Basic QC metrics
# Genes whose name starts with 'mt-' are flagged as mitochondrial
adata_all.var['mt'] = adata_all.var_names.str.startswith('mt-')  # annotate the group of mitochondrial genes as 'mt'
# Adds per-cell metrics to .obs, among them the n_genes_by_counts, total_counts
# and pct_counts_mt columns that are plotted and filtered on below
sc.pp.calculate_qc_metrics(adata_all, qc_vars=['mt'], percent_top=None, inplace=True)

In [None]:
# Distribution of the three QC metrics over all cells
sc.pl.violin(adata_all, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'],
             jitter=0.4, multi_panel=True)

# Relationship between sequencing depth and the other two metrics
sc.pl.scatter(adata_all, x='total_counts', y='pct_counts_mt')
sc.pl.scatter(adata_all, x='total_counts', y='n_genes_by_counts')

In [None]:
# Per-sample distribution of log1p(percentage of mitochondrial counts)
fig, ax = plt.subplots(1, 1, figsize=(10, 4))
df = pd.DataFrame({'x': adata_all.obs['batch'], 'y': np.log1p(adata_all.obs['pct_counts_mt'])})
sns.violinplot(x='x', y='y', data=df, ax=ax)

In [None]:
# Per-sample distribution of log1p(number of detected genes); the per-sample
# thresholds of the filtering cell below are expressed on this scale
fig, ax = plt.subplots(1, 1, figsize=(10, 4))
df = pd.DataFrame({'x': adata_all.obs['batch'], 'y': adata_all.obs['log1p_n_genes_by_counts']})
sns.violinplot(x='x', y='y', data=df, ax=ax)

In [None]:
# Keep the cells whose log1p(number of detected genes) lies inside a
# sample-specific window (KOD11 uses a wider window than the other samples).
adata_all = adata_all[
(((adata_all.obs.batch == 'KOD11') & (adata_all.obs.log1p_n_genes_by_counts < 8.2) & (adata_all.obs.log1p_n_genes_by_counts > 5.5)) |
 ((adata_all.obs.batch == 'KOD12') & (adata_all.obs.log1p_n_genes_by_counts < 8.0) & (adata_all.obs.log1p_n_genes_by_counts > 6.5)) |
 ((adata_all.obs.batch == 'WT1')   & (adata_all.obs.log1p_n_genes_by_counts < 8.0) & (adata_all.obs.log1p_n_genes_by_counts > 6.5)) |
 ((adata_all.obs.batch == 'WT2')   & (adata_all.obs.log1p_n_genes_by_counts < 8.0) & (adata_all.obs.log1p_n_genes_by_counts > 6.5)))]
# Drop cells with 10 % or more mitochondrial counts. .copy() turns the
# (view of a view) result into an independent AnnData, so the in-place
# preprocessing that follows does not have to rely on an implicit copy.
adata_all = adata_all[adata_all.obs.pct_counts_mt < 10, :].copy()

In [None]:
# Remove genes that have no count left after cell filtering, then normalise
# the total counts per cell (scanpy defaults) and apply log1p
sc.pp.filter_genes(adata_all, min_counts=1)
sc.pp.normalize_total(adata_all)
sc.pp.log1p(adata_all)

## "Fast" cell type identification

In [None]:
# Marker genes per cell type (up to 5 each), used by assign_cats
# For immune cell types I used http://rstats.immgen.org/Skyline/skyline.html
#
# Keys are the labels shown in the plots ('$^+$' renders as a superscript plus).
# Every gene is listed once per cell type.

dict_celltypes = {'Keratinocyte': ['Lgals7', 'Dmkn', 'S100a14', 'Perp', 'Dsc3'], 
                  # 'Ldhb' was listed twice here; NOTE(review): a fifth marker can be added
                  'Keratinocyte Krt79$^+$': ['Apoc1', 'Ldhb', 'Krt79', 'Cidea'],
                  'Fibroblast': ['Lum', 'Pdgfra', 'Mfap2', 'Mfap5', 'Clec3b'],
                  'Fibroblast Rab37$^+$': ['Rab37', 'Col22a1', 'F13a1', 'Htra4', 'Tspan15'],
                  # Label spelled after its marker gene Cfh (was 'Chf')
                  'Fibroblast Cfh$^+$': ['Cfh', 'Alpl', 'Lifr', 'Sp7', 'Spp1'],
                  'Fibroblast Il1rl1$^+$': ['Il1rl1', 'Ptgs2', 'Nr4a2', 'Gxylt2', 'Lum'],
                  'Fibroblast Serpine2$^+$': ['Serpine2', 'Shox2', 'Wif1', 'Gm48159', 'Col23a1'],
                  'Chondrocyte?': ['Col9a1', 'Col9a2', 'Scrg1', 'Hapln1', 'Trpv4'],
                  'Endothelial': ['Pecam1', 'Cldn5', 'Cdh5', 'Ptprb', 'Tie1'],
                  'Lymphatic': ['Mmrn1', 'Ccl21a', 'Prox1', 'Lyve1', 'Flt4'],
                  'Perivascular cell Inpp4b$^+$': ['Rgs5', 'Myh11', 'Aoc3', 'Inpp4b', 'Mrvi1'],
                  'Perivascular cell Il6$^+$': ['Rgs5', 'Myh11', 'Il6', 'Procr', 'Ngf'],
                  'Schwann cell': ['Prx', 'Mbp', 'Mpz', 'Ncmap', 'Cldn19'], 
                  'Glial cell': ['Fgl2', 'Cdh19', 'Adam23', 'Fcgr2b', 'Rxrg'], 
                  # Only three markers are listed for melanocytes
                  'Melanocyte': ['Pmel', 'Mlana', 'Dct'],
                  'Skeletal muscle': ['Msc', 'Myod1', 'Cdh15', 'Peg3', 'Dag1'], 
                  'Red blood cell': ['Hba-a1', 'Hbb-bt', 'Hbb-bs', 'Car2', 'Rhd'],
                  'T cell': ['Cd3d', 'Cd3e', 'Ifngr1', 'Klf2', 'Cd27'],
                  'T cell (ILC/gd)?': ['Cd7', 'Cd3e', 'Ctsw', 'Cd3d', 'Cd3g'],
                  'B cell': ['Rrm2', 'Rpa3', 'Cd79b', 'Dntt', 'Cd79a'],
                  'Plasma cell': ['Ighm', 'Igkc', 'Cd79b', 'Iglc1', 'Iglc2'],
                  'NK cell': ['Cd3d', 'Cd3e', 'Nkg7', 'Klrk1', 'Trdv4'],
                  'Macrophage': ['C1qa', 'C1qc', 'Wfdc17', 'Pf4', 'Folr2'],
                  'Monocyte': ['Wfdc17', 'Csf1r', 'F10', 'Ly6c2', 'Gsr'],
                  'Neutrophil': ['S100a8', 'S100a9', 'Camp', 'Ltf', 'Chil3'],
                  'Dendritic cell': ['Cd209a', 'Irf5', 'Plbd1', 'Aif1', 'Cd209d'],
                  'Langerhans cell': ['Cd207', 'Mfge8', 'Cd74', 'Il1r2', 'Tnfaip2'],
                  'Mast cell': ['Cpa3', 'Cyp11a1', 'Cma1', 'Mcpt4', 'Tpsb2']
                  }

### WT1

In [None]:
# WT1 on its own: embed, over-cluster and annotate the cells with dict_celltypes.
# .copy() makes the subset an independent AnnData; without it filter_genes
# would write into a view of adata_all and rely on an implicit copy.
adata_WT1 = adata_all[adata_all.obs['batch'] == 'WT1'].copy()
sc.pp.filter_genes(adata_WT1, min_counts=1)

# 30-component PCA, then a cosine kNN graph whose neighbourhood size is half
# the square root of the number of cells
sc.pp.pca(adata_WT1, random_state=seed, n_comps=30)
sc.pp.neighbors(adata_WT1, random_state=seed, n_neighbors=int(0.5 * len(adata_WT1) ** 0.5), metric='cosine')
# NOTE(review): PCA and neighbours are not recomputed after triku in this cell,
# so UMAP and Leiden below use the graph built above -- confirm this is intended
tk.tl.triku(adata_WT1)

sc.tl.umap(adata_WT1, min_dist=0.3, random_state=seed)
# Deliberately high resolution: many small clusters
sc.tl.leiden(adata_WT1, resolution=13, random_state=seed)

# The assignment is stored in obs['cell_type']
assign_cats(adata_WT1, dict_cats=dict_celltypes, min_score=0.4, quantile_gene_sel=0.7, key_added='cell_type')

# fraction=1 keeps every cell; presumably used to shuffle the plotting order -- confirm
sc.pp.subsample(adata_WT1, fraction=1, random_state=seed, copy=False)
sc.pl.umap(adata_WT1, color=['log1p_n_genes_by_counts', 'leiden',], alpha=0.5, ncols=3, legend_loc='on data')
sc.pl.umap(adata_WT1, color=['cell_type'], ncols=3)

In [None]:
# Rank genes for Leiden cluster '30' only, and show the top 200 as a tracks plot
sc.tl.rank_genes_groups(adata_WT1, groupby='leiden', groups=['30'])
sc.pl.rank_genes_groups_tracksplot(adata_WT1, dendrogram=False, n_genes=200)

In [None]:
# For every cell type, plot the assigned labels next to the expression of its
# marker genes; markers that are not in this dataset are skipped
for key, val in dict_celltypes.items():
    print(key)
    sc.pl.umap(adata_WT1, color=['cell_type'] + [i for i in val if i in adata_WT1.var_names], legend_loc='on data', cmap=magma, use_raw=False, ncols=4) 

### WT2

In [None]:
# WT2 on its own: embed, over-cluster and annotate the cells with dict_celltypes.
# .copy() makes the subset an independent AnnData; without it filter_genes
# would write into a view of adata_all and rely on an implicit copy.
adata_WT2 = adata_all[adata_all.obs['batch'] == 'WT2'].copy()
sc.pp.filter_genes(adata_WT2, min_counts=1)

# 50-component PCA, then a cosine kNN graph whose neighbourhood size is half
# the square root of the number of cells
sc.pp.pca(adata_WT2, random_state=seed, n_comps=50)
sc.pp.neighbors(adata_WT2, random_state=seed, n_neighbors=int(0.5 * len(adata_WT2) ** 0.5), metric='cosine')
tk.tl.triku(adata_WT2)

sc.tl.umap(adata_WT2, min_dist=0.3, random_state=seed)
# Deliberately high resolution: many small clusters
sc.tl.leiden(adata_WT2, resolution=13, random_state=seed)

# Thresholds differ from the other samples (min_score=0.45, quantile_gene_sel=0.8);
# the assignment is stored in obs['cell_type']
assign_cats(adata_WT2, dict_cats=dict_celltypes, min_score=0.45, quantile_gene_sel=0.8, key_added='cell_type')

# fraction=1 keeps every cell; presumably used to shuffle the plotting order -- confirm
sc.pp.subsample(adata_WT2, fraction=1, random_state=seed, copy=False)
sc.pl.umap(adata_WT2, color=['log1p_n_genes_by_counts', 'leiden'], alpha=0.5, ncols=2, legend_loc='on data')
sc.pl.umap(adata_WT2, color=['cell_type'], ncols=2, cmap=magma)

In [None]:
# Rank genes for Leiden clusters '8' and '10', and show the top 150 as a tracks plot
sc.tl.rank_genes_groups(adata_WT2, groupby='leiden', groups=['8', '10'])
sc.pl.rank_genes_groups_tracksplot(adata_WT2, dendrogram=False, n_genes=150)

In [None]:
# For every cell type, plot the assigned labels next to the expression of its
# marker genes; markers that are not in this dataset are skipped
for key, val in dict_celltypes.items():
    print(key)
    sc.pl.umap(adata_WT2, color=['cell_type'] + [i for i in val if i in adata_WT2.var_names], legend_loc='on data', cmap=magma, use_raw=False, ncols=4) 

### KOD11

In [None]:
# KOD11 on its own: embed, over-cluster and annotate the cells with dict_celltypes.
# .copy() makes the subset an independent AnnData; without it filter_genes
# would write into a view of adata_all and rely on an implicit copy.
adata_KOD11 = adata_all[adata_all.obs['batch'] == 'KOD11'].copy()
sc.pp.filter_genes(adata_KOD11, min_counts=1)

# 50-component PCA, then a cosine kNN graph; unlike the other samples the
# neighbourhood size is the full square root of the number of cells
sc.pp.pca(adata_KOD11, random_state=seed, n_comps=50)
sc.pp.neighbors(adata_KOD11, random_state=seed, n_neighbors=int(len(adata_KOD11) ** 0.5), metric='cosine')
tk.tl.triku(adata_KOD11)

sc.tl.umap(adata_KOD11, min_dist=0.3, random_state=seed)
# Deliberately high resolution: many small clusters
sc.tl.leiden(adata_KOD11, resolution=11, random_state=seed)

# The assignment is stored in obs['cell_type']
assign_cats(adata_KOD11, dict_cats=dict_celltypes, min_score=0.4, quantile_gene_sel=0.7, key_added='cell_type')

# fraction=1 keeps every cell; presumably used to shuffle the plotting order -- confirm
sc.pp.subsample(adata_KOD11, fraction=1, random_state=seed, copy=False)
sc.pl.umap(adata_KOD11, color=['log1p_n_genes_by_counts', 'leiden',], alpha=0.5, ncols=3, legend_loc='on data')
sc.pl.umap(adata_KOD11, color=['cell_type'], ncols=3)

In [None]:
# Rank genes for the cells labelled 'Schwann cell', and show the top 100 as a tracks plot
sc.tl.rank_genes_groups(adata_KOD11, groupby='cell_type', groups=['Schwann cell'])
sc.pl.rank_genes_groups_tracksplot(adata_KOD11, dendrogram=False, n_genes=100)

In [None]:
# For every cell type, plot the assigned labels next to the expression of its
# marker genes; markers that are not in this dataset are skipped
for key, val in dict_celltypes.items():
    print(key)
    sc.pl.umap(adata_KOD11, color=['cell_type'] + [i for i in val if i in adata_KOD11.var_names], legend_loc='on data', cmap=magma, use_raw=False, ncols=4) 

### KOD12

In [None]:
# KOD12 on its own: embed, over-cluster and annotate the cells with dict_celltypes.
# .copy() makes the subset an independent AnnData; without it filter_genes
# would write into a view of adata_all and rely on an implicit copy.
adata_KOD12 = adata_all[adata_all.obs['batch'] == 'KOD12'].copy()
sc.pp.filter_genes(adata_KOD12, min_counts=1)

# 35-component PCA, then a cosine kNN graph whose neighbourhood size is half
# the square root of the number of cells
sc.pp.pca(adata_KOD12, random_state=seed, n_comps=35)
sc.pp.neighbors(adata_KOD12, random_state=seed, n_neighbors=int(0.5 * len(adata_KOD12) ** 0.5), metric='cosine')
tk.tl.triku(adata_KOD12)

sc.tl.umap(adata_KOD12, min_dist=0.5, random_state=seed)
# Deliberately high resolution: many small clusters
sc.tl.leiden(adata_KOD12, resolution=13, random_state=seed)

# The assignment is stored in obs['cell_type']
assign_cats(adata_KOD12, dict_cats=dict_celltypes, min_score=0.4, quantile_gene_sel=0.7, key_added='cell_type')

# fraction=1 keeps every cell; presumably used to shuffle the plotting order -- confirm
sc.pp.subsample(adata_KOD12, fraction=1, random_state=seed, copy=False)
sc.pl.umap(adata_KOD12, color=['log1p_n_genes_by_counts', 'leiden',], alpha=0.5, ncols=3, legend_loc='on data')
sc.pl.umap(adata_KOD12, color=['cell_type'], ncols=3)

In [None]:
# Rank genes for Leiden clusters '20' and '31' with the Wilcoxon method, and
# show the top 100 as a tracks plot
sc.tl.rank_genes_groups(adata_KOD12, groupby='leiden', groups=['20', '31'], method='wilcoxon')
sc.pl.rank_genes_groups_tracksplot(adata_KOD12, dendrogram=False, n_genes=100)

In [None]:
# For every cell type, plot the assigned labels next to the expression of its
# marker genes; markers that are not in this dataset are skipped
for key, val in dict_celltypes.items():
    print(key)
    sc.pl.umap(adata_KOD12, color=['cell_type'] + [i for i in val if i in adata_KOD12.var_names], legend_loc='on data', cmap=magma, use_raw=False, ncols=4) 

## WT dataset analysis

### Dataset analysis (WT-bbknn)

In [None]:
# Both WT samples, to be integrated with bbknn.
# .copy() makes the subset an independent AnnData, so the in-place steps that
# follow do not write into a view of adata_all.
adata_WT_bbknn = adata_all[adata_all.obs['batch'].isin(['WT1', 'WT2'])].copy()
sc.pp.filter_genes(adata_WT_bbknn, min_counts=1)

In [None]:
# PCA followed by the batch-balanced kNN graph (bbknn) over the 'batch' column.
# NOTE(review): neighbors_within_batch divides by len(prefixes) (4 samples)
# although this object only holds the two WT batches -- confirm this is intended.
sc.pp.pca(adata_WT_bbknn, random_state=seed, n_comps=50)
sce.pp.bbknn(adata_WT_bbknn, metric='angular', batch_key='batch', neighbors_within_batch=int(len(adata_WT_bbknn) ** 0.5 / len(prefixes)))
tk.tl.triku(adata_WT_bbknn)

In [None]:
# Embedding and coarse clustering (resolution 0.3) of the integrated WT data
sc.tl.umap(adata_WT_bbknn, min_dist=0.4, random_state=seed)
sc.tl.leiden(adata_WT_bbknn, resolution=0.3, random_state=seed)

In [None]:
# fraction=1 keeps every cell; presumably used to shuffle the plotting order -- confirm
sc.pp.subsample(adata_WT_bbknn, fraction=1, random_state=seed, copy=False)
sc.pl.umap(adata_WT_bbknn, color=['batch', 'log1p_n_genes_by_counts', 'leiden'], alpha=0.5)

In [None]:
# Expression of a panel of marker genes on the integrated WT UMAP; the
# trailing comments name the population each group of genes is meant to mark
sc.pl.umap(adata_WT_bbknn, color=['Lum', 'Pdgfra',  # Fb
                                  'Rgs5', 'Abcc9', 'Myl9',  # Peri
                                  'Pecam1', 'Cldn5', 'Rgcc',  # Endo
                                  'Krt5', 'Krt14', 'Krt17', 'Krt79', 'Krt10',  # Krt
                                  'Cd3e', 'Thy1',  # T cells
                                  'C1qa', 'Cd209f',  # MC
                                  'Cd209a', # DC
                                  'Cd207', 'Mfge8', # Langerhans
                                  'Cnp', 'Mbp', 'Mpz', # Schwann
                                  'Pmel', 'Tyr', 'Dct', # Mel1+Mel2
                                  'Hba-a1', 'Hbb-bt', # RBC
                                  'Des', 'Acta1', 'Cxcl1', # Skeletal muscle
                                 ], alpha=0.5, cmap=magma)

In [None]:
# Size-corrected share of WT1 and WT2 cells in every Leiden cluster
plot_batch_abundance(adata_WT_bbknn, plot_status=False)

In [None]:
# Rank genes for Leiden cluster '0' only, and show the top 50 as a tracks plot
sc.tl.rank_genes_groups(adata_WT_bbknn, groupby='leiden', groups=['0'])
sc.pl.rank_genes_groups_tracksplot(adata_WT_bbknn, dendrogram=False, n_genes=50)

### Dataset analysis (WT-harmony)

In [None]:
# Both WT samples, to be integrated with harmony.
# .copy() makes the subset an independent AnnData, so the in-place steps that
# follow do not write into a view of adata_all.
adata_WT_harmony = adata_all[adata_all.obs['batch'].isin(['WT1', 'WT2'])].copy()
sc.pp.filter_genes(adata_WT_harmony, min_counts=1)

In [None]:
# PCA, harmony correction over 'batch', then a cosine kNN graph built on the
# corrected representation ('X_pca_harmony') with sqrt(n_cells) neighbours
sc.pp.pca(adata_WT_harmony, random_state=seed, n_comps=50)
sce.pp.harmony_integrate(adata_WT_harmony, key='batch', max_iter_harmony=50, plot_convergence=True)
sc.pp.neighbors(adata_WT_harmony, random_state=seed, n_neighbors=int(len(adata_WT_harmony) ** 0.5), metric='cosine', use_rep='X_pca_harmony')
tk.tl.triku(adata_WT_harmony)

In [None]:
# Embedding and clustering (resolution 0.5) of the harmony-integrated WT data
sc.tl.umap(adata_WT_harmony, min_dist=0.5, random_state=seed)
sc.tl.leiden(adata_WT_harmony, resolution=0.5, random_state=seed)

In [None]:
# fraction=1 keeps every cell; presumably used to shuffle the plotting order -- confirm
sc.pp.subsample(adata_WT_harmony, fraction=1, random_state=seed, copy=False)
sc.pl.umap(adata_WT_harmony, color=['batch', 'leiden', 'log1p_n_genes_by_counts'], alpha=0.5)

In [None]:
# Expression of Msc and Cdh15 (both listed under 'Skeletal muscle' in dict_celltypes).
# The subsample call repeats the one of the previous cell.
sc.pp.subsample(adata_WT_harmony, fraction=1, random_state=seed, copy=False)
sc.pl.umap(adata_WT_harmony, color=['Msc', 'Cdh15'], alpha=0.5, cmap=magma)

In [None]:
# Size-corrected share of WT1 and WT2 cells in every Leiden cluster
plot_batch_abundance(adata_WT_harmony, plot_status=False)

### Dataset analysis (WT-scanorama)

In [None]:
# Both WT samples, to be integrated with scanorama.
# .copy() makes each subset an independent AnnData, so the in-place steps that
# follow do not operate on a view.
adata_WT_scanorama = adata_all[adata_all.obs['batch'].isin(['WT1', 'WT2'])].copy()
sc.pp.filter_genes(adata_WT_scanorama, min_counts=1)
# Order the cells by batch so that each batch forms one contiguous block, as
# scanpy's scanorama_integrate documentation asks for
adata_WT_scanorama = adata_WT_scanorama[adata_WT_scanorama.obs.sort_values(by='batch').index, :].copy()

In [None]:
# PCA, scanorama correction over 'batch', then a cosine kNN graph built on the
# corrected representation ('X_scanorama'); both use sqrt(n_cells) neighbours
sc.pp.pca(adata_WT_scanorama, random_state=seed, n_comps=50)
sce.pp.scanorama_integrate(adata_WT_scanorama, 'batch', knn=int(len(adata_WT_scanorama) ** 0.5))
sc.pp.neighbors(adata_WT_scanorama, random_state=seed, n_neighbors=int(len(adata_WT_scanorama) ** 0.5), metric='cosine', use_rep='X_scanorama')
tk.tl.triku(adata_WT_scanorama)

In [None]:
# Embedding and clustering (resolution 1.3) of the scanorama-integrated WT data
sc.tl.umap(adata_WT_scanorama, min_dist=0.5, random_state=seed)
sc.tl.leiden(adata_WT_scanorama, resolution=1.3, random_state=seed)

In [None]:
# fraction=1 keeps every cell; presumably used to shuffle the plotting order -- confirm
sc.pp.subsample(adata_WT_scanorama, fraction=1, random_state=seed, copy=False)
sc.pl.umap(adata_WT_scanorama, color=['batch', 'leiden', 'log1p_n_genes_by_counts'], alpha=0.5)

## Results on integration of WT datasets
All 3 methods seem to integrate the datasets correctly, and bbknn and harmony yield similar UMAPs. We will use bbknn for batch-effect correction of the WT dataset. In general, WT1 cells predominate, mainly because there are fewer WT2 cells. In several cases, however, the predominance of WT1 cells is much higher:
* A large cluster expressing *Col12a1*, *Col9a1* and *Snorc*, possibly fibroblasts.
* A small cluster expressing *(marker genes still to be filled in)*.

### Dataset analysis (KOD-bbknn)

In [None]:
# Both KOD samples, to be integrated with bbknn.
# .copy() makes the subset an independent AnnData, so the in-place steps that
# follow do not write into a view of adata_all.
adata_KOD = adata_all[adata_all.obs['batch'].isin(['KOD11', 'KOD12'])].copy()
sc.pp.filter_genes(adata_KOD, min_counts=1)

In [None]:
# PCA followed by the batch-balanced kNN graph (bbknn) over 'batch', here with
# a fixed number of 3 neighbours per batch
sc.pp.pca(adata_KOD, random_state=seed, n_comps=50)
sce.pp.bbknn(adata_KOD, metric='angular', batch_key='batch', neighbors_within_batch=3)
tk.tl.triku(adata_KOD)

In [None]:
# Embedding and clustering (resolution 1.3) of the bbknn-integrated KOD data
sc.tl.umap(adata_KOD, min_dist=0.5, random_state=seed)
sc.tl.leiden(adata_KOD, resolution=1.3, random_state=seed)

In [None]:
# fraction=1 keeps every cell; presumably used to shuffle the plotting order -- confirm
sc.pp.subsample(adata_KOD, fraction=1, random_state=seed, copy=False)

In [None]:
# UMAP coloured by sample and by Leiden cluster
sc.pl.umap(adata_KOD, color=['batch', 'leiden'], alpha=0.5)

In [None]:
# Marker panel on the KOD UMAP. The data are mapped to the mouse genome, so the
# mouse gene symbols are used instead of the upper-case human ones (Hbb-bs
# stands in for HBB, which has no identically named mouse gene). Genes that are
# not present in this dataset are skipped, as in the per-sample marker plots.
sc.pl.umap(adata_KOD,
           color=[gene for gene in ['Krt10', 'Krt13', 'Tmem238', 'Hbb-bs', 'Fam117a', 'Sox6']
                  if gene in adata_KOD.var_names],
           cmap=magma, ncols=3)

### Dataset analysis (KOD-harmony)

In [None]:
# Both KOD samples again, this time to be integrated with harmony.
# .copy() makes the subset an independent AnnData, so the in-place steps that
# follow do not write into a view of adata_all.
adata_KOD = adata_all[adata_all.obs['batch'].isin(['KOD11', 'KOD12'])].copy()
sc.pp.filter_genes(adata_KOD, min_counts=1)

In [None]:
# PCA, harmony correction over 'batch', then a cosine kNN graph built on the
# corrected representation ('X_pca_harmony') with 0.5 * sqrt(n_cells) neighbours
sc.pp.pca(adata_KOD, random_state=seed, n_comps=50)
sce.pp.harmony_integrate(adata_KOD, key='batch', max_iter_harmony=50, plot_convergence=True)
sc.pp.neighbors(adata_KOD, random_state=seed, n_neighbors=int(0.5 * len(adata_KOD) ** 0.5), metric='cosine', use_rep='X_pca_harmony')
tk.tl.triku(adata_KOD)

In [None]:
# Embedding and clustering (resolution 1.3) of the harmony-integrated KOD data
sc.tl.umap(adata_KOD, min_dist=0.2, random_state=seed)
sc.tl.leiden(adata_KOD, resolution=1.3, random_state=seed)

In [None]:
# UMAP coloured by sample and by Leiden cluster
sc.pl.umap(adata_KOD, color=['batch', 'leiden'], alpha=0.5)

In [None]:
# Marker panel on the KOD UMAP. The data are mapped to the mouse genome, so the
# mouse gene symbols are used instead of the upper-case human ones (Hbb-bs
# stands in for HBB, which has no identically named mouse gene). Genes that are
# not present in this dataset are skipped, as in the per-sample marker plots.
sc.pl.umap(adata_KOD,
           color=[gene for gene in ['Krt10', 'Krt13', 'Tmem238', 'Hbb-bs', 'Fam117a', 'Sox6']
                  if gene in adata_KOD.var_names],
           cmap=magma, ncols=3)

### Dataset analysis (KOD-scanorama)

In [None]:
# Both KOD samples again, this time to be integrated with scanorama.
# .copy() makes each subset an independent AnnData, so the in-place steps that
# follow do not operate on a view.
adata_KOD = adata_all[adata_all.obs['batch'].isin(['KOD11', 'KOD12'])].copy()
sc.pp.filter_genes(adata_KOD, min_counts=1)
# Order the cells by batch so that each batch forms one contiguous block, as
# scanpy's scanorama_integrate documentation asks for
adata_KOD = adata_KOD[adata_KOD.obs.sort_values(by='batch').index, :].copy()

In [None]:
# PCA, scanorama correction over 'batch' (knn fixed to 12), then a cosine kNN
# graph built on the corrected representation ('X_scanorama')
sc.pp.pca(adata_KOD, random_state=seed, n_comps=50)
sce.pp.scanorama_integrate(adata_KOD, 'batch', knn=12)
sc.pp.neighbors(adata_KOD, random_state=seed, n_neighbors=int(0.5 * len(adata_KOD) ** 0.5), metric='cosine', use_rep='X_scanorama')
tk.tl.triku(adata_KOD)

In [None]:
# Embedding and clustering (resolution 1.3) of the scanorama-integrated KOD data
sc.tl.umap(adata_KOD, min_dist=0.2, random_state=seed)
sc.tl.leiden(adata_KOD, resolution=1.3, random_state=seed)

In [None]:
# UMAP coloured by sample and by Leiden cluster
sc.pl.umap(adata_KOD, color=['batch', 'leiden'], alpha=0.5)

In [None]:
# Marker panel on the KOD UMAP. The data are mapped to the mouse genome, so the
# mouse gene symbols are used instead of the upper-case human ones (Hbb-bs
# stands in for HBB, which has no identically named mouse gene). Genes that are
# not present in this dataset are skipped, as in the per-sample marker plots.
sc.pl.umap(adata_KOD,
           color=[gene for gene in ['Krt10', 'Krt13', 'Tmem238', 'Hbb-bs', 'Fam117a', 'Sox6']
                  if gene in adata_KOD.var_names],
           cmap=magma, ncols=3)

In [None]:
# Rank genes for Leiden clusters '7' and '11', and show the top 35 as a tracks plot
sc.tl.rank_genes_groups(adata_KOD, groupby='leiden', groups=['7', '11'])
sc.pl.rank_genes_groups_tracksplot(adata_KOD, dendrogram=False, n_genes=35)

## Dataset analysis (whole-bbknn)

In [None]:
# Effect of bbknn's neighbors_within_batch: the next cells repeat the whole-dataset integration with decreasing values

In [None]:
# bbknn on all four samples with neighbors_within_batch = sqrt(n_cells) / number of samples
adata_all_bbknn = adata_all.copy()
sc.pp.pca(adata_all_bbknn, random_state=seed, n_comps=50)
sce.pp.bbknn(adata_all_bbknn, metric='angular', batch_key='batch', neighbors_within_batch=int(len(adata_all_bbknn) ** 0.5 / len(prefixes)))
tk.tl.triku(adata_all_bbknn)

sc.tl.umap(adata_all_bbknn, min_dist=0.3, random_state=seed)
sc.tl.leiden(adata_all_bbknn, resolution=1.3, random_state=seed)

# fraction=1 keeps every cell; presumably used to shuffle the plotting order -- confirm
sc.pp.subsample(adata_all_bbknn, fraction=1, random_state=seed, copy=False)
sc.pl.umap(adata_all_bbknn, color=['batch', 'status', 'log1p_n_genes_by_counts', 'leiden'], alpha=0.5, ncols=2)
# One panel per sample, coloured by its 'is_<sample>' flag column
sc.pl.umap(adata_all_bbknn, color=[f'is_{prefix}' for prefix in prefixes], alpha=0.5, ncols=2)

In [None]:
# Same as the base bbknn run, with neighbors_within_batch scaled by 0.5
adata_all_bbknn = adata_all.copy()
sc.pp.pca(adata_all_bbknn, random_state=seed, n_comps=50)
sce.pp.bbknn(adata_all_bbknn, metric='angular', batch_key='batch', neighbors_within_batch=int(0.5 * len(adata_all_bbknn) ** 0.5 / len(prefixes)))
tk.tl.triku(adata_all_bbknn)

sc.tl.umap(adata_all_bbknn, min_dist=0.3, random_state=seed)
sc.tl.leiden(adata_all_bbknn, resolution=1.3, random_state=seed)

# fraction=1 keeps every cell; presumably used to shuffle the plotting order -- confirm
sc.pp.subsample(adata_all_bbknn, fraction=1, random_state=seed, copy=False)
sc.pl.umap(adata_all_bbknn, color=['batch', 'status', 'log1p_n_genes_by_counts', 'leiden'], alpha=0.5, ncols=2)
# One panel per sample, coloured by its 'is_<sample>' flag column
sc.pl.umap(adata_all_bbknn, color=[f'is_{prefix}' for prefix in prefixes], alpha=0.5, ncols=2)

In [None]:
# Same as the base bbknn run, with neighbors_within_batch scaled by 0.1
adata_all_bbknn = adata_all.copy()
sc.pp.pca(adata_all_bbknn, random_state=seed, n_comps=50)
# int() truncates towards zero, so the scaled value can reach 0 on small
# datasets; max(1, ...) keeps at least one neighbour per batch
sce.pp.bbknn(adata_all_bbknn, metric='angular', batch_key='batch',
             neighbors_within_batch=max(1, int(0.1 * len(adata_all_bbknn) ** 0.5 / len(prefixes))))
tk.tl.triku(adata_all_bbknn)

sc.tl.umap(adata_all_bbknn, min_dist=0.3, random_state=seed)
sc.tl.leiden(adata_all_bbknn, resolution=1.3, random_state=seed)

# fraction=1 keeps every cell; presumably used to shuffle the plotting order -- confirm
sc.pp.subsample(adata_all_bbknn, fraction=1, random_state=seed, copy=False)
sc.pl.umap(adata_all_bbknn, color=['batch', 'status', 'log1p_n_genes_by_counts', 'leiden'], alpha=0.5, ncols=2)
# One panel per sample, coloured by its 'is_<sample>' flag column
sc.pl.umap(adata_all_bbknn, color=[f'is_{prefix}' for prefix in prefixes], alpha=0.5, ncols=2)

In [None]:
# Same as the base bbknn run, with neighbors_within_batch scaled by 0.05
adata_all_bbknn = adata_all.copy()
sc.pp.pca(adata_all_bbknn, random_state=seed, n_comps=50)
# int() truncates towards zero: with 4 samples the scaled value is below 1 for
# fewer than 6400 cells, so max(1, ...) keeps at least one neighbour per batch
sce.pp.bbknn(adata_all_bbknn, metric='angular', batch_key='batch',
             neighbors_within_batch=max(1, int(0.05 * len(adata_all_bbknn) ** 0.5 / len(prefixes))))
tk.tl.triku(adata_all_bbknn)

sc.tl.umap(adata_all_bbknn, min_dist=0.3, random_state=seed)
sc.tl.leiden(adata_all_bbknn, resolution=1.3, random_state=seed)

# fraction=1 keeps every cell; presumably used to shuffle the plotting order -- confirm
sc.pp.subsample(adata_all_bbknn, fraction=1, random_state=seed, copy=False)
sc.pl.umap(adata_all_bbknn, color=['batch', 'status', 'log1p_n_genes_by_counts', 'leiden'], alpha=0.5, ncols=2)
# One panel per sample, coloured by its 'is_<sample>' flag column
sc.pl.umap(adata_all_bbknn, color=[f'is_{prefix}' for prefix in prefixes], alpha=0.5, ncols=2)

In [None]:
# Fresh copy of the full dataset for the bbknn run that is analysed below
adata_all_bbknn = adata_all.copy()

In [None]:
# PCA followed by bbknn over 'batch' with sqrt(n_cells) / number of samples neighbours per batch
sc.pp.pca(adata_all_bbknn, random_state=seed, n_comps=50)
sce.pp.bbknn(adata_all_bbknn, metric='angular', batch_key='batch', neighbors_within_batch=int(len(adata_all_bbknn) ** 0.5 / len(prefixes)))
tk.tl.triku(adata_all_bbknn)

In [None]:
# Embedding and clustering (resolution 0.8) of the bbknn-integrated full dataset
sc.tl.umap(adata_all_bbknn, min_dist=0.3, random_state=seed)
sc.tl.leiden(adata_all_bbknn, resolution=0.8, random_state=seed)

In [None]:
# fraction=1 keeps every cell; presumably used to shuffle the plotting order -- confirm
sc.pp.subsample(adata_all_bbknn, fraction=1, random_state=seed, copy=False)
sc.pl.umap(adata_all_bbknn, color=['batch', 'status', 'leiden'], alpha=0.5)
# Same embedding with the labels drawn on top of the groups
sc.pl.umap(adata_all_bbknn, color=['status', 'leiden'], alpha=0.5, legend_loc='on data', ncols=2)

In [None]:
# Size-corrected share of every sample per Leiden cluster, plus a second panel grouped by status
plot_batch_abundance(adata_all_bbknn, plot_status=True)

## Dataset imbalance analysis (whole-harmony)

In [None]:
# Effect of harmony's sigma: the next cells repeat the whole-dataset integration with sigma = 0.1, 0.5 and 0.01

In [None]:
# harmony on all four samples with sigma=0.1
adata_all_harmony = adata_all.copy()
sc.pp.pca(adata_all_harmony, random_state=seed, n_comps=30)
sce.pp.harmony_integrate(adata_all_harmony, key='batch', max_iter_harmony=50, plot_convergence=True, sigma=0.1)
# kNN graph on the corrected representation with 0.5 * sqrt(n_cells) neighbours
sc.pp.neighbors(adata_all_harmony, random_state=seed, n_neighbors=int(0.5 * len(adata_all_harmony) ** 0.5), metric='cosine', use_rep='X_pca_harmony')
tk.tl.triku(adata_all_harmony)

sc.tl.umap(adata_all_harmony, min_dist=0.3, random_state=seed)
sc.tl.leiden(adata_all_harmony, resolution=1.3, random_state=seed)

# fraction=1 keeps every cell; presumably used to shuffle the plotting order -- confirm
sc.pp.subsample(adata_all_harmony, fraction=1, random_state=seed, copy=False)

sc.pl.umap(adata_all_harmony, color=['batch', 'status', 'log1p_n_genes_by_counts', 'leiden'], alpha=0.5, ncols=2)
# One panel per sample, coloured by its 'is_<sample>' flag column
sc.pl.umap(adata_all_harmony, color=[f'is_{prefix}' for prefix in prefixes], alpha=0.5, ncols=2)

In [None]:
# harmony on all four samples with sigma=0.5
adata_all_harmony = adata_all.copy()
sc.pp.pca(adata_all_harmony, random_state=seed, n_comps=30)
sce.pp.harmony_integrate(adata_all_harmony, key='batch', max_iter_harmony=50, plot_convergence=True, sigma=0.5)
# kNN graph on the corrected representation with 0.5 * sqrt(n_cells) neighbours
sc.pp.neighbors(adata_all_harmony, random_state=seed, n_neighbors=int(0.5 * len(adata_all_harmony) ** 0.5), metric='cosine', use_rep='X_pca_harmony')
tk.tl.triku(adata_all_harmony)

sc.tl.umap(adata_all_harmony, min_dist=0.3, random_state=seed)
sc.tl.leiden(adata_all_harmony, resolution=1.3, random_state=seed)

# fraction=1 keeps every cell; presumably used to shuffle the plotting order -- confirm
sc.pp.subsample(adata_all_harmony, fraction=1, random_state=seed, copy=False)
sc.pl.umap(adata_all_harmony, color=['batch', 'status', 'log1p_n_genes_by_counts', 'leiden'], alpha=0.5, ncols=2)
# One panel per sample, coloured by its 'is_<sample>' flag column
sc.pl.umap(adata_all_harmony, color=[f'is_{prefix}' for prefix in prefixes], alpha=0.5, ncols=2)

In [None]:
# harmony on all four samples with sigma=0.01
adata_all_harmony = adata_all.copy()
sc.pp.pca(adata_all_harmony, random_state=seed, n_comps=30)
sce.pp.harmony_integrate(adata_all_harmony, key='batch', max_iter_harmony=50, plot_convergence=True, sigma=0.01)
# kNN graph on the corrected representation with 0.5 * sqrt(n_cells) neighbours
sc.pp.neighbors(adata_all_harmony, random_state=seed, n_neighbors=int(0.5 * len(adata_all_harmony) ** 0.5), metric='cosine', use_rep='X_pca_harmony')
tk.tl.triku(adata_all_harmony)

sc.tl.umap(adata_all_harmony, min_dist=0.3, random_state=seed)
sc.tl.leiden(adata_all_harmony, resolution=1.3, random_state=seed)

# fraction=1 keeps every cell; presumably used to shuffle the plotting order -- confirm
sc.pp.subsample(adata_all_harmony, fraction=1, random_state=seed, copy=False)
sc.pl.umap(adata_all_harmony, color=['batch', 'status', 'log1p_n_genes_by_counts', 'leiden'], alpha=0.5, ncols=2)
# One panel per sample, coloured by its 'is_<sample>' flag column
sc.pl.umap(adata_all_harmony, color=[f'is_{prefix}' for prefix in prefixes], alpha=0.5, ncols=2)

In [None]:
# Fresh copy of the full dataset for the harmony run that is analysed below
adata_all_harmony = adata_all.copy()

In [None]:
# PCA, harmony correction over 'batch' (no sigma passed, so the library default applies),
# then a cosine kNN graph on 'X_pca_harmony' with 0.5 * sqrt(n_cells) neighbours
sc.pp.pca(adata_all_harmony, random_state=seed, n_comps=30)
sce.pp.harmony_integrate(adata_all_harmony, key='batch', max_iter_harmony=50, plot_convergence=True)
sc.pp.neighbors(adata_all_harmony, random_state=seed, n_neighbors=int(0.5 * len(adata_all_harmony) ** 0.5), metric='cosine', use_rep='X_pca_harmony')
tk.tl.triku(adata_all_harmony)

In [None]:
# Embedding and clustering (resolution 1.3) of the harmony-integrated full dataset
sc.tl.umap(adata_all_harmony, min_dist=0.2, random_state=seed)
sc.tl.leiden(adata_all_harmony, resolution=1.3, random_state=seed)

In [None]:
# fraction=1 keeps every cell; presumably used to shuffle the plotting order -- confirm
sc.pp.subsample(adata_all_harmony, fraction=1, random_state=seed, copy=False)
sc.pl.umap(adata_all_harmony, color=['batch', 'status', 'leiden'], alpha=0.5)
# Same embedding with the labels drawn on top of the groups
sc.pl.umap(adata_all_harmony, color=['status', 'leiden'], alpha=0.5, legend_loc='on data', ncols=2)

In [None]:
# Size-corrected share of every sample per Leiden cluster, plus a second panel grouped by status
plot_batch_abundance(adata_all_harmony, plot_status=True)

## Dataset analysis (whole-scanorama)

In [None]:
# Fresh copy of the full dataset for the scanorama run
adata_all_scanorama = adata_all.copy()

In [None]:
# Order the cells by batch so that each batch forms one contiguous block, as
# scanpy's scanorama_integrate documentation asks for. .copy() turns the
# reordered view into an independent AnnData before it is modified in place.
adata_all_scanorama = adata_all_scanorama[adata_all_scanorama.obs.sort_values(by='batch').index, :].copy()

In [None]:
# PCA, scanorama correction over 'batch' (alpha=2), then a cosine kNN graph on
# 'X_scanorama'; both use 0.5 * sqrt(n_cells) neighbours
sc.pp.pca(adata_all_scanorama, random_state=seed, n_comps=50)
sce.pp.scanorama_integrate(adata_all_scanorama, 'batch', knn=int(0.5 * len(adata_all_scanorama) ** 0.5), alpha=2)
sc.pp.neighbors(adata_all_scanorama, random_state=seed, n_neighbors=int(0.5 * len(adata_all_scanorama) ** 0.5), metric='cosine', use_rep='X_scanorama')
tk.tl.triku(adata_all_scanorama)

In [None]:
# Embedding and clustering (resolution 1.3) of the scanorama-integrated full dataset
sc.tl.umap(adata_all_scanorama, min_dist=0.5, random_state=seed)
sc.tl.leiden(adata_all_scanorama, resolution=1.3, random_state=seed)

In [None]:
# UMAP coloured by sample, status and Leiden cluster (cells are still ordered
# by batch here: no shuffling step precedes this plot)
sc.pl.umap(adata_all_scanorama, color=['batch', 'status', 'leiden'], alpha=0.5)

In [None]:
# Size-corrected share of every sample per Leiden cluster, plus a second panel grouped by status
plot_batch_abundance(adata_all_scanorama, plot_status=True)