In [None]:
import warnings

warnings.simplefilter(action='ignore', category=FutureWarning)


import datetime
import os
import re
import random

# Disable INFO and WARNING errors from tensorflow
# Most nodes do not have GPU's and this prevents warnings about that being printed.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

import anndata as ad
import collections as c
import pickle
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scanpy as sc
import seaborn as sns

from pathlib import Path
from matplotlib import rcParams

import stereo as st

%matplotlib inline
%config InlineBackend.figure_format = 'retina'

random.seed('cbd-bioinf')

In [None]:
# Global scanpy / matplotlib configuration for the whole notebook.
# NOTE(review): the cache directory is an absolute, site-specific path; adjust it when running elsewhere.
sc.settings.cachedir = '/staging/leuven/stg_00003/cbd-bioinf/cache/'
sc.settings.verbosity = 3
sc.logging.print_header()
sc.settings.set_figure_params(dpi=120, facecolor='white')
rcParams["figure.dpi"] = 120

# Base directory for project data, relative to the notebook's working directory.
projects_data_dir = Path('./')

In [None]:
# Define the wrapper class for single cell samples

def load_stereoseq_data(project, sample, species, bin_data, bin_sizes=[50]):
    """Load one Stereo-seq count file at several bin sizes.

    The reader is chosen from the file suffix: ``gef`` / ``gef.gz`` uses
    ``st.io.read_gef`` and ``gem`` / ``gem.gz`` uses ``st.io.read_gem``.

    Args:
        project: Project code stored on every returned sample.
        sample: Base sample name; each result is named ``{sample}_bin{bin_size}``.
        species: Species label stored on every returned sample.
        bin_data: Path (string) of the GEF/GEM file.
        bin_sizes: Bin sizes to load; the file is read once per size.

    Returns:
        dict mapping ``{sample}_bin{bin_size}`` to a ``SingleCellSample`` whose
        ``spatial`` flag is set to True.

    Raises:
        IOError: If the suffix of ``bin_data`` is not a GEF or GEM suffix.
    """
    if bin_data.endswith(('gef', 'gef.gz')):
        print('Detected GEF file')
        reader = st.io.read_gef
    elif bin_data.endswith(('gem', 'gem.gz')):
        print('Detected GEM file')
        reader = st.io.read_gem
    else:
        raise IOError('File type not recognised.')

    samples_by_name = {}
    for size in bin_sizes:
        name = f'{sample}_bin{size}'
        binned = reader(bin_data, bin_size=size, bin_type='bins')
        wrapped = SingleCellSample(
            project,
            name,
            species,
            st.io.stereo_to_anndata(binned, sample_id=sample))
        wrapped.spatial = True
        samples_by_name[name] = wrapped

    return samples_by_name
    

class SingleCellSample:
    '''
    Wrapper around one sample's AnnData object plus the processing state used in this notebook.

    Each method is typically a thin wrapper around one or multiple scanpy functions.

    Attributes set in ``__init__``:
        SCS_VER: class version copied onto the instance (compared in ``load``).
        sample, project, species, data_path: values passed to the constructor.
        adata: the AnnData object that all methods operate on.
        multiplexed, spatial: boolean flags, both False by default.
        pca_n: number of principal components used by ``reduce_dimensions`` (0 by default).
        filters: dict recording the thresholds applied by the filtering methods.

    Attributes set later:
        raw: copy of ``adata`` taken before normalisation (``normalize_and_log_transform``).
        hvgs: index of highly variable genes (``detect_high_variable_genes``).
    '''

    SCS_VER = '2.0.5'

    def __init__(self, project, sample, species, adata, data_path=None):
        # Stored on the instance so that it is part of the pickled state read back by ``load``.
        self.SCS_VER = SingleCellSample.SCS_VER
        self.sample = sample
        self.adata = adata
        self.project = project
        self.species = species
        self.data_path = data_path
        self.multiplexed = False
        self.spatial = False
        self.pca_n = 0
        self.filters = {}

    @classmethod
    def load(cls, filename):
        """ Load a pickled sample written by ``save``.

        Prints a warning when the stored ``SCS_VER`` differs from the class version.

        NOTE(security): ``pickle.load`` can execute arbitrary code; only load trusted files.
        """
        with open(filename, 'rb') as fh:
            data = pickle.load(fh)
        if data.SCS_VER != SingleCellSample.SCS_VER:
            print(f'WARNING: Loaded SCS_VER ({data.SCS_VER}) does not equal notebook version ({SingleCellSample.SCS_VER})')
        return data

    @classmethod
    def read_10x(cls, project, sample, species, data_path, **adata_kwargs):
        """ Create a sample from a 10x matrix directory.

        ``data_path`` is passed to ``sc.read_10x_mtx`` (gene symbols as variable names,
        caching enabled) and is also stored on the returned object. Extra keyword
        arguments are forwarded to ``sc.read_10x_mtx``.
        """
        # The original referenced an undefined name here; the directory to read is ``data_path``.
        adata = sc.read_10x_mtx(data_path, var_names='gene_symbols', cache=True, **adata_kwargs)
        return cls(project, sample, species, adata, data_path=data_path)

    def save(self, filename):
        """ Pickle this object to ``filename``; '.pickle' is appended unless the name ends with 'pickle'. """
        if not filename.endswith('pickle'):
            filename = f'{filename}.pickle'
        with open(filename, 'wb') as fh:
            pickle.dump(self, fh)

    def save_h5ad(self, filename_prefix, save_real_raw=False):
        """ Write ``self.adata`` to ``{filename_prefix}.h5ad``.

        A trailing '.h5ad' on ``filename_prefix`` is dropped first. With
        ``save_real_raw=True`` the pre-normalisation copy ``self.raw`` is also written to
        ``{filename_prefix}_raw.h5ad``.

        Raises:
            AttributeError: if ``save_real_raw`` is set but ``self.raw`` has not been created
                (it is created by ``normalize_and_log_transform``).
        """
        suffix = '.h5ad'
        if filename_prefix.endswith(suffix):
            # Strip only the trailing extension, not every occurrence inside the path.
            filename_prefix = filename_prefix[:-len(suffix)]

        raw = getattr(self, 'raw', None)
        if save_real_raw and raw is None:
            raise AttributeError('self.raw is not set; run normalize_and_log_transform() before save_real_raw=True')

        self.adata.write(f'{filename_prefix}.h5ad')
        if save_real_raw:
            raw.write(f'{filename_prefix}_raw.h5ad')

    def preprocess_adata(self):
        """ Preprocess data on basic level.

        Makes variable names unique, ensures a 'gene_ids' column exists in ``adata.var``
        (copied from the variable names when absent) and repairs gene names that were
        parsed as missing values.
        """
        self.adata.var_names_make_unique()

        if 'gene_ids' not in self.adata.var.keys():
            self.adata.var['gene_ids'] = self.adata.var_names

        # This is an issue in Drosophila genomes. The symbol for 'nanchung' is 'nan' and gets cast to NaN.
        # pd.isna is used instead of the np.NaN alias, which no longer exists in NumPy 2.
        missing = np.asarray(pd.isna(self.adata.var.index))
        if missing.any():
            self.adata.var.index = [
                'nan' if is_missing else name
                for name, is_missing in zip(self.adata.var.index, missing)
            ]
            # Boolean .loc assignment instead of chained positional assignment.
            self.adata.var.loc[missing, 'gene_ids'] = 'nan'

    def append_sample_names(self):
        """ Append sample name to cells for easier identification later.

        Each observation name becomes ``<text before first '-'>-<last two '__'-separated
        parts of self.sample>``.
        """
        sample_tag = '__'.join(self.sample.split('__')[-2:])
        self.adata.obs.index = ['-'.join([x.split('-')[0], sample_tag]) for x in self.adata.obs.index]

    def calc_qc(self, mito_identifier):
        """ Flag genes whose name starts with ``mito_identifier`` as 'mt' and compute scanpy QC metrics in place. """
        self.adata.var['mt'] = self.adata.var_names.str.startswith(mito_identifier)
        sc.pp.calculate_qc_metrics(self.adata, qc_vars=['mt'], percent_top=None, log1p=False, inplace=True)

    def plot_qc(self):
        """ Plot QC violins and two scatter plots (counts vs. mito percentage, counts vs. genes).

        Requires the columns produced by ``calc_qc``.
        """
        sc.pl.violin(self.adata, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'],
                     jitter=0.4, multi_panel=True)

        fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(16, 4))

        sc.pl.scatter(self.adata, x='total_counts', y='pct_counts_mt', ax=ax1, show=False)
        sc.pl.scatter(self.adata, x='total_counts', y='n_genes_by_counts', ax=ax2, show=False)
        plt.tight_layout()

    def filter_cells(self, min_genes = 1000, min_cells = 3, n_genes = 7000, percent_mito = 7.5, n_counts = 0, mito_identifier = 'MT-'):
        """ Perform basic filtering steps.

        Args:
            min_genes: passed to ``sc.pp.filter_cells``.
            min_cells: passed to ``sc.pp.filter_genes``.
            n_genes: keep observations with ``n_genes_by_counts`` strictly below this value.
            percent_mito: keep observations with ``pct_counts_mt`` strictly below this value;
                values below 1 are multiplied by 100 (with a printed warning).
            n_counts: when greater than 0, keep observations with ``total_counts`` strictly
                below this value.
            mito_identifier: prefix used by ``calc_qc`` if QC metrics are not yet available.

        The thresholds are recorded in ``self.filters`` and QC plots are drawn afterwards.
        """
        # The per-cell QC columns live in .obs ('mt' itself is a .var column), so test for the
        # column that is actually used below.
        if 'pct_counts_mt' not in self.adata.obs.columns:
            self.calc_qc(mito_identifier=mito_identifier)

        if percent_mito < 1:
            print(f'WARNING: Automatically adjusting percent_mito... {percent_mito} -> {percent_mito * 100}')
            percent_mito = percent_mito * 100

        self.filters['min_genes'] = min_genes
        self.filters['min_cells'] = min_cells
        self.filters['n_genes'] = n_genes
        self.filters['percent_mito'] = percent_mito

        sc.pp.filter_cells(self.adata, min_genes=min_genes)
        sc.pp.filter_genes(self.adata, min_cells=min_cells)

        obs = self.adata.obs
        keep = (obs['n_genes_by_counts'] < n_genes) & (obs['pct_counts_mt'] < percent_mito)
        if n_counts > 0:
            keep = keep & (obs['total_counts'] < n_counts)
            self.filters['n_counts'] = n_counts
        # One combined subset, copied so that later in-place steps do not operate on a view.
        self.adata = self.adata[keep.to_numpy(), :].copy()

        print(f"{self.sample} post-filter stats")
        self.plot_qc()

        sc.pl.highest_expr_genes(self.adata)

    def normalize_and_log_transform(self):
        """ Keep a copy of the current data in ``self.raw``, then normalise to 1e4 counts and log1p-transform.

        The transformed data is also stored as ``self.adata.raw``.
        """
        print('Storing raw counts in self.raw')
        self.raw = self.adata.copy()
        sc.pp.normalize_total(self.adata, target_sum=1e4)
        sc.pp.log1p(self.adata)
        self.adata.raw = self.adata

    def detect_high_variable_genes(self):
        """ Detect highly variable genes, plot them and restrict ``self.adata`` to those genes.

        The selected gene names are kept in ``self.hvgs``.
        """
        sc.pp.highly_variable_genes(self.adata, min_mean=0.0125, max_mean=3, min_disp=0.5)
        sc.pl.highly_variable_genes(self.adata, )
        self.hvgs = self.adata.var.index[self.adata.var['highly_variable']]
        # Copy so that the subsequent in-place regression/scaling works on real data, not a view.
        self.adata = self.adata[:, self.hvgs].copy()
        self.filters['highly_variable_genes'] = True

    def regress_and_scale(self, features = ['total_counts', 'pct_counts_mt'], n_jobs=None):
        """ Regress out the given observation columns and scale the data (values clipped at 10).

        Features missing from ``adata.obs`` are reported and skipped. The ``features``
        argument is never modified. If no feature is available the regression is skipped.
        """
        usable = []
        for feature in features:
            if feature in self.adata.obs.columns:
                usable.append(feature)
            else:
                print(f'Removing feature {feature} as it is not in observations')
        if usable:
            sc.pp.regress_out(self.adata, usable, n_jobs=n_jobs)
        else:
            print('No features left to regress out; skipping regression')
        sc.pp.scale(self.adata, max_value=10)

    def perform_pca(self, max_pcs = 150, colour = None):
        """ Run PCA with ``max_pcs`` components and plot the variance ratio and the PCA scatter.

        ``colour`` is passed as ``color`` to ``sc.pl.pca``.
        """
        sc.tl.pca(self.adata, svd_solver='arpack', n_comps=max_pcs)

        # Adjust the figure size to show 150 components correctly; always restore the old size.
        _old_figsize = rcParams['figure.figsize']
        rcParams['figure.figsize'] = (12.0, 4.0)
        try:
            sc.pl.pca_variance_ratio(self.adata, log=True, n_pcs=max_pcs)
        finally:
            rcParams['figure.figsize'] = _old_figsize

        sc.pl.pca(self.adata, color = colour)

    def reduce_dimensions(self, n_jobs=None, use_paga_init=False, use_harmony=True):
        """ Compute the neighbour graph, UMAP and t-SNE using ``self.pca_n`` components.

        When ``adata.obsm`` contains 'X_pca_harmony' and ``use_harmony`` is True, that
        representation is used (trimmed to ``self.pca_n`` columns when ``pca_n > 0``).
        With ``use_paga_init=True`` UMAP is initialised from PAGA positions.
        """
        random.seed('cbd-bioinf')
        if 'X_pca_harmony' in self.adata.obsm and use_harmony:
            print('Using Harmony corrected PCA results. Pass "use_harmony=False" to disable.')
            if self.pca_n > 0:
                print(f"Creating self.adata.obsm['X_pca_harmony_trimmed'] with {self.pca_n} components")
                self.adata.obsm['X_pca_harmony_trimmed'] = self.adata.obsm['X_pca_harmony'][:, :self.pca_n]
                rep_to_use = 'X_pca_harmony_trimmed'
            else:
                rep_to_use = 'X_pca_harmony'
        else:
            rep_to_use = None
        sc.pp.neighbors(self.adata, n_neighbors=15, n_pcs=self.pca_n, use_rep=rep_to_use)
        if use_paga_init:
            sc.tl.paga(self.adata)
            sc.pl.paga(self.adata, plot=False)
            sc.tl.umap(self.adata, init_pos='paga')
        else:
            sc.tl.umap(self.adata)
        sc.tl.tsne(self.adata, n_pcs=self.pca_n, n_jobs=n_jobs, use_rep=rep_to_use)

    def detect_clusters(self, res=None, method='leiden', calc_markers=True, diff_method='wilcoxon', n_genes=None):
        """ Cluster the data and optionally rank marker genes per cluster.

        Args:
            res: clustering resolution. The result is stored in ``adata.obs`` under
                ``{method}_res{res}``, or under ``method`` when ``res`` is None.
            method: 'leiden' or 'louvain'.
            calc_markers: when True, run ``sc.tl.rank_genes_groups`` on the new clustering and
                store it under ``rank_genes_groups_<clustering key>``.
            diff_method: test passed to ``sc.tl.rank_genes_groups``.
            n_genes: number of genes to rank; defaults to all genes in ``adata.raw``.

        Raises:
            ValueError: if ``method`` is neither 'leiden' nor 'louvain'.
        """
        if method == 'leiden':
            cluster_func = sc.tl.leiden
        elif method == 'louvain':
            cluster_func = sc.tl.louvain
        else:
            raise ValueError(f"Unknown clustering method '{method}'; expected 'leiden' or 'louvain'")

        # A None key cannot be used as an obs column, so fall back to the method name.
        key_added = f'{method}_res{res}' if res is not None else method
        cluster_func(self.adata, resolution = res, key_added=key_added)

        if calc_markers:
            sc.tl.rank_genes_groups(self.adata, key_added, method=diff_method,
                                    n_genes=self.adata.raw.shape[1] if n_genes is None else n_genes,
                                    key_added=f'rank_genes_groups_{key_added}')

In [None]:
# Project code stored on every SingleCellSample created below.
# This should match a pre-determined project code from https://docs.google.com/spreadsheets/d/1ctn8ULH3T6YHzGdxXDVAO6YG6E2qQN-25tJ7z4zyVgk
project = 'NovaST'

In [None]:
# Species label stored on every SingleCellSample created below.
species = "Mus musculus"

In [None]:
# All samples of this notebook, keyed by '<sample>_bin<bin size>'.
adatas = {}


# Read the same GEF file at three bin sizes; the path is relative to the working directory.
nst_datas = load_stereoseq_data(
    project,
    'StereoSeq_MouseBrain',
    species,
    f"StereoSeq_20231121/00.RAW/SS200000366BL_E4/Downsampled/SS200000366BL_E4_Saw_Output_v7/02.count/SS200000366BL_E4.raw.gef",
    bin_sizes=[
        50,
        100,
        200
    ],  # one SingleCellSample is returned per bin size
)
for k, v in nst_datas.items():
    adatas[k] = v

In [None]:
# Basic clean-up of gene names, then tag every observation name with its sample name.
for sample, data in adatas.items():
    data.preprocess_adata()
    # NOTE(review): preprocess_adata() already calls var_names_make_unique(); this repeat looks redundant.
    data.adata.var_names_make_unique()
    data.append_sample_names()

In [None]:
# Gene-name prefix that marks mitochondrial genes (matched with str.startswith in calc_qc).
# Human: 'MT-' | Mouse: 'mt-' | Drosophila: 'mt:'
mito_identifier = 'mt-'

In [None]:
# Compute QC metrics for every bin size and show the QC plots.
for sample, data in adatas.items():
    data.calc_qc(mito_identifier=mito_identifier)
    data.plot_qc()

In [None]:
# Spatial map of total counts per bin, one figure per bin size.
for sample, data in adatas.items():
    sc.pl.embedding(data.adata, color='total_counts', basis='spatial')

In [None]:
# For each sample, plot the summed total counts of all bins sharing a coordinate:
# left panel along the first spatial axis, right panel along the second.
for sample, data in adatas.items():
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 4))
    total_counts = pd.Series(data.adata.obs['total_counts'].to_numpy())
    spatial = np.asarray(data.adata.obsm['spatial'])

    for axis, ax in enumerate((ax1, ax2)):
        # Sum (not average) of counts per distinct coordinate value; a single groupby
        # replaces rescanning the full list once per coordinate.
        counts_per_coord = total_counts.groupby(spatial[:, axis]).sum()
        ax.scatter(counts_per_coord.index, counts_per_coord.values)

    # title
    fig.suptitle(sample)
    plt.show()

In [None]:
# Keep only bins strictly inside the rectangle 7000 < first coordinate < 21000 and
# 2000 < second coordinate < 23000. The sample's adata is replaced by the subset.
for sample, data in adatas.items():
    first_coord = data.adata.obsm['spatial'][:, 0]
    second_coord = data.adata.obsm['spatial'][:, 1]
    inside_first = np.logical_and(first_coord > 7000, first_coord < 21000)
    inside_second = np.logical_and(second_coord > 2000, second_coord < 23000)
    data.adata = data.adata[np.logical_and(inside_first, inside_second)]

In [None]:
# Spatial maps of total counts and detected genes for the current (cropped) data.
for sample, data in adatas.items():
    sc.pl.embedding(data.adata, color=['total_counts', 'n_genes_by_counts'], basis='spatial')

In [None]:
# Per sample: median total counts and median number of detected genes per bin.
for sample, data in adatas.items():
    print(sample)
    print(data.adata.obs['total_counts'].median(), data.adata.obs['n_genes_by_counts'].median())

In [None]:
# QC violins without jittered points, one multi-panel figure per sample.
for sample, data in adatas.items():
    sc.pl.violin(data.adata, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'], jitter=False, multi_panel=True)

In [None]:
# Minimum number of detected genes per bin, keyed by the 'bin<size>' suffix of the sample name.
# Passed as min_genes to filter_cells in the next cell.
gene_thresholds = {
    'bin50': 200,
    'bin100': 750,
    'bin200': 2250,
    }

In [None]:
# Filter every sample with a bin-size specific min_genes threshold.
# filter_cells keeps bins strictly below n_counts, n_genes and percent_mito.
for sample, data in adatas.items():
    # The last '_'-separated part of the sample name is the 'bin<size>' key of gene_thresholds.
    bin_size = sample.split('_')[-1]
    data.filter_cells(n_counts = 1000000, n_genes=25000, percent_mito=100, min_genes=gene_thresholds[bin_size], mito_identifier=mito_identifier)

In [None]:
# Medians of total counts and detected genes after filtering (same order as adatas).
for sample, data in adatas.items():
    print(data.adata.obs['total_counts'].median(), data.adata.obs['n_genes_by_counts'].median())

In [None]:
# Long-format tables of per-bin UMI counts and gene counts, one pair per sample.
# Columns: 'bin' (categorical label 'Bin <size>') and 'count' (integer).
umi_dfs = {}
gene_dfs = {}

for sample in ['StereoSeq_MouseBrain']:
    umi_parts = []
    gene_parts = []
    for bin_size in [50, 100, 200]:
        obs = adatas[f'{sample}_bin{bin_size}'].adata.obs
        label = f'Bin {bin_size}'
        umi_parts.append(pd.DataFrame({
            'bin': [label] * obs.shape[0],
            'count': obs['total_counts'].values
        }))
        gene_parts.append(pd.DataFrame({
            'bin': [label] * obs.shape[0],
            'count': obs['n_genes_by_counts'].values
        }))

    # DataFrame.append no longer exists in pandas 2; concatenate once and cast once.
    umi_df = pd.concat(umi_parts, ignore_index=True)
    gene_df = pd.concat(gene_parts, ignore_index=True)
    umi_df['bin'] = umi_df['bin'].astype('category')
    gene_df['bin'] = gene_df['bin'].astype('category')
    umi_df['count'] = umi_df['count'].astype('int')
    gene_df['count'] = gene_df['count'].astype('int')
    umi_dfs[sample] = umi_df
    gene_dfs[sample] = gene_df


In [None]:
# One figure per sample with three panels (bin 50 / 100 / 200). Each panel overlays two
# violin plots on twin y-axes: gene counts (left axis) and UMI counts (right axis).
# The figure is saved as '<sample>_StdScale.pdf' in the working directory.
for sample in ['StereoSeq_MouseBrain']:
    sns.set_style('white')
    # df1/df2 are the stored frames themselves, so the 'metric' column is added to them in place.
    df1 = gene_dfs[sample]
    df2 = umi_dfs[sample]
    df1['metric'] = 1
    df2['metric'] = 2
    fig, axs = plt.subplots(1, 3, figsize=(15, 5))

    df1['bin'] = df1['bin'].astype('category')
    df2['bin'] = df2['bin'].astype('category')

    for i, bin_size in enumerate([50, 100, 200]):

        df_g = df1[df1['bin'] == f'Bin {bin_size}']
        df_u = df2[df2['bin'] == f'Bin {bin_size}']

        # A single NaN row carrying the other 'metric' value is appended to each frame.
        # NOTE(review): presumably this gives `hue` two levels so that split=True draws
        # each metric as one half of the violin - confirm against the seaborn version in use.
        df_g_dummy = pd.DataFrame({'bin': pd.Categorical.from_codes([0], [f'Bin {bin_size}']), 'count': [np.nan]})
        df_u_dummy = pd.DataFrame({'bin': pd.Categorical.from_codes([0], [f'Bin {bin_size}']), 'count': [np.nan]})
        df_g_dummy['metric'] = 2
        df_u_dummy['metric'] = 1

        ax = axs[i]
        sns.violinplot(x='bin', 
                       y='count', 
                       hue='metric', 
                       split=True, 
                       data=pd.concat([df_g, df_g_dummy]), 
                       ax=ax
                       )
        # Only the legend of the left axis is removed; the twin axis keeps its legend.
        ax.legend_.remove()
        ax.set_ylabel('Gene count')
        
        # Second y-axis sharing the same x-axis, used for the UMI counts.
        ax2 = ax.twinx()
        sns.violinplot(x='bin', 
                       y='count', 
                       hue='metric', 
                       split=True, 
                       data=pd.concat([df_u, df_u_dummy]), 
                       ax=ax2
                       )

        ax2.set_ylabel('UMI count')
        sns.despine(top=True, right=False, left=False, bottom=True, ax=ax)
        sns.despine(top=True, right=False, left=False, bottom=True, ax=ax2)

    plt.tight_layout()
    plt.savefig(f'{sample}_StdScale.pdf')


In [None]:
# Spatial maps of detected genes, one panel per sample (assumes exactly three samples in adatas).
# The marker size grows with the panel index: 13 * 3, 13 * 6, 13 * 9.
fig, axs = plt.subplots(1, 3, figsize=(36, 12))
for n, (sample, data) in enumerate(adatas.items()):
    sc.pl.embedding(data.adata, color='n_genes_by_counts', basis='spatial', ax=axs[n], s=13 * ((n+1)*3), show=False)
plt.show()

In [None]:
# Store the unnormalised data in data.raw, then normalise and log-transform in place.
# Not idempotent: re-running this cell would normalise already-normalised data.
for sample, data in adatas.items():
    data.normalize_and_log_transform()

In [None]:
# Restrict each sample to its highly variable genes, then regress out the default
# features ('total_counts', 'pct_counts_mt') and scale, using 24 jobs for the regression.
for sample, data in adatas.items():
    data.detect_high_variable_genes()
    data.regress_and_scale(n_jobs=24)

In [None]:
# PCA with the default 150 components plus the variance-ratio and PCA plots.
for sample, data in adatas.items():
    data.perform_pca()

In [None]:
# Number of principal components used by reduce_dimensions for each bin size.
# NOTE(review): presumably chosen by eye from the variance-ratio plots above - confirm.
adatas['StereoSeq_MouseBrain_bin50'].pca_n = 22
adatas['StereoSeq_MouseBrain_bin100'].pca_n = 27
adatas['StereoSeq_MouseBrain_bin200'].pca_n = 35

In [None]:
# Neighbour graph, UMAP and t-SNE for each sample (t-SNE with 24 jobs).
for sample, data in adatas.items():
    data.reduce_dimensions(n_jobs=24)

In [None]:
# Leiden clustering at five resolutions; results are stored in obs as 'leiden_res<res>'
# and marker genes are ranked for each clustering (detect_clusters defaults).
for sample, data in adatas.items():
    for res in [0.8, 1.0, 2.0, 4.0, 8.0]:
        data.detect_clusters(res=res)

In [None]:
# UMAP coloured by the QC metrics and the resolution-1.0 Leiden clusters.
for sample, data in adatas.items():
    print(sample)
    sc.pl.umap(data.adata, color=['total_counts', 'n_genes_by_counts', 'pct_counts_mt', 'leiden_res1.0'], wspace=0.35, ncols=4)

In [None]:
# Same four colourings on the spatial coordinates (before the rotation applied below).
for sample, data in adatas.items():
    print(sample)
    sc.pl.embedding(data.adata, basis='spatial', color=['total_counts', 'n_genes_by_counts', 'pct_counts_mt', 'leiden_res1.0'], wspace=0.35, ncols=4)

In [None]:
# Rotate the spatial coordinates in two steps: negate the first column, then swap the columns.
# Not idempotent: every re-run of this cell rotates the coordinates by a further 90 degrees.
for sample, data in adatas.items():
    # rotate spatial 90 degrees clockwise, x, y -> y, -x
    data.adata.obsm['spatial'][:, 0] = -data.adata.obsm['spatial'][:, 0]
    data.adata.obsm['spatial'] = np.flip(data.adata.obsm['spatial'], axis=1)


In [None]:
# Spatial overview again, now with the rotated coordinates.
for sample, data in adatas.items():
    print(sample)
    sc.pl.embedding(data.adata, basis='spatial', color=['total_counts', 'n_genes_by_counts', 'pct_counts_mt', 'leiden_res1.0'], wspace=0.35, ncols=4)

In [None]:
# Spatial maps of the Leiden clusterings at resolutions 1.0, 2.0 and 4.0.
for sample, data in adatas.items():
    print(sample)
    sc.pl.embedding(data.adata, basis='spatial', color=['leiden_res1.0', 'leiden_res2.0', 'leiden_res4.0'], wspace=0.35, ncols=4)

In [None]:
import seaborn as sns

In [None]:
# Empty long-format table; a later cell adds the gene and UMI counts of every sample.
df = pd.DataFrame(columns=['Count', 'Type', 'Bin Size'])

In [None]:
# Per sample: gene-count and total-count violins side by side.
# The y-limits scale with the sample's position in adatas (4000 / 40000 times n + 1).
for n, (sample, data) in enumerate(adatas.items()):
    fig, axs = plt.subplots(1, 2, figsize=(12, 6))
    sc.pl.violin(data.adata, keys=['n_genes_by_counts'], jitter=False, show=False, ax=axs[0])
    sc.pl.violin(data.adata, keys=['total_counts'], jitter=False, show=False, ax=axs[1])
    fig.suptitle(sample)
    axs[0].set_ylim(0, 4000 * (n+1))
    axs[1].set_ylim(0, 40000 * (n+1))
    plt.tight_layout()



In [None]:
# Add the gene counts ('Genes') and UMI counts ('UMIs') of every sample to the long-format
# table `df`, labelled with the 'bin<size>' suffix of the sample name.
# pd.concat is used because DataFrame.append no longer exists in pandas 2.
# Not idempotent: re-running this cell adds the same rows again.
for sample, data in adatas.items():
    bin_size = sample.split('_')[-1]
    df = pd.concat([
        df,
        pd.DataFrame(data={'Count': data.adata.obs['n_genes_by_counts'], 'Type': 'Genes', 'Bin Size': bin_size}),
        pd.DataFrame(data={'Count': data.adata.obs['total_counts'], 'Type': 'UMIs', 'Bin Size': bin_size}),
    ])


In [None]:
# Store the counts as integers.
df['Count'] = df['Count'].astype(int)

In [None]:
# Sort the table by metric type, modifying `df` in place.
df.sort_values(by=['Type'], inplace=True)

In [None]:
# Gene counts per bin size, with quartile lines inside the violins.
sns.violinplot(data=df[df['Type'] == 'Genes'], x="Bin Size", y="Count", inner="quart")
plt.title('Genes')

In [None]:
# UMI counts per bin size, with quartile lines inside the violins.
sns.violinplot(data=df[df['Type'] == 'UMIs'], x="Bin Size", y="Count", inner="quart")
plt.title('UMIs')

In [None]:
import json
import loompy as lp
import datetime


def dfToNamedMatrix(df):
    """Convert a DataFrame into a one-dimensional NumPy structured array.

    Every row becomes one record; the fields are named after the columns and use
    the dtype of the corresponding column.
    """
    field_spec = [(column, column_dtype) for column, column_dtype in df.dtypes.items()]
    records = list(map(tuple, df.values))
    return np.array(records, dtype=np.dtype(field_spec))

In [None]:
# Directory the loom files are written to. NOTE(review): nothing in the visible cells creates it.
outputs_dir = './outputs/'

In [None]:
# Export every sample to a loom file '<outputs_dir>/<YYYYMMDD>_<sample>.loom'.
# Per sample this builds: column attributes (cell ids, QC metrics, embeddings, clusterings),
# row attributes (gene names, per-clustering marker tables) and a JSON 'MetaData' file
# attribute describing embeddings, clusterings, metrics and annotations.
for sample, data in adatas.items():
    
    # Spatial samples use the spatial coordinates as main embedding (id -1) and add
    # UMAP, t-SNE and PC1/PC2 as embeddings 1-3; other samples use UMAP as main embedding.
    if data.spatial:
        mainEmbedding = pd.DataFrame(data.adata.obsm['spatial'], columns=['_X', '_Y'])

        Embeddings_X = pd.DataFrame()
        Embeddings_Y = pd.DataFrame()
        
        Embeddings_X["1"] = pd.DataFrame(data.adata.obsm['X_umap'])[0]
        Embeddings_Y["1"] = pd.DataFrame(data.adata.obsm['X_umap'])[1]
        
        Embeddings_X["2"] = pd.DataFrame(data.adata.obsm['X_tsne'])[0]
        Embeddings_Y["2"] = pd.DataFrame(data.adata.obsm['X_tsne'])[1]

        Embeddings_X["3"] = pd.DataFrame(data.adata.obsm['X_pca'])[0]
        Embeddings_Y["3"] = pd.DataFrame(data.adata.obsm['X_pca'])[1]

        metaJson = {}

        metaJson['embeddings'] = [
            {
                "id": -1,
                "name": "Spatial"
            },
            {
                "id": 1,
                "name": f"Scanpy UMAP {data.pca_n}PC"
            },
            {
                "id": 2,
                "name": f"Scanpy t-SNE {data.pca_n}PC"
            },
            {
                "id": 3,
                "name": "Scanpy PC1/PC2"
            }
        ]
    else:
        mainEmbedding = pd.DataFrame(data.adata.obsm['X_umap'], columns=['_X', '_Y'])

        Embeddings_X = pd.DataFrame()
        Embeddings_Y = pd.DataFrame()

        Embeddings_X["1"] = pd.DataFrame(data.adata.obsm['X_tsne'])[0]
        Embeddings_Y["1"] = pd.DataFrame(data.adata.obsm['X_tsne'])[1]

        Embeddings_X["2"] = pd.DataFrame(data.adata.obsm['X_pca'])[0]
        Embeddings_Y["2"] = pd.DataFrame(data.adata.obsm['X_pca'])[1]

        metaJson = {}

        metaJson['embeddings'] = [
            {
                "id": -1,
                "name": f"Scanpy UMAP {data.pca_n}PC"
            },
            {
                "id": 1,
                "name": f"Scanpy t-SNE {data.pca_n}PC"
            },
            {
                "id": 2,
                "name": "Scanpy PC1/PC2"
            }
        ]
    
    
    # Per-cell (column) attributes of the loom file.
    col_attrs = {"CellID": np.array(data.adata.obs.index),
                 "nUMI": np.array(data.adata.obs['total_counts'].values),
                 "nGene": np.array(data.adata.obs['n_genes_by_counts'].values),
                 "Percent_mito": np.array(data.adata.obs['pct_counts_mt'].values),
                 "Embedding": dfToNamedMatrix(mainEmbedding),
                 "Embeddings_X": dfToNamedMatrix(Embeddings_X),
                 "Embeddings_Y": dfToNamedMatrix(Embeddings_Y)
                }
    
    # NOTE(review): 'doublet_score' is not computed in any visible cell; this branch is only
    # reached for non-spatial samples.
    if not data.spatial:
        col_attrs["Doublet_score"] = np.array(data.adata.obs['doublet_score'].values)
    
    # Genes come from data.raw (the pre-normalisation copy), not from the HVG-restricted adata.
    row_attrs = {"Gene": np.array(data.raw.var.index)}
    
    metaJson["clusterings"] = []
    clusterings = pd.DataFrame()
    
    # Every obs column whose name contains 'leiden' or 'louvain' is exported as a clustering.
    clusterings_to_process = []    
    for key in data.adata.obs.columns:
        if 'leiden' in key or 'louvain' in key:
            clusterings_to_process.append(key)
            
        

#     for n, res in enumerate([1.0, 2.0]):
    for n, key in enumerate(clusterings_to_process):
        # NOTE(review): 'ClusterID' is taken from the second clustering found (n == 1) and is
        # never set when fewer than two clusterings exist - confirm this is intended.
        if n == 1:
            col_attrs["ClusterID"] = np.array(data.adata.obs[key].values)

        # Gene x cluster tables (columns '0' .. max cluster id), initialised to 0 below.
        ClusterMarkers = pd.DataFrame(index=data.raw.var.index, columns=[str(x) for x in range(max(set([int(x) for x in data.adata.obs[key]])) + 1)])
        ClusterMarkers_avg_logFC = pd.DataFrame(index=data.raw.var.index, columns=[str(x) for x in range(max(set([int(x) for x in data.adata.obs[key]])) + 1)])
        ClusterMarkers_pval = pd.DataFrame(index=data.raw.var.index, columns=[str(x) for x in range(max(set([int(x) for x in data.adata.obs[key]])) + 1)])

        ClusterMarkers.fillna(0, inplace=True)
        ClusterMarkers_avg_logFC.fillna(0, inplace=True)
        ClusterMarkers_pval.fillna(0, inplace=True)

        for i in range(max(set([int(x) for x in data.adata.obs[key]])) + 1):
            i = str(i)
            tot_genes = len(data.adata.uns[f'rank_genes_groups_{key}']['pvals_adj'][i])
            # A gene is a marker when adjusted p < 0.05 and the log fold change is finite
            # with absolute value >= 1.5.
            sigGenes = data.adata.uns[f'rank_genes_groups_{key}']['pvals_adj'][i] < 0.05
            deGenes = np.logical_and(np.logical_or(data.adata.uns[f'rank_genes_groups_{key}']['logfoldchanges'][i] >= 1.5, data.adata.uns[f'rank_genes_groups_{key}']['logfoldchanges'][i] <= -1.5), np.isfinite(data.adata.uns[f'rank_genes_groups_{key}']['logfoldchanges'][i]))
            sigAndDE = np.logical_and(sigGenes, deGenes)

            names = data.adata.uns[f'rank_genes_groups_{key}']['names'][i][sigAndDE]
            ClusterMarkers.loc[names, i] = 1
            ClusterMarkers_avg_logFC.loc[names, i] = np.around(data.adata.uns[f'rank_genes_groups_{key}']['logfoldchanges'][i][sigAndDE], decimals=6)
            ClusterMarkers_pval.loc[names, i] = np.around(data.adata.uns[f'rank_genes_groups_{key}']['pvals_adj'][i][sigAndDE], decimals=6)
            
        row_attrs[f"ClusterMarkers_{n}"] = dfToNamedMatrix(ClusterMarkers)
        row_attrs[f"ClusterMarkers_{n}_avg_logFC"] = dfToNamedMatrix(ClusterMarkers_avg_logFC)
        row_attrs[f"ClusterMarkers_{n}_pval"] = dfToNamedMatrix(ClusterMarkers_pval)
        cluster_meta = {
                "id": n,
                "group": "Scanpy",
                "name": f"Scanpy {key}",
                "clusters": [],
                "clusterMarkerMetrics": [{
                    "accessor": "avg_logFC", 
                    "name": "Avg. logFC",
                    "description": "Average log fold change from Wilcox test"
                }, {
                    "accessor": "pval", 
                    "name": "Adj. P-Value",
                    "description": "Adj. P-Value from Wilcox test"
                }
                ]
            }
        metaJson["clusterings"].append(cluster_meta)
        
        # One metadata entry per cluster id; the description is numbered from 1.
        for i in range(max(set([int(x) for x in data.adata.obs[key]])) + 1):
            clustDict = {}
            clustDict['id'] = i
            clustDict['description'] = f"Unannotated Cluster {str(i+1)}"
            metaJson['clusterings'][n]['clusters'].append(clustDict)

        clusterings[str(n)] = data.adata.obs[key].values.astype(np.int64)
    col_attrs['Clusterings'] = dfToNamedMatrix(clusterings)

    metaJson["metrics"] = [
            {
                "name": "nUMI"
            }, {
                "name": "nGene"
            }, {
                "name": "Percent_mito"
            }
    ]
    
    if not data.spatial:
        metaJson["metrics"].append(
            {
                "name": "Doublet_score"
            })

    metaJson["annotations"] = []
    
    # The list of annotation keys is empty, so this loop currently exports no annotations.
    for key in []:
        if key in data.adata.obs.keys():
            col_attrs[key] = np.array([str(x) for x in data.adata.obs[key].values])
            metaJson["annotations"].append(
            {
                "name": key,
                "values": [str(x) for x in sorted(list(set(data.adata.obs[key].values)))]
            }
            )
        else:
            print(f'{key} not found in {sample}!')
    
    
    attrs = {"title": sample,
             "MetaData": json.dumps(metaJson),
             "Genome": '',
             "SCopeTreeL1": "Deep_Seq",
             "SCopeTreeL2": "",
             "SCopeTreeL3": ""
            }

        
    # NOTE(review): 'MetaData' was already set to the same value when attrs was built.
    attrs['MetaData'] = json.dumps(metaJson)
    # The expression matrix written is the transposed, densified data.raw.X (genes x cells).
    lp.create(filename=f'{outputs_dir}/{datetime.date.today().strftime("%Y%m%d")}_{sample}.loom', layers=data.raw.X.T.todense(), row_attrs=row_attrs, col_attrs=col_attrs, file_attrs=attrs)

In [None]:
# 2 x 3 grid of gene-count violins for samples whose name ends with 'bin2912', titled with
# the sample name and its total UMI count in millions.
# NOTE(review): the samples created in this notebook end in bin50 / bin100 / bin200, so no
# sample matches and the grid stays empty - confirm the intended suffix.
fig, axs = plt.subplots(2, 3, figsize=(15, 10))
ax_n = 0
for sample, data in adatas.items():
    if not sample.endswith("bin2912"):
        continue
    # Panels are filled row by row.
    ax = axs[ax_n // 3, ax_n % 3]
    ax.set_title(f'{sample}\n{int(data.adata.obs["total_counts"].sum()) / 1000000:.2f}M UMIs')
    sc.pl.violin(data.adata, "n_genes_by_counts", ax=ax, show=False)
    ax_n += 1
plt.tight_layout()
plt.show()