In [None]:
import numpy as np
import anndata
import pandas as pd
import scanpy as sc
import matplotlib.pyplot as plt
import sys  
import ast
import loompy as lp # to install loompy and scikit-misc (pip)

sys.path.insert(1, '../../backend/')
import dataset

In [None]:
# Base folder and file names for the loom -> h5ad conversion.
baseFolder = "./"
filename = "r_fca_biohub_all_wo_blood_10x.loom"
h5ad_filename = "Fly_Atlas.h5ad"

# Reading Loom File

In [None]:
# Open the loom file read-only. The handle is intentionally left open:
# later cells still read loom.ca / loom.ra.
loom = lp.connect(filename, mode='r', validate=False)

# Build a frame labelled by gene (rows) and cell id (columns), then transpose
# so that rows are cells and columns are genes.
ex_mtx = pd.DataFrame(loom[:, :], index=loom.ra.Gene, columns=loom.ca.CellID).T

# Snapshot the column, row and global attributes into plain dicts.
col_attrs = dict(loom.ca.items())
row_attrs = dict(loom.ra.items())
global_attrs = dict(loom.attrs.items())

In [None]:
# Names of the file-level (global) loom attributes.
list(global_attrs)

In [None]:
# Names of the per-cell (column) loom attributes.
list(col_attrs)

In [None]:
# ast.literal_eval only accepts a string; guard on the type so that re-running
# this cell after MetaData has already been parsed does not raise.
if isinstance(global_attrs["MetaData"], str):
    global_attrs["MetaData"] = ast.literal_eval(global_attrs["MetaData"])

In [None]:
# Show the keys available in global_attrs["MetaData"].
global_attrs["MetaData"].keys()

## Create AnnData Object

In [None]:
# One row per cell, indexed by the loom CellID attribute.
obs = pd.DataFrame.from_dict(col_attrs).set_index("CellID")

In [None]:
# Per-cell annotation columns to keep in the AnnData `obs` table.
obs_cols = [
    "S_annotation",
    "S_annotation_broad",
    "S_annotation_broad_extrapolated",
    "age",
    "annotation",
    "annotation_broad",
    "batch",
    "dissection_lab",
    "fly_genetics",
    "leiden",
    "n_counts",
    "n_genes",
    "note",
    "percent_mito",
    "scrublet__doublet_scores",
    "scrublet__predicted_doublets",
    "scrublet__predicted_doublets_based_on_10x_chromium_spec",
    "sex",
    "tissue",
]

In [None]:
# Keep only the selected annotation columns. Take an explicit copy so that
# later column assignments on `obs` do not trigger SettingWithCopyWarning.
obs = obs[obs_cols].copy()

In [None]:
# Convert every object-typed column except the free-text 'note' column to a
# categorical. The dtype is checked with is_object_dtype because
# `dtype.str` is '|O' for object columns and never equals 'object'.
categorical_cols = [
    col for col in obs.columns
    if col != 'note' and pd.api.types.is_object_dtype(obs[col])
]
# astype returns a new frame, so no in-place assignment on a slice is needed.
obs = obs.astype({col: 'category' for col in categorical_cols})

In [None]:
# Wrap the expression matrix in an AnnData object and label both axes
# from the loom attributes.
adata = anndata.AnnData(np.asarray(ex_mtx))
adata.obs_names = loom.ca.CellID
adata.var_names = loom.ra.Gene

# Copy the selected annotation columns into adata.obs one by one.
for col, values in obs.items():
    adata.obs[col] = values

In [None]:
# Flag the top 2000 highly variable genes with the "seurat_v3" flavor, following
# https://github.com/vib-singlecell-nf/vsn-pipelines/blob/65056919560a4c82ff560c9499c199d138c8b6c7/src/scanpy/bin/feature_selection/sc_find_variable_genes.py#L29
sc.pp.highly_variable_genes(adata, flavor="seurat_v3", n_top_genes=2000)

In [None]:
# Save the AnnData object (all genes, before PCA) as a gzip-compressed h5ad file.
adata.write_h5ad("Fly_Atlas_full.h5ad", compression="gzip")

## PCA

In [None]:
# Run a 200-component PCA on the highly variable genes. With an AnnData input
# and the default copy=False, sc.pp.pca returns None and stores its results on
# `adata` (later cells read adata.uns['pca'] and the "X_pca" key), so the
# return value is not bound to a name.
sc.pp.pca(adata, n_comps=200, zero_center=True, use_highly_variable=True)

In [None]:
# Shape of the per-component variance array stored by the PCA step.
np.shape(adata.uns['pca']['variance'])

In [None]:
# Cumulative explained-variance ratio per principal component. The x range is
# derived from the data so it always matches the number of components computed.
cumulative_variance = np.cumsum(adata.uns['pca']['variance_ratio'])
plt.scatter(x=np.arange(len(cumulative_variance)), y=cumulative_variance)
plt.xlabel("Principal component index")
plt.ylabel("Cumulative explained variance ratio")
plt.show()

## Extracting Embeddings

In [None]:
# Build the project Dataset from a copy of adata restricted to the highly
# variable genes, using the "X_pca" key as high-dimensional data.
trace_data = dataset.Dataset(
    adata=adata[:, adata.var["highly_variable"]].copy(),
    name="Fly Atlas",
    hd_data_key="X_pca",
    hd_metric="euclidean",
    verbose=True,
)

In [None]:
# from https://github.com/vib-singlecell-nf/vsn-pipelines/blob/master/src/scenic/bin/export_to_loom.py#L235
def get_embedding_by_id(col_attrs, embedding_id):
    """Return the 2-D coordinates of one embedding as a float32 array.

    Parameters
    ----------
    col_attrs : mapping
        Loom column attributes. Must provide 'Embedding' (used for id -1) and
        'Embeddings_X' / 'Embeddings_Y', each indexable by the embedding id
        given as a string.
    embedding_id : int or str
        Embedding identifier; -1 selects the default 'Embedding' attribute.

    Returns
    -------
    numpy.ndarray
        Array of shape (n_cells, 2) and dtype float32 (shape (0, 2) when
        there are no cells).
    """
    key = str(embedding_id)
    if key == '-1':
        # Rows are indexed positionally one by one so that record-style rows
        # (e.g. a structured array) work as well as plain sequences.
        coords = np.asarray(
            [[row[0], row[1]] for row in col_attrs['Embedding']],
            dtype=np.float32,
        )
        return coords.reshape(-1, 2)
    x = col_attrs['Embeddings_X'][key]
    y = col_attrs['Embeddings_Y'][key]
    # Stack once and convert in bulk instead of looping over rows in Python;
    # only the first two columns are kept, as before.
    return np.column_stack((x, y))[:, :2].astype(np.float32)

In [None]:
# Register every embedding listed in the loom MetaData under the "Original"
# category. Local names avoid shadowing the builtin `id`.
for emb in global_attrs["MetaData"]["embeddings"]:
    emb_id = emb['id']
    # '/' in a key breaks reading the h5ad back, so replace it with a space.
    emb_name = emb['name'].replace("/", " ")
    trace_data.add_embedding(embedding=get_embedding_by_id(col_attrs, emb_id),
                             name=emb_name,
                             category="Original")


In [None]:
# reading an h5ad anndata object with '/' as key for obsm/var/obs throws an error,
# so the PCA embedding is stored under "HVG PCA" instead.
def _rename_entry(mapping, new_key, old_keys):
    """Move the first key of `old_keys` present in `mapping` to `new_key`.

    Does nothing when no old key is present but `new_key` already exists, so
    the cell can be re-run. Raises KeyError when neither an old key nor
    `new_key` exists.
    """
    for old_key in old_keys:
        if old_key in mapping:
            mapping[new_key] = mapping[old_key]
            del mapping[old_key]
            return
    if new_key not in mapping:
        raise KeyError(f"none of {old_keys!r} or {new_key!r} found")


# The embedding loop replaces "/" with " " in names before adding them, so the
# entry may be stored under either spelling; accept both.
# NOTE(review): assumes add_embedding stores entries under the given name in
# both obsm and uns - confirm against dataset.Dataset.
_PCA_OLD_KEYS = ("HVG PC1/PC2", "HVG PC1 PC2")
_rename_entry(trace_data.adata.obsm, "HVG PCA", _PCA_OLD_KEYS)
_rename_entry(trace_data.adata.uns, "HVG PCA", _PCA_OLD_KEYS)

trace_data.adata.uns['methods'] = {'Original': ['HVG t-SNE', 'HVG UMAP', 'HVG PCA']}

In [None]:
# Compute the quality measures on the dataset, then print them.
# NOTE(review): presumably evaluated for each embedding added above - confirm in dataset.Dataset.
trace_data.compute_quality()
trace_data.print_quality()