# Combine Extracted R Data into an AnnData Object

In [None]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from matplotlib.lines import Line2D
from scipy.io import mmread
import scipy
import anndata
import re
import math
import umap
import scanpy

In [None]:
# Directory holding every input and output file (note the trailing slash)
DATA_PATH = "data/"

# Input files exported from R
MTX_PATH = f"{DATA_PATH}HERY_SCT.mtx"
SEURAT_ANNO_PATH = f"{DATA_PATH}HERY_seurat_anno.csv"
CELL_ANNO_PATH = f"{DATA_PATH}cell_annotations_hery.txt"
VAR_PATH = f"{DATA_PATH}HERY_var.csv"
UMAP_PATH = f"{DATA_PATH}HERY_umap.csv"

# Output file for the combined AnnData object
SAVE_PATH = f"{DATA_PATH}HERY_adata_anno_v2.h5ad"

## Read Data and Combine

In [None]:
# Load the SCT normalized counts from the Matrix Market file.
# The matrix is transposed so that rows are cells and columns are genes,
# then converted to compressed sparse column (CSC) storage.
expr_SCT = (
    mmread(MTX_PATH)
    .transpose()
    .tocsc()
)

In [None]:
# Read the metadata exported from Seurat; the first column becomes the index
obs = pd.read_csv(SEURAT_ANNO_PATH, index_col=0)

# Convert the stage into a number, rather than a string: keep the text
# before the first 'h' and cast it to int
# (assumes labels look like '24hpf' -- TODO confirm against the CSV)
obs['hpf'] = obs['Stage'].str.split('h').str[0].astype('int')

# Keep only the columns that are needed
obs = obs[['hpf', 'seurat_clusters']].copy()

# Read the high level cell annotations (tab-separated) and store their
# 'cell_type' column on obs as 'type', matched on the index labels
cell_anno = pd.read_csv(CELL_ANNO_PATH, sep="\t", index_col=0)
obs.loc[cell_anno.index, 'type'] = cell_anno['cell_type']

# Read the UMAP coordinates and add them to obs, matched on the index labels.
# The frame is named `umap_coords` so that it does not rebind (shadow) the
# `umap` module imported at the top of the file.
umap_coords = pd.read_csv(UMAP_PATH, index_col=0)
obs.loc[umap_coords.index, 'UMAP_1'] = umap_coords['UMAP_1']
obs.loc[umap_coords.index, 'UMAP_2'] = umap_coords['UMAP_2']

# Read the var table; the first column becomes the index
var = pd.read_csv(VAR_PATH, index_col=0)

# Select zero columns: every data column is dropped and only the index is kept
var = var[[]].copy()

In [None]:
# Show the cell annotation table that was read from CELL_ANNO_PATH
print(cell_anno)

In [None]:
# Assemble the AnnData object from the expression matrix (X), the cell
# metadata table (obs) and the var table (var)
adata = anndata.AnnData(
    X=expr_SCT,
    obs=obs,
    var=var,
)

# Save the object to SAVE_PATH as a gzip-compressed h5ad file
adata.write_h5ad(SAVE_PATH, compression='gzip')

In [None]:
# Show the first 10 rows of the obs table stored on the AnnData object
print(adata.obs.head(10))

## Plot Diagnostics

In [None]:
# Scatter the cells at their UMAP coordinates, colored by hpf
fig, ax = plt.subplots(figsize=(10, 10))
ax.axis('off')  # hide the axis lines, ticks and labels
ax.set_title('UMAP colored by hpf')

points = ax.scatter(
    adata.obs['UMAP_1'],
    adata.obs['UMAP_2'],
    c=adata.obs['hpf'],
)

# Color scale for the hpf values
fig.colorbar(points, ax=ax)