In [None]:
import scanpy as sc 
import metashells as ms
from metashells.simulation import create_simulated_adata

In [None]:
# Integrated, label-annotated AnnData object used as the input of this notebook.
adata_with_labels_file = '/ocean/projects/cis240075p/asachan/datasets/TA_muscle/ERCC1_KO_mice/integrated_samples/scANVI_v3.h5ad'
adata = sc.read_h5ad(filename=adata_with_labels_file)

In [None]:
# Display the loaded object's summary repr.
adata

In [None]:
# Reset X to the values stored in layers['counts'] (presumably raw counts).
# Take a copy so that X and the layer are separate arrays: the normalisation
# steps applied to X later may operate in place and must not alter the layer.
adata.X = adata.layers['counts'].copy()
# Quick look at the value range of X after the reset.
adata.X.min(), adata.X.max()

In [None]:
# Per-cell QC distributions, one panel per metric.
sc.pl.violin(
    adata,
    keys=["n_genes_by_counts", "total_counts", "pct_counts_mt"],
    multi_panel=True,
    jitter=0.4,
)


In [None]:
# Filter out cells with high mitochondrial gene percentage (keep pct_counts_mt < 5).
# Boolean indexing returns a *view* of the parent AnnData; `.copy()` materialises
# it so the in-place preprocessing below works on a real object instead of
# triggering an implicit copy (and ImplicitModificationWarning).
adata = adata[adata.obs['pct_counts_mt'] < 5].copy()
adata

### Preprocess the data

In [None]:
# Scale every cell to a total of 1e4, then log1p-transform X.
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)
# Flag the 2000 most variable genes (computed on the log-normalised X).
sc.pp.highly_variable_genes(adata, n_top_genes=2000)

In [None]:
# Sanity check: value range of X after normalize_total + log1p.
adata.X.min(), adata.X.max()

In [None]:
# Compute 50 principal components, then build the neighbour graph from 30 of them (n_pcs=30).
sc.tl.pca(adata, n_comps=50)
sc.pp.neighbors(adata, n_pcs=30)

In [None]:
# PCA embedding of all cells, coloured by sample.
sc.pl.embedding(adata, basis='X_pca', color=['sample_id'])

#### Subset to cell type of interest

In [None]:
# Keep female cells only.
adata_sex = adata[adata.obs['sex'] == 'F']
# Pool the Fast IIB and Fast IIX cell types. `.copy()` turns the resulting
# view-of-a-view into a standalone AnnData, so later assignments to
# `adata.obs` / `adata.obsm` do not act on a view of the full dataset.
adata = adata_sex[adata_sex.obs['C_scANVI'].isin(['Fast IIB', 'Fast IIX'])].copy()
adata

In [None]:
# PCA embedding of the subset, coloured by sample, scANVI label and batch.
sc.pl.embedding(adata, basis='X_pca', color=['sample_id', 'C_scANVI', 'batch'])

In [None]:
# Binary label: WT -> 0, KO -> 1. Any other condition value maps to NaN.
adata.obs['y'] = adata.obs['condition'].map({'WT': 0, 'KO': 1})
# dropna=False keeps a NaN row in the tally, so unmapped conditions are
# visible here rather than silently omitted from the counts.
adata.obs['y'].value_counts(dropna=False)

# Harmony to correct for batch effect

In [None]:
# Run Harmony with obs['batch'] as the batch key, then plot the corrected
# embedding (obsm['X_pca_harmony']) coloured by label and batch.
sc.external.pp.harmony_integrate(adata, 'batch')
sc.pl.embedding(adata, basis='X_pca_harmony', color=['y', 'batch'])

In [None]:
# Replace the uncorrected PCA with the Harmony-corrected embedding.
# NOTE(review): presumably so that SeaShells, which is given no embedding key
# below, picks up the corrected coordinates from 'X_pca' — confirm.
adata.obsm['X_pca'] = adata.obsm['X_pca_harmony']

# Run SEACells (via the metashells `SeaShells` wrapper)

In [None]:
import os 

# Output folder for the metacell results; created on first run, reused afterwards.
results_dir = './meta_cell_outputs'
os.makedirs(name=results_dir, exist_ok=True)

In [None]:
from metashells.seashells import SeaShells 

# Configure the metacell run on the batch-corrected subset and execute it.
# cells_per_metacell: 75 is the recommended value; it can go as low as ~20 if really needed.
conch = SeaShells(
    adata_full=adata,
    results_dir=results_dir,
    cells_per_metacell=75,
    sample_col='sample_id',
)
conch.run()

# Build a metacell-level AnnData

In [None]:
# Value range of X just before it is stored as the 'norm_counts' layer.
adata.X.min(), adata.X.max()

In [None]:
# Expose the current X under a named layer (assigned without an explicit copy).
adata.layers['norm_counts'] = adata.X

In [None]:
# DataFrame built from the 'norm_counts' layer; preview the first 3 rows.
gex_df = adata.to_df(layer='norm_counts')
gex_df.head(3)

In [None]:
from glob import glob
import pandas as pd

# Every CSV in the results directory is read as one metacell-assignment table
# (cell barcode index, 'SEACell' column).
files = glob(results_dir + '/*.csv')

# Per-cell metacell label; 'NA' marks cells that no CSV assigns.
gex_df['SEACell'] = 'NA'

for f in files:
    seacells = pd.read_csv(f, index_col=0)
    # Suffix the label with the file stem so labels from different files stay distinct.
    seacells = seacells['SEACell'].astype(str) + '_' + os.path.basename(f).split('.')[0]
    gex_df.loc[seacells.index, 'SEACell'] = seacells

# Drop unassigned cells before averaging. Otherwise they are pooled into one
# artificial 'NA' row, which the label rule used further down
# (`0 if 'WT' in name else 1`) would mark as y=1.
n_unassigned = int((gex_df['SEACell'] == 'NA').sum())
if n_unassigned:
    print(f'Dropping {n_unassigned} cells without a SEACell assignment')

# Mean expression per metacell.
gex_df = gex_df[gex_df['SEACell'] != 'NA'].groupby('SEACell').mean()

In [None]:
# Inspect the metacell-by-gene mean expression table.
display(gex_df)

In [None]:
# Label per metacell: 0 when its name contains 'WT', otherwise 1.
y = [int('WT' not in name) for name in gex_df.index]

In [None]:
# Sample id per metacell: the second-to-last '_'-separated token of its name,
# or 'NA' when the name has fewer than two tokens.
sample_id = [
    parts[-2] if len(parts) >= 2 else 'NA'
    for parts in (name.split('_') for name in gex_df.index)
]

In [None]:
# Assemble the metacell-level AnnData: rows are metacells, columns are genes.
# obs carries the label and sample id derived from the metacell names.
adata = sc.AnnData(
    X=gex_df.values,
    obs=pd.DataFrame({'y': y, 'sample_id': sample_id}, index=gex_df.index),
    var=pd.DataFrame(index=gex_df.columns),
)
adata

In [None]:
# Persist the metacell-level object.
# The path has no placeholders, so a plain string replaces the f-string.
# NOTE(review): the file name mentions only FastIIB, but the metacells were built
# from the pooled 'Fast IIB' + 'Fast IIX' subset — confirm the intended name.
adata.write_h5ad('/ocean/projects/cis240075p/asachan/datasets/TA_muscle/ERCC1_KO_mice/integrated_samples/seacells_F_FastIIB.h5ad')

## Visualize the metacells

In [None]:
# X of the metacell object is the per-metacell mean of the 'norm_counts' layer,
# i.e. of values that already went through normalize_total + log1p earlier in
# this notebook. Running normalize_total / log1p again here would normalise and
# log-transform the data twice, so HVG selection works on X as it is.
sc.pp.highly_variable_genes(adata, n_top_genes=2000, inplace=True)

# PCA on the flagged genes, neighbour graph on that PCA, then UMAP.
sc.tl.pca(adata, use_highly_variable=True)
sc.pp.neighbors(adata, use_rep='X_pca')
sc.tl.umap(adata)
     

In [None]:
# Cast both annotation columns to pandas 'category' dtype before plotting,
# so they are coloured as discrete groups.
adata.obs['sample_id'] = adata.obs['sample_id'].astype('category')
adata.obs['y'] = adata.obs['y'].astype('category')
# Metacell UMAP coloured by label.
sc.pl.umap(adata, color='y', s=150, palette='Set1')

In [None]:
# Same metacell UMAP, coloured by sample id.
sc.pl.umap(adata, color='sample_id', palette='Set1', s=150)