# Workflow to load Cell Ranger outputs and prepare data for clustering and analysis

## Data loading, QC filtering and Normalization
### Removing doublets + ambient mRNA

In [3]:
import scanpy as sc
import scipy
import pandas as pd
import numpy as np
import anndata as ad
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import warnings
import gc
import psutil
from typing import Optional, Tuple, Dict, Any
import logging

# Module-level logger; INFO and above is emitted through the root handler
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

# Scanpy global settings for working with large datasets
sc.settings.n_jobs = -1  # -1 requests every available core
sc.settings.verbosity = 2
sc.settings.set_figure_params(facecolor='white', dpi=80)

In [4]:
# Memory management utilities
def check_memory_usage():
    """Log the current process's RSS and return it in GB (bytes / 1024**3)."""
    bytes_per_gb = 1024 ** 3
    rss_bytes = psutil.Process().memory_info().rss
    memory_gb = rss_bytes / bytes_per_gb
    logger.info(f"Current memory usage: {memory_gb:.2f} GB")
    return memory_gb

def optimize_memory(adata):
    """Reduce the memory footprint of an AnnData-like object in place.

    A densely stored ``adata.X`` is converted to a SciPy CSR matrix, and
    every ``object``-dtype column of ``adata.obs`` is recast to ``category``.
    A garbage-collection pass is forced afterwards.

    Args:
        adata: Object exposing ``X`` (a matrix, or ``None``) and ``obs``
            (a pandas DataFrame). It is modified in place.

    Returns:
        The same ``adata`` object that was passed in.
    """
    # Import the submodule explicitly: a bare ``import scipy`` does not
    # guarantee ``scipy.sparse`` is reachable on older SciPy releases.
    from scipy import sparse

    matrix = adata.X
    # Anything exposing ``toarray`` is treated as already sparse and kept
    # as-is. A missing matrix (``X is None``) is skipped rather than being
    # handed to ``csr_matrix``, which cannot build a matrix from ``None``.
    if matrix is not None and not hasattr(matrix, 'toarray'):
        adata.X = sparse.csr_matrix(matrix)

    # Optimize dtypes: object columns (typically strings) become categoricals.
    # ``adata.obs`` is re-read on every access so that wrappers which swap
    # the underlying frame on assignment keep working.
    for col in adata.obs.columns:
        if adata.obs[col].dtype == 'object':
            adata.obs[col] = adata.obs[col].astype('category')

    # Force garbage collection so replaced objects are released promptly
    gc.collect()
    return adata

In [None]:
matrix_dir = "/ocean/projects/cis240075p/asachan/datasets/TA_muscle/ERCC1_KO_mice/samples_2025/cellranger_aggr/count/filtered_gene_bc_matrix"
expected_samples: int = 8

# Load the full filtered matrix. Genes are indexed by symbol, only
# gene-expression features are requested (gex_only), and caching is enabled.
adata = sc.read_10x_mtx(
    matrix_dir,
    var_names='gene_symbols',
    cache=True,
    gex_only=True
)

# Make variable names unique. The AnnData method is
# `var_names_make_unique`; `var_names_unique` does not exist and raises
# AttributeError.
adata.var_names_make_unique()

# Extract sample information from barcodes.
# Format: BARCODE-SAMPLE_ID; the text after the last '-' is the sample id.
# Barcodes without any '-' fall back to "sample_1".
sample_ids = [
    f"sample_{barcode.split('-')[-1]}" if '-' in barcode else "sample_1"
    for barcode in adata.obs.index
]

adata.obs['sample_id'] = sample_ids
adata.obs['library_id'] = sample_ids
adata.obs['batch'] = 'new_2024'
adata.obs['dataset'] = 'new'

# Optimize memory (sparse X, categorical obs columns)
adata = optimize_memory(adata)

n_samples_found = adata.obs['sample_id'].nunique()
logger.info(f"Loaded {adata.n_obs} cells and {adata.n_vars} genes")
logger.info(f"Unique samples: {n_samples_found}")
logger.info(f"Expected samples: {expected_samples}")

# Surface a sample-count mismatch instead of leaving it to be spotted by eye
# in the log; this is only a warning so the notebook keeps running.
if n_samples_found != expected_samples:
    logger.warning(
        "Sample count mismatch: found %d unique samples, expected %d",
        n_samples_found,
        expected_samples,
    )

check_memory_usage()

## Data integration

## Clustering and annotation