# Bioinformatics Toolkit Module Testing Notebook

This notebook provides a comprehensive testing framework for the modules in the bioinformatics-toolkit. Each module can be tested independently, and the `s3_utils.py` module is bypassed by loading test data directly from scanpy's PBMC dataset.

## Setup and Environment

First, we'll set up the environment and import necessary libraries.

In [None]:
# Basic imports
import sys
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scanpy as sc
import anndata as ad

# Plot defaults: apply scanpy's figure parameters first, then set the
# notebook-wide matplotlib figure size (order kept so the size set here wins).
sc.settings.set_figure_params(facecolor='white', dpi=100)
plt.rcParams['figure.figsize'] = (8, 6)

# Make the repository importable.
# The notebook is expected to run in the container with the repository
# mounted at /data/repo.
REPO_ROOT = '/data/repo'
if REPO_ROOT not in sys.path:
    sys.path.append(REPO_ROOT)

# Toolkit modules exercised by the cells below
from python.sctools import (
    dim_reduction,
    feature_selection,
    geneset,
    normalization,
    qc,
    spatial,
    visualization,
)

# Record interpreter and library versions for reproducibility
print(f"Python version: {sys.version}")
for label, module in (("NumPy", np), ("Pandas", pd), ("ScanPy", sc), ("AnnData", ad)):
    print(f"{label} version: {module.__version__}")

## Loading Test Data

Instead of using the s3_utils.py module, we'll load the PBMC test dataset directly from scanpy.

In [None]:
# Load scanpy's pbmc3k dataset as the test input
adata = sc.datasets.pbmc3k()

# Report the dimensions of the loaded object
print(
    f"Dataset shape: {adata.shape}",
    f"Number of cells: {adata.n_obs}",
    f"Number of genes: {adata.n_vars}",
    sep="\n",
)

# Print the AnnData summary under its own heading
print("\nAnnData object:", adata, sep="\n")

# Keep an untouched copy so later cells can reset to the raw data
adata_orig = adata.copy()

## Module 1: Quality Control (QC)

Test the QC module functionality for single-cell data.

In [None]:
# Reset to original data so this cell can be re-run independently
adata = adata_orig.copy()

# Calculate QC metrics (the results are read from adata.obs below)
qc.calculate_qc_metrics(adata)

# Show QC metrics.
# The pattern includes the column names this cell plots below
# (n_genes_by_counts, total_counts, pct_counts_mt); the older
# 'n_counts' / 'percent' spellings are kept in case the toolkit also emits them.
print("QC metrics in obs:")
print(adata.obs.filter(regex='n_genes|n_counts|total_counts|percent|pct_counts').head())

# Plot one histogram per QC metric
qc_panels = [
    ('n_genes_by_counts', 'Number of genes'),
    ('total_counts', 'Total counts'),
    ('pct_counts_mt', 'Percent mitochondrial'),
]
fig, axs = plt.subplots(1, len(qc_panels), figsize=(18, 5))
for ax, (column, title) in zip(axs, qc_panels):
    sns.histplot(adata.obs[column], kde=True, ax=ax)
    ax.set_title(title)
plt.tight_layout()
plt.show()

# Capture the cell count BEFORE filtering: whether qc.filter_cells modifies
# `adata` in place is not visible here, and reading adata.n_obs afterwards
# would then misreport the original size.
n_cells_before = adata.n_obs

# Filter cells based on QC metrics
qc_filtered = qc.filter_cells(
    adata,
    min_genes=200,
    max_genes=2500,
    min_counts=500,
    max_counts=20000,
    max_pct_mt=5
)

n_cells_after = qc_filtered.n_obs
# Guard against an empty input so the summary never divides by zero
pct_retained = n_cells_after / n_cells_before * 100 if n_cells_before else 0.0

print(f"Original number of cells: {n_cells_before}")
print(f"Number of cells after QC filtering: {n_cells_after}")
print(f"Percentage of cells retained: {pct_retained:.2f}%")

## Module 2: Normalization

Test the normalization module functionality.

In [None]:
# Reset to QC filtered data
adata = qc_filtered.copy()


def _matrix_mean_std(matrix):
    """Return ``(mean, std)`` over all entries of a dense or sparse matrix.

    scipy sparse matrices provide ``.mean()`` but no ``.std()``, so for sparse
    input the population standard deviation is derived from the first two
    moments, sqrt(E[x^2] - E[x]^2), without densifying the matrix.
    """
    if hasattr(matrix, "tocsr"):  # duck-typed check for scipy sparse containers
        mean = float(matrix.mean(dtype=np.float64))
        mean_of_squares = float(matrix.multiply(matrix).mean(dtype=np.float64))
        # Clamp at zero: rounding can push the variance slightly negative
        variance = max(mean_of_squares - mean ** 2, 0.0)
        return mean, float(np.sqrt(variance))
    dense = np.asarray(matrix)
    return float(dense.mean()), float(dense.std())


# Summary statistics of the un-normalized matrix.
# NOTE: the "mean" here is taken over every entry of X (all cells x genes),
# matching the statistic printed after each normalization below.
print("Before normalization:")
raw_mean, raw_std = _matrix_mean_std(adata.X)
print(f"Mean counts per cell: {raw_mean:.2f}")
print(f"Standard deviation: {raw_std:.2f}")

# Test different normalization methods; each runs on its own copy so a
# failure in one method is reported and does not affect the others.
methods = ['log1p', 'scran', 'scran_pearson']
for method in methods:
    try:
        print(f"\nApplying {method} normalization...")
        adata_norm = normalization.normalize_data(adata.copy(), method=method)
        norm_mean, norm_std = _matrix_mean_std(adata_norm.X)
        print(f"After {method} normalization:")
        print(f"Mean: {norm_mean:.2f}")
        print(f"Standard deviation: {norm_std:.2f}")
    except Exception as e:
        print(f"Error with {method} normalization: {e}")

# Continue with log1p normalized data
adata_norm = normalization.normalize_data(adata, method='log1p')
adata = adata_norm.copy()

## Module 3: Feature Selection

Test the feature selection module functionality.

In [None]:
# Highly variable gene (HVG) selection, run on a copy of the current data
print("Selecting highly variable genes...")
adata_hvg = feature_selection.find_variable_genes(adata.copy(), n_top_genes=2000)

# Visualise the HVG result
sc.pl.highly_variable_genes(adata_hvg)

n_hvg = adata_hvg.var['highly_variable'].sum()
print(f"Number of highly variable genes selected: {n_hvg}")

# Variance-based selection is optional: any failure (including the function
# not existing) is reported instead of stopping the notebook.
try:
    print("\nSelecting genes by variance...")
    adata_var = feature_selection.select_genes_by_variance(adata.copy(), n_top_genes=2000)
    # Only count selections when the result exposes a 'selected' column
    if 'selected' in adata_var.var:
        n_selected = adata_var.var['selected'].sum()
    else:
        n_selected = 'N/A'
    print(f"Number of genes selected by variance: {n_selected}")
except Exception as err:
    print(f"Error with variance-based selection: {err}")

# Downstream cells work on the HVG-annotated data
adata = adata_hvg.copy()

## Module 4: Dimension Reduction

Test the dimension reduction module functionality.

In [None]:
# Observation columns used to colour the UMAP and t-SNE embeddings
embedding_colors = ('total_counts', 'n_genes_by_counts')

# --- PCA ---
print("Running PCA...")
adata_pca = dim_reduction.run_pca(adata.copy(), n_comps=50)
pca_shape = adata_pca.obsm['X_pca'].shape
print(f"PCA components shape: {pca_shape}")
sc.pl.pca(adata_pca, color='total_counts')

# --- UMAP (computed from a copy of the PCA result) ---
print("\nRunning UMAP...")
adata_umap = dim_reduction.run_umap(adata_pca.copy())
umap_shape = adata_umap.obsm['X_umap'].shape
print(f"UMAP coordinates shape: {umap_shape}")
sc.pl.umap(adata_umap, color=list(embedding_colors))

# --- t-SNE (best effort: any failure is reported, not raised) ---
print("\nRunning t-SNE...")
try:
    adata_tsne = dim_reduction.run_tsne(adata_pca.copy())
    tsne_shape = adata_tsne.obsm['X_tsne'].shape
    print(f"t-SNE coordinates shape: {tsne_shape}")
    sc.pl.tsne(adata_tsne, color=list(embedding_colors))
except Exception as err:
    print(f"Error running t-SNE: {err}")

# Later cells continue from the UMAP-annotated object
adata = adata_umap.copy()