In [None]:
# Uninstall problematic versions
!pip uninstall pandas dask -y

# Install specific compatible versions
!pip install pandas==1.5.3
!pip install dask==2023.1.1
!pip install distributed==2023.1.1
!pip install pyscenic==0.12.1

# Restart kernel manually after this!

In [None]:
import scanpy as sc
import loompy as lp
import numpy as np
import pandas as pd

# --- Load -------------------------------------------------------------------
# Read the 10x HDF5 matrix for the K562 sample into an AnnData object.
adata = sc.read_10x_h5("leukemia_data/k562_10k_raw.h5")
n_cells, n_genes = adata.shape  # AnnData shape is (n_obs, n_vars)
print(f"‚úÖ Loaded: {n_cells} cells, {n_genes} genes")

# De-duplicate gene names (reading the file warned about non-unique names).
adata.var_names_make_unique()
print("‚úÖ Made gene names unique")

# --- Quality control --------------------------------------------------------
# Keep cells with at least 200 detected genes, then genes seen in at least 3 cells.
sc.pp.filter_cells(adata, min_genes=200)
sc.pp.filter_genes(adata, min_cells=3)
n_cells, n_genes = adata.shape
print(f"‚úÖ After filtering: {n_cells} cells, {n_genes} genes")

# --- Normalisation ----------------------------------------------------------
# Per-cell total-count normalisation (scanpy defaults) followed by log(1 + x).
sc.pp.normalize_total(adata)
sc.pp.log1p(adata)

# Persist the processed AnnData object.
adata.write("k562_processed.h5ad")

# --- Export to loom ---------------------------------------------------------
# Attribute values are passed as plain numpy string arrays; the matrix is
# transposed so that rows are genes and columns are cells.
row_attrs = dict(Gene=np.array(adata.var_names, dtype=str))
col_attrs = dict(CellID=np.array(adata.obs_names, dtype=str))

lp.create(
    "k562.loom",
    adata.X.T.astype(np.float32),
    row_attrs,
    col_attrs,
)
print("‚úÖ Created k562.loom in /leukemia_project")

n_cells, n_genes = adata.shape
print(f"üìä Final: {n_cells} cells, {n_genes} genes")

In [None]:
!pyscenic grn \
    k562.loom \
    databases/human_TF_list.txt \
    -o grn.csv \
    --num_workers 8

In [None]:
!pyscenic ctx \
    grn.csv \
    databases/motif_ranking_10kb.feather \
    --annotations_fname databases/motif_ranking_10kb.feather \
    --expression_mtx_fname k562.loom \
    --output ctx.csv \
    --num_workers 8 \
    --mask_dropouts

In [None]:
!pyscenic aucell \
    k562.loom \
    ctx.csv \
    --output aucell.csv \
    --num_workers 8

In [None]:
import pandas as pd
import scanpy as sc

# Read the AUCell output; the first CSV column becomes the row index.
auc_mtx = pd.read_csv("aucell.csv", index_col=0)
n_rows, n_cols = auc_mtx.shape
print(f"‚úÖ AUCell matrix: {n_cols} TFs, {n_rows} cells")

# Average each column over all rows, order from highest to lowest mean,
# and keep the first 20 entries.
mean_activity = auc_mtx.mean()
top_tfs = mean_activity.sort_values(ascending=False).iloc[:20]

print("\nüèÜ TOP 20 MASTER REGULATORS IN K562 LEUKEMIA:")
for rank, (regulator, mean_auc) in enumerate(top_tfs.items(), start=1):
    print(f"{rank:2d}. {regulator:15s} (AUC: {mean_auc:.3f})")

# Write the ranked means to disk.
top_tfs.to_csv("k562_top_regulators.csv")
print("\n‚úÖ Saved to k562_top_regulators.csv")

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Horizontal bar chart of the 10 highest-ranked entries of `top_tfs`,
# drawn through the explicit figure/axes interface.
fig, ax = plt.subplots(figsize=(10, 6))
top_tfs.head(10).plot(kind='barh', ax=ax)

ax.set_xlabel('AUCell Score')
ax.set_title('Top 10 Master Regulators in K562 Leukemia Cells')
ax.invert_yaxis()  # flip the y-axis so the first (highest) entry is drawn at the top

fig.tight_layout()
fig.savefig('k562_top_regulators.png')
plt.show()