## Load the gene expression (GEX) data using Scanpy

Make sure you are working in an environment with scanpy, pandas, and matplotlib installed.

In [None]:
import scanpy as sc
import pandas as pd

# Read the 10x Genomics HDF5 feature-barcode matrix into an AnnData object.
# The path is relative to the notebook's working directory.
adata = sc.read_10x_h5("filtered_feature_bc_matrix.h5")
# Make duplicated variable (gene) names unique so later name-based lookups
# are unambiguous.
adata.var_names_make_unique()

## Basic Preprocessing

In [None]:
# Quality control: drop cells with fewer than 200 detected genes and genes
# detected in fewer than 3 cells.
sc.pp.filter_cells(adata, min_genes=200)
sc.pp.filter_genes(adata, min_cells=3)

# Flag mitochondrial genes by their 'MT-' prefix and compute per-cell QC metrics.
# NOTE(review): the 'MT-' prefix assumes human gene symbols - confirm for this dataset.
# The metrics are only computed here; no cell is filtered on them in this cell.
adata.var['mt'] = adata.var_names.str.startswith('MT-')
sc.pp.calculate_qc_metrics(adata, qc_vars=['mt'], percent_top=None, log1p=False, inplace=True)

# Normalize and log transform
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)

# Keep the log-normalized, all-gene matrix in .raw before subsetting and scaling.
# Marker ranking uses .raw by default when it is present, so it then runs on
# every gene with log-normalized values instead of clipped z-scores of the
# highly variable genes only (panel genes outside the HVG set could otherwise
# never be matched).
adata.raw = adata.copy()

# Highly variable genes
sc.pp.highly_variable_genes(adata, min_mean=0.0125, max_mean=3, min_disp=0.5)
# .copy() turns the column-subset view into a real AnnData object, so the
# in-place scaling below does not have to modify a view.
adata = adata[:, adata.var.highly_variable].copy()

# Scale and PCA
sc.pp.scale(adata, max_value=10)
sc.tl.pca(adata, svd_solver='arpack')

## Clustering

In [None]:
# Build the neighbourhood graph from the PCA representation
# (10 neighbours per cell, using 40 principal components).
sc.pp.neighbors(adata, n_neighbors=10, n_pcs=40)
# Compute a UMAP embedding from that graph.
sc.tl.umap(adata)
# Cluster the graph with the Leiden algorithm; the labels are read back later
# from adata.obs['leiden'].
sc.tl.leiden(adata, resolution=0.5)  # Adjust resolution if needed

## Find Marker Genes for Each Cluster

In [None]:
# Rank genes for each Leiden cluster with a Wilcoxon rank-sum test.
sc.tl.rank_genes_groups(adata, 'leiden', method='wilcoxon')
# Plot the 25 top-ranked genes per cluster; sharey=False gives every panel
# its own y-axis scale.
sc.pl.rank_genes_groups(adata, n_genes=25, sharey=False)

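## Collect and Save the Marker Genes for Each Cluster

Collect the ranked marker genes of every cluster into one table (`marker_df`, which the annotation step below relies on). Writing the table to CSV is optional, but can be helpful for later inspection.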

In [None]:
# group=None collects the ranking results of all clusters into one long table.
marker_df = sc.get.rank_genes_groups_df(adata, group=None)
# Write the table to the working directory without the row index.
marker_df.to_csv("cluster_marker_genes.csv", index=False)

## Automatic Cell Type Annotation (Using Marker Matching)

If you have a curated dictionary of marker genes, define it as a mapping from cell type to a list of marker gene symbols:

In [None]:
# Small example panel: cell-type label -> list of marker gene symbols.
marker_dict = dict(
    T_cells=["CD3D", "CD3E", "TRAC"],
    B_cells=["CD19", "CD79A"],
    Myeloid=["LYZ", "S100A8", "CD14"],
    Ductal=["KRT19", "CFTR", "MUC1"],
    Endocrine=["INS", "GCG", "SST"],
)

In [None]:
# Example panel focused on pancreatic cancer and the cell populations expected
# in that tissue: cell-type label -> list of marker gene symbols.
# Note that KRT19 is listed under both "Ductal" and "Malignant".
marker_dict_pancreas = dict(
    # Exocrine compartment
    Acinar=["CPA1", "PRSS1", "AMY2A", "CEL", "CTRL"],
    Ductal=["KRT19", "SOX9", "CFTR", "SLC4A4"],
    # Endocrine (islet) compartment
    Beta=["INS", "IAPP", "CDC20B"],
    Alpha=["GCG", "ARX", "TTR"],
    Delta=["SST", "HHEX"],
    Gamma=["PPY"],
    Epsilon=["GHRL"],
    # Stromal / mesenchymal compartment
    Pancreatic_Stellate=["RGS5", "ACTA2", "PDGFRB", "DES"],
    Endothelial=["VWF", "PECAM1", "CDH5", "KDR"],
    Fibroblast_CAF=["FAP", "PDPN", "POSTN", "COL1A1", "THY1"],
    # Malignant epithelium
    Malignant=["EPCAM", "KRT19", "KRT18", "CEACAM5", "MUC1"],
    # Immune compartment
    T_cells=["CD3D", "CD3E", "CD8A", "CD4"],
    B_cells=["CD19", "MS4A1", "CD79A"],
    Myeloid=["LYZ", "CD14", "CD68", "CSF1R"],
    NK_cells=["NCAM1", "KLRD1", "GNLY"],
    Dendritic=["ITGAX", "CLEC9A", "CD1C"],
)

You can then match the top-ranked marker genes of each cluster against such a dictionary:

In [None]:
def assign_cell_type(marker_df, marker_dict, top_n=10):
    """Label each cluster with the cell type whose markers best overlap its top genes.

    Parameters
    ----------
    marker_df : pandas.DataFrame
        Long-format marker table with a 'group' column (cluster label) and a
        'names' column (gene name). Only the first ``top_n`` rows of each
        cluster are inspected, so rows are assumed to be ordered best-first
        within every cluster.
    marker_dict : dict
        Maps a cell-type label to an iterable of marker gene names.
    top_n : int, default 10
        Number of leading rows per cluster compared against the marker lists.

    Returns
    -------
    dict
        Maps each cluster label to the cell type with the largest number of
        shared genes, or to "Unknown" when no marker overlaps or when
        ``marker_dict`` is empty. Ties go to the cell type that comes first
        in ``marker_dict``.
    """
    # Build the marker sets once instead of once per cluster.
    marker_sets = {celltype: set(markers) for celltype, markers in marker_dict.items()}

    cluster_types = {}
    for cluster in marker_df['group'].unique():
        in_cluster = marker_df['group'] == cluster
        top_genes = set(marker_df.loc[in_cluster, 'names'].head(top_n))
        scores = {celltype: len(top_genes & markers) for celltype, markers in marker_sets.items()}
        # default=None keeps an empty marker_dict from raising ValueError in max().
        best_match = max(scores, key=scores.get, default=None)
        if best_match is None or scores[best_match] == 0:
            cluster_types[cluster] = "Unknown"
        else:
            cluster_types[cluster] = best_match
    return cluster_types

# Annotate the clusters using the small example panel `marker_dict`.
# NOTE(review): `marker_dict_pancreas` is defined in this notebook but is not
# passed to any call in this cell - swap it in here if the pancreas-specific
# panel is the intended reference.
cluster_annotations = assign_cell_type(marker_df, marker_dict)
print(cluster_annotations)

Then add the annotations to your AnnData object:

In [None]:
# Translate each cell's Leiden cluster label into its annotated cell type.
# A cluster label that is missing from `cluster_annotations` yields NaN for
# the cells in that cluster.
adata.obs['cell_type'] = adata.obs['leiden'].map(cluster_annotations)