# Analysis of 3k T cells from cancer

In [1]:
import warnings
warnings.filterwarnings("ignore")
import muon as mu
import numpy as np
import scanpy as sc
import scirpy as ir
from cycler import cycler
from matplotlib import cm as mpl_cm
from matplotlib import pyplot as plt
from mudata import MuData
sc.set_figure_params(figsize=(4, 4))
sc.settings.verbosity = 2  # verbosity: errors (0), warnings (1), info (2), hints (3)

In [2]:
sc.logging.print_header()

scanpy==1.10.4 anndata==0.11.3 umap==0.5.7 numpy==2.1.3 scipy==1.15.1 pandas==2.2.3 scikit-learn==1.6.1 statsmodels==0.14.4 igraph==0.11.6 pynndescent==0.5.13


## Importing data

In [3]:
%cd data

E:\Python code\Machine learning\JupyterNote\Biology\scripy\data


In [4]:
dates = ["0516_","0605_"]
tissue = ["CNS_", "SPL_"]
adatas_tcr = {}
adatas_gex = {}

for D in dates:
    for T in tissue:
        sample = D+T 
        adata_tcr = ir.io.read_10x_vdj(
            sample + "filtered_contig_annotations.csv"
        )
        
        # Load the associated transcriptomics data
        adata_gex = sc.read_10x_h5(
            sample + "sample_filtered_feature_bc_matrix.h5", gex_only = False
        )
        adata_gex.var_names_make_unique()
        adata_df = adata_gex.to_df()
        
        df2 = adata_df.filter(regex='CMO')
        df2['mouse_id'] = df2.idxmax(axis=1)
        adata_gex.obs["mouse_id"] = df2['mouse_id']

        adata_gex.obs["date"] = D[:-1]
        adata_gex.obs["tissue"] = T[:-1]
        
        adata_gex.obs.index = adata_gex.obs.index + "_" + sample
        adata_tcr.obs.index = adata_tcr.obs.index + "_" + sample
    
        adatas_gex[sample] = adata_gex
        adatas_tcr[sample] = adata_tcr
        
        print(f"Loaded {sample}: GEX shape {adata_gex.shape}, TCR shape {adata_tcr.shape}")
        

reading 0516_CNS_sample_filtered_feature_bc_matrix.h5
 (0:00:01)
Loaded 0516_CNS_: GEX shape (4780, 33708), TCR shape (2637, 0)
reading 0516_SPL_sample_filtered_feature_bc_matrix.h5
 (0:00:06)
Loaded 0516_SPL_: GEX shape (34423, 33708), TCR shape (28337, 0)
reading 0605_CNS_sample_filtered_feature_bc_matrix.h5
 (0:00:05)
Loaded 0605_CNS_: GEX shape (27799, 33708), TCR shape (15058, 0)
reading 0605_SPL_sample_filtered_feature_bc_matrix.h5
 (0:00:05)
Loaded 0605_SPL_: GEX shape (39433, 33708), TCR shape (31628, 0)


In [5]:
import anndata
adata_gex = anndata.concat(adatas_gex, index_unique=None)  # Merge GEX
adata_tcr = anndata.concat(adatas_tcr, index_unique=None)  # Merge TCR

In [6]:
adata_gex.obs

Unnamed: 0,mouse_id,date,tissue
AAACCAAAGGGGAGCT-1_0516_CNS_,CMO318,0516,CNS
AAACCAAAGGTCGACT-1_0516_CNS_,CMO325,0516,CNS
AAACCAGCACGTAAAG-1_0516_CNS_,CMO318,0516,CNS
AAACCATTCACTACTC-1_0516_CNS_,CMO325,0516,CNS
AAACCATTCCTCCGGT-1_0516_CNS_,CMO325,0516,CNS
...,...,...,...
GTTGTGGGTGGATCGC-1_0605_SPL_,CMO304,0605,SPL
GTTGTGGGTGGCTGTA-1_0605_SPL_,CMO301,0605,SPL
GTTGTGGGTGGTCCCC-1_0605_SPL_,CMO301,0605,SPL
GTTGTGGGTGTCCCTG-1_0605_SPL_,CMO302,0605,SPL


In [15]:
##### load data
# adata_gex.write("all_gex.h5ad")
# adata_tcr.write("all_tcr.h5ad")

adata_gex = sc.read("all_gex.h5ad")
adata_tcr = sc.read("all_tcr.h5ad")


In [20]:
mdata = mu.MuData({"gex": adata_gex, "airr": adata_tcr})
type(mdata.obs)

pandas.core.frame.DataFrame

In [8]:
import pandas as pd
metadata_paths = {
    "CNS": "CNS_cell_type_annotation.csv",
    "Spleen": "Spleen_cell_type_annotation.csv"
}

# Read metadata files
metadata_list = []
for tissue, path in metadata_paths.items():
    meta = pd.read_csv(path, index_col=0)  # Ensure the first column is treated as the index

    # Extract run number from the index (barcode) → `_1` means "0516", `_2` means "0605"
    meta["run_num"] = meta.index.to_series().str.split("_").str[-1].map({"1": "0516", "2": "0605"})

    # Create the correct barcode format: "AAACCAAAGGGGAGCT-1_CNS_0605"
    meta["barcode"] = meta.index.to_series().str.replace("_1", "", regex=False).str.replace("_2", "", regex=False)
    meta["barcode"] = meta["barcode"] + "_" + tissue + "_" + meta["run_num"]

    metadata_list.append(meta)

# Combine metadata from both CNS and Spleen
metadata = pd.concat(metadata_list, ignore_index=False)  # Keep original index

# Merge metadata with `mdata.obs`
mdata.obs = mdata.obs.merge(metadata, left_index=True, right_on="barcode", how="left")

# Set the index to the barcode column for consistency
mdata.obs.set_index("barcode", inplace=True)

print("Metadata merged successfully!")


Metadata merged successfully!


In [16]:
metadata

Unnamed: 0,orig.ident,nCount_RNA,nFeature_RNA,nCount_HTO,nFeature_HTO,HTO_maxID,HTO_secondID,HTO_margin,HTO_classification,HTO_classification.global,hash.ID,percent.mt,integrated_nn_res.2,seurat_clusters,level_2_clusters,manual_cell_type,run_num,barcode
AAACCCCAGAGGGACC-1_1,CMO322,3975,2006,393,6,CMO322,CMO321,1.816809,CMO322,Singlet,CMO322,3.169811,30,30,30_b1,multiplet,0516,AAACCCCAGAGGGACC-1-1_CNS_0516
AAACCGACACAATGGA-1_1,CMO326,694,506,113,6,CMO326,CMO321,0.041624,Negative,Negative,Negative,3.025937,3,3,3_b6,Myeloid,0516,AAACCGACACAATGGA-1-1_CNS_0516
AAACCGTGTGGGCGTT-1_1,CMO323,7552,3470,293,6,CMO323,CMO326,2.096907,CMO323,Singlet,CMO323,5.203919,33,33,33_b0,multiplet,0516,AAACCGTGTGGGCGTT-1-1_CNS_0516
AAACGGAGTAGTTATG-1_1,CMO321,2394,1372,150,6,CMO321,CMO326,0.165287,CMO321,Singlet,CMO321,1.127820,9,9,9_b1,multiplet,0516,AAACGGAGTAGTTATG-1-1_CNS_0516
AAACGGAGTATCCGCT-1_1,CMO322,3016,1625,379,6,CMO322,CMO326,1.227573,CMO322_CMO326,Doublet,Doublet,2.486737,30,30,30_b1,multiplet,0516,AAACGGAGTATCCGCT-1-1_CNS_0516
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GTTGTGGGTGGATCGC-1_2,CMO304,5022,1971,1041,4,CMO304,CMO301,1.768105,CMO304,Singlet,CMO304,2.508961,2,2,2_b0,CD8+ T,0605,GTTGTGGGTGGATCGC-1-1_Spleen_0605
GTTGTGGGTGGCTGTA-1_2,CMO301,6649,2297,848,4,CMO301,CMO303,1.622641,CMO301,Singlet,CMO301,2.090540,6,6,6_b3,CD8+ T,0605,GTTGTGGGTGGCTGTA-1-1_Spleen_0605
GTTGTGGGTGGTCCCC-1_2,CMO301,568,394,386,4,CMO301,CMO303,0.837520,CMO301,Singlet,CMO301,2.464789,19,19,19_b0,multiplet,0605,GTTGTGGGTGGTCCCC-1-1_Spleen_0605
GTTGTGGGTGTCCCTG-1_2,CMO302,4213,2032,1978,4,CMO302,CMO303,2.139032,CMO302,Singlet,CMO302,2.658438,4,4,4_b1,CD4+ T,0605,GTTGTGGGTGTCCCTG-1-1_Spleen_0605


In [22]:
mdata.mod["airr"].obs.index

Index(['AAACCAAAGGGGAGCT-1_0516_CNS_', 'AAACCAGCACGTAAAG-1_0516_CNS_',
       'AAACCATTCCTCCGGT-1_0516_CNS_', 'AAACCCATCAGTATCG-1_0516_CNS_',
       'AAACCCCAGCCTAAGC-1_0516_CNS_', 'AAACCGTGTATCAGCG-1_0516_CNS_',
       'AAACCTCCAAAGCTTC-1_0516_CNS_', 'AAACCTCCACCGTTTC-1_0516_CNS_',
       'AAACCTGTCCAGCCCT-1_0516_CNS_', 'AAACGATCACAGTCGC-1_0516_CNS_',
       ...
       'GTTGTGCAGGTGAGTG-1_0605_SPL_', 'GTTGTGGGTAAGCGCC-1_0605_SPL_',
       'GTTGTGGGTACTACGG-1_0605_SPL_', 'GTTGTGGGTACTGTGG-1_0605_SPL_',
       'GTTGTGGGTAGCGTAT-1_0605_SPL_', 'GTTGTGGGTGGAGCGA-1_0605_SPL_',
       'GTTGTGGGTGGATCGC-1_0605_SPL_', 'GTTGTGGGTGGCTGTA-1_0605_SPL_',
       'GTTGTGGGTGTCCCTG-1_0605_SPL_', 'GTTGTGGGTGTTACCC-1_0605_SPL_'],
      dtype='object', name='cell_id', length=77660)

In [23]:
# Get common cell indices between gex and airr
common_cells = mdata.mod["gex"].obs.index.intersection(mdata.mod["airr"].obs.index)

# Subset gex to only include these cells
gex_subset = mdata.mod["gex"][common_cells, :].copy()

# Create a new MuData object to preserve alignment
mdata_new = mu.MuData({"gex": gex_subset, "airr": mdata.mod["airr"]})

# Copy previous metadata from the old mdata object
mdata_new.obs = mdata.obs.loc[mdata_new.mod["gex"].obs.index]

# Verify the new object
print(mdata_new)

MuData object with n_obs × n_vars = 77660 × 33708
  2 modalities
    gex:	77660 x 33708
      obs:	'mouse_id', 'date', 'tissue'
    airr:	77660 x 0
      obsm:	'airr'


In [25]:
mdata_new.write(f"all_integrated_mdata_with_TCR.h5mu")