In [None]:
# <<<<<<< Setting >>>>>>>

import numpy as np
import pandas as pd
import scanpy as sc
import gseapy
import matplotlib.pyplot as plt
import scvi
import scipy.sparse
import math
import os
import anndata
import matplotlib.patheffects as pe

# Raise scanpy's logging verbosity to level 3 for every step below.
sc.settings.verbosity = 3

# All input and output paths in this notebook are built from the current working directory.
cwd = os.getcwd()
print(cwd)

In [None]:
# <<<<<<< Integrating scRNA-seq and scTCR-seq Data >>>>>>>

batch_list = ["Control_SP", "Control_TG"]

# Contig-annotation columns carried over to the cells, once per chain.
TCR_COLUMNS = ["v_gene", "j_gene", "cdr3"]


def _as_bool(flags):
    """Coerce a contig-annotation flag column to a boolean Series.

    If the column was already parsed as bool it is returned unchanged. Otherwise
    (e.g. the column also holds tokens other than true/false and was therefore read
    as strings) only a case-insensitive "true" counts as True.
    """
    if flags.dtype == bool:
        return flags
    return flags.astype(str).str.lower().eq("true")


def _first_contig_per_cell(contigs, chain):
    """Return one row per joint barcode for `chain`, with chain-suffixed V/J/CDR3 columns.

    NOTE(review): keep="first" retains whichever contig of that chain is listed first
    for a barcode; cells with several contigs of one chain lose the others — confirm
    this is the intended choice.
    """
    return (
        contigs[contigs["chain"] == chain]
        .drop_duplicates(subset="joint_barcode", keep="first")
        [["joint_barcode", *TCR_COLUMNS]]
        .rename(columns={col: f"{col}_{chain}" for col in TCR_COLUMNS})
    )


for batch in batch_list:

    tcr_path = f"{cwd}/filtered_contig_annotations_{batch}_TCR.csv"
    h5_path = f"{cwd}/filtered_feature_bc_matrix_{batch}.h5"

    adata = sc.read_10x_h5(h5_path)
    adata.var_names_make_unique()
    adata.obs["batch"] = batch

    # Keep only high-confidence, productive contigs. The .copy() makes the filtered
    # table independent, so the column assignments below do not write into a slice.
    tcr_df = pd.read_csv(tcr_path)
    keep = _as_bool(tcr_df["high_confidence"]) & _as_bool(tcr_df["productive"])
    tcr_df = tcr_df[keep].copy()

    # Strip only the trailing "-1" and append the batch name, giving the same
    # "<barcode>-<batch>" key that is built for the expression matrix below.
    tcr_df["barcode"] = tcr_df["barcode"].str.replace(r"-1$", "", regex=True)
    tcr_df["joint_barcode"] = tcr_df["barcode"] + "-" + batch

    tra_df = _first_contig_per_cell(tcr_df, "TRA")
    trb_df = _first_contig_per_cell(tcr_df, "TRB")

    # Outer merge: a cell with only one of the two chains keeps that chain.
    tcr_merged = pd.merge(tra_df, trb_df, on="joint_barcode", how="outer")

    # Re-index the cells by "<barcode>-<batch>" and attach the TCR columns;
    # cells without any retained contig get missing values (left join).
    adata.obs.index = adata.obs.index.str.replace(r"-1$", "", regex=True)
    adata.obs["joint_barcode"] = adata.obs.index + "-" + batch
    adata.obs = adata.obs.set_index("joint_barcode")
    adata.obs = adata.obs.join(tcr_merged.set_index("joint_barcode"), how="left")

    print("Number of cells with TRB information::", adata.obs["cdr3_TRB"].notna().sum())
    print(adata.obs[["v_gene_TRA", "cdr3_TRA", "v_gene_TRB", "cdr3_TRB"]].dropna().head())

    adata.write(f"{cwd}/adata_combined_TCR_{batch}_raw.h5ad")

In [None]:
# <<<<<<< Merging multiple AnnData objects across batches >>>>>>>

samples = [
    {"path": f"{cwd}/adata_combined_TCR_Control_SP_raw.h5ad", "batch": "Control_SP"},
    {"path": f"{cwd}/adata_combined_TCR_Control_TG_raw.h5ad", "batch": "Control_TG"},
]

# Load each per-batch object and (re)stamp its batch label.
adatas = []
for sample in samples:
    ad = sc.read_h5ad(sample["path"])
    ad.obs["batch"] = sample["batch"]
    adatas.append(ad)

# anndata.concat replaces the deprecated AnnData.concatenate method.
#   join="inner"         keep the genes shared by all samples (the old method's default)
#   merge="same"         keep var annotations that are identical in every sample
#   label="batch_group"  positional sample label, as batch_key did before
#   index_unique=None    keep the "<barcode>-<batch>" cell names unchanged
# NOTE(review): with join="inner" only obs columns present in every sample are kept;
# the per-batch files above are all written with the same columns.
adata_combined = anndata.concat(
    adatas,
    join="inner",
    merge="same",
    label="batch_group",
    index_unique=None,
)

print(adata_combined.shape)
print(adata_combined.obs["batch"].value_counts())

In [None]:
# <<<<<<<  Save (Optional) >>>>>>>

# Optional checkpoint of the merged object.
merged_raw_path = f"{cwd}/adata_combined_TCR_all_batchs_raw.h5ad"
adata_combined.write(merged_raw_path)

In [None]:
# <<<<<<< Concatenate CDR3,V,J (TRA + TRB) >>>>>>>

# Build "<TRA>_<TRB>" string keys for the CDR3, V-gene and J-gene annotations.
# Both sides go through astype(str), so a missing chain is stringified rather than
# propagated as a missing value.
paired_columns = {
    "cdr3_TRA_TRB": ("cdr3_TRA", "cdr3_TRB"),
    "V_TRA_TRB": ("v_gene_TRA", "v_gene_TRB"),
    "J_TRA_TRB": ("j_gene_TRA", "j_gene_TRB"),
}

for paired_name, (tra_col, trb_col) in paired_columns.items():
    alpha_part = adata_combined.obs[tra_col].astype(str)
    beta_part = adata_combined.obs[trb_col].astype(str)
    adata_combined.obs[paired_name] = alpha_part + "_" + beta_part

In [None]:
# <<<<<<<  filter >>>>>>>

# Quality thresholds: minimum genes per cell, minimum cells per gene.
min_genes_per_cell, min_cells_per_gene = 200, 3

sc.pp.filter_cells(adata_combined, min_genes=min_genes_per_cell)
sc.pp.filter_genes(adata_combined, min_cells=min_cells_per_gene)

# Snapshot the filtered matrix in .raw.
adata_combined.raw = adata_combined.copy()

In [None]:
# <<<<<<< Save  >>>>>>>

# Checkpoint of the filtered, merged object.
filtered_path = f"{cwd}/adata_combined_TCR_all_batchs.h5ad"
adata_combined.write(filtered_path)

In [None]:
# <<<<<<< Reload >>>>>>>

# Resume from the filtered checkpoint; the cells below work on `adata`.
filtered_h5ad = f"{cwd}/adata_combined_TCR_all_batchs.h5ad"
adata = sc.read_h5ad(filtered_h5ad)

In [None]:
# <<<<<<< Create labels by gene expression >>>>>>>

def get_expr_or_zero(adata, gene):
    """Return a per-cell boolean mask that is True where `gene` has a value above zero.

    Parameters
    ----------
    adata : object exposing `var_names`, `n_obs` and `adata[:, gene].X`
    gene : name looked up in `adata.var_names`

    Returns
    -------
    Boolean array. If `gene` is not in `adata.var_names`, a "[skip]" message is
    printed and an all-False array of length `adata.n_obs` is returned.
    """
    if gene not in adata.var_names:
        print(f"[skip] {gene} is not exist")
        return np.zeros(adata.n_obs, dtype=bool)

    column = adata[:, gene].X
    # Anything that is not already a numpy array is densified via .toarray().
    dense = column if isinstance(column, np.ndarray) else column.toarray()
    return dense.flatten() > 0
    
def get_multi_gene_expr(adata, genes):
    """Return a boolean mask that is True for cells where any gene in `genes` is detected.

    Each gene is evaluated with `get_expr_or_zero`; the per-gene masks are then
    combined with a logical OR.
    """
    per_gene_masks = [get_expr_or_zero(adata, gene_name) for gene_name in genes]
    return np.logical_or.reduce(per_gene_masks)


# Per-cell boolean masks: True where at least one of the listed genes is detected.
# get_multi_gene_expr already returns booleans, so no further "> 0" comparison is
# needed, and each gene is looked up only once.

# --- alpha/beta TCR constant regions ---
exp_TRA = get_multi_gene_expr(adata, ["Trac"])
exp_TRB1 = get_multi_gene_expr(adata, ["Trbc1"])
exp_TRB2 = get_multi_gene_expr(adata, ["Trbc2"])
exp_TRB = exp_TRB1 | exp_TRB2
# Trac AND (Trbc1 OR Trbc2). The previous expression was evaluated as
# (Trac & Trbc1) | Trbc2 because & binds tighter than |, and it read Trbc2 from
# `adata_combined`, which does not exist when the notebook is resumed from the
# reload cell.
exp_TRAB = exp_TRA & exp_TRB

# --- co-receptors ---
exp_CD4 = get_multi_gene_expr(adata, ["Cd4"])
exp_CD8a = get_multi_gene_expr(adata, ["Cd8a"])
exp_CD8b1 = get_multi_gene_expr(adata, ["Cd8b1"])
exp_CD8 = exp_CD8a | exp_CD8b1

# --- markers used to label (and gate out) other cell populations ---
exp_Csf1r = get_multi_gene_expr(adata, ["Csf1r"])
exp_Zbtb46 = get_multi_gene_expr(adata, ["Zbtb46"])
exp_H2_Aa = get_multi_gene_expr(adata, ["H2-Aa"])
exp_Lyz2 = get_multi_gene_expr(adata, ["Lyz2"])
exp_Ms4a6c = get_multi_gene_expr(adata, ["Ms4a6c"])
exp_Aif1 = get_multi_gene_expr(adata, ["Aif1"])
exp_Tmem119 = get_multi_gene_expr(adata, ["Tmem119"])
exp_Sox9 = get_multi_gene_expr(adata, ["Sox9"])

exp_Ly6g = get_multi_gene_expr(adata, [
    "Ly6g6g", "Ly6g", "Ly6g6d", "Ly6g6e"
])

exp_Trgv_1_7 = get_multi_gene_expr(adata, [
    "Trgv1", "Trgv2", "Trgv3", "Trgv4", "Trgv5", "Trgv6", "Trgv7"
])

exp_Trdv = get_multi_gene_expr(adata, [
    "Trdv1", "Trdv2-1", "Trdv2-2", "Trdv3", "Trdv4", "Trdv5"
])


# --- alpha/beta T-cell flags, split by CD4/CD8 status ---
adata.obs["TRA_TRB_positive"] = exp_TRA & exp_TRB
adata.obs["TRA_TRB_positive_CD4_positive"] = exp_TRA & exp_TRB & exp_CD4
adata.obs["TRA_TRB_positive_CD8_positive"] = exp_TRA & exp_TRB & exp_CD8
adata.obs["TRA_TRB_positive_CD4_CD8_double_positive"] = exp_TRA & exp_TRB & exp_CD4 & exp_CD8
adata.obs["TRA_TRB_positive_CD4_CD8_double_negative"] = exp_TRA & exp_TRB & ~exp_CD4 & ~exp_CD8

# --- flags for the other populations ---
# The "microgria" key is kept exactly as spelled because later cells index obs by it.
adata.obs["monocyte_dendritic_cells_macrophages"] = exp_Csf1r | exp_Zbtb46 | exp_H2_Aa | exp_Lyz2 | exp_Ms4a6c
adata.obs["microgria"] = exp_Aif1 | exp_Tmem119
adata.obs["astrocyte"] = exp_Sox9
adata.obs["neutrophil"] = exp_Ly6g
adata.obs["gd_T_cell"] = exp_Trgv_1_7 | exp_Trdv

# Union of all the non-alpha/beta-T-cell flags above.
adata.obs["gate_out_cell"] = exp_Csf1r | exp_Zbtb46 | exp_H2_Aa | exp_Lyz2 | exp_Ms4a6c | exp_Aif1 | exp_Tmem119 | exp_Sox9 | exp_Ly6g | exp_Trgv_1_7 | exp_Trdv


In [None]:
# <<<<<<< Cell count >>>>>>>

# Batch selections used for every population: SP only, TG only, and both together.
BATCH_SP = ["Control_SP"]
BATCH_TG = ["Control_TG"]
BATCH_SPTG = ["Control_TG", "Control_SP"]


def _copy_cells(*masks, batches):
    """Return a copy of `adata` with the cells of `batches` for which every mask is True."""
    keep = adata.obs["batch"].isin(batches).to_numpy()
    for mask in masks:
        keep = keep & np.asarray(mask, dtype=bool)
    return adata[keep].copy()


def _sp_tg_sptg(*masks):
    """Return the (SP, TG, SP+TG) copies of `adata` for one combination of masks."""
    return tuple(
        _copy_cells(*masks, batches=batches)
        for batches in (BATCH_SP, BATCH_TG, BATCH_SPTG)
    )


# Boolean obs flags created in the labelling cell.
is_tcr = adata.obs["TRA_TRB_positive"]
is_cd4 = adata.obs["TRA_TRB_positive_CD4_positive"]
is_cd8 = adata.obs["TRA_TRB_positive_CD8_positive"]
is_dp = adata.obs["TRA_TRB_positive_CD4_CD8_double_positive"]
is_dn = adata.obs["TRA_TRB_positive_CD4_CD8_double_negative"]
not_gated = ~adata.obs["gate_out_cell"]

# C1-C48: each triple is (SP, TG, SP+TG); every second triple additionally
# excludes the gate-out cells.
C1, C2, C3 = _sp_tg_sptg(is_tcr)
C4, C5, C6 = _sp_tg_sptg(is_tcr, not_gated)

C7, C8, C9 = _sp_tg_sptg(is_cd4)
C10, C11, C12 = _sp_tg_sptg(is_cd4, not_gated)

C13, C14, C15 = _sp_tg_sptg(is_cd8)
C16, C17, C18 = _sp_tg_sptg(is_cd8, not_gated)

C19, C20, C21 = _sp_tg_sptg(is_dp)
C22, C23, C24 = _sp_tg_sptg(is_dp, not_gated)

C25, C26, C27 = _sp_tg_sptg(is_dn)
C28, C29, C30 = _sp_tg_sptg(is_dn, not_gated)

C31, C32, C33 = _sp_tg_sptg(adata.obs["monocyte_dendritic_cells_macrophages"])
C34, C35, C36 = _sp_tg_sptg(adata.obs["microgria"])
C37, C38, C39 = _sp_tg_sptg(adata.obs["astrocyte"])
C40, C41, C42 = _sp_tg_sptg(adata.obs["neutrophil"])
C43, C44, C45 = _sp_tg_sptg(adata.obs["gd_T_cell"])
C46, C47, C48 = _sp_tg_sptg(adata.obs["gate_out_cell"])

# C49-C57 keep their original numbering, in which TG comes before SP:
# C49/C52/C55 are TG, C50/C53/C56 are SP, C51/C54/C57 are SP+TG.
C50, C49, C51 = _sp_tg_sptg(is_cd4, ~is_dp, not_gated)
C53, C52, C54 = _sp_tg_sptg(is_cd8 | is_dn, not_gated)
C56, C55, C57 = _sp_tg_sptg()


# Label -> subset pairs for the summary table. Each "_inSP" label is paired with
# the Control_SP subset and each "_inTG" label with the Control_TG subset.
# Fixed relative to the previous list: the CD8+ gate-out SP+TG row pointed at C8
# (now C18), and the C49/C50, C52/C53 and C55/C56 rows had SP and TG swapped.
# NOTE(review): the "monocyte_dendritic_cells_macrophages__inSPTG" label has a
# double underscore; it is kept unchanged in case anything reads the CSV by label.
subsets = [
    ("TRA+_TRB+_inSP", C1),
    ("TRA+_TRB+_inTG", C2),
    ("TRA+_TRB+_inSPTG", C3),
    ("TRA+_TRB+_gateout_inSP", C4),
    ("TRA+_TRB+_gateout_inTG", C5),
    ("TRA+_TRB+_gateout_inSPTG", C6),

    ("TRA+_TRB+_CD4+_inSP", C7),
    ("TRA+_TRB+_CD4+_inTG", C8),
    ("TRA+_TRB+_CD4+_inSPTG", C9),
    ("TRA+_TRB+_CD4+_gateout_inSP", C10),
    ("TRA+_TRB+_CD4+_gateout_inTG", C11),
    ("TRA+_TRB+_CD4+_gateout_inSPTG", C12),

    ("TRA+_TRB+_CD8+_inSP", C13),
    ("TRA+_TRB+_CD8+_inTG", C14),
    ("TRA+_TRB+_CD8+_inSPTG", C15),
    ("TRA+_TRB+_CD8+_gateout_inSP", C16),
    ("TRA+_TRB+_CD8+_gateout_inTG", C17),
    ("TRA+_TRB+_CD8+_gateout_inSPTG", C18),

    ("TRA+_TRB+_CD4+_CD8+_inSP", C19),
    ("TRA+_TRB+_CD4+_CD8+_inTG", C20),
    ("TRA+_TRB+_CD4+_CD8+_inSPTG", C21),
    ("TRA+_TRB+_CD4+_CD8+_gateout_inSP", C22),
    ("TRA+_TRB+_CD4+_CD8+_gateout_inTG", C23),
    ("TRA+_TRB+_CD4+_CD8+_gateout_inSPTG", C24),

    ("TRA+_TRB+_CD4-_CD8-_inSP", C25),
    ("TRA+_TRB+_CD4-_CD8-_inTG", C26),
    ("TRA+_TRB+_CD4-_CD8-_inSPTG", C27),
    ("TRA+_TRB+_CD4-_CD8-_gateout_inSP", C28),
    ("TRA+_TRB+_CD4-_CD8-_gateout_inTG", C29),
    ("TRA+_TRB+_CD4-_CD8-_gateout_inSPTG", C30),

    ("monocyte_dendritic_cells_macrophages_inSP", C31),
    ("monocyte_dendritic_cells_macrophages_inTG", C32),
    ("monocyte_dendritic_cells_macrophages__inSPTG", C33),

    ("microgria_inSP", C34),
    ("microgria_inTG", C35),
    ("microgria_inSPTG", C36),

    ("astrocyte_inSP", C37),
    ("astrocyte_inTG", C38),
    ("astrocyte_inSPTG", C39),

    ("neutrophil_inSP", C40),
    ("neutrophil_inTG", C41),
    ("neutrophil_inSPTG", C42),

    ("gd_T_cell_inSP", C43),
    ("gd_T_cell_inTG", C44),
    ("gd_T_cell_inSPTG", C45),

    ("gate_out_cell_inSP", C46),
    ("gate_out_cell_inTG", C47),
    ("gate_out_cell_inSPTG", C48),

    ("TRA+_TRB+_CD4+_CD8-_gateout_inSP", C50),
    ("TRA+_TRB+_CD4+_CD8-_gateout_inTG", C49),
    ("TRA+_TRB+_CD4+_CD8-_gateout_inSPTG", C51),
    ("TRA+_TRB+_CD8+DP+DN_gateout_inSP", C53),
    ("TRA+_TRB+_CD8+DP+DN_gateout_inTG", C52),
    ("TRA+_TRB+_CD8+DP+DN_gateout_inSPTG", C54),

    ("all_cells_inSP", C56),
    ("all_cells_inTG", C55),
    ("all_cells_inSPTG", C57)
]


# Print each count and collect them into one table written next to the notebook.
results = []
for name, subset in subsets:
    print(f"{name}:  {subset.n_obs}")
    results.append({"Subset": name, "Cell_Count": subset.n_obs})

df_counts = pd.DataFrame(results)
df_counts.to_csv("cell_counts_summary.csv", index=False)

In [None]:
# <<<<<<< Define subsets for analysis >>>>>>>

# Gates shared by all six analysis subsets.
cell_flags = adata.obs
not_gated_out = ~cell_flags["gate_out_cell"]
from_tg = cell_flags["batch"] == "Control_TG"
from_sp = cell_flags["batch"] == "Control_SP"
from_tg_or_sp = from_tg | from_sp

# CD4+ alpha/beta T cells that are not CD4/CD8 double positive.
cd4_single_positive = (
    cell_flags["TRA_TRB_positive_CD4_positive"]
    & ~cell_flags["TRA_TRB_positive_CD4_CD8_double_positive"]
    & not_gated_out
)

# CD8+ alpha/beta T cells (the CD8 flag does not exclude CD4) plus double negatives.
cd8_dp_dn = (
    (
        cell_flags["TRA_TRB_positive_CD8_positive"]
        | cell_flags["TRA_TRB_positive_CD4_CD8_double_negative"]
    )
    & not_gated_out
)

subset1 = adata[cd4_single_positive & from_tg].copy()
subset2 = adata[cd4_single_positive & from_sp].copy()
subset3 = adata[cd4_single_positive & from_tg_or_sp].copy()

subset4 = adata[cd8_dp_dn & from_tg].copy()
subset5 = adata[cd8_dp_dn & from_sp].copy()
subset6 = adata[cd8_dp_dn & from_tg_or_sp].copy()

# (name, AnnData) pairs consumed by the subset-analysis loop.
subsets = [
    ("mouse_CD4_TG", subset1),
    ("mouse_CD4_SP", subset2),
    ("mouse_CD4_TG_SP", subset3),
    ("mouse_CD8+DP+DN_TG", subset4),
    ("mouse_CD8+DP+DN_SP", subset5),
    ("mouse_CD8+DP+DN_TG_SP", subset6)
]

In [None]:
# <<<<<<< Define gene lists for analysis >>>>>>>

def _split_by_presence(genes):
    """Split `genes` into (present, absent) with respect to `adata.var_names`.

    Order is preserved in both lists; the absent genes are printed when there are any.
    """
    present = [g for g in genes if g in adata.var_names]
    absent = [g for g in genes if g not in adata.var_names]
    if absent:
        print("The following genes were not found:")
        print(absent)
    return present, absent


gene_list = [
    "Cd3d", "Cd3e", "Cd3g", "Cd4", "Cd8a", "Cd8b1", "Cd44", "Sell", "Ccr7",
    "Klf2", "S1pr1", "Nr4a1", "Nr4a2", "Nr4a3", "Jak1", "Jak2", "Jak3",
    "Isg15", "Isg20", "Rora", "Tbx21", "Tcf7", "Tox", "Gata3", "Cxcr3",
    "Cxcr6", "Cd69", "Il17a", "Ifng", "Il4", "Il10", "Il21", "Cxcr5",
    "Pdcd1", "Foxp3", "Ctla4", "Cx3cr1", "Gzma", "Gzmb", "Gzmk", "Prf1",
    "Ccl5", "Itgae", "Itga1"
]
existing_genes, missing_genes = _split_by_presence(gene_list)


# Same panel as gene_list but with "Zfp683" added and "Il4" left out.
gene_list_2 = [
    "Cd3d", "Cd3e", "Cd3g", "Cd4", "Cd8a", "Cd8b1", "Cd44", "Sell", "Ccr7",
    "Klf2", "S1pr1", "Nr4a1", "Nr4a2", "Nr4a3", "Jak1", "Jak2", "Jak3",
    "Isg15", "Isg20", "Rora", "Zfp683", "Tbx21", "Tcf7", "Tox", "Gata3", "Cxcr3",
    "Cxcr6", "Cd69", "Il17a", "Ifng", "Il10", "Il21", "Cxcr5",
    "Pdcd1", "Foxp3", "Ctla4", "Cx3cr1", "Gzma", "Gzmb", "Gzmk", "Prf1",
    "Ccl5", "Itgae", "Itga1"
]
existing_genes_2, missing_genes_2 = _split_by_presence(gene_list_2)


genes_list_CD4TGSP = [
    "Isg15", "Isg20", "Gbp6", "Gbp9", "Ccl5", "Ifi27l2a", "Irf7", "Oas1a"       #"Cxcl10"
]
existing_genes_CD4TGSP, missing_genes_CD4TGSP = _split_by_presence(genes_list_CD4TGSP)


genes_list_CD4TG = [
    "Oas1a", "Ifi27l2a", "Gbp6", "Gbp9", "Ifi47", "Zbp1", "Irf7", "Isg15", "Parp14"
]
existing_genes_CD4TG, missing_genes_CD4TG = _split_by_presence(genes_list_CD4TG)


genes_list_CD8TGSP = [
    "Ccl5", "Ifitm1", "Ifitm2", "Isg15", "Isg20", "Ifitm3"
]
existing_genes_CD8TGSP, missing_genes_CD8TGSP = _split_by_presence(genes_list_CD8TGSP)


genes_list_CD8TG = [
    "Isg15"
]
existing_genes_CD8TG, missing_genes_CD8TG = _split_by_presence(genes_list_CD8TG)

In [None]:
# <<<<<<< subset analysis >>>>>>>

for name, subset in subsets:

    print(f"Processing: {name}(Num of cells: {subset.n_obs})")

    # ======= pre-analysis of subset =======

    # Normalise every cell to 10,000 total counts, then log-transform.
    sc.pp.normalize_total(subset, target_sum=1e4)
    sc.pp.log1p(subset)

    # Freeze the log-normalised matrix in .raw *after* log1p. The DEG, dotplot and
    # violin calls below read .raw when it is set, and scanpy documents
    # rank_genes_groups as expecting logarithmised data (its log fold changes are
    # computed on that assumption). The explicit .copy() keeps .raw independent of
    # the in-place scaling that follows.
    subset.raw = subset.copy()

    sc.pp.highly_variable_genes(subset, n_top_genes=2000)

    # Scale for PCA only; values are clipped at 10.
    sc.pp.scale(subset, max_value=10)

    sc.tl.pca(subset)

    sc.pp.neighbors(subset)

    sc.tl.umap(subset)

    sc.tl.leiden(subset, resolution=0.5)


    # ======= Cluster =======

    print(f"Processing: {name}(Num of cells: {subset.n_obs}) - Cluster analysis")

    #UMAP-cluster
    sc.pl.umap(
        subset, 
        color="leiden", 
        show=True, 
        save=f"{name}_subset_cluster_UMAP.pdf"
    )

    #DEG
    sc.tl.rank_genes_groups(
        subset, 
        groupby="leiden", 
        method="wilcoxon"
    )
    sc.pl.rank_genes_groups(
        subset, 
        n_genes=25, 
        sharey=False, 
        show=False, 
        save=f"{name}_subset_cluster_DEG_fig.pdf"
    )

    deg_result = subset.uns["rank_genes_groups"]
    groups = deg_result["names"].dtype.names  

    with pd.ExcelWriter(f"{cwd}/figures/{name}_subset_cluster_DEG_results.xlsx") as writer:
        for group in groups:
            df = pd.DataFrame({
                "gene": deg_result["names"][group],
                "logfoldchanges": deg_result["logfoldchanges"][group],
                "pvals": deg_result["pvals"][group],
                "pvals_adj": deg_result["pvals_adj"][group],
                "scores": deg_result["scores"][group],
            })
            df.to_excel(writer, sheet_name=f"cluster_{group}", index=False)
    
    #UMAP-batch
    n_batches = subset.obs["batch"].nunique()
    if n_batches >= 2:

        subset.obs["batch"] = subset.obs["batch"].astype("category")
        subset.obs["batch"] = subset.obs["batch"].cat.reorder_categories(
        ["Control_TG", "Control_SP"], ordered=True
        )

        custom_palette = ["#0033ff", "#B0B0B0"] 

        print(f"{name}: {n_batches} batches have been detected. Save the UMAP plots color-coded by batch.")
        sc.pl.umap(
            subset, 
            color="batch", 
            palette=custom_palette, 
            show=True, 
            save=f"{name}_subset_batch_UMAP.pdf"
        )

    else:
        print(f"{name}: Since there is only one batch type, batch UMAP has been skipped.")
        

    # ======= cdr3 =======

    print(f"Processing: {name}(Num of cells: {subset.n_obs}) - cdr3")

    cdr3_df = pd.read_csv(f"{cwd}/cdr3_summary_tetTG_-_NaTG.csv")

    cdr3_types = [
        ("CD4TRA", "cdr3_TRA"),
        ("CD4TRB", "cdr3_TRB"),
        ("CD8TRA", "cdr3_TRA"),
        ("CD8TRB", "cdr3_TRB"),
    ]

    subset.obs["cdr3_positive"] = "NA"

    for label, subset_col in cdr3_types:
        cdr3_list = cdr3_df[f"{label}_cdr3"].dropna().unique()
        cdr3_set = set(cdr3_list)

        condition = (
            subset.obs[subset_col].isin(cdr3_set) &
            (subset.obs["cdr3_positive"] == "NA")  
        )

        subset.obs.loc[condition, "cdr3_positive"] = "cdr3_true"

    subset.obs["cdr3_positive"] = pd.Categorical(subset.obs["cdr3_positive"], categories=["cdr3_true", "NA"])

    #UMAP
    sc.pl.umap(
        subset, 
        color="cdr3_positive", 
        palette=["red", "lightgray"], 
        show=True, 
        save=f"{name}_subset_cdr3_UMAP.pdf"
    )

    #DEG
    sc.tl.rank_genes_groups(
        subset, 
        groupby="cdr3_positive", 
        method="wilcoxon"
    )

    sc.pl.rank_genes_groups(
        subset, 
        n_genes=25, 
        sharey=False, 
        show=False , 
        save=f"{name}_subset_cdr3_DEG_fig.pdf"
    )

    deg_result = subset.uns["rank_genes_groups"]

    group = "cdr3_true"
    df = pd.DataFrame({
        "gene": deg_result["names"][group],
        "logfoldchanges": deg_result["logfoldchanges"][group],
        "pvals": deg_result["pvals"][group],
        "pvals_adj": deg_result["pvals_adj"][group],
        "scores": deg_result["scores"][group],
    })

    with pd.ExcelWriter(f"{cwd}/figures/mouse_{name}_subset_cdr3_DEG_results.xlsx") as writer:
        df.to_excel(writer, sheet_name=f"{group}_vs_rest", index=False)


    # ======= cdr3_TRA or cdr3_TRB =======

    for chain, chain_label in [("TRA", "cdr3_TRA"), ("TRB", "cdr3_TRB")]:

        print(f"Processing: {name}(Num of cells: {subset.n_obs}) - cdr3_{chain} ")

        label_col = f"{chain_label}_positive"
        true_label = f"{chain_label}_true"

        # Pool the reference CDR3 sequences from every cdr3_df column whose name
        # ends with "<chain>_cdr3".
        relevant_cols = [col for col in cdr3_df.columns if col.endswith(f"{chain}_cdr3")]
        cdr3_combined_set = set()

        for col in relevant_cols:
            cdr3_combined_set.update(cdr3_df[col].dropna().unique())

        condition = subset.obs[chain_label].isin(cdr3_combined_set)

        # Fix the category order explicitly. Without `categories`, pandas sorts the
        # labels and "NA" comes before "cdr3_*_true", so the ["red", "lightgray"]
        # palette would colour the NA cells red, the opposite of the other CDR3
        # plots. It also keeps both categories present when no cell matches.
        subset.obs[label_col] = pd.Categorical(
            np.where(condition, true_label, "NA"),
            categories=[true_label, "NA"],
        )

        #UMAP
        sc.pl.umap(
            subset,
            color=label_col,
            palette=["red", "lightgray"],
            show=True,
            save=f"{name}_subset_{chain_label}_UMAP.pdf"
        )

        # rank_genes_groups raises when a group has fewer than two cells, so skip
        # the DEG step for this chain instead of aborting the whole subset loop.
        group_sizes = subset.obs[label_col].value_counts()
        if group_sizes.min() < 2:
            print(f"{name}: fewer than two cells in one of the {label_col} groups; DEG has been skipped.")
            continue

        #DEG
        sc.tl.rank_genes_groups(
            subset,
            groupby=label_col,
            method="wilcoxon"
        )

        sc.pl.rank_genes_groups(
            subset,
            n_genes=25,
            sharey=False,
            show=False,
            save=f"{name}_subset_{chain_label}_DEG_fig.pdf"
        )

        # Export the "<chain>_true vs rest" statistics to one workbook per chain.
        group = true_label
        deg_result = subset.uns["rank_genes_groups"]
        df_chain = pd.DataFrame({
            "gene": deg_result["names"][group],
            "logfoldchanges": deg_result["logfoldchanges"][group],
            "pvals": deg_result["pvals"][group],
            "pvals_adj": deg_result["pvals_adj"][group],
            "scores": deg_result["scores"][group],
        })

        with pd.ExcelWriter(f"{cwd}/figures/{name}_subset_{chain_label}_DEG_results.xlsx") as writer:
            df_chain.to_excel(writer, sheet_name=f"{group}_vs_rest", index=False)


    # ======= cdr3-cdr3 combo =======

    print(f"Processing: {name}(Num of cells: {subset.n_obs}) - cdr3-cdr3 combo")

    df = pd.read_csv(f"{cwd}/cdr3_summary_tetTG_-_NaTG_forcdr3combo.csv")

    combo_list = [
        f"{tra}_{trb}" 
        for tra in df["TRA_cdr3"].dropna().unique()
        for trb in df["TRB_cdr3"].dropna().unique()
    ]
    
    combo_set = set(combo_list)

    subset.obs["cdr3-cdr3_combo"] = subset.obs["cdr3_TRA_TRB"].apply(
        lambda x: f"cdr3-cdr3_true" if x in combo_set else "NA"
    )

    subset.obs["cdr3-cdr3_combo"] = pd.Categorical(subset.obs["cdr3-cdr3_combo"], categories=["cdr3-cdr3_true", "NA"])

    print(subset.obs["cdr3-cdr3_combo"].cat.categories)

    # Reorder the cells so the "NA" cells come first and the matching cells last
    # (presumably so the matching cells are drawn on top in the UMAP — confirm).
    order = subset.obs["cdr3-cdr3_combo"] == "NA"
    na_cells = subset.obs[order].index
    true_cells = subset.obs[~order].index

    new_order = na_cells.tolist() + true_cells.tolist()

    # .copy() turns the reordered slice into a real object. A bare slice is a view
    # of the original, and the DEG call below writes into .uns, which would
    # otherwise trigger an implicit copy with a warning.
    subset = subset[new_order, :].copy()


    true_cells = subset.obs.loc[subset.obs["cdr3-cdr3_combo"] == "cdr3-cdr3_true", ["cdr3_TRA_TRB", "v_gene_TRA", "j_gene_TRA", "cdr3_TRA", "v_gene_TRB", "j_gene_TRB", "cdr3_TRB"]]

    combo_freq = true_cells["cdr3_TRA_TRB"].value_counts().reset_index()
    combo_freq.columns = ["cdr3_combo", "frequency"]

    combo_details = true_cells.drop_duplicates(subset=["cdr3_TRA_TRB"]).rename(columns={"cdr3_TRA_TRB": "cdr3_combo"})

    result_df = pd.merge(combo_freq, combo_details, on="cdr3_combo", how="left")

    result_df.to_excel(f"{cwd}/figures/mouse_{name}_subset_cdr3-cdr3_list_frequency.xlsx", index=False)

    #UMAP
    sc.pl.umap(
        subset, 
        color="cdr3-cdr3_combo", 
        palette=["red", "lightgray"], 
        show=True, 
        save=f"{name}_subset_cdr3-cdr3_UMAP.pdf"
    )

    #DEG
    sc.tl.rank_genes_groups(
        subset, 
        groupby="cdr3-cdr3_combo", 
        method="wilcoxon"
    )

    sc.pl.rank_genes_groups(
        subset, 
        n_genes=25, 
        sharey=False, 
        show=False , 
        save=f"{name}_subset_cdr3-cdr3_DEG_fig.pdf"
    )

    deg_result = subset.uns["rank_genes_groups"]

    group = "cdr3-cdr3_true"
    df = pd.DataFrame({
        "gene": deg_result["names"][group],
        "logfoldchanges": deg_result["logfoldchanges"][group],
        "pvals": deg_result["pvals"][group],
        "pvals_adj": deg_result["pvals_adj"][group],
        "scores": deg_result["scores"][group],
    })

    with pd.ExcelWriter(f"{cwd}/figures/{name}_subset_cdr3-cdr3_DEG_results.xlsx") as writer:
        df.to_excel(writer, sheet_name=f"{group}_vs_rest", index=False)


    #dotplot
    dp = sc.pl.dotplot(
        subset,
        var_names=existing_genes_2,
        groupby="cdr3-cdr3_combo",
        standard_scale='var',
        color_map="GnBu",
        show=True,
        return_fig=True
    )
    dp.savefig(f"{cwd}/figures/{name}_subset_cdr3-cdr3_dotplot_.pdf")


    #dotplot and violin plot
    custom_palette = {"cdr3-cdr3_true": "#1f77b4", "NA": "#B0B0B0"}

    # One (file-name tag, gene list) pair per marker panel. Every panel gets
    # the same dot plot + violin plot pair, produced in the order listed here,
    # so adding a panel only requires a new entry in this list.
    marker_panels = [
        ("CD4TGSP", existing_genes_CD4TGSP),
        ("CD4TG", existing_genes_CD4TG),
        ("CD8TGSP", existing_genes_CD8TGSP),
        ("CD8TG", existing_genes_CD8TG),
    ]

    for panel_tag, panel_genes in marker_panels:
        dp = sc.pl.dotplot(
            subset,
            var_names=panel_genes,
            groupby="cdr3-cdr3_combo",
            standard_scale='var',
            color_map="GnBu",
            show=True,
            return_fig=True
        )
        dp.savefig(f"{cwd}/figures/{name}_subset_cdr3-cdr3_dotplot_{panel_tag}.pdf")

        sc.pl.violin(
            subset,
            keys=panel_genes,
            groupby="cdr3-cdr3_combo",
            palette=custom_palette,
            stripplot=True,
            jitter=0.25,
            size=2,
            rotation=90,
            show=True,
            save=f"_{name}_subset_cdr3-cdr3_violinplot_{panel_tag}.pdf"
        )


    #volcano plot
    # Gather the per-gene statistics of the "cdr3-cdr3_true" group from the
    # rank_genes_groups result stored on `subset`.
    group = "cdr3-cdr3_true"
    deg_result = subset.uns["rank_genes_groups"]
    stat_fields = ("logfoldchanges", "pvals", "pvals_adj", "scores")
    df_deg = pd.DataFrame({
        "gene": deg_result["names"][group],
        **{field: deg_result[field][group] for field in stat_fields},
    })

    # y-axis value of the volcano plot, and the significance flag
    # (raw p-value below 0.05 together with |logfoldchanges| above 2).
    df_deg["-log10_pval"] = -np.log10(df_deg["pvals"])
    df_deg["significant"] = df_deg["pvals"].lt(0.05) & df_deg["logfoldchanges"].abs().gt(2)

    def plot_volcano(df, name, cwd, label_genes=True, xlim=None, ylim=None, label_specific_genes=None, label_suffix=None, base_ns_color="#B0B0B0", base_sig_color="#d62728", highlight_color="#1f77b4"):
        """Save an auto-scaled and a fixed-range volcano plot (PDF) for one DEG table.

        Parameters
        ----------
        df : DataFrame
            Needs the columns "gene", "logfoldchanges", "-log10_pval" and a
            boolean "significant". It is copied; the caller's frame is untouched.
        name, cwd : str
            `name` appears in the title and file names; files are written to
            ``{cwd}/figures/``.
        label_genes : bool
            When True (and `label_specific_genes` is None) every significant
            gene is highlighted and labelled.
        xlim, ylim : sequence or None
            Axis limits, applied to the "_fixedrange" figure only.
        label_specific_genes : iterable of str or None
            Genes to highlight and label, matched case-insensitively. Takes
            precedence over `label_genes`.
        label_suffix : str or None
            File-name suffix; defaults to "specificgenes", "label" or
            "nolabel" according to the labelling mode.
        base_ns_color, base_sig_color, highlight_color : str
            Colours of non-significant, significant and highlighted points.
        """
        table = df.copy()
        is_sig = table["significant"]

        # Decide which genes are highlighted and which suffix is the default.
        if label_specific_genes is not None:
            wanted = {g.upper() for g in label_specific_genes}
            is_hi = table["gene"].str.upper().isin(wanted)
            default_suffix = "specificgenes"
        elif label_genes:
            is_hi = is_sig
            default_suffix = "label"
        else:
            is_hi = pd.Series(False, index=table.index)
            default_suffix = "nolabel"

        if label_suffix is None:
            label_suffix = default_suffix

        genes_to_label = table[is_hi]

        # (row mask, colour, alpha, zorder) for each scatter layer, drawn in
        # this order. A highlighted but non-significant gene is in both the
        # first and the last layer; the higher zorder keeps the highlight on top.
        layers = (
            (~is_sig, base_ns_color, 0.7, 1),
            (is_sig & ~is_hi, base_sig_color, 0.9, 2),
            (is_hi, highlight_color, 1.0, 4),
        )

        def _draw(ax, fix_range=False):
            """Draw points, threshold lines, labels and axis text onto `ax`."""
            point_size = 24

            for mask, colour, opacity, layer_z in layers:
                ax.scatter(
                    table.loc[mask, "logfoldchanges"],
                    table.loc[mask, "-log10_pval"],
                    c=colour, s=point_size, alpha=opacity, edgecolors="none", zorder=layer_z
                )

            # Threshold guides: p = 0.05 and logfoldchanges = -2 / +2.
            ax.axhline(-np.log10(0.05), linestyle='--', color='black', linewidth=0.8)
            for x_cut in (-2, 2):
                ax.axvline(x_cut, linestyle='--', color='black', linewidth=0.8)

            if not genes_to_label.empty:
                ranked = genes_to_label.sort_values("-log10_pval", ascending=False)
                dx, dy = 10, 10  # label offset, scaled by 0.05 into data units

                for _, row in ranked.iterrows():
                    x0, y0 = row["logfoldchanges"], row["-log10_pval"]
                    x1, y1 = x0 + dx * 0.05, y0 + dy * 0.05

                    ax.text(
                        x1, y1, row["gene"],
                        fontsize=9, weight="bold", color="black", zorder=6,
                        path_effects=[pe.withStroke(linewidth=3, foreground="white")]
                    )
                    # Leader line from the point to its label.
                    ax.plot([x0, x1], [y0, y1], color="black", linewidth=0.8, zorder=5)

            ax.set_xlabel("log2 Fold Change")
            ax.set_ylabel("-log10(p-value)")
            ax.set_title(f"{name}_subset (cdr3-cdr3_true vs rest)")

            # Keep this as the last step and in this order: the limits are
            # applied first, then autoscaling is switched off.
            if fix_range:
                if xlim is not None:
                    ax.set_xlim(xlim)
                if ylim is not None:
                    ax.set_ylim(ylim)
                ax.autoscale(enable=False)
                ax.margins(x=0, y=0)

        # Figure 1: automatic axis range.
        auto_fig, auto_ax = plt.subplots(figsize=(8, 6))
        _draw(auto_ax, fix_range=False)
        auto_fig.tight_layout()
        auto_fig.savefig(f"{cwd}/figures/{name}_subset_cdr3-cdr3_volcano_plot_{label_suffix}.pdf")
        plt.close(auto_fig)

        # Figure 2: caller-supplied axis range.
        fixed_fig, fixed_ax = plt.subplots(figsize=(8, 6), constrained_layout=False)
        _draw(fixed_ax, fix_range=True)
        fixed_fig.savefig(f"{cwd}/figures/{name}_subset_cdr3-cdr3_volcano_plot_{label_suffix}_fixedrange.pdf", bbox_inches="tight")
        plt.close(fixed_fig)



    # Baseline volcano plots: all significant genes labelled, then no labels.
    plot_volcano(df_deg, name, cwd, label_genes=True, xlim=[-10, 10])
    plot_volcano(df_deg, name, cwd, label_genes=False, xlim=[-10, 10])


    # Curated gene panels. Each one produces a volcano plot in which only the
    # listed genes are highlighted and labelled.
    specificgenesCD4_TG = [
        "Gbp9", "Gbp6", "Oas1a", "Ifi27l2a", "Ifi47", "Parp14", "Zbp1", "Irf7", "Isg15", "Cxcr3", "Icos", "Stat1"
    ]

    specificgenesCD4_TG_SP = [
        "Itgb1", "Icos", "Isg15", "Bhlhe40", "Cxcr3", "Ifng", "Nr4a1", "Gbp6", "Oas1a", "Isg20", "Nkg7", "Gbp9", "Icam1", "Ccl5", "Hif1a", "Ifi27l2a", "Irf7", "Zbp1", "S1pr1", "Il7r", "Klf2", "Bcl2", "Ccr7", "Sell"
    ]

    specificgenesCD8_DP_DN_TG = [
        "Trbv13-3", "Gzmb", "Gzma", "Itga1", "Ccr2", "Ly6a", "Isg15", "Cd8b1", "Gzmk", "Ccl5", "Ctla4", "Ifng", "Cd8a", "S1pr1", "Klf2", "Bcl2", "Tcf7", "Ccr7", "Sell", "Eomes", "Tox"
    ]

    # The list used to open with a stray "" literal that was implicitly
    # concatenated with "Gzma"; the resulting list is unchanged without it.
    specificgenesCD8_DP_DN_TG_SP = [
        "Gzma", "Ccl5", "S100a6", "Gzmb", "Trbv13-3", "Ifitm1", "Itga1", "Litaf", "Ifng", "Gzmk", "Ccr2", "Bhlhe40", "Nr4a2", "Ctla4", "Ccl4", "Ifitm2", "Tbx21", "Ccr5", "Isg15", "Cd38", "Icos", "Cxcr6", "Isg20", "Nr4a1", "Nr4a3", "Ifitm3", "Nkg7", "S1pr1", "Klf2", "Itga6", "Tox", "Ccr7", "Tcf7", "Sell", "Ccr9", "Tnfsf8"
    ]

    # (file-name suffix, gene list), plotted in this order.
    specific_gene_panels = [
        ("specificgenesCD4_TG", specificgenesCD4_TG),
        ("specificgenesCD4_TG_SP", specificgenesCD4_TG_SP),
        ("specificgenesCD8_DP_DN_TG", specificgenesCD8_DP_DN_TG),
        ("specificgenesCD8_DP_DN_TG_SP", specificgenesCD8_DP_DN_TG_SP),
    ]

    for panel_suffix, panel_genes in specific_gene_panels:
        plot_volcano(
            df_deg,
            name,
            cwd,
            label_genes=False,
            xlim=[-10, 10],
            label_specific_genes=panel_genes,
            label_suffix=panel_suffix
        )


    # ======= v =======

    print(f"Processing: {name}(Num of cells: {subset.n_obs}) - V")

    V_df = pd.read_csv(f"{cwd}/selected_V.csv")

    # (column of selected_V.csv, obs column that is compared against it)
    V_types = [
        ("CD4_TRA_V", "v_gene_TRA"),
        ("CD4_TRB_V", "v_gene_TRB"),
        ("CD8_TRA_V", "v_gene_TRA"),
        ("CD8_TRB_V", "v_gene_TRB"),
    ]

    # A cell becomes "V_true" as soon as one of its V genes appears in any of
    # the selected lists; every other cell stays "NA".
    v_hit = np.zeros(subset.n_obs, dtype=bool)
    for label, subset_col in V_types:
        V_set = set(V_df[label].dropna().unique())
        v_hit |= subset.obs[subset_col].isin(V_set).to_numpy()

    subset.obs["V_positive"] = pd.Categorical(
        np.where(v_hit, "V_true", "NA"),
        categories=["V_true", "NA"]
    )
    print(subset.obs["V_positive"].cat.categories)

    #UMAP
    sc.pl.umap(
        subset,
        color="V_positive",
        palette=["red", "lightgray"],
        show=True,
        save=f"{name}_subset_V_UMAP.pdf"
    )

    #DEG
    sc.tl.rank_genes_groups(subset, groupby="V_positive", method="wilcoxon")
    sc.pl.rank_genes_groups(
        subset,
        n_genes=25,
        sharey=False,
        show=False,
        save=f"{name}_subset_V_DEG_fig.pdf"
    )

    # Export the statistics of the "V_true" group to a one-sheet workbook.
    group = "V_true"
    deg_result = subset.uns["rank_genes_groups"]
    df = pd.DataFrame({
        "gene": deg_result["names"][group],
        **{field: deg_result[field][group]
           for field in ("logfoldchanges", "pvals", "pvals_adj", "scores")},
    })

    with pd.ExcelWriter(f"{cwd}/figures/{name}_subset_V_DEG_results.xlsx") as writer:
        df.to_excel(writer, sheet_name=f"{group}_vs_rest", index=False)


    # ======= v-v combo =======

    print(f"Processing: {name}(Num of cells: {subset.n_obs}) - v-v combo")

    # Every pairing "<TRA_V>_<TRB_V>" of the selected TRA and TRB V genes
    # counts as a V-V combo of interest.
    vv_table = pd.read_csv(f"{cwd}/selected_V_TRA_TRB_forVcombo.csv")
    tra_choices = vv_table["TRA_V"].dropna().unique()
    trb_choices = vv_table["TRB_V"].dropna().unique()
    combo_set = {f"{tra}_{trb}" for tra in tra_choices for trb in trb_choices}

    subset.obs["V-V_combo"] = pd.Categorical(
        np.where(subset.obs["V_TRA_TRB"].isin(combo_set), "V-V_true", "NA"),
        categories=["V-V_true", "NA"]
    )
    print(subset.obs["V-V_combo"].cat.categories)

    #UMAP
    sc.pl.umap(
        subset,
        color="V-V_combo",
        palette=["red", "lightgray"],
        show=True,
        save=f"{name}_subset_V-V_UMAP.pdf"
    )

    #DEG
    sc.tl.rank_genes_groups(subset, groupby="V-V_combo", method="wilcoxon")
    sc.pl.rank_genes_groups(
        subset,
        n_genes=25,
        sharey=False,
        show=False,
        save=f"{name}_subset_V-V_DEG_fig.pdf"
    )

    # Export the statistics of the "V-V_true" group to a one-sheet workbook.
    group = "V-V_true"
    deg_result = subset.uns["rank_genes_groups"]
    df = pd.DataFrame({
        "gene": deg_result["names"][group],
        **{field: deg_result[field][group]
           for field in ("logfoldchanges", "pvals", "pvals_adj", "scores")},
    })

    with pd.ExcelWriter(f"{cwd}/figures/{name}_subset_V-V_DEG_results.xlsx") as writer:
        df.to_excel(writer, sheet_name=f"{group}_vs_rest", index=False)

In [None]:
# <<<<<<< Define Subsets_2nd for analysis >>>>>>>

def _without_cells(adata, cell_ids):
    """Return a copy of `adata` without the observations named in `cell_ids`."""
    return adata[~adata.obs_names.isin(cell_ids), :].copy()


# Cells of leiden cluster "5" of subset3 are dropped from subset3 and, by
# cell name, from subset1 as well.
exclude_clusters = ["5"]
exclude_cells = subset3.obs.index[subset3.obs["leiden"].isin(exclude_clusters).to_numpy()]
subset_2nd_1 = _without_cells(subset3, exclude_cells)
subset_2nd_2 = _without_cells(subset1, exclude_cells)


# Same scheme for leiden clusters "1" and "5" of subset6, applied to subset6
# and subset4.
exclude_clusters = ["1", "5"]
exclude_cells = subset6.obs.index[subset6.obs["leiden"].isin(exclude_clusters).to_numpy()]
subset_2nd_3 = _without_cells(subset6, exclude_cells)
subset_2nd_4 = _without_cells(subset4, exclude_cells)


# (label used in file names and titles, AnnData object)
subsets_2nd = [
    ("mouse_CD4_TG+SP_exclude_cluster_5", subset_2nd_1),
    ("mouse_CD4_TG_exclude_cluster_5", subset_2nd_2),
    ("mouse_CD8+DP+DN_TG+SP_exclude_cluster_1_5", subset_2nd_3),
    ("mouse_CD8+DP+DN_TG_exclude_cluster_1_5", subset_2nd_4),
]


# Report the number of cells kept in each second-round subset.
for name, subset in subsets_2nd:
    print(f"{name}:  {subset.n_obs}")

In [None]:
# <<<<<<< Subsets_2nd analysis >>>>>>>

for name, subset in subsets_2nd:

    #UMAP
    sc.pl.umap(subset, color="leiden", show=True, save=f"{name}_subset_cluster_UMAP.pdf")

    #DEG
    # Only leiden clusters with more than 3 cells enter the per-cluster test,
    # and at least two such clusters are required.
    cluster_counts = subset.obs["leiden"].value_counts()
    valid_clusters = cluster_counts.loc[cluster_counts.gt(3)].index.tolist()

    if len(valid_clusters) < 2:
        print(f" {name}: DEG analysis was skipped because there were fewer than two valid clusters.")
    else:
        subset_filtered = subset[subset.obs["leiden"].isin(valid_clusters), :].copy()
        sc.tl.rank_genes_groups(subset_filtered, groupby="leiden", method="wilcoxon")
        sc.pl.rank_genes_groups(
            subset_filtered,
            n_genes=25,
            sharey=False,
            show=False,
            save=f"{name}_subset_cluster_DEG_fig.pdf"
        )

        deg_result = subset_filtered.uns["rank_genes_groups"]
        groups = deg_result["names"].dtype.names
        stat_fields = ("logfoldchanges", "pvals", "pvals_adj", "scores")

        # One worksheet per cluster in a single workbook.
        with pd.ExcelWriter(f"{cwd}/figures/{name}_subset_cluster_DEG_results.xlsx") as writer:
            for group in groups:
                df = pd.DataFrame({
                    "gene": deg_result["names"][group],
                    **{field: deg_result[field][group] for field in stat_fields},
                })
                df.to_excel(writer, sheet_name=f"cluster_{group}", index=False)



    #UMAP-batch
    n_batches = subset.obs["batch"].nunique()
    if n_batches < 2:
        print(f"{name}: Since there is only one batch type, batch UMAP has been skipped.")
    else:
        # Fix the category order so the two palette colours are assigned to
        # Control_TG first and Control_SP second.
        subset.obs["batch"] = (
            subset.obs["batch"]
            .astype("category")
            .cat.reorder_categories(["Control_TG", "Control_SP"], ordered=True)
        )
        custom_palette = ["#0033ff", "#B0B0B0"]
        print(f"{name}: {n_batches} batches have been detected. Save the UMAP plots color-coded by batch.")
        sc.pl.umap(
            subset,
            color="batch",
            palette=custom_palette,
            show=True,
            save=f"{name}_subset_batch_UMAP.pdf"
        )



    # ======= cdr3-cdr3 combo =======

    # Reorder the cells so that all "NA" cells come before the remaining ones
    # (presumably so the non-NA cells are drawn last on the UMAP; TODO confirm).
    order = subset.obs["cdr3-cdr3_combo"] == "NA"
    cell_ids = subset.obs.index
    na_cells = cell_ids[order.to_numpy()]
    true_cells = cell_ids[(~order).to_numpy()]
    new_order = [*na_cells.tolist(), *true_cells.tolist()]

    subset = subset[new_order, :]

    #UMAP
    sc.pl.umap(
        subset,
        color="cdr3-cdr3_combo",
        palette=["red", "lightgray"],
        show=True,
        save=f"{name}_subset_cdr3-cdr3_UMAP.pdf"
    )

    #DEG
    cluster_counts = subset.obs["leiden"].value_counts()
    valid_clusters = cluster_counts[cluster_counts > 3].index.tolist()

    if len(valid_clusters) >= 2:

        subset_filtered = subset[subset.obs["leiden"].isin(valid_clusters), :].copy()
        sc.tl.rank_genes_groups(
            subset_filtered, 
            groupby="cdr3-cdr3_combo", 
            method="wilcoxon"
        )
        sc.pl.rank_genes_groups(
            subset_filtered, 
            n_genes=25, 
            sharey=False, 
            show=False , 
            save=f"{name}_subset_cdr3-cdr3_DEG_fig.pdf"
        )

        deg_result = subset_filtered.uns["rank_genes_groups"]

        group = "cdr3-cdr3_true"
        df = pd.DataFrame({
            "gene": deg_result["names"][group],
            "logfoldchanges": deg_result["logfoldchanges"][group],
            "pvals": deg_result["pvals"][group],
            "pvals_adj": deg_result["pvals_adj"][group],
            "scores": deg_result["scores"][group],
        })

        with pd.ExcelWriter(f"{cwd}/figures/{name}_subset_cdr3-cdr3_DEG_results.xlsx") as writer:
            df.to_excel(writer, sheet_name=f"{group}_vs_rest", index=False)
            
    else:
        print(f"{name}: DEG analysis was skipped because there were fewer than two valid clusters.")



    #dotplot
    # This first dot plot uses `subset_filtered` (the cells kept for the DEG
    # test above); the panel plots below use the full `subset`.
    dp = sc.pl.dotplot(
        subset_filtered,
        var_names=existing_genes_2,
        groupby="cdr3-cdr3_combo",
        standard_scale='var',
        color_map="GnBu",
        show=True,
        return_fig=True
    )
    dp.savefig(f"{cwd}/figures/{name}_subset_cdr3-cdr3_dotplot_.pdf")


    #dotplot and violin plot
    custom_palette = {"cdr3-cdr3_true": "#1f77b4", "NA": "#B0B0B0"}

    # One (file-name tag, gene list) pair per marker panel. Every panel gets
    # the same dot plot + violin plot pair, produced in the order listed here,
    # so adding a panel only requires a new entry in this list.
    marker_panels = [
        ("CD4TGSP", existing_genes_CD4TGSP),
        ("CD4TG", existing_genes_CD4TG),
        ("CD8TGSP", existing_genes_CD8TGSP),
        ("CD8TG", existing_genes_CD8TG),
    ]

    for panel_tag, panel_genes in marker_panels:
        dp = sc.pl.dotplot(
            subset,
            var_names=panel_genes,
            groupby="cdr3-cdr3_combo",
            standard_scale='var',
            color_map="GnBu",
            show=True,
            return_fig=True
        )
        dp.savefig(f"{cwd}/figures/{name}_subset_cdr3-cdr3_dotplot_{panel_tag}.pdf")

        sc.pl.violin(
            subset,
            keys=panel_genes,
            groupby="cdr3-cdr3_combo",
            palette=custom_palette,
            stripplot=True,
            jitter=0.25,
            size=2,
            rotation=90,
            show=True,
            save=f"_{name}_subset_cdr3-cdr3_violinplot_{panel_tag}.pdf"
        )


    #volcano plot
    # Gather the per-gene statistics of the "cdr3-cdr3_true" group from the
    # rank_genes_groups result stored on `subset_filtered`.
    group = "cdr3-cdr3_true"
    deg_result = subset_filtered.uns["rank_genes_groups"]
    stat_fields = ("logfoldchanges", "pvals", "pvals_adj", "scores")
    df_deg = pd.DataFrame({
        "gene": deg_result["names"][group],
        **{field: deg_result[field][group] for field in stat_fields},
    })

    # y-axis value of the volcano plot, and the significance flag
    # (raw p-value below 0.05 together with |logfoldchanges| above 2).
    df_deg["-log10_pval"] = -np.log10(df_deg["pvals"])
    df_deg["significant"] = df_deg["pvals"].lt(0.05) & df_deg["logfoldchanges"].abs().gt(2)

    def plot_volcano(df, name, cwd, label_genes=True, xlim=None, ylim=None, label_specific_genes=None, label_suffix=None, base_ns_color="#B0B0B0", base_sig_color="#d62728", highlight_color="#1f77b4"):
        """Save an auto-scaled and a fixed-range volcano plot (PDF) for one DEG table.

        Parameters
        ----------
        df : DataFrame
            Needs the columns "gene", "logfoldchanges", "-log10_pval" and a
            boolean "significant". It is copied; the caller's frame is untouched.
        name, cwd : str
            `name` appears in the title and file names; files are written to
            ``{cwd}/figures/``.
        label_genes : bool
            When True (and `label_specific_genes` is None) every significant
            gene is highlighted and labelled.
        xlim, ylim : sequence or None
            Axis limits, applied to the "_fixedrange" figure only.
        label_specific_genes : iterable of str or None
            Genes to highlight and label, matched case-insensitively. Takes
            precedence over `label_genes`.
        label_suffix : str or None
            File-name suffix; defaults to "specificgenes", "label" or
            "nolabel" according to the labelling mode.
        base_ns_color, base_sig_color, highlight_color : str
            Colours of non-significant, significant and highlighted points.
        """
        table = df.copy()
        is_sig = table["significant"]

        # Decide which genes are highlighted and which suffix is the default.
        if label_specific_genes is not None:
            wanted = {g.upper() for g in label_specific_genes}
            is_hi = table["gene"].str.upper().isin(wanted)
            default_suffix = "specificgenes"
        elif label_genes:
            is_hi = is_sig
            default_suffix = "label"
        else:
            is_hi = pd.Series(False, index=table.index)
            default_suffix = "nolabel"

        if label_suffix is None:
            label_suffix = default_suffix

        genes_to_label = table[is_hi]

        # (row mask, colour, alpha, zorder) for each scatter layer, drawn in
        # this order. A highlighted but non-significant gene is in both the
        # first and the last layer; the higher zorder keeps the highlight on top.
        layers = (
            (~is_sig, base_ns_color, 0.7, 1),
            (is_sig & ~is_hi, base_sig_color, 0.9, 2),
            (is_hi, highlight_color, 1.0, 4),
        )

        def _draw(ax, fix_range=False):
            """Draw points, threshold lines, labels and axis text onto `ax`."""
            point_size = 24

            for mask, colour, opacity, layer_z in layers:
                ax.scatter(
                    table.loc[mask, "logfoldchanges"],
                    table.loc[mask, "-log10_pval"],
                    c=colour, s=point_size, alpha=opacity, edgecolors="none", zorder=layer_z
                )

            # Threshold guides: p = 0.05 and logfoldchanges = -2 / +2.
            ax.axhline(-np.log10(0.05), linestyle='--', color='black', linewidth=0.8)
            for x_cut in (-2, 2):
                ax.axvline(x_cut, linestyle='--', color='black', linewidth=0.8)

            if not genes_to_label.empty:
                ranked = genes_to_label.sort_values("-log10_pval", ascending=False)
                dx, dy = 10, 10  # label offset, scaled by 0.05 into data units

                for _, row in ranked.iterrows():
                    x0, y0 = row["logfoldchanges"], row["-log10_pval"]
                    x1, y1 = x0 + dx * 0.05, y0 + dy * 0.05

                    ax.text(
                        x1, y1, row["gene"],
                        fontsize=9, weight="bold", color="black", zorder=6,
                        path_effects=[pe.withStroke(linewidth=3, foreground="white")]
                    )
                    # Leader line from the point to its label.
                    ax.plot([x0, x1], [y0, y1], color="black", linewidth=0.8, zorder=5)

            ax.set_xlabel("log2 Fold Change")
            ax.set_ylabel("-log10(p-value)")
            ax.set_title(f"{name}_subset (cdr3-cdr3_true vs rest)")

            # Keep this as the last step and in this order: the limits are
            # applied first, then autoscaling is switched off.
            if fix_range:
                if xlim is not None:
                    ax.set_xlim(xlim)
                if ylim is not None:
                    ax.set_ylim(ylim)
                ax.autoscale(enable=False)
                ax.margins(x=0, y=0)

        # Figure 1: automatic axis range.
        auto_fig, auto_ax = plt.subplots(figsize=(8, 6))
        _draw(auto_ax, fix_range=False)
        auto_fig.tight_layout()
        auto_fig.savefig(f"{cwd}/figures/{name}_subset_cdr3-cdr3_volcano_plot_{label_suffix}.pdf")
        plt.close(auto_fig)

        # Figure 2: caller-supplied axis range.
        fixed_fig, fixed_ax = plt.subplots(figsize=(8, 6), constrained_layout=False)
        _draw(fixed_ax, fix_range=True)
        fixed_fig.savefig(f"{cwd}/figures/{name}_subset_cdr3-cdr3_volcano_plot_{label_suffix}_fixedrange.pdf", bbox_inches="tight")
        plt.close(fixed_fig)



    # Baseline volcano plots: all significant genes labelled, then no labels.
    plot_volcano(df_deg, name, cwd, label_genes=True, xlim=[-10, 10])
    plot_volcano(df_deg, name, cwd, label_genes=False, xlim=[-10, 10])



    # Curated gene panels. Each one produces a volcano plot in which only the
    # listed genes are highlighted and labelled.
    specificgenesCD4_TG = [
        "Gbp9", "Gbp6", "Oas1a", "Ifi27l2a", "Ifi47", "Parp14", "Zbp1", "Irf7", "Isg15", "Cxcr3", "Icos", "Stat1"
    ]

    specificgenesCD4_TG_SP = [
        "Itgb1", "Icos", "Isg15", "Bhlhe40", "Cxcr3", "Ifng", "Nr4a1", "Gbp6", "Oas1a", "Isg20", "Nkg7", "Gbp9", "Icam1", "Ccl5", "Hif1a", "Ifi27l2a", "Irf7", "Zbp1", "S1pr1", "Il7r", "Klf2", "Bcl2", "Ccr7", "Sell"
    ]

    specificgenesCD8_DP_DN_TG = [
        "Trbv13-3", "Gzmb", "Gzma", "Itga1", "Ccr2", "Ly6a", "Isg15", "Cd8b1", "Gzmk", "Ccl5", "Ctla4", "Ifng", "Cd8a", "S1pr1", "Klf2", "Bcl2", "Tcf7", "Ccr7", "Sell", "Eomes", "Tox"
    ]

    # The list used to open with a stray "" literal that was implicitly
    # concatenated with "Gzma"; the resulting list is unchanged without it.
    specificgenesCD8_DP_DN_TG_SP = [
        "Gzma", "Ccl5", "S100a6", "Gzmb", "Trbv13-3", "Ifitm1", "Itga1", "Litaf", "Ifng", "Gzmk", "Ccr2", "Bhlhe40", "Nr4a2", "Ctla4", "Ccl4", "Ifitm2", "Tbx21", "Ccr5", "Isg15", "Cd38", "Icos", "Cxcr6", "Isg20", "Nr4a1", "Nr4a3", "Ifitm3", "Nkg7", "S1pr1", "Klf2", "Itga6", "Tox", "Ccr7", "Tcf7", "Sell", "Ccr9", "Tnfsf8"
    ]

    # (file-name suffix, gene list), plotted in this order.
    specific_gene_panels = [
        ("specificgenesCD4_TG", specificgenesCD4_TG),
        ("specificgenesCD4_TG_SP", specificgenesCD4_TG_SP),
        ("specificgenesCD8_DP_DN_TG", specificgenesCD8_DP_DN_TG),
        ("specificgenesCD8_DP_DN_TG_SP", specificgenesCD8_DP_DN_TG_SP),
    ]

    for panel_suffix, panel_genes in specific_gene_panels:
        plot_volcano(
            df_deg,
            name,
            cwd,
            label_genes=False,
            xlim=[-10, 10],
            label_specific_genes=panel_genes,
            label_suffix=panel_suffix
        )

In [None]:

# ======= "CD4_TG", subset1 =======

target_combos = [
    "CAVSAGYGSSGNKLIF_CASSYQGATGQLYF",
]

# Keep only the "cdr3-cdr3_true" cells of subset1, then split them into cells
# whose cdr3_TRA_TRB value is one of the target combos and all other cells.
cdr3_true_cells = subset1[subset1.obs["cdr3-cdr3_combo"] == "cdr3-cdr3_true", :].copy()

cdr3_true_cells.obs["combo_specific"] = np.where(
    cdr3_true_cells.obs["cdr3_TRA_TRB"].isin(target_combos),
    "combo_match",
    "combo_other"
)
cdr3_true_cells.obs["combo_specific"] = pd.Categorical(cdr3_true_cells.obs["combo_specific"], categories=["combo_match", "combo_other"])

sc.tl.rank_genes_groups(
    cdr3_true_cells, 
    groupby="combo_specific", 
    method="wilcoxon"
)

deg_result = cdr3_true_cells.uns["rank_genes_groups"]
group = "combo_match"

df_deg = pd.DataFrame({
    "gene": deg_result["names"][group],
    "logfoldchanges": deg_result["logfoldchanges"][group],
    "pvals": deg_result["pvals"][group],
    "pvals_adj": deg_result["pvals_adj"][group],
    "scores": deg_result["scores"][group],
})

# Derived volcano columns, computed once: they are part of the Excel export
# and are read again by the volcano plots further down in this cell.
df_deg["-log10_pval"] = -np.log10(df_deg["pvals"])
df_deg["significant"] = (df_deg["pvals"] < 0.05) & (abs(df_deg["logfoldchanges"]) > 2)

df_deg.to_excel(f"{cwd}/figures/mouse_CD4_TG_subset_target-cdr3-combo_vs_others_DEG_result.xlsx", index=False)

def plot_volcano(df, name, cwd, label_genes=True, xlim=[-10, 10], ylim=None):
    """Save an auto-scaled and a fixed-range volcano plot for the CD4_TG target-combo DEG table.

    Parameters
    ----------
    df : DataFrame
        Needs the columns "gene", "logfoldchanges", "-log10_pval" and a
        boolean "significant".
    name : str
        Used in the plot title only; the output file names are fixed to
        ``mouse_CD4_TG_subset_target-cdr3-combo_vs_others_volcano_*``.
    cwd : str
        Base directory; files are written to ``{cwd}/figures/``.
    label_genes : bool
        When True every significant gene is annotated with its name, and the
        file suffix is "label" instead of "nolabel".
    xlim, ylim : sequence or None
        Axis limits, applied to the "_fixedrange" figure only. The list
        default is never mutated here.
    """
    label_suffix = "label" if label_genes else "nolabel"

    def _render(ax):
        """Draw points, optional gene labels, threshold lines and axis text."""
        ax.scatter(
            df["logfoldchanges"], df["-log10_pval"],
            c=df["significant"].map({True: "red", False: "gray"}),
            alpha=0.7, edgecolors="none"
        )
        if label_genes:
            for _, hit in df[df["significant"]].iterrows():
                ax.text(
                    hit["logfoldchanges"],
                    hit["-log10_pval"],
                    hit["gene"],
                    fontsize=8,
                    ha='center',
                    va='bottom'
                )
        ax.axhline(-np.log10(0.05), linestyle='--', color='black', linewidth=0.8)
        ax.axvline(-2, linestyle='--', color='black', linewidth=0.8)
        ax.axvline(2, linestyle='--', color='black', linewidth=0.8)
        ax.set_xlabel("log2 Fold Change")
        ax.set_ylabel("-log10(p-value)")
        ax.set_title(f"{name}_subset (cdr3-cdr3_true vs rest)")

    # Figure 1: automatic axis range.
    auto_fig, auto_ax = plt.subplots(figsize=(8, 6))
    _render(auto_ax)
    auto_fig.tight_layout()
    auto_fig.savefig(f"{cwd}/figures/mouse_CD4_TG_subset_target-cdr3-combo_vs_others_volcano_{label_suffix}.pdf")
    plt.close(auto_fig)

    # Figure 2: fixed axis range. The limits are applied after drawing and
    # autoscaling is switched off afterwards; keep this order.
    fixed_fig, fixed_ax = plt.subplots(figsize=(8, 6), constrained_layout=False)
    _render(fixed_ax)
    if xlim is not None:
        fixed_ax.set_xlim(xlim)
    if ylim is not None:
        fixed_ax.set_ylim(ylim)
    fixed_ax.autoscale(enable=False)
    fixed_ax.margins(x=0, y=0)

    fixed_fig.savefig(f"{cwd}/figures/mouse_CD4_TG_subset_target-cdr3-combo_vs_others_volcano_{label_suffix}_fixedrange.pdf", bbox_inches="tight")
    plt.close(fixed_fig)

# `name` is not assigned anywhere in this cell; it would be whatever the last
# `for name, subset in ...` loop of an earlier cell left behind, which puts
# the wrong subset in the plot title. Pass the label of this analysis instead.
plot_volcano(df_deg, "mouse_CD4_TG", cwd, label_genes=True)
plot_volcano(df_deg, "mouse_CD4_TG", cwd, label_genes=False)

    





# Labelled volcano plot with the target combos listed in a corner box.
# A single 9x6 figure is created and everything is drawn through `ax`.
# Previously `plt.subplots` was followed by `plt.figure`, so the plot landed on
# a second figure while `ax` (and its autoscale/margins calls) belonged to an
# empty first figure that was never closed.
fig, ax = plt.subplots(figsize=(9, 6), constrained_layout=False)
ax.scatter(
    df_deg["logfoldchanges"], df_deg["-log10_pval"],
    c=df_deg["significant"].map({True: "red", False: "gray"}),
    alpha=0.7, edgecolors="none"
)
ax.axhline(-np.log10(0.05), linestyle='--', color='black', linewidth=0.8)
ax.axvline(-2, linestyle='--', color='black', linewidth=0.8)
ax.axvline(2, linestyle='--', color='black', linewidth=0.8)

# Annotate every significant gene with its name.
texts = []
for _, row in df_deg[df_deg["significant"]].iterrows():
    texts.append(ax.text(row["logfoldchanges"], row["-log10_pval"], row["gene"], fontsize=8))

ax.set_xlabel("log2 Fold Change")
ax.set_ylabel("-log10(p-value)")
ax.set_title("Volcano Plot: CD4_TG  target-cdr3-combo vs others")
ax.set_xlim(-10, 10)
ax.autoscale(enable=False)
ax.margins(x=0, y=0)


# Box in the lower-right corner (axes coordinates) listing the target combos.
combo_text = "\n".join(target_combos)
ax.text(
    0.95, 0.05,
    f"Target combos:\n{combo_text}",
    ha='right', va='bottom',
    fontsize=8,
    transform=ax.transAxes,
    bbox=dict(boxstyle="round", facecolor="white", alpha=0.6)
)

fig.tight_layout(rect=[0, 0.1, 1, 1])
fig.savefig(f"{cwd}/figures/mouse_CD4_TG_subset_target-cdr3-combo_vs_others_volcano_label_2.pdf")
plt.close(fig)

# Unlabelled volcano plot with the target combos listed in a corner box.
# A single 9x6 figure is created and everything is drawn through `ax`.
# Previously `plt.subplots` was followed by `plt.figure`, so the plot landed on
# a second figure while `ax` (and its autoscale/margins calls) belonged to an
# empty first figure that was never closed.
fig, ax = plt.subplots(figsize=(9, 6), constrained_layout=False)
ax.scatter(
    df_deg["logfoldchanges"], df_deg["-log10_pval"],
    c=df_deg["significant"].map({True: "red", False: "gray"}),
    alpha=0.7, edgecolors="none"
)
ax.axhline(-np.log10(0.05), linestyle='--', color='black', linewidth=0.8)
ax.axvline(-2, linestyle='--', color='black', linewidth=0.8)
ax.axvline(2, linestyle='--', color='black', linewidth=0.8)
ax.set_xlabel("log2 Fold Change")
ax.set_ylabel("-log10(p-value)")
ax.set_title("Volcano Plot: CD4_TG  target-cdr3-combo vs others")
ax.set_xlim(-10, 10)
ax.autoscale(enable=False)
ax.margins(x=0, y=0)

# Box in the lower-right corner (axes coordinates) listing the target combos.
ax.text(
    0.95, 0.05,
    f"Target combos:\n{combo_text}",
    ha='right', va='bottom',
    fontsize=8,
    transform=ax.transAxes,
    bbox=dict(boxstyle="round", facecolor="white", alpha=0.6)
)

fig.tight_layout(rect=[0, 0.1, 1, 1])
fig.savefig(f"{cwd}/figures/mouse_CD4_TG_subset_target-cdr3-combo_vs_others_volcano_nolabel_2.pdf")
plt.close(fig)



# Dot plot of the `existing_genes_2` genes, grouped by target-combo membership.
combo_dotplot_options = dict(
    var_names=existing_genes_2,
    groupby="combo_specific",
    standard_scale='var',
    color_map="GnBu",
    show=True,
    save="mouse_CD4_TG_subset_target-cdr3-combo_vs_others_dotplot.pdf",
)
sc.pl.dotplot(cdr3_true_cells, **combo_dotplot_options)


# ======= "CD8+DP+DN_TG", subset4 =======


target_combos = [
    "CAASSNYNVLYF_CASRGQGAETLYF",
    "CAASGSNNRIFF_CASRGQGQNTLYF",
    "CAASTNYNVLYF_CASRGQGISDYTF",
    "CAASAHYSNNRLTL_CASSLRGSGNTLYF",
    "CAAGGQGTGSKLSF_CASSYWGSQNTLYF",
    "CALGSNYNVLYF_CASRGQGNTEVFF",
]

# Keep only the "cdr3-cdr3_true" cells of subset4, then split them into cells
# whose cdr3_TRA_TRB value is one of the target combos and all other cells.
cdr3_true_cells = subset4[subset4.obs["cdr3-cdr3_combo"] == "cdr3-cdr3_true", :].copy()

cdr3_true_cells.obs["combo_specific"] = np.where(
    cdr3_true_cells.obs["cdr3_TRA_TRB"].isin(target_combos),
    "combo_match",
    "combo_other"
)
cdr3_true_cells.obs["combo_specific"] = pd.Categorical(cdr3_true_cells.obs["combo_specific"], categories=["combo_match", "combo_other"])

sc.tl.rank_genes_groups(
    cdr3_true_cells, 
    groupby="combo_specific", 
    method="wilcoxon"
)

deg_result = cdr3_true_cells.uns["rank_genes_groups"]
group = "combo_match"

df_deg = pd.DataFrame({
    "gene": deg_result["names"][group],
    "logfoldchanges": deg_result["logfoldchanges"][group],
    "pvals": deg_result["pvals"][group],
    "pvals_adj": deg_result["pvals_adj"][group],
    "scores": deg_result["scores"][group],
})

# Derived volcano columns, computed once: they are part of the Excel export
# and are read again by the volcano plots further down in this cell.
df_deg["-log10_pval"] = -np.log10(df_deg["pvals"])
df_deg["significant"] = (df_deg["pvals"] < 0.05) & (abs(df_deg["logfoldchanges"]) > 2)

df_deg.to_excel(f"{cwd}/figures/mouse_CD8+DP+DN_TG_subset_target-cdr3-combo_vs_others_DEG_result.xlsx", index=False)

def plot_volcano(df, name, cwd, label_genes=True, xlim=(-10, 10), ylim=None):
    """Save two volcano plots (auto-ranged and fixed-range) for a DEG table.

    Parameters
    ----------
    df : DataFrame with the columns "logfoldchanges", "-log10_pval",
        "significant" (bool) and "gene".
    name : label used in the plot title only.
    cwd : base directory; PDFs are written to ``{cwd}/figures``.
    label_genes : when True, every row with ``significant`` set is annotated
        with its gene name; this also selects the "label"/"nolabel" file suffix.
    xlim, ylim : axis limits applied to the "_fixedrange" figure only; pass
        None to leave an axis untouched.

    Returns
    -------
    None. Two PDF files are written and both figures are closed.
    """
    label_suffix = "label" if label_genes else "nolabel"
    out_base = f"{cwd}/figures/mouse_CD8+DP+DN_TG_subset_target-cdr3-combo_vs_others_volcano_{label_suffix}"

    def _draw(ax):
        # Shared drawing code for both figures: points, optional labels, thresholds, titles.
        ax.scatter(
            df["logfoldchanges"], df["-log10_pval"],
            c=df["significant"].map({True: "red", False: "gray"}),
            alpha=0.7, edgecolors="none"
        )
        if label_genes:
            for _, row in df[df["significant"]].iterrows():
                ax.text(
                    row["logfoldchanges"],
                    row["-log10_pval"],
                    row["gene"],
                    fontsize=8,
                    ha='center',
                    va='bottom'
                )
        # Guide lines at the thresholds used for the "significant" flag.
        ax.axhline(-np.log10(0.05), linestyle='--', color='black', linewidth=0.8)
        ax.axvline(-2, linestyle='--', color='black', linewidth=0.8)
        ax.axvline(2, linestyle='--', color='black', linewidth=0.8)
        ax.set_xlabel("log2 Fold Change")
        ax.set_ylabel("-log10(p-value)")
        ax.set_title(f"{name}_subset (cdr3-cdr3_true vs rest)")

    # Figure 1: matplotlib picks the axis ranges.
    fig, ax = plt.subplots(figsize=(8, 6))
    _draw(ax)
    fig.tight_layout()
    fig.savefig(f"{out_base}.pdf")
    plt.close(fig)

    # Figure 2: caller-supplied ranges, then autoscaling is switched off.
    fig, ax = plt.subplots(figsize=(8, 6), constrained_layout=False)
    _draw(ax)
    if xlim is not None:
        ax.set_xlim(xlim)
    if ylim is not None:
        ax.set_ylim(ylim)
    ax.autoscale(enable=False)
    ax.margins(x=0, y=0)
    fig.savefig(f"{out_base}_fixedrange.pdf", bbox_inches="tight")
    plt.close(fig)

# Write the labelled and unlabelled volcano PDFs for the CD8+DP+DN_TG comparison.
# NOTE(review): `name` is not assigned anywhere in this section; it is presumably left over
# from an earlier loop. plot_volcano only uses it for the plot title — confirm it holds the
# intended label.
plot_volcano(df_deg, name, cwd, label_genes=True)
plot_volcano(df_deg, name, cwd, label_genes=False)


# Labelled volcano plot with the list of target combos shown in a text box.
# A single figure/axes pair is used so that the autoscale/margin settings below act on
# the axes that is actually saved, and no empty figure is left open.
fig, ax = plt.subplots(figsize=(9, 6), constrained_layout=False)

ax.scatter(
    df_deg["logfoldchanges"], df_deg["-log10_pval"],
    c=df_deg["significant"].map({True: "red", False: "gray"}),
    alpha=0.7, edgecolors="none"
)
# Guide lines at the thresholds used for the "significant" flag.
ax.axhline(-np.log10(0.05), linestyle='--', color='black', linewidth=0.8)
ax.axvline(-2, linestyle='--', color='black', linewidth=0.8)
ax.axvline(2, linestyle='--', color='black', linewidth=0.8)

# Annotate every significant gene; the Text artists are kept in `texts`.
texts = []
for _, row in df_deg[df_deg["significant"]].iterrows():
    texts.append(ax.text(row["logfoldchanges"], row["-log10_pval"], row["gene"], fontsize=8))

ax.set_xlabel("log2 Fold Change")
ax.set_ylabel("-log10(p-value)")
ax.set_title("Volcano Plot: CD8+DP+DN_TG  target-cdr3-combo vs others")
ax.set_xlim(-10, 10)
ax.autoscale(enable=False)
ax.margins(x=0, y=0)

# Text box in the lower-right corner (axes coordinates) listing the target combos.
combo_text = "\n".join(target_combos)
ax.text(
    0.95, 0.05,
    f"Target combos:\n{combo_text}",
    ha='right', va='bottom',
    fontsize=8,
    transform=ax.transAxes,
    bbox=dict(boxstyle="round", facecolor="white", alpha=0.6)
)

fig.tight_layout(rect=[0, 0.1, 1, 1])
fig.savefig(f"{cwd}/figures/mouse_CD8+DP+DN_TG_subset_target-cdr3-combo_vs_others_volcano_label_2.pdf")
plt.close(fig)

# Unlabelled volcano plot with the list of target combos shown in a text box.
# A single figure/axes pair is used so that the autoscale/margin settings below act on
# the axes that is actually saved, and no empty figure is left open.
fig, ax = plt.subplots(figsize=(9, 6), constrained_layout=False)

ax.scatter(
    df_deg["logfoldchanges"], df_deg["-log10_pval"],
    c=df_deg["significant"].map({True: "red", False: "gray"}),
    alpha=0.7, edgecolors="none"
)
# Guide lines at the thresholds used for the "significant" flag.
ax.axhline(-np.log10(0.05), linestyle='--', color='black', linewidth=0.8)
ax.axvline(-2, linestyle='--', color='black', linewidth=0.8)
ax.axvline(2, linestyle='--', color='black', linewidth=0.8)
ax.set_xlabel("log2 Fold Change")
ax.set_ylabel("-log10(p-value)")
ax.set_title("Volcano Plot: CD8+DP+DN_TG  target-cdr3-combo vs others")
ax.set_xlim(-10, 10)
ax.autoscale(enable=False)
ax.margins(x=0, y=0)

# Text box in the lower-right corner (axes coordinates); combo_text is the
# newline-joined target combo list built earlier in this cell.
ax.text(
    0.95, 0.05,
    f"Target combos:\n{combo_text}",
    ha='right', va='bottom',
    fontsize=8,
    transform=ax.transAxes,
    bbox=dict(boxstyle="round", facecolor="white", alpha=0.6)
)

fig.tight_layout(rect=[0, 0.1, 1, 1])
fig.savefig(f"{cwd}/figures/mouse_CD8+DP+DN_TG_subset_target-cdr3-combo_vs_others_volcano_nolabel_2.pdf")
plt.close(fig)


# Dot plot of the existing_genes_2 panel, grouped by the "combo_specific" label.
combo_dotplot_kwargs = dict(
    var_names=existing_genes_2,
    groupby="combo_specific",
    standard_scale="var",
    color_map="GnBu",
    show=True,
    save="mouse_CD8+DP+DN_TG_subset_target-cdr3-combo_vs_others_dotplot.pdf",
)
sc.pl.dotplot(cdr3_true_cells, **combo_dotplot_kwargs)


In [None]:
# ======================= gene expression dotplot (subsets) =======================



# Copy every subset, record which subset each cell came from, and prefix the cell
# names with the subset label before stacking everything into one AnnData.
subsets_of_a_cell = []
for name, subset in subsets:
    tagged = subset.copy()
    tagged.obs["subset_name"] = name
    tagged.obs_names = [f"{name}_{cell_id}" for cell_id in tagged.obs_names]
    subsets_of_a_cell.append(tagged)

adata_subsets_of_a_cell = anndata.concat(subsets_of_a_cell, join="outer")

# One dot plot with a row per subset.
subset_dotplot_kwargs = dict(
    var_names=existing_genes,
    groupby="subset_name",
    standard_scale="var",
    color_map="GnBu",
    show=True,
    return_fig=True,
)
dp = sc.pl.dotplot(adata_subsets_of_a_cell, **subset_dotplot_kwargs)

dp.savefig(f"{cwd}/figures/mouse_dotplot_by_subset.pdf")

In [None]:
# ======================= gene expression dotplot (each subset) =======================


# For each gene panel, save one per-cluster dot plot for every first-level subset
# followed by every second-level subset (same order as two consecutive loops).
for i, gene_list in enumerate([existing_genes, existing_genes_2]):

    for name, subset in [*subsets, *subsets_2nd]:
        dp = sc.pl.dotplot(
            subset,
            var_names=gene_list,
            groupby="leiden",
            standard_scale="var",
            color_map="GnBu",
            show=True,
            return_fig=True,
        )
        dp.savefig(f"{cwd}/figures/{name}_subset_cluster_dotplot_{i}.pdf")

In [None]:
# <<<<<<< Define Subsets_3rd for analysis >>>>>>>

# Third-level subsets: the cells of Leiden cluster "2" taken from subset3 and subset6.
in_cluster2_of_subset3 = subset3.obs["leiden"].isin(["2"])
subset_3rd_1 = subset3[in_cluster2_of_subset3].copy()

in_cluster2_of_subset6 = subset6.obs["leiden"].isin(["2"])
subset_3rd_2 = subset6[in_cluster2_of_subset6].copy()

subsets_3rd = [
    ("mouse_CD4_TG_SP_Cluster2", subset_3rd_1),
    ("mouse_CD8+DP+DN_TG_SP_Cluster2", subset_3rd_2),
]

# Report the number of cells in each third-level subset.
for name, subset in subsets_3rd:
    print(f"{name}:  {subset.n_obs}")

In [None]:
# <<<<<<< Subsets_3rd analysis >>>>>>>

# For every third-level subset: redo normalization, dimensionality reduction and
# clustering, then produce the cluster / batch / cdr3 / V-gene figures and tables below.
for name, subset in subsets_3rd:

    print(f"Processing: {name}(Num of cells: {subset.n_obs})")

    # ======= pre-analysis of subset =======

    # Start again from the matrix stored in .raw. This rebinds the loop variable only.
    # NOTE(review): if the parent's .raw was already normalized, normalize_total below
    # is applied a second time — verify against the upstream cells.
    subset = subset.raw.to_adata()

    sc.pp.normalize_total(subset, target_sum=1e4)

    # NOTE(review): .raw is snapshotted after normalize_total but before log1p, so it
    # holds non-log values. scanpy calls that read .raw by default (presumably
    # rank_genes_groups, dotplot and violin below) would then see non-log data —
    # confirm this is intended, in particular for the reported logfoldchanges.
    subset.raw = subset

    sc.pp.log1p(subset)

    sc.pp.highly_variable_genes(subset, n_top_genes=2000)

    sc.pp.scale(subset, max_value=10)

    sc.tl.pca(subset)

    sc.pp.neighbors(subset)

    sc.tl.umap(subset)

    sc.tl.leiden(subset, resolution=0.5)


    # ======= Cluster =======

    print(f"Processing: {name}(Num of cells: {subset.n_obs}) - Cluster analysis")

    # UMAP coloured by Leiden cluster
    sc.pl.umap(subset, color="leiden", show=True, save=f"{name}_subset_cluster_UMAP.pdf")

    # Wilcoxon DEG for every Leiden cluster, plus scanpy's summary figure
    sc.tl.rank_genes_groups(subset, groupby="leiden", method="wilcoxon")
    sc.pl.rank_genes_groups(
        subset, n_genes=25, sharey=False, show=False,
        save=f"{name}_subset_cluster_DEG_fig.pdf"
    )

    deg_result = subset.uns["rank_genes_groups"]
    groups = deg_result["names"].dtype.names

    # One Excel sheet per cluster, columns in the order gene / logfoldchanges / pvals / pvals_adj / scores
    deg_fields = ("logfoldchanges", "pvals", "pvals_adj", "scores")
    with pd.ExcelWriter(f"{cwd}/figures/{name}_subset_cluster_DEG_results.xlsx") as writer:
        for group in groups:
            table = {"gene": deg_result["names"][group]}
            table.update((field, deg_result[field][group]) for field in deg_fields)
            df = pd.DataFrame(table)
            df.to_excel(writer, sheet_name=f"cluster_{group}", index=False)

    # UMAP coloured by batch, only when the subset contains at least two batches
    n_batches = subset.obs["batch"].nunique()
    if n_batches < 2:
        print(f"{name}: Since there is only one batch type, batch UMAP has been skipped.")
    else:
        # Fix the category order so the palette entries map to Control_TG, Control_SP.
        subset.obs["batch"] = (
            subset.obs["batch"]
            .astype("category")
            .cat.reorder_categories(["Control_TG", "Control_SP"], ordered=True)
        )

        custom_palette = ["#0033ff", "#B0B0B0"]

        print(f"{name}: {n_batches} batches have been detected. Save the UMAP plots color-coded by batch.")
        sc.pl.umap(
            subset,
            color="batch",
            palette=custom_palette,
            show=True,
            save=f"{name}_subset_batch_UMAP.pdf"
        )
        

    # ======= cdr3 =======

    print(f"Processing: {name}(Num of cells: {subset.n_obs}) - cdr3")

    # Reference table; the columns "<label>_cdr3" are read below.
    cdr3_df = pd.read_csv(f"{cwd}/cdr3_summary_tetTG_-_NaTG.csv")

    # (reference column prefix, obs column holding the cell's cdr3 for that chain)
    cdr3_types = [
        ("CD4TRA", "cdr3_TRA"),
        ("CD4TRB", "cdr3_TRB"),
        ("CD8TRA", "cdr3_TRA"),
        ("CD8TRB", "cdr3_TRB"),
    ]

    # A cell becomes "cdr3_true" as soon as its TRA or TRB cdr3 appears in any of the
    # four reference columns; everything else stays "NA".
    subset.obs["cdr3_positive"] = "NA"

    for label, subset_col in cdr3_types:
        cdr3_set = set(cdr3_df[f"{label}_cdr3"].dropna().unique())
        condition = subset.obs[subset_col].isin(cdr3_set)
        subset.obs.loc[condition, "cdr3_positive"] = "cdr3_true"

    # Explicit category order so the palette below maps red -> cdr3_true, lightgray -> NA.
    subset.obs["cdr3_positive"] = pd.Categorical(subset.obs["cdr3_positive"], categories=["cdr3_true", "NA"])

    #UMAP
    sc.pl.umap(
        subset,
        color="cdr3_positive",
        palette=["red", "lightgray"],
        show=True,
        save=f"{name}_subset_cdr3_UMAP.pdf"
    )

    #DEG
    sc.tl.rank_genes_groups(
        subset,
        groupby="cdr3_positive",
        method="wilcoxon"
    )

    sc.pl.rank_genes_groups(
        subset,
        n_genes=25,
        sharey=False,
        show=False,
        save=f"{name}_subset_cdr3_DEG_fig.pdf"
    )

    deg_result = subset.uns["rank_genes_groups"]

    group = "cdr3_true"
    df = pd.DataFrame({
        "gene": deg_result["names"][group],
        "logfoldchanges": deg_result["logfoldchanges"][group],
        "pvals": deg_result["pvals"][group],
        "pvals_adj": deg_result["pvals_adj"][group],
        "scores": deg_result["scores"][group],
    })

    # The names in subsets_3rd already start with "mouse_", so no extra prefix is added
    # here (keeps this file named like the other per-subset outputs of this loop).
    with pd.ExcelWriter(f"{cwd}/figures/{name}_subset_cdr3_DEG_results.xlsx") as writer:
        df.to_excel(writer, sheet_name=f"{group}_vs_rest", index=False)


    # ======= cdr3_TRA or cdr3_TRB =======

    for chain, chain_label in [("TRA", "cdr3_TRA"), ("TRB", "cdr3_TRB")]:

        print(f"Processing: {name}(Num of cells: {subset.n_obs}) - cdr3_{chain} ")

        label_col = f"{chain_label}_positive"
        group = f"{chain_label}_true"
        subset.obs[label_col] = "NA"

        # Pool the cdr3 sequences of every reference column ending in "<chain>_cdr3".
        relevant_cols = [col for col in cdr3_df.columns if col.endswith(f"{chain}_cdr3")]
        cdr3_combined_set = set()

        for col in relevant_cols:
            cdr3_combined_set.update(cdr3_df[col].dropna().unique())

        condition = subset.obs[chain_label].isin(cdr3_combined_set)
        subset.obs.loc[condition, label_col] = group

        # Explicit category order: without it the categories sort as ["NA", "<chain>_true"]
        # and the palette below would colour the NA cells red and the matches gray.
        subset.obs[label_col] = pd.Categorical(subset.obs[label_col], categories=[group, "NA"])

        #UMAP
        sc.pl.umap(
            subset,
            color=label_col,
            palette=["red", "lightgray"],
            show=True,
            save=f"{name}_subset_{chain_label}_UMAP.pdf"
        )

        #DEG
        sc.tl.rank_genes_groups(
            subset,
            groupby=label_col,
            method="wilcoxon"
        )

        sc.pl.rank_genes_groups(
            subset,
            n_genes=25,
            sharey=False,
            show=False,
            save=f"{name}_subset_{chain_label}_DEG_fig.pdf"
        )

        # Export the "<chain>_true vs rest" DEG table.
        deg_result = subset.uns["rank_genes_groups"]
        df_chain = pd.DataFrame({
            "gene": deg_result["names"][group],
            "logfoldchanges": deg_result["logfoldchanges"][group],
            "pvals": deg_result["pvals"][group],
            "pvals_adj": deg_result["pvals_adj"][group],
            "scores": deg_result["scores"][group],
        })

        with pd.ExcelWriter(f"{cwd}/figures/{name}_subset_{chain_label}_DEG_results.xlsx") as writer:
            df_chain.to_excel(writer, sheet_name=f"{group}_vs_rest", index=False)


    # ======= cdr3-cdr3 combo =======

    print(f"Processing: {name}(Num of cells: {subset.n_obs}) - cdr3-cdr3 combo")

    df = pd.read_csv(f"{cwd}/cdr3_summary_tetTG_-_NaTG_forcdr3combo.csv")

    # Every TRA x TRB pairing of the reference sequences, written "<TRA>_<TRB>".
    # NOTE(review): this is the full cross product of the two columns, not the row-wise
    # pairs of the file — confirm that is the intended definition of a combo.
    combo_list = [
        f"{tra}_{trb}"
        for tra in df["TRA_cdr3"].dropna().unique()
        for trb in df["TRB_cdr3"].dropna().unique()
    ]

    combo_set = set(combo_list)

    subset.obs["cdr3-cdr3_combo"] = subset.obs["cdr3_TRA_TRB"].apply(
        lambda x: "cdr3-cdr3_true" if x in combo_set else "NA"
    )

    # Explicit category order so the palette below maps red -> true, lightgray -> NA.
    subset.obs["cdr3-cdr3_combo"] = pd.Categorical(subset.obs["cdr3-cdr3_combo"], categories=["cdr3-cdr3_true", "NA"])

    print(subset.obs["cdr3-cdr3_combo"].cat.categories)

    # Put the NA cells first and the matching cells last (presumably so the matches are
    # drawn on top in the UMAP). The reordered object is copied so that the later writes
    # to .obs/.uns act on a real AnnData rather than on a view.
    order = subset.obs["cdr3-cdr3_combo"] == "NA"
    na_cell_ids = subset.obs[order].index
    true_cell_ids = subset.obs[~order].index

    new_order = na_cell_ids.tolist() + true_cell_ids.tolist()

    subset = subset[new_order, :].copy()

    # Frequency of each matching combo together with its V/J genes and cdr3 sequences
    # (taken from the first cell carrying that combo).
    true_cells = subset.obs.loc[subset.obs["cdr3-cdr3_combo"] == "cdr3-cdr3_true", ["cdr3_TRA_TRB", "v_gene_TRA", "j_gene_TRA", "cdr3_TRA", "v_gene_TRB", "j_gene_TRB", "cdr3_TRB"]]

    combo_freq = true_cells["cdr3_TRA_TRB"].value_counts().reset_index()
    combo_freq.columns = ["cdr3_combo", "frequency"]

    combo_details = true_cells.drop_duplicates(subset=["cdr3_TRA_TRB"]).rename(columns={"cdr3_TRA_TRB": "cdr3_combo"})

    result_df = pd.merge(combo_freq, combo_details, on="cdr3_combo", how="left")

    # The names in subsets_3rd already start with "mouse_", so no extra prefix is added.
    result_df.to_excel(f"{cwd}/figures/{name}_subset_cdr3-cdr3_list_frequency.xlsx", index=False)

    #UMAP
    sc.pl.umap(
        subset,
        color="cdr3-cdr3_combo",
        palette=["red", "lightgray"],
        show=True,
        save=f"{name}_subset_cdr3-cdr3_UMAP.pdf"
    )

    #DEG
    sc.tl.rank_genes_groups(
        subset,
        groupby="cdr3-cdr3_combo",
        method="wilcoxon"
    )

    sc.pl.rank_genes_groups(
        subset,
        n_genes=25,
        sharey=False,
        show=False,
        save=f"{name}_subset_cdr3-cdr3_DEG_fig.pdf"
    )

    deg_result = subset.uns["rank_genes_groups"]

    group = "cdr3-cdr3_true"
    df = pd.DataFrame({
        "gene": deg_result["names"][group],
        "logfoldchanges": deg_result["logfoldchanges"][group],
        "pvals": deg_result["pvals"][group],
        "pvals_adj": deg_result["pvals_adj"][group],
        "scores": deg_result["scores"][group],
    })

    with pd.ExcelWriter(f"{cwd}/figures/{name}_subset_cdr3-cdr3_DEG_results.xlsx") as writer:
        df.to_excel(writer, sheet_name=f"{group}_vs_rest", index=False)


    # Settings shared by every dot plot in this section.
    combo_dot_kwargs = dict(
        groupby="cdr3-cdr3_combo",
        standard_scale="var",
        color_map="GnBu",
        show=True,
        return_fig=True,
    )

    #dotplot
    dp = sc.pl.dotplot(subset, var_names=existing_genes_2, **combo_dot_kwargs)
    dp.savefig(f"{cwd}/figures/{name}_subset_cdr3-cdr3_dotplot_.pdf")

    #dotplot and violin plot
    custom_palette = {"cdr3-cdr3_true": "#1f77b4", "NA": "#B0B0B0"}

    # (file-name tag, gene list): one dot plot followed by one violin plot per panel.
    gene_panels = [
        ("CD4TGSP", existing_genes_CD4TGSP),
        ("CD4TG", existing_genes_CD4TG),
        ("CD8TGSP", existing_genes_CD8TGSP),
        ("CD8TG", existing_genes_CD8TG),
    ]

    for panel_tag, panel_genes in gene_panels:
        dp = sc.pl.dotplot(subset, var_names=panel_genes, **combo_dot_kwargs)
        dp.savefig(f"{cwd}/figures/{name}_subset_cdr3-cdr3_dotplot_{panel_tag}.pdf")

        sc.pl.violin(
            subset,
            keys=panel_genes,
            groupby="cdr3-cdr3_combo",
            palette=custom_palette,
            stripplot=True,
            jitter=0.25,
            size=2,
            rotation=90,
            show=True,
            save=f"_{name}_subset_cdr3-cdr3_violinplot_{panel_tag}.pdf"
        )


    #volcano plot
    # Flatten the "cdr3-cdr3_true" results currently stored in .uns and add the two
    # derived columns the volcano plots read.
    deg_result = subset.uns["rank_genes_groups"]
    group = "cdr3-cdr3_true"
    volcano_fields = ("logfoldchanges", "pvals", "pvals_adj", "scores")
    df_deg = pd.DataFrame({
        "gene": deg_result["names"][group],
        **{field: deg_result[field][group] for field in volcano_fields},
    })

    df_deg["-log10_pval"] = -np.log10(df_deg["pvals"])
    df_deg["significant"] = (df_deg["pvals"] < 0.05) & (df_deg["logfoldchanges"].abs() > 2)

    def plot_volcano(df, name, cwd, label_genes=True, xlim=None, ylim=None, label_specific_genes=None, label_suffix=None, base_ns_color="#B0B0B0", base_sig_color="#d62728", highlight_color="#1f77b4"):
        """Save an auto-ranged and a fixed-range volcano plot for a DEG table.

        df needs the columns "gene", "logfoldchanges", "-log10_pval" and "significant".
        Three labelling modes, checked in this order:
          * label_specific_genes given -> those genes (case-insensitive match) are
            highlighted and labelled; default suffix "specificgenes".
          * label_genes True -> every significant gene is highlighted and labelled;
            default suffix "label".
          * otherwise -> nothing is highlighted or labelled; default suffix "nolabel".
        label_suffix overrides the default suffix used in the output file names.
        xlim / ylim are applied to the "_fixedrange" figure only.
        """
        df = df.copy()

        # Decide which rows are highlighted/labelled and which suffix applies.
        if label_specific_genes is not None:
            wanted = {g.upper() for g in label_specific_genes}
            df["is_highlight"] = df["gene"].str.upper().isin(wanted)
            genes_to_label = df[df["is_highlight"]]
            default_suffix = "specificgenes"
        elif label_genes:
            df["is_highlight"] = df["significant"]
            genes_to_label = df[df["significant"]]
            default_suffix = "label"
        else:
            df["is_highlight"] = False
            genes_to_label = pd.DataFrame()
            default_suffix = "nolabel"

        if label_suffix is None:
            label_suffix = default_suffix

        # Point layers, bottom to top: (row mask, colour, alpha, zorder).
        layers = [
            (~df["significant"], base_ns_color, 0.7, 1),
            (df["significant"] & ~df["is_highlight"], base_sig_color, 0.9, 2),
            (df["is_highlight"], highlight_color, 1.0, 4),
        ]

        def _draw(ax, fix_range=False):
            point_size = 24

            for mask, color, alpha, z in layers:
                ax.scatter(
                    df.loc[mask, "logfoldchanges"],
                    df.loc[mask, "-log10_pval"],
                    c=color, s=point_size, alpha=alpha, edgecolors="none", zorder=z
                )

            # Guide lines at the thresholds used for the "significant" flag.
            ax.axhline(-np.log10(0.05), linestyle='--', color='black', linewidth=0.8)
            for x_threshold in (-2, 2):
                ax.axvline(x_threshold, linestyle='--', color='black', linewidth=0.8)

            if not genes_to_label.empty:
                # Label from the most to the least significant gene; each label is shifted
                # by a constant offset and tied to its point with a short line.
                shift = 10 * 0.05
                ranked = genes_to_label.sort_values("-log10_pval", ascending=False)

                for _, r in ranked.iterrows():
                    x0, y0 = r["logfoldchanges"], r["-log10_pval"]
                    x1, y1 = x0 + shift, y0 + shift

                    ax.text(
                        x1, y1, r["gene"],
                        fontsize=9, weight="bold", color="black", zorder=6,
                        path_effects=[pe.withStroke(linewidth=3, foreground="white")]
                    )

                    ax.plot([x0, x1], [y0, y1], color="black", linewidth=0.8, zorder=5)

            ax.set_xlabel("log2 Fold Change")
            ax.set_ylabel("-log10(p-value)")
            ax.set_title(f"{name}_subset (cdr3-cdr3_true vs rest)")

            if fix_range:
                if xlim is not None:
                    ax.set_xlim(xlim)
                if ylim is not None:
                    ax.set_ylim(ylim)
                ax.autoscale(enable=False)
                ax.margins(x=0, y=0)

        out_base = f"{cwd}/figures/{name}_subset_cdr3-cdr3_volcano_plot_{label_suffix}"

        # Figure 1: automatic axis ranges.
        fig, ax = plt.subplots(figsize=(8, 6))
        _draw(ax, fix_range=False)
        fig.tight_layout()
        fig.savefig(f"{out_base}.pdf")
        plt.close(fig)

        # Figure 2: caller-supplied axis ranges.
        fig, ax = plt.subplots(figsize=(8, 6), constrained_layout=False)
        _draw(ax, fix_range=True)
        fig.savefig(f"{out_base}_fixedrange.pdf", bbox_inches="tight")
        plt.close(fig)



    # Labelled, then unlabelled volcano plot of all genes.
    for with_labels in (True, False):
        plot_volcano(df_deg, name, cwd, label_genes=with_labels, xlim=[-10, 10])

    # Gene panels to highlight; each list name doubles as the output file suffix.
    specificgenesCD4_TG = [
        "Gbp9", "Gbp6", "Oas1a", "Ifi27l2a", "Ifi47", "Parp14", "Zbp1", "Irf7", "Isg15", "Cxcr3", "Icos", "Stat1"
    ]

    specificgenesCD4_TG_SP = [
        "Itgb1", "Icos", "Isg15", "Bhlhe40", "Cxcr3", "Ifng", "Nr4a1", "Gbp6", "Oas1a", "Isg20", "Nkg7", "Gbp9", "Icam1", "Ccl5", "Hif1a", "Ifi27l2a", "Irf7", "Zbp1", "S1pr1", "Il7r", "Klf2", "Bcl2", "Ccr7", "Sell"
    ]

    specificgenesCD8_DP_DN_TG = [
        "Trbv13-3", "Gzmb", "Gzma", "Itga1", "Ccr2", "Ly6a", "Isg15", "Cd8b1", "Gzmk", "Ccl5", "Ctla4", "Ifng", "Cd8a", "S1pr1", "Klf2", "Bcl2", "Tcf7", "Ccr7", "Sell", "Eomes", "Tox"
    ]

    specificgenesCD8_DP_DN_TG_SP = [
        "Gzma", "Ccl5", "S100a6", "Gzmb", "Trbv13-3", "Ifitm1", "Itga1", "Litaf", "Ifng", "Gzmk", "Ccr2", "Bhlhe40", "Nr4a2", "Ctla4", "Ccl4", "Ifitm2", "Tbx21", "Ccr5", "Isg15", "Cd38", "Icos", "Cxcr6", "Isg20", "Nr4a1", "Nr4a3", "Ifitm3", "Nkg7", "S1pr1", "Klf2", "Itga6", "Tox", "Ccr7", "Tcf7", "Sell", "Ccr9", "Tnfsf8"
    ]

    highlight_panels = [
        ("specificgenesCD4_TG", specificgenesCD4_TG),
        ("specificgenesCD4_TG_SP", specificgenesCD4_TG_SP),
        ("specificgenesCD8_DP_DN_TG", specificgenesCD8_DP_DN_TG),
        ("specificgenesCD8_DP_DN_TG_SP", specificgenesCD8_DP_DN_TG_SP),
    ]

    # One highlighted volcano plot per panel, in the order listed above.
    for panel_suffix, panel_genes in highlight_panels:
        plot_volcano(
            df_deg,
            name,
            cwd,
            label_genes=False,
            xlim=[-10, 10],
            label_specific_genes=panel_genes,
            label_suffix=panel_suffix
        )


    # ======= v =======

    print(f"Processing: {name}(Num of cells: {subset.n_obs}) - V")

    V_df = pd.read_csv(f"{cwd}/selected_V.csv")

    # (reference column in selected_V.csv, obs column holding the cell's V gene)
    V_types = [
        ("CD4_TRA_V", "v_gene_TRA"),
        ("CD4_TRB_V", "v_gene_TRB"),
        ("CD8_TRA_V", "v_gene_TRA"),
        ("CD8_TRB_V", "v_gene_TRB"),
    ]

    # A cell is "V_true" when its TRA or TRB V gene appears in any of the reference columns.
    v_hit = np.zeros(subset.n_obs, dtype=bool)
    for label, subset_col in V_types:
        selected_v = set(V_df[label].dropna().unique())
        v_hit |= subset.obs[subset_col].isin(selected_v).to_numpy()

    subset.obs["V_positive"] = pd.Categorical(
        np.where(v_hit, "V_true", "NA"),
        categories=["V_true", "NA"],
    )
    print(subset.obs["V_positive"].cat.categories)

    #UMAP
    sc.pl.umap(subset, color="V_positive", palette=["red", "lightgray"], show=True, save=f"{name}_subset_V_UMAP.pdf")

    #DEG
    sc.tl.rank_genes_groups(subset, groupby="V_positive", method="wilcoxon")

    sc.pl.rank_genes_groups(subset, n_genes=25, sharey=False, show=False, save=f"{name}_subset_V_DEG_fig.pdf")

    # Export the "V_true vs rest" DEG table.
    deg_result = subset.uns["rank_genes_groups"]
    group = "V_true"
    v_fields = ("logfoldchanges", "pvals", "pvals_adj", "scores")
    df = pd.DataFrame({
        "gene": deg_result["names"][group],
        **{field: deg_result[field][group] for field in v_fields},
    })

    with pd.ExcelWriter(f"{cwd}/figures/{name}_subset_V_DEG_results.xlsx") as writer:
        df.to_excel(writer, sheet_name=f"{group}_vs_rest", index=False)


    # ======= v-v combo =======

    print(f"Processing: {name}(Num of cells: {subset.n_obs}) - v-v combo")

    # Every TRA_V x TRB_V pairing of the reference genes, written "<TRA>_<TRB>".
    df = pd.read_csv(f"{cwd}/selected_V_TRA_TRB_forVcombo.csv")
    combo_list = []
    for tra in df["TRA_V"].dropna().unique():
        for trb in df["TRB_V"].dropna().unique():
            combo_list.append(f"{tra}_{trb}")
    combo_set = set(combo_list)

    # Flag the cells whose V_TRA_TRB value is one of those pairings.
    vv_flags = ["V-V_true" if pair in combo_set else "NA" for pair in subset.obs["V_TRA_TRB"]]
    subset.obs["V-V_combo"] = pd.Categorical(vv_flags, categories=["V-V_true", "NA"])
    print(subset.obs["V-V_combo"].cat.categories)

    #UMAP
    sc.pl.umap(subset, color="V-V_combo", palette=["red", "lightgray"], show=True, save=f"{name}_subset_V-V_UMAP.pdf")

    #DEG
    sc.tl.rank_genes_groups(subset, groupby="V-V_combo", method="wilcoxon")

    sc.pl.rank_genes_groups(subset, n_genes=25, sharey=False, show=False, save=f"{name}_subset_V-V_DEG_fig.pdf")

    # Export the "V-V_true vs rest" DEG table.
    deg_result = subset.uns["rank_genes_groups"]
    group = "V-V_true"
    vv_fields = ("logfoldchanges", "pvals", "pvals_adj", "scores")
    df = pd.DataFrame({
        "gene": deg_result["names"][group],
        **{field: deg_result[field][group] for field in vv_fields},
    })

    with pd.ExcelWriter(f"{cwd}/figures/{name}_subset_V-V_DEG_results.xlsx") as writer:
        df.to_excel(writer, sheet_name=f"{group}_vs_rest", index=False)
    

# Rename in Terminal

# Strip everything in front of the last "mouse" in each matching file name.
for f in *mouse*; do
  # When nothing matches, the glob stays literal; skip it.
  [ -e "$f" ] || continue
  new=$(printf '%s\n' "$f" | sed 's/^.*mouse/mouse/')
  # Nothing to do when the name is unchanged (mv would report an error).
  [ "$f" = "$new" ] && continue
  mv -- "$f" "$new"
done

# Rename in PowerShell

# For every file whose name contains "mouse", drop everything in front of the last
# "mouse" in the name and print "old → new" for each file that was renamed.
Get-ChildItem -File *mouse* | ForEach-Object {
    # -replace uses a regex; the greedy '.*mouse' matches up to the last "mouse".
    $newName = $_.Name -replace '.*mouse','mouse'
    # Skip files whose name is unchanged.
    if ($_.Name -ne $newName) {
        Rename-Item -LiteralPath $_.FullName -NewName $newName
        Write-Output "$($_.Name) → $newName"
    }
}