## Data processing

In [None]:
import pandas as pd
import os

# Input contig-annotation CSVs for the 0605 CNS and spleen samples.
base_path = "/ihome/ylee/yiz133/Code/Scirpy/scripy/data"
cns_path = f"{base_path}/0605_CNS_filtered_contig_annotations.csv"
spleen_path = f"{base_path}/0605_SPL_filtered_contig_annotations.csv"

# The first column of the strict-filtered metadata holds the barcodes that
# passed the scirpy-based filtering.
filtered_meta = pd.read_csv(f"{base_path}/strict_filtered_meta.csv")
barcodes_set = set(filtered_meta.iloc[:, 0])

# Split the barcodes by sample tag, keeping only the part before the first
# underscore as the identifier.
cns_0605_ids = set()
spleen_0605_ids = set()
for _barcode in barcodes_set:
    if "0605_CNS" in _barcode:
        cns_0605_ids.add(_barcode.split("_")[0])
    if "0605_SPL" in _barcode:
        spleen_0605_ids.add(_barcode.split("_")[0])


# One output folder per tissue, created on demand.
output_dir = "/ihome/ylee/yiz133/Code/deepTCR/DeepTCR"
cns_dir = os.path.join(output_dir, "CNS")
spleen_dir = os.path.join(output_dir, "Spleen")

for _folder in (cns_dir, spleen_dir):
    os.makedirs(_folder, exist_ok=True)

def strict_filter_tcr_beta(df, ids):
    """Return the CDR3/V/J columns of trusted TCR beta contigs.

    A row is kept when its ``barcode`` is in ``ids``, ``high_confidence`` and
    ``productive`` are both True, and ``chain`` equals ``"TRB"``. Rows with a
    missing ``cdr3``, ``v_gene`` or ``j_gene`` are dropped.

    Gene calls that list several candidates joined by ``"+"`` are reduced to
    the first candidate. (Previously the function returned before this step
    ran, so multi-gene calls were passed through untouched.)

    Args:
        df: Contig annotation table with the columns ``barcode``,
            ``high_confidence``, ``productive``, ``chain``, ``cdr3``,
            ``v_gene`` and ``j_gene``.
        ids: Collection of barcodes to keep.

    Returns:
        A new DataFrame with the columns ``cdr3``, ``v_gene`` and ``j_gene``;
        the input frame is not modified.
    """
    # `== True` is an element-wise comparison on the Series, not an identity test.
    keep = (
        df["barcode"].isin(ids)
        & (df["high_confidence"] == True)
        & (df["productive"] == True)
        & (df["chain"] == "TRB")
    )

    # Work on an explicit copy so the column assignments below never write
    # through to (or warn about) the caller's frame.
    beta = df.loc[keep, ["cdr3", "v_gene", "j_gene"]].dropna().copy()

    # Keep only the first gene when several are listed ("TRBV1+TRBV2" -> "TRBV1").
    for gene_col in ("v_gene", "j_gene"):
        beta[gene_col] = beta[gene_col].apply(lambda gene: gene.split("+", 1)[0])

    return beta

# Load each tissue's contig table and keep only the strictly filtered TRB rows.
cns_filtered = strict_filter_tcr_beta(pd.read_csv(cns_path), cns_0605_ids)
spleen_filtered = strict_filter_tcr_beta(pd.read_csv(spleen_path), spleen_0605_ids)

# Every surviving row starts with a count of 1; the counts are summed per CDR3 later.
for _frame in (cns_filtered, spleen_filtered):
    _frame["count"] = 1

def resolve_vj_conflicts(group):
    """Collapse all rows sharing one CDR3 into a single summary row.

    Args:
        group: Rows for one CDR3, with ``v_gene``, ``j_gene`` and ``count``
            columns.

    Returns:
        A Series indexed ``count``, ``v_gene``, ``j_gene`` holding the summed
        count and the most frequent V and J gene of the group.
    """

    def pick_gene(calls):
        # Only rank by frequency when the group actually disagrees;
        # otherwise the first value is the answer.
        if len(calls.unique()) > 1:
            return calls.value_counts().idxmax()
        return calls.iloc[0]

    top_v = pick_gene(group["v_gene"])
    top_j = pick_gene(group["j_gene"])
    total = group["count"].sum()  # duplicates of the same CDR3 add up
    return pd.Series([total, top_v, top_j], index=["count", "v_gene", "j_gene"])

# Collapse duplicate CDR3s within each tissue (summed count, resolved V/J gene).
cns_final = cns_filtered.groupby("cdr3").apply(resolve_vj_conflicts).reset_index()
spleen_final = spleen_filtered.groupby("cdr3").apply(resolve_vj_conflicts).reset_index()

# Column order of the output table: CDR3, count, V gene, J gene.
_column_order = ["cdr3", "count", "v_gene", "j_gene"]
cns_final = cns_final[_column_order]
spleen_final = spleen_final[_column_order]

# One TSV per tissue, inside its own folder.
cns_output_file = os.path.join(cns_dir, "CNS.tsv")
spleen_output_file = os.path.join(spleen_dir, "Spleen.tsv")

# The columns are written under the header names beta, counts, v_beta, j_beta.
_tsv_header = ["beta", "counts", "v_beta", "j_beta"]
for _table, _destination in (
    (cns_final, cns_output_file),
    (spleen_final, spleen_output_file),
):
    _table.to_csv(_destination, sep="\t", index=False, header=_tsv_header)

print(f"Saved CNS sequences to: {cns_output_file}")
print(f"Saved Spleen sequences to: {spleen_output_file}")


## Data processing (old version)

In [None]:
import pandas as pd
import os

# Input contig-annotation CSVs (one per tissue) from the 0516 runs.
cns_path = "/global/scratch/users/zhoufd/TCR/data/CNS_0516_config_5prime_2runs/outs/per_sample_outs/CNS_0516_config_5prime_2runs/vdj_t/filtered_contig_annotations.csv"
spleen_path = "/global/scratch/users/zhoufd/TCR/data/Spleen_0516_config_5prime_2runs/outs/per_sample_outs/Spleen_0516_config_5prime_2runs/vdj_t/filtered_contig_annotations.csv"

# Output root with one sub-folder per tissue.
output_dir = "/global/scratch/users/zhoufd/TCR/saved_data/DeepTCR_input"
cns_dir = os.path.join(output_dir, "CNS")
spleen_dir = os.path.join(output_dir, "Spleen")

# Make sure both folders exist before anything is written.
for _folder in (cns_dir, spleen_dir):
    os.makedirs(_folder, exist_ok=True)

def filter_tcr_beta(df):
    """Return the CDR3/V/J columns of cell-associated, trusted TCR beta contigs.

    A row is kept when ``is_cell``, ``high_confidence`` and ``productive`` are
    all True and ``chain`` equals ``"TRB"``. Rows with a missing ``cdr3``,
    ``v_gene`` or ``j_gene`` are dropped, and gene calls listing several
    candidates joined by ``"+"`` are reduced to the first one.

    Args:
        df: Contig annotation table containing the columns named above.

    Returns:
        DataFrame with the columns ``cdr3``, ``v_gene`` and ``j_gene``.
    """
    keep = (
        (df["is_cell"] == True)
        & (df["high_confidence"] == True)
        & (df["productive"] == True)
        & (df["chain"] == "TRB")
    )
    beta = df.loc[keep, ["cdr3", "v_gene", "j_gene"]].dropna()

    def first_gene(call):
        # "TRBV1+TRBV2" -> "TRBV1"; single-gene calls pass through unchanged.
        return call.split("+")[0] if "+" in call else call

    for gene_col in ("v_gene", "j_gene"):
        beta[gene_col] = beta[gene_col].apply(first_gene)

    return beta

# Load each tissue's contig table and keep only the filtered TRB rows.
cns_filtered = filter_tcr_beta(pd.read_csv(cns_path))
spleen_filtered = filter_tcr_beta(pd.read_csv(spleen_path))

# Every surviving row starts with a count of 1; the counts are summed per CDR3 later.
for _frame in (cns_filtered, spleen_filtered):
    _frame["count"] = 1

def resolve_vj_conflicts(group):
    """Summarise the rows of one CDR3 as a single (count, V gene, J gene) record.

    Args:
        group: Rows sharing a CDR3, with ``v_gene``, ``j_gene`` and ``count``
            columns.

    Returns:
        A Series indexed ``count``, ``v_gene``, ``j_gene``: the summed count
        plus, for each gene column, its most frequent value in the group.
    """
    chosen = {}
    for gene_col in ("v_gene", "j_gene"):
        calls = group[gene_col]
        if len(calls.unique()) > 1:
            # Conflicting calls: take the most common one.
            chosen[gene_col] = calls.value_counts().idxmax()
        else:
            chosen[gene_col] = calls.iloc[0]

    total = group["count"].sum()  # duplicates of the same CDR3 add up
    return pd.Series(
        [total, chosen["v_gene"], chosen["j_gene"]],
        index=["count", "v_gene", "j_gene"],
    )

# Collapse duplicate CDR3s within each tissue (summed count, resolved V/J gene).
cns_final = cns_filtered.groupby("cdr3").apply(resolve_vj_conflicts).reset_index()
spleen_final = spleen_filtered.groupby("cdr3").apply(resolve_vj_conflicts).reset_index()

# Column order of the output table: CDR3, count, V gene, J gene.
_column_order = ["cdr3", "count", "v_gene", "j_gene"]
cns_final = cns_final[_column_order]
spleen_final = spleen_final[_column_order]

# One TSV per tissue, inside its own folder.
cns_output_file = os.path.join(cns_dir, "CNS.tsv")
spleen_output_file = os.path.join(spleen_dir, "Spleen.tsv")

# The columns are written under the header names beta, counts, v_beta, j_beta.
_tsv_header = ["beta", "counts", "v_beta", "j_beta"]
for _table, _destination in (
    (cns_final, cns_output_file),
    (spleen_final, spleen_output_file),
):
    _table.to_csv(_destination, sep="\t", index=False, header=_tsv_header)

print(f"Saved CNS sequences to: {cns_output_file}")
print(f"Saved Spleen sequences to: {spleen_output_file}")

