In [None]:
## clean the raw input TCR data
import os
import pandas as pd
import numpy as np


data_dir = "./"  # raw TCR data directory

# Count columns that each get their own output file.
COUNT_COLUMNS = ("Count_1", "Count_2")
# File-name endings this cell appends to its outputs; used to recognise
# (and skip) those outputs when the cell is run again.
OUTPUT_SUFFIXES = tuple(f"_{col}.csv" for col in COUNT_COLUMNS)


def clean_amino_acid(df):
    """Return a new frame holding only the rows of ``df`` with a usable ``aminoAcid``.

    A value is discarded when it contains ``*`` or is empty/missing
    (``""``, ``"nan"``, ``"NaN"``, ``"NA"`` or an actual NaN).
    ``df`` itself is left unmodified.
    """
    sequences = df["aminoAcid"].replace(to_replace=r".*\*.*", value=np.nan, regex=True)
    sequences = sequences.replace(["", "nan", "NaN", "NA"], np.nan)
    return df.assign(aminoAcid=sequences)[sequences.notna()]


def split_by_count(df):
    """Split ``df`` into one frame per count column that is present.

    Returns a dict ``{count_column: frame}``.  Each frame keeps the rows where
    that count column is not missing and drops the other count column(s), so it
    carries exactly one of ``COUNT_COLUMNS``.
    """
    present = [col for col in COUNT_COLUMNS if col in df.columns]
    return {
        col: df[df[col].notna()].drop(columns=[other for other in present if other != col])
        for col in present
    }


def split_tcr_files(data_dir):
    """Clean every raw ``.csv`` in ``data_dir`` and write one file per count column.

    For ``<name>.csv`` the outputs are ``<name>_Count_1.csv`` and/or
    ``<name>_Count_2.csv``, written next to the input.  Files without an
    ``aminoAcid`` column are skipped, and a failure on one file is reported
    without stopping the others.  Returns the list of paths written.
    """
    written = []
    # sorted(): os.listdir returns entries in arbitrary order.
    for file in sorted(os.listdir(data_dir)):
        if not file.endswith(".csv"):
            continue
        if file.endswith(OUTPUT_SUFFIXES):
            # Outputs live in the same directory as the inputs.  Without this
            # guard a re-run would split them again and emit
            # "<name>_Count_1_Count_1.csv", growing on every run.
            # Assumes no raw input file name ends in "_Count_<n>.csv".
            print(f"Skipped (output of a previous run): {file}")
            continue

        file_path = os.path.join(data_dir, file)
        try:
            df = pd.read_csv(file_path)

            # check aminoAcid column
            if "aminoAcid" not in df.columns:
                print(f"Skipped (no aminoAcid column): {file}")
                continue

            df = clean_amino_acid(df)

            # file name without extension
            base_name = os.path.splitext(file)[0]

            for col, part in split_by_count(df).items():
                output_path = os.path.join(data_dir, f"{base_name}_{col}.csv")
                part.to_csv(output_path, index=False)
                print(f"Saved: {output_path}")
                written.append(output_path)

        except Exception as e:
            # Best effort: report the problem and carry on with the next file.
            print(f"Error processing {file}: {e}")
    return written


split_tcr_files(data_dir)


In [None]:
# summarize the cleaned data 
import os
import pandas as pd
import numpy as np


data_dir = "./"  # input data directory
# Where the one-row summary table is written.
# NOTE(review): absolute, machine-specific path; it only works on the original
# author's machine. Consider making it relative to a configurable directory.
summary_path = "/Users/awu10/Desktop/TCR_Subsampling/Pre/ProcessedData/CSV_file/summary_filtered_combined.csv"

# NOTE(review): the cleaning cell in this notebook also uses "./" and writes its
# "<name>_Count_N.csv" outputs next to the raw files. If both kinds of file sit
# in data_dir, every count is read twice here (ValidCount doubles) -- confirm
# which files this directory is meant to contain.


def collect_valid_counts(df, count_columns=("Count_1", "Count_2")):
    """Return the usable counts of one file as a single Series.

    A count is kept when it is not missing and its row has a usable
    ``aminoAcid`` (not missing, not ``""``/``"nan"``/``"NaN"``/``"NA"``, and
    not containing ``*``).  Values from every column of ``count_columns`` that
    exists in ``df`` are concatenated in column order.  An empty float Series
    is returned when none of the columns is present.  ``df`` is not modified.
    """
    sequences = df["aminoAcid"].replace(to_replace=r".*\*.*", value=np.nan, regex=True)
    sequences = sequences.replace(["", "nan", "NaN", "NA"], np.nan)
    has_sequence = sequences.notna()

    parts = [
        df.loc[has_sequence & df[col].notna(), col]
        for col in count_columns
        if col in df.columns
    ]
    if not parts:
        return pd.Series([], dtype=float)
    return pd.concat(parts, ignore_index=True)


def load_count_series(data_dir):
    """Return one Series of usable counts per readable ``.csv`` in ``data_dir``.

    Files without an ``aminoAcid`` column are skipped with a message; a failure
    on one file is reported and does not stop the others.
    """
    collected = []
    # sorted(): os.listdir returns entries in arbitrary order.
    for file in sorted(os.listdir(data_dir)):
        if not file.endswith(".csv"):
            continue
        file_path = os.path.join(data_dir, file)
        try:
            df = pd.read_csv(file_path)

            if "aminoAcid" not in df.columns:
                print(f"Skipped (no aminoAcid column): {file}")
                continue

            collected.append(collect_valid_counts(df))

        except Exception as e:
            # Best effort: report the problem and carry on with the next file.
            print(f"Error processing {file}: {e}")
    return collected


def combine_counts(series_list):
    """Concatenate per-file count Series into one Series.

    Returns an empty float Series when there is nothing to combine;
    ``pd.concat`` raises ``ValueError`` on an empty list, which would otherwise
    prevent the "no valid data" message from ever being shown.
    """
    non_empty = [counts for counts in series_list if not counts.empty]
    if not non_empty:
        return pd.Series([], dtype=float)
    return pd.concat(non_empty, ignore_index=True)


def summarize_counts(counts, label="AllFiles_Filtered_aminoAcid"):
    """Return a one-row DataFrame of distribution statistics for ``counts``.

    Columns: share of values strictly above 100/200/500 (in percent), mean,
    median, quartiles, IQR, 95th/99th percentile, maximum and number of values.
    ``label`` becomes the row index.
    """
    q1 = counts.quantile(0.25)
    q3 = counts.quantile(0.75)
    return pd.DataFrame([{
        "Percent > 100": (counts > 100).mean() * 100,
        "Percent > 200": (counts > 200).mean() * 100,
        "Percent > 500": (counts > 500).mean() * 100,
        "Mean": counts.mean(),
        "Median": counts.median(),
        "Q1": q1,
        "Q3": q3,
        "IQR": q3 - q1,
        "P95": counts.quantile(0.95),
        "P99": counts.quantile(0.99),
        "Max": counts.max(),
        "ValidCount": len(counts)
    }], index=[label])


all_counts = load_count_series(data_dir)
total_data = combine_counts(all_counts)

if total_data.empty:
    print("No valid data found across all files.")
else:
    summary_df = summarize_counts(total_data)

    print(summary_df)
    summary_df.to_csv(summary_path)


                             Percent > 100  Percent > 200  Percent > 500  \
AllFiles_Filtered_aminoAcid       0.053183       0.028777       0.014206   

                                 Mean  Median   Q1   Q3  IQR  P95  P99  \
AllFiles_Filtered_aminoAcid  1.791094     1.0  1.0  1.0  0.0  3.0  8.0   

                                 Max  ValidCount  
AllFiles_Filtered_aminoAcid  40409.0     1372623  
