# modularized

In [140]:
import pandas as pd
import os
def process_tmb_msi(csv_file_path, clinical_file_path, tmb_threshold=10, msi_threshold=2):
    """
    Flag every row of a patient-label CSV as TMB-high and/or MSI-high.

    The label CSV and a tab-separated clinical file are loaded, their column
    names are normalized (surrounding whitespace stripped, inner whitespace
    runs replaced by '_'), and the IDs of both tables are truncated to their
    first 12 characters (typical TCGA patient ID length) before matching.

    :param csv_file_path: Path to the CSV file with an 'ID' column.
    :param clinical_file_path: Path to the clinical data text file
        (tab-separated, first four lines skipped). After normalization it must
        provide 'PATIENT_ID', 'TMB_NONSYNONYMOUS' and 'MSI_SCORE_MANTIS'.
    :param tmb_threshold: TMB_HIGH is 1 when TMB_NONSYNONYMOUS is strictly
        greater than this value.
    :param msi_threshold: MSI_HIGH is 1 when MSI_SCORE_MANTIS is strictly
        greater than this value.
    :return: The CSV contents as a pandas DataFrame (normalized column names,
        truncated 'ID') with integer 0/1 columns 'TMB_HIGH' and 'MSI_HIGH'.
    :raises KeyError: If 'ID' or one of the required clinical columns is missing.
    """
    whitespace_run = r'\s+'

    # Load the label CSV. Headers are normalized BEFORE 'ID' is accessed so that
    # a padded header such as ' ID ' still resolves to 'ID'.
    csv_df = pd.read_csv(csv_file_path)
    csv_df.columns = csv_df.columns.str.strip().str.replace(whitespace_run, '_', regex=True)
    if 'ID' not in csv_df.columns:
        raise KeyError(
            f"{csv_file_path}: no 'ID' column after normalization; found {list(csv_df.columns)}"
        )
    csv_df['ID'] = csv_df['ID'].str.slice(0, 12)

    # Load the clinical table; the first four lines are skipped so that the
    # fifth line supplies the column names.
    clinical_df = pd.read_csv(clinical_file_path, sep='\t', skiprows=4)
    clinical_df.columns = clinical_df.columns.str.strip().str.replace(whitespace_run, '_', regex=True)

    required_columns = ('PATIENT_ID', 'TMB_NONSYNONYMOUS', 'MSI_SCORE_MANTIS')
    missing_columns = [name for name in required_columns if name not in clinical_df.columns]
    if missing_columns:
        raise KeyError(
            f"{clinical_file_path}: missing clinical column(s) {missing_columns}; "
            f"found {list(clinical_df.columns)}"
        )

    # Truncate clinical IDs the same way as the CSV IDs so both sides compare equal
    # even when one of them carries a sample suffix.
    patient_ids = clinical_df['PATIENT_ID'].str.slice(0, 12)

    flag_rules = (
        ('TMB_NONSYNONYMOUS', tmb_threshold, 'TMB_HIGH'),
        ('MSI_SCORE_MANTIS', msi_threshold, 'MSI_HIGH'),
    )
    for score_column, threshold, flag_column in flag_rules:
        # Non-numeric entries become NaN, and NaN never compares greater than a threshold.
        scores = pd.to_numeric(clinical_df[score_column], errors='coerce')
        # Drop missing patient IDs: isin() treats NaN as equal to NaN, which would
        # otherwise flag CSV rows that have no ID at all.
        high_ids = patient_ids[scores > threshold].dropna().unique()
        csv_df[flag_column] = csv_df['ID'].isin(high_ids).astype('int64')

    return csv_df


# (label CSV, clinical sample file) pairs to process — one pair per cohort, six in total.
file_pairs = [
    (
      r'/data/temporary/amirhosein/mutation_project/TP53_labels/Bladder_Labels.csv',
      r'/data/temporary/amirhosein/bladder_project/blca_tcga_pan_can_atlas_2018/data_clinical_sample.txt'
    ),
    (
      r'/data/temporary/amirhosein/mutation_project/TP53_labels/Breast_Labels.csv',
      r'/data/temporary/amirhosein/breast_Project(subdirectories)/brca_tcga_pan_can_atlas_2018/data_clinical_sample.txt'
    ),
    (
      r'/data/temporary/amirhosein/mutation_project/TP53_labels/Prostate_Labels.csv',
      r'/data/temporary/amirhosein/Prostate_Project/prad_tcga_pan_can_atlas_2018/data_clinical_sample.txt'
    ),
    (
      r'/data/temporary/amirhosein/mutation_project/TP53_labels/Skin_Labels.csv',
      r'/data/temporary/amirhosein/skin_project/skcm_tcga_pan_can_atlas_2018/data_clinical_sample.txt'
    ),
    (
      r'/data/temporary/amirhosein/mutation_project/TP53_labels/Colon_labels.csv',
      r'/data/temporary/amirhosein/gastrointestinal_project(no_subdirectories)/coadread_tcga_pan_can_atlas_2018/data_clinical_sample.txt'
    ),
    (
      r'/data/temporary/amirhosein/mutation_project/TP53_labels/lungld_Labels.csv',
      r'/data/temporary/amirhosein/lung_project_diagnostics/luad_tcga_pan_can_atlas_2018/data_clinical_sample.txt'
    ),
]

# Directory that receives one "all_three_labels<name>.csv" file per pair.
OUTPUT_DIR = '/data/temporary/amirhosein/mutation_project/all_labels'

# Thresholds forwarded to process_tmb_msi; adjust them here as needed.
TMB_THRESHOLD = 10
# NOTE(review): the output recorded for this cell shows MSI_HIGH == 0 for every row,
# so a cutoff of 2 may lie above the range of the MSI score being compared —
# confirm the intended cutoff before relying on the MSI_HIGH column.
MSI_THRESHOLD = 2

# to_csv does not create missing directories, so make sure the target exists first.
os.makedirs(OUTPUT_DIR, exist_ok=True)

for i, (csv_path, clinical_path) in enumerate(file_pairs, start=1):
    processed_df = process_tmb_msi(
        csv_path,
        clinical_path,
        tmb_threshold=TMB_THRESHOLD,
        msi_threshold=MSI_THRESHOLD,
    )

    # Name of the label CSV without its extension, e.g. "Bladder_Labels".
    df_name = os.path.splitext(os.path.basename(csv_path))[0]

    # Output path, e.g.
    # "/data/temporary/amirhosein/mutation_project/all_labels/all_three_labelsBladder_Labels.csv"
    out_file = os.path.join(OUTPUT_DIR, f'all_three_labels{df_name}.csv')

    # Save the processed DataFrame
    processed_df.to_csv(out_file, index=False)

    # Print a summary
    print(f"Processed file pair {i} -> {df_name}")
    print(f"Saved to: {out_file}")
    print("Shape:", processed_df.shape)
    print("TMB_HIGH value counts:")
    print(processed_df["TMB_HIGH"].value_counts())
    print("MSI_HIGH value counts:")
    print(processed_df["MSI_HIGH"].value_counts())
    print("--------")

Processed file pair 1 -> Bladder_Labels
Saved to: /data/temporary/amirhosein/mutation_project/all_labels/all_three_labelsBladder_Labels.csv
Shape: (341, 4)
TMB_HIGH value counts:
TMB_HIGH
0    246
1     95
Name: count, dtype: int64
MSI_HIGH value counts:
MSI_HIGH
0    341
Name: count, dtype: int64
--------
Processed file pair 2 -> Breast_Labels
Saved to: /data/temporary/amirhosein/mutation_project/all_labels/all_three_labelsBreast_Labels.csv
Shape: (1119, 4)
TMB_HIGH value counts:
TMB_HIGH
0    1087
1      32
Name: count, dtype: int64
MSI_HIGH value counts:
MSI_HIGH
0    1119
Name: count, dtype: int64
--------
Processed file pair 3 -> Prostate_Labels
Saved to: /data/temporary/amirhosein/mutation_project/all_labels/all_three_labelsProstate_Labels.csv
Shape: (448, 4)
TMB_HIGH value counts:
TMB_HIGH
0    445
1      3
Name: count, dtype: int64
MSI_HIGH value counts:
MSI_HIGH
0    448
Name: count, dtype: int64
--------
Processed file pair 4 -> Skin_Labels
Saved to: /data/temporary/amirhosei

In [139]:
import numpy as np
# Sanity check: largest "MSI MANTIS Score" in the breast cohort's clinical sample file.
# The file is read without skipping any lines, so its first line supplies the column names.
tes = pd.read_csv(
    r'/data/temporary/amirhosein/breast_Project(subdirectories)/brca_tcga_pan_can_atlas_2018/data_clinical_sample.txt',
    sep='\t',
)
# The first four rows are dropped before the float conversion
# (presumably the remaining metadata/header rows of the file — TODO confirm).
msi_mantis_scores = tes["MSI MANTIS Score"].iloc[4:].astype("float")
np.max(msi_mantis_scores)

0.8283