"""
Copyright 2026 Zsolt Bedőházi

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    https://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""

In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
from datetime import datetime
from collections import Counter
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import StratifiedGroupKFold
import ast

In [None]:
RND_SEED = 30

### load merged_df_latest.csv, which was generated earlier in "generate_multi_strat_common_test.ipynb" and is already filtered and cleaned

In [None]:
merged_df = pd.read_csv("merged_df_latest.csv", index_col=0)

In [None]:
merged_df.shape

In [None]:
merged_df.head(3)

### remove stage 4

In [None]:
# Keep only the stage 1-3 rows and renumber the rows from 0.
stage_is_1_to_3 = merged_df['stage'].isin([1, 2, 3])
merged_df = merged_df.loc[stage_is_1_to_3].reset_index(drop=True)

In [None]:
merged_df.shape

In [None]:
804 - 790 # same amount of biopsy bag and patient removed

In [None]:
merged_df.head(3)

In [None]:
merged_df.shape

In [None]:
merged_df['biopsy_id'].nunique()

In [None]:
merged_df['patient_ngsci_id'].nunique()

###  check — for patients who have multiple biopsies — whether all their biopsies have the same stage value

In [None]:
# Step 1: Group by patient and collect unique stages
patient_stage_sets = (
    merged_df.groupby('patient_ngsci_id')['stage']
    .apply(lambda x: set(x.dropna()))
)

In [None]:
# Step 2: Count number of biopsies per patient
biopsy_counts = merged_df.groupby('patient_ngsci_id')['biopsy_id'].nunique()

In [None]:
# Step 3: Filter to patients with more than one biopsy
patients_with_multiple_biopsies = biopsy_counts[biopsy_counts > 1].index

In [None]:
# Step 4: Among them, check which patients have >1 unique stage
patients_with_inconsistent_staging = patient_stage_sets.loc[patients_with_multiple_biopsies]
patients_with_different_stages = patients_with_inconsistent_staging[patients_with_inconsistent_staging.apply(lambda x: len(x) > 1)]

In [None]:
# Final Output
print("Number of patients with multiple biopsies and inconsistent stage labels:", len(patients_with_different_stages))
print("These patient IDs are:", patients_with_different_stages.index.tolist())

In [None]:
for patient_id, stages in patients_with_different_stages.items():
    print(f"Patient {patient_id} has multiple stages: {stages}")

### load  cancer-staging.csv for more info on the stage

In [None]:
cancer_staging_df = pd.read_csv("cancer-staging.csv")

In [None]:
cancer_staging_df

In [None]:
cancer_staging_df['patient_ngsci_id'].nunique()

In [None]:
cancer_staging_df['biopsy_id'].nunique()

In [None]:
cancer_staging_df[cancer_staging_df["patient_ngsci_id"]=="105ce742-8d3b-4294-8995-9f660b467345"]

In [None]:
cancer_staging_df[cancer_staging_df["assessment_type"]=="Clinical"]['biopsy_id'].nunique()

In [None]:
cancer_staging_df[cancer_staging_df["assessment_type"]=="Pathological"]['biopsy_id'].nunique()

In [None]:
sorted(cancer_staging_df["stage"].unique())

### check for each biopsy_id whether the Pathological stage is less severe than the Clinical stage

In [None]:
# Compare Clinical and Pathological severity per biopsy and collect the
# biopsies whose Pathological stage is less severe than the Clinical one.

# Step 1: Define stage severity order (position in the list = severity rank)
stage_order = ['0', 'IA', 'IB', 'IIA', 'IIB', 'IIIA', 'IIIB', 'IIIC', 'IV']
stage_rank = {stage: i for i, stage in enumerate(stage_order)}

# Step 2: Filter to only Clinical and Pathological rows.
# .copy() makes this an independent frame, so adding a column in Step 3 does
# not write into a slice of cancer_staging_df (no SettingWithCopyWarning).
staging_df = cancer_staging_df[
    cancer_staging_df['assessment_type'].isin(['Clinical', 'Pathological'])
].copy()

# Step 3: Map stages to severity rank (stages not in stage_order become NaN)
staging_df['stage_rank'] = staging_df['stage'].map(stage_rank)

# Step 4: Pivot so we can compare Clinical vs Pathological for each biopsy_id
pivoted = staging_df.pivot_table(
    index='biopsy_id',
    columns='assessment_type',
    values='stage_rank',
    aggfunc='first'  # if duplicates exist, take the first
)

# Step 5: Check for violations where Pathological is less severe than Clinical.
# A comparison involving NaN is False, so biopsies missing either assessment
# are excluded without separate notnull() tests.
violations = pivoted[pivoted['Pathological'] < pivoted['Clinical']]

print("Number of violations:", len(violations))

In [None]:
stage_rank

In [None]:
violations

In [None]:
(violations["Clinical"]-violations["Pathological"])

In [None]:
cancer_staging_df[cancer_staging_df["biopsy_id"]=="fc9d3b25-8a80-47e8-a665-6bdd1294d903"]

### map stage values in cancer-staging.csv to 0,1,2,3,4

In [None]:
cancer_staging_df = pd.read_csv("cancer-staging.csv")

In [None]:
cancer_staging_df.head()

In [None]:
cancer_staging_df.info()

In [None]:
# map cancer stage to 0 - 4:
def stage_to_int(stage):
    """Collapse a detailed stage label into its main stage number.

    "0" -> 0, "IA"/"IB" -> 1, "IIA"/"IIB" -> 2,
    "IIIA"/"IIIB"/"IIIC" -> 3, "IV" -> 4.
    Any other value gives np.nan.
    """
    labels_by_number = (
        (("0",), 0),
        (("IA", "IB"), 1),
        (("IIA", "IIB"), 2),
        (("IIIA", "IIIB", "IIIC"), 3),
        (("IV",), 4),
    )
    for labels, number in labels_by_number:
        if stage in labels:
            return number
    return np.nan

cancer_staging_df["stage_staging_mapped"] = cancer_staging_df["stage"].apply(stage_to_int)

In [None]:
cancer_staging_df["stage"].unique()

In [None]:
cancer_staging_df["stage_staging_mapped"].unique()

In [None]:
cancer_staging_df.head()

In [None]:
cancer_staging_df[cancer_staging_df["patient_ngsci_id"]=="f87a9872-2d4e-4eaf-a04f-f41bed310801"]

In [None]:
merged_df[merged_df["patient_ngsci_id"]=="f87a9872-2d4e-4eaf-a04f-f41bed310801"][["patient_ngsci_id","biopsy_id","slide_id","stage"]]

### for patients with more than one biopsy, check if the stage_staging_mapped value is the same across all biopsies of that patient

In [None]:
cancer_staging_df

In [None]:
cancer_staging_df["patient_ngsci_id"].nunique()

In [None]:
cancer_staging_df["biopsy_id"].nunique()

In [None]:
# For patients with more than one biopsy, check whether the mapped stage is
# the same across all of their biopsies, separately per assessment type.

# Step 1: Filter only patients with more than one unique biopsy_id
biopsy_counts = cancer_staging_df.groupby('patient_ngsci_id')['biopsy_id'].nunique()
multi_biopsy_patients = biopsy_counts[biopsy_counts > 1].index

# Step 2: Filter cancer_staging_df to those patients only
multi_biopsy_df = cancer_staging_df[cancer_staging_df['patient_ngsci_id'].isin(multi_biopsy_patients)]

# Step 3: Create helper function to check if all stage_staging_mapped values are equal per patient
def has_consistent_stage(df):
    """Return True when df['stage_staging_mapped'] has exactly one distinct value.

    nunique() ignores NaN, so a patient whose mapped stages are all NaN has
    0 distinct values and is reported as not consistent.
    """
    return df['stage_staging_mapped'].nunique() == 1


def _stage_consistency_by_patient(df, assessment_type):
    """Return one boolean per patient for the rows of one assessment type."""
    rows = df[df['assessment_type'] == assessment_type]
    # The stage column is selected explicitly so that apply() does not operate
    # on the grouping column, which pandas 2.2 deprecated.
    return (
        rows.groupby('patient_ngsci_id')[['stage_staging_mapped']]
        .apply(has_consistent_stage)
    )


# Step 4: Check consistency separately for Clinical and Pathological
clinical_consistency = _stage_consistency_by_patient(multi_biopsy_df, 'Clinical')
pathological_consistency = _stage_consistency_by_patient(multi_biopsy_df, 'Pathological')

# Step 5: Summarize results
print(f"Clinical stage consistent across biopsies: {clinical_consistency.sum()} out of {len(clinical_consistency)} patients")
print(f"Pathological stage consistent across biopsies: {pathological_consistency.sum()} out of {len(pathological_consistency)} patients")

# Step 6: (Optional) Show inconsistent patients
inconsistent_clinical = clinical_consistency[~clinical_consistency.astype(bool)].index.tolist()
inconsistent_pathological = pathological_consistency[~pathological_consistency.astype(bool)].index.tolist()


In [None]:
inconsistent_clinical = clinical_consistency[clinical_consistency == False].index.tolist()
inconsistent_pathological = pathological_consistency[pathological_consistency == False].index.tolist()

In [None]:
# Clinical inconsistencies
clinical_inconsistent_df = cancer_staging_df[
    (cancer_staging_df['assessment_type'] == 'Clinical') &
    (cancer_staging_df['patient_ngsci_id'].isin(inconsistent_clinical))
].sort_values(['patient_ngsci_id', 'biopsy_id'])

# Pathological inconsistencies
pathological_inconsistent_df = cancer_staging_df[
    (cancer_staging_df['assessment_type'] == 'Pathological') &
    (cancer_staging_df['patient_ngsci_id'].isin(inconsistent_pathological))
].sort_values(['patient_ngsci_id', 'biopsy_id'])


In [None]:
cols_to_show = ['patient_ngsci_id', 'biopsy_id', 'assessment_type', 'stage', 'stage_staging_mapped', 'stage_dt']

print("Clinical inconsistencies:")
display(clinical_inconsistent_df[cols_to_show])

print("Pathological inconsistencies:")
display(pathological_inconsistent_df[cols_to_show])


### filter cancer-staging csv with merged_df based on the biopsy_id to use only the study dataset

In [None]:
cancer_staging_df = cancer_staging_df[cancer_staging_df["biopsy_id"].isin(merged_df["biopsy_id"])]

In [None]:
cancer_staging_df["patient_ngsci_id"].nunique()

In [None]:
cancer_staging_df["biopsy_id"].nunique()

In [None]:
# Repeat the per-patient stage consistency check on cancer_staging_df as it
# stands at this point, separately per assessment type.

# Step 1: Filter only patients with more than one unique biopsy_id
biopsy_counts = cancer_staging_df.groupby('patient_ngsci_id')['biopsy_id'].nunique()
multi_biopsy_patients = biopsy_counts[biopsy_counts > 1].index

# Step 2: Filter cancer_staging_df to those patients only
multi_biopsy_df = cancer_staging_df[cancer_staging_df['patient_ngsci_id'].isin(multi_biopsy_patients)]

# Step 3: Create helper function to check if all stage_staging_mapped values are equal per patient
def has_consistent_stage(df):
    """Return True when df['stage_staging_mapped'] has exactly one distinct value.

    nunique() ignores NaN, so a patient whose mapped stages are all NaN has
    0 distinct values and is reported as not consistent.
    """
    return df['stage_staging_mapped'].nunique() == 1


def _stage_consistency_by_patient(df, assessment_type):
    """Return one boolean per patient for the rows of one assessment type."""
    rows = df[df['assessment_type'] == assessment_type]
    # The stage column is selected explicitly so that apply() does not operate
    # on the grouping column, which pandas 2.2 deprecated.
    return (
        rows.groupby('patient_ngsci_id')[['stage_staging_mapped']]
        .apply(has_consistent_stage)
    )


# Step 4: Check consistency separately for Clinical and Pathological
clinical_consistency = _stage_consistency_by_patient(multi_biopsy_df, 'Clinical')
pathological_consistency = _stage_consistency_by_patient(multi_biopsy_df, 'Pathological')

# Step 5: Summarize results
print(f"Clinical stage consistent across biopsies: {clinical_consistency.sum()} out of {len(clinical_consistency)} patients")
print(f"Pathological stage consistent across biopsies: {pathological_consistency.sum()} out of {len(pathological_consistency)} patients")

# Step 6: (Optional) Show inconsistent patients
inconsistent_clinical = clinical_consistency[~clinical_consistency.astype(bool)].index.tolist()
inconsistent_pathological = pathological_consistency[~pathological_consistency.astype(bool)].index.tolist()


### merge cancer-staging csv with merged_df based on the biopsy_id to use only the study dataset

In [None]:
# Merge on biopsy_id as that is the common key
df_with_assessment = merged_df.merge(
    cancer_staging_df[['biopsy_id', 'stage', 'stage_staging_mapped', 'assessment_type']],
    on='biopsy_id',
    suffixes=('_merged', '_staging'),
    how='left'
)

In [None]:
df_with_assessment[df_with_assessment["patient_ngsci_id"]=="f87a9872-2d4e-4eaf-a04f-f41bed310801"][["patient_ngsci_id","biopsy_id","slide_id","stage_merged","stage_staging","stage_staging_mapped","assessment_type"]]

In [None]:
df_with_assessment.shape

In [None]:
df_with_assessment["biopsy_id"].nunique(), df_with_assessment["patient_ngsci_id"].nunique()

### Check if all biopsies in merged_df (790 total) have both assessment types (Clinical & Pathological) in df_with_assessment

In [None]:
# Count distinct assessment types per biopsy_id.
# nunique() ignores NaN, so a biopsy with no staging row after the left merge
# gets a count of 0.
assessment_counts = df_with_assessment.groupby('biopsy_id')['assessment_type'].nunique()

# How many have both Clinical and Pathological?
both_types_count = (assessment_counts == 2).sum()

# How many have exactly one assessment type?
missing_one_count = (assessment_counts == 1).sum()

# How many are missing entirely? (should be 0 if merge is clean)
# Counted separately so these are not reported as "only one".
missing_all_count = (assessment_counts == 0).sum()

total_biopsies = df_with_assessment['biopsy_id'].nunique()

print(f"Total biopsies: {total_biopsies}")
print(f"Biopsies with BOTH Clinical and Pathological: {both_types_count}")
print(f"Biopsies with ONLY ONE assessment type: {missing_one_count}")
print(f"Biopsies with NO assessment type: {missing_all_count}")

In [None]:
biopsies_missing_one = assessment_counts[assessment_counts < 2].index.tolist()
print("Biopsies with only one assessment type:", len(biopsies_missing_one))

In [None]:
Counter(df_with_assessment[df_with_assessment['biopsy_id'].isin(biopsies_missing_one)][["patient_ngsci_id","biopsy_id","slide_id","stage_merged","stage_staging","stage_staging_mapped","assessment_type"]]["assessment_type"])

In [None]:
# for 75 biopsies only the clinical stage is given, for 39 only the pathological stage is given

In [None]:
df_with_assessment[df_with_assessment['biopsy_id'].isin(biopsies_missing_one)][["patient_ngsci_id","biopsy_id","slide_id","stage_merged","stage_staging","stage_staging_mapped","assessment_type"]]

In [None]:
# Boolean masks over the rows of the biopsies listed in biopsies_missing_one:
# True where that row's assessment type is Clinical / Pathological.
single_assessment_types = df_with_assessment.loc[
    df_with_assessment['biopsy_id'].isin(biopsies_missing_one), "assessment_type"
]
only_clinical_index = single_assessment_types == "Clinical"
only_pathological_index = single_assessment_types == "Pathological"

In [None]:
plt.hist(df_with_assessment[df_with_assessment['biopsy_id'].isin(biopsies_missing_one)].loc[only_clinical_index].stage_merged)

In [None]:
plt.hist(df_with_assessment[df_with_assessment['biopsy_id'].isin(biopsies_missing_one)].loc[only_pathological_index].stage_merged)

In [None]:
df_with_assessment.shape

In [None]:
np.array_equal( df_with_assessment[df_with_assessment["assessment_type"]=="Clinical"]["stage_staging_mapped"].values,
                df_with_assessment[df_with_assessment["assessment_type"]=="Clinical"]["stage_merged"].values)

In [None]:
# Total absolute difference between the mapped Pathological stage and
# stage_merged over the Pathological rows; 0 means the two columns agree.
# Absolute values keep positive and negative differences from cancelling out,
# which a plain signed sum would allow.
pathological_rows = df_with_assessment[df_with_assessment["assessment_type"]=="Pathological"]
np.abs(pathological_rows["stage_staging_mapped"].values - pathological_rows["stage_merged"].values).sum()

### determine whether the stage_merged column in merged_df more closely matches Clinical or Pathological staging

In [None]:
# Pivot the stage_staging_mapped column to compare directly
pivoted_stages = df_with_assessment.pivot_table(
    index='biopsy_id',
    columns='assessment_type',
    values='stage_staging_mapped',
    aggfunc='first'  # If duplicates exist, just take the first
)

In [None]:
pivoted_stages

In [None]:
# One stage_merged value per biopsy_id (first occurrence), indexed by
# biopsy_id so it can be joined onto the pivoted table.
stage_merged_values = (
    df_with_assessment[['biopsy_id', 'stage_merged']]
    .drop_duplicates(subset='biopsy_id')
    .set_index('biopsy_id')
)

# Cast stage_merged to float for the comparison
stage_merged_values['stage_merged'] = stage_merged_values['stage_merged'].astype(float)

In [None]:
# Merge into pivoted dataframe
comparison_df = pivoted_stages.join(stage_merged_values)

In [None]:
comparison_df

In [None]:
comparison_df['matches_clinical'] = comparison_df['stage_merged'] == comparison_df['Clinical']
comparison_df['matches_pathological'] = comparison_df['stage_merged'] == comparison_df['Pathological']

In [None]:
n_match_clinical = comparison_df['matches_clinical'].sum()
n_match_pathological = comparison_df['matches_pathological'].sum()

print(f"Biopsies where stage_merged matches Clinical stage: {n_match_clinical}")
print(f"Biopsies where stage_merged matches Pathological stage: {n_match_pathological}")

In [None]:
mismatches = comparison_df[
    (comparison_df['matches_pathological'] == False)
]

print(f"Biopsies where stage_merged doesn't match Pathological: {len(mismatches)}")

In [None]:
mismatches

In [None]:
cancer_staging_df[cancer_staging_df["biopsy_id"]=="1260096f-2f15-47dd-b04e-a91a81faccb3"]

In [None]:
merged_df[merged_df["biopsy_id"]=="1260096f-2f15-47dd-b04e-a91a81faccb3"].stage

In [None]:
merged_df[merged_df["biopsy_id"]=="0d7bc5fd-eef9-419a-9f35-afb83932a6a6"].stage

In [None]:
mismatches[mismatches["Pathological"].isin([0,1,2,3])]  # 22 biopsies

In [None]:
# Label where the merged stage agrees: 'Pathological' when it matches the
# pathological stage, otherwise 'Clinical' when it matches the clinical stage,
# otherwise 'Unknown'.
comparison_df['stage_source'] = np.where(
    comparison_df['matches_pathological'],
    'Pathological',
    np.where(comparison_df['matches_clinical'], 'Clinical', 'Unknown'),
)

In [None]:
comparison_df

In [None]:
comparison_df[comparison_df["stage_source"]=="Clinical"]

In [None]:
comparison_df[(comparison_df["stage_source"]=="Clinical") & (~comparison_df["Pathological"].isin([0,1,2,3,4])) ]

In [None]:
Counter(comparison_df[(comparison_df["stage_source"]=="Clinical") & (~comparison_df["Pathological"].isin([0,1,2,3,4])) ]["Clinical"])

In [None]:
comparison_df[(~comparison_df["Clinical"].isin([0,1,2,3,4])) & (comparison_df["Pathological"].isin([0,1,2,3,4])) ].shape

In [None]:
# 790 biopsies originally in the study (manuscript)
# 676 where both assessment types are available -> 790-676=114, losing 114 biopsies
#
# 97 where the source is clinical, but for 22 of them both types are available; for those 22 the stage can be changed to pathological,
# therefore 97-22=75, so there are only 75 biopsies where only the clinical stage is available; those can be excluded -> 790-75=715, losing only 75 biopsies.
# the difference is the biopsies for which only the pathological stage is available, which is exactly 39 biopsies.

In [None]:
merged_df.shape

In [None]:
comparison_df.shape

In [None]:
comparison_df

In [None]:
comparison_df[comparison_df['Pathological'].isna()].shape

### get final per-patient table using Pathological stage as the final stage label for downstream analysis

In [None]:
merged_df.shape, comparison_df.shape

In [None]:
# Work on a copy so merged_df keeps slide_id exactly as read from the CSV.
merged_df_raw = merged_df.copy()
# Parse each slide_id string back into a Python object. ast.literal_eval only
# accepts Python literals, so text from the CSV is never executed as code
# (which eval would do).
merged_df_raw['slide_id'] = merged_df_raw['slide_id'].apply(ast.literal_eval)

In [None]:
# Drop index if needed to merge cleanly
comparison_df = comparison_df.reset_index()

# Merge to enrich the biopsy-level dataframe
merged_enriched = merged_df_raw.merge(
    comparison_df,
    on='biopsy_id',
    how='left'
)

In [None]:
merged_enriched.shape

In [None]:
# Keep only biopsies where Pathological is known.
# Filtering on the Pathological column itself also drops rows whose
# stage_source is 'Unknown' or missing while Pathological is NaN; the earlier
# test (stage_source == 'Clinical' and Pathological is NaN) let those through.
filtered_biopsies = merged_enriched[merged_enriched['Pathological'].notna()]

In [None]:
filtered_biopsies.shape

In [None]:
filtered_biopsies['slide_id'].explode().count()

In [None]:
# Collapse the biopsy-level rows into one row per patient: biopsy ids are
# collected into a list, the per-biopsy slide lists are concatenated, and
# every other column takes the 'first' aggregate of the group.
per_patient_aggregations = {
    'biopsy_id': list,
    'slide_id': lambda slide_lists: [slide for slides in slide_lists for slide in slides],
}
for column in (
    'stage',  # original stage
    'Clinical',
    'Pathological',
    'stage_merged',
    'matches_clinical',
    'matches_pathological',
    'stage_source',
    'age',
    'race',
    'mortality',
):
    per_patient_aggregations[column] = 'first'

final_df = (
    filtered_biopsies
    .groupby('patient_ngsci_id')
    .agg(per_patient_aggregations)
    .reset_index()
)

In [None]:
final_df.shape

In [None]:
final_df['slide_id'].explode().count()

In [None]:
c, v = np.unique(final_df.patient_ngsci_id.values, return_counts=True)
v.shape

In [None]:
Counter(final_df.stage.values)

In [None]:
final_df.head()

In [None]:
merged_enriched['patient_ngsci_id'].nunique(), filtered_biopsies['patient_ngsci_id'].nunique()

In [None]:
merged_enriched['patient_ngsci_id'].nunique() - filtered_biopsies['patient_ngsci_id'].nunique()

In [None]:
patients_before = set(merged_enriched['patient_ngsci_id'].unique())

In [None]:
patients_after = set(filtered_biopsies['patient_ngsci_id'].unique())

In [None]:
patients_removed_completely = patients_before - patients_after
print(f"Number of patients completely removed: {len(patients_removed_completely)}")

In [None]:
# Step 4a: Biopsy counts per patient before and after
biopsies_per_patient_before = merged_enriched.groupby('patient_ngsci_id')['biopsy_id'].nunique()
biopsies_per_patient_after = filtered_biopsies.groupby('patient_ngsci_id')['biopsy_id'].nunique()

# Step 4b: Find patients where biopsy count was reduced but not zero
patients_partially_affected = [
    pid for pid in biopsies_per_patient_before.index
    if pid in biopsies_per_patient_after.index and
       biopsies_per_patient_before[pid] > biopsies_per_patient_after[pid]
]

print(f"Number of patients partially affected (some biopsies removed): {len(patients_partially_affected)}")

In [None]:
final_df

### generate column for stratification: stage, age, race, mortality

In [None]:
final_df['Pathological'] = final_df['Pathological'].astype(int)

In [None]:
np.unique(final_df.race.values, return_counts=True)

In [None]:
np.unique(final_df.mortality.values, return_counts=True)

In [None]:
final_df.age.min(), final_df.age.max()

In [None]:
plt.hist(final_df.age, bins=50);

In [None]:
quantiles = final_df.age.quantile([0, 1/3, 2/3, 1]).tolist()
quantiles

In [None]:
final_df['age_categories'] = pd.cut(final_df['age'], bins=quantiles, labels=[0, 1, 2], include_lowest=True)

In [None]:
# old: stage, age, race, mortality

final_df['stratify_col'] = final_df["Pathological"].astype(str) + "_" + \
                            final_df['age_categories'].astype(str) + "_" + \
                            final_df['race'].astype(str) + "_" + \
                            final_df['mortality'].astype(str)

cv_split_dir_name = "cv_splits_multi_stratified_sklearn_s_a_r_mo_paper_patients_rev"
os.makedirs(cv_split_dir_name, exist_ok=True)

In [None]:
final_df['stratify_col']

### Generate local test set (25%)

In [None]:
# Hold out one fold of a 4-fold split as the local test set (1/4 = 25%).
# The previous expression int(1 // (0.2)) also evaluated to 4, because floor
# division by the float 0.2 gives 4.0 rather than 5.0. Stating the fraction
# directly keeps the same number of folds and makes the 25% explicit.
test_fraction = 0.25
n_splits = round(1 / test_fraction)

X = final_df[['biopsy_id', 'patient_ngsci_id']]
y = final_df['stratify_col']

# shuffle and random_state are left at their defaults here.
cv = StratifiedGroupKFold(n_splits=n_splits)

In [None]:
# Run the splitter once, keep the positional indices of every fold, and print
# the shape and Pathological stage counts of both parts of each fold.
train_splits, val_splits = [], []

for fold, (train_idx, val_idx) in enumerate(cv.split(X, y, groups=final_df['patient_ngsci_id'])):
    train_splits.append(train_idx)
    val_splits.append(val_idx)

    train_set, val_set = final_df.iloc[train_idx], final_df.iloc[val_idx]

    print(f"Fold {fold + 1}")
    for title, part in (("Train Set:      ", train_set), ("Validation Set: ", val_set)):
        print(title, part.shape, np.unique(part.Pathological.values, return_counts=True))
    print("-" * 40)

In [None]:
len(train_splits), len(val_splits)

In [None]:
final_df_test = final_df.iloc[val_splits[0]]

In [None]:
final_df_test.shape, Counter(final_df_test.Pathological.values)

In [None]:
final_df_test.to_csv(f"{cv_split_dir_name}/test_split_multi_stratified.csv", index=False)

In [None]:
# Everything that is not in the test fold.
# val_splits[0] holds positional indices, so the mask is built from row
# positions rather than from index labels (the two only coincide for a default
# 0..n-1 index). np.isin replaces np.in1d, which NumPy 2.0 deprecated.
is_test_row = np.isin(np.arange(len(final_df)), val_splits[0])
final_df_rest = final_df.iloc[~is_test_row].reset_index(drop=True)

In [None]:
final_df_rest.shape

In [None]:
Counter(final_df_rest.Pathological.values)

### Generate train-val folds

In [None]:
n_splits = 5

X = final_df_rest[['biopsy_id', 'patient_ngsci_id']]
y = final_df_rest['stratify_col']

cv = StratifiedGroupKFold(n_splits=n_splits)

In [None]:
# Run the splitter on the non-test patients, keep the positional indices of
# every fold, and print the shape and Pathological stage counts of both parts.
train_splits, val_splits = [], []

for fold, (train_idx, val_idx) in enumerate(cv.split(X, y, groups=final_df_rest['patient_ngsci_id'])):
    train_splits.append(train_idx)
    val_splits.append(val_idx)

    train_set, val_set = final_df_rest.iloc[train_idx], final_df_rest.iloc[val_idx]

    print(f"Fold {fold + 1}")
    for title, part in (("Train Set:      ", train_set), ("Validation Set: ", val_set)):
        print(title, part.shape, np.unique(part.Pathological.values, return_counts=True))
    print("-" * 40)

In [None]:
# check if there is any overlap in the val sets
# Every pair of folds is compared (not only fold 0 against the others); each
# printed list should be empty.
for i in range(n_splits):
    for j in range(i + 1, n_splits):
        overlap = np.intersect1d(val_splits[i], val_splits[j]).tolist()
        print(f"fold {i} vs fold {j}: {overlap}")

In [None]:
# Write one train CSV and one validation CSV per fold into cv_split_dir_name.
for s in range(n_splits):
    train_path = f"{cv_split_dir_name}/train_split_multi_stratified_{s}.csv"
    val_path = f"{cv_split_dir_name}/val_split_multi_stratified_{s}.csv"

    for fold_rows, csv_path in ((train_splits[s], train_path), (val_splits[s], val_path)):
        final_df_rest.iloc[fold_rows].to_csv(csv_path, index=False)

In [None]:
final_df.to_csv(f"{cv_split_dir_name}/final_df.csv", index=False)

### Exclude slides after tsne dbscan filtering

In [None]:
filtered_biopsies.patient_ngsci_id.nunique(), filtered_biopsies.biopsy_id.explode().count(), filtered_biopsies.slide_id.explode().count()

In [None]:
# Load the saved .npy file that contains the filtered slides from tsne clusters
slide_ids_isolated_cluster_tsne_1_2_3 = np.load('../cv_splits_paper/slide_ids_isolated_cluster_tsne_1_2_3.npy', allow_pickle=True)

In [None]:
slide_ids_isolated_cluster_tsne_1_2_3.shape

In [None]:
# exclude slides

excluded_slides_cluster_tsne_1_2_3 = set(slide_ids_isolated_cluster_tsne_1_2_3)

# Function to filter slide_id list per row
def filter_slide_ids(slide_list):
    """Drop the excluded slides from one row's slide list.

    Returns the slides that are not in excluded_slides_cluster_tsne_1_2_3,
    or np.nan when none are left. A value that is not a list (e.g. an
    existing NaN) is returned unchanged.
    """
    if not isinstance(slide_list, list):
        return slide_list
    kept = [slide for slide in slide_list if slide not in excluded_slides_cluster_tsne_1_2_3]
    return kept or np.nan

# Make a copy of the original DataFrame
filtered_biopsies_with_excluded_slides_cluster_tsne_1_2_3 = filtered_biopsies.copy()

# Apply the filtering function to the slide_id column
filtered_biopsies_with_excluded_slides_cluster_tsne_1_2_3['slide_id'] = filtered_biopsies_with_excluded_slides_cluster_tsne_1_2_3['slide_id'].apply(filter_slide_ids)

In [None]:
# Total number of slides before and after the exclusion; entries that are not
# lists (NaN) count as zero slides.
original_total_slides = sum(
    len(slides) for slides in filtered_biopsies['slide_id'] if isinstance(slides, list)
)
filtered_total_slides = sum(
    len(slides)
    for slides in filtered_biopsies_with_excluded_slides_cluster_tsne_1_2_3['slide_id']
    if isinstance(slides, list)
)

print(f"Original total slides: {original_total_slides}")
print(f"Filtered total slides: {filtered_total_slides}")
print(f"Slides removed: {original_total_slides - filtered_total_slides}")

In [None]:
filtered_biopsies_with_excluded_slides_cluster_tsne_1_2_3.slide_id.isna().sum()
# one biopsy (419bf87b-6795-48f1-b0f9-00127b3cb13f) that alone belongs to one patient, and one biopsy that is part of a patient with 2 biopsies

In [None]:
filtered_biopsies_with_excluded_slides_cluster_tsne_1_2_3[filtered_biopsies_with_excluded_slides_cluster_tsne_1_2_3.slide_id.isna()]

In [None]:
# remove biopsies with no slides left (a patient whose only biopsy is removed drops out too), and save df

In [None]:
filtered_biopsies_with_excluded_slides_cluster_tsne_1_2_3 = filtered_biopsies_with_excluded_slides_cluster_tsne_1_2_3[~filtered_biopsies_with_excluded_slides_cluster_tsne_1_2_3.slide_id.isna()]

In [None]:
filtered_biopsies_with_excluded_slides_cluster_tsne_1_2_3.patient_ngsci_id.nunique(), filtered_biopsies_with_excluded_slides_cluster_tsne_1_2_3.biopsy_id.explode().count(), filtered_biopsies_with_excluded_slides_cluster_tsne_1_2_3.slide_id.explode().count()

In [None]:
filtered_biopsies_with_excluded_slides_cluster_tsne_1_2_3.shape

In [None]:
filtered_biopsies_with_excluded_slides_cluster_tsne_1_2_3.to_csv("merged_df_latest_with_excluded_slides_cluster_tsne_1_2_3.csv", index=False)

In [None]:
# Collapse the slide-filtered biopsy rows into one row per patient: biopsy ids
# are collected into a list, the per-biopsy slide lists are concatenated, and
# every other column takes the 'first' aggregate of the group.
per_patient_aggregations = {
    'biopsy_id': list,
    'slide_id': lambda slide_lists: [slide for slides in slide_lists for slide in slides],
}
for column in (
    'stage',  # original stage
    'Clinical',
    'Pathological',
    'stage_merged',
    'matches_clinical',
    'matches_pathological',
    'stage_source',
    'age',
    'race',
    'mortality',
):
    per_patient_aggregations[column] = 'first'

final_df_with_excluded_slides_cluster_tsne_1_2_3 = (
    filtered_biopsies_with_excluded_slides_cluster_tsne_1_2_3
    .groupby('patient_ngsci_id')
    .agg(per_patient_aggregations)
    .reset_index()
)

In [None]:
final_df_with_excluded_slides_cluster_tsne_1_2_3.shape

In [None]:
final_df_with_excluded_slides_cluster_tsne_1_2_3.patient_ngsci_id.nunique(), final_df_with_excluded_slides_cluster_tsne_1_2_3.biopsy_id.explode().count(), final_df_with_excluded_slides_cluster_tsne_1_2_3.slide_id.explode().count()

In [None]:
final_df_with_excluded_slides_cluster_tsne_1_2_3.shape, final_df_with_excluded_slides_cluster_tsne_1_2_3.patient_ngsci_id.nunique()

In [None]:
final_df_with_excluded_slides_cluster_tsne_1_2_3.to_csv(f"{cv_split_dir_name}/final_df_with_excluded_slides_cluster_tsne_1_2_3.csv", index=False)