"""
Copyright 2026 Zsolt Bedőházi

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    https://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""

In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm

from datetime import datetime
from dateutil.relativedelta import relativedelta
from collections import Counter

import ast

# SEMMELWEIS DATASET (training, internal test)

In [2]:
biopsy_df_sote = pd.read_csv('cv_splits_sote_multi_stratified_sklearn_s_a_paper_patients_rev/biopsy_df.csv')
labels_df_sote = pd.read_csv('../labels_df.csv')
biopsy_df_sote.shape, labels_df_sote.shape

((286, 19), (329, 18))

In [None]:
biopsy_df_sote

In [None]:
labels_df_sote

In [5]:
patients_unique_sote, patients_count_sote = np.unique(biopsy_df_sote.base_slide_number, return_counts=True)
patients_unique_sote.shape, patients_count_sote.shape

((286,), (286,))

In [6]:
slides_unique_sote, slides_count_sote = np.unique(biopsy_df_sote.base_slide_number, return_counts=True)
slides_unique_sote.shape, slides_count_sote.shape, len(biopsy_df_sote)

((286,), (286,), 286)

In [7]:
slide_id_counts_sote = len(labels_df_sote)
slide_id_counts_sote

329

In [8]:
def print_biopsybag_and_patient_stats_sote(biopsy_df_sote):
    
    stages, counts = np.unique(biopsy_df_sote.clinical_stage.values, return_counts=True)
    print(f"Biopsy bags - stages: {stages}, counts:  {counts},         total: {counts.sum()}")
    
    total_counts = counts.sum()
    percentages = np.round( (counts / total_counts) * 100, 2)
    percentages[-1] = 100 - percentages[:-1].sum()
    print(f"Biopsy bags - stages: {stages}, percent: {percentages}, total: {percentages.sum()}")

    stages, counts = np.unique(biopsy_df_sote.clinical_stage.values, return_counts=True)
    print(f"Patients    - stages: {stages}, counts:  {counts},         total: {counts.sum()}")

    total_counts = counts.sum()
    percentages = np.round( (counts / total_counts) * 100, 2)
    percentages[-1] = 100 - percentages[:-1].sum()
    print(f"Patients    - stages: {stages}, percent: {percentages}, total: {percentages.sum()}")

In [9]:
print("PRIVATE TEST SET - SEMMELWEIS:")
print("\nTOTAL PATIENTS:   ", patients_count_sote.shape[0])
print("TOTAL BIOPSY BAGS:", slides_count_sote.shape[0])
print("TOTAL SLIDES:     ", slide_id_counts_sote)
print_biopsybag_and_patient_stats_sote(biopsy_df_sote)

PRIVATE TEST SET - SEMMELWEIS:

TOTAL PATIENTS:    286
TOTAL BIOPSY BAGS: 286
TOTAL SLIDES:      329
Biopsy bags - stages: [1 2 3], counts:  [ 84 145  57],         total: 286
Biopsy bags - stages: [1 2 3], percent: [29.37 50.7  19.93], total: 100.0
Patients    - stages: [1 2 3], counts:  [ 84 145  57],         total: 286
Patients    - stages: [1 2 3], percent: [29.37 50.7  19.93], total: 100.0


In [None]:
list(labels_df_sote)

In [None]:
np.unique( labels_df_sote.Grade.values, return_counts=True )

In [None]:
np.unique( labels_df_sote['ER%'].values, return_counts=True )

In [None]:
np.unique( labels_df_sote['PR%'].values, return_counts=True )

In [None]:
uqs, cs = np.unique( labels_df_sote['Her2'].values, return_counts=True )
print([ (uqs[k], cs[k]) for k in range(uqs.shape[0]) ] )

In [None]:
cs[2], np.unique( labels_df_sote['Her2'].values )[2]

In [None]:
cs[3], np.unique( labels_df_sote['Her2'].values )[14]

In [17]:
# checking
((labels_df_sote['ER%'].values == '0') | (labels_df_sote['ER%'].values == '100') &\
(labels_df_sote['PR%'].values == '0') | (labels_df_sote['PR%'].values == '100') &\
(labels_df_sote['Her2'].values == '0') | (labels_df_sote['Her2'].values == 'neg') ).sum()

np.int64(159)

In [18]:
np.unique( labels_df_sote['HER2 FISH'].values, return_counts=True )

(array(['-', 'ampl', 'ampl.', 'no ampl.', 'no ampl. , 17 triszomia'],
       dtype=object),
 array([245,   2,  28,  53,   1]))

In [None]:
labels_df_sote['Patients age'].describe()

In [20]:
biopsy_df_sote.shape, labels_df_sote.shape

((286, 19), (329, 18))

In [21]:
internal_test_df = pd.read_csv("cv_splits_sote_multi_stratified_sklearn_s_a_paper_patients_rev/test_split_multi_stratified.csv")
internal_test_df.shape

(72, 19)

In [22]:
training_df = biopsy_df_sote[~biopsy_df_sote.base_slide_number.isin(internal_test_df.base_slide_number)]
training_df.shape

(214, 19)

In [23]:
patients_unique, patients_count = np.unique(biopsy_df_sote.base_slide_number, return_counts=True)
patients_unique.shape, patients_count.shape

((286,), (286,))

In [24]:
slides_count = biopsy_df_sote['Slide number'].str.split(',').explode().count()
slides_count

np.int64(329)

In [25]:
patients_unique_training, patients_count_training = np.unique(training_df.base_slide_number, return_counts=True)
patients_unique_training.shape, patients_count_training.shape

((214,), (214,))

In [26]:
slides_count_training = training_df['Slide number'].str.split(',').explode().count()
slides_count_training

np.int64(247)

In [27]:
patients_unique_internal_test, patients_count_internal_test = np.unique(internal_test_df.base_slide_number, return_counts=True)
patients_unique_internal_test.shape, patients_count_internal_test.shape

((72,), (72,))

In [28]:
slides_count_internal_test = internal_test_df['Slide number'].str.split(',').explode().count()
slides_count_internal_test

np.int64(82)

In [29]:
print("SEMMELWEIS DATASET:")
print("\nTOTAL PATIENTS:   ", patients_count_sote.shape[0])
print("TOTAL BIOPSY BAGS:", patients_count_sote.shape[0])
print("TOTAL SLIDES:     ", slides_count)
print_biopsybag_and_patient_stats_sote(biopsy_df_sote)


print("\nTRAINING SET:")
print("\nTOTAL PATIENTS:   ", patients_count_training.shape[0])
print("TOTAL BIOPSY BAGS:", patients_count_training.shape[0])
print("TOTAL SLIDES:     ", slides_count_training.sum())
print_biopsybag_and_patient_stats_sote(training_df)

print("\nINETRNAL TEST SET:")
print("\nTOTAL PATIENTS:   ", patients_count_internal_test.shape[0])
print("TOTAL BIOPSY BAGS:", patients_count_internal_test.shape[0])
print("TOTAL SLIDES:     ", slides_count_internal_test.sum())
print_biopsybag_and_patient_stats_sote(internal_test_df)

SEMMELWEIS DATASET:

TOTAL PATIENTS:    286
TOTAL BIOPSY BAGS: 286
TOTAL SLIDES:      329
Biopsy bags - stages: [1 2 3], counts:  [ 84 145  57],         total: 286
Biopsy bags - stages: [1 2 3], percent: [29.37 50.7  19.93], total: 100.0
Patients    - stages: [1 2 3], counts:  [ 84 145  57],         total: 286
Patients    - stages: [1 2 3], percent: [29.37 50.7  19.93], total: 100.0

TRAINING SET:

TOTAL PATIENTS:    214
TOTAL BIOPSY BAGS: 214
TOTAL SLIDES:      247
Biopsy bags - stages: [1 2 3], counts:  [ 63 108  43],         total: 214
Biopsy bags - stages: [1 2 3], percent: [29.44 50.47 20.09], total: 100.0
Patients    - stages: [1 2 3], counts:  [ 63 108  43],         total: 214
Patients    - stages: [1 2 3], percent: [29.44 50.47 20.09], total: 100.0

INETRNAL TEST SET:

TOTAL PATIENTS:    72
TOTAL BIOPSY BAGS: 72
TOTAL SLIDES:      82
Biopsy bags - stages: [1 2 3], counts:  [21 37 14],         total: 72
Biopsy bags - stages: [1 2 3], percent: [29.17 51.39 19.44], total: 100.0
Pa