"""
Copyright 2026 Zsolt Bedőházi, András M. Biricz

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    https://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""

In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm

from datetime import datetime
from dateutil.relativedelta import relativedelta
from collections import Counter

import ast

# Nightingale DATASET

In [2]:
# Load the merged table from CSV.
merged_df_latest_with_excluded_slides_cluster_tsne_1_2_3 = pd.read_csv("merged_df_latest_with_excluded_slides_cluster_tsne_1_2_3.csv")
# slide_id is read back as text; literal_eval turns each cell into the Python
# object it spells out (later cells explode/concatenate these values as lists).
merged_df_latest_with_excluded_slides_cluster_tsne_1_2_3['slide_id'] = merged_df_latest_with_excluded_slides_cluster_tsne_1_2_3['slide_id'].apply(ast.literal_eval)

In [3]:
merged_df_latest_with_excluded_slides_cluster_tsne_1_2_3.shape

(713, 51)

In [4]:
patients_unique, patients_count = np.unique(merged_df_latest_with_excluded_slides_cluster_tsne_1_2_3.patient_ngsci_id, return_counts=True)
patients_unique.shape, patients_count.shape

((574,), (574,))

In [5]:
biopsies_unique, biopsies_count = np.unique(merged_df_latest_with_excluded_slides_cluster_tsne_1_2_3.biopsy_id, return_counts=True)
biopsies_unique.shape, biopsies_count.shape, len(merged_df_latest_with_excluded_slides_cluster_tsne_1_2_3.biopsy_id.values)

((713,), (713,), 713)

In [6]:
# Total number of slides: explode() puts every element of each slide_id list on
# its own row, count() tallies the non-null entries. The result is a scalar.
slide_id_counts = merged_df_latest_with_excluded_slides_cluster_tsne_1_2_3['slide_id'].explode().count()
slide_id_counts

np.int64(9489)

In [7]:
def print_biopsybag_and_patient_stats(data_df):
    """Print stage counts and percentages per biopsy bag and per patient.

    Biopsy-bag figures use every row of ``data_df``; patient figures use the
    first row found for each ``patient_ngsci_id``. Stages are read from the
    ``Pathological`` column. Nothing is returned.
    """

    def _report(prefix, stage_values):
        # Two lines per group: raw counts first, then percentages.
        stages, counts = np.unique(stage_values, return_counts=True)
        print(f"{prefix} - stages: {stages}, counts:  {counts},         total: {counts.sum()}")

        percentages = np.round((counts / counts.sum()) * 100, 2)
        # The last stage absorbs the rounding remainder.
        percentages[-1] = 100 - percentages[:-1].sum()
        print(f"{prefix} - stages: {stages}, percent: {percentages}, total: {percentages.sum()}")

    _report("Biopsy bags", data_df.Pathological.values)

    first_row_per_patient = data_df.drop_duplicates(subset='patient_ngsci_id', keep='first')
    _report("Patients   ", first_row_per_patient.Pathological.values)

In [8]:
print("STUDY/CONTEST DATASET:")
print("\nTOTAL PATIENTS:   ", patients_count.shape[0])
print("TOTAL BIOPSY BAGS:", biopsies_count.shape[0])
print("TOTAL SLIDES:     ", slide_id_counts.sum())
print_biopsybag_and_patient_stats(merged_df_latest_with_excluded_slides_cluster_tsne_1_2_3)

STUDY/CONTEST DATASET:

TOTAL PATIENTS:    574
TOTAL BIOPSY BAGS: 713
TOTAL SLIDES:      9489
Biopsy bags - stages: [1. 2. 3.], counts:  [586  90  37],         total: 713
Biopsy bags - stages: [1. 2. 3.], percent: [82.19 12.62  5.19], total: 100.0
Patients    - stages: [1. 2. 3.], counts:  [478  70  26],         total: 574
Patients    - stages: [1. 2. 3.], percent: [83.28 12.2   4.52], total: 100.0


## CHARACTERISTICS FOR THE Nightingale DATASET

In [10]:
# Collapse the table to one row per patient: biopsy ids are collected into a
# list, slide lists are merged, and every other column keeps its first value.
merged_df_latest_with_excluded_slides_cluster_tsne_1_2_3 = merged_df_latest_with_excluded_slides_cluster_tsne_1_2_3.groupby('patient_ngsci_id').agg({
    'biopsy_id': list,
    # Flatten the per-biopsy slide lists in a single pass; sum(s, []) would
    # re-copy the growing list once per biopsy.
    'slide_id': lambda s: [slide for slides in s for slide in slides],
    **{col: 'first' for col in merged_df_latest_with_excluded_slides_cluster_tsne_1_2_3.columns
       if col not in ['patient_ngsci_id', 'biopsy_id', 'slide_id']}
}).reset_index()

In [11]:
merged_df_latest_with_excluded_slides_cluster_tsne_1_2_3.shape

(574, 51)

In [12]:
np.array(merged_df_latest_with_excluded_slides_cluster_tsne_1_2_3['slide_id'].explode()).shape

(9489,)

## stage 

In [13]:
Counter(merged_df_latest_with_excluded_slides_cluster_tsne_1_2_3.stage)

Counter({1: 462, 2: 82, 3: 30})

## Clinical information 

## Sex

In [14]:
merged_df_latest_with_excluded_slides_cluster_tsne_1_2_3.shape[0]

574

## Age

In [15]:
def analyze_age(df):
    """Count rows with unknown (zero) and known (non-zero) age and summarise the latter.

    Returns ``(unknown_age_count, known_age_count, age_description)`` where
    ``age_description`` is ``describe()`` of the non-zero ``age`` values.
    """
    rows_without_age = df[df.age == 0]
    rows_with_age = df[df.age != 0]
    return len(rows_without_age), len(rows_with_age), rows_with_age.age.describe()

In [16]:
unknown_age_count, known_age_count, age_description = analyze_age(merged_df_latest_with_excluded_slides_cluster_tsne_1_2_3)

# Display the results
print(f"Count of unknown ages (age = 0): {unknown_age_count}")
print(f"Count of known ages (age != 0): {known_age_count}")
print("\nAge Description for non-zero ages:")
print(age_description)

Count of unknown ages (age = 0): 9
Count of known ages (age != 0): 565

Age Description for non-zero ages:
count    565.000000
mean      62.171681
std       11.230211
min       26.000000
25%       55.000000
50%       63.000000
75%       71.000000
max       86.000000
Name: age, dtype: float64


## Mortality

In [18]:
def analyze_mortality(df):
    """Summarise the ``mortality`` column of ``df``.

    Returns ``(mortality_count, non_mortality_count, mortality_rate,
    non_mortality_rate)``: the column sum, the row count minus that sum, and
    both values divided by the row count.
    """
    n_rows = len(df.mortality)
    deaths = df.mortality.sum()
    survivors = n_rows - deaths
    return deaths, survivors, deaths / n_rows, survivors / n_rows

In [19]:
mortality_count, non_mortality_count, mortality_rate, non_mortality_rate = analyze_mortality(merged_df_latest_with_excluded_slides_cluster_tsne_1_2_3)

print(f"Mortality Count: {mortality_count}")
print(f"Non-Mortality Count: {non_mortality_count}")
print(f"Mortality Rate: {mortality_rate:.2%}")
print(f"Non-Mortality Rate: {non_mortality_rate:.2%}")

Mortality Count: 47
Non-Mortality Count: 527
Mortality Rate: 8.19%
Non-Mortality Rate: 91.81%


## Survival time - may be removed from the table: biopsy_dt is specific to a single biopsy, but biopsies were merged per patient (only the first biopsy's biopsy_dt is kept)

In [20]:
def analyze_followup_death(df):
    """Summarise the time between ``biopsy_dt`` and ``death_dt`` in months.

    Returns ``(non_missing_count, missing_count, followup_death_description)``.
    Rows where the difference cannot be computed count as missing; the
    description covers the remaining rows only.
    """
    elapsed = pd.to_datetime(df.death_dt) - pd.to_datetime(df.biopsy_dt)
    has_followup = elapsed.notna()

    # Whole days are converted to months using 30.44 days per month.
    months = elapsed[has_followup].dt.days / 30.44

    return has_followup.sum(), elapsed.isna().sum(), months.describe()

In [21]:
non_missing_count, missing_count, followup_death_description = analyze_followup_death(merged_df_latest_with_excluded_slides_cluster_tsne_1_2_3)

# Display the results
print(f"Non-missing follow-up count: {non_missing_count}")
print(f"Missing follow-up count: {missing_count}")
print("\nFollow-up time in months (for non-missing values):")
print(followup_death_description)

Non-missing follow-up count: 47
Missing follow-up count: 527

Follow-up time in months (for non-missing values):
count    47.000000
mean     38.099365
std      19.723964
min       1.314060
25%      28.285151
50%      38.173456
75%      49.556505
max      88.272011
dtype: float64


## DEMOGRAPHICS 

## Race

In [22]:
def analyze_race(df):
    """Tabulate counts and percentage shares of the race codes in ``df.race``.

    Prints a formatted table and returns a list of
    ``(race_name, count, proportion)`` tuples, one per known race code in
    ascending code order. Proportions are percentages of all rows in ``df``,
    rounded to two decimals.

    When every row carries a known code, the last category receives the
    rounding remainder so the shares add up to 100. The adjustment is skipped
    for an empty frame (all shares are 0) and when some rows carry codes that
    are missing from the mapping, so such rows are never credited to the last
    category.
    """
    # Define the mapping of race codes to descriptions
    race_mapping = {
        1: "White or Caucasian",
        2: "Black or African American",
        3: "American Indian or Alaska Native",
        4: "Asian",
        5: "Native Hawaiian or Pacific Islander",
        8: "Other",
        9: "Unknown"
    }

    # Count the occurrences of each race code
    race_counts = Counter(df.race)
    total_count = df.shape[0]

    codes = sorted(race_mapping)
    counts = [race_counts.get(code, 0) for code in codes]

    if total_count > 0:
        proportions = [round(count / total_count * 100, 2) for count in counts]
        # Only force the total to 100 when the listed categories really cover
        # every row; otherwise the remainder belongs to unmapped codes.
        if sum(counts) == total_count:
            proportions[-1] = round(100 - sum(proportions[:-1]), 2)
    else:
        proportions = [0.0] * len(codes)

    results = [
        (race_mapping[code], count, proportion)
        for code, count, proportion in zip(codes, counts, proportions)
    ]

    # Print the results in a readable format
    print(f"{'Race':<40}{'Count':<10}{'Proportion (%)':<15}")
    print("-" * 65)
    for race_name, count, proportion in results:
        print(f"{race_name:<40}{count:<10}{proportion:.2f}")

    return results

In [23]:
race_results = analyze_race(merged_df_latest_with_excluded_slides_cluster_tsne_1_2_3)

Race                                    Count     Proportion (%) 
-----------------------------------------------------------------
White or Caucasian                      499       86.93
Black or African American               4         0.70
American Indian or Alaska Native        1         0.17
Asian                                   23        4.01
Native Hawaiian or Pacific Islander     0         0.00
Other                                   23        4.01
Unknown                                 24        4.18


## Ethnicity

In [24]:
def analyze_ethnicity(df):
    """Tabulate counts and percentage shares of the codes in ``df.ethnicity``.

    Prints a formatted table and returns a list of
    ``(ethnicity_name, count, proportion)`` tuples, one per known ethnicity
    code in ascending code order. Proportions are percentages of all rows in
    ``df``, rounded to two decimals.

    When every row carries a known code, the last category receives the
    rounding remainder so the shares add up to 100. The adjustment is skipped
    for an empty frame (all shares are 0) and when some rows carry codes that
    are missing from the mapping, so such rows are never credited to the last
    category.
    """
    # Define the mapping of ethnicity codes to descriptions
    ethnicity_mapping = {
        0: "Non-Hispanic or Latino",
        1: "Hispanic or Latino",
        9: "Unknown"
    }

    # Count the occurrences of each ethnicity code
    ethnicity_counts = Counter(df.ethnicity)
    total_count = df.shape[0]

    codes = sorted(ethnicity_mapping)
    counts = [ethnicity_counts.get(code, 0) for code in codes]

    if total_count > 0:
        proportions = [round(count / total_count * 100, 2) for count in counts]
        # Only force the total to 100 when the listed categories really cover
        # every row; otherwise the remainder belongs to unmapped codes.
        if sum(counts) == total_count:
            proportions[-1] = round(100 - sum(proportions[:-1]), 2)
    else:
        proportions = [0.0] * len(codes)

    results = [
        (ethnicity_mapping[code], count, proportion)
        for code, count, proportion in zip(codes, counts, proportions)
    ]

    # Print the results in a readable format
    print(f"{'Ethnicity':<30}{'Count':<10}{'Proportion (%)':<15}")
    print("-" * 55)
    for ethnicity_name, count, proportion in results:
        print(f"{ethnicity_name:<30}{count:<10}{proportion:.2f}")

    return results

In [25]:
ethnicity_results = analyze_ethnicity(merged_df_latest_with_excluded_slides_cluster_tsne_1_2_3)

Ethnicity                     Count     Proportion (%) 
-------------------------------------------------------
Non-Hispanic or Latino        510       88.85
Hispanic or Latino            24        4.18
Unknown                       40        6.97


## Mortality vs stage

In [26]:
def analyze_mortality_by_stage(df, stages=(1, 2, 3)):
    """Print and return mortality figures per stage and for the whole frame.

    Args:
        df: DataFrame with a ``Pathological`` column (compared against each
            entry of ``stages``) and a ``mortality`` column (summed).
        stages: Stage values to report, in output order. Any iterable works;
            the default is an immutable tuple.

    Returns:
        ``(stage_results, totals)`` where ``stage_results`` is a list of
        ``(stage, count, mortality, proportion)`` tuples and ``totals`` is
        ``(total_entries, total_mortality, total_proportion)``. Proportions
        are fractions (not percentages) and are 0 when the count is 0.
    """
    total_entries = df.mortality.shape[0]
    total_mortality = df.mortality.sum()
    total_proportion = total_mortality / total_entries if total_entries > 0 else 0

    # Analyze each specified stage
    stage_results = []
    for stage in stages:
        in_stage = (df.Pathological.values == stage)
        stage_count = in_stage.sum()
        stage_mortality = df[in_stage].mortality.sum()
        stage_proportion = stage_mortality / stage_count if stage_count > 0 else 0
        stage_results.append((stage, stage_count, stage_mortality, stage_proportion))

    # Print results for each stage
    print(f"{'Stage':<10}{'Count':<10}{'Mortality':<10}{'Proportion (%)':<15}")
    print("-" * 50)
    for stage, count, mortality, proportion in stage_results:
        print(f"{stage:<10}{count:<10}{mortality:<10}{proportion:.2%}")

    # Print total results for the entire dataset
    print("\nTotal")
    print(f"{'Count':<10}{'Mortality':<10}{'Proportion (%)':<15}")
    print("-" * 30)
    print(f"{total_entries:<10}{total_mortality:<10}{total_proportion:.2%}")

    return stage_results, (total_entries, total_mortality, total_proportion)

In [27]:
stage_results, total_results = analyze_mortality_by_stage(merged_df_latest_with_excluded_slides_cluster_tsne_1_2_3)

Stage     Count     Mortality Proportion (%) 
--------------------------------------------------
1         478       24        5.02%
2         70        9         12.86%
3         26        14        53.85%

Total
Count     Mortality Proportion (%) 
------------------------------
574       47        8.19%


# Semmelweis DATASET

In [28]:
biopsy_df_sote = pd.read_csv('private_external_test_set/biopsy_df.csv')
labels_df_sote = pd.read_csv('private_external_test_set/labels_df.csv')
biopsy_df_sote.shape, labels_df_sote.shape

((286, 17), (329, 18))

In [None]:
biopsy_df_sote

In [None]:
labels_df_sote

In [31]:
patients_unique_sote, patients_count_sote = np.unique(biopsy_df_sote.base_slide_number, return_counts=True)
patients_unique_sote.shape, patients_count_sote.shape

((286,), (286,))

In [32]:
# NOTE(review): biopsy bags are counted on base_slide_number, the same column
# used for the patient count in the previous cell, so both totals always match
# — confirm whether a separate biopsy identifier should be used here.
biopsies_unique_sote, biopsies_count_sote = np.unique(biopsy_df_sote.base_slide_number, return_counts=True)
biopsies_unique_sote.shape, biopsies_count_sote.shape, len(biopsy_df_sote)

((286,), (286,), 286)

In [33]:
slide_id_counts_sote = len(labels_df_sote)
slide_id_counts_sote

329

In [34]:
def print_biopsybag_and_patient_stats_sote(biopsy_df_sote, labels_df_sote):
    """Print stage counts and percentages for the Semmelweis biopsy table.

    The "Biopsy bags" and "Patients" lines are both computed from the
    ``clinical_stage`` column of ``biopsy_df_sote``, one row each.
    ``labels_df_sote`` is accepted but not used. Nothing is returned.
    """
    for prefix in ("Biopsy bags", "Patients   "):
        stages, counts = np.unique(biopsy_df_sote.clinical_stage.values, return_counts=True)
        print(f"{prefix} - stages: {stages}, counts:  {counts},         total: {counts.sum()}")

        percentages = np.round((counts / counts.sum()) * 100, 2)
        # The last stage absorbs the rounding remainder.
        percentages[-1] = 100 - percentages[:-1].sum()
        print(f"{prefix} - stages: {stages}, percent: {percentages}, total: {percentages.sum()}")

In [35]:
print("PRIVATE TEST SET - SEMMELWEIS:")
print("\nTOTAL PATIENTS:   ", patients_count_sote.shape[0])
print("TOTAL BIOPSY BAGS:", biopsies_count_sote.shape[0])
print("TOTAL SLIDES:     ", slide_id_counts_sote)
print_biopsybag_and_patient_stats_sote(biopsy_df_sote, labels_df_sote)

PRIVATE TEST SET - SEMMELWEIS:

TOTAL PATIENTS:    286
TOTAL BIOPSY BAGS: 286
TOTAL SLIDES:      329
Biopsy bags - stages: [1 2 3], counts:  [ 84 145  57],         total: 286
Biopsy bags - stages: [1 2 3], percent: [29.37 50.7  19.93], total: 100.0
Patients    - stages: [1 2 3], counts:  [ 84 145  57],         total: 286
Patients    - stages: [1 2 3], percent: [29.37 50.7  19.93], total: 100.0


In [None]:
list(labels_df_sote)

In [37]:
np.unique( labels_df_sote.Grade.values, return_counts=True )

(array([1, 2, 3]), array([ 60, 135, 134]))

In [None]:
np.unique( labels_df_sote['ER%'].values, return_counts=True )

In [None]:
np.unique( labels_df_sote['PR%'].values, return_counts=True )

In [None]:
uqs, cs = np.unique( labels_df_sote['Her2'].values, return_counts=True )
print([ (uqs[k], cs[k]) for k in range(uqs.shape[0]) ] )

In [None]:
cs[2], np.unique( labels_df_sote['Her2'].values )[2]

In [None]:
# NOTE(review): the count index (3) and the unique-value index (14) differ,
# unlike the matching indices (2 and 2) in the previous cell — confirm that
# this pairing is intended.
cs[3], np.unique( labels_df_sote['Her2'].values )[14]

In [None]:
# checking
# Count rows where ER% is '0' or '100', AND PR% is '0' or '100', AND Her2 is
# '0' or 'neg'. Each pair of alternatives is parenthesised because `&` binds
# tighter than `|`; without the parentheses the three conditions are not
# AND-ed together.
(((labels_df_sote['ER%'].values == '0') | (labels_df_sote['ER%'].values == '100')) &
 ((labels_df_sote['PR%'].values == '0') | (labels_df_sote['PR%'].values == '100')) &
 ((labels_df_sote['Her2'].values == '0') | (labels_df_sote['Her2'].values == 'neg'))).sum()

In [None]:
np.unique( labels_df_sote['HER2 FISH'].values, return_counts=True )

In [None]:
labels_df_sote['Patients age'].describe()

# TCGA-BRCA TEST SET

In [46]:
biopsy_df_tcga = pd.read_csv('tcga_brca_test_set/biopsy_df.csv')
labels_df_tcga = pd.read_csv('tcga_brca_test_set/labels_df.csv')
biopsy_df_tcga.shape, labels_df_tcga.shape

((678, 2), (731, 3))

In [47]:
biopsy_df_tcga

Unnamed: 0,biopsy_id,stage
0,TCGA-3C-AALI,2
1,TCGA-3C-AALJ,2
2,TCGA-3C-AALK,1
3,TCGA-4H-AAAK,3
4,TCGA-5T-A9QA,2
...,...,...
673,TCGA-S3-AA15,2
674,TCGA-UL-AAZ6,2
675,TCGA-XX-A899,3
676,TCGA-XX-A89A,2


In [48]:
labels_df_tcga

Unnamed: 0,slide_id,biopsy_id,stage
0,TCGA-3C-AALI-01Z-00-DX1.F6E9A5DF-D8FB-45CF-B4B...,TCGA-3C-AALI,2
1,TCGA-3C-AALI-01Z-00-DX2.CF4496E0-AB52-4F3E-BDF...,TCGA-3C-AALI,2
2,TCGA-3C-AALJ-01Z-00-DX1.777C0957-255A-42F0-9EE...,TCGA-3C-AALJ,2
3,TCGA-3C-AALJ-01Z-00-DX2.62DFE56B-B84C-40F9-962...,TCGA-3C-AALJ,2
4,TCGA-3C-AALK-01Z-00-DX1.4E6EB156-BB19-410F-878...,TCGA-3C-AALK,1
...,...,...,...
726,TCGA-S3-AA15-01Z-00-DX2.915A4F90-25CB-4535-99C...,TCGA-S3-AA15,2
727,TCGA-UL-AAZ6-01Z-00-DX1.0488628B-C06B-4A1D-919...,TCGA-UL-AAZ6,2
728,TCGA-XX-A899-01Z-00-DX1.08FE27B7-73B8-4CE3-ACF...,TCGA-XX-A899,3
729,TCGA-XX-A89A-01Z-00-DX1.671E2AD6-4D1A-4579-88C...,TCGA-XX-A89A,2


In [49]:
patients_unique_tcga, patients_count_tcga = np.unique(biopsy_df_tcga.biopsy_id, return_counts=True)
patients_unique_tcga.shape, patients_count_tcga.shape

((678,), (678,))

In [50]:
# NOTE(review): biopsy bags are counted on biopsy_id, the same column used for
# the patient count in the previous cell, so both totals always match —
# confirm whether a separate patient identifier should be used there.
biopsies_unique_tcga, biopsies_count_tcga = np.unique(biopsy_df_tcga.biopsy_id, return_counts=True)
biopsies_unique_tcga.shape, biopsies_count_tcga.shape, len(biopsy_df_tcga)

((678,), (678,), 678)

In [51]:
slide_id_counts_tcga = len(labels_df_tcga)
slide_id_counts_tcga

731

In [52]:
def print_biopsybag_and_patient_stats_sote(biopsy_df_tcga, labels_df_tcga):
    """Print stage counts and percentages for the TCGA-BRCA biopsy table.

    NOTE: this definition reuses the name of the Semmelweis helper defined
    earlier in the file and replaces it from this point on; it reads the
    ``stage`` column instead of ``clinical_stage``.

    The "Biopsy bags" and "Patients" lines are both computed from the
    ``stage`` column of ``biopsy_df_tcga``. ``labels_df_tcga`` is accepted but
    not used. Nothing is returned.
    """
    for prefix in ("Biopsy bags", "Patients   "):
        stages, counts = np.unique(biopsy_df_tcga.stage.values, return_counts=True)
        print(f"{prefix} - stages: {stages}, counts:  {counts},         total: {counts.sum()}")

        percentages = np.round((counts / counts.sum()) * 100, 2)
        # The last stage absorbs the rounding remainder.
        percentages[-1] = 100 - percentages[:-1].sum()
        print(f"{prefix} - stages: {stages}, percent: {percentages}, total: {percentages.sum()}")

In [53]:
print("TCGA-BRCA TEST SET:")
print("\nTOTAL PATIENTS:   ", patients_count_tcga.shape[0])
print("TOTAL BIOPSY BAGS:", biopsies_count_tcga.shape[0])
print("TOTAL SLIDES:     ", slide_id_counts_tcga)
print_biopsybag_and_patient_stats_sote(biopsy_df_tcga, labels_df_tcga)

TCGA-BRCA TEST SET:

TOTAL PATIENTS:    678
TOTAL BIOPSY BAGS: 678
TOTAL SLIDES:      731
Biopsy bags - stages: [1 2 3], counts:  [122 404 152],         total: 678
Biopsy bags - stages: [1 2 3], percent: [17.99 59.59 22.42], total: 100.0
Patients    - stages: [1 2 3], counts:  [122 404 152],         total: 678
Patients    - stages: [1 2 3], percent: [17.99 59.59 22.42], total: 100.0
