In [None]:
import os
import numpy as np
import pandas as pd

from loguru import logger

# Primary Filtering

In [None]:
# --- load the raw metadata tables
all_patient_info = pd.read_csv("../data/metadata/patient_information_113528_19-9-2024.csv")
normality_data = pd.read_csv("../data/metadata/report_42462_normal_heart_great_vessels.csv")

# --- filter only the studies with normal heart and great vessels
normality_data = normality_data[normality_data["isnormal"] == "Yes"]
normal_studies = list(normality_data["study_id"])

# --- normal data should overlap with patient info
patient_info = all_patient_info[all_patient_info["Study ID"].isin(normal_studies)]

# --- research consent should be True
# Compare on the string form: if pandas parses the column as integers, a direct
# comparison with "1" matches nothing and silently empties the cohort.
patient_info = patient_info[patient_info["Research Consent"].astype(str) == "1"]

# --- remove na values (no inplace: the frame above is a filtered slice)
patient_info = patient_info.dropna(subset=['Age', 'Height', 'Sex', 'Weight', 'Ethnicity'])

# --- filter based on height and weight (both bounds are exclusive)
# NOTE(review): thresholds look like metres and kilograms -- confirm the units.
patient_info = patient_info[(patient_info["Height"] > 1.0) & (patient_info["Height"] < 2.3)]
patient_info = patient_info[(patient_info["Weight"] > 30) & (patient_info["Weight"] < 180)]

# --- ignore ages < 20 and ages >= 80
patient_info = patient_info[(patient_info["Age"] >= 20) & (patient_info["Age"] < 80)]

# Reset the index once, after every row filter, so it is contiguous (0..n-1).
# This also yields a fresh frame, so the column assignment below does not
# write into a slice of `all_patient_info`.
patient_info = patient_info.reset_index(drop=True)

# --- adding the age group
# Left-closed decade bins: [10, 20), [20, 30), ..., [70, 80), [80, inf).
# Because of the age filter above, "10-19" and "80+" remain as empty categories.
bins = list(range(10, 90, 10)) + [np.inf]
labels = [f"{i}-{i+9}" for i in bins[:-2]] + ['80+']
patient_info['Age Group'] = pd.cut(patient_info['Age'], bins=bins, labels=labels, right=False)

# --- print some output
logger.info("Number of all filtered patients:" + str(len(patient_info)) + "\n")
logger.info("Patient info head:\n" + patient_info.head().to_string() + "\n")
# observed=False keeps the empty age categories in the table (count 0) and makes
# the output independent of the pandas default for categorical groupers.
subgroup_counts = patient_info.groupby(["Age Group", "Sex"], observed=False).size()
logger.info("Subgroup counts:\n" + subgroup_counts.to_string() + "\n")

# Sample Train and Test data

In [None]:
def sample_by_age_sex(df: pd.DataFrame, n_max: int = 700, seed: int = 42) -> pd.DataFrame:
    """
    Deterministically (with seed) sample rows stratified by Age Group and Sex.

    The frame is shuffled once with `seed`; then, for each (Age Group, Sex)
    group, the first min(n_max, int(0.75 * group size)) shuffled rows are kept.
    Rows whose 'Age Group' or 'Sex' is missing belong to no group and are
    therefore never sampled.

    Parameters:
        df (pd.DataFrame): Input DataFrame with 'Age Group' and 'Sex' columns.
            All other columns are carried through unchanged.
        n_max (int): Max number of samples per group. Default is 700.
        seed (int): Random seed for reproducibility. Default is 42.

    Returns:
        pd.DataFrame: Sampled rows, ordered by group (Age Group, then Sex), with
        every input column preserved and a fresh 0..n-1 index. Empty when the
        input is empty or when no group contributes a row.
    """
    # Nothing to sample from: return an empty frame with the same columns.
    if df.empty:
        return df.reset_index(drop=True)

    # Shuffle dataframe with the given seed
    df_shuffled = df.sample(frac=1, random_state=seed).reset_index(drop=True)

    # Take the head of every group explicitly instead of using groupby.apply:
    # recent pandas versions deprecate applying over the grouping columns and
    # newer ones exclude them, which would drop 'Age Group' and 'Sex' from the
    # result. observed=True only skips categorical combinations with no rows,
    # which contribute nothing anyway.
    picked = []
    for _, group in df_shuffled.groupby(['Age Group', 'Sex'], observed=True):
        n = min(n_max, int(0.75 * len(group)))
        piece = group.head(n)
        if len(piece):
            picked.append(piece)

    # Every group was too small (or all keys were missing): keep the schema.
    if not picked:
        return df_shuffled.iloc[0:0]

    return pd.concat(picked).reset_index(drop=True)


In [None]:
# Draw the training sample with the function defaults (n_max=700, seed=42).
sample_train = sample_by_age_sex(patient_info)

# Make sure the output folder exists; to_csv does not create missing directories.
os.makedirs("../data/samples", exist_ok=True)
sample_train.to_csv("../data/samples/train_data.csv", index=False)

# Rows per (Age Group, Sex) in the training sample; observed=False also lists
# empty age categories and pins the output across pandas versions.
sample_train.groupby(["Age Group", "Sex"], observed=False).size()

In [None]:
# Candidates for the test set: every filtered study that is not in the training sample.
# NOTE(review): exclusion is by "Study ID" only. If one patient can have several
# studies, the same patient may end up in both train and test -- confirm, and
# exclude by a patient-level identifier if so.
df_train_excluded = patient_info[~patient_info["Study ID"].isin(sample_train["Study ID"])]
sample_test = sample_by_age_sex(df_train_excluded, n_max=200, seed=42)

# Sanity check: no study may appear in both splits.
assert not sample_test["Study ID"].isin(sample_train["Study ID"]).any()

# Make sure the output folder exists; to_csv does not create missing directories.
os.makedirs("../data/samples", exist_ok=True)
sample_test.to_csv("../data/samples/test_data.csv", index=False)

# Rows per (Age Group, Sex) in the test sample; observed=False also lists
# empty age categories and pins the output across pandas versions.
sample_test.groupby(["Age Group", "Sex"], observed=False).size()