In [None]:
import numpy as np
import pandas as pd

In [None]:
# # This Python 3 environment comes with many helpful analytics libraries installed
# # It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# # For example, here's several helpful packages to load

# import numpy as np # linear algebra
# import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# # Input data files are available in the read-only "../input/" directory
# # For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         # print(os.path.join(dirname, filename))

# # You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# # You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the per-series metadata CSV and preview its first rows.
train_series_meta = pd.read_csv(
    filepath_or_buffer='/kaggle/input/rsna-2023-abdominal-trauma-detection/train_series_meta.csv'
)
train_series_meta.head(n=5)

In [None]:
# Load the image-level label CSV and preview its first rows.
image_level_labels_2024 = pd.read_csv(
    filepath_or_buffer='/kaggle/input/rsna-2023-abdominal-trauma-detection/image_level_labels_2024.csv'
)
image_level_labels_2024.head(n=5)

In [None]:
import keras_cv
import keras_core as keras
from keras_core import layers

import numpy as np
import pandas as pd
import tensorflow as tf
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split

# **EDA**

Reference Notebook: https://www.kaggle.com/code/aritrag/eda-train-csv

**Key observations related to the data**:
* **Healthy columns correlation**: The correlations between different health columns are generally small, indicating that the healthy state of one organ might not be strongly related to the healthy state of other organs.
* **Injury correlation:** There are some specific correlations between individual injury types, such as the correlation between extravasation_injury and spleen_high. However, the correlation between different organs' injuries is generally low, which might suggest that injuries to different organs occur independently of each other.
 
* **Relation between classes**: bowel_injury and bowel_healthy are complementary. Their probabilities add up to 1.0.
Similarly, extravasation_injury and extravasation_healthy are complementary.

* **Simplification**: For the model, only {bowel/extravasation}_injury will be included. The model predicts the injury probability p with a sigmoid, and the corresponding healthy probability is its complement, 1 - p.

* **Softmax**: {kidney/liver/spleen}_{healthy/low/high} classifications are softmaxed, ensuring their combined probabilities sum up to 1.0 for each organ, simplifying the model while preserving essential information.

# **Configuration Class**

In [None]:

class Config:
    """Constants and hyper-parameters shared by the cells of this notebook."""

    # Seed used for reproducibility.
    SEED = 42

    # Image and training-loop settings.
    IMAGE_SIZE = [256, 256]
    BATCH_SIZE = 64
    EPOCHS = 10

    # Lets tf.data choose the degree of parallelism / prefetching itself.
    AUTOTUNE = tf.data.AUTOTUNE

    # Label columns used as targets: a single injury column for bowel and
    # extravasation, and healthy/low/high columns for kidney, liver and spleen.
    TARGET_COLS = [
        "bowel_injury",
        "extravasation_injury",
        "kidney_healthy",
        "kidney_low",
        "kidney_high",
        "liver_healthy",
        "liver_low",
        "liver_high",
        "spleen_healthy",
        "spleen_low",
        "spleen_high",
    ]


config = Config()

# Reproducibility

In [None]:
# Seed the random number generators with the configured seed.
keras.utils.set_random_seed(config.SEED)

# Data set

This train.csv file contains details about the dataset, stored in a table-like format. Here's what each column means:

**patient_id:** A unique identifier for each patient. This ensures that each patient’s data can be easily tracked.

**series_id**: A unique identifier for each scan (series of images) for a patient. A patient can have multiple scans.

**instance_number**: The specific image number in a scan. Medical scans consist of multiple "slices," and this number helps identify them.

**[bowel/extravasation]_[healthy/injury]:** These columns indicate whether the patient has bowel or extravasation injuries (binary targets: 0 for healthy, 1 for injury).

**[kidney/liver/spleen]_[healthy/low/high]:** These columns specify the injury severity for organs:
* healthy: No injury
* low: Mild injury
* high: Severe injury

**any_injury:** A binary column indicating if the patient has any injury at all (1 for injured, 0 for healthy).

In [None]:
# Root directory of the competition data; the CSV and image paths below are built from it.
BASE_PATH = "/kaggle/input/rsna-2023-abdominal-trauma-detection"

In [None]:
# train
# Load the image-level labels and attach, for every row, the path of its PNG:
#   <BASE_PATH>/train_images/<patient_id>/<series_id>/<instance_number>.png
image_level_labels_2024_dataframe = pd.read_csv(f"{BASE_PATH}/image_level_labels_2024.csv")

# The path components must come from the frame that was just loaded; no
# variable named `dataframe` exists in this notebook.
image_level_labels_2024_dataframe["image_path"] = (
    f"{BASE_PATH}/train_images"
    + "/" + image_level_labels_2024_dataframe.patient_id.astype(str)
    + "/" + image_level_labels_2024_dataframe.series_id.astype(str)
    + "/" + image_level_labels_2024_dataframe.instance_number.astype(str)
    + ".png"
)

# Remove fully duplicated rows (the image_path column is included in the comparison).
image_level_labels_2024_dataframe = image_level_labels_2024_dataframe.drop_duplicates()

image_level_labels_2024_dataframe.head(2)


# Dataset
The dataset provided in the competition consists of DICOM images. We will not train on the DICOM images directly; instead, we work with PNG images extracted from the DICOM files.

[A helpful resource on the conversion of DICOM to PNG](https://www.kaggle.com/code/radek1/how-to-process-dicom-images-to-pngs)

We split the training dataset into train and validation. This is a common practice in Machine Learning pipelines. We not only want to train our model, but also want to validate its training.

A small catch here is that the training and validation data should have an aligned data distribution. Here we handle that by grouping the labels and then splitting the dataset. This ensures an aligned data distribution between the training and the validation splits.

In [None]:
# Load the table that is split into train and validation sets below.
train_2024_dataframe = pd.read_csv(BASE_PATH + "/train_2024.csv")

In [None]:
# Function to handle the split for each group
def split_group(group, test_size=0.2, random_state=42):
    """Split one group of rows into a (train, validation) pair of DataFrames.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows to split.
    test_size : float, default 0.2
        Fraction of rows assigned to validation. For a group that cannot be
        divided (fewer than two rows) it is the probability that the whole
        group is assigned to validation.
    random_state : int, default 42
        Seed forwarded to ``train_test_split`` for groups with two or more rows.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(train_rows, validation_rows)``. The side that receives nothing is an
        empty frame that keeps the columns of ``group``.
    """
    if len(group) <= 1:
        # Keep the schema on the empty side so later concatenation sees
        # matching columns.
        empty = group.iloc[0:0]
        # Draw from NumPy's global RNG: validation is chosen with probability
        # `test_size`, train with probability 1 - test_size.
        goes_to_validation = np.random.rand() < test_size
        return (empty, group) if goes_to_validation else (group, empty)

    return train_test_split(group, test_size=test_size, random_state=random_state)

# Split every label-combination group on its own so that train and validation
# end up with the same label distribution. The pieces are collected in lists
# and concatenated once at the end; concatenating inside the loop would
# re-copy all previously accumulated rows on every iteration.
train_parts = []
val_parts = []

# Iterate through the groups and split them, handling single-sample groups
for _, group in train_2024_dataframe.groupby(config.TARGET_COLS):
    train_group, val_group = split_group(group)
    # An empty half contributes no rows, so it is left out of the concat.
    if len(train_group):
        train_parts.append(train_group)
    if len(val_group):
        val_parts.append(val_group)

# pd.concat raises on an empty list, so fall back to an empty frame.
train_data = pd.concat(train_parts, ignore_index=True) if train_parts else pd.DataFrame()
val_data = pd.concat(val_parts, ignore_index=True) if val_parts else pd.DataFrame()

In [None]:
# Show (rows, columns) of the train and validation splits.
(train_data.shape, val_data.shape)