#### Train Test Split Whole Dataset
##### 1. Interative stratified split

In [3]:
import os
import numpy as np
import pandas as pd
import logging
logging.basicConfig(level=logging.INFO)


import warnings
warnings.filterwarnings('ignore')

In [4]:
import torch

print(f"CUDA available: {torch.cuda.is_available()}")
print(f"CUDA device count: {torch.cuda.device_count()}")
if torch.cuda.is_available():
    print(f"CUDA device name: {torch.cuda.get_device_name(0)}")
else:
    print("No GPU detected")

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


CUDA available: True
CUDA device count: 2
CUDA device name: NVIDIA A100-SXM4-80GB


In [6]:
#----------------------------------------------------------------------------------------------------------------------
## lOAD AND CLEAN DATA ##
#----------------------------------------------------------------------------------------------------------------------
original_path = 'path/to/data.csv' 
data = pd.read_csv(original_path)


Unnamed: 0,Text_desc,Active_bleeding_from_named_vessel,Active_bleeding_from_solid_organ,Bowel_resection,Class_I,Class_II,Class_III,Class_IV,Colostomy,Fascia_Closed_(Interrupted),...,Hand-Sewn_Anastomosis,Ileostomy,Primary_Repair,Serosal_tear_repair,Skin_Closed_(Full w/ Prevena),Skin_Closed_(Full),Skin_Closed_(Partial),Skin_Left_Open,Stapled_Anastomosis,Synthetic
0,TECHNICAL FINDINGS: The procedure was performe...,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0
1,DESCRIPTION OF PROCEDURE: With the patient in ...,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,PROCEDURE IN DETAIL: The patient was brought i...,0,0,0,0,0,1,0,0,0,...,0,0,1,0,0,0,0,1,0,0
3,OPERATIVE PROCEDURE: The patient was brought t...,0,0,1,0,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,1
4,DESCRIPTION OF PROCEDURE: After informed conse...,0,0,1,0,1,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0


In [8]:
import os
import pandas as pd
from skmultilearn.model_selection import IterativeStratification

# Assuming 'data' is your DataFrame and 'Text_desc' is your feature column
X = data['Text_desc'].values.reshape(-1, 1)  # Reshape to 2D array
y = data.drop('Text_desc', axis=1).values   

# Initialize the IterativeStratification
stratifier = IterativeStratification(n_splits=5, order=1)

# Directory to save the folds
output_dir = 'path/tp/folds'
os.makedirs(output_dir, exist_ok=True)

# Split the data and save each fold
for fold_idx, (train_index, test_index) in enumerate(stratifier.split(X, y), start=1):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    # Convert to DataFrame for saving
    train_df = pd.DataFrame(X_train, columns=['Text_desc'])
    train_df = pd.concat([train_df, pd.DataFrame(y_train, columns=data.columns[1:])], axis=1)
    
    test_df = pd.DataFrame(X_test, columns=['Text_desc'])
    test_df = pd.concat([test_df, pd.DataFrame(y_test, columns=data.columns[1:])], axis=1)
    
    # Save the train and test sets for each fold
    train_df.to_csv(os.path.join(output_dir, f'fold_{fold_idx}_train.csv'), index=False)
    test_df.to_csv(os.path.join(output_dir, f'fold_{fold_idx}_test.csv'), index=False)
    
    print(f"Fold {fold_idx} saved: Train shape: {X_train.shape}, Test shape: {X_test.shape}")

Fold 1 saved: Train shape: (312, 1), Test shape: (74, 1)
Fold 2 saved: Train shape: (311, 1), Test shape: (75, 1)
Fold 3 saved: Train shape: (308, 1), Test shape: (78, 1)
Fold 4 saved: Train shape: (307, 1), Test shape: (79, 1)
Fold 5 saved: Train shape: (306, 1), Test shape: (80, 1)


In [12]:
import pandas as pd

# Initialize a list to store the label distribution for each fold
label_distribution = []

# Split the data and calculate label distribution
for fold_idx, (train_index, test_index) in enumerate(stratifier.split(X, y), start=1):
    y_train = y[train_index]
    y_test = y[test_index]
    
    # Calculate the percentage of each label in the train and test sets
    train_label_percent = (y_train.sum(axis=0) / y_train.shape[0]) * 100
    test_label_percent = (y_test.sum(axis=0) / y_test.shape[0]) * 100
    
    # Append the results to the list
    label_distribution.append({
        'Fold': fold_idx,
        'Train': train_label_percent,
        'Test': test_label_percent
    })

# Convert the list to a DataFrame for better visualization
distribution_df = pd.DataFrame(label_distribution)

# Expand the 'Train' and 'Test' columns to separate columns for each label
train_distribution = pd.DataFrame(distribution_df['Train'].tolist(), index=distribution_df['Fold'], columns=data.columns[1:])
test_distribution = pd.DataFrame(distribution_df['Test'].tolist(), index=distribution_df['Fold'], columns=data.columns[1:])

# Combine the train and test distributions into a single DataFrame and transpose it
combined_distribution = pd.concat([train_distribution.add_prefix('Train_'), test_distribution.add_prefix('Test_')], axis=1).transpose()

# Display the combined distribution DataFrame
print(combined_distribution)

combined_distribution.to_csv('path/to/label_distribution.csv')

Fold                                              1          2          3  \
Train_Active_bleeding_from_named_vessel    4.823151   4.792332   6.109325   
Train_Active_bleeding_from_solid_organ     9.324759   8.306709  10.289389   
Train_Bowel_resection                     31.832797  30.351438  31.189711   
Train_Class_I                             33.762058  33.865815  34.083601   
Train_Class_II                            47.909968  47.284345  47.909968   
Train_Class_III                           14.790997  15.015974  13.504823   
Train_Class_IV                            13.826367  13.099042  13.183280   
Train_Colostomy                            5.466238   4.792332   4.823151   
Train_Fascia_Closed_(Interrupted)         10.932476   8.626198   9.967846   
Train_Fascia_Closed_(Running/Continuous)  49.839228  49.520767  50.160772   
Train_Fascia_Left_Open                    33.440514  33.226837  33.440514   
Train_Hand-Sewn_Anastomosis                7.717042   8.306709   8.681672   