In [1]:
import pandas as pd
import pickle
import numpy as np
import os

def load_pickle(thePath):
    """Read the pickle file at ``thePath`` and return the deserialized object.

    NOTE: ``pickle.load`` can execute arbitrary code while deserializing, so
    this should only be pointed at files from a trusted source.
    """
    with open(thePath, 'rb') as handle:
        return pickle.load(handle)

def save_pickle(L, given_path, file_name):
    """Pickle ``L`` to ``<given_path>/<file_name>.pkl``, creating the directory if needed.

    Args:
        L: Any picklable object.
        given_path: Directory to write into. An empty string means the
            current working directory.
        file_name: Name of the output file, without the ``.pkl`` extension.
    """
    # An empty path means "current directory"; os.makedirs('') would raise,
    # so only attempt creation for a non-empty path that does not exist yet.
    if given_path and not os.path.exists(given_path):
        # exist_ok covers the case where the directory appears between the
        # existence check and the creation call.
        os.makedirs(given_path, exist_ok=True)
        print(f'\tDirectory created: {given_path}')

    # Build the destination once so the log line reports the exact file written.
    target = os.path.join(given_path, f'{file_name}.pkl')
    print(f'\tSaving to {target}')
    with open(target, 'wb') as file:
        pickle.dump(L, file)

In [2]:
# Load the structured feature matrices, the VIY array, the unstructured
# bag-of-words embeddings and the ICU feature table.

the_path = '../Data'
structured_dir = f'{the_path}/structured/Lung_Cancer'

# Structured data (one pickle per matrix).
X_B = load_pickle(f'{structured_dir}/X_B.pkl')
# X_D = load_pickle(f'{structured_dir}/X_D.pkl')
X_L = load_pickle(f'{structured_dir}/X_L.pkl')
X_M = load_pickle(f'{structured_dir}/X_M.pkl')
X_P = load_pickle(f'{structured_dir}/X_P.pkl')
X_S = load_pickle(f'{structured_dir}/X_S.pkl')
VIY = load_pickle(f'{structured_dir}/VIY.pkl')

# Unstructured data: the embedding CSV is selected by model name and token count.
model_name = 'sci_sm'
num_Tokens = 2000
org_path = f'{the_path}/unstructured/emb/BoW/ALL_first_last_{model_name}_{num_Tokens}.csv'

df_emb = pd.read_csv(org_path)

# NOTE(review): absolute, user-specific path; the notebook only runs where
# this location is mounted.
MIMIC_Path = '/lustre/home/almusawiaf/PhD_Projects/MIMIC_resources'
df_75_features = pd.read_csv(f'{MIMIC_Path}/ICU_patient_data2.csv')

In [3]:
# Build a two-column array from the VIY triples: the first field cast to int
# (used below as HADM_ID) and the third field; the middle field is dropped.
X_VI2 = np.array([[int(v), i] for v, _, i in VIY])
X_VI2

array([[1.30744e+05, 3.02370e+00],
       [1.08732e+05, 2.18970e+00],
       [1.37006e+05, 3.18200e+00],
       ...,
       [1.95348e+05, 1.21212e+01],
       [1.44869e+05, 2.90750e+00],
       [1.29743e+05, 4.12270e+00]])

In [4]:
# HADM_IDs present in both the embedding table and column 0 of X_VI2,
# in ascending order.
common_HADM_ID = np.sort(np.intersect1d(df_emb['HADM_ID'], X_VI2[:, 0]))

In [5]:
# Boolean mask over the rows of X_VI2 whose HADM_ID (column 0) is one of the
# shared IDs. np.isin is used instead of np.in1d, which is deprecated since
# NumPy 2.0; for 1-D input both return the same boolean array.
mask = np.isin(X_VI2[:, 0], common_HADM_ID)

# Keep only the rows for the shared admissions.
new_X_VI2 = X_VI2[mask]

# Permutation that orders the kept rows by ascending HADM_ID. `mask` and `idx`
# are reused later to put the other structured arrays in this same row order.
idx = np.argsort(new_X_VI2[:, 0])
new_X_VI2 = new_X_VI2[idx]

# Shapes before and after filtering, then the filtered array itself.
print(X_VI2.shape, new_X_VI2.shape)
new_X_VI2

(709, 2) (684, 2)


array([[1.00085e+05, 1.14000e+00],
       [1.00271e+05, 1.96060e+00],
       [1.00283e+05, 3.86510e+00],
       ...,
       [1.99577e+05, 1.31443e+01],
       [1.99616e+05, 9.00250e+00],
       [1.99889e+05, 8.96050e+00]])

In [6]:
def _align_to_common(arr):
    """Keep the rows selected by `mask`, then reorder them with `idx`."""
    return arr[mask][idx]


# Apply the same filter and row order to every structured matrix.
X_B2 = _align_to_common(X_B)
X_L2 = _align_to_common(X_L)
X_M2 = _align_to_common(X_M)
X_P2 = _align_to_common(X_P)
X_S2 = _align_to_common(X_S)

VIY2 = _align_to_common(VIY)

In [7]:
# Work on a copy so df_emb itself is left untouched.
df = df_emb.copy()

# True for embedding rows whose HADM_ID is one of the shared IDs.
mask_df = df['HADM_ID'].isin(common_HADM_ID)

# Keep the shared rows and order them exactly like common_HADM_ID by indexing
# on HADM_ID, selecting in that order, and restoring HADM_ID as a column.
new_df = (
    df.loc[mask_df]
      .copy()
      .set_index('HADM_ID')
      .loc[common_HADM_ID]
      .reset_index()
)

In [8]:
from sklearn.preprocessing import LabelEncoder

# Start from a copy so df_75_features stays as loaded.
df3 = df_75_features.copy()

# 1. Boolean columns -> integers (True -> 1, False -> 0).
bool_cols = list(df3.select_dtypes(include='bool').columns)
df3 = df3.astype(dict.fromkeys(bool_cols, int))

# 2. Replace the ADMISSION_TYPE values with integer label codes.
le = LabelEncoder()
df3['ADMISSION_TYPE'] = le.fit_transform(df3['ADMISSION_TYPE'])

# 3. Drop the columns that are not used as features.
df3 = df3.drop(columns=['Unnamed: 0', 'SUBJECT_ID', 'ADMITTIME', 'ICUSTAY_ID'])

# 4. Collapse to one row per HADM_ID by averaging every remaining column.
# NOTE(review): ADMISSION_TYPE is label-encoded before this mean, so an
# admission whose rows carry different types would get a fractional code;
# confirm that this is intended.
df3 = df3.groupby('HADM_ID').mean().reset_index()
df3

Unnamed: 0,HADM_ID,ADMISSION_TYPE,ADM_ELECTIVE,ADM_EMERGENCY,ADM_URGENT,Heart Rate,Systolic Blood Pressure,Diastolic Blood Pressure,Respiratory Rate,Pulse Oximetry,...,Inspired O2 Fraction,BUN,Anion Gap,INR,GENDER_M,GENDER_F,AGE_AGE middle adult,AGE_AGE senior,AGE_Other,AGE_Unknown
0,100001,1.0,0.0,1.0,0.0,122.0,192.0,100.0,14.0,100.0,...,100.0,42.0,20.0,1.3,0.0,1.0,0.0,0.0,1.0,0.0
1,100003,1.0,0.0,1.0,0.0,71.0,83.0,36.0,11.0,96.0,...,100.0,49.0,10.0,1.6,1.0,0.0,1.0,0.0,0.0,0.0
2,100006,1.0,0.0,1.0,0.0,87.0,122.0,65.0,18.0,98.0,...,100.0,19.0,13.0,1.3,0.0,1.0,1.0,0.0,0.0,0.0
3,100007,1.0,0.0,1.0,0.0,87.0,122.0,65.0,18.0,98.0,...,100.0,19.0,13.0,1.3,0.0,1.0,0.0,1.0,0.0,0.0
4,100009,1.0,0.0,1.0,0.0,80.0,117.0,47.0,21.0,100.0,...,100.0,13.0,12.0,1.2,1.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57781,199993,1.0,0.0,1.0,0.0,87.0,122.0,65.0,18.0,98.0,...,100.0,19.0,13.0,1.3,1.0,0.0,1.0,0.0,0.0,0.0
57782,199994,1.0,0.0,1.0,0.0,87.0,122.0,65.0,18.0,98.0,...,100.0,19.0,13.0,1.3,0.0,1.0,1.0,0.0,0.0,0.0
57783,199995,1.0,0.0,1.0,0.0,87.0,122.0,65.0,18.0,98.0,...,100.0,19.0,13.0,1.3,1.0,0.0,0.0,0.0,1.0,0.0
57784,199998,1.0,0.0,1.0,0.0,87.0,122.0,65.0,18.0,98.0,...,100.0,16.0,8.0,1.3,1.0,0.0,0.0,1.0,0.0,0.0


In [9]:
# Report any admission in new_df that has no row in df3.
missing_hadm = set(new_df['HADM_ID']) - set(df3['HADM_ID'])

if not missing_hadm:
    print("All HADM_IDs in df1 exist in df3.")
else:
    print("Missing HADM_IDs in df3:", missing_hadm)

# Restrict df3 to the admissions in new_df, then put its rows in the same
# HADM_ID order as new_df. reindex would insert an all-NaN row for any ID
# reported as missing above.
df3 = (
    df3.loc[df3['HADM_ID'].isin(new_df['HADM_ID'])]
       .set_index('HADM_ID')
       .reindex(new_df['HADM_ID'])
       .reset_index()
)

# Shape check: one row per admission in new_df, HADM_ID column still included.
print(df3.shape)

# Drop the identifier so that only feature columns go into the matrix F.
df3 = df3.drop(columns=['HADM_ID'])
F = df3.values
F.shape

All HADM_IDs in df1 exist in df3.
(684, 27)


(684, 26)

In [10]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# # Min-max scaling
# scaler = MinMaxScaler()
# min_max_scaled_features = scaler.fit_transform(F)
# min_max_scaled_features[1]
# Standardization: fit the scaler on F, then scale F with the fitted parameters.
scaler = StandardScaler()
scaler.fit(F)
F2 = scaler.transform(F)
F2

array([[-1.51837718,  1.83212661, -1.71864983, ...,  0.9185128 ,
        -0.22153951, -0.15964729],
       [ 0.33952978, -0.54581381,  0.58185209, ..., -1.08871645,
        -0.22153951, -0.15964729],
       [ 0.33952978, -0.54581381,  0.58185209, ...,  0.9185128 ,
        -0.22153951, -0.15964729],
       ...,
       [ 0.33952978, -0.54581381,  0.58185209, ...,  0.9185128 ,
        -0.22153951, -0.15964729],
       [ 0.33952978, -0.54581381,  0.58185209, ..., -1.08871645,
        -0.22153951, -0.15964729],
       [-1.51837718,  1.83212661, -1.71864983, ..., -1.08871645,
        -0.22153951, -0.15964729]])

In [12]:
# Sanity check: the scaled matrix should have the same (rows, columns) shape as F.
F2.shape

(684, 26)

In [15]:
from datasets import Dataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Helper Functions
def classify_los_3_classes(los_list, th1, th2):
    """Map each length of stay to a class id.

    Returns a list with 0 where ``los < th1``, 1 where ``th1 <= los <= th2``
    and 2 for everything else.
    """
    def _bucket(los):
        if los < th1:
            return 0
        if th1 <= los <= th2:
            return 1
        return 2

    return [_bucket(los) for los in los_list]


def classify_los_binary(los_list, threshold):
    """Return 1 for each length of stay strictly above ``threshold``, else 0."""
    flags = []
    for los in los_list:
        flags.append(1 if los > threshold else 0)
    return flags

num_classes = '2_classes'

# Choose the labelling scheme from the third VIY2 column (length of stay).
# Any value other than '2_classes' selects the three-class scheme.
if num_classes != '2_classes':
    th1, th2 = 3, 7
    labels = list(classify_los_3_classes(VIY2[:, 2], th1, th2))
    saving_path = f'{the_path}/XY_BoW/{num_classes}/BoW_{model_name}_{num_Tokens}_F_{th1}_{th2}_days_Lung_Cancer'
else:
    threshold = 7
    labels = list(classify_los_binary(VIY2[:, 2], threshold))
    saving_path = f'{the_path}/XY_BoW/{num_classes}/BoW_{model_name}_{num_Tokens}_F_{threshold}_days_Lung_Cancer'

# Admission ids come from the first VIY2 column; labels are re-encoded as
# consecutive integers.
visits = list(VIY2[:, 0])
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(labels)

# Embedding matrix: every column of new_df except the identifier.
E = new_df.drop(columns=['HADM_ID']).values

# Structured matrices as nested Python lists.
XM_list = X_M2.tolist()
XS_list = X_S2.tolist()
XB_list = X_B2.tolist()
XL_list = X_L2.tolist()
XP_list = X_P2.tolist()

# NOTE(review): this uses the raw matrix F; the standardised F2 computed
# earlier is not used here. Confirm that this is intended.
XF_list = F.tolist()

# Stratified 80/20 split applied to all arrays at once so rows stay aligned.
# train_test_split returns a (train, test) pair per input, in input order.
split_parts = train_test_split(
    labels, visits, XM_list, XS_list, XB_list, XL_list, XP_list, XF_list, E,
    test_size=0.2, random_state=42, stratify=labels,
)
(train_labels, test_labels,
 train_visits, test_visits,
 train_XM, test_XM,
 train_XS, test_XS,
 train_XB, test_XB,
 train_XL, test_XL,
 train_XP, test_XP,
 train_XF, test_XF,
 train_E, test_E) = split_parts

# Label/id-only datasets (built but not written to disk in this cell).
train_core = {"label": train_labels, "HADM_ID": train_visits}
test_core = {"label": test_labels, "HADM_ID": test_visits}
train_dataset = Dataset.from_dict(train_core)
test_dataset = Dataset.from_dict(test_core)

# Full datasets: label and id plus every feature block.
train_dataset2 = Dataset.from_dict({
    **train_core,
    "XM": train_XM,
    "XS": train_XS,
    "XB": train_XB,
    "XL": train_XL,
    "XP": train_XP,
    "XF": train_XF,
    "XE": train_E,
})
test_dataset2 = Dataset.from_dict({
    **test_core,
    "XM": test_XM,
    "XS": test_XS,
    "XB": test_XB,
    "XL": test_XL,
    "XP": test_XP,
    "XF": test_XF,
    "XE": test_E,
})

# Create the output directory if it doesn't exist, then write train before test.
os.makedirs(saving_path, exist_ok=True)

for split_ds, split_dir in (
    (train_dataset2, f"{saving_path}/train_dataset_X"),
    (test_dataset2, f"{saving_path}/test_dataset_X"),
):
    split_ds.save_to_disk(split_dir)
print("Datasets saved successfully!")

Saving the dataset (1/1 shards): 100%|██████████| 547/547 [00:00<00:00, 1925.54 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 137/137 [00:00<00:00, 2041.08 examples/s]


Datasets saved successfully!
