## **Unifying and Combining the final data**

### 1. Unifying the data
- The goal of this step is to unify the structured and unstructured data.
    - Processing the structured data (XY)
    - Processing the embedding of the unstructured data
    - Saving...
    
### 2. Combining and preparing Y
- The goal here is to combine the data so that we end up with the following information:
    - **Z**: base features
    - **ZS**: base features + MeSH-based symptoms
    - **T0**: original text
    - **T1**: t5-small
    - **T2**: bart large cnn
    - **T3**: medical summarization

    - **ZST0**: base + MeSH + original text
    - **ZST1**: base + MeSH + t5-small
    - **ZST2**: base + MeSH + bart large cnn
    - **ZST3**: base + MeSH + medical summarization

In [1]:
import os
import pickle
import pandas as pd
import numpy as np
import ast

In [8]:
class Unifying():
    """Unify structured features and text embeddings for one embedding model.

    Instantiating the class runs the whole pipeline:

    1. load the structured arrays and the embedding CSV and keep only the
       rows whose HADM_ID appears in both sources;
    2. parse every embedding column into a float32 matrix and pickle it;
    3. build Z, ZS, ZST0..ZST3 and the 3-class label Y, and pickle them
       under ``{saving_path}/{the_model}``.
    """

    def __init__(self,  structured_path, unstructured_path, saving_path, the_model):
        """Run the full unify / embed / concatenate pipeline.

        Parameters:
            structured_path (str): Directory holding X_B, X_D, X_L, X_M, X_P,
                X_S and VIY pickle files.
            unstructured_path (str): Directory holding
                ``merged_embeddings_{the_model}.csv``.
            saving_path (str): Root output directory; a sub-directory named
                after ``the_model`` is used for every file written.
            the_model (str): Name of the embedding model (used in file names).
        """
        self.embedding_model = the_model
        self.saving_path = f'{saving_path}/{self.embedding_model}'

        # NOTE: self.VIY holds only the LoS column of the filtered VIY rows
        # (see extract_shared_data), not the full VIY matrix.
        self.XB, self.XD, self.XL, self.XM, self.XP, self.XS, self.VIY, new_emb = self.extract_shared_data(structured_path, unstructured_path)
        self.process_and_save_embeddings(new_emb)
        self.concatenate_and_save()

    def extract_shared_data(self, structured_path, unstructured_path):
        """
        Load both data sources and keep the rows whose HADM_ID is shared.

        VIY is a 2D array with columns [HADM_ID, ICU_ID, LoS]; the structured
        arrays (XB, XD, XL, XM, XP, XS) are indexed row-for-row like VIY.
        The embedding rows are re-ordered to follow the filtered VIY rows, so
        row i of every returned object refers to the same HADM_ID.

        Parameters:
            structured_path (str): Directory with the structured pickles.
            unstructured_path (str): Directory with the embedding CSV.

        Returns:
            tuple: (XB, XD, XL, XM, XP, XS, LoS, new_emb) where the first six
            are the filtered structured arrays, LoS is the 1D third column of
            the filtered VIY rows and new_emb is the aligned embedding
            DataFrame.
        """
        # --------------------------------------------------------------------------------------------------------
        print('Loading the data')
        # ... structured data
        XB = self.load_pickle(f'{structured_path}/X_B.pkl')
        XD = self.load_pickle(f'{structured_path}/X_D.pkl')
        XL = self.load_pickle(f'{structured_path}/X_L.pkl')
        XM = self.load_pickle(f'{structured_path}/X_M.pkl')
        XP = self.load_pickle(f'{structured_path}/X_P.pkl')
        XS = self.load_pickle(f'{structured_path}/X_S.pkl')
        VIY = self.load_pickle(f'{structured_path}/VIY.pkl')

        # ... unstructured data
        df_emb = pd.read_csv(f'{unstructured_path}/merged_embeddings_{self.embedding_model}.csv')

        # --------------------------------------------------------------------------------------------------------
        #                                   Extracting the data
        # --------------------------------------------------------------------------------------------------------
        # HADM_IDs present in both the structured and the unstructured data
        common_hadm_ids = np.intersect1d(VIY[:, 0], df_emb['HADM_ID'].values)

        # Rows of VIY (and therefore of every structured array) to keep,
        # in their original VIY order.
        mask = np.isin(VIY[:, 0], common_hadm_ids)
        kept_ids = VIY[mask, 0]

        # Align the embedding rows with the kept VIY rows instead of sorting
        # them independently: sorting only one side would pair embeddings
        # with the wrong structured rows whenever VIY is not already sorted.
        # Duplicated HADM_IDs in the CSV would break the 1:1 pairing, so only
        # the first occurrence is used (NOTE(review): confirm "first" is the
        # desired choice if duplicates can actually occur).
        emb_unique = df_emb.drop_duplicates(subset='HADM_ID', keep='first')
        emb_ids = emb_unique['HADM_ID'].to_numpy()
        sorter = np.argsort(emb_ids, kind='stable')
        # Every kept id is known to exist in emb_ids, so searchsorted lands
        # exactly on its position in the sorted view.
        positions = sorter[np.searchsorted(emb_ids, kept_ids, sorter=sorter)]
        new_emb = emb_unique.iloc[positions].reset_index(drop=True)

        return XB[mask], XD[mask], XL[mask], XM[mask], XP[mask], XS[mask], VIY[mask, 2], new_emb

    def process_and_save_embeddings(self, merged_df):
        """
        Convert the embedding columns of merged_df to arrays and pickle them.

        Each cell is expected to be the text representation of a list of
        floats. Every column is parsed, stacked into one array of shape
        [num_samples, embedding_dim] and saved as ``{column}.pkl`` in
        self.saving_path. A column that is missing or cannot be parsed is
        reported and skipped.

        Parameters:
            merged_df (pd.DataFrame): DataFrame containing the embedding
                columns listed below.
        """
        print('Processing and saving the embedding ...')
        embedding_columns = ['EMB_TEXT',
                             'EMB_1_t5_small2_SUMMARY',
                             'EMB_3_bart_large_cnn_SUMMARY',
                             'EMB_4_medical_summarization_SUMMARY']

        for col in embedding_columns:
            print(f"Processing {col}...")

            # Convert string representation of lists to actual arrays of floats
            try:
                processed_data = merged_df[col].apply(lambda x: np.array(ast.literal_eval(x), dtype=np.float32))
            except (KeyError, ValueError, SyntaxError, TypeError) as e:
                # Missing column or malformed cell: report and move on.
                print(f"Error processing column {col}: {e}")
                continue

            # Stack into a single NumPy array (shape: [num_samples, embedding_dim])
            embeddings_array = np.vstack(processed_data.to_numpy())

            self.save_list_as_pickle(embeddings_array, self.saving_path, col)
            print(f"Saved {col} with shape {embeddings_array.shape}")

    def classify_los_3_classes(self, los_list):
        """Map each length-of-stay value to a class: <3 -> 0, 3..7 -> 1, else 2."""
        return [0 if los < 3 else 1 if 3 <= los <= 7 else 2 for los in los_list]

    def concatenate_and_save(self):
        """
        Build the final feature matrices and labels and pickle them.

        Reads the four embedding pickles written by
        process_and_save_embeddings, then saves:
            Z     : XB + XD + XL + XM + XP
            ZS    : Z + XS
            T0-T3 : the four embedding matrices
            ZST0-3: ZS + the corresponding embedding matrix
            Y     : 3-class label derived from the length of stay
        """
        print('Reading the data ...')

        T0 = self.load_pickle(f'{self.saving_path}/EMB_TEXT.pkl')
        T1 = self.load_pickle(f'{self.saving_path}/EMB_1_t5_small2_SUMMARY.pkl')
        T2 = self.load_pickle(f'{self.saving_path}/EMB_3_bart_large_cnn_SUMMARY.pkl')
        T3 = self.load_pickle(f'{self.saving_path}/EMB_4_medical_summarization_SUMMARY.pkl')

        Z = np.concatenate((self.XB, self.XD, self.XL, self.XM, self.XP), axis=1)

        ZS   = np.concatenate((Z, self.XS), axis=1)

        ZST0 = np.concatenate((ZS, T0), axis=1)
        ZST1 = np.concatenate((ZS, T1), axis=1)
        ZST2 = np.concatenate((ZS, T2), axis=1)
        ZST3 = np.concatenate((ZS, T3), axis=1)

        # self.VIY already is the 1D LoS column; a full [HADM_ID, ICU_ID, LoS]
        # matrix is also accepted in case the attribute was set externally.
        LoS = np.asarray(self.VIY)
        if LoS.ndim == 2:
            LoS = LoS[:, 2]

        Y = self.classify_los_3_classes(LoS)

        print('|--- Saving ...')
        self.save_list_as_pickle(Z,    self.saving_path, 'Z')
        self.save_list_as_pickle(ZS,   self.saving_path, 'ZS')
        self.save_list_as_pickle(T0,   self.saving_path, 'T0')
        self.save_list_as_pickle(T1,   self.saving_path, 'T1')
        self.save_list_as_pickle(T2,   self.saving_path, 'T2')
        self.save_list_as_pickle(T3,   self.saving_path, 'T3')
        self.save_list_as_pickle(ZST0, self.saving_path, 'ZST0')
        self.save_list_as_pickle(ZST1, self.saving_path, 'ZST1')
        self.save_list_as_pickle(ZST2, self.saving_path, 'ZST2')
        self.save_list_as_pickle(ZST3, self.saving_path, 'ZST3')
        self.save_list_as_pickle(Y,    self.saving_path, 'Y')

    # FUNCTIONS
    def save_list_as_pickle(self, L, given_path, file_name):
        """Pickle L to ``{given_path}/{file_name}.pkl``, creating the directory if needed."""
        # Ensure the directory exists
        if not os.path.exists(given_path):
            os.makedirs(given_path, exist_ok=True)
            print(f'\tDirectory created: {given_path}')

        # Save the object as a pickle file
        print(f'\tSaving to {given_path}/{file_name}.pkl')
        with open(os.path.join(given_path, f'{file_name}.pkl'), 'wb') as file:
            pickle.dump(L, file)

    def load_pickle(self, thePath):
        """Return the object stored in the pickle file at thePath.

        NOTE: unpickling can execute arbitrary code; only load trusted files.
        """
        with open(thePath, 'rb') as f:
            data = pickle.load(f)
        return data

In [9]:
# Input / output locations, relative to the notebook directory.
structured_path = '../../../Data/structured'
unstructured_path = '../../../Data/unstructured/emb'
saving_path = '../../../Data/XY2'

# Run the whole pipeline once per embedding model; the instance itself is
# not needed afterwards because everything is written to disk.
for the_model in ('bioclinicalbert', 'clinicalbert', 'gatortron'):
    _ = Unifying(
        structured_path,
        unstructured_path,
        saving_path,
        the_model,
    )


Loading the data
Processing and saving the embedding ...
Processing EMB_TEXT...
	Directory created: ../../../Data/XY2/clinicalbert
	Saving to ../../../Data/XY2/clinicalbert/EMB_TEXT.pkl
Saved EMB_TEXT with shape (42142, 768)
Processing EMB_1_t5_small2_SUMMARY...
	Saving to ../../../Data/XY2/clinicalbert/EMB_1_t5_small2_SUMMARY.pkl
Saved EMB_1_t5_small2_SUMMARY with shape (42142, 768)
Processing EMB_3_bart_large_cnn_SUMMARY...
	Saving to ../../../Data/XY2/clinicalbert/EMB_3_bart_large_cnn_SUMMARY.pkl
Saved EMB_3_bart_large_cnn_SUMMARY with shape (42142, 768)
Processing EMB_4_medical_summarization_SUMMARY...
