In [1]:
import pandas as pd
import iisignature
import os
import numpy as np

In [2]:
#Reading a sample of data
data = pd.read_csv("/Users/farhat/Documents/Project/RawData/TrainingSet_A/p000044.psv", sep='|')
data.head()

Unnamed: 0,HR,O2Sat,Temp,SBP,MAP,DBP,Resp,EtCO2,BaseExcess,HCO3,...,WBC,Fibrinogen,Platelets,Age,Gender,Unit1,Unit2,HospAdmTime,ICULOS,SepsisLabel
0,79.0,100.0,36.2,118.0,86.0,69.5,15.0,,3.0,24.0,...,12.5,,136.0,59.34,1,0,1,-19.05,5,0
1,78.5,100.0,35.75,131.5,92.5,75.5,12.0,,,,...,,,,59.34,1,0,1,-19.05,6,0
2,58.0,99.0,36.2,121.0,83.0,67.0,19.0,,0.0,,...,,,,59.34,1,0,1,-19.05,7,0
3,70.5,100.0,36.25,138.0,96.0,79.0,16.5,,0.0,,...,,,,59.34,1,0,1,-19.05,8,0
4,88.5,98.5,36.7,137.0,90.5,75.0,21.0,,,,...,,,,59.34,1,0,1,-19.05,9,0


### Function for calculating the partial sofa from existing data in the table

In [None]:
def compute_partial_sofa(df):
    # Coagulation (Platelets)
    df['SOFA_coagulation'] = pd.cut(df['Platelets'],
                                    bins=[-np.inf,20,50,100,150,np.inf],
                                    labels=[4,3,2,1,0]).astype(float)
    
    # Liver (Bilirubin)
    df['SOFA_liver'] = pd.cut(df['Bilirubin_direct'],
                              bins=[-np.inf,1.2,1.9,5.9,11.9,np.inf],
                              labels=[0,1,2,3,4]).astype(float)
    
    # Cardiovascular (MAP only, no vasopressors recorded)
    df['SOFA_cardiovascular'] = np.where(df['MAP'] >= 70, 0, 1)
    
    # Renal (Creatinine)
    df['SOFA_renal'] = pd.cut(df['Creatinine'],
                              bins=[-np.inf,1.2,1.9,3.4,4.9,np.inf],
                              labels=[0,1,2,3,4]).astype(float)
    
    # Partial SOFA = sum of available components
    df['PartialSOFA'] = df[['SOFA_coagulation','SOFA_liver','SOFA_cardiovascular','SOFA_renal']].sum(axis=1)
    df=df.drop(columns=['SOFA_coagulation','SOFA_liver','SOFA_cardiovascular','SOFA_renal'])
    
    return df

In [None]:
# Compute signature features with sliding window
def compute_signature_features(df, cols, window_size=7, sig_order=3):
        sig_features = []
        for i in range(window_size, len(df)):
            window = df[cols].iloc[i-window_size:i].values
            sig = iisignature.sig(window, sig_order)
            sig_features.append(sig)
        pad = np.full((window_size, len(sig_features[0])), np.nan)
        sig_features = np.vstack([pad, sig_features])
        sig_df = pd.DataFrame(sig_features, columns=[f"sig_{i}" for i in range(sig_features.shape[1])])
        return pd.concat([df.reset_index(drop=True), sig_df], axis=1)

### Reading all the data sets from Training sets A and B, Filling the missing values and Extracting new features

In [None]:


# Define source and destination folders
source_folder = '/Users/farhat/Documents/Project/RawData/TrainingSet_B'
destination_folder = '/Users/farhat/Documents/Project/ProcessedData/TrainingSet_B'

# Making sure that the destination folder exists
os.makedirs(destination_folder, exist_ok=True)

# Looping through each file in the source folder
for filename in os.listdir(source_folder):
    # Checking if it is a file with the desired extension(.psv)
    if filename.endswith('.psv'):
        # Constructing the full file path
        file_path = os.path.join(source_folder, filename)
        
        # Reading the file into a DataFrame
        df = pd.read_csv(file_path, sep='|')
        
        #Filling missing values
        df['DBP']= df['DBP'].fillna((3*df['MAP']-df['SBP'])/2) #Filling DBP calculating the value from SBP and MAP. 
        df=df.ffill()
        df=df.bfill()

        #Hand Crafted Data
        df['ShockIndex'] = df['HR'] / df['SBP']
        df['BUN_CR'] = df['Bilirubin_direct'] / df['Creatinine']
        
        #Computing partial sofa using the function provided above
        df=compute_partial_sofa(df)

        #Adding signature columns to the dataset
        sig_cols = ['PartialSOFA', 'MAP', 'BUN_CR']  
        df = compute_signature_features(df, sig_cols)

        # Adding a column of patient's id to the data
        df['Patient_Id'] = filename.replace('.psv', '')
        # Reorder columns with 'Patient_Id' first
        df = df[['Patient_Id'] + [col for col in df.columns if col != 'Patient_Id']]

        # Reordering the sepsis label column
        col = df.pop('SepsisLabel')
        df['SepsisLabel']=col
        
        # Saving the preprocessed DataFrame to destination folder with the same filename
        save_path = os.path.join(destination_folder, filename.replace('.psv','.csv'))
        df.to_csv(save_path, index=False)

        print(f'Saved to {save_path}')


Saved to /Users/farhat/Documents/Project/ProcessedData/TrainingSet_B/p116812.csv
Saved to /Users/farhat/Documents/Project/ProcessedData/TrainingSet_B/p109932.csv
Saved to /Users/farhat/Documents/Project/ProcessedData/TrainingSet_B/p102867.csv
Saved to /Users/farhat/Documents/Project/ProcessedData/TrainingSet_B/p114963.csv
Saved to /Users/farhat/Documents/Project/ProcessedData/TrainingSet_B/p100916.csv
Saved to /Users/farhat/Documents/Project/ProcessedData/TrainingSet_B/p118181.csv
Saved to /Users/farhat/Documents/Project/ProcessedData/TrainingSet_B/p108392.csv
Saved to /Users/farhat/Documents/Project/ProcessedData/TrainingSet_B/p100902.csv
Saved to /Users/farhat/Documents/Project/ProcessedData/TrainingSet_B/p109098.csv
Saved to /Users/farhat/Documents/Project/ProcessedData/TrainingSet_B/p118195.csv
Saved to /Users/farhat/Documents/Project/ProcessedData/TrainingSet_B/p108386.csv
Saved to /Users/farhat/Documents/Project/ProcessedData/TrainingSet_B/p114977.csv
Saved to /Users/farhat/Docum

### Merging the processed data into datasetA and datasetB

In [9]:
df=pd.read_csv("/Users/farhat/Documents/Project/ProcessedData/TrainingSet_A/p000006.csv", sep=',')
dataA=pd.DataFrame(columns=df.columns)
source_folder = '/Users/farhat/Documents/Project/ProcessedData/TrainingSet_A'
for filename in os.listdir(source_folder):
    # Check if it is a file with the desired extension, e.g. .csv
    if filename.endswith('.csv'):
        # Construct full file path
        file_path = os.path.join(source_folder, filename)
        
        # Read the file into a DataFrame
        df = pd.read_csv(file_path, sep=',')
        dataA = pd.concat([dataA, df], ignore_index=True)

dataA.to_csv('/Users/farhat/Documents/Project/ProcessedData/DataSetA.csv')

  dataA = pd.concat([dataA, df], ignore_index=True)


In [22]:
del source_folder, df, filename, file_path

In [24]:
# Reading a sample data to extract the column names
df=pd.read_csv("/Users/farhat/Documents/Project/ProcessedData/TrainingSet_A/p000006.csv", sep=',')
dataB=pd.DataFrame(columns=df.columns)
source_folder = '/Users/farhat/Documents/Project/ProcessedData/TrainingSet_B'
# Looping throug each file in the file list and concating the data to the previous ones. 
for filename in os.listdir(source_folder):
    # Check if it is a file with the desired extension, e.g. .csv
    if filename.endswith('.csv'):
        # Construct full file path
        file_path = os.path.join(source_folder, filename)
        
        # Read the file into a DataFrame
        df = pd.read_csv(file_path, sep=',')
        dataB = pd.concat([dataB, df], ignore_index=True)

dataB.to_csv('/Users/farhat/Documents/Project/ProcessedData/DataSetB.csv')

  dataB = pd.concat([dataB, df], ignore_index=True)


### Merging the datasetA and datasetB into fullData

In [40]:
df_a = pd.read_csv('/Users/farhat/Documents/Project/ProcessedData/DataSetA.csv', index_col=0)
df_b = pd.read_csv('/Users/farhat/Documents/Project/ProcessedData/DataSetB.csv', index_col=0)
fulldata = pd.concat([df_a, df_b])
fulldata.sort_values(by=['Patient_Id','HospAdmTime','ICULOS'], inplace=True)
fulldata.reset_index(drop=True, inplace=True)
fulldata.to_csv("/Users/farhat/Documents/Project/ProcessedData/fullData.csv", index=False)
fulldata.head(10)

Unnamed: 0,Patient_Id,HR,O2Sat,Temp,SBP,MAP,DBP,Resp,EtCO2,BaseExcess,...,sig_30,sig_31,sig_32,sig_33,sig_34,sig_35,sig_36,sig_37,sig_38,SepsisLabel
0,p000001,97.0,95.0,36.11,98.0,75.33,63.995,19.0,,24.0,...,,,,,,,,,,0
1,p000001,97.0,95.0,36.11,98.0,75.33,63.995,19.0,,24.0,...,,,,,,,,,,0
2,p000001,89.0,99.0,36.11,122.0,86.0,68.0,22.0,,24.0,...,,,,,,,,,,0
3,p000001,90.0,95.0,36.11,122.0,86.0,68.0,30.0,,24.0,...,,,,,,,,,,0
4,p000001,103.0,88.5,36.11,122.0,91.33,75.995,24.5,,24.0,...,,,,,,,,,,0
5,p000001,110.0,91.0,36.11,122.0,91.33,75.995,22.0,,24.0,...,,,,,,,,,,0
6,p000001,108.0,92.0,36.11,123.0,77.0,54.0,29.0,,24.0,...,,,,,,,,,,0
7,p000001,106.0,90.5,36.11,93.0,76.33,67.995,29.0,,24.0,...,,,,,,,,,,0
8,p000001,104.0,95.0,36.11,133.0,88.33,65.995,26.0,,24.0,...,,,,,,,,,,0
9,p000001,102.0,91.0,36.11,134.0,87.33,63.995,30.0,,24.0,...,,,,,,,,,,0
