In [1]:
import pandas as pd
import iisignature
import os
import numpy as np

In [2]:
#Reading a sample of data
data = pd.read_csv("/Users/farhat/Documents/Project/RawData/TrainingSet_A/p000044.psv", sep='|')
data.head()

Unnamed: 0,HR,O2Sat,Temp,SBP,MAP,DBP,Resp,EtCO2,BaseExcess,HCO3,...,WBC,Fibrinogen,Platelets,Age,Gender,Unit1,Unit2,HospAdmTime,ICULOS,SepsisLabel
0,79.0,100.0,36.2,118.0,86.0,69.5,15.0,,3.0,24.0,...,12.5,,136.0,59.34,1,0,1,-19.05,5,0
1,78.5,100.0,35.75,131.5,92.5,75.5,12.0,,,,...,,,,59.34,1,0,1,-19.05,6,0
2,58.0,99.0,36.2,121.0,83.0,67.0,19.0,,0.0,,...,,,,59.34,1,0,1,-19.05,7,0
3,70.5,100.0,36.25,138.0,96.0,79.0,16.5,,0.0,,...,,,,59.34,1,0,1,-19.05,8,0
4,88.5,98.5,36.7,137.0,90.5,75.0,21.0,,,,...,,,,59.34,1,0,1,-19.05,9,0


### Function for calculating the partial sofa from existing data in the table

In [3]:
def compute_partial_sofa(df):
    # Coagulation (Platelets)
    df['SOFA_coagulation'] = pd.cut(df['Platelets'],
                                    bins=[-np.inf,20,50,100,150,np.inf],
                                    labels=[4,3,2,1,0]).astype(float)
    
    # Liver (Bilirubin)
    df['SOFA_liver'] = pd.cut(df['Bilirubin_direct'],
                              bins=[-np.inf,1.2,1.9,5.9,11.9,np.inf],
                              labels=[0,1,2,3,4]).astype(float)
    
    # Cardiovascular (MAP only, no vasopressors recorded)
    df['SOFA_cardiovascular'] = np.where(df['MAP'] >= 70, 0, 1)
    
    # Renal (Creatinine)
    df['SOFA_renal'] = pd.cut(df['Creatinine'],
                              bins=[-np.inf,1.2,1.9,3.4,4.9,np.inf],
                              labels=[0,1,2,3,4]).astype(float)
    
    # Partial SOFA = sum of available components
    df['PartialSOFA'] = df[['SOFA_coagulation','SOFA_liver','SOFA_cardiovascular','SOFA_renal']].sum(axis=1)
    df=df.drop(columns=['SOFA_coagulation','SOFA_liver','SOFA_cardiovascular','SOFA_renal'])
    
    return df


In [4]:
# Compute signature features with sliding window
def compute_signature_features(df, cols, window_size=7, sig_order=3):
        sig_features = []
        for i in range(window_size, len(df)):
            window = df[cols].iloc[i-window_size:i].values
            sig = iisignature.sig(window, sig_order)
            sig_features.append(sig)
        pad = np.full((window_size, len(sig_features[0])), np.nan)
        sig_features = np.vstack([pad, sig_features])
        sig_df = pd.DataFrame(sig_features, columns=[f"sig_{i}" for i in range(sig_features.shape[1])])
        return pd.concat([df.reset_index(drop=True), sig_df], axis=1)
    


In [8]:
 # Construct full file path
file_path = "/Users/farhat/Documents/Project/RawData/TrainingSet_B/p000001.psv"
        
# Read the file into a DataFrame
df = pd.read_csv(file_path, sep='|')

# [Optional] Process your data here
# For example, print shape or clean missing values
df['DBP']= df['DBP'].fillna((3*df['MAP']-df['SBP'])/2) #Filling DBP calculating the value from SBP and MAP. 
df=df.ffill()
df=df.bfill()

#Hand Crafted Data
df['ShockIndex'] = df['HR'] / df['SBP']
df['BUN_CR'] = df['Bilirubin_direct'] / df['Creatinine']
# Partial SOFA and SOFA deterioration require definitions based on available variables
# Placeholder below:
df=compute_partial_sofa(df)

#Adding signature column to the dataset
sig_cols = ['PartialSOFA', 'MAP', 'BUN_CR']  
df = compute_signature_features(df, sig_cols)

# Adding a column of patient's id to the data
df['Patient_Id'] = 'p000001'
# Reorder columns with 'Patient_Id' first
df = df[['Patient_Id'] + [col for col in df.columns if col != 'Patient_Id']]

# Reordering the sepsis label column
col = df.pop('SepsisLabel')
df['SepsisLabel']=col

# Save to destination folder with the same filename
save_path = "/Users/farhat/Documents/Project/ProcessedData/TrainingSet_B/p000001.csv"
df.to_csv(save_path, index=False)

### Reading all the data sets from Training sets A and B, Filling the missing values and Extracting new features

In [6]:
# Define source and destination folders
source_folder = '/Users/farhat/Documents/Project/RawData/TrainingSet_A'
destination_folder = '/Users/farhat/Documents/Project/ProcessedData/TrainingSet_A'

# Make sure destination folder exists
os.makedirs(destination_folder, exist_ok=True)

# Loop through each file in the source folder
for filename in os.listdir(source_folder):
    # Check if it is a file with the desired extension, e.g. .psv
    if filename.endswith('.psv'):
        # Construct full file path
        file_path = os.path.join(source_folder, filename)
        
        # Read the file into a DataFrame
        df = pd.read_csv(file_path, sep='|')
        
        # [Optional] Process your data here
        # For example, print shape or clean missing values
        df['DBP']= df['DBP'].fillna((3*df['MAP']-df['SBP'])/2) #Filling DBP calculating the value from SBP and MAP. 
        df=df.ffill()
        df=df.bfill()

        #Hand Crafted Data
        df['ShockIndex'] = df['HR'] / df['SBP']
        df['BUN_CR'] = df['Bilirubin_direct'] / df['Creatinine']
        # Partial SOFA and SOFA deterioration require definitions based on available variables
        # Placeholder below:
        df=compute_partial_sofa(df)

        #Adding signature column to the dataset
        sig_cols = ['PartialSOFA', 'MAP', 'BUN_CR']  
        df = compute_signature_features(df, sig_cols)

        # Adding a column of patient's id to the data
        df['Patient_Id'] = filename.replace('.psv', '')
        # Reorder columns with 'Patient_Id' first
        df = df[['Patient_Id'] + [col for col in df.columns if col != 'Patient_Id']]

        # Reordering the sepsis label column
        col = df.pop('SepsisLabel')
        df['SepsisLabel']=col
        
        # Save to destination folder with the same filename
        save_path = os.path.join(destination_folder, filename.replace('.psv','.csv'))
        df.to_csv(save_path, index=False)

        print(f'Saved to {save_path}')


Saved to /Users/farhat/Documents/Project/ProcessedData/TrainingSet_A/p014977.csv
Saved to /Users/farhat/Documents/Project/ProcessedData/TrainingSet_A/p000902.csv
Saved to /Users/farhat/Documents/Project/ProcessedData/TrainingSet_A/p009098.csv
Saved to /Users/farhat/Documents/Project/ProcessedData/TrainingSet_A/p008386.csv
Saved to /Users/farhat/Documents/Project/ProcessedData/TrainingSet_A/p018195.csv
Saved to /Users/farhat/Documents/Project/ProcessedData/TrainingSet_A/p009926.csv
Saved to /Users/farhat/Documents/Project/ProcessedData/TrainingSet_A/p016806.csv
Saved to /Users/farhat/Documents/Project/ProcessedData/TrainingSet_A/p002873.csv
Saved to /Users/farhat/Documents/Project/ProcessedData/TrainingSet_A/p002867.csv
Saved to /Users/farhat/Documents/Project/ProcessedData/TrainingSet_A/p009932.csv
Saved to /Users/farhat/Documents/Project/ProcessedData/TrainingSet_A/p020378.csv
Saved to /Users/farhat/Documents/Project/ProcessedData/TrainingSet_A/p016812.csv
Saved to /Users/farhat/Docum

In [9]:
df=pd.read_csv("/Users/farhat/Documents/Project/ProcessedData/TrainingSet_B/p000006.csv", sep=',')
data=pd.DataFrame(columns=df.columns)
source_folder = '/Users/farhat/Documents/Project/ProcessedData/TrainingSet_A'
for filename in os.listdir(source_folder):
    # Check if it is a file with the desired extension, e.g. .csv
    if filename.endswith('.csv'):
        # Construct full file path
        file_path = os.path.join(source_folder, filename)
        
        # Read the file into a DataFrame
        df = pd.read_csv(file_path, sep=',')
        data = pd.concat([data, df], ignore_index=True)

  data = pd.concat([data, df], ignore_index=True)


In [12]:
source_folder = '/Users/farhat/Documents/Project/ProcessedData/TrainingSet_B'
for filename in os.listdir(source_folder):
    # Check if it is a file with the desired extension, e.g. .csv
    if filename.endswith('.csv'):
        # Construct full file path
        file_path = os.path.join(source_folder, filename)
        
        # Read the file into a DataFrame
        df = pd.read_csv(file_path, sep=',')
        data = pd.concat([data, df], ignore_index=True)

In [15]:
data.to_csv("/Users/farhat/Documents/Project/ProcessedData/fullData.csv", index=False)
data.head(10)


Unnamed: 0,Patient_Id,HR,O2Sat,Temp,SBP,MAP,DBP,Resp,EtCO2,BaseExcess,...,sig_30,sig_31,sig_32,sig_33,sig_34,sig_35,sig_36,sig_37,sig_38,SepsisLabel
0,p016161,74.0,100,37.11,,101,,14.5,,,...,,,,,,,,,,1
1,p016161,72.0,98,37.11,,81,,14.5,,,...,,,,,,,,,,1
2,p016161,75.0,100,37.11,,83,,14.0,,,...,,,,,,,,,,1
3,p016161,63.0,100,37.11,,64,,14.0,,,...,,,,,,,,,,1
4,p016161,61.0,100,37.17,,66,,14.5,,,...,,,,,,,,,,1
5,p016161,68.0,100,37.17,,63,,14.0,,,...,,,,,,,,,,1
6,p016161,64.0,100,37.17,,94,,14.0,,,...,,,,,,,,,,1
7,p016161,66.0,100,37.17,,83,,14.0,,,...,,,,,,,,,,1
8,p019252,80.0,100,34.45,109.5,89,72.0,12.0,,1.0,...,,,,,,,,,,0
9,p019252,80.0,100,35.7,90.0,68,54.0,10.0,,1.0,...,,,,,,,,,,0


In [None]:
data.sort_values()

(1580430, 84)