In [7]:
import pandas as pd
import iisignature
import os
import numpy as np

In [None]:
#Reading a sample of data
data = pd.read_csv("/Users/farhat/Documents/Project/ProcessedData/TrainingSet_A/p000044.psv", sep=',')
data.head()

Unnamed: 0,Patient_Id,HR,O2Sat,Temp,SBP,MAP,DBP,Resp,EtCO2,BaseExcess,...,Age,Gender,Unit1,Unit2,HospAdmTime,ICULOS,ShockIndex,BUN_CR,PartialSOFA,SepsisLabel
0,p000044,79.0,100.0,36.2,118.0,86.0,69.5,15.0,,3.0,...,59.34,1,0,1,-19.05,5,0.669492,,1.0,0
1,p000044,78.5,100.0,35.75,131.5,92.5,75.5,12.0,,3.0,...,59.34,1,0,1,-19.05,6,0.596958,,1.0,0
2,p000044,58.0,99.0,36.2,121.0,83.0,67.0,19.0,,0.0,...,59.34,1,0,1,-19.05,7,0.479339,,1.0,0
3,p000044,70.5,100.0,36.25,138.0,96.0,79.0,16.5,,0.0,...,59.34,1,0,1,-19.05,8,0.51087,,1.0,0
4,p000044,88.5,98.5,36.7,137.0,90.5,75.0,21.0,,0.0,...,59.34,1,0,1,-19.05,9,0.645985,,1.0,0


### Function for calculating the partial sofa from existing data in the table

In [4]:
def compute_partial_sofa(df):
    # Coagulation (Platelets)
    df['SOFA_coagulation'] = pd.cut(df['Platelets'],
                                    bins=[-np.inf,20,50,100,150,np.inf],
                                    labels=[4,3,2,1,0]).astype(float)
    
    # Liver (Bilirubin)
    df['SOFA_liver'] = pd.cut(df['Bilirubin_direct'],
                              bins=[-np.inf,1.2,1.9,5.9,11.9,np.inf],
                              labels=[0,1,2,3,4]).astype(float)
    
    # Cardiovascular (MAP only, no vasopressors recorded)
    df['SOFA_cardiovascular'] = np.where(df['MAP'] >= 70, 0, 1)
    
    # Renal (Creatinine)
    df['SOFA_renal'] = pd.cut(df['Creatinine'],
                              bins=[-np.inf,1.2,1.9,3.4,4.9,np.inf],
                              labels=[0,1,2,3,4]).astype(float)
    
    # Partial SOFA = sum of available components
    df['PartialSOFA'] = df[['SOFA_coagulation','SOFA_liver','SOFA_cardiovascular','SOFA_renal']].sum(axis=1)
    df=df.drop(columns=['SOFA_coagulation','SOFA_liver','SOFA_cardiovascular','SOFA_renal'])
    
    return df


### Reading all the data sets from Training sets A and B, Filling the missing values and Extracting new features

In [14]:
# Define source and destination folders
source_folder = '/Users/farhat/Documents/Project/RawData/TrainingSet_B'
destination_folder = '/Users/farhat/Documents/Project/ProcessedData/TrainingSet_B'

# Make sure destination folder exists
os.makedirs(destination_folder, exist_ok=True)

# Loop through each file in the source folder
for filename in os.listdir(source_folder):
    # Check if it is a file with the desired extension, e.g. .psv
    if filename.endswith('.psv'):
        # Construct full file path
        file_path = os.path.join(source_folder, filename)
        
        # Read the file into a DataFrame
        df = pd.read_csv(file_path, sep='|')
        
        # [Optional] Process your data here
        # For example, print shape or clean missing values
        df['DBP']= df['DBP'].fillna((3*df['MAP']-df['SBP'])/2) #Filling DBP calculating the value from SBP and MAP. 
        df=df.ffill()
        df=df.bfill()

        #Hand Crafted Data
        df['ShockIndex'] = df['HR'] / df['SBP']
        df['BUN_CR'] = df['Bilirubin_direct'] / df['Creatinine']
        # Partial SOFA and SOFA deterioration require definitions based on available variables
        # Placeholder below:
        df=compute_partial_sofa(df)

        # Adding a column of patient's id to the data
        df['Patient_Id'] = filename.replace('.psv', '')
        # Reorder columns with 'Patient_Id' first
        df = df[['Patient_Id'] + [col for col in df.columns if col != 'Patient_Id']]

        # Reordering the sepsis label column
        col = df.pop('SepsisLabel')
        df['SepsisLabel']=col
        
        # Save to destination folder with the same filename
        save_path = os.path.join(destination_folder, filename)
        df.to_csv(save_path, index=False)

        print(f'Saved to {save_path}')


Saved to /Users/farhat/Documents/Project/ProcessedData/TrainingSet_B/p014977.psv
Saved to /Users/farhat/Documents/Project/ProcessedData/TrainingSet_B/p000902.psv
Saved to /Users/farhat/Documents/Project/ProcessedData/TrainingSet_B/p009098.psv
Saved to /Users/farhat/Documents/Project/ProcessedData/TrainingSet_B/p008386.psv
Saved to /Users/farhat/Documents/Project/ProcessedData/TrainingSet_B/p018195.psv
Saved to /Users/farhat/Documents/Project/ProcessedData/TrainingSet_B/p009926.psv
Saved to /Users/farhat/Documents/Project/ProcessedData/TrainingSet_B/p016806.psv
Saved to /Users/farhat/Documents/Project/ProcessedData/TrainingSet_B/p002873.psv
Saved to /Users/farhat/Documents/Project/ProcessedData/TrainingSet_B/p002867.psv
Saved to /Users/farhat/Documents/Project/ProcessedData/TrainingSet_B/p009932.psv
Saved to /Users/farhat/Documents/Project/ProcessedData/TrainingSet_B/p020378.psv
Saved to /Users/farhat/Documents/Project/ProcessedData/TrainingSet_B/p016812.psv
Saved to /Users/farhat/Docum

In [45]:
df=pd.read_csv("/Users/farhat/Documents/Project/ProcessedData/TrainingSet_B/p000006.psv", sep=',')
data=pd.DataFrame(columns=df.columns)
source_folder = '/Users/farhat/Documents/Project/ProcessedData/TrainingSet_A'
for filename in os.listdir(source_folder):
    # Check if it is a file with the desired extension, e.g. .psv
    if filename.endswith('.psv'):
        # Construct full file path
        file_path = os.path.join(source_folder, filename)
        
        # Read the file into a DataFrame
        df = pd.read_csv(file_path, sep=',')
        data = pd.concat([data, df], ignore_index=True)

  data = pd.concat([data, df], ignore_index=True)


In [5]:
data.head(10)

Unnamed: 0,HR,O2Sat,Temp,SBP,MAP,DBP,Resp,EtCO2,BaseExcess,HCO3,...,Age,Gender,Unit1,Unit2,HospAdmTime,ICULOS,ShockIndex,BUN_CR,PartialSOFA,SepsisLabel
0,109.0,100.0,36.56,118.0,86.0,70.0,18.5,,0.0,29.0,...,52.01,1,1,0,-0.03,3,0.923729,,0.0,0
1,111.0,100.0,36.72,127.0,90.33,71.995,43.0,,0.0,29.0,...,52.01,1,1,0,-0.03,4,0.874016,,0.0,0
2,107.0,99.0,36.72,122.0,78.0,56.0,43.0,,0.0,29.0,...,52.01,1,1,0,-0.03,5,0.877049,,0.0,0
3,106.0,99.0,36.72,101.0,73.0,59.0,43.0,,0.0,29.0,...,52.01,1,1,0,-0.03,6,1.049505,,0.0,0
4,103.0,97.0,36.72,112.0,78.0,61.0,43.0,,0.0,29.0,...,52.01,1,1,0,-0.03,7,0.919643,,0.0,0
5,100.0,99.0,36.67,122.0,85.33,66.995,43.0,,0.0,29.0,...,52.01,1,1,0,-0.03,8,0.819672,,0.0,0
6,96.0,98.0,36.67,111.0,75.67,58.005,43.0,,0.0,29.0,...,52.01,1,1,0,-0.03,9,0.864865,,0.0,0
7,95.0,98.0,36.67,129.0,95.0,78.0,43.0,,0.0,29.0,...,52.01,1,1,0,-0.03,10,0.736434,,0.0,0
8,94.0,99.0,36.67,144.0,100.0,78.0,43.0,,0.0,29.0,...,52.01,1,1,0,-0.03,11,0.652778,,0.0,0
9,95.0,98.0,36.72,115.0,89.0,76.0,43.0,,0.0,29.0,...,52.01,1,1,0,-0.03,12,0.826087,,0.0,0


In [6]:
data.shape

(17, 44)

In [51]:
data.to_csv("/Users/farhat/Documents/Project/ProcessedData/fullData.csv", index=False)