In [1]:
import os
import pandas as pd
from sklearn.impute import KNNImputer
import numpy as np

Read and Combine PSV Files:

In [2]:
# Path to your PSV files folder
folder_path = 'training/'
# List to store individual dataframes
dataframes = []

# Loop through files in the folder and read them into dataframes
for filename in os.listdir(folder_path):
    if filename.endswith('.psv'):
        file_path = os.path.join(folder_path, filename)
        df = pd.read_csv(file_path, sep='|')
        
        # Extract patient ID from the filename (assuming filename format 'pXXXXXX.psv')
        patient_id = int(filename.split('p')[1].split('.')[0])  # Extract numeric part from the filename
        
        # Add patient ID column to the dataframe
        df.insert(0, 'patient_id', patient_id)
        
        dataframes.append(df)

# Combine all dataframes into one
combined_df = pd.concat(dataframes, ignore_index=True)



In [3]:
# Calculate days since admission based on ICULOS for each row
combined_df['days_since_admission'] = combined_df['ICULOS'] / 24  # Assuming ICULOS is in hours

# Identify patients that meet the condition
patients_to_keep = combined_df.loc[combined_df['days_since_admission'] > 4, 'patient_id'].unique()

# Create subDataset containing rows for patients that meet the condition
subDataset = combined_df[combined_df['patient_id'].isin(patients_to_keep)]

# Drop the temporary 'days_since_admission' column if it's no longer needed
subDataset.drop(columns=['days_since_admission'], inplace=True)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subDataset.drop(columns=['days_since_admission'], inplace=True)


In [4]:
subDataset

Unnamed: 0,patient_id,HR,O2Sat,Temp,SBP,MAP,DBP,Resp,EtCO2,BaseExcess,...,WBC,Fibrinogen,Platelets,Age,Gender,Unit1,Unit2,HospAdmTime,ICULOS,SepsisLabel
448,373,77.0,100.0,37.39,161.0,,,18.5,,,...,,,,56.99,1,,,-0.02,2,0
449,373,78.0,99.0,,129.0,,,18.0,,,...,,,,56.99,1,,,-0.02,3,0
450,373,74.0,100.0,,144.5,102.0,74.0,20.0,,,...,,,,56.99,1,,,-0.02,4,0
451,373,77.0,100.0,36.83,150.5,103.0,78.0,18.0,,,...,,,,56.99,1,,,-0.02,5,0
452,373,78.0,100.0,,134.0,99.0,80.0,18.0,,,...,,,,56.99,1,,,-0.02,6,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14941,311,92.0,100.0,,,,,27.0,,,...,,,,64.57,0,1.0,0.0,-0.02,95,1
14942,311,100.0,91.0,,143.0,67.0,,29.0,,,...,,,,64.57,0,1.0,0.0,-0.02,96,1
14943,311,99.0,99.0,37.22,152.0,90.0,,29.0,,,...,,,,64.57,0,1.0,0.0,-0.02,97,1
14944,311,96.0,96.0,,118.0,67.0,,16.0,,,...,,,,64.57,0,1.0,0.0,-0.02,98,1


In [5]:
# Calculate the 'SIRS' column
subDataset['SIRS'] = (subDataset['HR'].between(60, 100) &
                      subDataset['Temp'].between(36, 38.3) &
                      subDataset['WBC'].between(4.0, 12.0)).astype(int)

# Insert 'SIRS' column as the second column in the DataFrame
subDataset.insert(1, 'SIRS', subDataset.pop('SIRS'))


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subDataset['SIRS'] = (subDataset['HR'].between(60, 100) &


In [6]:
subDataset

Unnamed: 0,patient_id,SIRS,HR,O2Sat,Temp,SBP,MAP,DBP,Resp,EtCO2,...,WBC,Fibrinogen,Platelets,Age,Gender,Unit1,Unit2,HospAdmTime,ICULOS,SepsisLabel
448,373,0,77.0,100.0,37.39,161.0,,,18.5,,...,,,,56.99,1,,,-0.02,2,0
449,373,0,78.0,99.0,,129.0,,,18.0,,...,,,,56.99,1,,,-0.02,3,0
450,373,0,74.0,100.0,,144.5,102.0,74.0,20.0,,...,,,,56.99,1,,,-0.02,4,0
451,373,0,77.0,100.0,36.83,150.5,103.0,78.0,18.0,,...,,,,56.99,1,,,-0.02,5,0
452,373,0,78.0,100.0,,134.0,99.0,80.0,18.0,,...,,,,56.99,1,,,-0.02,6,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14941,311,0,92.0,100.0,,,,,27.0,,...,,,,64.57,0,1.0,0.0,-0.02,95,1
14942,311,0,100.0,91.0,,143.0,67.0,,29.0,,...,,,,64.57,0,1.0,0.0,-0.02,96,1
14943,311,0,99.0,99.0,37.22,152.0,90.0,,29.0,,...,,,,64.57,0,1.0,0.0,-0.02,97,1
14944,311,0,96.0,96.0,,118.0,67.0,,16.0,,...,,,,64.57,0,1.0,0.0,-0.02,98,1


In [7]:
# Function to classify patients into SOFA groups
def classify_sofa_group(row):
    respiration = row['Resp']
    platelets = row['Platelets']
    bilirubins = row['Bilirubin_total']
    arterial_media = row['MAP']
    creatinine = row['Creatinine']
    
    if respiration >= 400 and platelets >= 150 and bilirubins < 1.2 and arterial_media >= 70 and creatinine > 1.2:
        return 0
    elif respiration < 400 and platelets < 150 and 1.2 < bilirubins < 1.9 and arterial_media < 70 and 1.2 < creatinine < 1.9:
        return 1
    elif respiration < 300 and platelets < 100 and 2.0 < bilirubins < 5.9 and arterial_media < 70 and 2.0 < creatinine < 3.4:
        return 2
    elif respiration < 200 and platelets < 50 and 6.0 < bilirubins < 11.9 and arterial_media < 70 and 3.5 < creatinine < 4.9:
        return 3
    elif respiration < 100 and platelets < 20 and bilirubins > 12.0 and arterial_media < 70 and creatinine > 5.0:
        return 4
    else:
        return None  # Return None for unclassified rows

# Apply the classification function to create 'SOFA_Group' column
subDataset['SOFA'] = subDataset.apply(classify_sofa_group, axis=1)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subDataset['SOFA'] = subDataset.apply(classify_sofa_group, axis=1)


In [8]:
subDataset['SOFA'] = subDataset.apply(classify_sofa_group, axis=1)

# Insert 'SOFA_Group' column as the third column in the DataFrame
subDataset.insert(2, 'SOFA', subDataset.pop('SOFA'))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subDataset['SOFA'] = subDataset.apply(classify_sofa_group, axis=1)


In [9]:
subDataset

Unnamed: 0,patient_id,SIRS,SOFA,HR,O2Sat,Temp,SBP,MAP,DBP,Resp,...,WBC,Fibrinogen,Platelets,Age,Gender,Unit1,Unit2,HospAdmTime,ICULOS,SepsisLabel
448,373,0,,77.0,100.0,37.39,161.0,,,18.5,...,,,,56.99,1,,,-0.02,2,0
449,373,0,,78.0,99.0,,129.0,,,18.0,...,,,,56.99,1,,,-0.02,3,0
450,373,0,,74.0,100.0,,144.5,102.0,74.0,20.0,...,,,,56.99,1,,,-0.02,4,0
451,373,0,,77.0,100.0,36.83,150.5,103.0,78.0,18.0,...,,,,56.99,1,,,-0.02,5,0
452,373,0,,78.0,100.0,,134.0,99.0,80.0,18.0,...,,,,56.99,1,,,-0.02,6,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14941,311,0,,92.0,100.0,,,,,27.0,...,,,,64.57,0,1.0,0.0,-0.02,95,1
14942,311,0,,100.0,91.0,,143.0,67.0,,29.0,...,,,,64.57,0,1.0,0.0,-0.02,96,1
14943,311,0,,99.0,99.0,37.22,152.0,90.0,,29.0,...,,,,64.57,0,1.0,0.0,-0.02,97,1
14944,311,0,,96.0,96.0,,118.0,67.0,,16.0,...,,,,64.57,0,1.0,0.0,-0.02,98,1


In [10]:
subDataset["SOFA"].unique()

array([None], dtype=object)