In [1]:
import os
import pandas as pd
from sklearn.impute import KNNImputer
import numpy as np

Read and Combine PSV Files:

In [None]:
# Path to your PSV files folder
folder_path = 'training/'
# List to store individual dataframes
dataframes = []

# Loop through files in the folder and read them into dataframes.
# os.listdir() returns names in arbitrary order, so the names are sorted to
# make the row order of the combined frame reproducible between runs.
for filename in sorted(os.listdir(folder_path)):
    if not filename.endswith('.psv'):
        continue

    # Extract patient ID from the filename (expected format 'pXXXXXX.psv').
    # The stem is validated explicitly so an unexpected file name fails with a
    # clear message instead of an opaque IndexError/ValueError.
    stem = os.path.splitext(filename)[0]
    if not (stem.startswith('p') and stem[1:].isdecimal()):
        raise ValueError(
            "Unexpected PSV file name {!r}; expected 'p<digits>.psv'".format(filename)
        )
    patient_id = int(stem[1:])

    file_path = os.path.join(folder_path, filename)
    df = pd.read_csv(file_path, sep='|')

    # Add patient ID column to the dataframe
    df.insert(0, 'patient_id', patient_id)

    dataframes.append(df)

# Fail early with a readable message: pd.concat on an empty list raises a
# ValueError that does not mention the folder.
if not dataframes:
    raise ValueError("No .psv files found in {!r}".format(folder_path))

# Combine all dataframes into one
combined_df = pd.concat(dataframes, ignore_index=True)



In [None]:
# Calculate days since admission based on ICULOS for each row.
# Kept as a standalone Series (index-aligned with combined_df) instead of a
# temporary column, so combined_df is not modified and nothing has to be
# dropped afterwards.
days_since_admission = combined_df['ICULOS'] / 24  # Assuming ICULOS is in hours

# Identify patients that have at least one row past day 4
patients_to_keep = combined_df.loc[days_since_admission > 4, 'patient_id'].unique()

# Create subDataset containing every row of the patients that meet the condition.
# .copy() makes subDataset an independent frame: later cells assign new columns
# to it, which on a slice of combined_df triggers SettingWithCopyWarning.
subDataset = combined_df[combined_df['patient_id'].isin(patients_to_keep)].copy()


In [None]:
# Display the filtered dataset (rows of patients with at least one record past day 4)
subDataset

In [None]:
import pandas as pd
from sklearn.impute import KNNImputer


In [None]:
# Columns whose missing values will be imputed. The list is defined in this
# cell so that displaying it works on a fresh kernel (Restart & Run All);
# previously the name was referenced before any cell had assigned it.
columns_to_impute = ['HR', 'Temp', 'WBC', 'MAP', 'Platelets', 'Bilirubin_total', 'Creatinine', 'Resp']
columns_to_impute

In [None]:
# Features whose gaps are filled in by the imputer
columns_to_impute = [
    'HR', 'Temp', 'WBC', 'MAP',
    'Platelets', 'Bilirubin_total', 'Creatinine', 'Resp',
]

# Nearest-neighbour imputer configured with 5 neighbours
imputer = KNNImputer(n_neighbors=5)

# Fit on the selected columns and write the completed values back into
# the same columns of subDataset
subDataset[columns_to_impute] = imputer.fit_transform(
    subDataset[columns_to_impute]
)


In [None]:
# Build the 'SIRS' flag: 1 when HR, Temp and WBC all fall within the ranges
# below, 0 otherwise.
# NOTE(review): as written, the flag is set for rows whose values lie INSIDE
# these ranges — confirm this is the intended meaning of the 'SIRS' label.
hr_in_range = subDataset['HR'].between(60, 100)
temp_in_range = subDataset['Temp'].between(36, 38.3)
wbc_in_range = subDataset['WBC'].between(4.0, 12.0)
subDataset['SIRS'] = (hr_in_range & temp_in_range & wbc_in_range).astype(int)

# Relocate 'SIRS' so it sits in the second column position
sirs_column = subDataset.pop('SIRS')
subDataset.insert(1, 'SIRS', sirs_column)


In [None]:
# Display the dataset again, now with imputed values and 'SIRS' as the second column
subDataset

In [None]:
# Function to classify patients into SOFA groups
def classify_sofa_group(row):
    """Assign a row to a SOFA group (0-4), or None when no group matches.

    A group is returned only when ALL five measurements fall in that group's
    band at the same time; any mixed combination is left unclassified.

    Parameters
    ----------
    row : mapping
        Must provide the keys 'Resp', 'Platelets', 'Bilirubin_total', 'MAP'
        and 'Creatinine' (e.g. a DataFrame row passed by ``apply(axis=1)``).

    Returns
    -------
    int or None
        The group number 0-4, or None for rows that match no group.

    Notes
    -----
    Bilirubin and creatinine bands are contiguous half-open intervals
    (<1.2, [1.2, 2.0), [2.0, 6.0)/[2.0, 3.5), ...), so continuous values such
    as 1.95 or exactly 1.2 are not dropped into a gap between bands.
    NOTE(review): 'Resp' is compared against 400/300/200/100 — presumably
    these thresholds are meant for a PaO2/FiO2 ratio; confirm that the 'Resp'
    column actually holds that quantity.
    """
    respiration = row['Resp']
    platelets = row['Platelets']
    bilirubins = row['Bilirubin_total']
    arterial_media = row['MAP']
    creatinine = row['Creatinine']

    # Group 0: every measurement on the normal side of its first threshold.
    # Creatinine must be BELOW 1.2 here (group 1 starts at 1.2).
    if (respiration >= 400 and platelets >= 150 and bilirubins < 1.2
            and arterial_media >= 70 and creatinine < 1.2):
        return 0
    elif (respiration < 400 and platelets < 150 and 1.2 <= bilirubins < 2.0
            and arterial_media < 70 and 1.2 <= creatinine < 2.0):
        return 1
    elif (respiration < 300 and platelets < 100 and 2.0 <= bilirubins < 6.0
            and arterial_media < 70 and 2.0 <= creatinine < 3.5):
        return 2
    elif (respiration < 200 and platelets < 50 and 6.0 <= bilirubins < 12.0
            and arterial_media < 70 and 3.5 <= creatinine < 5.0):
        return 3
    elif (respiration < 100 and platelets < 20 and bilirubins >= 12.0
            and arterial_media < 70 and creatinine >= 5.0):
        return 4
    else:
        return None  # Return None for unclassified rows

# Apply the classification function to every row (axis=1) to create the 'SOFA' column
subDataset['SOFA'] = subDataset.apply(classify_sofa_group, axis=1)



In [None]:
# Move the existing 'SOFA' column to be the third column in the DataFrame.
# The column is already computed by the preceding cell, so the expensive
# row-wise apply is not repeated here.
subDataset.insert(2, 'SOFA', subDataset.pop('SOFA'))

In [None]:
# Write the processed dataset to 'ehr.csv', leaving out the index column
subDataset.to_csv(path_or_buf='ehr.csv', index=False)