In [None]:
import numpy as np
import os, copy
import pandas as pd
import csv

# Load the raw vitals export once and keep it as a plain NumPy array.
path = r'PATH/data/pneumona_reduced_vitals.csv'
vitals = np.array(pd.read_csv(path))

# Column layout of each `vitals` row, in positional order:
#ROW_ID | SUBJECT_ID | HADM_ID | ICUSTAY_ID | ITEMID | CHARTTIME | STORETIME | CGID | VALUE | VALUENUM | VALUEUOM | WARNING | ERROR | RESULTSTATUS | STOPPED

In [None]:
# Count the distinct subjects (column 1, SUBJECT_ID) and the distinct
# hospital admissions (column 2, HADM_ID) present in the vitals array.
subject_col, hadm_col = vitals[:, 1], vitals[:, 2]
unique_ids = np.unique(subject_col)
unique_hids = np.unique(hadm_col)
num_unique_ids, num_unique_hids = len(unique_ids), len(unique_hids)
print("Number of distinct PATIENT_IDs:", num_unique_ids)
print("Number of distinct HADM_IDs:", num_unique_hids)


## Separate Pneumonia patients & write out vitals to distinct .csv's

In [None]:
# Split the mixed vitals file into one CSV per hospital admission (HADM_ID).
#
# The folder is created next to the input file so that it is the same
# location the reformatting step reads from (PATH/data/Pneumonia_sep),
# regardless of the notebook's working directory.
output_folder = os.path.join(os.path.dirname(path), 'Pneumonia_sep')
os.makedirs(output_folder, exist_ok=True)

# Read every field as text and keep blanks as empty strings, so values are
# written back exactly as they appear in the source file.
mixed = pd.read_csv(path, dtype=str, keep_default_na=False)

# Each admission file is written and closed before the next one is opened.
# This avoids holding one open handle per admission for the whole pass, which
# can exceed the OS open-file limit and leaks handles if a row fails.
for hadm_id, hadm_rows in mixed.groupby('HADM_ID', sort=False):
    output_file_name = os.path.join(output_folder, f'{hadm_id}.csv')
    hadm_rows.to_csv(output_file_name, index=False)

## Reformat each patient-visit level file

In [None]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# Pivot each per-admission file from long format (one row per reading) to
# wide format (one row per CHARTTIME, one column per ITEMID of interest).
input_dir = r'PATH/data/Pneumonia_sep'
output_dir = r'PATH/data/Pneumonia_sep_processed'

os.makedirs(output_dir, exist_ok=True)

# create a list of the ITEMID values you want to extract
item_ids = [211, 220045, #Heart Rate
            646, 220227, #Sp02 (blood oxygen level)
            8368, 220051, #Arterial BP Diastolic
            51, 220050, #Arterial BP Systolic
            615, 224690] #Total Resp
item_columns = [f'ITEMID_{item_id}' for item_id in item_ids]

csv_files = [filename for filename in os.listdir(input_dir) if filename.endswith('.csv')]

for filename in csv_files:
    df = pd.read_csv(os.path.join(input_dir, filename))

    # Collect one dict per CHARTTIME and build the frame once at the end;
    # DataFrame.append was removed in pandas 2.0 and was quadratic anyway.
    records = []

    # Group on the bare column name (not a one-element list) so the group key
    # is the CHARTTIME value itself rather than a 1-tuple on pandas >= 2.0.
    for charttime, group in df.groupby('CHARTTIME'):
        record = {'CHARTTIME': charttime}
        for item_id, column in zip(item_ids, item_columns):
            matches = group.loc[group['ITEMID'] == item_id, 'VALUE']
            # Keep the first reading for this ITEMID at this time; leave the
            # spot blank when there is none.
            record[column] = matches.iloc[0] if not matches.empty else ''
        records.append(record)

    # Passing the column list explicitly keeps the schema stable even when a
    # file contains no rows, so the next stage always finds every column.
    result = pd.DataFrame(records, columns=['CHARTTIME'] + item_columns)

    output_filename = os.path.join(output_dir, filename)
    result.to_csv(output_filename, index=False)
    print(f"Processed {filename}")

In [None]:
# Count the .csv files currently present in the processed folder.
folder_path = r'PATH/data/Pneumonia_sep_processed'
csv_files = list(filter(lambda entry: entry.endswith('.csv'), os.listdir(folder_path)))
num_csv_files = len(csv_files)
print(f"There are {num_csv_files} CSV files in Pneumonia_sep_processed.")

## Merge the columns of equivalent readings

In [None]:
# Collapse each pair of equivalent ITEMID columns into a single named vital.
input_dir = r'PATH/data/Pneumonia_sep_processed'
output_dir = r'PATH/data/Pneumonia_sep_col_combined'

os.makedirs(output_dir, exist_ok=True)

# vital name -> (preferred column, fallback column). The fallback value is
# used only where the preferred column is missing.
vital_sources = {
    'HR': ('ITEMID_211', 'ITEMID_220045'),
    'spo2': ('ITEMID_646', 'ITEMID_220227'),
    'dias': ('ITEMID_8368', 'ITEMID_220051'),
    'sys': ('ITEMID_51', 'ITEMID_220050'),
    'resp': ('ITEMID_615', 'ITEMID_224690'),
}

# Column order of the files written by this step.
output_columns = ['CHARTTIME', 'HR', 'sys', 'dias', 'resp', 'spo2']

for filename in os.listdir(input_dir):
    if not filename.endswith('.csv'):
        continue

    df = pd.read_csv(os.path.join(input_dir, filename))

    # combine_first replaces the deprecated fillna(method='backfill', axis=1)
    # idiom: take the preferred reading, fall back to the other column.
    for vital, (preferred, fallback) in vital_sources.items():
        df[vital] = df[preferred].combine_first(df[fallback])

    # Selecting the output columns also drops the original ITEMID_* columns.
    df = df[output_columns]

    output_path = os.path.join(output_dir, filename)
    df.to_csv(output_path, index=False)
    print(f"Processed {filename}")

In [None]:
# Count the .csv files currently present in the column-combined folder.
folder_path = r'PATH/data/Pneumonia_sep_col_combined'
csv_files = list(filter(lambda entry: entry.endswith('.csv'), os.listdir(folder_path)))
num_csv_files = len(csv_files)
print(f"There are {num_csv_files} CSV files in Pneumonia_sep_col_combined.")

## Aggregate each patient file by hour

In [None]:
# Aggregate every per-admission file into hourly means of each vital.
input_dir = r'PATH/data/Pneumonia_sep_col_combined'
output_dir = r'PATH/data/Pneumonia_agg_col_combined'

os.makedirs(output_dir, exist_ok=True)

# Order here determines the column order of the output files.
vital_columns = ['HR', 'spo2', 'dias', 'sys', 'resp']

for filename in os.listdir(input_dir):
    if not filename.endswith('.csv'):
        continue

    df = pd.read_csv(os.path.join(input_dir, filename))

    # Convert CHARTTIME to datetimes so it can be bucketed by hour.
    df['CHARTTIME'] = pd.to_datetime(df['CHARTTIME'])

    # Bucket by hour and take the mean of each vital. An explicit Hour offset
    # is used instead of the deprecated 'H' alias, and the string 'mean'
    # instead of np.mean, which pandas now warns about inside agg().
    hourly = pd.Grouper(key='CHARTTIME', freq=pd.offsets.Hour())
    grouped_df = df.groupby(hourly).agg({column: ['mean'] for column in vital_columns})

    # Flatten the (vital, 'mean') column index into names like 'HR_mean'.
    grouped_df.columns = ['_'.join(col).strip() for col in grouped_df.columns.values]

    # Put the hour label first as DATE_HOUR; the datetime index is not written.
    grouped_df.insert(0, 'DATE_HOUR', grouped_df.index.strftime('%Y-%m-%d %H:00:00'))

    # Save under the same file name in the output directory.
    grouped_df.to_csv(os.path.join(output_dir, filename), index=False)


In [None]:
# Count the .csv files currently present in the hourly-aggregated folder.
folder_path = r'PATH/data/Pneumonia_agg_col_combined'
csv_files = list(filter(lambda entry: entry.endswith('.csv'), os.listdir(folder_path)))
num_csv_files = len(csv_files)
print(f"There are {num_csv_files} CSV files in Pneumonia_agg_col_combined.")

## Select First 4 hours of vitals

In [None]:
# Count the aggregated files that hold fewer than 4 hourly rows.
output_dir = r'PATH/data/Pneumonia_agg_col_combined/'

# One header line plus four hourly rows.
min_lines = 5

count = 0
for filename in os.listdir(output_dir):
    if not filename.endswith('.csv'):
        continue
    with open(os.path.join(output_dir, filename), 'r', newline='') as csvfile:
        # Read all lines in one pass. Pulling five rows with next() first
        # raised StopIteration on exactly the short files being counted.
        all_lines = list(csv.reader(csvfile))

    if len(all_lines) < min_lines:
        count += 1

print('Number of files less than 4 hours of data :', count)


## Separate intubated and non-intubated patients

In [None]:
# Copy the header and first 4 hourly rows of every admission into the
# INTUBATED or NOT_INTUBATED folder, according to the patient flag file.
int_dir = 'PATH/data/INTUBATED/'
not_int_dir = 'PATH/data/NOT_INTUBATED/'
for directory in [int_dir, not_int_dir]:
    os.makedirs(directory, exist_ok=True)

directory = r'PATH/data/Pneumonia_agg_col_combined/'
flag_file = r'PATH/data/pneumonia_patients.csv'

# One header line plus four hourly rows.
lines_to_keep = 5


def _flag_is_set(raw_flag):
    """Return True when the INTUBATED cell text represents a positive flag.

    csv.DictReader yields strings, and any non-empty string (including '0')
    is truthy, so the text has to be parsed rather than tested directly.
    Numeric text is set when non-zero; otherwise a few common spellings of
    "true" are accepted. Blank and NaN cells count as not set.
    """
    text = (raw_flag or '').strip().lower()
    try:
        value = float(text)
    except ValueError:
        return text in ('true', 't', 'yes', 'y')
    return value == value and value != 0  # value != value only for NaN


with open(flag_file, 'r', newline='') as flagfile:
    for row in csv.DictReader(flagfile):
        filename = row['HADM_ID'] + '.csv'
        source_path = os.path.join(directory, filename)
        if not os.path.exists(source_path):
            continue

        with open(source_path, 'r', newline='') as csvfile:
            # zip stops after lines_to_keep rows, or earlier if the file is
            # shorter, instead of raising StopIteration like next() did.
            lines = [line for _, line in zip(range(lines_to_keep), csv.reader(csvfile))]

        # Admissions with fewer than 4 hours of data are skipped so that every
        # output file has the same number of rows.
        if len(lines) < lines_to_keep:
            continue

        if _flag_is_set(row['INTUBATED']):
            save_directory, other_directory = int_dir, not_int_dir
        else:
            save_directory, other_directory = not_int_dir, int_dir

        # Remove a copy left in the other folder by an earlier run, so an
        # admission never appears under both labels.
        stale_path = os.path.join(other_directory, filename)
        if os.path.exists(stale_path):
            os.remove(stale_path)

        # newline='' lets the csv module control line endings (no blank rows
        # on Windows).
        with open(os.path.join(save_directory, filename), 'w', newline='') as newfile:
            csv.writer(newfile).writerows(lines)

In [None]:
# Count the .csv files currently present in the INTUBATED folder.
folder_path = r'PATH/data/INTUBATED/'
csv_files = list(filter(lambda entry: entry.endswith('.csv'), os.listdir(folder_path)))
num_csv_files = len(csv_files)
print(f"There are {num_csv_files} CSV files in INTUBATED.")

In [None]:
# Count the .csv files currently present in the NOT_INTUBATED folder.
folder_path = r'PATH/data/NOT_INTUBATED/'
csv_files = list(filter(lambda entry: entry.endswith('.csv'), os.listdir(folder_path)))
num_csv_files = len(csv_files)
print(f"There are {num_csv_files} CSV files in NOT_INTUBATED.")

## Flatten Positive & Negative readings

In [None]:
# Flatten every admission file into a single row, write one file per class,
# then write a labelled file holding both classes.
output_dir = r'PATH/data/Flattened_Vitals/'
os.makedirs(output_dir, exist_ok=True)

int_dir = r'PATH/data/INTUBATED/'
not_int_dir = r'PATH/data/NOT_INTUBATED/'

int_out_file = "PATH/data/Flattened_Vitals/intubated.csv"
nint_out_file = "PATH/data/Flattened_Vitals/notintubated.csv"

comb_file = "PATH/data/Flattened_Vitals/combined.csv"


def _flatten_directory(source_dir, out_file):
    """Flatten each CSV in `source_dir` to one row and write them to `out_file`.

    A flattened row is the file's base name (the hadm_id) followed by every
    data row's cells except its first column, concatenated in file order.
    `out_file` is opened once in write mode, so it is replaced on every run
    and exists even when `source_dir` holds no CSVs.

    Returns the flattened rows as a list of lists of strings.
    """
    flattened = []
    # Sorted so the row order, and any seeded split made from it later, does
    # not depend on the order the filesystem lists files in.
    for file in sorted(os.listdir(source_dir)):
        if not file.endswith('.csv'):
            continue
        with open(os.path.join(source_dir, file), newline='') as samp:
            reader = csv.reader(samp)
            next(reader, None)  # skip the header via the reader itself
            flat_row = [os.path.splitext(file)[0]]  # hadm_id as the first column
            for row in reader:
                flat_row.extend(row[1:])  # skip the first column
        flattened.append(flat_row)

    with open(out_file, 'w', newline='') as fp:
        csv.writer(fp, dialect='excel').writerows(flattened)
    return flattened


int_rows = _flatten_directory(int_dir, int_out_file)
nint_rows = _flatten_directory(not_int_dir, nint_out_file)

# Prepend the class label (1 = intubated, 0 = not intubated) and stack both
# classes. Building the frame from the in-memory rows avoids re-reading the
# per-class files, which fails when a class is empty or rows differ in length;
# shorter rows are padded with blanks instead.
labelled_rows = [[1] + flat_row for flat_row in int_rows]
labelled_rows += [[0] + flat_row for flat_row in nint_rows]
result = pd.DataFrame(labelled_rows)

# write the combined data without a header row or index
result.to_csv(comb_file, index=False, header=False)

## Remove Error/missing values, split test/train, scale for deep learning

In [None]:
import os
import csv
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [None]:
# Split the flattened dataset into training and testing sets.
# combined.csv has no header row: column 0 is the intubation label,
# column 1 the hadm_id, and the remaining columns are the vitals.
df = pd.read_csv("PATH/data/Flattened_Vitals/combined.csv", header=None)
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Separate the labels from the features and drop the hadm_id column (location 1).
train_results = train_df.iloc[:, 0]
train_data = train_df.iloc[:, 2:]
test_results = test_df.iloc[:, 0]
test_data = test_df.iloc[:, 2:]

# Save the four pieces next to combined.csv. header=False keeps the integer
# column labels out of the files: they are read back with header=None, so a
# header line would be treated as an extra sample.
train_results.to_csv("PATH/data/Flattened_Vitals/train_results.csv", index=False, header=False)
test_results.to_csv("PATH/data/Flattened_Vitals/test_results.csv", index=False, header=False)
train_data.to_csv("PATH/data/Flattened_Vitals/train_data.csv", index=False, header=False)
# This path previously omitted "data/", so test_data.csv landed in a different
# folder from the one the scaling step reads it from.
test_data.to_csv("PATH/data/Flattened_Vitals/test_data.csv", index=False, header=False)

In [None]:
# Replace all error values, non-numbers, and zeros with blanks - do this BEFORE creating a scaled version
# Each file is rewritten in place.

input_dir = r'PATH/data/Flattened_Vitals/'


def _blank_if_invalid(cell):
    """Return `cell` unchanged if it is a finite number above zero, else ''.

    The test is numeric, so zeros written as '0.0' and negatives are caught
    as well as '0'. Non-numeric text, blanks, 'nan' and 'inf' also become ''.
    """
    try:
        value = float(cell)
    except ValueError:
        return ''
    # The chained comparison is False for NaN, so NaN is blanked too.
    return cell if 0 < value < float('inf') else ''


for filename in os.listdir(input_dir):
    if not filename.endswith('.csv'):
        continue
    # Label files legitimately contain 0, and scaled files legitimately
    # contain negatives, so neither is touched here.
    # NOTE(review): the label column of combined.csv is still blanked for
    # class 0; the later zero-fill step turns those blanks back into 0.
    if filename.endswith('_results.csv') or filename.startswith('scaled_'):
        continue

    file_path = os.path.join(input_dir, filename)
    with open(file_path, 'r', newline='') as infile:
        new_rows = [[_blank_if_invalid(cell) for cell in row]
                    for row in csv.reader(infile)]
    with open(file_path, 'w', newline='') as outfile:
        csv.writer(outfile).writerows(new_rows)

In [None]:
# Standardise the feature files while missing cells are still blank, i.e.
# before the zero-fill step runs.
train_dat = pd.read_csv(r'PATH/data/Flattened_Vitals/train_data.csv', header=None)
test_dat = pd.read_csv(r'PATH/data/Flattened_Vitals/test_data.csv', header=None)

# Fit the scaler on the training split only, then apply it to both splits.
sc = StandardScaler()
sc.fit(train_dat)
X_train = sc.transform(train_dat)
X_test = sc.transform(test_dat)

# Write the scaled arrays as comma-separated text.
for scaled_path, scaled_values in (
    ("PATH/data/Flattened_Vitals/scaled_test_data.csv", X_test),
    ("PATH/data/Flattened_Vitals/scaled_train_data.csv", X_train),
):
    np.savetxt(scaled_path, scaled_values, delimiter=",")

In [None]:
# Copy every CSV from Flattened_Vitals (scaled and non-scaled alike) into
# Cleaned_Vitals, turning each empty, 'nan' or non-numeric cell into '0'.

input_dir = r'PATH/data/Flattened_Vitals/'
output_dir = r'PATH/data/Cleaned_Vitals/'

# make sure the destination folder is there
if not os.path.exists(output_dir):
    os.makedirs(output_dir)


def _zero_if_missing(cell):
    """Return '0' for 'nan' or any text float() rejects; otherwise `cell`."""
    if cell == 'nan':
        return '0'
    try:
        float(cell)
    except ValueError:
        return '0'
    return cell


for filename in os.listdir(input_dir):
    if not filename.endswith('.csv'):
        continue
    with open(os.path.join(input_dir, filename), 'r') as infile:
        new_rows = [[_zero_if_missing(cell) for cell in row]
                    for row in csv.reader(infile)]
    with open(os.path.join(output_dir, filename), 'w', newline='') as outfile:
        csv.writer(outfile).writerows(new_rows)
