# **Time series data extraction for feature set 1**

Feature set 1 is defined as the features that are the most popular from literature and clinically available, with the WAVE study used as a proxy.

For all patients in the derived set, time series data for each of these features will be extracted from the chart events table.

The data will then be observed, cleaned and pre-processed as required to train an LSTM model.

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np

In [None]:
# Extract id, extubation starttime and annotation columns from patient data
patient_df = patient_df[['subject_id', 'extubation_starttime', 'extubation_failure']]
patient_df.head()

Unnamed: 0,subject_id,extubation_starttime,extubation_failure
0,10001884,2131-01-12 17:40:00,1
22,10002428,2156-04-22 17:10:00,0
29,10004235,2196-02-27 16:28:00,1
32,10004720,2186-11-17 14:00:00,1
33,10004733,2174-12-07 16:20:00,0


In [None]:
# Initialize a dictionary to hold the null counts
null_counts = {
    'subject_id': 0,
    'charttime': 0,
    'itemid': 0,
    'valuenum': 0
}

In [None]:
chartevents_path = '/content/drive/MyDrive/MSc_Final_Project/02_data_analysis/mimic/mimic-iv-2.2-raw-data/icu/chartevents.csv'

In [None]:
# Read in chartevents file into pandas and process in chunks and count null values
chartevents_path = '/content/drive/MyDrive/MSc_Final_Project/02_data_analysis/mimic/mimic-iv-2.2-raw-data/icu/chartevents.csv'

processed_rows = 0

for chunk in pd.read_csv(chartevents_path, chunksize=100000, parse_dates=['charttime']):
  # Filter the columns
    chunk = chunk[['subject_id', 'charttime', 'itemid', 'valuenum']].copy()

    # Convert subject_id to nullable integer type if necessary
    chunk.loc[:, 'subject_id'] = chunk['subject_id'].astype('Int64')

    # Count the null values in the current chunk
    chunk_null_counts = chunk.isnull().sum()

    # Aggregate the null counts
    for column in null_counts:
        null_counts[column] += chunk_null_counts[column]

    processed_rows += len(chunk)

    if processed_rows % 1000000 == 0:
        print(f"Processed {processed_rows} rows")

# Print the results
for column, count in null_counts.items():
    print(f"Column '{column}': {count} null values")

Processed 1000000 rows
Processed 2000000 rows
Processed 3000000 rows
Processed 4000000 rows
Processed 5000000 rows
Processed 6000000 rows
Processed 7000000 rows
Processed 8000000 rows
Processed 9000000 rows
Processed 10000000 rows
Processed 11000000 rows
Processed 12000000 rows
Processed 13000000 rows
Processed 14000000 rows
Processed 15000000 rows
Processed 16000000 rows
Processed 17000000 rows
Processed 18000000 rows
Processed 19000000 rows
Processed 20000000 rows
Processed 21000000 rows
Processed 22000000 rows
Processed 23000000 rows
Processed 24000000 rows
Processed 25000000 rows
Processed 26000000 rows
Processed 27000000 rows
Processed 28000000 rows
Processed 29000000 rows
Processed 30000000 rows
Processed 31000000 rows
Processed 32000000 rows
Processed 33000000 rows
Processed 34000000 rows
Processed 35000000 rows
Processed 36000000 rows
Processed 37000000 rows
Processed 38000000 rows
Processed 39000000 rows
Processed 40000000 rows
Processed 41000000 rows
Processed 42000000 rows
P

In [None]:
# Print the results
for column, count in null_counts.items():
    print(f"Column '{column}': {count} null values")

Column 'subject_id': 0 null values
Column 'charttime': 0 null values
Column 'itemid': 1 null values
Column 'valuenum': 185521099 null values


Now trying to execute task to extract data.

In [None]:
from datetime import timedelta

In [None]:
def extract_patient_data(patient_df, chartevents_file_path, features, time_window):
  results = []

  chunksize = 100000

  features_set = set(features)

  processed_rows = 0

  # Read and process file in chunks
  for chunk in pd.read_csv(chartevents_file_path, chunksize=chunksize, parse_dates=['charttime']):
    # Filter the columns
    chunk = chunk[['subject_id', 'charttime', 'itemid', 'valuenum']].copy()

    # Drop rows where itemid or valuenum is null
    chunk = chunk.dropna(subset=['itemid', 'valuenum'])

    # Convert subject_id to nullable integer type if necessary
    chunk.loc[:, 'subject_id'] = chunk['subject_id'].astype('Int64')

    # Filter by itemid
    chunk = chunk[chunk['itemid'].isin(features_set)]

    # Merge with patient_df to get the extubation_starttime and extubation_failure columns
    merged = pd.merge(chunk, patient_df, on='subject_id', how='inner')

    # Filter by time window
    merged = merged[(merged['charttime'] >= merged['extubation_starttime'] - time_window)
              & (merged['charttime'] <= merged['extubation_starttime'])]

    results.append(merged)

    processed_rows += len(chunk)

    if processed_rows % 10000 == 0:
        print(f"Processed {processed_rows} rows")


  results_df = pd.concat(results)

  # Extract the data and labels
  data = result_df[['subject_id', 'charttime', 'itemid', 'valuenum']]
  labels = result_df[['subject_id', 'extubation_starttime', 'extubation_failure']]

  return data, labels

In [None]:
# State the features to be extracted
feature_set_1_dynamic = [220210, 220277, 228640, 220235, 223830, 220224,
                         220228, 223835, 224685, 224686, 224695, 224687, 224696, 223849, 224419]

In [None]:
chartevents_path

'/content/drive/MyDrive/MSc_Final_Project/02_data_analysis/mimic/mimic-iv-2.2-raw-data/icu/chartevents.csv'

In [None]:
time_window_hours = timedelta(hours=6)

In [None]:
# Test out on patient file
data, labels = extract_patient_data(patient_df, chartevents_path, feature_set_1_dynamic, time_window_hours)

print(data.head())
print(labels.head())



KeyboardInterrupt: 

Merging is slow so will try without merging and filtering directly.

In [None]:
!pip install tqdm



In [None]:
from tqdm import tqdm

In [None]:
# Test on patient set
data, labels = extract_patient_data_2(patient_df, chartevents_path, feature_set_1_dynamic, time_window_hours)

print(data.head())
print(labels.head())

1
3
4
5
6
New chunk
New chunk


ParserError: Error tokenizing data. C error: Calling read(nbytes) on source failed. Try engine='python'.

In [None]:
print(data.head())
print(labels.head())

   subject_id           charttime  itemid  valuenum
0    10001884 2131-01-12 15:00:00  223835      40.0
1    10001884 2131-01-12 15:00:00  224685     284.0
2    10001884 2131-01-12 15:00:00  224686     284.0
3    10001884 2131-01-12 15:00:00  224687       6.1
4    10001884 2131-01-12 15:00:00  224695      17.0
   subject_id  extubation_failure
0    10001884                   1
1    10001884                   1
2    10001884                   1
3    10001884                   1
4    10001884                   1


In [None]:
def process_chunk_2(chunk, patient_dict, itemid_set, time_window):
    # Filter the columns
    chunk = chunk[['subject_id', 'charttime', 'itemid', 'valuenum']].copy()

    # Drop rows where itemid or valuenum is null
    chunk = chunk.dropna(subset=['itemid', 'valuenum'])

    # Convert subject_id to nullable integer type if necessary
    chunk.loc[:, 'subject_id'] = chunk['subject_id'].astype('Int64')

    # Filter by subject_id
    chunk = chunk[chunk['subject_id'].isin(patient_dict.keys())]

    # Filter by itemid
    chunk = chunk[chunk['itemid'].isin(itemid_set)]

    # Apply the time window filter using the patient_dict
    filtered_rows = []
    for idx, row in chunk.iterrows():
        extubation_time = patient_dict[row['subject_id']]['extubation_starttime']
        if (row['charttime'] >= extubation_time - time_window) and (row['charttime'] <= extubation_time):
            filtered_rows.append(row)

    filtered_df = pd.DataFrame(filtered_rows)
    if not filtered_df.empty:
        filtered_df['extubation_starttime'] = filtered_df['subject_id'].map(lambda x: patient_dict[x]['extubation_starttime'])
        filtered_df['extubation_failure'] = filtered_df['subject_id'].map(lambda x: patient_dict[x]['extubation_failure'])

    return filtered_df

In [None]:
def extract_patient_data_2(patient_df, chartevents_file_path, itemids, time_window):
    # Initialize a list to hold the results
    results = []

    # Define the chunk size
    chunksize = 100000

    # Convert itemids to a set for faster lookup
    itemid_set = set(itemids)

    # Convert patient_df to a dictionary for fast lookups
    patient_dict = patient_df.set_index('subject_id').to_dict('index')

    # # Get the total number of rows in the file
    # total_rows = sum(1 for row in open(chartevents_file_path)) - 1  # Subtract 1 for the header row


    # Track the number of processed rows
    processed_rows = 0

    # Read and process the file in chunks
    for chunk in pd.read_csv(chartevents_file_path, parse_dates=['charttime'], chunksize=chunksize):

      # Process each chunk and filter the data
      filtered_df = process_chunk_2(chunk, patient_dict, itemid_set, time_window)

      # Append the filtered data to the results list
      results.append(filtered_df)

      # Update the processed rows count
      processed_rows += len(chunk)

      # Print the progress every 100,000 rows
      if processed_rows % 10000000 == 0:
          print(f"Processed {processed_rows} rows")


    # Concatenate all the results
    result_df = pd.concat(results, ignore_index=True)

    # Check if result_df is empty and handle it accordingly
    if result_df.empty:
        print("Warning: No data found for the given criteria.")
        return pd.DataFrame(), pd.DataFrame() # Return empty DataFrames


    # Extract the data and labels
    data = result_df[['subject_id', 'charttime', 'itemid', 'valuenum']]
    labels = result_df[['subject_id','extubation_failure']]

    return data, labels

In [None]:
# Now can run this for all patient files
patient_chunk_files = [f'/content/drive/MyDrive/MSc_Final_Project/02_data_analysis/mimic/data_analysis/datasets/03_annotated_set/final_set_split_files/patient_chunk_{i}.parquet' for i in range(10)]

output_dir = '/content/drive/MyDrive/MSc_Final_Project/02_data_analysis/mimic/data_analysis/datasets/05_time_series_data_preprocessing/feature_set_1_results'

In [None]:
for i, patient_file in enumerate(patient_chunk_files):
  print(f'Processing patient file {patient_file}')

  patient_df = pd.read_parquet(patient_file)

  data, labels = extract_patient_data_2(patient_df, chartevents_path, feature_set_1_dynamic, time_window_hours)

  data_file_path = f'{output_dir}/data_chunk_{i}.parquet'
  labels_file_path = f'{output_dir}/labels_chunk_{i}.parquet'

  data.to_parquet(data_file_path, index=False)
  labels.to_parquet(labels_file_path, index=False)

  print(f"Saved data and labels for patient file {i} to {output_dir}")

print("All files processed and saved.")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Processed 244800000 rows
Processed 244900000 rows
Processed 245000000 rows
Processed 245100000 rows
Processed 245200000 rows
Processed 245300000 rows
Processed 245400000 rows
Processed 245500000 rows
Processed 245600000 rows
Processed 245700000 rows
Processed 245800000 rows
Processed 245900000 rows
Processed 246000000 rows
Processed 246100000 rows
Processed 246200000 rows
Processed 246300000 rows
Processed 246400000 rows
Processed 246500000 rows
Processed 246600000 rows
Processed 246700000 rows
Processed 246800000 rows
Processed 246900000 rows
Processed 247000000 rows
Processed 247100000 rows
Processed 247200000 rows
Processed 247300000 rows
Processed 247400000 rows
Processed 247500000 rows
Processed 247600000 rows
Processed 247700000 rows
Processed 247800000 rows
Processed 247900000 rows
Processed 248000000 rows
Processed 248100000 rows
Saved data and labels for patient file 6 to /content/drive/MyDrive/MSc_Final_Project/

KeyError: "None of [Index(['subject_id', 'charttime', 'itemid', 'valuenum'], dtype='object')] are in the [columns]"

This worked for the first 7 patient chunks so will reattempt on 8 and 9

In [None]:
# Load patient files 8 and 9
patient_chunk_8 = pd.read_parquet('/content/drive/MyDrive/MSc_Final_Project/02_data_analysis/mimic/data_analysis/datasets/03_annotated_set/final_set_split_files/patient_chunk_8.parquet')
patient_chunk_9 = pd.read_parquet('/content/drive/MyDrive/MSc_Final_Project/02_data_analysis/mimic/data_analysis/datasets/03_annotated_set/final_set_split_files/patient_chunk_9.parquet')

patient_chunks_2 = [patient_chunk_8, patient_chunk_9]

In [None]:
for i, patient_file in enumerate(patient_chunks_2):
  print(f'Processing patient file {patient_file}')

  patient_df = pd.read_parquet(patient_file)

  data, labels = extract_patient_data_2(patient_df, chartevents_path, feature_set_1_dynamic, time_window_hours)

  data_file_path = f'{output_dir}/data_chunk_{i}.parquet'
  labels_file_path = f'{output_dir}/labels_chunk_{i}.parquet'

  data.to_parquet(data_file_path, index=False)
  labels.to_parquet(labels_file_path, index=False)

  print(f"Saved data and labels for patient file {i} to {output_dir}")

Processing patient file        subject_id   hadm_id   stay_id ventilation_starttime  \
20810    18043820  25736266  37330264   2187-03-20 14:05:00   
20812    18044289  23747254  32716668   2153-06-26 11:40:00   
20813    18044722  23096487  33401395   2133-04-22 05:00:00   
20822    18046344  22674038  30198295   2118-12-30 23:41:00   
20824    18049351  28941140  37164191   2170-05-16 17:00:00   
...           ...       ...       ...                   ...   
23553    19034482  28077278  31136798   2140-04-28 18:06:00   
23556    19034841  25036939  37384421   2177-11-01 08:00:00   
23557    19036639  23424336  30354179   2152-08-03 19:23:00   
23560    19039155  28887061  33701331   2169-12-03 00:00:00   
23562    19040738  26585450  35842035   2143-08-13 04:00:00   

      ventilation_endtime  ventilation_itemid ventilation_ordercategoryname  \
20810 2187-03-21 15:32:00              225792                   Ventilation   
20812 2153-06-27 16:45:00              225792                

TypeError: cannot construct a FileSource from        subject_id   hadm_id   stay_id ventilation_starttime  \
20810    18043820  25736266  37330264   2187-03-20 14:05:00   
20812    18044289  23747254  32716668   2153-06-26 11:40:00   
20813    18044722  23096487  33401395   2133-04-22 05:00:00   
20822    18046344  22674038  30198295   2118-12-30 23:41:00   
20824    18049351  28941140  37164191   2170-05-16 17:00:00   
...           ...       ...       ...                   ...   
23553    19034482  28077278  31136798   2140-04-28 18:06:00   
23556    19034841  25036939  37384421   2177-11-01 08:00:00   
23557    19036639  23424336  30354179   2152-08-03 19:23:00   
23560    19039155  28887061  33701331   2169-12-03 00:00:00   
23562    19040738  26585450  35842035   2143-08-13 04:00:00   

      ventilation_endtime  ventilation_itemid ventilation_ordercategoryname  \
20810 2187-03-21 15:32:00              225792                   Ventilation   
20812 2153-06-27 16:45:00              225792                   Ventilation   
20813 2133-04-23 14:31:00              225792                   Ventilation   
20822 2119-01-02 09:20:00              225792                   Ventilation   
20824 2170-05-19 09:15:00              225792                   Ventilation   
...                   ...                 ...                           ...   
23553 2140-05-03 08:46:00              225792                   Ventilation   
23556 2177-11-06 06:56:00              225792                   Ventilation   
23557 2152-08-08 07:44:00              225792                   Ventilation   
23560 2169-12-07 12:20:00              225792                   Ventilation   
23562 2143-08-14 12:46:00              225792                   Ventilation   

      extubation_starttime  extubation_endtime  extubation_itemid  \
20810  2187-03-21 15:33:00 2187-03-21 15:34:00             227194   
20812  2153-06-27 16:45:00 2153-06-27 16:46:00             227194   
20813  2133-04-23 14:36:00 2133-04-23 14:37:00             227194   
20822  2119-01-02 09:20:00 2119-01-02 09:21:00             227194   
20824  2170-05-19 09:15:00 2170-05-19 09:16:00             227194   
...                    ...                 ...                ...   
23553  2140-05-03 08:49:00 2140-05-03 08:50:00             227194   
23556  2177-11-06 06:57:00 2177-11-06 06:58:00             227194   
23557  2152-08-08 07:44:00 2152-08-08 07:45:00             227194   
23560  2169-12-07 12:20:00 2169-12-07 12:21:00             227194   
23562  2143-08-14 12:46:00 2143-08-14 12:47:00             227194   

      extubation_ordercategoryname  ventilation_duration  anchor_age  \
20810        Intubation/Extubation                1527.0          79   
20812        Intubation/Extubation                1745.0          26   
20813        Intubation/Extubation                2011.0          41   
20822        Intubation/Extubation                3459.0          33   
20824        Intubation/Extubation                3855.0          66   
...                            ...                   ...         ...   
23553        Intubation/Extubation                6640.0          79   
23556        Intubation/Extubation                7136.0          87   
23557        Intubation/Extubation                6501.0          21   
23560        Intubation/Extubation                6500.0          51   
23562        Intubation/Extubation                1966.0          67   

       extubation_failure  
20810                   0  
20812                   0  
20813                   0  
20822                   0  
20824                   0  
...                   ...  
23553                   1  
23556                   0  
23557                   0  
23560                   0  
23562                   0  

[597 rows x 14 columns]

There seems to be an issue with files 8 and 9 so further investigation will be carried out

In [None]:
# Load set 8
chunk_8 = '/content/drive/MyDrive/MSc_Final_Project/02_data_analysis/mimic/data_analysis/datasets/03_annotated_set/final_set_split_files/patient_chunk_8.parquet'
patient_chunk_8 = pd.read_parquet(chunk_8)
patient_chunk_8.head()

Unnamed: 0,subject_id,hadm_id,stay_id,ventilation_starttime,ventilation_endtime,ventilation_itemid,ventilation_ordercategoryname,extubation_starttime,extubation_endtime,extubation_itemid,extubation_ordercategoryname,ventilation_duration,anchor_age,extubation_failure
20810,18043820,25736266,37330264,2187-03-20 14:05:00,2187-03-21 15:32:00,225792,Ventilation,2187-03-21 15:33:00,2187-03-21 15:34:00,227194,Intubation/Extubation,1527.0,79,0
20812,18044289,23747254,32716668,2153-06-26 11:40:00,2153-06-27 16:45:00,225792,Ventilation,2153-06-27 16:45:00,2153-06-27 16:46:00,227194,Intubation/Extubation,1745.0,26,0
20813,18044722,23096487,33401395,2133-04-22 05:00:00,2133-04-23 14:31:00,225792,Ventilation,2133-04-23 14:36:00,2133-04-23 14:37:00,227194,Intubation/Extubation,2011.0,41,0
20822,18046344,22674038,30198295,2118-12-30 23:41:00,2119-01-02 09:20:00,225792,Ventilation,2119-01-02 09:20:00,2119-01-02 09:21:00,227194,Intubation/Extubation,3459.0,33,0
20824,18049351,28941140,37164191,2170-05-16 17:00:00,2170-05-19 09:15:00,225792,Ventilation,2170-05-19 09:15:00,2170-05-19 09:16:00,227194,Intubation/Extubation,3855.0,66,0


In [None]:
# Run extraction on chunk 8
data_8, labels_8 = extract_patient_data_2(patient_chunk_8, chartevents_path, feature_set_1_dynamic, time_window_hours)

data_8_file_path = f'{output_dir}/data_chunk_{8}.parquet'
labels_8_file_path = f'{output_dir}/labels_chunk_{8}.parquet'

data_8.to_parquet(data_8_file_path, index=False)
labels_8.to_parquet(labels_8_file_path, index=False)

print(f"Saved data and labels for patient file to {output_dir}")

Processed 100000 rows
Processed 200000 rows
Processed 300000 rows
Processed 400000 rows
Processed 500000 rows
Processed 600000 rows
Processed 700000 rows
Processed 800000 rows
Processed 900000 rows
Processed 1000000 rows
Processed 1100000 rows
Processed 1200000 rows
Processed 1300000 rows
Processed 1400000 rows
Processed 1500000 rows
Processed 1600000 rows
Processed 1700000 rows
Processed 1800000 rows
Processed 1900000 rows
Processed 2000000 rows
Processed 2100000 rows
Processed 2200000 rows
Processed 2300000 rows
Processed 2400000 rows
Processed 2500000 rows
Processed 2600000 rows
Processed 2700000 rows
Processed 2800000 rows
Processed 2900000 rows
Processed 3000000 rows
Processed 3100000 rows
Processed 3200000 rows
Processed 3300000 rows
Processed 3400000 rows
Processed 3500000 rows
Processed 3600000 rows
Processed 3700000 rows
Processed 3800000 rows
Processed 3900000 rows
Processed 4000000 rows
Processed 4100000 rows
Processed 4200000 rows
Processed 4300000 rows
Processed 4400000 ro

In [None]:
# Repeat the same for chunk 9
chunk_9 = ('/content/drive/MyDrive/MSc_Final_Project/02_data_analysis/mimic/data_analysis/datasets/03_annotated_set/final_set_split_files/patient_chunk_9.parquet')
patient_chunk_9 = pd.read_parquet(chunk_9)
patient_chunk_9.head()

Unnamed: 0,subject_id,hadm_id,stay_id,ventilation_starttime,ventilation_endtime,ventilation_itemid,ventilation_ordercategoryname,extubation_starttime,extubation_endtime,extubation_itemid,extubation_ordercategoryname,ventilation_duration,anchor_age,extubation_failure
23568,19041108,24980238,32226561,2123-03-05 23:55:00,2123-03-11 11:15:00,225792,Ventilation,2123-03-11 11:15:00,2123-03-11 11:16:00,227194,Intubation/Extubation,7880.0,54,0
23569,19042662,28440124,37249777,2184-04-04 19:37:00,2184-04-13 09:40:00,225792,Ventilation,2184-04-13 09:40:00,2184-04-13 09:41:00,227194,Intubation/Extubation,12363.0,62,0
23571,19045496,22343752,37979029,2148-08-09 11:40:00,2148-08-13 15:40:00,225792,Ventilation,2148-08-13 15:40:00,2148-08-13 15:41:00,227194,Intubation/Extubation,6000.0,80,0
23576,19046076,27649252,33240155,2116-04-05 00:00:00,2116-04-06 08:58:00,225792,Ventilation,2116-04-06 09:02:00,2116-04-06 09:03:00,227194,Intubation/Extubation,1978.0,74,0
23577,19046107,21398294,32733363,2138-04-14 12:08:00,2138-04-16 12:39:00,225792,Ventilation,2138-04-16 12:40:00,2138-04-16 12:41:00,227194,Intubation/Extubation,2911.0,62,0


In [None]:
# Run extraction on chunk 9
data_9, labels_9 = extract_patient_data_2(patient_chunk_9, chartevents_path, feature_set_1_dynamic, time_window_hours)
data_9_file_path = f'{output_dir}/data_chunk_{9}.parquet'
labels_9_file_path = f'{output_dir}/labels_chunk_{9}.parquet'

data_9.to_parquet(data_9_file_path, index=False)
labels_9.to_parquet(labels_9_file_path, index=False)

print(f"Saved data and labels for patient file to {output_dir}")

Processed 100000 rows
Processed 200000 rows
Processed 300000 rows
Processed 400000 rows
Processed 500000 rows
Processed 600000 rows
Processed 700000 rows
Processed 800000 rows
Processed 900000 rows
Processed 1000000 rows
Processed 1100000 rows
Processed 1200000 rows
Processed 1300000 rows
Processed 1400000 rows
Processed 1500000 rows
Processed 1600000 rows
Processed 1700000 rows
Processed 1800000 rows
Processed 1900000 rows
Processed 2000000 rows
Processed 2100000 rows
Processed 2200000 rows
Processed 2300000 rows
Processed 2400000 rows
Processed 2500000 rows
Processed 2600000 rows
Processed 2700000 rows
Processed 2800000 rows
Processed 2900000 rows
Processed 3000000 rows
Processed 3100000 rows
Processed 3200000 rows
Processed 3300000 rows
Processed 3400000 rows
Processed 3500000 rows
Processed 3600000 rows
Processed 3700000 rows
Processed 3800000 rows
Processed 3900000 rows
Processed 4000000 rows
Processed 4100000 rows
Processed 4200000 rows
Processed 4300000 rows
Processed 4400000 ro

In [None]:
# Analyse chunk 8 and 9 results
chunk_8_results = '/content/drive/MyDrive/MSc_Final_Project/02_data_analysis/mimic/data_analysis/datasets/05_time_series_data_preprocessing/feature_set_1_results/data_chunk_8.parquet'

chunk_8_results_df = pd.read_parquet(chunk_8_results)
chunk_8_results_df.head()

In [None]:
chunk_9_results = '/content/drive/MyDrive/MSc_Final_Project/02_data_analysis/mimic/data_analysis/datasets/05_time_series_data_preprocessing/feature_set_1_results/data_chunk_9.parquet'

chunk_9_results_df = pd.read_parquet(chunk_9_results)
chunk_9_results_df.head()

It seems the results for both chunk 8 and chunk 9 are empty.

Analyse if the patients in these chunks have any entries in the last 6 hours before extubation.

Either: These patients have no entries for these specific itemids, or they have no entries at all.


In [None]:
# Analyse if patients have any entries at all and then within those check for the specific itemids
def process_all_chunk(chunk, patient_dict, time_window):
    # Filter the columns
    chunk = chunk[['subject_id', 'charttime', 'itemid', 'valuenum']].copy()

    # Drop rows where valuenum is null
    chunk = chunk.dropna(subset=['valuenum'])

    # Convert subject_id to nullable integer type if necessary
    chunk.loc[:, 'subject_id'] = chunk['subject_id'].astype('Int64')

    # Filter by subject_id
    chunk = chunk[chunk['subject_id'].isin(patient_dict.keys())]

    # Apply the time window filter using the patient_dict
    filtered_rows = []
    for idx, row in chunk.iterrows():
        extubation_time = patient_dict[row['subject_id']]['extubation_starttime']
        if (row['charttime'] >= extubation_time - time_window) and (row['charttime'] <= extubation_time):
            filtered_rows.append(row)

    filtered_df = pd.DataFrame(filtered_rows)
    if not filtered_df.empty:
        filtered_df.loc[:, 'extubation_starttime'] = filtered_df['subject_id'].map(lambda x: patient_dict[x]['extubation_starttime'])
        filtered_df.loc[:, 'extubation_failure'] = filtered_df['subject_id'].map(lambda x: patient_dict[x]['extubation_failure'])

    return filtered_df


In [None]:
# Extract all possible data from patients in each chunk
def extract_all_patient_data(patient_df, chartevents_file_path, time_window):
    # Initialize a list to hold the results
    results = []

    # Define the chunk size
    chunksize = 100000

    # Convert patient_df to a dictionary for fast lookups
    patient_dict = patient_df.set_index('subject_id').to_dict('index')

    # Track the number of processed rows
    processed_rows = 0

    # Read and process the file in chunks
    for chunk in pd.read_csv(chartevents_file_path, parse_dates=['charttime'], chunksize=chunksize):
        # Process each chunk and filter the data
        filtered_df = process_all_chunk(chunk, patient_dict, time_window)

        # Append the filtered data to the results list
        results.append(filtered_df)

        # Update the number of processed rows
        processed_rows += len(chunk)

        # Print the progress every 100,000 rows
        if processed_rows % 1000000 == 0:
          print(f"Processed {processed_rows} rows")

    # Concatenate all the results
    result_df = pd.concat(results, ignore_index=True)

    # Extract the data and labels
    data = result_df[['subject_id', 'charttime', 'itemid', 'valuenum']]
    labels = result_df[['subject_id', 'extubation_failure']]

    return data, labels


In [None]:
time_window_hours = timedelta(hours=6)

In [None]:
chartevents_path

'/content/drive/MyDrive/MSc_Final_Project/02_data_analysis/mimic/mimic-iv-2.2-raw-data/icu/chartevents.csv'

In [None]:
output_dir

'/content/drive/MyDrive/MSc_Final_Project/02_data_analysis/mimic/data_analysis/datasets/05_time_series_data_preprocessing/feature_set_1_results'

In [None]:
empty_patient_files = [chunk_8, chunk_9]

for i, patient_file in enumerate(empty_patient_files, start=8):
    print(f"Processing patient chunk file: {patient_file}")

    # Read the patient chunk file
    patient_df = pd.read_parquet(patient_file)

    # Extract all data and labels
    data, labels = extract_all_patient_data(patient_df, chartevents_path, time_window_hours)

    # Save the results to Google Drive
    data_file_path = f'{output_dir}all_data_chunk_{i}.parquet'
    labels_file_path = f'{output_dir}all_labels_chunk_{i}.parquet'

    data.to_parquet(data_file_path)
    labels.to_parquet(labels_file_path)

    print(f"Saved all data and labels for chunk {i} to {output_dir}")

Processing patient chunk file: /content/drive/MyDrive/MSc_Final_Project/02_data_analysis/mimic/data_analysis/datasets/03_annotated_set/final_set_split_files/patient_chunk_8.parquet
Processed 1000000 rows
Processed 2000000 rows
Processed 3000000 rows
Processed 4000000 rows
Processed 5000000 rows
Processed 6000000 rows
Processed 7000000 rows
Processed 8000000 rows
Processed 9000000 rows
Processed 10000000 rows
Processed 11000000 rows
Processed 12000000 rows
Processed 13000000 rows
Processed 14000000 rows
Processed 15000000 rows
Processed 16000000 rows
Processed 17000000 rows
Processed 18000000 rows
Processed 19000000 rows
Processed 20000000 rows
Processed 21000000 rows
Processed 22000000 rows
Processed 23000000 rows
Processed 24000000 rows
Processed 25000000 rows
Processed 26000000 rows
Processed 27000000 rows
Processed 28000000 rows
Processed 29000000 rows
Processed 30000000 rows
Processed 31000000 rows
Processed 32000000 rows
Processed 33000000 rows
Processed 34000000 rows
Processed 35

KeyError: "None of [Index(['subject_id', 'charttime', 'itemid', 'valuenum'], dtype='object')] are in the [columns]"

There seems to be an error that the specified columns are not present in the chunk dataframe from chartevents.

We will debug this issue by first printing the column names in the chunk.

In [None]:
def process_all_chunk_2(chunk, patient_dict, time_window):

    # Filter the columns
    chunk = chunk[['subject_id', 'charttime', 'itemid', 'valuenum']].copy()

    # # Print columns to debug
    # print("Columns in chunk:", chunk.columns)

    # Drop rows where valuenum is null
    chunk = chunk.dropna(subset=['valuenum'])

    # Convert subject_id to nullable integer type if necessary
    chunk.loc[:, 'subject_id'] = chunk['subject_id'].astype('Int64')

    # Filter by subject_id
    chunk = chunk[chunk['subject_id'].isin(patient_dict.keys())]

    # Apply the time window filter using the patient_dict
    filtered_rows = []
    for idx, row in chunk.iterrows():
        extubation_time = patient_dict[row['subject_id']]['extubation_starttime']
        if (row['charttime'] >= extubation_time - time_window) and (row['charttime'] <= extubation_time):
            filtered_rows.append(row)

    filtered_df = pd.DataFrame(filtered_rows)
    if not filtered_df.empty:
        filtered_df.loc[:, 'extubation_starttime'] = filtered_df['subject_id'].map(lambda x: patient_dict[x]['extubation_starttime'])
        filtered_df.loc[:, 'extubation_failure'] = filtered_df['subject_id'].map(lambda x: patient_dict[x]['extubation_failure'])

    return filtered_df

def extract_all_patient_data_2(patient_df, chartevents_file_path, time_window):
    # Initialize a list to hold the results
    results = []

    # Define the chunk size
    chunksize = 100000

    # Convert patient_df to a dictionary for fast lookups
    patient_dict = patient_df.set_index('subject_id').to_dict('index')

    # Track the number of processed rows
    processed_rows = 0

    # Read and process the file in chunks
    for chunk in pd.read_csv(chartevents_file_path, parse_dates=['charttime'], chunksize=chunksize):

        # print(chunk.head())

        # Process each chunk and filter the data
        filtered_df = process_all_chunk_2(chunk, patient_dict, time_window)

        # Append the filtered data to the results list
        results.append(filtered_df)

        # # Check if the filtered DataFrame is empty and has the expected columns
        # if filtered_df.empty:
        #     print("Warning: Filtered DataFrame is empty for this chunk.")
        # elif not all(col in filtered_df.columns for col in ['subject_id', 'charttime', 'itemid', 'valuenum']):
        #     print("Warning: Filtered DataFrame is missing expected columns.")
        #     return
        # else:
        #     # Append the filtered data to the results list
        #     results.append(filtered_df)

        # Update the number of processed rows
        processed_rows += len(chunk)

        # Print progress
        print(f"Processed {processed_rows} rows.")

    # Concatenate all the results
    result_df = pd.concat(results, ignore_index=True)

    # Extract the data and labels
    data = result_df[['subject_id', 'charttime', 'itemid', 'valuenum']]
    labels = result_df[['subject_id', 'extubation_starttime', 'extubation_failure']]

    return data, labels

In [None]:
empty_patient_files = [chunk_8, chunk_9]

for i, patient_file in enumerate(empty_patient_files, start=8):
    print(f"Processing patient chunk file: {patient_file}")

    # Read the patient chunk file
    patient_df = pd.read_parquet(patient_file)

    # Extract all data and labels
    data, labels = extract_all_patient_data_2(patient_df, chartevents_path, time_window_hours)

    # Save the results to Google Drive
    data_file_path = f'{output_dir}all_data_chunk_{i}.parquet'
    labels_file_path = f'{output_dir}all_labels_chunk_{i}.parquet'

    data.to_parquet(data_file_path)
    labels.to_parquet(labels_file_path)

    print(f"Saved all data and labels for chunk {i} to {output_dir}")

Processing patient chunk file: /content/drive/MyDrive/MSc_Final_Project/02_data_analysis/mimic/data_analysis/datasets/03_annotated_set/final_set_split_files/patient_chunk_8.parquet
   subject_id   hadm_id   stay_id  caregiver_id           charttime  \
0    10000032  29079034  39553978       47007.0 2180-07-23 21:01:00   
1    10000032  29079034  39553978       47007.0 2180-07-23 21:01:00   
2    10000032  29079034  39553978       47007.0 2180-07-23 21:01:00   
3    10000032  29079034  39553978       47007.0 2180-07-23 22:00:00   
4    10000032  29079034  39553978       47007.0 2180-07-23 22:00:00   

0  2180-07-23 22:15:00  220179    82      82.0     mmHg      0.0  
1  2180-07-23 22:15:00  220180    59      59.0     mmHg      0.0  
2  2180-07-23 22:15:00  220181    63      63.0     mmHg      0.0  
3  2180-07-23 22:15:00  220045    94      94.0      bpm      0.0  
4  2180-07-23 22:15:00  220179    85      85.0     mmHg      0.0  
Columns in chunk: Index(['subject_id', 'charttime', 'item

ParserError: Error tokenizing data. C error: Calling read(nbytes) on source failed. Try engine='python'.

Similar issue is cropping up.

We know the other patient files worked so will try the same function with those.

In [None]:
# Load chunk 7 results
chunk_7_results = '/content/drive/MyDrive/MSc_Final_Project/02_data_analysis/mimic/data_analysis/datasets/05_time_series_data_preprocessing/feature_set_1_results/data_chunk_7.parquet'

chunk_7_results_df = pd.read_parquet(chunk_7_results)
chunk_7_results_df.head()

Unnamed: 0,subject_id,charttime,itemid,valuenum
0,17062372,2143-02-25 04:00:00,223835.0,40.0
1,17062372,2143-02-25 04:00:00,223849.0,11.0
2,17062372,2143-02-25 04:00:00,224685.0,757.0
3,17062372,2143-02-25 04:00:00,224686.0,750.0
4,17062372,2143-02-25 04:00:00,224687.0,9.6


In [None]:
chunk_7 = '/content/drive/MyDrive/MSc_Final_Project/02_data_analysis/mimic/data_analysis/datasets/03_annotated_set/final_set_split_files/patient_chunk_7.parquet'

# Run chunk 7 through function
example_result, example_labels = extract_all_patient_data_2(pd.read_parquet(chunk_7), chartevents_path, time_window_hours)

Processed 100000 rows.
Processed 200000 rows.
Processed 300000 rows.
Processed 400000 rows.
Processed 500000 rows.
Processed 600000 rows.
Processed 700000 rows.
Processed 800000 rows.
Processed 900000 rows.
Processed 1000000 rows.
Processed 1100000 rows.
Processed 1200000 rows.
Processed 1300000 rows.
Processed 1400000 rows.
Processed 1500000 rows.
Processed 1600000 rows.
Processed 1700000 rows.
Processed 1800000 rows.
Processed 1900000 rows.
Processed 2000000 rows.
Processed 2100000 rows.
Processed 2200000 rows.
Processed 2300000 rows.
Processed 2400000 rows.
Processed 2500000 rows.
Processed 2600000 rows.
Processed 2700000 rows.
Processed 2800000 rows.
Processed 2900000 rows.
Processed 3000000 rows.
Processed 3100000 rows.
Processed 3200000 rows.
Processed 3300000 rows.
Processed 3400000 rows.
Processed 3500000 rows.
Processed 3600000 rows.
Processed 3700000 rows.
Processed 3800000 rows.
Processed 3900000 rows.
Processed 4000000 rows.
Processed 4100000 rows.
Processed 4200000 rows.
P

In [None]:
example_result.head()

Unnamed: 0,subject_id,charttime,itemid,valuenum
0,17062372,2143-02-25 04:00:00,220292.0,5.0
1,17062372,2143-02-25 04:00:00,220293.0,20.0
2,17062372,2143-02-25 04:00:00,220339.0,5.0
3,17062372,2143-02-25 04:00:00,223835.0,40.0
4,17062372,2143-02-25 04:00:00,223848.0,1.0


Chunk 7 has results so it works for chunks other than chunk 8 and 9.

Lets see if there are any sturctural differences between chunks 7, 8 and 9.

In [None]:
chunk_7_df = pd.read_parquet(chunk_7)
chunk_8_df = pd.read_parquet(chunk_8)
chunk_9_df = pd.read_parquet(chunk_9)

chunk_7_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 597 entries, 18404 to 20808
Data columns (total 14 columns):
 #   Column                         Non-Null Count  Dtype         
---  ------                         --------------  -----         
 0   subject_id                     597 non-null    int64         
 1   hadm_id                        597 non-null    int64         
 2   stay_id                        597 non-null    int64         
 3   ventilation_starttime          597 non-null    datetime64[ns]
 4   ventilation_endtime            597 non-null    datetime64[ns]
 5   ventilation_itemid             597 non-null    int64         
 6   ventilation_ordercategoryname  597 non-null    object        
 7   extubation_starttime           597 non-null    datetime64[ns]
 8   extubation_endtime             597 non-null    datetime64[ns]
 9   extubation_itemid              597 non-null    int64         
 10  extubation_ordercategoryname   597 non-null    object        
 11  ventilation_durati

In [None]:
# See if there are any structural differences between chunk 7 8 and 9
chunk_8_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 597 entries, 20810 to 23562
Data columns (total 14 columns):
 #   Column                         Non-Null Count  Dtype         
---  ------                         --------------  -----         
 0   subject_id                     597 non-null    int64         
 1   hadm_id                        597 non-null    int64         
 2   stay_id                        597 non-null    int64         
 3   ventilation_starttime          597 non-null    datetime64[ns]
 4   ventilation_endtime            597 non-null    datetime64[ns]
 5   ventilation_itemid             597 non-null    int64         
 6   ventilation_ordercategoryname  597 non-null    object        
 7   extubation_starttime           597 non-null    datetime64[ns]
 8   extubation_endtime             597 non-null    datetime64[ns]
 9   extubation_itemid              597 non-null    int64         
 10  extubation_ordercategoryname   597 non-null    object        
 11  ventilation_durati

In [None]:
chunk_9_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 597 entries, 23568 to 26133
Data columns (total 14 columns):
 #   Column                         Non-Null Count  Dtype         
---  ------                         --------------  -----         
 0   subject_id                     597 non-null    int64         
 1   hadm_id                        597 non-null    int64         
 2   stay_id                        597 non-null    int64         
 3   ventilation_starttime          597 non-null    datetime64[ns]
 4   ventilation_endtime            597 non-null    datetime64[ns]
 5   ventilation_itemid             597 non-null    int64         
 6   ventilation_ordercategoryname  597 non-null    object        
 7   extubation_starttime           597 non-null    datetime64[ns]
 8   extubation_endtime             597 non-null    datetime64[ns]
 9   extubation_itemid              597 non-null    int64         
 10  extubation_ordercategoryname   597 non-null    object        
 11  ventilation_durati

In [None]:
chunk_8_df.head()

Unnamed: 0,subject_id,hadm_id,stay_id,ventilation_starttime,ventilation_endtime,ventilation_itemid,ventilation_ordercategoryname,extubation_starttime,extubation_endtime,extubation_itemid,extubation_ordercategoryname,ventilation_duration,anchor_age,extubation_failure
20810,18043820,25736266,37330264,2187-03-20 14:05:00,2187-03-21 15:32:00,225792,Ventilation,2187-03-21 15:33:00,2187-03-21 15:34:00,227194,Intubation/Extubation,1527.0,79,0
20812,18044289,23747254,32716668,2153-06-26 11:40:00,2153-06-27 16:45:00,225792,Ventilation,2153-06-27 16:45:00,2153-06-27 16:46:00,227194,Intubation/Extubation,1745.0,26,0
20813,18044722,23096487,33401395,2133-04-22 05:00:00,2133-04-23 14:31:00,225792,Ventilation,2133-04-23 14:36:00,2133-04-23 14:37:00,227194,Intubation/Extubation,2011.0,41,0
20822,18046344,22674038,30198295,2118-12-30 23:41:00,2119-01-02 09:20:00,225792,Ventilation,2119-01-02 09:20:00,2119-01-02 09:21:00,227194,Intubation/Extubation,3459.0,33,0
20824,18049351,28941140,37164191,2170-05-16 17:00:00,2170-05-19 09:15:00,225792,Ventilation,2170-05-19 09:15:00,2170-05-19 09:16:00,227194,Intubation/Extubation,3855.0,66,0


In [None]:
chunk_9_df.head()

Unnamed: 0,subject_id,hadm_id,stay_id,ventilation_starttime,ventilation_endtime,ventilation_itemid,ventilation_ordercategoryname,extubation_starttime,extubation_endtime,extubation_itemid,extubation_ordercategoryname,ventilation_duration,anchor_age,extubation_failure
23568,19041108,24980238,32226561,2123-03-05 23:55:00,2123-03-11 11:15:00,225792,Ventilation,2123-03-11 11:15:00,2123-03-11 11:16:00,227194,Intubation/Extubation,7880.0,54,0
23569,19042662,28440124,37249777,2184-04-04 19:37:00,2184-04-13 09:40:00,225792,Ventilation,2184-04-13 09:40:00,2184-04-13 09:41:00,227194,Intubation/Extubation,12363.0,62,0
23571,19045496,22343752,37979029,2148-08-09 11:40:00,2148-08-13 15:40:00,225792,Ventilation,2148-08-13 15:40:00,2148-08-13 15:41:00,227194,Intubation/Extubation,6000.0,80,0
23576,19046076,27649252,33240155,2116-04-05 00:00:00,2116-04-06 08:58:00,225792,Ventilation,2116-04-06 09:02:00,2116-04-06 09:03:00,227194,Intubation/Extubation,1978.0,74,0
23577,19046107,21398294,32733363,2138-04-14 12:08:00,2138-04-16 12:39:00,225792,Ventilation,2138-04-16 12:40:00,2138-04-16 12:41:00,227194,Intubation/Extubation,2911.0,62,0


There seems to be no visible difference between the patient chunks.

Let's try a variation of the function that does not rely on a dictionary - as it seems the issue may be centered around this.

In [None]:
# Equivalent function that does not rely on dictionaries
def process_all_chunk_3(chunk, patient_df, itemid_set, time_window):

    # Filter the columns to include only the relevant ones
    chunk = chunk[['subject_id', 'charttime', 'itemid', 'valuenum']].copy()

    # Drop rows where valuenum is null
    chunk = chunk.dropna(subset=['valuenum'])

    # Convert subject_id to nullable integer type if necessary
    chunk.loc[:, 'subject_id'] = chunk['subject_id'].astype('Int64')

    # Filter by subject_id
    chunk = chunk[chunk['subject_id'].isin(patient_df['subject_id'])]

    # Filter by itemid
    chunk = chunk[chunk['itemid'].isin(itemid_set)]

    # Use broadcasting to filter the chunk based on the extubation_starttime and time_window
    chunk = chunk.merge(patient_df[['subject_id', 'extubation_starttime', 'extubation_failure']], on='subject_id', how='left')
    chunk = chunk[(chunk['charttime'] >= chunk['extubation_starttime'] - time_window) &
                  (chunk['charttime'] <= chunk['extubation_starttime'])]

    return chunk

In [None]:
def extract_all_patient_data_3(patient_df, chartevents_file_path, itemids, time_window):
    # Initialize a list to hold the results
    results = []

    # Define the chunk size
    chunksize = 100000

    # Convert itemids to a set for faster lookup
    itemid_set = set(itemids)

    # Track the number of processed rows
    processed_rows = 0

    # Read and process the file in chunks
    for chunk in pd.read_csv(chartevents_file_path, parse_dates=['charttime'], chunksize=chunksize):
        # Process each chunk and filter the data
        filtered_df = process_all_chunk_3(chunk, patient_df, itemid_set, time_window)

        # Append the filtered data to the results list
        results.append(filtered_df)

        # Update the number of processed rows
        processed_rows += len(chunk)

        # Print progress
        print(f"Processed {processed_rows} rows.")

    # Concatenate all the results
    result_df = pd.concat(results, ignore_index=True)

    # Check if results_df is empty
    if result_df.empty:
        print("Warning: Results DataFrame is empty.")
        return None, None

    # Extract the data and labels
    data = result_df[['subject_id', 'charttime', 'itemid', 'valuenum']]
    labels = result_df[['subject_id', 'extubation_failure']]

    return data, labels

In [None]:
patient_chunk_files = [chunk_8, chunk_9]

chartevents_path = '/content/drive/MyDrive/MSc_Final_Project/02_data_analysis/mimic/mimic-iv-2.2-raw-data/icu/chartevents.csv'

output_dir = '/content/drive/MyDrive/MSc_Final_Project/02_data_analysis/mimic/data_analysis/datasets/05_time_series_data_preprocessing/feature_set_1_results'


In [None]:
time_window_hours = timedelta(hours=6)

In [None]:
feature_set_1_dynamic

[220210,
 220277,
 228640,
 220235,
 223830,
 220224,
 220228,
 223835,
 224685,
 224686,
 224695,
 224687,
 224696,
 223849,
 224419]

In [None]:
# Process patient chunk files 8 and 9 and save results to Google Drive
for i, patient_file in enumerate(patient_chunk_files, start=8):
    print(f"Processing patient chunk file: {patient_file}")

    # Read the patient chunk file
    patient_df = pd.read_parquet(patient_file)

    # Extract all data and labels
    data, labels = extract_all_patient_data_3(patient_df, chartevents_path, feature_set_1_dynamic, time_window_hours)

    # Save the results to Google Drive
    data_file_path = f'{output_dir}all_data_chunk_{i}.parquet'
    labels_file_path = f'{output_dir}all_labels_chunk_{i}.parquet'

    data.to_parquet(data_file_path)
    labels.to_parquet(labels_file_path)

    print(f"Saved all data and labels for chunk {i} to {output_dir}")

Processing patient chunk file: /content/drive/MyDrive/MSc_Final_Project/02_data_analysis/mimic/data_analysis/datasets/03_annotated_set/final_set_split_files/patient_chunk_8.parquet
Processed 100000 rows.
Processed 200000 rows.
Processed 300000 rows.
Processed 400000 rows.
Processed 500000 rows.
Processed 600000 rows.
Processed 700000 rows.
Processed 800000 rows.
Processed 900000 rows.
Processed 1000000 rows.
Processed 1100000 rows.
Processed 1200000 rows.
Processed 1300000 rows.
Processed 1400000 rows.
Processed 1500000 rows.
Processed 1600000 rows.
Processed 1700000 rows.
Processed 1800000 rows.
Processed 1900000 rows.
Processed 2000000 rows.
Processed 2100000 rows.
Processed 2200000 rows.
Processed 2300000 rows.
Processed 2400000 rows.
Processed 2500000 rows.
Processed 2600000 rows.
Processed 2700000 rows.
Processed 2800000 rows.
Processed 2900000 rows.
Processed 3000000 rows.
Processed 3100000 rows.
Processed 3200000 rows.
Processed 3300000 rows.
Processed 3400000 rows.
Processed 35

AttributeError: 'NoneType' object has no attribute 'to_parquet'

Yet again, there seems to be no data for either of the chunks.

As a final try, we will run the whole patient dataset and extract the timeseries data without splitting into chunks.

In [None]:
# Load all patients
patient_file = '/content/drive/MyDrive/MSc_Final_Project/02_data_analysis/mimic/data_analysis/datasets/03_annotated_set/annotation_v03.parquet'

patient_df = pd.read_parquet(patient_file)
patient_df.shape[0]

5970

In [None]:
chartevents_path

'/content/drive/MyDrive/MSc_Final_Project/02_data_analysis/mimic/mimic-iv-2.2-raw-data/icu/chartevents.csv'

In [None]:
feature_set_1_dynamic

[220210,
 220277,
 228640,
 220235,
 223830,
 220224,
 220228,
 223835,
 224685,
 224686,
 224695,
 224687,
 224696,
 223849,
 224419]

In [None]:
time_window_hours = timedelta(hours=6)

In [None]:
output_dir

'/content/drive/MyDrive/MSc_Final_Project/02_data_analysis/mimic/data_analysis/datasets/05_time_series_data_preprocessing/feature_set_1_results'

In [None]:
# Run the original function on the entire dataset and save the results
full_data, full_labels = extract_patient_data_2(patient_df, chartevents_path, feature_set_1_dynamic, time_window_hours)
full_data.to_parquet(f'{output_dir}/full_data.parquet')
full_labels.to_parquet(f'{output_dir}/full_labels.parquet')

print(f"Saved all data and labels to {output_dir}")

Processed 10000000 rows
Processed 20000000 rows
Processed 30000000 rows
Processed 40000000 rows
Processed 50000000 rows
Processed 60000000 rows
Processed 70000000 rows
Processed 80000000 rows
Processed 90000000 rows
Processed 100000000 rows
Processed 110000000 rows
Processed 120000000 rows
Processed 130000000 rows
Processed 140000000 rows
Processed 150000000 rows
Processed 160000000 rows
Processed 170000000 rows
Processed 180000000 rows
Processed 190000000 rows
Processed 200000000 rows
Processed 210000000 rows
Processed 220000000 rows
Processed 230000000 rows
Processed 240000000 rows
Saved all data and labels to /content/drive/MyDrive/MSc_Final_Project/02_data_analysis/mimic/data_analysis/datasets/05_time_series_data_preprocessing/feature_set_1_results


In [None]:
# Analyse full_data
full_data.head()

Unnamed: 0,subject_id,charttime,itemid,valuenum
0,10001884,2131-01-12 15:00:00,223835.0,40.0
1,10001884,2131-01-12 15:00:00,224685.0,284.0
2,10001884,2131-01-12 15:00:00,224686.0,284.0
3,10001884,2131-01-12 15:00:00,224687.0,6.1
4,10001884,2131-01-12 15:00:00,224695.0,17.0


In [None]:
full_data.shape[0]

116639

In [None]:
# Count the number of unique subject_ids in the full data
unique_subject_ids = full_data['subject_id'].unique()
print("Number of unique subject_ids in the full data: ", len(unique_subject_ids))

Number of unique subject_ids in the full data:  4701


In [None]:
# Count the number of patients in the original patient set
num_patients = patient_df.shape[0]
print("Number of patients in the original patient set: ", num_patients)

Number of patients in the original patient set:  5970


When extracting the full patient data, it seems there is only data present for 4701 patients out of the full 5970 which corroborates the possibility that both patient chunks 8 and 9 had no relevant data as this is approximately 20% of the orginal patient size.

For a final validation, we can look at how many unique patients there were in the 8 patient chunks.

In [None]:
# Load the first 8 patient chunks
successful_patient_chunk_results_files = [f'/content/drive/MyDrive/MSc_Final_Project/02_data_analysis/mimic/data_analysis/datasets/05_time_series_data_preprocessing/feature_set_1_results/data_chunk_{i}.parquet' for i in range(8)]
successful_patient_chunk_results_files


['/content/drive/MyDrive/MSc_Final_Project/02_data_analysis/mimic/data_analysis/datasets/05_time_series_data_preprocessing/feature_set_1_results/data_chunk_0.parquet',
 '/content/drive/MyDrive/MSc_Final_Project/02_data_analysis/mimic/data_analysis/datasets/05_time_series_data_preprocessing/feature_set_1_results/data_chunk_1.parquet',
 '/content/drive/MyDrive/MSc_Final_Project/02_data_analysis/mimic/data_analysis/datasets/05_time_series_data_preprocessing/feature_set_1_results/data_chunk_2.parquet',
 '/content/drive/MyDrive/MSc_Final_Project/02_data_analysis/mimic/data_analysis/datasets/05_time_series_data_preprocessing/feature_set_1_results/data_chunk_3.parquet',
 '/content/drive/MyDrive/MSc_Final_Project/02_data_analysis/mimic/data_analysis/datasets/05_time_series_data_preprocessing/feature_set_1_results/data_chunk_4.parquet',
 '/content/drive/MyDrive/MSc_Final_Project/02_data_analysis/mimic/data_analysis/datasets/05_time_series_data_preprocessing/feature_set_1_results/data_chunk_5.pa

In [None]:
# Load and count the total number of unique subject_ids across all 8 patient chunk results
total_unique_subject_ids = 0
for file in successful_patient_chunk_results_files:
    patient_chunk = pd.read_parquet(file)
    unique_subject_ids = patient_chunk['subject_id'].unique()

    total_unique_subject_ids += len(unique_subject_ids)

print("Total number of unique subject_ids across all 8 patient chunks: ", total_unique_subject_ids)

Total number of unique subject_ids across all 8 patient chunks:  4701


As this is the same for the chunks and the full dataset we can conclude that patients in chunks 8 and 9 do not have any data for the 15 itemids in the last 6 hours before extubation.

As such we can treat the full dataset as the final dataset to carry forward for further preprocessing.

In [None]:
# Count the number of unique itemids in the full dataset
unique_itemids = full_data['itemid'].unique()
print("Number of unique itemids in the full dataset: ", len(unique_itemids))

Number of unique itemids in the full dataset:  15


In [None]:
# Count the number of itemids initally input to be extracted
num_itemids = len(feature_set_1_dynamic)
print("Number of itemids initially input to be extracted: ", num_itemids)

Number of itemids initially input to be extracted:  15


All itemids have a record but we will need to see how consistently these were measured for when we format the data for our LSTM model.

This will be carried out in the next file.

**Addendum: Extracting data inclusive of missing valuenum**

We can also extract the full data including missing values that we can impute later.

To do this we need to amend the functions slightly.

In [None]:
def process_chunk_2_incl_null(chunk, patient_dict, itemid_set, time_window):
    # Filter the columns
    chunk = chunk[['subject_id', 'charttime', 'itemid', 'valuenum']].copy()

    # Drop rows where only itemid is null and NOT valuenum
    chunk = chunk.dropna(subset=['itemid'])

    # Convert subject_id to nullable integer type if necessary
    chunk.loc[:, 'subject_id'] = chunk['subject_id'].astype('Int64')

    # Filter by subject_id
    chunk = chunk[chunk['subject_id'].isin(patient_dict.keys())]

    # Filter by itemid
    chunk = chunk[chunk['itemid'].isin(itemid_set)]

    # Apply the time window filter using the patient_dict
    filtered_rows = []
    for idx, row in chunk.iterrows():
        extubation_time = patient_dict[row['subject_id']]['extubation_starttime']
        if (row['charttime'] >= extubation_time - time_window) and (row['charttime'] <= extubation_time):
            filtered_rows.append(row)

    filtered_df = pd.DataFrame(filtered_rows)
    if not filtered_df.empty:
        filtered_df['extubation_starttime'] = filtered_df['subject_id'].map(lambda x: patient_dict[x]['extubation_starttime'])
        filtered_df['extubation_failure'] = filtered_df['subject_id'].map(lambda x: patient_dict[x]['extubation_failure'])

    return filtered_df

In [None]:
def extract_patient_data_2_incl_null(patient_df, chartevents_file_path, itemids, time_window):
    # Initialize a list to hold the results
    results = []

    # Define the chunk size
    chunksize = 100000

    # Convert itemids to a set for faster lookup
    itemid_set = set(itemids)

    # Convert patient_df to a dictionary for fast lookups
    patient_dict = patient_df.set_index('subject_id').to_dict('index')

    # # Get the total number of rows in the file
    # total_rows = sum(1 for row in open(chartevents_file_path)) - 1  # Subtract 1 for the header row


    # Track the number of processed rows
    processed_rows = 0

    # Read and process the file in chunks
    for chunk in pd.read_csv(chartevents_file_path, parse_dates=['charttime'], chunksize=chunksize):

      # Process each chunk and filter the data
      filtered_df = process_chunk_2_incl_null(chunk, patient_dict, itemid_set, time_window)

      # Append the filtered data to the results list
      results.append(filtered_df)

      # Update the processed rows count
      processed_rows += len(chunk)

      # Print the progress every 100,000 rows
      if processed_rows % 10000000 == 0:
          print(f"Processed {processed_rows} rows")


    # Concatenate all the results
    result_df = pd.concat(results, ignore_index=True)

    # Check if result_df is empty and handle it accordingly
    if result_df.empty:
        print("Warning: No data found for the given criteria.")
        return pd.DataFrame(), pd.DataFrame() # Return empty DataFrames


    # Extract the data and labels
    data = result_df[['subject_id', 'charttime', 'itemid', 'valuenum']]
    labels = result_df[['subject_id','extubation_failure']]

    return data, labels

In [None]:
# Run the original function on the entire dataset and save the results
full_data_incl_null, full_labels_incl_null = extract_patient_data_2_incl_null(patient_df, chartevents_path, feature_set_1_dynamic, time_window_hours)
full_data.to_parquet(f'{output_dir}/full_data_incl_null.parquet')
full_labels.to_parquet(f'{output_dir}/full_labels_incl_null.parquet')

print(f"Saved all data and labels to {output_dir}")

Processed 10000000 rows
Processed 20000000 rows
Processed 30000000 rows
Processed 40000000 rows
Processed 50000000 rows
Processed 60000000 rows
Processed 70000000 rows
Processed 80000000 rows
Processed 90000000 rows
Processed 100000000 rows
Processed 110000000 rows
Processed 120000000 rows
Processed 130000000 rows
Processed 140000000 rows
Processed 150000000 rows
Processed 160000000 rows
Processed 170000000 rows
Processed 180000000 rows
Processed 190000000 rows
Processed 200000000 rows
Processed 210000000 rows
Processed 220000000 rows
Processed 230000000 rows
Processed 240000000 rows
Saved all data and labels to /content/drive/MyDrive/MSc_Final_Project/02_data_analysis/mimic/data_analysis/datasets/05_time_series_data_preprocessing/feature_set_1_results


In [None]:
# Count the number of patients in this full dataset without nas
unique_subject_ids = full_data_incl_null['subject_id'].unique()
print("Number of unique subject_ids in the full data including nulls: ", len(unique_subject_ids))

Number of unique subject_ids in the full data including nulls:  4701


The same number of patients are present in this dataset as expected.

In [None]:
# Count the number of rows in this dataset to see how much timeseries data we have
num_rows = full_data_incl_null.shape[0]
print("Number of rows in the full data including nulls: ", num_rows)

Number of rows in the full data including nulls:  117438


In [None]:
# Calculate the difference between the number of rows with and without nulls
diff = num_rows - full_data.shape[0]
print("Difference between the number of rows with and without nulls: ", diff)

Difference between the number of rows with and without nulls:  799


Let's see which itemids are contributing to these null values


In [None]:
# For each itemid count the number of null values
null_counts = full_data_incl_null.groupby('itemid')['valuenum'].apply(lambda x: x.isnull().sum()).reset_index()
null_counts.columns = ['itemid', 'null_count']
null_counts

Unnamed: 0,itemid,null_count
0,220210.0,0
1,220224.0,0
2,220228.0,0
3,220235.0,0
4,220277.0,0
5,223830.0,0
6,223835.0,0
7,223849.0,799
8,224419.0,0
9,224685.0,0


It seems to be just one item id (223849) that is contributing to the null values.

Let's identify this item ID so we can consider this in future preprocessing.

In [None]:
# Load the d_items data frame
d_items_file = '/content/drive/MyDrive/MSc_Final_Project/02_data_analysis/mimic/mimic-iv-2.2-raw-data/icu/d_items.csv'
d_items_df = pd.read_csv(d_items_file)
d_items_df.head()

Unnamed: 0,itemid,label,abbreviation,linksto,category,unitname,param_type,lownormalvalue,highnormalvalue
0,220001,Problem List,Problem List,chartevents,General,,Text,,
1,220003,ICU Admission date,ICU Admission date,datetimeevents,ADT,,Date and time,,
2,220045,Heart Rate,HR,chartevents,Routine Vital Signs,bpm,Numeric,,
3,220046,Heart rate Alarm - High,HR Alarm - High,chartevents,Alarms,bpm,Numeric,,
4,220047,Heart Rate Alarm - Low,HR Alarm - Low,chartevents,Alarms,bpm,Numeric,,


In [None]:
# Match the itemid 223849 to the corresponding label in d_items and return the label to identify
itemid_223849 = d_items_df[d_items_df['itemid'] == 223849]['label']
print(itemid_223849)


391    Ventilator Mode
Name: label, dtype: object


Ventilator mode has a significant number of null values which will need to be handled in later preprocessing.

As a final check, we can ensure that the item_ids in the full_data have no null values.

In [None]:
# Check that the itemids in full_data have no null values
null_counts = full_data.groupby('itemid')['valuenum'].apply(lambda x: x.isnull().sum()).reset_index()
null_counts.columns = ['itemid', 'null_count']
null_counts

Unnamed: 0,itemid,null_count
0,220210.0,0
1,220224.0,0
2,220228.0,0
3,220235.0,0
4,220277.0,0
5,223830.0,0
6,223835.0,0
7,223849.0,0
8,224419.0,0
9,224685.0,0


We can now proceed with pre-processing the data for feature set 1 for model input.

**Analyse the class split in the data**

In [3]:
import pandas as pd

In [4]:
data_path = '/content/drive/MyDrive/MSc_Final_Project/02_data_analysis/mimic/data_analysis/datasets/05_time_series_data_extraction/feature_set_1_results/full_data_feature_set_1/full_data_incl_null.parquet'

data = pd.read_parquet(data_path)
data.head()

Unnamed: 0,subject_id,charttime,itemid,valuenum
0,10001884,2131-01-12 15:00:00,223835.0,40.0
1,10001884,2131-01-12 15:00:00,224685.0,284.0
2,10001884,2131-01-12 15:00:00,224686.0,284.0
3,10001884,2131-01-12 15:00:00,224687.0,6.1
4,10001884,2131-01-12 15:00:00,224695.0,17.0


In [5]:
data['subject_id'].nunique()

4701

In [6]:
labels_path = '/content/drive/MyDrive/MSc_Final_Project/02_data_analysis/mimic/data_analysis/datasets/05_time_series_data_extraction/feature_set_1_results/full_data_feature_set_1/full_labels_incl_null.parquet'

labels = pd.read_parquet(labels_path)
labels.head()

Unnamed: 0,subject_id,extubation_failure
0,10001884,1
1,10001884,1
2,10001884,1
3,10001884,1
4,10001884,1


In [7]:
# Drop duplicates of subject ifd
labels = labels.drop_duplicates(subset=['subject_id'])
labels.head()

Unnamed: 0,subject_id,extubation_failure
0,10001884,1
30,10002428,0
66,10004235,1
91,10004720,1
115,10004733,0


In [9]:
labels['subject_id'].nunique()

4701

In [11]:
# Give the % of 1 in extubation failure
labels['extubation_failure'].value_counts(normalize=True) * 100

Unnamed: 0_level_0,proportion
extubation_failure,Unnamed: 1_level_1
0,67.155924
1,32.844076
