# **Time series data extraction for feature set 2**

Feature set 2 is defined as the features that are the most popular from literature and clinically not available, with the WAVE study used as a proxy.

 As with feature set 1, for all patients in the derived set, time series data for each of these features will be extracted from the chart events table.

The data will then be observed, cleaned and pre-processed as required to train an LSTM model.

We will collect data for both feature set 1 and feature set 2 so that we can train the models on a larger number of features.

In [None]:
import pandas as pd
import numpy as np
from datetime import timedelta

In [None]:
# Mount Google Drive at /content/drive; every file path used below lives under this mount point
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load all patients.
# This frame is later indexed by subject_id and supplies each patient's
# extubation_starttime and extubation_failure to the extraction functions.
patient_file = '/content/drive/MyDrive/MSc_Final_Project/02_data_analysis/mimic/data_analysis/datasets/03_annotated_set/annotation_v03.parquet'

patient_df = pd.read_parquet(patient_file)
# Number of rows in the patient set
patient_df.shape[0]


5970

In [None]:
# Path to the raw chartevents CSV that the time series are extracted from
# (nothing is loaded here; the file is read in chunks by the extraction function)
chartevents_path = '/content/drive/MyDrive/MSc_Final_Project/02_data_analysis/mimic/mimic-iv-2.2-raw-data/icu/chartevents.csv'

In [None]:
# Look-back window: only chart events recorded in the 6 hours up to and
# including the extubation start time are extracted
time_window_hours = timedelta(hours=6)

In [None]:
# State the features to be extracted: chartevents itemids for feature set 2.
# The first 15 entries are the same itemids as feature set 1.
feature_set_2_dynamic = [220210, 220277, 228640, 220235, 223830, 220224, 220228, 223835, 224685, 224686, 224695, 224687,
                         224696, 223849, 224419, 220045, 223901, 220739, 220052, 223761, 228096, 220050, 220051, 220546,
                         220615, 227457, 220645, 227442, 225668, 220621, 225690, 220545, 225667, 224697]
# Number of itemids listed for feature set 2
len(feature_set_2_dynamic)

34

In [None]:
# State the features to be extracted: chartevents itemids for feature set 1
feature_set_1_dynamic = [220210, 220277, 228640, 220235, 223830, 220224,
                         220228, 223835, 224685, 224686, 224695, 224687, 224696, 223849, 224419]

# Number of itemids listed for feature set 1
len(feature_set_1_dynamic)

15

In [None]:
# Itemids that appear in both feature set 1 and feature set 2
set(feature_set_1_dynamic).intersection(set(feature_set_2_dynamic))

{220210,
 220224,
 220228,
 220235,
 220277,
 223830,
 223835,
 223849,
 224419,
 224685,
 224686,
 224687,
 224695,
 224696,
 228640}

In [None]:
# Combine feature set 1 and 2 together.
# Plain list concatenation keeps duplicates, so every itemid shared by both
# sets appears twice in the combined list at this point.
feature_set_1_and_2_dynamic = feature_set_2_dynamic + feature_set_1_dynamic
len(feature_set_1_and_2_dynamic)

49

In [None]:
# Only include unique itemids.
# Every feature set 1 itemid also occurs in feature set 2, so the combined list
# holds those itemids twice. dict.fromkeys keeps the first occurrence of each
# itemid and preserves the list order, which iterating a set() does not.
feature_set_1_and_2_dynamic = list(dict.fromkeys(feature_set_1_and_2_dynamic))

In [None]:
# Number of itemids left in the combined list after de-duplication
len(feature_set_1_and_2_dynamic)

34

In [None]:
# Function copied from feature set 1 extraction
def process_chunk_2_incl_null(chunk, patient_dict, itemid_set, time_window):
    """Filter one chunk of chartevents down to the rows used for the time series.

    A row is kept when its ``subject_id`` is a key of ``patient_dict``, its
    ``itemid`` is in ``itemid_set`` and its ``charttime`` lies in the closed
    interval ``[extubation_starttime - time_window, extubation_starttime]`` of
    that patient. Rows with a null ``valuenum`` are kept; only rows with a
    null ``itemid`` are dropped.

    Args:
        chunk: DataFrame containing at least the columns ``subject_id``,
            ``charttime``, ``itemid`` and ``valuenum``. ``charttime`` must be
            comparable with the extubation times held in ``patient_dict``.
        patient_dict: Mapping of subject_id to a dict with the keys
            ``extubation_starttime`` and ``extubation_failure``.
        itemid_set: Collection of itemids to keep.
        time_window: Length of the look-back window before extubation
            (for example a ``timedelta``).

    Returns:
        DataFrame with the columns ``subject_id``, ``charttime``, ``itemid``,
        ``valuenum``, ``extubation_starttime`` and ``extubation_failure``,
        keeping the index labels of ``chunk``. When no row qualifies, an empty
        DataFrame without columns is returned.
    """
    # Filter the columns
    chunk = chunk[['subject_id', 'charttime', 'itemid', 'valuenum']].copy()

    # Drop rows where only itemid is null and NOT valuenum
    chunk = chunk.dropna(subset=['itemid'])

    # Convert subject_id to nullable integer type if necessary
    chunk.loc[:, 'subject_id'] = chunk['subject_id'].astype('Int64')

    # Filter by subject_id and itemid in one pass
    wanted = chunk['subject_id'].isin(patient_dict.keys()) & chunk['itemid'].isin(itemid_set)
    chunk = chunk[wanted]
    if chunk.empty:
        return pd.DataFrame()

    # Look up each row's extubation time once, then apply the time window as a
    # vectorised mask rather than testing the rows one by one in Python
    extubation_time = chunk['subject_id'].map(lambda x: patient_dict[x]['extubation_starttime'])
    in_window = (chunk['charttime'] >= extubation_time - time_window) & (chunk['charttime'] <= extubation_time)

    filtered_df = chunk[in_window].copy()
    if filtered_df.empty:
        return pd.DataFrame()

    filtered_df['extubation_starttime'] = extubation_time[in_window]
    filtered_df['extubation_failure'] = filtered_df['subject_id'].map(lambda x: patient_dict[x]['extubation_failure'])

    return filtered_df

In [None]:
def extract_patient_data_2_incl_null(patient_df, chartevents_file_path, itemids, time_window,
                                     chunksize=100000, progress_every=10000000):
    """Extract the pre-extubation chart events of all patients from a chartevents CSV.

    The CSV is read in chunks and each chunk is filtered with
    ``process_chunk_2_incl_null``. The surviving rows are concatenated and
    split into a data frame and a labels frame.

    Args:
        patient_df: DataFrame with a ``subject_id`` column plus the
            ``extubation_starttime`` and ``extubation_failure`` columns that
            ``process_chunk_2_incl_null`` looks up per patient.
        chartevents_file_path: Path of the chartevents CSV file.
        itemids: Iterable of itemids to extract; duplicates are ignored.
        time_window: Length of the look-back window before extubation.
        chunksize: Number of CSV rows read per chunk.
        progress_every: A progress line is printed each time the running row
            count passes another multiple of this value.

    Returns:
        Tuple ``(data, labels)``. ``data`` has the columns ``subject_id``,
        ``charttime``, ``itemid`` and ``valuenum``. ``labels`` has the columns
        ``subject_id`` and ``extubation_failure`` and holds one row per
        extracted chart event, aligned with ``data`` (it is not one row per
        patient). Two empty DataFrames are returned when nothing matches.

    Raises:
        ValueError: Presumably raised by ``to_dict('index')`` when
            ``subject_id`` is not unique in ``patient_df`` (confirm against
            the pandas documentation).
    """
    # Convert itemids to a set for faster lookup
    itemid_set = set(itemids)

    # Convert patient_df to a dictionary for fast lookups
    patient_dict = patient_df.set_index('subject_id').to_dict('index')

    # Filtered, non-empty chunks collected for the final concatenation
    results = []

    # Track the number of processed rows and the next progress threshold
    processed_rows = 0
    next_report = progress_every

    # Read only the four columns that are used, and close the file handle
    # even if processing a chunk raises
    with pd.read_csv(
        chartevents_file_path,
        usecols=['subject_id', 'charttime', 'itemid', 'valuenum'],
        parse_dates=['charttime'],
        chunksize=chunksize,
    ) as reader:
        for chunk in reader:
            # Process each chunk and filter the data
            filtered_df = process_chunk_2_incl_null(chunk, patient_dict, itemid_set, time_window)

            # Chunks without matching rows add nothing to the result
            if not filtered_df.empty:
                results.append(filtered_df)

            # Update the processed rows count
            processed_rows += len(chunk)

            # Report progress whenever the count reaches the next threshold;
            # this works for any chunk size, not only divisors of progress_every
            if processed_rows >= next_report:
                print(f"Processed {processed_rows} rows")
                next_report = (processed_rows // progress_every + 1) * progress_every

    # Nothing matched in any chunk
    if not results:
        print("Warning: No data found for the given criteria.")
        return pd.DataFrame(), pd.DataFrame() # Return empty DataFrames

    # Concatenate all the results
    result_df = pd.concat(results, ignore_index=True)

    # Extract the data and labels
    data = result_df[['subject_id', 'charttime', 'itemid', 'valuenum']]
    labels = result_df[['subject_id','extubation_failure']]

    return data, labels

In [None]:
# Extract time series data for feature set 1 and 2 and save to drive.
# The extraction scans the whole chartevents file in chunks (progress is
# printed below) using the combined itemid list and the 6-hour window.
full_data_set_2, full_labels_2 = extract_patient_data_2_incl_null(patient_df, chartevents_path, feature_set_1_and_2_dynamic, time_window_hours)

# Persist the extracted events and their labels as parquet files
full_data_set_2.to_parquet('/content/drive/MyDrive/MSc_Final_Project/02_data_analysis/mimic/data_analysis/datasets/05_time_series_data_preprocessing/feature_set_2_results/full_data_set_2.parquet')
full_labels_2.to_parquet('/content/drive/MyDrive/MSc_Final_Project/02_data_analysis/mimic/data_analysis/datasets/05_time_series_data_preprocessing/feature_set_2_results/full_labels_2.parquet')

Processed 10000000 rows
Processed 20000000 rows
Processed 30000000 rows
Processed 40000000 rows
Processed 50000000 rows
Processed 60000000 rows
Processed 70000000 rows
Processed 80000000 rows
Processed 90000000 rows
Processed 100000000 rows
Processed 110000000 rows
Processed 120000000 rows
Processed 130000000 rows
Processed 140000000 rows
Processed 150000000 rows
Processed 160000000 rows
Processed 170000000 rows
Processed 180000000 rows
Processed 190000000 rows
Processed 200000000 rows
Processed 210000000 rows
Processed 220000000 rows
Processed 230000000 rows
Processed 240000000 rows


In [None]:
# Count the number of unique patients in the extracted data
n_unique_patients = full_data_set_2['subject_id'].nunique()
print('Number of unique patients in feature set 2: ', n_unique_patients)

Number of unique patients in feature set 2:  4701


In [None]:
# Count the distinct itemids that occur in the extracted data
n_unique_itemids = full_data_set_2['itemid'].nunique()
print('Number of unique itemids in feature set 2: ', n_unique_itemids)


Number of unique itemids in feature set 2:  34


In [None]:
# Display the requested itemid list so it can be compared with the itemids found in the data
feature_set_1_and_2_dynamic

[220210,
 220277,
 228640,
 220235,
 223830,
 220224,
 220228,
 223835,
 224685,
 224686,
 224695,
 224687,
 224696,
 223849,
 224419,
 220045,
 223901,
 220739,
 220052,
 223761,
 228096,
 220050,
 220051,
 220546,
 220615,
 227457,
 220645,
 227442,
 225668,
 220621,
 225690,
 220545,
 225667,
 224697,
 220210,
 220277,
 228640,
 220235,
 223830,
 220224,
 220228,
 223835,
 224685,
 224686,
 224695,
 224687,
 224696,
 223849,
 224419]

In [None]:
# Distinct itemids that occur in the extracted data, cast to int64;
# used below for the comparison with the requested itemids
list_2 = list(full_data_set_2['itemid'].unique().astype('int64'))
list_2

[223835,
 224685,
 224686,
 224687,
 224695,
 224697,
 228640,
 220045,
 220210,
 220739,
 223901,
 220277,
 228096,
 223849,
 220050,
 220051,
 220052,
 223761,
 220545,
 220645,
 227442,
 220224,
 220235,
 223830,
 225667,
 225668,
 220228,
 220546,
 220615,
 220621,
 227457,
 224696,
 225690,
 224419]

In [None]:
# Count occurrences in both lists.
# Counter is a dict subclass that tallies every itemid in a single pass,
# instead of calling list.count() once per distinct itemid.
from collections import Counter

list1_count = Counter(feature_set_1_and_2_dynamic)
list2_count = Counter(list_2)

In [None]:
# Collect every itemid that occurs more often in the first list than in the
# second, repeated once per surplus occurrence
difference_manual = [
    num
    for num, requested in list1_count.items()
    for _ in range(requested - list2_count.get(num, 0))
]

# Sort the surplus itemids and show them
difference_manual_sorted = sorted(difference_manual)
print(difference_manual_sorted)

[220210, 220224, 220228, 220235, 220277, 223830, 223835, 223849, 224419, 224685, 224686, 224687, 224695, 224696, 228640]


In [None]:
# Number of surplus itemid entries found by the comparison
len(difference_manual_sorted)

15

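The comparison flags 15 itemids that occur more often in the requested list than in the extracted data, which at first sight suggests that no data has been collected for them.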

We can use the d_items file to identify these.

In [None]:
# Load the d_items data frame, which holds the label for each itemid
d_items_file = '/content/drive/MyDrive/MSc_Final_Project/02_data_analysis/mimic/mimic-iv-2.2-raw-data/icu/d_items.csv'
d_items_df = pd.read_csv(d_items_file)
d_items_df.head()

Unnamed: 0,itemid,label,abbreviation,linksto,category,unitname,param_type,lownormalvalue,highnormalvalue
0,220001,Problem List,Problem List,chartevents,General,,Text,,
1,220003,ICU Admission date,ICU Admission date,datetimeevents,ADT,,Date and time,,
2,220045,Heart Rate,HR,chartevents,Routine Vital Signs,bpm,Numeric,,
3,220046,Heart rate Alarm - High,HR Alarm - High,chartevents,Alarms,bpm,Numeric,,
4,220047,Heart Rate Alarm - Low,HR Alarm - Low,chartevents,Alarms,bpm,Numeric,,


In [None]:
# Identify the d_items rows for the itemids that were flagged as missing
missing_items = d_items_df[d_items_df['itemid'].isin(difference_manual_sorted)]
missing_labels = missing_items['label'].unique()

# Print the itemid next to its label (one line per distinct itemid/label pair)
missing_pairs = missing_items[['itemid', 'label']].drop_duplicates()
for itemid, label in zip(missing_pairs['itemid'], missing_pairs['label']):
    print(f"itemid: {itemid}, label: {label}")

itemid: Respiratory Rate
itemid: Arterial O2 pressure
itemid: Hemoglobin
itemid: Arterial CO2 Pressure
itemid: O2 saturation pulseoxymetry
itemid: PH (Arterial)
itemid: Inspired O2 Fraction
itemid: Ventilator Mode
itemid: Negative Insp. Force
itemid: Tidal Volume (observed)
itemid: Tidal Volume (spontaneous)
itemid: Minute Volume
itemid: Peak Insp. Pressure
itemid: Plateau Pressure
itemid: EtCO2


It seems all the itemids are from feature set 1.

In [None]:
# Flagged itemids that are not part of feature set 1 (an empty list means every flagged itemid belongs to it)
feature_diff = list(set(difference_manual_sorted).difference(set(feature_set_1_dynamic)))
feature_diff

[]

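There is no difference between feature set 1 and the flagged itemids: they are exactly the 15 itemids of feature set 1.

They were flagged because the combined feature list contained each feature set 1 itemid twice, so the count comparison picked up the duplicates. No data is actually missing, and 34 unique features is correct.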

In [None]:
# Itemids found in the results that are not part of feature set 2 (an empty list means there are none)
feature_diff_2 = list(set(list_2).difference(set(feature_set_2_dynamic)))
feature_diff_2

[]

In [None]:
# Look up the d_items rows for the itemids present in the results data
present_items = d_items_df[d_items_df['itemid'].isin(list_2)]
present_labels = present_items['label'].unique()

# Print each itemid next to its label (one line per distinct itemid/label pair)
present_pairs = present_items[['itemid', 'label']].drop_duplicates()
for itemid, label in zip(present_pairs['itemid'], present_pairs['label']):
    print(f"itemid: {itemid}, label: {label}")


itemid: Heart Rate
itemid: Arterial Blood Pressure systolic
itemid: Arterial Blood Pressure diastolic
itemid: Arterial Blood Pressure mean
itemid: Respiratory Rate
itemid: Arterial O2 pressure
itemid: Hemoglobin
itemid: Arterial CO2 Pressure
itemid: O2 saturation pulseoxymetry
itemid: Hematocrit (serum)
itemid: WBC
itemid: Creatinine (serum)
itemid: Glucose (serum)
itemid: Sodium (serum)
itemid: GCS - Eye Opening
itemid: Temperature Fahrenheit
itemid: PH (Arterial)
itemid: Inspired O2 Fraction
itemid: Ventilator Mode
itemid: GCS - Motor Response
itemid: Negative Insp. Force
itemid: Tidal Volume (observed)
itemid: Tidal Volume (spontaneous)
itemid: Minute Volume
itemid: Peak Insp. Pressure
itemid: Plateau Pressure
itemid: Mean Airway Pressure
itemid: Ionized Calcium
itemid: Lactic Acid
itemid: Total Bilirubin
itemid: Potassium (serum)
itemid: Platelet Count
itemid: Richmond-RAS Scale
itemid: EtCO2


All features selected are from feature sets 1 and 2.
