# **LSTM preprocessing for feature set 2 dynamic features**

We are now ensuring that all subsets have the same number of patients.

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt

In [None]:
# Mount Google Drive (Colab only) so the /content/drive/... dataset paths used below resolve.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# **Step 1: Load the time series data**

In [None]:
# Load the train and test data
# Both files are long-format parquet tables: one row per (subject_id, feature_label, time) observation.
train_path = '/content/drive/MyDrive/MSc_Final_Project/02_data_analysis/mimic/data_analysis/datasets/07_data_preprocessing/02_feature_set_2/03_train_data_f2_outliers_removed.parquet'
test_path = '/content/drive/MyDrive/MSc_Final_Project/02_data_analysis/mimic/data_analysis/datasets/07_data_preprocessing/02_feature_set_2/03_test_data_f2_outliers_removed.parquet'
train_df = pd.read_parquet(train_path)
test_df = pd.read_parquet(test_path)

train_df.head()

Unnamed: 0,subject_id,valuenum,time_from_window_start_mins,feature_label,extubation_failure
0,10001884,40.0,200.0,Inspired O2 Fraction,1
1,10001884,,200.0,Tidal Volume (observed),1
2,10001884,,200.0,Tidal Volume (spontaneous),1
3,10001884,6.1,200.0,Minute Volume,1
4,10001884,17.0,200.0,Peak Insp. Pressure,1


In [None]:
# Normalise the time column name to 'time_from_window_start_mins' in both frames.
# NOTE(review): the preview of train_df already shows 'time_from_window_start_mins', and
# DataFrame.rename ignores keys that are not present, so this looks like a no-op for the
# current files - confirm whether it is still needed.
train_df = train_df.rename(columns={'time_from_window_start': 'time_from_window_start_mins'})
test_df = test_df.rename(columns={'time_from_window_start': 'time_from_window_start_mins'})

train_df.head()

Unnamed: 0,subject_id,valuenum,time_from_window_start_mins,feature_label,extubation_failure
0,10001884,40.0,200.0,Inspired O2 Fraction,1
1,10001884,,200.0,Tidal Volume (observed),1
2,10001884,,200.0,Tidal Volume (spontaneous),1
3,10001884,6.1,200.0,Minute Volume,1
4,10001884,17.0,200.0,Peak Insp. Pressure,1


# **Step 2: Determine subset groups**

In [None]:
# Group by subject_id and feature_label to count the number of rows for each combination
# NOTE(review): .size() counts every row, including rows whose valuenum is NaN - confirm that
# NaN rows should contribute to the sampling frequency.
per_patient_sampling_frequency = train_df.groupby(['subject_id', 'feature_label']).size().reset_index(name='count')

# One row per patient, one column per feature; patients with no rows for a feature get 0
per_patient_sampling_frequency_pivot = per_patient_sampling_frequency.pivot(index='subject_id', columns='feature_label', values='count').fillna(0)

# Calculate the average sampling frequency per feature
average_sampling_frequency = per_patient_sampling_frequency_pivot.mean().sort_values(ascending=False)

# Create columns for the table
average_sampling_frequency_df = pd.DataFrame({'Feature': average_sampling_frequency.index, 'Average Train Set Sampling Frequency': average_sampling_frequency.values})

# Display the table
average_sampling_frequency_df

Unnamed: 0,Feature,Average Train Set Sampling Frequency
0,Heart Rate,6.610638
1,O2 saturation pulseoxymetry,6.597074
2,Respiratory Rate,6.578989
3,Arterial Blood Pressure mean,3.75133
4,Arterial Blood Pressure diastolic,3.746277
5,Arterial Blood Pressure systolic,3.745745
6,Inspired O2 Fraction,2.106383
7,GCS - Eye Opening,1.631383
8,GCS - Motor Response,1.627128
9,Tidal Volume (observed),1.616755


Low observed features have already been removed and so we are left with 19 time series features to group. **Note: the frequencies are based on the training data only**

The features will be split as follows: **keeping the same as previously done**

- Low Frequency Subset (average frequency < 1 in 6 hours)
- Medium Frequency Subset (1 ≤ average frequency < 3 in 6 hours)
- High Frequency Subset (average frequency ≥ 3 in 6 hours)

This facilitates creating subsets where each set can be resampled at a bespoke rate that better considers the actual sampling frequency of the real data.

In [None]:
# Bucket the features by their average training-set sampling frequency:
# low (< 1), medium (>= 1 and < 3) and high (>= 3).
_avg_frequency = average_sampling_frequency_df['Average Train Set Sampling Frequency']
_feature_names = average_sampling_frequency_df['Feature']

low_frequency_features = _feature_names[_avg_frequency < 1].tolist()
medium_frequency_features = _feature_names[(_avg_frequency >= 1) & (_avg_frequency < 3)].tolist()
high_frequency_features = _feature_names[_avg_frequency >= 3].tolist()

print(f"Low frequency features: {low_frequency_features}")
print(f"Medium frequency features: {medium_frequency_features}")
print(f"High frequency features: {high_frequency_features}")

Low frequency features: ['PH (Arterial)', 'Arterial O2 pressure', 'Arterial CO2 Pressure']
Medium frequency features: ['Inspired O2 Fraction', 'GCS - Eye Opening', 'GCS - Motor Response', 'Tidal Volume (observed)', 'Minute Volume', 'Mean Airway Pressure', 'Peak Insp. Pressure', 'Temperature Fahrenheit', 'Tidal Volume (spontaneous)', 'Richmond-RAS Scale']
High frequency features: ['Heart Rate', 'O2 saturation pulseoxymetry', 'Respiratory Rate', 'Arterial Blood Pressure mean', 'Arterial Blood Pressure diastolic', 'Arterial Blood Pressure systolic']


# **Step 3: Split the train and test data into three subsets but maintain all patients throughout**

Now we need to split the train and test data into the three subsets based on our feature grouping. However, we will need to make sure that all patients are represented in all subsets.

To achieve this, if the patient has no data for a feature, a value will be created at an arbitrary time point within the window and the valuenum set to NaN. This maintains the patient in the subset but also provides a platform to fill with NaN values for that feature and mask when passed through the model.

In [None]:
def split_into_subsets(df, low_features, medium_features, high_features, seed=42):
    """
    Split a long-format time series frame into low/medium/high frequency subsets,
    keeping every patient (and every patient-feature pair) in each subset.

    A patient with no rows for a feature receives a single placeholder row for it:
    valuenum is NaN and the time is one arbitrary point inside the 0-360 minute window.
    The placeholder time is drawn from a seeded generator so that re-running the
    notebook reproduces the same subsets.

    Parameters:
    df (pd.DataFrame): Frame with columns subject_id, feature_label, valuenum,
        time_from_window_start_mins (numeric minutes) and extubation_failure.
    low_features (list): Feature labels belonging to the low frequency subset.
    medium_features (list): Feature labels belonging to the medium frequency subset.
    high_features (list): Feature labels belonging to the high frequency subset.
    seed (int or None): Seed for the placeholder time draw. Pass None to draw from
        fresh entropy (non-reproducible, as in the unseeded behaviour).

    Returns:
    tuple: (low_df, medium_df, high_df), each containing every patient of df.
    """
    # Helper function to ensure all patients have all features in the subset
    def ensure_all_features_present(df, features, patients, extubation_failure_map, placeholder_time_mins):
        # Create a DataFrame with all combinations of patients and features
        all_combinations = pd.MultiIndex.from_product([patients, features], names=['subject_id', 'feature_label']).to_frame(index=False)

        # Left-merge so that combinations absent from df appear as rows of NaN
        merged = all_combinations.merge(df, on=['subject_id', 'feature_label'], how='left')

        # Give the placeholder rows one shared time point within the 0-360 mins window
        merged['time_from_window_start_mins'] = merged['time_from_window_start_mins'].fillna(placeholder_time_mins)

        # The merge leaves the label NaN on placeholder rows, so rebuild it for every row from the map
        merged['extubation_failure'] = merged['subject_id'].map(extubation_failure_map)

        return merged

    rng = np.random.default_rng(seed)

    # Get a list of all unique patients
    patients = df['subject_id'].unique()

    # Get the extubation_failure status for each patient
    extubation_failure_map = df[['subject_id', 'extubation_failure']].drop_duplicates().set_index('subject_id')['extubation_failure'].to_dict()

    # Build each subset in turn; one placeholder time is drawn per subset
    subsets = []
    for features in (low_features, medium_features, high_features):
        subset_df = df[df['feature_label'].isin(features)]
        placeholder_time_mins = rng.uniform(0, 360)
        subsets.append(ensure_all_features_present(subset_df, features, patients, extubation_failure_map, placeholder_time_mins))

    low_df, medium_df, high_df = subsets
    return low_df, medium_df, high_df

In [None]:
# Split the training frame into the three frequency subsets and confirm the patient counts match.
low_train_df, medium_train_df, high_train_df = split_into_subsets(
    train_df,
    low_frequency_features,
    medium_frequency_features,
    high_frequency_features,
)

print("Number of patients in low frequency subset:", low_train_df['subject_id'].unique().size)
print("Number of patients in medium frequency subset:", medium_train_df['subject_id'].unique().size)
print("Number of patients in high frequency subset:", high_train_df['subject_id'].unique().size)

Number of patients in low frequency subset: 3760
Number of patients in medium frequency subset: 3760
Number of patients in high frequency subset: 3760


In [None]:
# Check for NaN values in each column
# Only valuenum is expected to contain NaNs: time and label are filled inside split_into_subsets.
print(low_train_df.isnull().sum())
print(medium_train_df.isnull().sum())
print(high_train_df.isnull().sum())

subject_id                        0
feature_label                     0
valuenum                       7188
time_from_window_start_mins       0
extubation_failure                0
dtype: int64
subject_id                        0
feature_label                     0
valuenum                       5565
time_from_window_start_mins       0
extubation_failure                0
dtype: int64
subject_id                         0
feature_label                      0
valuenum                       17322
time_from_window_start_mins        0
extubation_failure                 0
dtype: int64


Let's check all patients and values are represented

In [None]:
def check_all_features_present(df, feature_list):
    """
    Verify that every patient in df has at least one row for every feature in feature_list.

    A patient-feature pair counts as missing when, after left-merging the full
    patient x feature grid onto df, both valuenum and time_from_window_start_mins are NaN.

    Parameters:
    df (pd.DataFrame): Subset frame with subject_id, feature_label, valuenum and
        time_from_window_start_mins columns.
    feature_list (list): Feature labels that each patient must have.

    Returns:
    bool: True when no pair is missing; otherwise prints the missing pairs and returns False.
    """
    subject_ids = df['subject_id'].unique()

    # Full grid of expected (patient, feature) pairs
    expected_pairs = pd.MultiIndex.from_product(
        [subject_ids, feature_list], names=['subject_id', 'feature_label']
    ).to_frame(index=False)

    joined = expected_pairs.merge(df, how='left', on=['subject_id', 'feature_label'])

    # A pair with neither a value nor a time has no row at all in df
    no_value = joined['valuenum'].isna()
    no_time = joined['time_from_window_start_mins'].isna()
    gaps = joined.loc[no_value & no_time, ['subject_id', 'feature_label']]

    if not gaps.empty:
        print("The following patient-feature combinations are missing observations:")
        print(gaps)
        return False

    return True


In [None]:
# Run the completeness check on each training subset; True means no patient-feature pair is missing.
print("Low frequency subset:")
print(check_all_features_present(low_train_df, low_frequency_features))

print("Medium frequency subset:")
print(check_all_features_present(medium_train_df, medium_frequency_features))

print("High frequency subset:")
print(check_all_features_present(high_train_df, high_frequency_features))

Low frequency subset:
True
Medium frequency subset:
True
High frequency subset:
True


Apply to test data

In [None]:
# Split the test frame with the feature groups derived from the training data.
low_test_df, medium_test_df, high_test_df = split_into_subsets(
    test_df,
    low_frequency_features,
    medium_frequency_features,
    high_frequency_features,
)

print("Number of patients in low frequency subset:", low_test_df['subject_id'].unique().size)
print("Number of patients in medium frequency subset:", medium_test_df['subject_id'].unique().size)
print("Number of patients in high frequency subset:", high_test_df['subject_id'].unique().size)

Number of patients in low frequency subset: 941
Number of patients in medium frequency subset: 941
Number of patients in high frequency subset: 941


In [None]:
# Save progress
# Writes the six subset frames (train/test x low/medium/high) as parquet files under data_path.
# NOTE(review): presumably the feature_subsets_v2 folder must already exist on Drive - confirm.
data_path = '/content/drive/MyDrive/MSc_Final_Project/02_data_analysis/mimic/data_analysis/datasets/07_data_preprocessing/02_feature_set_2/feature_subsets_v2'
low_train_df.to_parquet(f'{data_path}/01_low_train_df.parquet', index=False)
medium_train_df.to_parquet(f'{data_path}/01_medium_train_df.parquet', index=False)
high_train_df.to_parquet(f'{data_path}/01_high_train_df.parquet', index=False)
low_test_df.to_parquet(f'{data_path}/01_low_test_df.parquet', index=False)
medium_test_df.to_parquet(f'{data_path}/01_medium_test_df.parquet', index=False)
high_test_df.to_parquet(f'{data_path}/01_high_test_df.parquet', index=False)

# **Step 4: Resample and interpolate**

The same strategy will be utilised for the resampling frequency for each subset.

The resampling rate for this study will be the following:
- Low frequency: every 2 hours (seq_length = 4)
- Medium frequency: every 1 hour (seq_length = 7)
- High frequency: every 30 mins (seq_length = 13)

Resampling the low frequency data any less often than every 2 hours would leave too few time steps to predict on, which would interfere with the global prediction.

In [None]:
def count_patients_with_no_data(df, feature_list):
    """
    Count, per feature, the patients whose rows for that feature are all NaN.

    Only patients that have at least one row for the feature are considered;
    a patient is counted when every valuenum among those rows is NaN.

    Parameters:
    df (pd.DataFrame): Frame with subject_id, feature_label and valuenum columns.
    feature_list (list): Feature labels to report on.

    Returns:
    dict: Mapping of feature label -> number of patients with no observed value.
    """
    def _all_missing(values):
        # True when a patient has no observed value for the feature
        return values.isna().all()

    def _patients_without_data(feature):
        feature_rows = df[df['feature_label'] == feature]
        return feature_rows.groupby('subject_id')['valuenum'].apply(_all_missing).sum()

    return {feature: _patients_without_data(feature) for feature in feature_list}

In [None]:
# Per feature, how many training patients have no observed value at all (only NaN rows).
print("Low frequency subset:")
low_no_data_count = count_patients_with_no_data(low_train_df, low_frequency_features)
print(low_no_data_count)

print("Medium frequency subset:")
medium_no_data_count = count_patients_with_no_data(medium_train_df, medium_frequency_features)
print(medium_no_data_count)

print("High frequency subset:")
high_no_data_count = count_patients_with_no_data(high_train_df, high_frequency_features)
print(high_no_data_count)

Low frequency subset:
{'PH (Arterial)': 2378, 'Arterial O2 pressure': 2383, 'Arterial CO2 Pressure': 2390}
Medium frequency subset:
{'Inspired O2 Fraction': 77, 'GCS - Eye Opening': 257, 'GCS - Motor Response': 269, 'Tidal Volume (observed)': 529, 'Minute Volume': 381, 'Mean Airway Pressure': 158, 'Peak Insp. Pressure': 194, 'Temperature Fahrenheit': 481, 'Tidal Volume (spontaneous)': 862, 'Richmond-RAS Scale': 822}
High frequency subset:
{'Heart Rate': 1, 'O2 saturation pulseoxymetry': 13, 'Respiratory Rate': 9, 'Arterial Blood Pressure mean': 1622, 'Arterial Blood Pressure diastolic': 2204, 'Arterial Blood Pressure systolic': 1756}


In [None]:
# Apply the same to the test data
# NOTE: this reuses (and therefore overwrites) the *_no_data_count names that the previous
# cell assigned from the training subsets.
print("Low frequency subset:")
low_no_data_count = count_patients_with_no_data(low_test_df, low_frequency_features)
print(low_no_data_count)

print("Medium frequency subset:")
medium_no_data_count = count_patients_with_no_data(medium_test_df, medium_frequency_features)
print(medium_no_data_count)

print("High frequency subset:")
high_no_data_count = count_patients_with_no_data(high_test_df, high_frequency_features)
print(high_no_data_count)

Low frequency subset:
{'PH (Arterial)': 596, 'Arterial O2 pressure': 593, 'Arterial CO2 Pressure': 597}
Medium frequency subset:
{'Inspired O2 Fraction': 17, 'GCS - Eye Opening': 58, 'GCS - Motor Response': 62, 'Tidal Volume (observed)': 147, 'Minute Volume': 79, 'Mean Airway Pressure': 39, 'Peak Insp. Pressure': 51, 'Temperature Fahrenheit': 123, 'Tidal Volume (spontaneous)': 238, 'Richmond-RAS Scale': 208}
High frequency subset:
{'Heart Rate': 2, 'O2 saturation pulseoxymetry': 2, 'Respiratory Rate': 7, 'Arterial Blood Pressure mean': 411, 'Arterial Blood Pressure diastolic': 538, 'Arterial Blood Pressure systolic': 458}


Convert all to time delta format

In [None]:
# Convert the minute offsets of all six subset frames to pandas timedeltas (column replaced in place).
for _subset_df in (low_train_df, low_test_df, medium_train_df, medium_test_df, high_train_df, high_test_df):
    _subset_df['time_from_window_start_mins'] = pd.to_timedelta(
        _subset_df['time_from_window_start_mins'], unit='m'
    )

In [None]:
# Preview the low frequency training subset after the timedelta conversion.
low_train_df.head()

Unnamed: 0,subject_id,feature_label,valuenum,time_from_window_start_mins,extubation_failure
0,10001884,PH (Arterial),,0 days 02:01:21.656806236,1
1,10001884,Arterial O2 pressure,,0 days 02:01:21.656806236,1
2,10001884,Arterial CO2 Pressure,,0 days 02:01:21.656806236,1
3,10002428,PH (Arterial),7.43,0 days 05:42:00,0
4,10002428,Arterial O2 pressure,127.0,0 days 05:42:00,0


In [None]:
# Take copies of the six subset frames; the copies are what process_data receives below.
(
    low_train_df_copy,
    low_test_df_copy,
    medium_train_df_copy,
    medium_test_df_copy,
    high_train_df_copy,
    high_test_df_copy,
) = [
    _subset_df.copy()
    for _subset_df in (
        low_train_df,
        low_test_df,
        medium_train_df,
        medium_test_df,
        high_train_df,
        high_test_df,
    )
]

# **Step 5: Applying resampling and interpolation logic**

In [None]:
def fill_start_end_values(df, feature_labels, start_means, end_means, start_window, end_window_start):
    """
    Add rows at 0 and 360 minutes for every patient-feature pair that lacks them.

    For each (feature, patient) pair:
    - If the patient has no observed (non-NaN) value for the feature, NaN rows are added
      at both 0 and 360 minutes so the feature stays fully missing for that patient.
    - Otherwise, if no row exists at 0 minutes, one is added using the earliest observed
      value at or before start_window, falling back to start_means[feature].
    - Likewise, if no row exists at 360 minutes, one is added using the latest observed
      value at or after end_window_start, falling back to end_means[feature].

    Only non-NaN observations are eligible as boundary values, so a NaN-valued row inside
    the window can no longer be picked as the start/end value. A feature absent from
    start_means / end_means yields NaN instead of raising KeyError.

    Parameters:
    df (pd.DataFrame): The input DataFrame containing time series data for various features
        (time_from_window_start_mins must hold timedeltas).
    feature_labels (list): A list of feature labels to check for completeness.
    start_means (pd.Series): Mean value per feature, used when no start value is present.
    end_means (pd.Series): Mean value per feature, used when no end value is present.
    start_window (pd.Timedelta): Latest time that still counts as a starting value.
    end_window_start (pd.Timedelta): Earliest time that still counts as an ending value.

    Returns:
    pd.DataFrame: df with the new boundary rows appended (index reset when rows are added).
    """
    time_col = 'time_from_window_start_mins'
    window_open = pd.Timedelta(0)
    window_close = pd.Timedelta(minutes=360)

    subject_ids = df['subject_id'].unique()

    # Label of each patient's first row, looked up once instead of once per feature
    outcome_by_subject = df.drop_duplicates('subject_id').set_index('subject_id')['extubation_failure'].to_dict()

    # Slice the frame once per (patient, feature) rather than re-filtering inside the loops
    rows_by_key = {key: rows for key, rows in df.groupby(['subject_id', 'feature_label'], sort=False)}

    def _boundary_row(subject_id, feature, when, value):
        # Build one new boundary observation in the frame's long format
        return {
            'subject_id': subject_id,
            'feature_label': feature,
            'time_from_window_start_mins': when,
            'valuenum': value,
            'extubation_failure': outcome_by_subject[subject_id]
        }

    new_rows = []

    for feature in feature_labels:
        for subject_id in subject_ids:
            feature_rows = rows_by_key.get((subject_id, feature))
            observed = None if feature_rows is None else feature_rows[feature_rows['valuenum'].notna()]

            # No rows at all, or only NaN rows: keep the feature entirely NaN for this patient
            if observed is None or observed.empty:
                new_rows.append(_boundary_row(subject_id, feature, window_open, np.nan))
                new_rows.append(_boundary_row(subject_id, feature, window_close, np.nan))
                continue

            row_times = feature_rows[time_col]

            # Handle start values
            if not (row_times == window_open).any():
                in_start_window = observed[observed[time_col] <= start_window]
                if not in_start_window.empty:
                    # Use the earliest observed value within the start window
                    start_value = in_start_window.sort_values(time_col, kind='stable')['valuenum'].iloc[0]
                else:
                    # Use the mean start value if nothing was observed within the start window
                    start_value = start_means.get(feature, np.nan)
                new_rows.append(_boundary_row(subject_id, feature, window_open, start_value))

            # Handle end values
            if not (row_times == window_close).any():
                in_end_window = observed[observed[time_col] >= end_window_start]
                if not in_end_window.empty:
                    # Use the latest observed value within the end window
                    end_value = in_end_window.sort_values(time_col, ascending=False, kind='stable')['valuenum'].iloc[0]
                else:
                    # Use the mean end value if nothing was observed within the end window
                    end_value = end_means.get(feature, np.nan)
                new_rows.append(_boundary_row(subject_id, feature, window_close, end_value))

    # Add new rows to the dataframe
    if new_rows:
        new_df = pd.DataFrame(new_rows)
        df = pd.concat([df, new_df], ignore_index=True)

    return df

In [None]:
def resample_and_interpolate(df, feature_labels, categorical_features, initial_interval='1T', target_interval='120T'):
    """
    Resample every patient-feature series onto a regular grid over the 0-360 minute window.

    For each patient and each feature in feature_labels:
    - If the feature has no rows or only NaN values, an all-NaN series is created on the
      target grid (0, step, ..., 360 minutes).
    - Otherwise the series is upsampled to initial_interval, gaps are filled, and the
      result is sampled at target_interval. Categorical features are forward- then
      backward-filled; numerical features are linearly interpolated, and any NaNs still
      left are filled with the patient's own mean for that feature.

    Parameters:
    df (pd.DataFrame): Long-format frame with subject_id, feature_label, valuenum,
        time_from_window_start_mins and extubation_failure columns.
    feature_labels (list): Features to resample.
    categorical_features (list): Features to fill by forward/backward fill instead of interpolation.
    initial_interval (str): Fine-grained step in minutes, e.g. '1T' or '1min'.
    target_interval (str): Output step in minutes, e.g. '120T' or '120min'.

    Returns:
    pd.DataFrame: The resampled rows of all patients and features, concatenated.
        An empty copy of df's structure is returned when df has no patients.
    """
    def _interval_to_minutes(interval):
        # Accept both the legacy 'T' suffix and 'min'; the aliases are parsed here and never
        # handed to pandas, which has deprecated 'T'.
        text = str(interval).strip()
        for suffix in ('min', 'T'):
            if text.endswith(suffix):
                text = text[:-len(suffix)]
                break
        return int(text)

    initial_step = pd.Timedelta(minutes=_interval_to_minutes(initial_interval))
    target_interval_minutes = _interval_to_minutes(target_interval)
    target_step = pd.Timedelta(minutes=target_interval_minutes)

    # Grid used for features without any observation: 0, step, ..., 360 minutes
    empty_grid = pd.timedelta_range(start='0 min', periods=int(360 / target_interval_minutes + 1), freq=target_step)

    resampled_dfs = []

    # groupby slices each patient once; sort=False keeps the order of first appearance
    for subject_id, subject_df in df.groupby('subject_id', sort=False):
        subject_label = subject_df['extubation_failure'].iloc[0]

        for feature in feature_labels:
            feature_df = subject_df[subject_df['feature_label'] == feature].set_index('time_from_window_start_mins')

            # Convert index to timedelta for resampling
            feature_df.index = pd.to_timedelta(feature_df.index, unit='m')

            # Remove duplicates by taking the first value if duplicates exist
            feature_df = feature_df[~feature_df.index.duplicated(keep='first')]

            if feature_df.empty or feature_df['valuenum'].isna().all():
                # Feature completely absent or all NaN: NaNs for the entire interval
                feature_df = pd.DataFrame(index=empty_grid, columns=feature_df.columns)
                feature_df['valuenum'] = np.nan
                feature_df['extubation_failure'] = subject_label
            else:
                # Step 1: Resample to the fine grid so there is a data point at every step
                feature_df = feature_df.resample(initial_step).asfreq()

                # Step 2: Fill the gaps
                if feature in categorical_features:
                    # Categorical: carry the last known score forward, then back-fill the start
                    feature_df['valuenum'] = feature_df['valuenum'].ffill().bfill()
                else:
                    feature_df['valuenum'] = feature_df['valuenum'].interpolate(method='linear')

                    # Fill any remaining NaNs with the patient's own mean for this feature.
                    # Assign the result back: an in-place fillna on the selected column is
                    # chained assignment and does not update feature_df under Copy-on-Write.
                    if feature_df['valuenum'].isna().any():
                        feature_mean = subject_df.loc[subject_df['feature_label'] == feature, 'valuenum'].mean()
                        feature_df['valuenum'] = feature_df['valuenum'].fillna(feature_mean)

                # Step 3: Resample to the target interval
                feature_df = feature_df.resample(target_step).asfreq()

            # The label is NaN on the rows inserted by resampling; propagate it to every row
            feature_df['extubation_failure'] = feature_df['extubation_failure'].ffill().bfill()

            # Restore subject_id and feature_label columns
            feature_df['subject_id'] = subject_id
            feature_df['feature_label'] = feature

            # Reset index to retain the time information; the all-NaN grid has an unnamed index
            feature_df = feature_df.reset_index().rename(columns={'index': 'time_from_window_start_mins'})

            # Ensure the time column is a timedelta
            feature_df['time_from_window_start_mins'] = pd.to_timedelta(feature_df['time_from_window_start_mins'], unit='m')

            resampled_dfs.append(feature_df)

    # Nothing to concatenate (no patients or no features): return an empty frame instead of failing
    if not resampled_dfs:
        return df.iloc[0:0].copy()

    # Concatenate all resampled dataframes
    resampled_df = pd.concat(resampled_dfs).reset_index(drop=True)

    return resampled_df

In [None]:
def process_data(train_df, test_df, feature_labels, categorical_features, initial_interval='1T', target_interval='120T'):
    """
    Run the full resampling pipeline on a train/test pair of subset frames.

    The start and end fallback means are computed from the training frame only and are
    used for both frames. Each frame then has its 0 and 360 minute boundary rows filled
    (fill_start_end_values) and is resampled onto the target grid (resample_and_interpolate).

    Parameters:
    train_df (pd.DataFrame): Training time series data in long format.
    test_df (pd.DataFrame): Test time series data in long format.
    feature_labels (list): Feature labels to process.
    categorical_features (list): Feature labels treated as categorical numerical features.
    initial_interval (str): Fine-grained resampling step. Default is '1T' (1 minute).
    target_interval (str): Output resampling step. Default is '120T' (120 minutes).

    Returns:
    tuple: (train_df, test_df, train_mask, test_mask) where each mask is the boolean
        isna() frame of the corresponding processed frame.
    """
    # Half a target step on either side of the window defines the start / end search windows
    half_step_minutes = int(target_interval.strip('T')) // 2
    start_window = pd.to_timedelta(half_step_minutes, unit='m')
    end_window_start = pd.to_timedelta(360 - half_step_minutes, unit='m')

    def _training_mean_per_feature(row_filter):
        # Mean valuenum per feature over the selected training rows
        return train_df[row_filter].groupby('feature_label')['valuenum'].mean()

    training_times = train_df['time_from_window_start_mins']
    start_means = _training_mean_per_feature(training_times <= start_window)
    end_means = _training_mean_per_feature(training_times >= end_window_start)

    # Same two steps for both frames: fill the window boundaries, then resample
    processed = []
    for frame in (train_df, test_df):
        filled = fill_start_end_values(frame, feature_labels, start_means, end_means, start_window, end_window_start)
        processed.append(
            resample_and_interpolate(filled, feature_labels, categorical_features, initial_interval, target_interval)
        )
    train_processed, test_processed = processed

    # Masks mark the cells that are still NaN after processing
    return train_processed, test_processed, train_processed.isna(), test_processed.isna()

In [None]:
def check_processed_data(original_df, modified_df, feature_labels, target_interval='30T'):
    """
    Validate the resampled data against the data it was built from.

    For every patient in original_df and every feature in feature_labels this checks that:
    - every expected grid time (0 to 360 minutes in target_interval steps) is present;
    - the resampled values contain no NaN when the original had at least one value;
    - the resampled values are all NaN when the original had no value.
    The first violation is printed and ends the check.

    Parameters:
    original_df (pd.DataFrame): The frame before resampling.
    modified_df (pd.DataFrame): The resampled and interpolated frame.
    feature_labels (list): Feature labels to check.
    target_interval (str): The resampling step used. Default is '30T' (30 minutes).

    Returns:
    bool: True if every patient-feature pair passes, False otherwise.
    """
    step_minutes = int(target_interval.strip('T'))
    expected_intervals = [pd.Timedelta(minutes=offset) for offset in range(0, 361, step_minutes)]

    for subject_id in original_df['subject_id'].unique():
        original_subject_rows = original_df['subject_id'] == subject_id
        modified_subject_rows = modified_df['subject_id'] == subject_id

        for feature in feature_labels:
            before = original_df[original_subject_rows & (original_df['feature_label'] == feature)]
            after = modified_df[modified_subject_rows & (modified_df['feature_label'] == feature)]

            modified_values = after['valuenum'].values
            modified_times = after['time_from_window_start_mins'].values

            # Every grid point must be present
            if any(expected not in modified_times for expected in expected_intervals):
                print(f"Missing intervals for subject {subject_id}, feature {feature}")
                return False

            nan_flags = np.isnan(modified_values)
            if before['valuenum'].notna().any():
                # Observed features must be completely filled
                if nan_flags.any():
                    print(f"NaN values found in modified data for subject {subject_id}, feature {feature}")
                    return False
            elif not nan_flags.all():
                # Unobserved features must stay entirely NaN
                print(f"Non-NaN values found in modified data for subject {subject_id}, feature {feature} where original data had no values")
                return False

    return True

In [None]:
# Define categorical features
# resample_and_interpolate forward/back-fills these instead of interpolating them linearly.
categorical_features = ['GCS - Eye Opening', 'GCS - Motor Response', 'Richmond-RAS Scale']

In [None]:
# Process the low frequency data to resample to every 2 hours
low_frequency_train_resampled, low_frequency_test_resampled, low_frequency_train_mask, low_frequency_test_mask = process_data(low_train_df_copy, low_test_df_copy, low_frequency_features, categorical_features, initial_interval='1T', target_interval='120T')

# Resampling must not drop any patient
print(f"Number of patients in low frequency train set: {len(low_frequency_train_resampled['subject_id'].unique())}")
print(f"Number of patients in low frequency test set: {len(low_frequency_test_resampled['subject_id'].unique())}")

Number of patients in low frequency train set: 3760
Number of patients in low frequency test set: 941


In [None]:
# Check processing is correct
# The target_interval passed here must match the one given to process_data (120T).
check_processed_data(low_train_df_copy, low_frequency_train_resampled, low_frequency_features, target_interval='120T')

True

In [None]:
# Apply to both medium and high frequency
# Medium frequency features are resampled every 60 minutes, high frequency every 30 minutes;
# each result is validated against the training frame it was built from.
medium_frequency_train_resampled, medium_frequency_test_resampled, medium_frequency_train_mask, medium_frequency_test_mask = process_data(medium_train_df_copy, medium_test_df_copy, medium_frequency_features, categorical_features, initial_interval='1T', target_interval='60T')
print(check_processed_data(medium_train_df_copy, medium_frequency_train_resampled, medium_frequency_features, target_interval='60T'))

high_frequency_train_resampled, high_frequency_test_resampled, high_frequency_train_mask, high_frequency_test_mask = process_data(high_train_df_copy, high_test_df_copy, high_frequency_features, categorical_features, initial_interval='1T', target_interval='30T')
print(check_processed_data(high_train_df_copy, high_frequency_train_resampled, high_frequency_features, target_interval='30T'))

True
True


In [None]:
# Count the number of patients
# Every subset should report the same train count and the same test count.
print(f"Number of patients in low frequency train set: {len(low_frequency_train_resampled['subject_id'].unique())}")
print(f"Number of patients in low frequency test set: {len(low_frequency_test_resampled['subject_id'].unique())}")
print(f"Number of patients in medium frequency train set: {len(medium_frequency_train_resampled['subject_id'].unique())}")
print(f"Number of patients in medium frequency test set: {len(medium_frequency_test_resampled['subject_id'].unique())}")
print(f"Number of patients in high frequency train set: {len(high_frequency_train_resampled['subject_id'].unique())}")
print(f"Number of patients in high frequency test set: {len(high_frequency_test_resampled['subject_id'].unique())}")

Number of patients in low frequency train set: 3760
Number of patients in low frequency test set: 941
Number of patients in medium frequency train set: 3760
Number of patients in medium frequency test set: 941
Number of patients in high frequency train set: 3760
Number of patients in high frequency test set: 941


Now we need to ensure the categorical values are whole numbers and within the bounds of the relevant scoring system.

In [None]:
# Round categorical scores to whole numbers and clip them to the bounds of their scoring scale
def round_and_clip_categorical(df, categorical_features, min_value, max_value):
    """
    Round and clip the values of the given categorical features, modifying df in place.

    Rows whose valuenum is NaN are left untouched, as are rows of any other feature.

    Parameters:
    df (pd.DataFrame): Time series frame with feature_label and valuenum columns.
    categorical_features (list): Feature labels to process; all share the same bounds.
    min_value (int): Lower bound applied after rounding.
    max_value (int): Upper bound applied after rounding.

    Returns:
    pd.DataFrame: The same df object, with the selected values rounded and clipped.
    """
    # Rounding and clipping never add or remove NaNs, so this mask holds for every feature
    has_value = df['valuenum'].notna()

    for feature in categorical_features:
        target_rows = has_value & (df['feature_label'] == feature)
        rounded = df.loc[target_rows, 'valuenum'].round()
        df.loc[target_rows, 'valuenum'] = rounded.clip(min_value, max_value)

    return df

In [None]:
# feature_label value that identifies the Richmond-RAS Scale rows
ras_feature = 'Richmond-RAS Scale'

In [None]:
# RAS score has scale between -5 and 4
medium_frequency_train_resampled = round_and_clip_categorical(medium_frequency_train_resampled_copy, [ras_feature], -5, 4)
medium_frequency_test_resampled = round_and_clip_categorical(medium_frequency_test_resampled_copy, [ras_feature], -5, 4)

In [None]:
gcs_eyes = 'GCS - Eye Opening'
gcs_motor = 'GCS - Motor Response'

GCS - Eyes is between 1 and 4

GCS - Motor is between 1 to 6

In [None]:
# Round GCS values
medium_frequency_train_resampled = round_and_clip_categorical(medium_frequency_train_resampled_copy, [gcs_eyes], 1, 4)
medium_frequency_test_resampled = round_and_clip_categorical(medium_frequency_test_resampled_copy, [gcs_eyes], 1, 4)

medium_frequency_train_resampled = round_and_clip_categorical(medium_frequency_train_resampled_copy, [gcs_motor], 1, 6)
medium_frequency_test_resampled = round_and_clip_categorical(medium_frequency_test_resampled_copy, [gcs_motor], 1, 6)

In [None]:
# Display only the GCS motor rows to eyeball the rounded / clipped values
medium_frequency_train_resampled.loc[medium_frequency_train_resampled['feature_label'] == gcs_motor]

Unnamed: 0,time_from_window_start_mins,subject_id,feature_label,valuenum,extubation_failure
14,0 days 00:00:00,10001884,GCS - Motor Response,6.0,1.0
15,0 days 01:00:00,10001884,GCS - Motor Response,6.0,1.0
16,0 days 02:00:00,10001884,GCS - Motor Response,6.0,1.0
17,0 days 03:00:00,10001884,GCS - Motor Response,6.0,1.0
18,0 days 04:00:00,10001884,GCS - Motor Response,6.0,1.0
...,...,...,...,...,...
263146,0 days 02:00:00,17923146,GCS - Motor Response,5.0,0.0
263147,0 days 03:00:00,17923146,GCS - Motor Response,6.0,0.0
263148,0 days 04:00:00,17923146,GCS - Motor Response,6.0,0.0
263149,0 days 05:00:00,17923146,GCS - Motor Response,6.0,0.0


In [None]:
# Take an independent copy of each of the six resampled frames
(
    low_frequency_train_resampled_copy,
    low_frequency_test_resampled_copy,
    medium_frequency_train_resampled_copy,
    medium_frequency_test_resampled_copy,
    high_frequency_train_resampled_copy,
    high_frequency_test_resampled_copy,
) = (
    resampled_frame.copy()
    for resampled_frame in (
        low_frequency_train_resampled,
        low_frequency_test_resampled,
        medium_frequency_train_resampled,
        medium_frequency_test_resampled,
        high_frequency_train_resampled,
        high_frequency_test_resampled,
    )
)

In [None]:
# Save progress: checkpoint the six resampled (not yet scaled) frames as parquet files.
# The '02_' filename prefix marks this stage; the scaled frames are written with '03_' later.
data_path = '/content/drive/MyDrive/MSc_Final_Project/02_data_analysis/mimic/data_analysis/datasets/07_data_preprocessing/02_feature_set_2/feature_subsets_v2'
low_frequency_train_resampled.to_parquet(f'{data_path}/02_low_frequency_train_resampled.parquet')
low_frequency_test_resampled.to_parquet(f'{data_path}/02_low_frequency_test_resampled.parquet')
medium_frequency_train_resampled.to_parquet(f'{data_path}/02_medium_frequency_train_resampled.parquet')
medium_frequency_test_resampled.to_parquet(f'{data_path}/02_medium_frequency_test_resampled.parquet')
high_frequency_train_resampled.to_parquet(f'{data_path}/02_high_frequency_train_resampled.parquet')
high_frequency_test_resampled.to_parquet(f'{data_path}/02_high_frequency_test_resampled.parquet')


# **Step 6: Feature scaling**

In [None]:
from sklearn.preprocessing import MinMaxScaler

In [None]:
def scale_features(train_df, test_df, numerical_features):
    """
    Min-max scale each listed feature, fitting on train and applying the fit to test.

    One MinMaxScaler is fitted per feature label on the train rows of that feature and
    then applied to the matching test rows. Missing values are passed to the scaler
    as NaN: MinMaxScaler ignores NaNs when fitting and keeps them as NaN when
    transforming, so they no longer distort the fitted minimum (filling them with 0
    beforehand squeezed every feature whose real minimum is above 0).

    The inputs are not modified; scaled copies with a fresh 0..n-1 index are returned.

    Parameters:
    train_df (pd.DataFrame): Long-format training data with 'feature_label' and 'valuenum'.
    test_df (pd.DataFrame): Long-format testing data with the same columns.
    numerical_features (list): List of feature labels to be scaled.

    Returns:
    tuple: The scaled train and test dataframes.
    """
    # Work on re-indexed copies so the callers' frames are left untouched
    train_scaled = train_df.copy().reset_index(drop=True)
    test_scaled = test_df.copy().reset_index(drop=True)

    for feature in numerical_features:
        # Rows belonging to this feature in each frame
        train_rows = train_scaled['feature_label'] == feature
        test_rows = test_scaled['feature_label'] == feature

        # Column vectors for the scaler; missing values stay as NaN
        train_values = train_scaled.loc[train_rows, 'valuenum'].to_numpy(dtype=float, na_value=np.nan).reshape(-1, 1)
        test_values = test_scaled.loc[test_rows, 'valuenum'].to_numpy(dtype=float, na_value=np.nan).reshape(-1, 1)

        # Nothing to fit on: leave the feature as it is instead of failing in the scaler
        if train_values.size == 0 or np.isnan(train_values).all():
            print(f'Feature {feature} has no observed training values and was left unscaled')
            continue

        # Fit on train only, then reuse the fitted range for test (no test leakage)
        scaler = MinMaxScaler()
        train_scaled.loc[train_rows, 'valuenum'] = scaler.fit_transform(train_values).ravel()
        if test_values.size > 0:
            test_scaled.loc[test_rows, 'valuenum'] = scaler.transform(test_values).ravel()

        print(f'Feature {feature} has been normalized')

    # Display the sizes after normalization
    print(f"Number of rows in train dataframe after normalization: {len(train_scaled)}")
    print(f"Number of rows in test dataframe after normalization: {len(test_scaled)}")

    return train_scaled, test_scaled

In [None]:
# Scale every subset; each scaler is fitted on the train frame and reused for the test frame
low_frequency_train_scaled, low_frequency_test_scaled = scale_features(
    low_frequency_train_resampled,
    low_frequency_test_resampled,
    low_frequency_features,
)
medium_frequency_train_scaled, medium_frequency_test_scaled = scale_features(
    medium_frequency_train_resampled,
    medium_frequency_test_resampled,
    medium_frequency_features,
)
high_frequency_train_scaled, high_frequency_test_scaled = scale_features(
    high_frequency_train_resampled,
    high_frequency_test_resampled,
    high_frequency_features,
)


Feature PH (Arterial) has been normalized
Feature Arterial O2 pressure has been normalized
Feature Arterial CO2 Pressure has been normalized
Number of rows in train dataframe after normalization: 45120
Number of rows in test dataframe after normalization: 11292
Feature Inspired O2 Fraction has been normalized
Feature GCS - Eye Opening has been normalized
Feature GCS - Motor Response has been normalized
Feature Tidal Volume (observed) has been normalized
Feature Minute Volume has been normalized
Feature Mean Airway Pressure has been normalized
Feature Peak Insp. Pressure has been normalized
Feature Temperature Fahrenheit has been normalized
Feature Tidal Volume (spontaneous) has been normalized
Feature Richmond-RAS Scale has been normalized
Number of rows in train dataframe after normalization: 263200
Number of rows in test dataframe after normalization: 65870
Feature Heart Rate has been normalized
Feature O2 saturation pulseoxymetry has been normalized
Feature Respiratory Rate has been

In [None]:
# Count the number of patients in each dataset
# Re-check after scaling: the counts should match those printed for the resampled frames.
print(f"Number of patients in low frequency train set: {low_frequency_train_scaled['subject_id'].nunique()}")
print(f"Number of patients in low frequency test set: {low_frequency_test_scaled['subject_id'].nunique()}")
print(f"Number of patients in medium frequency train set: {medium_frequency_train_scaled['subject_id'].nunique()}")
print(f"Number of patients in medium frequency test set: {medium_frequency_test_scaled['subject_id'].nunique()}")
print(f"Number of patients in high frequency train set: {high_frequency_train_scaled['subject_id'].nunique()}")
print(f"Number of patients in high frequency test set: {high_frequency_test_scaled['subject_id'].nunique()}")


Number of patients in low frequency train set: 3760
Number of patients in low frequency test set: 941
Number of patients in medium frequency train set: 3760
Number of patients in medium frequency test set: 941
Number of patients in high frequency train set: 3760
Number of patients in high frequency test set: 941


In [None]:
# Save scaled data: checkpoint the six scaled frames as parquet ('03_' filename prefix),
# in the same folder as the resampled checkpoint.
data_path = '/content/drive/MyDrive/MSc_Final_Project/02_data_analysis/mimic/data_analysis/datasets/07_data_preprocessing/02_feature_set_2/feature_subsets_v2'
low_frequency_train_scaled.to_parquet(f'{data_path}/03_low_frequency_train_scaled.parquet')
low_frequency_test_scaled.to_parquet(f'{data_path}/03_low_frequency_test_scaled.parquet')
medium_frequency_train_scaled.to_parquet(f'{data_path}/03_medium_frequency_train_scaled.parquet')
medium_frequency_test_scaled.to_parquet(f'{data_path}/03_medium_frequency_test_scaled.parquet')
high_frequency_train_scaled.to_parquet(f'{data_path}/03_high_frequency_train_scaled.parquet')
high_frequency_test_scaled.to_parquet(f'{data_path}/03_high_frequency_test_scaled.parquet')

# **Step 7: Create sequences for LSTM**

In [None]:
# Cast the extubation_failure outcome column to int64 in every scaled frame
for _scaled_frame in (
    low_frequency_train_scaled,
    low_frequency_test_scaled,
    medium_frequency_train_scaled,
    medium_frequency_test_scaled,
    high_frequency_train_scaled,
    high_frequency_test_scaled,
):
    _scaled_frame['extubation_failure'] = _scaled_frame['extubation_failure'].astype('int64')


In [None]:
# Inspect dtypes and non-null counts; extubation_failure should now be int64
low_frequency_train_scaled.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45120 entries, 0 to 45119
Data columns (total 5 columns):
 #   Column                       Non-Null Count  Dtype          
---  ------                       --------------  -----          
 0   time_from_window_start_mins  45120 non-null  timedelta64[ns]
 1   subject_id                   45120 non-null  int64          
 2   feature_label                45120 non-null  object         
 3   valuenum                     16516 non-null  float64        
 4   extubation_failure           45120 non-null  int64          
dtypes: float64(1), int64(2), object(1), timedelta64[ns](1)
memory usage: 1.7+ MB


In [None]:
# Take an independent copy of each scaled frame; sequence creation works on these copies
(
    low_frequency_train_scaled_copy,
    low_frequency_test_scaled_copy,
    medium_frequency_train_scaled_copy,
    medium_frequency_test_scaled_copy,
    high_frequency_train_scaled_copy,
    high_frequency_test_scaled_copy,
) = (
    scaled_frame.copy()
    for scaled_frame in (
        low_frequency_train_scaled,
        low_frequency_test_scaled,
        medium_frequency_train_scaled,
        medium_frequency_test_scaled,
        high_frequency_train_scaled,
        high_frequency_test_scaled,
    )
)

**Function to create sequences**

In [None]:
def create_sequences(data, sequence_length):
    """
    Convert long-format time series data into fixed-length sequences for LSTM input.

    For every subject (in ascending subject_id order) the rows are pivoted to a
    time x feature table and a window of `sequence_length` consecutive time steps is
    slid over it one step at a time. Each window is labelled with the
    'extubation_failure' value recorded at the window's last time step.

    Feature columns follow the order in which labels first appear in `data`; a feature
    a subject never recorded becomes an all-NaN column instead of raising a KeyError.
    Subjects with fewer than `sequence_length` time steps contribute no sequences.

    Parameters:
    data (pd.DataFrame): Long-format data with 'subject_id', 'time_from_window_start_mins',
        'feature_label', 'valuenum' and 'extubation_failure' columns.
    sequence_length (int): The number of time steps in each sequence.

    Returns:
    np.array: Sequences, shape (n_sequences, sequence_length, n_features).
    np.array: Labels, one per sequence.
    list: Subject IDs corresponding to each sequence.
    """
    sequences = []
    labels = []
    subject_ids = []

    # Column order shared by every subject's pivot table
    feature_labels = data['feature_label'].unique()

    grouped = data.groupby('subject_id')

    # Process subjects in a deterministic order so different subsets line up
    for subject_id in sorted(grouped.groups.keys()):
        group = grouped.get_group(subject_id)

        # time x feature table (pivot sorts the time index); absent features become NaN columns
        pivoted_data = group.pivot(
            index='time_from_window_start_mins',
            columns='feature_label',
            values='valuenum',
        ).reindex(columns=feature_labels)

        # One label per time step, aligned with the pivoted rows. `group` is long format
        # (one row per time step and feature), so positions in it are not time steps.
        labels_by_time = (
            group.groupby('time_from_window_start_mins')['extubation_failure']
            .first()
            .reindex(pivoted_data.index)
        )

        window_source = pivoted_data.to_numpy()
        for start in range(len(pivoted_data) - sequence_length + 1):
            stop = start + sequence_length
            sequences.append(window_source[start:stop])
            labels.append(labels_by_time.iloc[stop - 1])
            subject_ids.append(subject_id)

    return np.array(sequences), np.array(labels), subject_ids

In [None]:
# Set the sequence length for each dataset
# Steps per patient = 360 // <minutes per step> + 1; the +1 counts both ends of the span.
# NOTE(review): 360 is presumably the observation window in minutes, and 120 / 60 / 30 the
# resampling intervals of the low / medium / high subsets — confirm against the
# target_interval values passed to process_data.
low_frequency_seq_length = 360 // 120 + 1
medium_frequency_seq_length = 360 // 60 + 1
high_frequency_seq_length = 360 // 30 + 1

print(f"Low frequency sequence length: {low_frequency_seq_length}")
print(f"Medium frequency sequence length: {medium_frequency_seq_length}")
print(f"High frequency sequence length: {high_frequency_seq_length}")

Low frequency sequence length: 4
Medium frequency sequence length: 7
High frequency sequence length: 13


In [None]:
# Build (sequences, labels, subject ids) for every subset: train frames first, then test
(
    low_frequency_train_sequences,
    low_frequency_train_labels,
    low_frequency_train_subject_ids,
) = create_sequences(low_frequency_train_scaled_copy, low_frequency_seq_length)
(
    medium_frequency_train_sequences,
    medium_frequency_train_labels,
    medium_frequency_train_subject_ids,
) = create_sequences(medium_frequency_train_scaled_copy, medium_frequency_seq_length)
(
    high_frequency_train_sequences,
    high_frequency_train_labels,
    high_frequency_train_subject_ids,
) = create_sequences(high_frequency_train_scaled_copy, high_frequency_seq_length)

(
    low_frequency_test_sequences,
    low_frequency_test_labels,
    low_frequency_test_subject_ids,
) = create_sequences(low_frequency_test_scaled_copy, low_frequency_seq_length)
(
    medium_frequency_test_sequences,
    medium_frequency_test_labels,
    medium_frequency_test_subject_ids,
) = create_sequences(medium_frequency_test_scaled_copy, medium_frequency_seq_length)
(
    high_frequency_test_sequences,
    high_frequency_test_labels,
    high_frequency_test_subject_ids,
) = create_sequences(high_frequency_test_scaled_copy, high_frequency_seq_length)

# Report array shapes: sequences are 3-D, labels are 1-D
print(f"Low frequency train sequences shape: {low_frequency_train_sequences.shape}")
print(f"Low frequency train labels shape: {low_frequency_train_labels.shape}")
print(f"Low frequency test sequences shape: {low_frequency_test_sequences.shape}")
print(f"Low frequency test labels shape: {low_frequency_test_labels.shape}")

print(f"Medium frequency train sequences shape: {medium_frequency_train_sequences.shape}")
print(f"Medium frequency train labels shape: {medium_frequency_train_labels.shape}")
print(f"Medium frequency test sequences shape: {medium_frequency_test_sequences.shape}")
print(f"Medium frequency test labels shape: {medium_frequency_test_labels.shape}")

print(f"High frequency train sequences shape: {high_frequency_train_sequences.shape}")
print(f"High frequency train labels shape: {high_frequency_train_labels.shape}")
print(f"High frequency test sequences shape: {high_frequency_test_sequences.shape}")
print(f"High frequency test labels shape: {high_frequency_test_labels.shape}")

Low frequency train sequences shape: (3760, 4, 3)
Low frequency train labels shape: (3760,)
Low frequency test sequences shape: (941, 4, 3)
Low frequency test labels shape: (941,)
Medium frequency train sequences shape: (3760, 7, 10)
Medium frequency train labels shape: (3760,)
Medium frequency test sequences shape: (941, 7, 10)
Medium frequency test labels shape: (941,)
High frequency train sequences shape: (3760, 13, 6)
High frequency train labels shape: (3760,)
High frequency test sequences shape: (941, 13, 6)
High frequency test labels shape: (941,)


In [None]:
# Check that all the subject id lists are in the same order
def check_subject_ids_order(subject_ids_list):
    """
    Report whether every subject_ids list matches the first one exactly.

    Parameters:
    subject_ids_list (list of lists): Subject ID lists, one per subset.

    Returns:
    bool: True when all lists hold the same IDs in the same order (or when fewer
    than two lists are given), False otherwise.
    """
    # Zero or one list: there is nothing to compare against
    if len(subject_ids_list) < 2:
        return True

    # The first list is the reference every other list must equal
    baseline, *remaining = subject_ids_list
    return not any(baseline != candidate for candidate in remaining)

In [None]:
# Collect the per-subset subject id lists returned by create_sequences
subject_id_list_train = [low_frequency_train_subject_ids, medium_frequency_train_subject_ids, high_frequency_train_subject_ids]
subject_id_list_test = [low_frequency_test_subject_ids, medium_frequency_test_subject_ids, high_frequency_test_subject_ids]

# Both checks must print True: the three subsets are only usable together if row i
# refers to the same patient in each of them
print(check_subject_ids_order(subject_id_list_train))
print(check_subject_ids_order(subject_id_list_test))

True
True


In [None]:
# Folder on the mounted Drive that receives the .npy arrays saved below
output_dir = '/content/drive/MyDrive/MSc_Final_Project/02_data_analysis/mimic/data_analysis/datasets/08_model_input_data/02_feature_set_2/01_lstm_data/dynamic_data'

In [None]:
# Save the sequences and labels
# One .npy file per array: {low, medium, high} x {train, test} x {sequences, labels}
np.save(f'{output_dir}/low_frequency_train_sequences_v1.npy', low_frequency_train_sequences)
np.save(f'{output_dir}/low_frequency_train_labels_v1.npy', low_frequency_train_labels)
np.save(f'{output_dir}/low_frequency_test_sequences_v1.npy', low_frequency_test_sequences)
np.save(f'{output_dir}/low_frequency_test_labels_v1.npy', low_frequency_test_labels)

np.save(f'{output_dir}/medium_frequency_train_sequences_v1.npy', medium_frequency_train_sequences)
np.save(f'{output_dir}/medium_frequency_train_labels_v1.npy', medium_frequency_train_labels)
np.save(f'{output_dir}/medium_frequency_test_sequences_v1.npy', medium_frequency_test_sequences)
np.save(f'{output_dir}/medium_frequency_test_labels_v1.npy', medium_frequency_test_labels)

np.save(f'{output_dir}/high_frequency_train_sequences_v1.npy', high_frequency_train_sequences)
np.save(f'{output_dir}/high_frequency_train_labels_v1.npy', high_frequency_train_labels)
np.save(f'{output_dir}/high_frequency_test_sequences_v1.npy', high_frequency_test_sequences)
np.save(f'{output_dir}/high_frequency_test_labels_v1.npy', high_frequency_test_labels)

In [None]:
# Save the subject ids for ordering static data
# Only the low-frequency lists are written; this is valid only while the order check
# above prints True for both train and test (i.e. all three subsets share the same ids).
np.save(f'{output_dir}/train_subject_ids_v1.npy', low_frequency_train_subject_ids)
np.save(f'{output_dir}/test_subject_ids_v1.npy', low_frequency_test_subject_ids)

In [None]:
# Save the feature names for feature ablation
# unique() lists labels in order of first appearance. create_sequences builds its feature
# axis from the same expression on the copied frames, so these names should line up with
# the last axis of the saved sequences as long as the frames were not reordered in between.
low_freq_feature_names = low_frequency_train_scaled['feature_label'].unique()
np.save(f'{output_dir}/low_freq_feature_names_v1.npy', low_freq_feature_names)

medium_freq_feature_names = medium_frequency_train_scaled['feature_label'].unique()
np.save(f'{output_dir}/medium_freq_feature_names_v1.npy', medium_freq_feature_names)

high_freq_feature_names = high_frequency_train_scaled['feature_label'].unique()
np.save(f'{output_dir}/high_freq_feature_names_v1.npy', high_freq_feature_names)