This notebook performs data preparation tasks such as splitting the data into train, validation, and test sets, as well as data standardization for numeric features and one-hot encoding for categorical features. It also balances the data due to a 95% (0) and 5% (1) class imbalance of the target variable.

### Imports

In [89]:
import os
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.feature_selection import VarianceThreshold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score, average_precision_score, precision_recall_curve
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
from IPython.core.interactiveshell import InteractiveShell
# Display the result of every top-level expression in a cell, not only the last one
# (the verification cells below rely on this to show both .head() and .shape)
InteractiveShell.ast_node_interactivity = "all"

# Lift pandas' display limits so long and wide frames are rendered in full
display_overrides = {
    'display.max_rows': None,       # None = no limit on displayed rows
    'display.max_columns': None,    # None = no limit on displayed columns
    'display.width': 0,             # Adjust display width for console-like output
    'display.max_colwidth': None,   # Show full content of each column
}
for option_name, option_value in display_overrides.items():
    pd.set_option(option_name, option_value)

### Read in Data

In [90]:
# Change working directory to the folder that holds the engineered dataset.
# The location can be overridden with the PE_EDA_DIR environment variable so the notebook
# is not tied to a single machine; when the variable is unset the original path is used.
wd = os.environ.get(
    "PE_EDA_DIR",
    "C://Users//alexm//OneDrive//Desktop//Northwestern//Winter 2025//MSDS 498//Capstone Project//PE_Predictions//EDA_Engineering",
)
os.chdir(wd)

In [91]:
# Load the engineered dataset; the relative path is resolved against the current working directory
df = pd.read_csv('engineered.csv')

### Define Features and Target

In [None]:
# Name of the column the models will predict
target = 'pe_outcome'

# Columns left out of the feature matrix: they are either identifiers, the target field,
# non-transformed fields, or fields used only for EDA and not for modeling
not_including = [
    'subject_id',
    'hadm_id',
    'dvt_date',
    'pe_date',
    'pe_outcome',
    'length_of_stay',
    'num_dvt_admissions',
    'dvt_icd_code',
    'dvt_icd_version',
    'dvt_diagnosis',
    'pe_icd_code',
    'pe_icd_version',
    'pe_diagnosis',
    'num_dvt_diagnoses',
    'log_num_dvt_diagnoses',
    'hx_dvt',
    'num_pe_events',
    'cat_days_to_pe',
]

In [70]:
# Feature matrix: every column except the excluded ones
X = df.drop(columns = not_including)
# Target kept as a one-column DataFrame (double brackets / list selector), not a Series
y = df.loc[:, [target]]

### Split Data Into Train, Validation, and Test Sets

In [71]:
# Stage 1: hold out 30% of the rows (stratified on the target), leaving 70% for training
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y,
    test_size = 0.3,
    stratify = y,
    random_state = 8,
)

# Stage 2: split the held-out 30% in half (again stratified) -> 15% validation, 15% test
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp,
    test_size = 0.5,
    stratify = y_temp,
    random_state = 8,
)

### Define One-Hot Encoding and Standardization Pipeline

In [72]:
# Group the feature columns by dtype so each group can be routed to its own transformer.
# Only int64/float64 columns count as numeric and only object/category columns as categorical.
# NOTE(review): a column of any other dtype (e.g. bool, int32) lands in neither list — confirm none exist in X.
numeric_dtypes = ['int64', 'float64']
categorical_dtypes = ['object', 'category']
numeric_features = list(X.select_dtypes(include = numeric_dtypes).columns)
categorical_features = list(X.select_dtypes(include = categorical_dtypes).columns)

In [73]:
# Column-wise preprocessing: standardize the numeric features and one-hot encode the
# categorical ones. handle_unknown = "ignore" keeps transform() from raising on categories
# that were not present when the encoder was fitted.
# sparse_threshold = 0 forces a dense array as output: the cells that follow wrap the result
# in a pandas DataFrame with named columns, which breaks if a sparse matrix is returned
# (ColumnTransformer does that by default when the stacked output is sparse enough).
preprocessor = ColumnTransformer(
    [
        ("num", StandardScaler(), numeric_features),  # Standardize numerical features
        ("cat", OneHotEncoder(handle_unknown = "ignore"), categorical_features),  # One-hot encode categorical features
    ],
    sparse_threshold = 0,
)

### Apply Pipeline to Training Data

In [74]:
# Fit the preprocessor on the training set only, then transform the training set
X_train_preprocessed = preprocessor.fit_transform(X_train)

# Convert back to a dataframe with column names
# (rows keep X_train's order, but the index is a fresh RangeIndex rather than X_train's index)
X_train_preprocessed = pd.DataFrame(X_train_preprocessed, columns = preprocessor.get_feature_names_out())

# Remove the `num__` / `cat__` transformer prefixes from column names.
# The pattern is anchored with ^ so only a leading prefix is stripped; an occurrence of
# "num__" or "cat__" elsewhere in a feature or category name is left untouched.
X_train_preprocessed.columns = X_train_preprocessed.columns.str.replace(r"^(?:num|cat)__", "", regex = True)

# Verify that column names are correct
X_train_preprocessed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3087 entries, 0 to 3086
Data columns (total 76 columns):
 #   Column                                                           Non-Null Count  Dtype  
---  ------                                                           --------------  -----  
 0   had_dvt_as_pri_diagnosis                                         3087 non-null   float64
 1   had_icu_stay                                                     3087 non-null   float64
 2   hx_ac                                                            3087 non-null   float64
 3   hx_pe                                                            3087 non-null   float64
 4   hx_vte                                                           3087 non-null   float64
 5   age                                                              3087 non-null   float64
 6   myocardial_infarct                                               3087 non-null   float64
 7   congestive_heart_failure                  

### Variance-Based Feature Selection
This step identifies features with minimal variance, meaning they exhibit little variation across samples and may not contribute meaningful information to the model. VarianceThreshold is used to detect these low-variance features based on the training data, allowing for potential later removal to improve model efficiency and reduce noise.

In [75]:
# Variance cut-off used to flag near-constant features
variance_threshold = 0.01

# Build the selector and fit it on the preprocessed training data in one step
selector = VarianceThreshold(threshold = variance_threshold).fit(X_train_preprocessed)

# Boolean mask over the columns: True where the selector keeps the feature
support_mask = selector.get_support()

# Columns outside the mask are the low-variance ones
# NOTE(review): numeric columns were standardized upstream, so presumably only the
# one-hot columns can fall under the threshold — confirm this is the intended check.
low_variance_features = X_train_preprocessed.columns[~support_mask]

# Report which features were flagged and how many
print("Low-Variance Features:", low_variance_features.tolist())
print(f"Number of low-variance features: {len(low_variance_features)}")

Low-Variance Features: ['insurance_Unknown', 'cat_days_to_init_treatment_More than 7 days', 'treatment_grouped_MT', 'treatment_grouped_Multiple Interventions', 'race_grouped_American Indian/Alaska Native', 'race_grouped_Multiracial', 'race_grouped_Native Hawaiian/Pacific Islander', 'race_grouped_Portuguese', 'discharge_location_grouped_Against Medical Advice', 'admission_location_grouped_Unknown']
Number of low-variance features: 10


In [93]:
# Convert low-variance features to a single-column DataFrame
low_variance_df = pd.DataFrame(low_variance_features, columns=["LowVarianceFeatures"])

# Change working directory to the feature-selection output folder.
# The location can be overridden with the PE_FEATURE_SELECTION_DIR environment variable so
# the notebook is not tied to a single machine; when unset the original path is used.
# Note: os.chdir persists for the rest of the session, so the relative-path CSV saves at the
# end of this notebook are also written to this folder.
wd = os.environ.get(
    "PE_FEATURE_SELECTION_DIR",
    "C://Users//alexm//OneDrive//Desktop//Northwestern//Winter 2025//MSDS 498//Capstone Project//PE_Predictions//Modeling//Feature_Selection",
)
os.chdir(wd)

# Save as CSV (without the DataFrame index)
low_variance_df.to_csv("low_variance_features.csv", index = False)

In [79]:
# Inspect the preprocessed training features: first five rows, then the (rows, columns) shape
X_train_preprocessed.head()
X_train_preprocessed.shape

Unnamed: 0,had_dvt_as_pri_diagnosis,had_icu_stay,hx_ac,hx_pe,hx_vte,age,myocardial_infarct,congestive_heart_failure,peripheral_vascular_disease,cerebrovascular_disease,dementia,chronic_pulmonary_disease,rheumatic_disease,peptic_ulcer_disease,mild_liver_disease,diabetes_without_cc,diabetes_with_cc,paraplegia,renal_disease,malignant_cancer,severe_liver_disease,metastatic_solid_tumor,aids,charlson_comorbidity_index,had_ddimer,had_o2_sat,log_length_of_stay,log_num_dvt_admissions,insurance_Medicaid,insurance_Medicare,insurance_Other,insurance_Private,insurance_Unknown,marital_status_DIVORCED,marital_status_MARRIED,marital_status_SINGLE,marital_status_Unknown,marital_status_WIDOWED,gender_F,gender_M,dvt_chronicity_Acute,dvt_chronicity_Chronic,dvt_chronicity_Unspecified,dvt_location_Lower,dvt_location_Unspecified,dvt_location_Upper,cat_days_to_init_treatment_1-3 days,cat_days_to_init_treatment_4-7 days,cat_days_to_init_treatment_More than 7 days,cat_days_to_init_treatment_No Treatment,cat_days_to_init_treatment_Same day,cat_days_to_init_treatment_Unknown,treatment_grouped_AC Only,treatment_grouped_CDT,treatment_grouped_Lytics,treatment_grouped_MT,treatment_grouped_Multiple Interventions,treatment_grouped_No Treatment,race_grouped_American Indian/Alaska Native,race_grouped_Asian,race_grouped_Black,race_grouped_Hispanic/Latino,race_grouped_Multiracial,race_grouped_Native Hawaiian/Pacific Islander,race_grouped_Portuguese,race_grouped_Unknown,race_grouped_White,discharge_location_grouped_Against Medical Advice,discharge_location_grouped_Facility-Based Care,discharge_location_grouped_Home/Community-Based Care,discharge_location_grouped_Unknown,admission_location_grouped_Emergency/Urgent Care,admission_location_grouped_Referral-Based Admissions,admission_location_grouped_Scheduled/Procedure-Based Admissions,admission_location_grouped_Transfer from Another Facility,admission_location_grouped_Unknown
0,-0.56251,0.888575,0.968428,2.653123,0.981703,-0.477586,-0.300712,-0.475366,-0.313969,-0.342438,-0.201979,2.031725,-0.196709,-0.167248,-0.325052,-0.504854,-0.332913,-0.203711,-0.524995,-0.570991,-0.221201,-0.413633,-0.084722,-0.921879,-0.168269,-0.348313,0.088602,1.117888,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
1,-0.56251,0.888575,0.968428,-0.376914,0.981703,0.07105,-0.300712,-0.475366,-0.313969,-0.342438,-0.201979,2.031725,-0.196709,-0.167248,-0.325052,1.980769,-0.332913,-0.203711,-0.524995,-0.570991,-0.221201,-0.413633,-0.084722,-0.304961,-0.168269,-0.348313,-0.175036,1.117888,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
2,-0.56251,-1.125398,-1.032601,-0.376914,0.981703,0.13201,-0.300712,-0.475366,-0.313969,-0.342438,-0.201979,-0.492193,-0.196709,-0.167248,-0.325052,-0.504854,3.003785,4.908927,-0.524995,-0.570991,-0.221201,-0.413633,-0.084722,0.311956,-0.168269,-0.348313,0.299114,-0.463015,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,1.777745,-1.125398,0.968428,-0.376914,0.981703,-0.538545,-0.300712,-0.475366,-0.313969,-0.342438,-0.201979,-0.492193,-0.196709,-0.167248,-0.325052,-0.504854,-0.332913,-0.203711,-0.524995,-0.570991,-0.221201,-0.413633,-0.084722,-1.230338,-0.168269,-0.348313,-1.063884,-0.463015,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
4,-0.56251,0.888575,-1.032601,-0.376914,0.981703,-1.879656,-0.300712,-0.475366,-0.313969,-0.342438,-0.201979,-0.492193,-0.196709,-0.167248,-0.325052,-0.504854,-0.332913,-0.203711,-0.524995,-0.570991,-0.221201,-0.413633,-0.084722,-1.538796,-0.168269,2.870978,0.624478,-0.463015,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


(3087, 76)

In [80]:
# Inspect the training target: first five rows, then the (rows, columns) shape
# (the row count should match X_train_preprocessed)
y_train.head()
y_train.shape

Unnamed: 0,pe_outcome
2053,1
2255,0
2175,0
2738,0
2162,0


(3087, 1)

### Apply One-Hot Encoding and Standardization to Validation Set

In [81]:
# Apply the already-fitted preprocessor to the validation set (transform only, no refit)
X_val_preprocessed = preprocessor.transform(X_val)

# Convert back to a dataframe with column names
# (rows keep X_val's order, but the index is a fresh RangeIndex rather than X_val's index)
X_val_preprocessed = pd.DataFrame(X_val_preprocessed, columns = preprocessor.get_feature_names_out())

# Remove the `num__` / `cat__` transformer prefixes from column names.
# The pattern is anchored with ^ so only a leading prefix is stripped; an occurrence of
# "num__" or "cat__" elsewhere in a feature or category name is left untouched.
X_val_preprocessed.columns = X_val_preprocessed.columns.str.replace(r"^(?:num|cat)__", "", regex = True)

In [82]:
# Inspect the preprocessed validation features: first five rows, then the (rows, columns) shape
X_val_preprocessed.head()
X_val_preprocessed.shape

Unnamed: 0,had_dvt_as_pri_diagnosis,had_icu_stay,hx_ac,hx_pe,hx_vte,age,myocardial_infarct,congestive_heart_failure,peripheral_vascular_disease,cerebrovascular_disease,dementia,chronic_pulmonary_disease,rheumatic_disease,peptic_ulcer_disease,mild_liver_disease,diabetes_without_cc,diabetes_with_cc,paraplegia,renal_disease,malignant_cancer,severe_liver_disease,metastatic_solid_tumor,aids,charlson_comorbidity_index,had_ddimer,had_o2_sat,log_length_of_stay,log_num_dvt_admissions,insurance_Medicaid,insurance_Medicare,insurance_Other,insurance_Private,insurance_Unknown,marital_status_DIVORCED,marital_status_MARRIED,marital_status_SINGLE,marital_status_Unknown,marital_status_WIDOWED,gender_F,gender_M,dvt_chronicity_Acute,dvt_chronicity_Chronic,dvt_chronicity_Unspecified,dvt_location_Lower,dvt_location_Unspecified,dvt_location_Upper,cat_days_to_init_treatment_1-3 days,cat_days_to_init_treatment_4-7 days,cat_days_to_init_treatment_More than 7 days,cat_days_to_init_treatment_No Treatment,cat_days_to_init_treatment_Same day,cat_days_to_init_treatment_Unknown,treatment_grouped_AC Only,treatment_grouped_CDT,treatment_grouped_Lytics,treatment_grouped_MT,treatment_grouped_Multiple Interventions,treatment_grouped_No Treatment,race_grouped_American Indian/Alaska Native,race_grouped_Asian,race_grouped_Black,race_grouped_Hispanic/Latino,race_grouped_Multiracial,race_grouped_Native Hawaiian/Pacific Islander,race_grouped_Portuguese,race_grouped_Unknown,race_grouped_White,discharge_location_grouped_Against Medical Advice,discharge_location_grouped_Facility-Based Care,discharge_location_grouped_Home/Community-Based Care,discharge_location_grouped_Unknown,admission_location_grouped_Emergency/Urgent Care,admission_location_grouped_Referral-Based Admissions,admission_location_grouped_Scheduled/Procedure-Based Admissions,admission_location_grouped_Transfer from Another Facility,admission_location_grouped_Unknown
0,-0.56251,0.888575,0.968428,-0.376914,0.981703,-0.782384,-0.300712,-0.475366,-0.313969,-0.342438,-0.201979,-0.492193,5.083648,-0.167248,-0.325052,-0.504854,3.003785,-0.203711,1.90478,1.751341,-0.221201,-0.413633,-0.084722,0.928873,-0.168269,-0.348313,-0.336746,-0.463015,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
1,-0.56251,0.888575,0.968428,-0.376914,0.981703,-0.721424,-0.300712,-0.475366,3.185028,2.920236,-0.201979,-0.492193,-0.196709,-0.167248,-0.325052,-0.504854,-0.332913,-0.203711,-0.524995,-0.570991,-0.221201,-0.413633,-0.084722,-0.61342,-0.168269,-0.348313,1.026251,-0.463015,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,-0.56251,0.888575,0.968428,-0.376914,0.981703,0.985444,-0.300712,-0.475366,-0.313969,-0.342438,-0.201979,-0.492193,-0.196709,-0.167248,-0.325052,-0.504854,-0.332913,-0.203711,-0.524995,-0.570991,-0.221201,-0.413633,-0.084722,-0.61342,-0.168269,2.870978,0.97745,1.117888,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,-0.56251,0.888575,0.968428,-0.376914,0.981703,-0.721424,-0.300712,-0.475366,3.185028,2.920236,-0.201979,-0.492193,-0.196709,-0.167248,-0.325052,-0.504854,-0.332913,-0.203711,-0.524995,-0.570991,-0.221201,-0.413633,-0.084722,-0.61342,-0.168269,-0.348313,0.088602,1.117888,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
4,-0.56251,0.888575,0.968428,2.653123,0.981703,-0.965262,3.325446,-0.475366,-0.313969,-0.342438,-0.201979,-0.492193,-0.196709,-0.167248,3.076428,-0.504854,3.003785,-0.203711,-0.524995,-0.570991,-0.221201,-0.413633,-0.084722,-0.304961,-0.168269,2.870978,-0.336746,-0.463015,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0


(661, 76)

In [83]:
# Inspect the validation target: first five rows, then the (rows, columns) shape
# (the row count should match X_val_preprocessed)
y_val.head()
y_val.shape

Unnamed: 0,pe_outcome
3595,0
2899,0
3985,0
1869,0
1470,0


(661, 1)

### Apply One-Hot Encoding and Standardization to Test Set

In [84]:
# Apply the already-fitted preprocessor to the test set (transform only, no refit)
X_test_preprocessed = preprocessor.transform(X_test)

# Convert back to a dataframe with column names
# (rows keep X_test's order, but the index is a fresh RangeIndex rather than X_test's index)
X_test_preprocessed = pd.DataFrame(X_test_preprocessed, columns = preprocessor.get_feature_names_out())

# Remove the `num__` / `cat__` transformer prefixes from column names.
# The pattern is anchored with ^ so only a leading prefix is stripped; an occurrence of
# "num__" or "cat__" elsewhere in a feature or category name is left untouched.
X_test_preprocessed.columns = X_test_preprocessed.columns.str.replace(r"^(?:num|cat)__", "", regex = True)

In [85]:
# Inspect the preprocessed test features: first five rows, then the (rows, columns) shape
X_test_preprocessed.head()
X_test_preprocessed.shape

Unnamed: 0,had_dvt_as_pri_diagnosis,had_icu_stay,hx_ac,hx_pe,hx_vte,age,myocardial_infarct,congestive_heart_failure,peripheral_vascular_disease,cerebrovascular_disease,dementia,chronic_pulmonary_disease,rheumatic_disease,peptic_ulcer_disease,mild_liver_disease,diabetes_without_cc,diabetes_with_cc,paraplegia,renal_disease,malignant_cancer,severe_liver_disease,metastatic_solid_tumor,aids,charlson_comorbidity_index,had_ddimer,had_o2_sat,log_length_of_stay,log_num_dvt_admissions,insurance_Medicaid,insurance_Medicare,insurance_Other,insurance_Private,insurance_Unknown,marital_status_DIVORCED,marital_status_MARRIED,marital_status_SINGLE,marital_status_Unknown,marital_status_WIDOWED,gender_F,gender_M,dvt_chronicity_Acute,dvt_chronicity_Chronic,dvt_chronicity_Unspecified,dvt_location_Lower,dvt_location_Unspecified,dvt_location_Upper,cat_days_to_init_treatment_1-3 days,cat_days_to_init_treatment_4-7 days,cat_days_to_init_treatment_More than 7 days,cat_days_to_init_treatment_No Treatment,cat_days_to_init_treatment_Same day,cat_days_to_init_treatment_Unknown,treatment_grouped_AC Only,treatment_grouped_CDT,treatment_grouped_Lytics,treatment_grouped_MT,treatment_grouped_Multiple Interventions,treatment_grouped_No Treatment,race_grouped_American Indian/Alaska Native,race_grouped_Asian,race_grouped_Black,race_grouped_Hispanic/Latino,race_grouped_Multiracial,race_grouped_Native Hawaiian/Pacific Islander,race_grouped_Portuguese,race_grouped_Unknown,race_grouped_White,discharge_location_grouped_Against Medical Advice,discharge_location_grouped_Facility-Based Care,discharge_location_grouped_Home/Community-Based Care,discharge_location_grouped_Unknown,admission_location_grouped_Emergency/Urgent Care,admission_location_grouped_Referral-Based Admissions,admission_location_grouped_Scheduled/Procedure-Based Admissions,admission_location_grouped_Transfer from Another Facility,admission_location_grouped_Unknown
0,-0.56251,0.888575,0.968428,-0.376914,-1.018638,0.13201,-0.300712,-0.475366,-0.313969,2.920236,-0.201979,-0.492193,-0.196709,-0.167248,-0.325052,-0.504854,-0.332913,4.908927,-0.524995,-0.570991,-0.221201,-0.413633,-0.084722,0.003497,-0.168269,-0.348313,1.279239,-0.463015,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,-0.56251,0.888575,0.968428,2.653123,0.981703,-0.843343,-0.300712,-0.475366,-0.313969,-0.342438,-0.201979,-0.492193,-0.196709,-0.167248,-0.325052,1.980769,-0.332913,-0.203711,1.90478,-0.570991,-0.221201,-0.413633,-0.084722,-0.61342,-0.168269,-0.348313,-1.489232,-0.463015,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
2,-0.56251,-1.125398,0.968428,-0.376914,-1.018638,-1.696777,-0.300712,-0.475366,-0.313969,-0.342438,-0.201979,-0.492193,-0.196709,-0.167248,-0.325052,-0.504854,-0.332913,-0.203711,-0.524995,-0.570991,-0.221201,-0.413633,-0.084722,-1.538796,-0.168269,-0.348313,-0.336746,-0.463015,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
3,1.777745,-1.125398,-1.032601,-0.376914,0.981703,1.655999,-0.300712,-0.475366,-0.313969,-0.342438,4.951,-0.492193,-0.196709,-0.167248,-0.325052,-0.504854,-0.332913,-0.203711,-0.524995,-0.570991,-0.221201,-0.413633,-0.084722,0.003497,-0.168269,-0.348313,-1.489232,-0.463015,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
4,-0.56251,0.888575,0.968428,-0.376914,0.981703,0.375848,3.325446,2.103641,-0.313969,2.920236,-0.201979,2.031725,-0.196709,-0.167248,-0.325052,-0.504854,-0.332913,-0.203711,-0.524995,-0.570991,-0.221201,-0.413633,-0.084722,0.311956,-0.168269,-0.348313,1.026251,-0.463015,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


(662, 76)

In [86]:
# NOTE(review): this repeats the X_test_preprocessed check from the previous cell and
# produces the same output — likely a leftover duplicate that can be removed.
X_test_preprocessed.head()
X_test_preprocessed.shape

Unnamed: 0,had_dvt_as_pri_diagnosis,had_icu_stay,hx_ac,hx_pe,hx_vte,age,myocardial_infarct,congestive_heart_failure,peripheral_vascular_disease,cerebrovascular_disease,dementia,chronic_pulmonary_disease,rheumatic_disease,peptic_ulcer_disease,mild_liver_disease,diabetes_without_cc,diabetes_with_cc,paraplegia,renal_disease,malignant_cancer,severe_liver_disease,metastatic_solid_tumor,aids,charlson_comorbidity_index,had_ddimer,had_o2_sat,log_length_of_stay,log_num_dvt_admissions,insurance_Medicaid,insurance_Medicare,insurance_Other,insurance_Private,insurance_Unknown,marital_status_DIVORCED,marital_status_MARRIED,marital_status_SINGLE,marital_status_Unknown,marital_status_WIDOWED,gender_F,gender_M,dvt_chronicity_Acute,dvt_chronicity_Chronic,dvt_chronicity_Unspecified,dvt_location_Lower,dvt_location_Unspecified,dvt_location_Upper,cat_days_to_init_treatment_1-3 days,cat_days_to_init_treatment_4-7 days,cat_days_to_init_treatment_More than 7 days,cat_days_to_init_treatment_No Treatment,cat_days_to_init_treatment_Same day,cat_days_to_init_treatment_Unknown,treatment_grouped_AC Only,treatment_grouped_CDT,treatment_grouped_Lytics,treatment_grouped_MT,treatment_grouped_Multiple Interventions,treatment_grouped_No Treatment,race_grouped_American Indian/Alaska Native,race_grouped_Asian,race_grouped_Black,race_grouped_Hispanic/Latino,race_grouped_Multiracial,race_grouped_Native Hawaiian/Pacific Islander,race_grouped_Portuguese,race_grouped_Unknown,race_grouped_White,discharge_location_grouped_Against Medical Advice,discharge_location_grouped_Facility-Based Care,discharge_location_grouped_Home/Community-Based Care,discharge_location_grouped_Unknown,admission_location_grouped_Emergency/Urgent Care,admission_location_grouped_Referral-Based Admissions,admission_location_grouped_Scheduled/Procedure-Based Admissions,admission_location_grouped_Transfer from Another Facility,admission_location_grouped_Unknown
0,-0.56251,0.888575,0.968428,-0.376914,-1.018638,0.13201,-0.300712,-0.475366,-0.313969,2.920236,-0.201979,-0.492193,-0.196709,-0.167248,-0.325052,-0.504854,-0.332913,4.908927,-0.524995,-0.570991,-0.221201,-0.413633,-0.084722,0.003497,-0.168269,-0.348313,1.279239,-0.463015,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,-0.56251,0.888575,0.968428,2.653123,0.981703,-0.843343,-0.300712,-0.475366,-0.313969,-0.342438,-0.201979,-0.492193,-0.196709,-0.167248,-0.325052,1.980769,-0.332913,-0.203711,1.90478,-0.570991,-0.221201,-0.413633,-0.084722,-0.61342,-0.168269,-0.348313,-1.489232,-0.463015,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
2,-0.56251,-1.125398,0.968428,-0.376914,-1.018638,-1.696777,-0.300712,-0.475366,-0.313969,-0.342438,-0.201979,-0.492193,-0.196709,-0.167248,-0.325052,-0.504854,-0.332913,-0.203711,-0.524995,-0.570991,-0.221201,-0.413633,-0.084722,-1.538796,-0.168269,-0.348313,-0.336746,-0.463015,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
3,1.777745,-1.125398,-1.032601,-0.376914,0.981703,1.655999,-0.300712,-0.475366,-0.313969,-0.342438,4.951,-0.492193,-0.196709,-0.167248,-0.325052,-0.504854,-0.332913,-0.203711,-0.524995,-0.570991,-0.221201,-0.413633,-0.084722,0.003497,-0.168269,-0.348313,-1.489232,-0.463015,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
4,-0.56251,0.888575,0.968428,-0.376914,0.981703,0.375848,3.325446,2.103641,-0.313969,2.920236,-0.201979,2.031725,-0.196709,-0.167248,-0.325052,-0.504854,-0.332913,-0.203711,-0.524995,-0.570991,-0.221201,-0.413633,-0.084722,0.311956,-0.168269,-0.348313,1.026251,-0.463015,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


(662, 76)

In [87]:
# Inspect the test target: first five rows, then the (rows, columns) shape
# (the row count should match X_test_preprocessed)
y_test.head()
y_test.shape

Unnamed: 0,pe_outcome
3249,0
3697,1
3209,0
3880,0
2114,0


(662, 1)

### Save train, validation, and test sets to CSV 

In [88]:
# Output file names, one features/target pair per split
# (relative names, so the files are written to the current working directory)
X_train_file, y_train_file = "X_train.csv", "y_train.csv"
X_val_file, y_val_file = "X_val.csv", "y_val.csv"
X_test_file, y_test_file = "X_test.csv", "y_test.csv"

# Write X (features) and y (target) separately for each split, without the DataFrame index
split_outputs = [
    (X_train_preprocessed, X_train_file),
    (y_train, y_train_file),
    (X_val_preprocessed, X_val_file),
    (y_val, y_val_file),
    (X_test_preprocessed, X_test_file),
    (y_test, y_test_file),
]
for frame, destination in split_outputs:
    frame.to_csv(destination, index = False)