In [None]:
# Import the necessary libraries
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import seaborn.objects as so
from sklearn import decomposition, tree
from sklearn.metrics import (
    ConfusionMatrixDisplay,
    accuracy_score,
    classification_report,
    confusion_matrix,
)
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier


In [None]:
# Read the project CSV into the working DataFrame `df`.
DATA_PATH = "C:/Users/anthony.greco/OneDrive - sv-sb.org/Data Analysis Training/Covid_Project.csv"
df = pd.read_csv(DATA_PATH)

# Lift the row limit so a displayed frame can be scrolled in full.
pd.set_option('display.max_rows', None)


In [None]:
# Replace the two ordered string columns with their position in an explicit
# category list (first entry -> 0.0, second -> 1.0, ...).
window_categories = ['0-2', '2-4', '4-6', '6-12', 'ABOVE_12']
age_categories = ["10th", "20th", "30th", "40th", "50th", "60th", "70th", "80th", "90th", "Above 90th"]

# One encoder per column; both stay available as globals after fitting.
oencW = OrdinalEncoder(categories=[window_categories])
oencA = OrdinalEncoder(categories=[age_categories])

for column, encoder in (('WINDOW', oencW), ('AGE_PERCENTIL', oencA)):
    # Double brackets: the encoder expects a 2-D, single-column input.
    df[column] = encoder.fit_transform(df[[column]])


In [None]:
# Explore the dataset: Null values seem to decrease as the patient's hospital stay lengthens.
#
# Rank every column by its correlation with the ICU column.
#   corr_matrix['ICU'] - signed correlation with ICU
#   corr_matrix['y']   - absolute value of that correlation
corr_matrix = df.corr()['ICU'].to_frame()
corr_matrix['y'] = corr_matrix['ICU'].abs()
corr_matrix = corr_matrix.sort_values(by=['ICU'])

# Features considered too weakly related to ICU to keep. The threshold is
# applied to the absolute correlation: testing the signed value would also
# discard every negatively correlated feature, however strong.
not_selected_features = corr_matrix[corr_matrix['y'] < 0.004].drop(['ICU'], axis=1)

print(corr_matrix)
print(not_selected_features)


In [None]:
# Most influential features
# RESPIRATORY_RATE_MAX, BLOODPRESSURE_SISTOLIC_DIFF, RESPIRATORY_RATE_DIFF, BLOODPRESSURE_SISTOLIC_DIFF_REL, RESPIRATORY_RATE_DIFF_REL

# Stacked bar chart: number of rows per WINDOW value, coloured by ICU.
# WINDOW holds the ordinal codes assigned by the encoder in the earlier cell.
# (This cell previously appeared twice, verbatim; one copy is kept.)
(
    so.Plot(df, x="WINDOW", color="ICU")
    .add(so.Bar(), so.Count(), so.Stack())
    .label(
        x="WINDOW (ordinal code)",
        y="Number of rows",
        title="Row count per WINDOW, stacked by ICU",
    )
)


In [None]:
# Split the rows by observation window and inspect their missing values.
#
# WINDOW was ordinal-encoded in an earlier cell, so it holds zero-based codes
# ('0-2' -> 0, '2-4' -> 1, '4-6' -> 2, '6-12' -> 3, 'ABOVE_12' -> 4).
# dfw_N is the N-th window, i.e. code N - 1. Comparing against 1..5 would skip
# the first window entirely and leave dfw_5 empty.
dfw_1 = df[df['WINDOW'] == 0]
dfw_2 = df[df['WINDOW'] == 1]
dfw_3 = df[df['WINDOW'] == 2]
dfw_4 = df[df['WINDOW'] == 3]
dfw_5 = df[df['WINDOW'] == 4]

# Shape and per-column number of null values for each window
for window_frame in (dfw_1, dfw_2, dfw_3, dfw_4, dfw_5):
    print(window_frame.shape, window_frame.isna().sum())

# Use null values to predict ICU admission in window 1. Can we predict who will go home based on this data?
# Keep only the window-1 rows that have no missing value in any column.
dfw_1 = dfw_1.dropna()
print(dfw_1.shape)


In [None]:
# Handle missing values by backfilling and forward-filling

# Index the frame by patient identifier. The guard keeps the cell re-runnable:
# once the identifier is the index it is no longer a column, and calling
# set_index on it a second time would raise KeyError.
if df.index.name != 'PATIENT_VISIT_IDENTIFIER':
    df = df.set_index('PATIENT_VISIT_IDENTIFIER')

# Backfill first, then forward-fill whatever is still missing (trailing gaps).
# bfill()/ffill() replace the deprecated fillna(method=...) spelling and give
# the same result.
# NOTE(review): both fills run down the whole frame, so a gap can be filled
# from an adjacent row that carries a different PATIENT_VISIT_IDENTIFIER.
# If rows of one patient are meant to be filled only from that patient,
# df.groupby(level=0) should be used here - confirm the intended behaviour.
df = df.bfill().ffill()

# Check for null values and data types
print(df.isna().sum())
print(df.dtypes)


In [None]:
# Train and test a decision tree
# (This cell previously appeared twice, verbatim; one copy is kept.)

# Target is the ICU column; every other column is used as a feature.
y = df["ICU"]
X = df.drop(columns=["ICU"])

# 80/20 split, seeded so the same rows land in each part on every run.
# NOTE(review): the split is row-wise; if several rows share one
# PATIENT_VISIT_IDENTIFIER, the same patient can appear in both parts - confirm
# whether a grouped split is wanted.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Seed the tree as well: scikit-learn permutes the candidate features at each
# split, so without random_state equally good splits can be resolved
# differently from one run to the next.
model = DecisionTreeClassifier(max_depth=2, random_state=42)
model.fit(X_train, y_train)

# Predict on the test set and evaluate
y_pred = model.predict(X_test)

print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


In [None]:
# Confusion matrix
# Compare the test-set labels with the model's predictions, using the
# classifier's own class order for the tick labels.
cm = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=model.classes_)
disp.plot()

# Title the figure so it can be read on its own when the notebook is skimmed.
disp.ax_.set_title("Decision tree: confusion matrix on the test set")
plt.show()


In [None]:
# Hyperparameter tuning with GridSearchCV
# Pipeline under search: standardise -> PCA -> decision tree.
std_slc = StandardScaler()
pca = decomposition.PCA()
# Seeded so that repeated runs of the search fit identical trees.
dec_tree = tree.DecisionTreeClassifier(random_state=42)

pipe = Pipeline(steps=[('std_slc', std_slc), ('pca', pca), ('dec_tree', dec_tree)])

# Search space: every PCA size from 1 up to the number of feature columns,
# both split criteria, and a range of tree depths.
n_components = list(range(1, X.shape[1] + 1))
criterion = ['gini', 'entropy']
max_depth = [2, 4, 6, 8, 10, 12]
parameters = dict(pca__n_components=n_components, dec_tree__criterion=criterion, dec_tree__max_depth=max_depth)

# NOTE(review): the search is fitted on the full X / y, which includes the rows
# held out as X_test in the earlier cell - confirm this is intended.
clf_GS = GridSearchCV(pipe, parameters)
clf_GS.fit(X, y)

# Best parameters
# best_params_ holds the winning value for each key of `parameters`.
best_params = clf_GS.best_params_
print('Best Criterion:', best_params['dec_tree__criterion'])
print('Best max_depth:', best_params['dec_tree__max_depth'])
print('Best Number Of Components:', best_params['pca__n_components'])
print()
print(clf_GS.best_estimator_.named_steps['dec_tree'])
