# Capstone: Covid-19

In [6]:
import pandas as pd

## Data Preparation

### Import Data

In [None]:
from sqlalchemy import create_engine

# Define the connection parameters
server = 'B-SURFACE\\SQLEXPRESS'  # Note the double backslashes for escaping
database = 'Covid19Capstone'    # Replace with your actual database name
driver = 'SQL Server'

# Create the connection string using Windows Authentication
connection_string = f"mssql+pyodbc://@{server}/{database}?driver={driver}&trusted_connection=yes"

# Create the engine
engine = create_engine(connection_string)

# Query Data

query = "SELECT * FROM Kaggle_Sirio_Libanes_ICU_Prediction"
data = pd.read_sql(query, engine)




#### Convert Categorical Data to Numeric

In [None]:
# Convert Age_Percentile, Window  to Float
from sklearn.preprocessing import LabelEncoder

#Display first few rows of original data
print("Original Data:")
print(data.head())

# Identify Categorical Columns
categorical_columns = ['AGE_PERCENTIL', 'WINDOW']

# Create a label encoder object
le = LabelEncoder()

# Apply label encoding to 'AGE_PERCENTIL' column
data['Age_Encoded'] = le.fit_transform(data['AGE_PERCENTIL'])
print("\nDataFrame after Label Encoding 'AGE_PERCENTIL':")
print(data.head())

# Apply label encoding to 'WINDOW' column
data['Window_Encoded'] = le.fit_transform(data['WINDOW'])
print("\nDataFrame after Label Encoding 'WINDOW':")
print(data['WINDOW'])
print(data['Window_Encoded'])

#Drop Categorical Columns
data.drop(columns=['WINDOW', 'AGE_PERCENTIL'], inplace=True)

#### Handle Null Values

In [None]:
# Data Imputation
from sklearn.impute import SimpleImputer, KNNImputer  # This means 'k' number of nearest neighbors

# Impute with median
imputer_median = SimpleImputer(strategy='median')
data_median_imputed = pd.DataFrame(imputer_median.fit_transform(data), columns=data.columns)

# Initialize the KNNImputer
imputer_knn = KNNImputer(n_neighbors=2)

# Perform the imputation and Convert the resulting numPy array back to a dataframe.
data_knn_imputed = pd.DataFrame(imputer_knn.fit_transform(data_knn_imputed), columns=data_knn_imputed.columns)

# Display the first few rows of the imputed data
print("Imputed data (KNN):")
print(data_knn_imputed.head())

# Check for any remaining missing values
print("Missing values after imputation (KNN):")
print(data_knn_imputed.isnull().sum())

#### Remove Data Where Target Variable ICU = 1
Per Kaggle: "Beware NOT to use data when the target variable is present, as it is unknown the order of the event (maybe the target event happened before the results were obtained). They were kept there so we can grow this dataset in other outcomes [later] on."

In [34]:
print(data_knn_imputed.columns)

Index(['PATIENT_VISIT_IDENTIFIER', 'AGE_ABOVE65', 'GENDER',
       'DISEASE_GROUPING_1', 'DISEASE_GROUPING_2', 'DISEASE_GROUPING_3',
       'DISEASE_GROUPING_4', 'DISEASE_GROUPING_5', 'DISEASE_GROUPING_6', 'HTN',
       ...
       'OXYGEN_SATURATION_DIFF', 'BLOODPRESSURE_DIASTOLIC_DIFF_REL',
       'BLOODPRESSURE_SISTOLIC_DIFF_REL', 'HEART_RATE_DIFF_REL',
       'RESPIRATORY_RATE_DIFF_REL', 'TEMPERATURE_DIFF_REL',
       'OXYGEN_SATURATION_DIFF_REL', 'ICU', 'Age_Encoded', 'Window_Encoded'],
      dtype='object', length=231)


In [35]:
# Sort the dataframe by PATIENT_VISIT_IDENTIFIER and WINDOW
data_knn_imputed.sort_values(by=['PATIENT_VISIT_IDENTIFIER', 'Window_Encoded'] , inplace=True)

# Shift the ICU column by 1 within each PATIENT_VISIT_IDENTIFIER group
data_knn_imputed['ICU_PREV'] = data_knn_imputed.groupby('PATIENT_VISIT_IDENTIFIER')['ICU'].shift(-1)

# Drop rows where ICU was 1 in the previous window
df_cleaned = data_knn_imputed[data_knn_imputed['ICU_PREV'] != 1]

# Drop the ICU_PREV column as it's no longer needed
df_cleaned.drop(columns=['ICU_PREV'], inplace=True)



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned.drop(columns=['ICU_PREV'], inplace=True)


In [69]:
from pandas import option_context
with option_context('display.max_rows', 10, 'display.max_columns', 231):
    pass

print(df_cleaned.shape)
print(df_cleaned.columns)
# print(df_cleaned.head)

df_cleaned.to_csv('Capstone_Cleaned_Data.csv', index=False)

(1442, 231)
Index(['PATIENT_VISIT_IDENTIFIER', 'AGE_ABOVE65', 'GENDER',
       'DISEASE_GROUPING_1', 'DISEASE_GROUPING_2', 'DISEASE_GROUPING_3',
       'DISEASE_GROUPING_4', 'DISEASE_GROUPING_5', 'DISEASE_GROUPING_6', 'HTN',
       ...
       'OXYGEN_SATURATION_DIFF', 'BLOODPRESSURE_DIASTOLIC_DIFF_REL',
       'BLOODPRESSURE_SISTOLIC_DIFF_REL', 'HEART_RATE_DIFF_REL',
       'RESPIRATORY_RATE_DIFF_REL', 'TEMPERATURE_DIFF_REL',
       'OXYGEN_SATURATION_DIFF_REL', 'ICU', 'Age_Encoded', 'Window_Encoded'],
      dtype='object', length=231)


## ML Model Development

### 1. Logistic Regression

In [46]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Split the data
X = df_cleaned.drop(columns=['ICU']) #input
y = df_cleaned['ICU'] #output

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Logistic Regression
log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X_train, y_train)
y_pred_log = log_reg.predict(X_test)

# Evaluate
print('Logistic Regression Accuracy:', accuracy_score(y_test, y_pred_log))
print('Logistic Regression Report:\n', classification_report(y_test, y_pred_log))


Logistic Regression Accuracy: 0.972318339100346
Logistic Regression Report:
               precision    recall  f1-score   support

         0.0       0.98      0.99      0.98       251
         1.0       0.94      0.84      0.89        38

    accuracy                           0.97       289
   macro avg       0.96      0.92      0.94       289
weighted avg       0.97      0.97      0.97       289



### 2. Decision Trees

In [47]:
from sklearn.tree import DecisionTreeClassifier

# Decision Tree
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train, y_train)
y_pred_dec = dt.predict(X_test)

# Evaluate
print('Decision Tree Accuracy:', accuracy_score(y_test, y_pred_dec))
print('Decision Tree Report:\n', classification_report(y_test, y_pred_dec))


Decision Tree Accuracy: 0.9411764705882353
Decision Tree Report:
               precision    recall  f1-score   support

         0.0       0.97      0.96      0.97       251
         1.0       0.76      0.82      0.78        38

    accuracy                           0.94       289
   macro avg       0.86      0.89      0.88       289
weighted avg       0.94      0.94      0.94       289



### 3. Random Forest

In [48]:
from sklearn.ensemble import RandomForestClassifier

# Random Forest
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

# Evaluate
print('Random Forest Accuracy:', accuracy_score(y_test, y_pred_rf))
print('Random Forest Report:\n', classification_report(y_test, y_pred_rf))


Random Forest Accuracy: 0.9653979238754326
Random Forest Report:
               precision    recall  f1-score   support

         0.0       0.98      0.98      0.98       251
         1.0       0.89      0.84      0.86        38

    accuracy                           0.97       289
   macro avg       0.93      0.91      0.92       289
weighted avg       0.96      0.97      0.96       289



### 4. Support Vector Machines

In [53]:
from sklearn.svm import SVC

# Support Vector Machine
svc = SVC(kernel='rbf', probability=True)
svc.fit(X_train, y_train)
y_pred_svm = svc.predict(X_test)

# Evaluate
print('SVM Accuracy:', accuracy_score(y_test, y_pred_svm))
print('SVM Report:\n', classification_report(y_test, y_pred_svm))


SVM Accuracy: 0.8685121107266436
SVM Report:
               precision    recall  f1-score   support

         0.0       0.87      1.00      0.93       251
         1.0       0.00      0.00      0.00        38

    accuracy                           0.87       289
   macro avg       0.43      0.50      0.46       289
weighted avg       0.75      0.87      0.81       289



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### 5. Naive Bayes
Bayes theorem states the probability of some event B occurring provided the prior knowledge of another event(s) A, given that B is dependent on event A (even partially). Naive Bayes' assumes feature independence.

In [56]:
from sklearn.naive_bayes import GaussianNB

# Naive Bayes
nb = GaussianNB()
nb.fit(X_train, y_train)
y_pred_nb = nb.predict(X_test)

# Evaluate
print('Naive Bayes Accuracy:', accuracy_score(y_test, y_pred_nb))
print('Naive Bayes Report:\n', classification_report(y_test, y_pred_nb))


Naive Bayes Accuracy: 0.9411764705882353
Naive Bayes Report:
               precision    recall  f1-score   support

         0.0       0.95      0.98      0.97       251
         1.0       0.86      0.66      0.75        38

    accuracy                           0.94       289
   macro avg       0.91      0.82      0.86       289
weighted avg       0.94      0.94      0.94       289



### Model Retraining

In [58]:
# Using Smote to Oversample the minor class (yes) - (Synthetic Minority Over-sampling Technique) 
from imblearn.over_sampling import SMOTE

# Initialize SMOTE
smote = SMOTE(random_state=42)

# Apply SMOTE to your training data
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# Re-initialize the models (if needed)
logreg_model_smote = LogisticRegression(max_iter=2000)
dt_model_smote = DecisionTreeClassifier()

# Re-train Logistic Regression with SMOTE data
logreg_model_smote.fit(X_train_smote, y_train_smote)

# Re-train Decision Tree Classifier with SMOTE data
dt_model_smote.fit(X_train_smote, y_train_smote)


#### Retrained Results

In [59]:
dt_predictions_smote = dt_model_smote.predict(X_test)
logreg_predictions_smote = logreg_model_smote.predict(X_test)

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Calculate accuracy
dt_accuracy_smote = accuracy_score(y_test, dt_predictions_smote)
logreg_accuracy_smote = accuracy_score(y_test, logreg_predictions_smote)

# Generate classification reports
dt_report_smote = classification_report(y_test, dt_predictions_smote)
logreg_report_smote = classification_report(y_test, logreg_predictions_smote)

print("Decision Tree Classifier (SMOTE) Accuracy:", dt_accuracy_smote)
print("Decision Tree Classifier (SMOTE) Classification Report:\n", dt_report_smote)

print("Logistic Regression (SMOTE) Accuracy:", logreg_accuracy_smote)
print("Logistic Regression (SMOTE) Classification Report:\n", logreg_report_smote)

Decision Tree Classifier (SMOTE) Accuracy: 0.9446366782006921
Decision Tree Classifier (SMOTE) Classification Report:
               precision    recall  f1-score   support

         0.0       0.97      0.97      0.97       251
         1.0       0.79      0.79      0.79        38

    accuracy                           0.94       289
   macro avg       0.88      0.88      0.88       289
weighted avg       0.94      0.94      0.94       289

Logistic Regression (SMOTE) Accuracy: 0.9584775086505191
Logistic Regression (SMOTE) Classification Report:
               precision    recall  f1-score   support

         0.0       0.99      0.96      0.98       251
         1.0       0.80      0.92      0.85        38

    accuracy                           0.96       289
   macro avg       0.89      0.94      0.91       289
weighted avg       0.96      0.96      0.96       289



In [71]:

engine.dispose()