In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import precision_score, recall_score, roc_auc_score, classification_report

# Load and preprocess data
covid = pd.read_csv('Covid Data.csv')

# Clean data
cols = ['PNEUMONIA', 'DIABETES', 'COPD', 'ASTHMA', 'INMSUPR', 'HIPERTENSION',
        'OTHER_DISEASE', 'CARDIOVASCULAR', 'OBESITY', 'RENAL_CHRONIC', 'TOBACCO']
for col in cols:
    covid = covid[(covid[col] == 1)|(covid[col] == 2)]

# Create death column
covid['DEATH'] = [2 if row=='9999-99-99' else 1 for row in covid['DATE_DIED']]

# Drop unnecessary columns
covid.drop(columns=['INTUBED', 'ICU', 'DATE_DIED'], inplace=True)

# Handle pregnancy column
covid['PREGNANT'] = covid['PREGNANT'].replace({97:2, 98:2})

# Scale age
standard_scaler = StandardScaler()
covid['AGE'] = standard_scaler.fit_transform(covid.loc[:,['AGE']])

# Prepare features and target
y = covid['DEATH']
X = covid.drop('DEATH', axis=1)

# Apply undersampling
undersampler = RandomUnderSampler(random_state=42)
X_resampled, y_resampled = undersampler.fit_resample(X, y)

# Split data
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Train model
model = LogisticRegression(random_state=42)
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Create a DataFrame with all metrics
metrics = {
  'Metric': ['Accuracy', 'F1 Score', 'Precision', 'Recall', 'ROC AUC'],
  'Value': [
    accuracy_score(y_test, y_pred),
    f1_score(y_test, y_pred, average="weighted"),
    precision_score(y_test, y_pred, average="weighted"),
    recall_score(y_test, y_pred, average="weighted"),
    roc_auc_score(y_test, y_pred)
  ]
}
metrics_df = pd.DataFrame(metrics)
metrics_df.set_index('Metric', inplace=True)

# Create confusion matrix DataFrame
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(cm, 
          columns=['Predicted Negative', 'Predicted Positive'],
          index=['Actual Negative', 'Actual Positive'])

# Display results
print('Model Performance Metrics:')
print('-----------------------')
print(metrics_df)
print('\nConfusion Matrix:')
print(cm_df)
print('\nDetailed Classification Report:')
print(classification_report(y_test, y_pred))

Model Performance Metrics:
-----------------------
              Value
Metric             
Accuracy   0.908486
F1 Score   0.908457
Precision  0.909046
Recall     0.908486
ROC AUC    0.908509

Confusion Matrix:
                 Predicted Negative  Predicted Positive
Actual Negative               13832                1092
Actual Positive                1643               13319

Detailed Classification Report:
              precision    recall  f1-score   support

           1       0.89      0.93      0.91     14924
           2       0.92      0.89      0.91     14962

    accuracy                           0.91     29886
   macro avg       0.91      0.91      0.91     29886
weighted avg       0.91      0.91      0.91     29886

