In [None]:
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from xgboost import XGBClassifier

# Ten-row synthetic dataset: six numeric count columns, six categorical
# columns (three of them integer ID codes) and the raw 'readmitted' label.
# Columns that follow a strictly repeating pattern are written as such.
synthetic_columns = {
    'time_in_hospital': list(range(1, 11)),
    'num_lab_procedures': [10, 15, 20, 25, 30, 5, 10, 15, 20, 25],
    'num_medications': [5, 7, 10, 8, 6, 5, 7, 8, 6, 5],
    'number_outpatient': [1, 2, 1, 3, 2, 1, 2, 3, 2, 1],
    'number_emergency': [0, 1, 0, 1, 1, 0, 1, 0, 1, 0],
    'number_inpatient': [0, 1, 1, 1, 0, 0, 1, 1, 1, 0],
    'race': [
        'Caucasian', 'Hispanic', 'Caucasian', 'AfricanAmerican', 'Hispanic',
        'Caucasian', 'AfricanAmerican', 'Hispanic', 'Caucasian', 'AfricanAmerican',
    ],
    'gender': [
        'Male', 'Female', 'Male', 'Female', 'Female',
        'Male', 'Female', 'Male', 'Female', 'Male',
    ],
    'age': [
        '60-70', '50-60', '60-70', '70-80', '50-60',
        '60-70', '70-80', '50-60', '60-70', '70-80',
    ],
    'admission_type_id': [1, 2] * 5,
    'discharge_disposition_id': [1, 2, 3, 2, 1, 3, 2, 1, 2, 3],
    'admission_source_id': [1, 2, 1, 3, 2, 1, 3, 2, 1, 2],
    'readmitted': ['<30', 'NO'] * 5,
}
data = pd.DataFrame(synthetic_columns)

# Preprocessing
# A literal '?' is treated as a missing-value placeholder: convert it to NaN,
# then discard every row that has any missing entry.
data.replace(to_replace='?', value=np.nan, inplace=True)
data.dropna(inplace=True)

# Define features and target
# The target is 1 where the label is exactly '<30' and 0 for any other value;
# every remaining column is used as a feature.
target = data['readmitted'].eq('<30').astype('int64')
features = data.drop(columns='readmitted')

# Split data with stratification
# Hold out 30% of the rows for testing, keep the class ratio of the target in
# both partitions, and fix the seed so the split is the same on every run.
split_options = {'test_size': 0.3, 'random_state': 42, 'stratify': target}
X_train, X_test, y_train, y_test = train_test_split(features, target, **split_options)

# Define preprocessing pipeline
# Numeric count columns are standardized; the remaining columns (including
# the integer *_id codes) are treated as categories and one-hot encoded.
numeric_features = ['time_in_hospital', 'num_lab_procedures', 'num_medications', 'number_outpatient', 'number_emergency', 'number_inpatient']
categorical_features = ['race', 'gender', 'age', 'admission_type_id', 'discharge_disposition_id', 'admission_source_id']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        # handle_unknown='ignore': a category that appears only in the test
        # split (likely with so few training rows) is encoded as all zeros
        # instead of raising at transform time.
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ])

# Define and train models
# Each pipeline receives its own unfitted copy of the preprocessor. Pipeline
# fits its steps in place, so sharing a single instance would let the second
# fit() overwrite the fitted state the first pipeline depends on.
# random_state is pinned on both classifiers so repeated runs give the same
# models.
gbm_model = Pipeline(steps=[
    ('preprocessor', clone(preprocessor)),
    ('classifier', GradientBoostingClassifier(
        n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)),
])

# NOTE(review): use_label_encoder is kept for compatibility with the original
# call; it appears to be deprecated in newer XGBoost releases - confirm
# against the installed version before removing it.
xgb_model = Pipeline(steps=[
    ('preprocessor', clone(preprocessor)),
    ('classifier', XGBClassifier(
        n_estimators=100, learning_rate=0.1, max_depth=3,
        use_label_encoder=False, eval_metric='logloss', random_state=42)),
])

# Train models
gbm_model.fit(X_train, y_train)
xgb_model.fit(X_train, y_train)

# Predictions
# Hard class labels feed accuracy/precision/recall. ROC AUC is a ranking
# metric, so it is given the predicted probability of the positive class
# (column 1 of predict_proba) rather than the thresholded labels, which would
# reduce the ROC curve to a single operating point.
gbm_pred = gbm_model.predict(X_test)
xgb_pred = xgb_model.predict(X_test)
gbm_proba = gbm_model.predict_proba(X_test)[:, 1]
xgb_proba = xgb_model.predict_proba(X_test)[:, 1]


def report_metrics(model_name, y_true, y_pred, y_score):
    """Print accuracy, precision, recall and ROC AUC for one model.

    Args:
        model_name: Heading printed above the metrics.
        y_true: Ground-truth binary labels.
        y_pred: Predicted class labels, used for accuracy/precision/recall.
        y_score: Positive-class scores (e.g. probabilities), used for ROC AUC.
    """
    print(f"\n{model_name}")
    print(f"Accuracy: {accuracy_score(y_true, y_pred)}")
    # zero_division=0 reports 0 instead of warning when a denominator is empty.
    print(f"Precision: {precision_score(y_true, y_pred, zero_division=0)}")
    print(f"Recall: {recall_score(y_true, y_pred, zero_division=0)}")
    # ROC AUC is only computed when both classes are present in y_true.
    if len(np.unique(y_true)) > 1:
        print(f"AUC-ROC: {roc_auc_score(y_true, y_score)}")
    else:
        print("ROC AUC score cannot be computed. Only one class present in y_test.")


# Check class distribution in y_test
print("Distribution in y_test:")
print(y_test.value_counts())

# Evaluate GBM Model
report_metrics("GBM Model", y_test, gbm_pred, gbm_proba)

# Evaluate XGBoost Model
report_metrics("XGBoost Model", y_test, xgb_pred, xgb_proba)


Distribution in y_test:
readmitted
0    2
1    1
Name: count, dtype: int64

GBM Model
Accuracy: 1.0
Precision: 1.0
Recall: 1.0
AUC-ROC: 1.0

XGBoost Model
Accuracy: 0.3333333333333333
Precision: 0.3333333333333333
Recall: 1.0
AUC-ROC: 0.5
