In [2]:
# imports
import pandas as pd
import numpy as np
import joblib  # For saving the model and preprocessor

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix

# Import SMOTE
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline # Use this pipeline to chain preprocessor and model

# Reaching this line means every import above resolved.
print("Libraries imported successfully.")

# Path of the cleaned dataset read by this cell.
CLEANED_DATA_PATH = 'cleaned_diabetic_data.csv'

try:
    df = pd.read_csv(CLEANED_DATA_PATH)
except FileNotFoundError:
    # Explain the failure, then re-raise so nothing below runs without data.
    print(f"Error: '{CLEANED_DATA_PATH}' not found. Please check the file path.")
    raise
else:
    print(f"Loaded cleaned data. Shape: {df.shape}")

# Label column; every column that survives the drop below becomes a feature.
TARGET_COLUMN = 'target'

# Columns removed from the feature matrix in addition to the label itself.
COLS_TO_DROP_FOR_MODELING = ['readmitted', 'diag_1', 'diag_2', 'diag_3']

try:
    y = df[TARGET_COLUMN]
    X = df.drop(columns=[TARGET_COLUMN, *COLS_TO_DROP_FOR_MODELING])
except KeyError as err:
    # Both the label lookup and the drop raise KeyError for an absent column.
    print(f"Error: A required column is missing. {err}")
    print("Please ensure your cleaned data contains 'target' and the original 'diag' columns.")
    raise
else:
    print("Features (X) and Target (y) are defined.")
    print(f"X shape: {X.shape}, y shape: {y.shape}")


# Partition the feature columns by dtype. Everything that is not numeric is
# treated as categorical so the two groups always cover all of X. Selecting only
# 'object'/'category' would leave any other non-numeric dtype (for example bool
# or a pandas string extension dtype) to the ColumnTransformer's passthrough
# remainder, where it would reach the model without being encoded.
numerical_cols = X.select_dtypes(include=np.number).columns
categorical_cols = X.select_dtypes(exclude=np.number).columns

print(f"\nIdentified {len(numerical_cols)} numerical features.")
print(f"Identified {len(categorical_cols)} categorical features.")

# Numeric branch: standardise each numeric column.
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

# Categorical branch: dense one-hot encoding. handle_unknown='ignore' makes a
# category that was not seen during fit encode as all zeros instead of raising.
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

# Route each column group through its branch. With the exhaustive partition
# above the remainder is empty; 'passthrough' is kept as the declared fallback.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ],
    remainder='passthrough'
)

# Hold out the test set before SMOTE is applied so it keeps the original class
# balance. Stratifying on y gives both splits the same share of positives,
# which matters for an imbalanced target.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,     # 20% held out for testing
    stratify=y,        # same class balance in train and test
    random_state=42,   # reproducible split
)

print("\nData split into training and testing sets:")

# Shape report, one line per split.
for split_name, split_X, split_y in (("train", X_train, y_train), ("test", X_test, y_test)):
    print(f"X_{split_name} shape: {split_X.shape}, y_{split_name} shape: {split_y.shape}")

# Positive-class count and rate per split; the second label carries a trailing
# space so the printed numbers line up.
for split_label, split_y in (("y_train:", y_train), ("y_test: ", y_test)):
    print(f"Class 1 (Readmitted) in {split_label} {split_y.sum()} ({split_y.mean()*100:.2f}%)")


# Random Forest classifier; n_jobs=-1 lets it use every available CPU core.
rf_model = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=42)

# imblearn pipeline; steps run in this order during fit:
#   preprocessor -> scaling and one-hot encoding
#   smote        -> oversampling, applied to the training data only
#   model        -> the Random Forest defined above
# NOTE(review): SMOTE runs after the one-hot step, so it interpolates between
# encoded dummy columns. imblearn's SMOTENC targets mixed categorical/numeric
# data - evaluate whether it is a better fit here.
pipeline = ImbPipeline(steps=[
    ('preprocessor', preprocessor),
    ('smote', SMOTE(random_state=42)),
    ('model', rf_model),
])

print("\nStarting Model Training (with SMOTE)")
pipeline.fit(X_train, y_train)  # every step is fitted on the training split
print("Model Training Complete")

print("\nModel Evaluation (on Unseen Test Set)")

# Score the original, non-resampled test split.
y_pred = pipeline.predict(X_test)
y_pred_proba = pipeline.predict_proba(X_test)[:, 1]  # probability of class 1

# Per-class precision / recall / F1.
print("Classification Report:")
print(classification_report(y_test, y_pred, target_names=['Not Readmitted (0)', 'Readmitted (1)']))

# AUC-ROC is computed from the class-1 probabilities, not the hard labels.
auc_score = roc_auc_score(y_test, y_pred_proba)
print(f"AUC-ROC Score: {auc_score:.4f}")

print("\nConfusion Matrix")
# Rows are actual classes, columns are predicted classes.
cm = confusion_matrix(y_test, y_pred)
print("                 Predicted 0   |   Predicted 1")
print(f"Actual 0:    {cm[0,0]:>10}    |    {cm[0,1]:>10}  (False Positives)")
print(f"Actual 1:    {cm[1,0]:>10}    |    {cm[1,1]:>10}  (True Positives)")

# Report recall as a ratio as well as raw counts, matching the summary line
# printed by the tuned-model cell. Guard the denominator so a split without
# positives prints 0.00% instead of dividing by zero.
actual_positives = cm[1,0] + cm[1,1]
recall = cm[1,1] / actual_positives if actual_positives else 0.0
print(f"\nKey Metric (Recall): The model correctly identified {cm[1,1]} out of {actual_positives} actual readmissions. (Recall = {recall:.2%})")

# Persist the whole fitted pipeline object (preprocessor + SMOTE + model).
pipeline_filename = 'rf_smote_pipeline.joblib'
joblib.dump(value=pipeline, filename=pipeline_filename)
print(f"\nSuccessfully saved the full pipeline to '{pipeline_filename}'")

# Write the held-out split without the index column.
for held_out, csv_name in ((X_test, 'X_test.csv'), (y_test, 'y_test.csv')):
    held_out.to_csv(csv_name, index=False)
print("Successfully saved 'X_test.csv' and 'y_test.csv' for the visualization notebook.")

Libraries imported successfully.
Loaded cleaned data. Shape: (67113, 46)
Features (X) and Target (y) are defined.
X shape: (67113, 41), y shape: (67113,)

Identified 11 numerical features.
Identified 30 categorical features.

Data split into training and testing sets:
X_train shape: (53690, 41), y_train shape: (53690,)
X_test shape: (13423, 41), y_test shape: (13423,)
Class 1 (Readmitted) in y_train: 4888 (9.10%)
Class 1 (Readmitted) in y_test:  1222 (9.10%)

Starting Model Training (with SMOTE)
Model Training Complete

Model Evaluation (on Unseen Test Set)
Classification Report:
                    precision    recall  f1-score   support

Not Readmitted (0)       0.91      1.00      0.95     12201
    Readmitted (1)       0.10      0.00      0.00      1222

          accuracy                           0.91     13423
         macro avg       0.51      0.50      0.48     13423
      weighted avg       0.84      0.91      0.87     13423

AUC-ROC Score: 0.6068

Confusion Matrix
          

In [4]:
# imports
import pandas as pd
import numpy as np
import joblib

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix

# Printed once the import statements above have all succeeded.
print("Libraries imported successfully.")

# Path of the cleaned dataset read by this cell.
CLEANED_DATA_PATH = 'cleaned_diabetic_data.csv'

try:
    df = pd.read_csv(CLEANED_DATA_PATH)
except FileNotFoundError:
    # Report the missing file and re-raise to halt the cell.
    print(f"Error: '{CLEANED_DATA_PATH}' not found.")
    raise
else:
    print(f"Loaded cleaned data. Shape: {df.shape}")

# Label column and the extra columns excluded from the feature matrix.
TARGET_COLUMN = 'target'
COLS_TO_DROP_FOR_MODELING = ['readmitted', 'diag_1', 'diag_2', 'diag_3']

try:
    y = df[TARGET_COLUMN]
    X = df.drop(columns=[TARGET_COLUMN, *COLS_TO_DROP_FOR_MODELING])
except KeyError as err:
    # Raised by either statement above when a listed column is absent.
    print(f"Error: A required column is missing. {err}")
    raise
else:
    print(f"Features (X) and Target (y) are defined. X shape: {X.shape}")

# Partition the feature columns by dtype. Every non-numeric column is treated
# as categorical so the two groups together cover all of X. Restricting the
# second group to 'object'/'category' would let any other non-numeric dtype
# (for example bool or a pandas string extension dtype) fall through to the
# passthrough remainder and reach the model unencoded.
numerical_cols = X.select_dtypes(include=np.number).columns
categorical_cols = X.select_dtypes(exclude=np.number).columns

print(f"Identified {len(numerical_cols)} numerical features.")
print(f"Identified {len(categorical_cols)} categorical features.")

# Numeric columns are standardised; categorical columns are one-hot encoded
# into a dense array, with categories unseen during fit encoded as all zeros.
numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])
categorical_transformer = Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))])

# Route each column group through its transformer. The remainder is empty given
# the exhaustive partition above; 'passthrough' is kept as the declared fallback.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ],
    remainder='passthrough'
)

# Stratified 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)
print("Data split into training and testing sets.")

# Plain scikit-learn pipeline: preprocessing followed by a Random Forest.
# The forest's remaining hyperparameters are set by the search grid.
pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('model', RandomForestClassifier(n_jobs=-1, random_state=42)),
    ]
)

print("\nModel Pipeline Created")


# Hyperparameter grid. The 'model__' prefix addresses the pipeline's 'model'
# step. 2 x 2 x 2 x 2 = 16 candidate combinations.
param_grid = {
    'model__n_estimators': [100, 150],       # number of trees
    'model__max_depth': [10, 20],            # depth cap per tree
    'model__min_samples_leaf': [5, 10],      # larger leaves help limit overfitting
    'model__class_weight': ['balanced', 'balanced_subsample'],  # the key change: class weighting
}

# 3-fold grid search. scoring='recall' ranks candidates by recall on the
# positive class, so the selected model is the one with the highest recall.
# NOTE(review): both the search and the forest inside the pipeline request
# n_jobs=-1 - check for CPU oversubscription on the target machine.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='recall',
    cv=3,
    n_jobs=-1,   # use all cores
    verbose=2,   # show progress
)

print("\nStarting Hyperparameter Tuning (GridSearchCV)")
print("This will take several minutes")
grid_search.fit(X_train, y_train)  # the search only sees the training split

print("Tuning Complete")
print(f"Best parameters found: {grid_search.best_params_}")
print(f"Best recall score (on validation data): {grid_search.best_score_:.4f}")

# The estimator GridSearchCV exposes for the best parameter combination.
best_model = grid_search.best_estimator_

print("\nEvaluation of BEST Model (on Unseen Test Set)")

# Hard labels and class-1 probabilities for the held-out split.
y_pred = best_model.predict(X_test)
y_pred_proba = best_model.predict_proba(X_test)[:, 1]

# Per-class precision / recall / F1.
print("Classification Report:")
report_text = classification_report(
    y_test,
    y_pred,
    target_names=['Not Readmitted (0)', 'Readmitted (1)'],
)
print(report_text)

# AUC-ROC is computed from the probabilities rather than the hard labels.
auc_score = roc_auc_score(y_test, y_pred_proba)
print(f"AUC-ROC Score: {auc_score:.4f}")

print("\nConfusion Matrix")
cm = confusion_matrix(y_test, y_pred)
# Name the four cells: rows are actual classes, columns are predicted classes.
true_neg, false_pos = cm[0, 0], cm[0, 1]
false_neg, true_pos = cm[1, 0], cm[1, 1]
print("                 Predicted 0   |   Predicted 1")
print(f"Actual 0:    {true_neg:>10}    |    {false_pos:>10}  (False Positives)")
print(f"Actual 1:    {false_neg:>10}    |    {true_pos:>10}  (True Positives)")

# Recall = true positives / all actual positives.
recall = true_pos / (false_neg + true_pos)
print(f"\nKey Metric (Recall): The model correctly identified {true_pos} out of {false_neg + true_pos} actual readmissions. (Recall = {recall:.2%})")

# Persist the best pipeline found by the search.
pipeline_filename = 'rf_tuned_pipeline.joblib'
joblib.dump(value=best_model, filename=pipeline_filename)
print(f"\nSuccessfully saved the tuned pipeline to '{pipeline_filename}'")

# Write the held-out split without the index column.
for held_out, csv_name in ((X_test, 'X_test.csv'), (y_test, 'y_test.csv')):
    held_out.to_csv(csv_name, index=False)
print("Successfully saved 'X_test.csv' and 'y_test.csv' for the visualization notebook.")

Libraries imported successfully.
Loaded cleaned data. Shape: (67113, 46)
Features (X) and Target (y) are defined. X shape: (67113, 41)
Identified 11 numerical features.
Identified 30 categorical features.
Data split into training and testing sets.

Model Pipeline Created

Starting Hyperparameter Tuning (GridSearchCV)
This will take several minutes
Fitting 3 folds for each of 16 candidates, totalling 48 fits
Tuning Complete
Best parameters found: {'model__class_weight': 'balanced_subsample', 'model__max_depth': 10, 'model__min_samples_leaf': 10, 'model__n_estimators': 150}
Best recall score (on validation data): 0.4673

Evaluation of BEST Model (on Unseen Test Set)
Classification Report:
                    precision    recall  f1-score   support

Not Readmitted (0)       0.93      0.69      0.79     12201
    Readmitted (1)       0.14      0.49      0.21      1222

          accuracy                           0.67     13423
         macro avg       0.53      0.59      0.50     13423
  