#### Hospital Readmission Prediction - Model Training and Evaluation

In [None]:
# Notebook-wide imports, grouped as standard library first, third-party second
import json
import os
import warnings
from datetime import datetime

import numpy as np
import pandas as pd

# Silence all Python warnings for the rest of the session so the cell
# output stays readable.
warnings.filterwarnings('ignore')

print('‚úÖ Libraries imported successfully')

In [None]:
# Create (or reuse) the Spark session that the later cells read data with
from pyspark.sql import SparkSession

# Parenthesised builder chain: one configuration step per line.
spark = (
    SparkSession.builder
    .appName("HospitalReadmissionPrediction")
    .config("spark.sql.adaptive.enabled", "true")
    .getOrCreate()
)

print("‚úÖ Spark Session initialized")
print(f"Spark Version: {spark.version}")

In [None]:
# Load the preprocessed dataset into a Spark DataFrame (`df_spark`)
processed_data_path = "../data/processed/diabetes_processed.csv"

if not os.path.exists(processed_data_path):
    print(f"‚ùå File not found: {processed_data_path}")
    print("Please run the data preprocessing notebook first")
    # Fail fast: continuing would leave `df_spark` undefined (or stale from an
    # earlier run of this notebook), so later cells would either crash with a
    # NameError or silently train on old data.
    raise FileNotFoundError(processed_data_path)

# The first row is the header; column types are inferred from the data.
df_spark = spark.read.csv(processed_data_path, header=True, inferSchema=True)
print(f"‚úÖ Data loaded: {df_spark.count()} rows, {len(df_spark.columns)} columns")
df_spark.show(5)

In [None]:
# Separate the label column from the predictor columns
target_col = 'readmitted'
# Every column other than the label is used as a model input.
feature_cols = [name for name in df_spark.columns if name != target_col]

print(f"Target column: {target_col}")
feature_preview = feature_cols[:5]
print(f"Feature columns ({len(feature_cols)}): {feature_preview}...")

# Show how many rows fall into each value of the label
label_counts = df_spark.groupBy(target_col).count()
label_counts.show()

In [None]:
# Pack the individual feature columns into a single "features" vector column
from pyspark.ml.feature import VectorAssembler

assembler_params = {
    "inputCols": feature_cols,
    "outputCol": "features",
    # "skip" filters out rows with invalid values instead of raising
    "handleInvalid": "skip",
}
assembler = VectorAssembler(**assembler_params)

df_features = assembler.transform(df_spark)
# Only the assembled vector and the label are needed from here on.
df_final = df_features.select("features", target_col)

print("‚úÖ Feature vector created")
final_row_count = df_final.count()
print(f"Final dataset: {final_row_count} rows")

In [None]:
# Randomly split the rows into training and test sets with 0.7 / 0.3
# weights, using a fixed seed
split_weights = [0.7, 0.3]
train_data, test_data = df_final.randomSplit(split_weights, seed=42)

for split_name, split_df in (("Training data", train_data), ("Test data", test_data)):
    print(f"{split_name}: {split_df.count()} rows")

print("\nTarget distribution in training set:")
train_label_counts = train_data.groupBy(target_col).count()
train_label_counts.show()

In [None]:
# Fit each candidate classifier and score it on the training and test splits
from pyspark.ml.classification import GBTClassifier, LogisticRegression, RandomForestClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

# Column arguments shared by all three estimators
shared_cols = dict(featuresCol='features', labelCol=target_col)

models = {
    'logistic_regression': LogisticRegression(**shared_cols),
    'random_forest': RandomForestClassifier(numTrees=100, **shared_cols),
    'gradient_boosting': GBTClassifier(maxIter=100, **shared_cols),
}

# Evaluators
auc_evaluator = BinaryClassificationEvaluator(labelCol=target_col, metricName="areaUnderROC")
accuracy_evaluator = MulticlassClassificationEvaluator(labelCol=target_col, metricName="accuracy")
f1_evaluator = MulticlassClassificationEvaluator(labelCol=target_col, metricName="f1")

# (result-key suffix, evaluator) pairs, in the order the result columns appear
metric_evaluators = [
    ('auc_roc', auc_evaluator),
    ('Accuracy', accuracy_evaluator),
    ('f1', f1_evaluator),
]

results = []
trained_models = {}

print("üöÄ Training models...")

for name, model in models.items():
    print(f"\nTraining {name}...")
    try:
        trained_model = model.fit(train_data)
        # Register the fitted model before scoring, so it is kept even if an
        # evaluation step below raises.
        trained_models[name] = trained_model

        train_pred = trained_model.transform(train_data)
        test_pred = trained_model.transform(test_data)

        # Keys come out as Train_<metric>, Test_<metric> for each metric in turn.
        result = {'Model_Name': name}
        for suffix, evaluator in metric_evaluators:
            for split_label, predictions in (('Train', train_pred), ('Test', test_pred)):
                result[f"{split_label}_{suffix}"] = round(evaluator.evaluate(predictions), 4)
        results.append(result)

        print(f"  ‚úÖ {name} - AUC: {result['Test_auc_roc']}, Accuracy: {result['Test_Accuracy']}")

    except Exception as e:
        # Best effort: report the failure and carry on with the next model.
        print(f"  ‚ùå Error training {name}: {e}")

print("\n‚úÖ Training completed!")

In [None]:
# Display and save results: rank the models, then write the metrics table,
# the test split and the best model into a timestamped run directory.

# An empty `results` list means no model produced a metrics row. Without this
# guard the sort below fails with a cryptic KeyError on the missing
# 'Test_auc_roc' column.
if not results:
    raise RuntimeError(
        "No model was trained and evaluated successfully; nothing to rank or save. "
        "Check the errors printed by the training loop."
    )

results_df = pd.DataFrame(results)
# Highest test AUC first, so row 0 is the best model.
# NOTE(review): the ranking uses test-set AUC, so the reported test metrics of
# the selected model are optimistic; consider a separate validation split.
results_df = results_df.sort_values('Test_auc_roc', ascending=False)

print("üìä Model Performance Summary:")
print(results_df)

# Save results
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
output_dir = f"../model/run_{timestamp}"
os.makedirs(output_dir, exist_ok=True)

# Save performance
results_path = os.path.join(output_dir, "model_performance.csv")
results_df.to_csv(results_path, index=False)

# Save test data
# NOTE(review): toPandas() collects the whole test split onto the driver, and
# the "features" vector column is presumably written as its string
# representation - confirm downstream readers can parse it.
test_data_path = os.path.join(output_dir, "test_data.csv")
test_data.toPandas().to_csv(test_data_path, index=False)

# Save best model
best_model_name = results_df.iloc[0]['Model_Name']
best_model = trained_models[best_model_name]
best_model_path = os.path.join(output_dir, f"best_model_{best_model_name}")
best_model.write().overwrite().save(best_model_path)

print(f"\nüíæ Results saved to: {output_dir}")
print(f"üèÜ Best model: {best_model_name}")

# Stop the Spark session now that all outputs are written.
spark.stop()