In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (accuracy_score, precision_score, recall_score, 
                           f1_score, roc_auc_score, confusion_matrix, 
                           classification_report, roc_curve, auc)
import joblib

In [None]:
# 1. Data Loading and Preparation
def load_medical_data(url=None, local_path='heart_disease.csv'):
    """Load the heart-disease table and convert its target to a 0/1 label.

    The remote file is tried first. If reading it fails with an ordinary
    error (no network, HTTP error, unparsable payload, ...) the local copy
    is read instead. Both sources are read without a header row, using the
    14 column names defined below, and ``'?'`` is treated as a missing value.

    Parameters
    ----------
    url : str, optional
        Location of the remote CSV. Defaults to the processed Cleveland file
        in the UCI Machine Learning Repository.
    local_path : str, optional
        File read when ``url`` cannot be loaded.

    Returns
    -------
    pandas.DataFrame
        Thirteen feature columns plus ``target``, where ``target`` is 1 for
        every original value greater than 0 and 0 otherwise.

    Raises
    ------
    Exception
        Whatever ``pd.read_csv`` raises for ``local_path`` if the fallback
        read fails as well.
    """
    # Note: in practice you would point this at your actual medical dataset.
    # The default is the Cleveland heart-disease file hosted by UCI.
    if url is None:
        url = "https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data"

    column_names = [
        'age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg',
        'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'target'
    ]

    try:
        data = pd.read_csv(url, names=column_names, na_values='?')
    except Exception as exc:
        # `Exception` (not a bare except) keeps the best-effort fallback while
        # letting KeyboardInterrupt / SystemExit stop the notebook as expected.
        print("Unable to fetch online data, loading local copy if available")
        print(f"Reason: {exc!r}")
        data = pd.read_csv(local_path, names=column_names, na_values='?')

    # Convert target to binary (0 = no disease, 1 = disease); values that are
    # not greater than 0 (including missing ones) map to 0.
    data['target'] = (data['target'] > 0).astype(int)

    return data


Replace the body of `load_medical_data()` with code that loads your own dataset.

In [None]:
# 2. Preprocessing Setup
def get_medical_preprocessor():
    """Build the (unfitted) column-wise preprocessing step.

    Returns
    -------
    tuple
        ``(preprocessor, categorical_features, numerical_features)``: the
        ``ColumnTransformer`` followed by the two column-name lists it was
        built from. Note that the categorical list comes first.
    """
    # Column groups: continuous measurements vs. integer-coded categories
    numerical_features = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']
    categorical_features = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal']

    # Continuous columns: fill gaps with the median, then standardise
    numeric_steps = [
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ]

    # Coded columns: fill gaps with the most frequent value, then one-hot
    # encode; handle_unknown='ignore' avoids errors on categories that were
    # not present during fitting
    categorical_steps = [
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ]

    # Numeric block first, categorical block second
    column_blocks = [
        ('num', Pipeline(steps=numeric_steps), numerical_features),
        ('cat', Pipeline(steps=categorical_steps), categorical_features),
    ]
    preprocessor = ColumnTransformer(transformers=column_blocks)

    return preprocessor, categorical_features, numerical_features

Adjust the `categorical_features` and `numerical_features` lists in `get_medical_preprocessor()` so that they match the columns of your data.

In [None]:
# 3. Model Training with Random Forest
def train_medical_model(X_train, y_train, preprocessor):
    """Grid-search a Random Forest pipeline on the training data.

    Parameters
    ----------
    X_train : training features, passed unchanged to ``GridSearchCV.fit``.
    y_train : training labels aligned with ``X_train``.
    preprocessor : transformer placed ahead of the classifier in the pipeline.

    Returns
    -------
    GridSearchCV
        The search object after ``fit`` (5-fold CV, scored by ROC AUC).
    """
    forest = RandomForestClassifier(random_state=42, class_weight='balanced')
    model_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', forest),
    ])

    # Deliberately small grid (2 * 3 * 2 * 2 = 24 candidates) for faster
    # execution; keys use the "<step>__<parameter>" form to reach the
    # classifier inside the pipeline.
    search_space = dict(
        classifier__n_estimators=[100, 200],
        classifier__max_depth=[None, 10, 20],
        classifier__min_samples_split=[2, 5],
        classifier__min_samples_leaf=[1, 2],
    )

    search = GridSearchCV(
        model_pipeline,
        search_space,
        cv=5,
        scoring='roc_auc',
        n_jobs=-1,
        verbose=1,
    )
    search.fit(X_train, y_train)

    return search

In [None]:
# 4. Model Evaluation
def evaluate_medical_model(model, X_test, y_test):
    """Score a fitted classifier on held-out data, print a report and draw two plots.

    Parameters
    ----------
    model : fitted estimator exposing ``predict`` and ``predict_proba``.
    X_test : held-out features.
    y_test : held-out labels (0 = no disease, 1 = disease).

    Returns
    -------
    dict
        Metric name -> value, in the order Accuracy, Precision,
        Recall (Sensitivity), Specificity, F1 Score, ROC AUC.
    """
    predicted_labels = model.predict(X_test)
    # Second predict_proba column: score used for the ROC metrics
    # (the positive class when labels are 0/1)
    positive_scores = model.predict_proba(X_test)[:, 1]

    # Threshold-based metrics use the hard labels; ROC AUC uses the scores
    metrics = {}
    metrics['Accuracy'] = accuracy_score(y_test, predicted_labels)
    metrics['Precision'] = precision_score(y_test, predicted_labels)
    metrics['Recall (Sensitivity)'] = recall_score(y_test, predicted_labels)
    # Specificity is the recall of the negative class
    metrics['Specificity'] = recall_score(y_test, predicted_labels, pos_label=0)
    metrics['F1 Score'] = f1_score(y_test, predicted_labels)
    metrics['ROC AUC'] = roc_auc_score(y_test, positive_scores)

    # Text report
    print("\nModel Performance Metrics:")
    print("\n".join(f"{label}: {score:.4f}" for label, score in metrics.items()))

    print("\nClassification Report:")
    print(classification_report(y_test, predicted_labels))

    # Confusion matrix heatmap (rows = actual, columns = predicted)
    class_names = ['No Disease', 'Disease']
    matrix = confusion_matrix(y_test, predicted_labels)
    plt.figure(figsize=(6, 6))
    sns.heatmap(matrix, annot=True, fmt='d', cmap='Blues',
                xticklabels=class_names,
                yticklabels=class_names)
    plt.title('Confusion Matrix')
    plt.ylabel('Actual')
    plt.xlabel('Predicted')
    plt.show()

    # ROC curve with the chance diagonal for reference
    false_pos_rate, true_pos_rate, _ = roc_curve(y_test, positive_scores)
    area_under_curve = auc(false_pos_rate, true_pos_rate)

    plt.figure(figsize=(8, 6))
    plt.plot(false_pos_rate, true_pos_rate, color='darkorange', lw=2,
             label=f'ROC curve (area = {area_under_curve:.2f})')
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate (1 - Specificity)')
    plt.ylabel('True Positive Rate (Sensitivity)')
    plt.title('Receiver Operating Characteristic')
    plt.legend(loc="lower right")
    plt.show()

    return metrics

In [None]:
# 5. Feature Importance Analysis
def plot_medical_feature_importance(model, preprocessor, numerical_features, categorical_features):
    """Plot the 15 largest classifier feature importances as a horizontal bar chart.

    Parameters
    ----------
    model : fitted pipeline
        Must expose ``named_steps['classifier'].feature_importances_``.
    preprocessor : ColumnTransformer
        Source of the one-hot column names. When ``model`` carries its own
        fitted ``'preprocessor'`` step, that step is used instead: a pipeline
        returned by ``GridSearchCV`` is a fitted clone, so the object that was
        originally handed to the search is still unfitted and has no
        ``named_transformers_``.
    numerical_features, categorical_features : sequence of str
        Column names given to the ``'num'`` and ``'cat'`` transformers; the
        numeric names are assumed to come first in the transformed matrix.

    Raises
    ------
    ValueError
        If the number of reconstructed feature names differs from the number
        of importances reported by the classifier.
    """
    # Prefer the preprocessor that actually produced the classifier's inputs
    fitted_preprocessor = preprocessor
    pipeline_steps = getattr(model, 'named_steps', None)
    if pipeline_steps is not None and 'preprocessor' in pipeline_steps:
        candidate = pipeline_steps['preprocessor']
        if hasattr(candidate, 'named_transformers_'):
            fitted_preprocessor = candidate

    # Get feature names: the categorical transformer is either a Pipeline
    # ending in the encoder or the encoder itself
    categorical_transformer = fitted_preprocessor.named_transformers_['cat']
    if hasattr(categorical_transformer, 'named_steps'):
        onehot = categorical_transformer.named_steps['onehot']
        cat_feature_names = onehot.get_feature_names_out(categorical_features)
    else:
        cat_feature_names = categorical_transformer.get_feature_names_out(categorical_features)

    # list(...) so tuples / arrays of names work as well as lists
    all_feature_names = list(numerical_features) + list(cat_feature_names)

    # Get feature importances
    importances = model.named_steps['classifier'].feature_importances_
    if len(all_feature_names) != len(importances):
        raise ValueError(
            f"Got {len(all_feature_names)} feature names but {len(importances)} importances; "
            "check that the feature lists match the fitted preprocessor"
        )

    # Create DataFrame and keep the 15 most important features
    feature_importances = pd.DataFrame({'feature': all_feature_names, 'importance': importances})
    feature_importances = feature_importances.sort_values('importance', ascending=False).head(15)

    # Plot (y axis inverted so the most important feature is at the top)
    plt.figure(figsize=(10, 6))
    plt.barh(feature_importances['feature'], feature_importances['importance'])
    plt.xlabel('Importance')
    plt.title('Top 15 Predictive Features')
    plt.gca().invert_yaxis()
    plt.tight_layout()
    plt.show()

In [None]:
# 6. Model Deployment Example
def save_and_predict_medical(model, model_path='disease_prediction_model.pkl', sample_patient=None):
    """Persist the model with joblib and print a prediction for one patient.

    Parameters
    ----------
    model : fitted estimator exposing ``predict`` and ``predict_proba``.
    model_path : str, optional
        Destination passed to ``joblib.dump``.
    sample_patient : mapping of column name -> value, optional
        A single patient record. When omitted, a built-in example record
        with the 13 heart-disease feature columns is used.

    Returns
    -------
    None
        Results are printed, not returned.
    """
    # Save the model
    joblib.dump(model, model_path)
    print(f"\nModel saved as '{model_path}'")

    # Default sample patient; values must use the same codes as the training
    # data, because the encoder ignores (zero-encodes) unseen categories.
    if sample_patient is None:
        sample_patient = {
            'age': 58,
            'sex': 1,          # 1 = male, 0 = female
            'cp': 3,           # chest pain type (coded 1-4 in the Cleveland file)
            'trestbps': 140,   # resting blood pressure
            'chol': 289,       # serum cholesterol
            'fbs': 0,          # fasting blood sugar > 120 mg/dl
            'restecg': 1,      # resting electrocardiographic results
            'thalach': 140,    # maximum heart rate achieved
            'exang': 0,        # exercise induced angina
            'oldpeak': 3.5,    # ST depression induced by exercise
            'slope': 2,        # slope of peak exercise ST segment
            'ca': 1,           # number of major vessels colored by fluoroscopy
            'thal': 3          # thalassemia (3 = normal, 6 = fixed defect, 7 = reversible defect)
        }

    # One-row frame so column names line up with what the pipeline was fitted on
    sample_df = pd.DataFrame([sample_patient])

    # Make prediction; column 1 of predict_proba is reported as the disease probability
    prediction = model.predict(sample_df)
    probability = model.predict_proba(sample_df)[:, 1]

    print("\nSample Patient Prediction:")
    print(f"Predicted Disease Status: {'Disease Present' if prediction[0] == 1 else 'No Disease'}")
    print(f"Probability of Disease: {probability[0]:.1%}")


Adjust the values in `sample_patient` to try out predictions for the patient profiles you want to test.

In [None]:
# Main Execution
def main():
    """Run the whole workflow: load, explore, split, train, evaluate, explain, save.

    Progress and results are printed; plots are shown by the evaluation and
    feature-importance steps. Returns ``None``.
    """
    print("1. Loading medical data...")
    data = load_medical_data()

    # Basic data exploration
    print("\nData Overview:")
    print(f"Total records: {len(data)}")
    print(f"Disease prevalence: {data['target'].mean():.1%}")
    print("\nFirst 5 records:")
    print(data.head())

    # Check for missing values
    print("\nMissing values per column:")
    print(data.isnull().sum())

    # Missing values are deliberately NOT filled here. The SimpleImputer steps
    # in the preprocessing pipeline learn their fill values from the training
    # folds only; imputing the full table before the split would leak
    # test-set statistics into training.
    X = data.drop(columns='target')
    y = data['target']

    print("\n2. Splitting data into train/test sets...")
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y)

    print("\n3. Setting up medical data preprocessing...")
    preprocessor, categorical_features, numerical_features = get_medical_preprocessor()

    print("\n4. Training Random Forest model...")
    grid_search = train_medical_model(X_train, y_train, preprocessor)
    best_model = grid_search.best_estimator_
    print(f"\nBest parameters: {grid_search.best_params_}")

    print("\n5. Evaluating model performance...")
    evaluate_medical_model(best_model, X_test, y_test)

    print("\n6. Analyzing important medical features...")
    # GridSearchCV fits clones, so `preprocessor` itself is still unfitted;
    # the fitted copy is the 'preprocessor' step of the best pipeline.
    fitted_preprocessor = best_model.named_steps['preprocessor']
    plot_medical_feature_importance(best_model, fitted_preprocessor, numerical_features, categorical_features)

    print("\n7. Saving model and testing prediction...")
    save_and_predict_medical(best_model)

# Run the complete workflow only when this code is executed directly
# (not when it is imported as a module).
if __name__ == "__main__":
    main()