In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (accuracy_score, precision_score, recall_score, 
                            f1_score, roc_auc_score, confusion_matrix, 
                            classification_report, roc_curve, auc)
import joblib

In [None]:
# 1. Data Loading and Preparation
def load_data(url=None):
    """Load and prepare the German Credit dataset.

    Args:
        url: Optional URL or local file path of a space-delimited, header-less
            file with the 21 German Credit columns. Defaults to the UCI
            repository copy, so existing ``load_data()`` calls are unchanged.
            Passing a local path allows offline / cached runs.

    Returns:
        pandas.DataFrame with named columns, where ``credit_risk`` is recoded
        to 1 = Good, 0 = Bad.

    Raises:
        ValueError: if the label column contains anything other than 1 or 2.
    """
    if url is None:
        url = "https://archive.ics.uci.edu/ml/machine-learning-databases/statlog/german/german.data"

    column_names = [
        'checking_account', 'duration', 'credit_history', 'purpose', 'credit_amount',
        'savings_account', 'employment', 'installment_rate', 'personal_status',
        'other_debtors', 'residence_since', 'property', 'age', 'other_installment_plans',
        'housing', 'existing_credits', 'job', 'dependents', 'telephone', 'foreign_worker', 'credit_risk'
    ]

    data = pd.read_csv(url, delimiter=' ', header=None, names=column_names)

    # The raw file encodes the target as 1 (good) / 2 (bad). Fail loudly on
    # anything else rather than silently training on a corrupted target.
    label_map = {1: 1, 2: 0}  # 1=Good, 0=Bad
    unexpected = sorted(set(data.loc[~data['credit_risk'].isin(list(label_map)), 'credit_risk'].astype(str)))
    if unexpected:
        raise ValueError(f"Unexpected credit_risk labels: {unexpected}")

    data['credit_risk'] = data['credit_risk'].map(label_map)
    return data

In [None]:
# 2. Preprocessing Setup
def get_preprocessor():
    """Build the unfitted ColumnTransformer used to prepare the credit features.

    Numeric columns go through median imputation followed by standard scaling.
    Categorical columns have missing entries filled with the constant
    'missing' and are then one-hot encoded with ``handle_unknown='ignore'``.

    Returns:
        ColumnTransformer whose transformers are 'num' and 'cat', in that order.
    """
    numeric_columns = [
        'duration', 'credit_amount', 'installment_rate',
        'residence_since', 'age', 'existing_credits', 'dependents',
    ]
    categorical_columns = [
        'checking_account', 'credit_history', 'purpose',
        'savings_account', 'employment', 'personal_status',
        'other_debtors', 'property', 'other_installment_plans',
        'housing', 'job', 'telephone', 'foreign_worker',
    ]

    numeric_steps = [
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ]
    categorical_steps = [
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ]

    # Order matters to callers that index the transformers positionally:
    # numeric block first, categorical block second.
    return ColumnTransformer(transformers=[
        ('num', Pipeline(steps=numeric_steps), numeric_columns),
        ('cat', Pipeline(steps=categorical_steps), categorical_columns),
    ])

In [None]:
# 3. Model Training with Hyperparameter Tuning
def train_random_forest(X_train, y_train, preprocessor):
    """Grid-search a preprocessing + Random Forest pipeline on the training data.

    Args:
        X_train: Training features passed to ``GridSearchCV.fit``.
        y_train: Training labels passed to ``GridSearchCV.fit``.
        preprocessor: Transformer placed as the 'preprocessor' pipeline step.

    Returns:
        The GridSearchCV object after ``fit`` has been called (5-fold CV,
        scored by ROC AUC).
    """
    forest = RandomForestClassifier(random_state=42, class_weight='balanced')
    model_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', forest),
    ])

    # Keys use the '<step>__<param>' form so they reach the classifier step.
    search_space = dict(
        classifier__n_estimators=[100, 200],
        classifier__max_depth=[None, 10, 20],
        classifier__min_samples_split=[2, 5],
        classifier__min_samples_leaf=[1, 2],
        classifier__max_features=['sqrt'],
    )

    tuner = GridSearchCV(
        estimator=model_pipeline,
        param_grid=search_space,
        scoring='roc_auc',
        cv=5,
        n_jobs=-1,
        verbose=1,
    )
    tuner.fit(X_train, y_train)
    return tuner


In [None]:
# 4. Model Evaluation
def evaluate_model(model, X_test, y_test):
    """Score a fitted classifier on held-out data, print a report and draw its ROC curve.

    Args:
        model: Fitted estimator providing ``predict`` and ``predict_proba``.
        X_test: Held-out features.
        y_test: Held-out true labels.

    Returns:
        dict mapping 'Accuracy', 'Precision', 'Recall', 'F1 Score' and
        'ROC AUC' to their values on the test data.
    """
    predicted_labels = model.predict(X_test)
    # Second probability column is used as the score for ranking metrics.
    positive_scores = model.predict_proba(X_test)[:, 1]

    # Metrics computed from hard label predictions, in reporting order.
    label_scorers = (
        ('Accuracy', accuracy_score),
        ('Precision', precision_score),
        ('Recall', recall_score),
        ('F1 Score', f1_score),
    )
    metrics = {label: scorer(y_test, predicted_labels) for label, scorer in label_scorers}
    metrics['ROC AUC'] = roc_auc_score(y_test, positive_scores)

    print("\nModel Performance:")
    for label in metrics:
        print(f"{label}: {metrics[label]:.4f}")

    print("\nClassification Report:")
    print(classification_report(y_test, predicted_labels))

    print("\nConfusion Matrix:")
    print(confusion_matrix(y_test, predicted_labels))

    # ROC curve with the chance diagonal for reference.
    false_pos_rate, true_pos_rate, _thresholds = roc_curve(y_test, positive_scores)
    area_under_curve = auc(false_pos_rate, true_pos_rate)
    curve_label = f'ROC curve (area = {area_under_curve:.2f})'

    plt.figure(figsize=(8, 6))
    plt.plot(false_pos_rate, true_pos_rate, color='darkorange', lw=2, label=curve_label)
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic')
    plt.legend(loc="lower right")
    plt.show()

    return metrics


In [None]:
# 5. Feature Importance Analysis
def plot_feature_importance(model, preprocessor):
    """Plot the 20 largest feature importances of the trained classifier.

    Args:
        model: Fitted pipeline exposing ``named_steps`` with a 'classifier'
            step that provides ``feature_importances_``. If it also has a
            fitted 'preprocessor' step, that step supplies the feature names.
        preprocessor: Fallback ColumnTransformer, used only when the model
            does not carry a fitted 'preprocessor' step. Must be fitted in
            that case.

    Raises:
        AttributeError: if neither the model's preprocessor step nor the
            ``preprocessor`` argument has been fitted.
    """
    # GridSearchCV fits clones of the pipeline, so the preprocessor object a
    # caller built before training is typically still unfitted (it has no
    # `transformers_`). The step inside the fitted model is the one whose
    # output columns line up with `feature_importances_`, so prefer it.
    fitted_preprocessor = model.named_steps.get('preprocessor')
    if fitted_preprocessor is None or not hasattr(fitted_preprocessor, 'transformers_'):
        fitted_preprocessor = preprocessor

    # Entries are (name, transformer, columns); numeric block first,
    # categorical block second.
    numeric_features = fitted_preprocessor.transformers_[0][2]
    categorical_features = fitted_preprocessor.transformers_[1][2]

    # Expand categorical columns into their one-hot output names.
    categorical_transformer = fitted_preprocessor.transformers_[1][1]
    if hasattr(categorical_transformer, 'named_steps'):
        onehot = categorical_transformer.named_steps['onehot']
        cat_feature_names = onehot.get_feature_names_out(categorical_features)
    else:
        cat_feature_names = categorical_transformer.get_feature_names_out(categorical_features)

    all_feature_names = list(numeric_features) + list(cat_feature_names)

    importances = model.named_steps['classifier'].feature_importances_

    # Keep only the 20 most important features, largest first.
    feature_importances = pd.DataFrame({'feature': all_feature_names, 'importance': importances})
    feature_importances = feature_importances.sort_values('importance', ascending=False).head(20)

    plt.figure(figsize=(10, 6))
    plt.barh(feature_importances['feature'], feature_importances['importance'])
    plt.xlabel('Importance')
    plt.title('Top 20 Feature Importances')
    plt.gca().invert_yaxis()  # put the most important feature at the top
    plt.tight_layout()
    plt.show()

In [None]:
# 6. Model Saving and Prediction Example
def save_and_predict(model, preprocessor):
    """Persist the model with joblib and print a prediction for one example applicant.

    Args:
        model: Fitted estimator providing ``predict`` and ``predict_proba``
            on a DataFrame of raw feature columns.
        preprocessor: Not used by this function; kept in the signature for
            existing callers.
    """
    joblib.dump(model, 'random_forest_credit_model.pkl')
    print("\nModel saved as 'random_forest_credit_model.pkl'")

    # One hand-written applicant covering every input column.
    applicant = dict(
        checking_account='A11',
        duration=24,
        credit_history='A32',
        purpose='A43',
        credit_amount=5000,
        savings_account='A61',
        employment='A73',
        installment_rate=4,
        personal_status='A93',
        other_debtors='A101',
        residence_since=4,
        property='A121',
        age=35,
        other_installment_plans='A143',
        housing='A152',
        existing_credits=2,
        job='A173',
        dependents=1,
        telephone='A192',
        foreign_worker='A201',
    )
    sample_df = pd.DataFrame([applicant])

    predicted_class = model.predict(sample_df)[0]
    good_probability = model.predict_proba(sample_df)[:, 1][0]

    if predicted_class == 1:
        verdict = 'Good'
    else:
        verdict = 'Bad'

    print("\nSample Prediction:")
    print(f"Predicted Credit Risk: {verdict}")
    print(f"Probability of Good Credit: {good_probability:.2%}")

# Main Execution
def main():
    """Run the full workflow: load, split, tune, evaluate, explain, save.

    Prints progress for each stage. The feature-importance and save steps
    receive the fitted preprocessor taken from the best estimator, not the
    unfitted one created before training.
    """
    print("1. Loading data...")
    data = load_data()
    X = data.drop('credit_risk', axis=1)
    y = data['credit_risk']

    print("\n2. Splitting data into train/test sets...")
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y)

    print("\n3. Setting up preprocessing...")
    preprocessor = get_preprocessor()

    print("\n4. Training Random Forest model with hyperparameter tuning...")
    grid_search = train_random_forest(X_train, y_train, preprocessor)
    best_model = grid_search.best_estimator_
    print(f"\nBest parameters: {grid_search.best_params_}")

    # GridSearchCV fits clones of the pipeline, so `preprocessor` above is
    # never fitted and has no `transformers_`. The fitted instance lives
    # inside the refit best estimator.
    fitted_preprocessor = best_model.named_steps['preprocessor']

    print("\n5. Evaluating model performance...")
    evaluate_model(best_model, X_test, y_test)

    print("\n6. Analyzing feature importance...")
    plot_feature_importance(best_model, fitted_preprocessor)

    print("\n7. Saving model and testing prediction...")
    save_and_predict(best_model, fitted_preprocessor)


if __name__ == "__main__":
    main()