In [205]:
import pandas as pd
import numpy as np
import pyarrow.parquet as pq
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler, OneHotEncoder, KBinsDiscretizer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error, r2_score
from loguru import logger

In [206]:
# Set up logging
logger.add("../logs/model_training.log", rotation="500 MB")

17

In [207]:
logger.info("Starting model training")

[32m2024-08-08 12:46:40.494[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m1[0m - [1mStarting model training[0m


In [208]:
def load_data():
    """Load the diabetes dataset from Parquet files."""
    X_train = pq.read_table('../data/interim/X_train.parquet').to_pandas()
    X_val = pq.read_table('../data/interim/X_val.parquet').to_pandas()
    X_test = pq.read_table('../data/interim/X_test.parquet').to_pandas()
    y_train = pq.read_table('../data/interim/y_train.parquet').to_pandas()['target']
    y_val = pq.read_table('../data/interim/y_val.parquet').to_pandas()['target']
    y_test = pq.read_table('../data/interim/y_test.parquet').to_pandas()['target']
    
    return X_train, X_val, X_test, y_train, y_val, y_test

In [209]:
# Load data
X_train, X_val, X_test, y_train, y_val, y_test = load_data()
logger.info(f"Data loaded. Training set shape: {X_train.shape}")

[32m2024-08-08 12:46:40.796[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m3[0m - [1mData loaded. Training set shape: (264, 10)[0m


In [210]:
def create_pipeline(X):
    """
    Create a pipeline (compatible with the MS RAI Toolkit).

    This function creates a scikit-learn pipeline that preprocesses both numeric
    and categorical features, and includes a Random Forest Regressor as the final
    estimator. The pipeline is designed to be compatible with the Responsible AI
    (RAI) Toolkit.

    Args:
        X (pandas.DataFrame): The input features DataFrame.

    Returns:
        sklearn.pipeline.Pipeline: A scikit-learn pipeline that includes
        preprocessing steps for both numeric and categorical features,
        followed by a Random Forest Regressor.

    The pipeline includes the following steps:
    1. Preprocessing:
        a. For numeric features:
            - Imputation of missing values with median strategy
            - Standard scaling
            - Discretization using K-bins with 10 bins
        b. For categorical features:
            - Imputation of missing values with 'missing' as the fill value
            - One-hot encoding
    2. Random Forest Regressor with 1000 estimators

    Note:
        The function automatically detects numeric and categorical columns
        in the input DataFrame.
    """

    numeric_features = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
    categorical_features = X.select_dtypes(include=['object', 'bool']).columns.tolist()
    
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
        #('discretizer', KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='quantile'))
    ])
    
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])
    
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categorical_features)
        ])
    
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', RandomForestRegressor(n_estimators=1000, random_state=42))
    ])
    
    return pipeline

In [211]:
# Create and train the model
model = create_pipeline(X_train)
model.fit(X_train, y_train)
logger.info("Model trained")

[32m2024-08-08 12:46:48.084[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m4[0m - [1mModel trained[0m


In [212]:
def evaluate_model(model, X, y, dataset_name):
    """Evaluate the model and log the results."""
    y_pred = model.predict(X)
    mse = mean_squared_error(y, y_pred)
    r2 = r2_score(y, y_pred)
    logger.info(f"{dataset_name} - MSE: {mse:.4f}, R2: {r2:.4f}")

In [213]:
# Evaluate the model
evaluate_model(model, X_train, y_train, "Training")
evaluate_model(model, X_val, y_val, "Validation")
evaluate_model(model, X_test, y_test, "Test")

[32m2024-08-08 12:46:48.882[0m | [1mINFO    [0m | [36m__main__[0m:[36mevaluate_model[0m:[36m6[0m - [1mTraining - MSE: 489.7641, R2: 0.9212[0m
[32m2024-08-08 12:46:49.154[0m | [1mINFO    [0m | [36m__main__[0m:[36mevaluate_model[0m:[36m6[0m - [1mValidation - MSE: 2933.4075, R2: 0.4646[0m


[32m2024-08-08 12:46:49.371[0m | [1mINFO    [0m | [36m__main__[0m:[36mevaluate_model[0m:[36m6[0m - [1mTest - MSE: 3044.6671, R2: 0.4253[0m


In [214]:
# Get feature importance
feature_names = X_train.columns.tolist()
feature_importance = pd.DataFrame({
    'feature': feature_names,
    'importance': model.named_steps['regressor'].feature_importances_
}).sort_values('importance', ascending=False)

logger.info("Top 5 important features:")
logger.info(feature_importance.head().to_string(index=False))

logger.success("Model training and evaluation complete.")

[32m2024-08-08 12:46:49.676[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m8[0m - [1mTop 5 important features:[0m
[32m2024-08-08 12:46:49.687[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m9[0m - [1mfeature  importance
    bmi    0.354923
     s5    0.238905
     bp    0.087345
     s6    0.066214
    age    0.062073[0m
[32m2024-08-08 12:46:49.700[0m | [32m[1mSUCCESS [0m | [36m__main__[0m:[36m<module>[0m:[36m11[0m - [32m[1mModel training and evaluation complete.[0m


In [215]:
from responsibleai import RAIInsights
from raiwidgets import ResponsibleAIDashboard
from responsibleai.feature_metadata import FeatureMetadata

In [216]:
def prepare_rai_data(model, X_test, y_test):
    """
    Prepare data for RAI Insights.
    
    Args:
    model: Trained model
    X_test: Test features
    y_test: Test target
    
    Returns:
    Tuple of (model, X_test, y_test) prepared for RAI Insights
    """
    print("Shape of X_test:", X_test.shape)
    print("Shape of y_test:", y_test.shape)
    
    # Ensure X_test is a DataFrame
    if not isinstance(X_test, pd.DataFrame):
        X_test = pd.DataFrame(X_test)
    
    # Ensure y_test is a DataFrame
    if not isinstance(y_test, pd.DataFrame):
        y_test = pd.DataFrame(y_test, columns=['target'])
    
    print("Shape of y_test after conversion:", y_test.shape)
    
    # Reset index of X_test and y_test
    X_test = X_test.reset_index(drop=True)
    y_test = y_test.reset_index(drop=True)
    
    # Combine X_test and y_test into a single DataFrame
    test_df = pd.concat([X_test, y_test], axis=1)
    
    print("Shape of final test_df:", test_df.shape)
    print("Columns in test_df:", test_df.columns)
    print("First few rows of 'target' column:")
    print(test_df['target'].head())
    
    return model, test_df, 'target'

In [217]:
def create_rai_insights(model, X_test, y_test, task_type='regression'):
    """
    Create and compute comprehensive RAI Insights.
    
    Args:
    model: Trained model
    X_test: Test features
    y_test: Test target
    task_type: 'regression' or 'classification'
    
    Returns:
    RAIInsights object
    """
    # Prepare data for RAI Insights
    model, test_df, target_column = prepare_rai_data(model, X_test, y_test)
    
    print("Shape of test_df after prepare_rai_data:", test_df.shape)
    print("Columns in test_df:", test_df.columns)
    print("First few rows of 'target' column:")
    print(test_df[target_column].head())
    
    # Define feature metadata
    feature_metadata = FeatureMetadata()
    # Add your categorical features here, for example:
    feature_metadata.add_categorical_feature('bmi')
    
    # Create RAI Insights object
    rai_insights = RAIInsights(model, test_df, test_df, target_column,
                               task_type=task_type,
                               feature_metadata=feature_metadata)
    
    # Add explainer
    rai_insights.explainer.add()
    
    # Add error analysis
    rai_insights.error_analysis.add()
    
    # Add causal analysis
    # Note: This requires specific treatment features
    rai_insights.causal.add(treatment_features=['bmi', 'bp', 's5'])
    
    # Add counterfactuals
    if task_type == 'regression':
        # For regression, we need to specify a desired range
        y_min, y_max = test_df[target_column].min(), test_df[target_column].max()
        y_range = y_max - y_min
        desired_range = [y_min - 0.1 * y_range, y_max + 0.1 * y_range]  # Extend slightly beyond observed range
        rai_insights.counterfactual.add(total_CFs=10, desired_range=desired_range)
    else:
        rai_insights.counterfactual.add(total_CFs=10)
    
    # Add data balance measures only for classification tasks
    if task_type == 'classification':
        cols_of_interest = [col for col in test_df.columns if col != target_column]
        rai_insights.data_balance.add(cols_of_interest=cols_of_interest)
    
    # Compute insights
    rai_insights.compute()
    
    return rai_insights

In [218]:
# Create and compute RAI Insights
rai_insights = create_rai_insights(model, X_test, y_test)
logger.info("RAI Insights computed")

# Display the RAI Dashboard
ResponsibleAIDashboard(rai_insights)
logger.success("RAI Dashboard launched. Model training and analysis complete.")

Shape of X_test: (89, 10)
Shape of y_test: (89,)
Shape of y_test after conversion: (89, 1)
Shape of final test_df: (89, 11)
Columns in test_df: Index(['age', 'sex', 'bmi', 'bp', 's1', 's2', 's3', 's4', 's5', 's6',
       'target'],
      dtype='object')
First few rows of 'target' column:
0    219.0
1     70.0
2    202.0
3    230.0
4    111.0
Name: target, dtype: float64
Shape of test_df after prepare_rai_data: (89, 11)
Columns in test_df: Index(['age', 'sex', 'bmi', 'bp', 's1', 's2', 's3', 's4', 's5', 's6',
       'target'],
      dtype='object')
First few rows of 'target' column:
0    219.0
1     70.0
2    202.0
3    230.0
4    111.0
Name: target, dtype: float64


Causal Effects
Current Status: Generating Causal Effects.
Current Status: Finished generating causal effects.
Time taken: 0.0 min 14.611036987000034 sec
Counterfactual
Current Status: Generating 10 counterfactuals for 89 samples


100%|██████████| 89/89 [06:08<00:00,  4.14s/it]


Current Status: Generated 10 counterfactuals for 89 samples.
Time taken: 6.0 min 13.077335656999821 sec
Error Analysis
Current Status: Generating error analysis reports.
Current Status: Finished generating error analysis reports.
Time taken: 0.0 min 1.5836292359999788 sec
Explanations
Current Status: Explaining 10 features
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.007014 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 222
[LightGBM] [Info] Number of data points in the train set: 89, number of used features: 10
[LightGBM] [Info] Start training from score 145.304696


[32m2024-08-08 12:53:21.667[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m3[0m - [1mRAI Insights computed[0m


Current Status: Explained 10 features.
Time taken: 0.0 min 1.4240740999998707 sec


[32m2024-08-08 12:53:25.021[0m | [32m[1mSUCCESS [0m | [36m__main__[0m:[36m<module>[0m:[36m7[0m - [32m[1mRAI Dashboard launched. Model training and analysis complete.[0m


ResponsibleAI started at http://localhost:8705
