In [7]:
!pip install dice_ml

Collecting dice_ml
  Downloading dice_ml-0.11-py3-none-any.whl.metadata (20 kB)
Collecting pandas<2.0.0 (from dice_ml)
  Using cached pandas-1.5.3-cp311-cp311-macosx_11_0_arm64.whl.metadata (11 kB)
Collecting raiutils>=0.4.0 (from dice_ml)
  Downloading raiutils-0.4.2-py3-none-any.whl.metadata (1.4 kB)
Downloading dice_ml-0.11-py3-none-any.whl (2.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m15.8 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hUsing cached pandas-1.5.3-cp311-cp311-macosx_11_0_arm64.whl (10.8 MB)
Downloading raiutils-0.4.2-py3-none-any.whl (17 kB)
Installing collected packages: pandas, raiutils, dice_ml
  Attempting uninstall: pandas
    Found existing installation: pandas 2.1.4
    Uninstalling pandas-2.1.4:
      Successfully uninstalled pandas-2.1.4
Successfully installed dice_ml-0.11 pandas-1.5.3 raiutils-0.4.2


In [19]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, r2_score
from xgboost import XGBRegressor
import dice_ml
from dice_ml import Dice

In [26]:
# Step 1: Load and prepare the dataset
df = pd.read_csv("insurance.csv")

# Step 2: Define features and target
features = ['age', 'sex', 'bmi', 'children', 'smoker', 'region']
target = 'charges'

# Fail early with a readable message if the CSV does not have the expected schema,
# instead of a KeyError from the column selection below.
missing_columns = [col for col in features + [target] if col not in df.columns]
assert not missing_columns, f"insurance.csv is missing expected columns: {missing_columns}"

X = df[features]
y = df[target]

# Step 3: Train/test split (fixed random_state so the split is reproducible)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 4: Preprocessing pipeline
# Categorical columns are imputed with a constant label and one-hot encoded;
# numeric columns are median-imputed and standardised.
cat_features = ['sex', 'smoker', 'region']
num_features = ['age', 'bmi', 'children']

preprocessor = ColumnTransformer([
    ('cat', Pipeline([
        ('imputer', SimpleImputer(strategy='constant', fill_value='Missing', add_indicator=True)),
        # handle_unknown='ignore': categories unseen during fit encode to all zeros
        ('encoder', OneHotEncoder(sparse_output=False, handle_unknown='ignore'))
    ]), cat_features),
    ('num', Pipeline([
        ('imputer', SimpleImputer(strategy='median', add_indicator=True)),
        ('scaler', StandardScaler())
    ]), num_features)
])


In [40]:
# Step 4: Define and train model
model_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', XGBRegressor(n_estimators=200, learning_rate=0.1, max_depth=4, random_state=42))
])
model_pipeline.fit(X_train, y_train)

# Step 5: Evaluate model
y_pred = model_pipeline.predict(X_test)
# Take the square root of the MSE explicitly: the `squared=False` shortcut of
# mean_squared_error is deprecated in recent scikit-learn releases and later
# removed, while this form works on every version.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"RMSE: {rmse:.2f}")
print(f"R^2: {r2_score(y_test, y_pred):.4f}")

# Step 6: Prepare data for DiCE
# DiCE is handed the bare regressor below, so it has to work in the
# preprocessed (one-hot encoded / scaled) feature space.
X_train_transformed = pd.DataFrame(
    preprocessor.transform(X_train),
    columns=preprocessor.get_feature_names_out()
)
# The transformed frame has a fresh 0..n-1 index, so drop y_train's original
# index to keep rows aligned.
X_train_transformed['charges'] = y_train.reset_index(drop=True)

# Step 7: Define DiCE data and model
# NOTE: every transformed column (including the one-hot indicator columns) is
# declared continuous here, so counterfactual values are not restricted to
# valid categories or whole numbers.
data_dice = dice_ml.Data(
    dataframe=X_train_transformed,
    continuous_features=[col for col in X_train_transformed.columns if col != 'charges'],
    outcome_name='charges'
)

model_dice = dice_ml.Model(
    model=model_pipeline.named_steps['regressor'],
    backend='sklearn',
    model_type='regressor'
)

# Step 8: Generate a query instance
query_instance = pd.DataFrame([{
    'age': 35,
    'sex': 'female',
    'bmi': 33.0,
    'children': 2,
    'smoker': 'yes',
    'region': 'southeast'
}])

# Step 9: Apply preprocessing to query
query_transformed = preprocessor.transform(query_instance)
query_df = pd.DataFrame(query_transformed, columns=preprocessor.get_feature_names_out())

# Step 10: Generate counterfactuals whose predicted charge falls inside
# desired_range (for a regressor the target is a range, not a class)
exp = Dice(data_dice, model_dice, method='random')
dice_exp = exp.generate_counterfactuals(
    query_df,
    total_CFs=3,
    desired_range=[0, 10000],  # target: predicted charges between 0 and 10,000
    features_to_vary='all',
    random_seed=42  # the 'random' method samples candidates; seed it so reruns match
)

# Step 11: Get DataFrame of counterfactuals
cf_df = dice_exp.cf_examples_list[0].final_cfs_df
# Step 12: Inverse transform to human-readable format
def inverse_transform_counterfactuals(counterfactuals_df, preprocessor, cat_features, num_features):
    """Map counterfactuals from the preprocessed feature space back to raw features.

    Parameters
    ----------
    counterfactuals_df : pandas.DataFrame
        Counterfactual rows whose columns use the ``cat__<encoded name>`` and
        ``num__<feature>`` naming; may also contain a ``charges`` column.
    preprocessor : fitted transformer
        Must expose ``named_transformers_['cat']`` and ``['num']``, whose
        ``named_steps`` contain the fitted ``'encoder'`` and ``'scaler'``.
    cat_features : list of str
        Raw categorical feature names, in the order the encoder was fitted on.
    num_features : list of str
        Raw numeric feature names, in the order the scaler was fitted on.

    Returns
    -------
    pandas.DataFrame
        One row per counterfactual with the categorical columns first, then the
        numeric columns, then ``charges`` when the input had it. The result
        uses a fresh 0..n-1 index.
    """
    # Only the encoder and the scaler are undone. Imputation is not invertible,
    # and the imputer's inverse_transform expects the missing-indicator columns
    # that are not part of the slices taken below, so it is not called.
    ohe = preprocessor.named_transformers_['cat'].named_steps['encoder']
    scaler = preprocessor.named_transformers_['num'].named_steps['scaler']

    # Rebuild the prefixed column names used in the transformed frame
    cat_columns_prefixed = [f'cat__{col}' for col in ohe.get_feature_names_out(cat_features)]
    num_columns_prefixed = [f'num__{col}' for col in num_features]

    # Slice data
    encoded_cat_df = counterfactuals_df[cat_columns_prefixed]
    scaled_num_df = counterfactuals_df[num_columns_prefixed]

    # One-hot block -> category labels; scaled block -> original units
    recovered_cat_df = pd.DataFrame(ohe.inverse_transform(encoded_cat_df), columns=cat_features)
    recovered_num_df = pd.DataFrame(scaler.inverse_transform(scaled_num_df), columns=num_features)

    # Both frames carry a default index, so a column-wise concat lines rows up
    readable_df = pd.concat([recovered_cat_df, recovered_num_df], axis=1)

    # Carry the outcome over positionally (.values) because the input index
    # need not match the fresh index of readable_df
    if 'charges' in counterfactuals_df.columns:
        readable_df['charges'] = counterfactuals_df['charges'].values

    return readable_df

# Predict through the full pipeline so the raw query gets the same
# preprocessing that was applied during training.
original_pred = model_pipeline.predict(query_instance)[0]

print("\nOriginal Input with Predicted Charge:")
original = query_instance.copy()
# Convert to a built-in float before rounding: with np.round on the model's
# scalar the value still printed with six decimals (e.g. 40052.820312),
# presumably because the prediction is a float32.
original['predicted_charge'] = round(float(original_pred), 2)
print(original)
readable_cf_df = inverse_transform_counterfactuals(cf_df, preprocessor, cat_features, num_features)
print("\nReadable Counterfactual Explanations:")
print(readable_cf_df)

RMSE: 4480.18
R^2: 0.8707


100%|██████████| 1/1 [00:00<00:00, 16.08it/s]


Original Input with Predicted Charge:
   age     sex   bmi  children smoker     region  predicted_charge
0   35  female  33.0         2    yes  southeast      40052.820312

Readable Counterfactual Explanations:
      sex smoker     region   age        bmi  children  charges
0  female     no  southeast  35.0  37.822377  2.000000   7501.0
1  female     no  southeast  35.0  33.000000  3.740609   7198.0
2  female     no  southeast  35.0  33.000000  2.000000   7517.0



