In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler

import dice_ml
from dice_ml.utils import helpers 

import pandas as pd
from sklearn.datasets import fetch_openml

import json

# Short dataset identifier; it is interpolated into the output file names
# (the JSON export of test rows and the pickled counterfactuals).
datasetKey = "income"

In [None]:
import pandas as pd

# Load the Adult Income dataset through DiCE's helper module.
df = helpers.load_adult_income_dataset()

# DataFrame.info() writes its summary straight to stdout and returns None,
# so wrapping it in print() would append a stray "None" line to the output.
df.info()
print(df.head())

In [None]:
# Count the null entries in every column.
missing_values = df.isna().sum()

# Report only the columns that actually contain nulls.
columns_with_gaps = missing_values.loc[missing_values.gt(0)]
print("Columns with missing values:\n", columns_with_gaps)

In [None]:
# Display the first rows of the dataset as the cell output.
df.head()

In [None]:
# Wrap the raw frame in DiCE's data interface: 'age' and 'hours_per_week' are
# declared as the continuous features and 'income' as the outcome column.
d = dice_ml.Data(
    dataframe=df,
    continuous_features=["age", "hours_per_week"],
    outcome_name="income",
)

In [None]:
# Separate the label column from the feature columns.
target = df["income"]
datasetX = df.drop(columns="income")

# Hold out 20% of the rows for testing. The split is stratified on the label
# and uses a fixed random_state so it is repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    datasetX,
    target,
    test_size=0.2,
    random_state=0,
    stratify=target,
)

In [None]:
# Look at some example rows: gender 'Female', age 23, education 'HS-grad',
# 40 hours per week and marital status 'Single'.
young_female_condition = (
    df["gender"].eq("Female")
    & df["age"].eq(23)
    & df["education"].eq("HS-grad")
    & df["hours_per_week"].eq(40)
    & df["marital_status"].eq("Single")
)
df.loc[young_female_condition]

In [None]:
# Work on a copy of the first 20 test rows so x_test itself is not modified.
x_test_first_20 = x_test[:20].copy()
x_test_indices = x_test_first_20.index

print(f"Indices in dataset: {x_test_first_20.index.tolist()}")

# Attach the labels, looked up by index, as a "target" column.
x_test_first_20["target"] = y_test.loc[x_test_indices].values

# Alias (same object, not a copy) under the name used by later cells.
x_test_first_20_with_target = x_test_first_20

# Serialise the rows as a list of records under a single top-level key.
json_filename = f"{datasetKey}_x_test_with_target.json"
data_dict = {
    "originalDataset": x_test_first_20_with_target.to_dict(orient="records")
}

with open(json_filename, "w") as json_file:
    json.dump(data_dict, json_file, indent=4)

print(f"Saved {len(x_test_first_20)} rows to '{json_filename}' including the target values.")

In [None]:
# Continuous columns are scaled; every other feature column is one-hot encoded.
numerical = ['age', 'hours_per_week']
categorical = x_train.columns.difference(numerical)

# Preprocessing pipelines for the numeric and the categorical columns.
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())])

# handle_unknown='ignore' keeps transform from failing on categories that
# were not seen while fitting.
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

transformations = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical),
        ('cat', categorical_transformer, categorical)])

# Append the classifier to the preprocessing step to get a full prediction
# pipeline. The forest is seeded (matching the seeded train/test split) so that
# re-running the notebook trains the same model and yields the same predictions.
clf = Pipeline(steps=[('preprocessor', transformations),
                      ('classifier', RandomForestClassifier(random_state=0))])
model = clf.fit(x_train, y_train)

In [None]:
# Hand the fitted pipeline to DiCE through its model interface, selecting the
# sklearn backend.
backend = "sklearn"
m = dice_ml.Model(backend=backend, model=model)

In [None]:
# Initialize the DiCE explainer from the data interface `d` and the model
# interface `m`, selecting the "random" counterfactual generation method.
exp_random = dice_ml.Dice(d, m, method="random")

In [None]:
# Display the selected test rows together with their "target" column.
x_test_first_20_with_target

In [None]:
# The pipeline was fitted on feature columns only, so drop the added "target"
# column before using the rows as query instances.
x_test_first_20_features = x_test_first_20_with_target.drop('target', axis=1)

# Request 25 counterfactuals of the opposite class for each query row.
# The "random" method samples feature values, so a fixed random_seed is passed
# to make the generated (and subsequently pickled) counterfactuals repeatable.
dice_exp_random = exp_random.generate_counterfactuals(
    x_test_first_20_features,
    total_CFs=25,
    desired_class="opposite",
    random_seed=0,
    verbose=False,
)

In [None]:
# Render the query rows and their counterfactuals as dataframes;
# show_only_changes=True asks DiCE to show only the feature values that changed.
dice_exp_random.visualize_as_dataframe(show_only_changes=True)

In [None]:
import pickle

def save_pickle(dice_exp_object, datasetKey):
    """Pickle an object to ``dice_exp_<datasetKey>.pkl`` in the working directory.

    Args:
        dice_exp_object: The object to serialise (any picklable value).
        datasetKey: Identifier interpolated into the output file name.

    Returns:
        The name of the file that was written.
    """
    filename = f"dice_exp_{datasetKey}.pkl"

    # Binary mode is required for pickle output.
    with open(filename, "wb") as handle:
        pickle.dump(dice_exp_object, handle)

    print(f"{datasetKey} counterfactuals have been pickled and saved as '{filename}'")
    return filename

# Persist the generated counterfactuals; the returned file name is the cell output.
save_pickle(dice_exp_random, datasetKey)