In [1]:
from tqdm import tqdm
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from lightgbm import LGBMClassifier
from shap import TreeExplainer
# import matplotlib.pyplot as plt

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from aprofs import code, utils

In [3]:
import pandas as pd

# Load the insurance dataset from "insurance.csv"; the path is relative, so it
# is resolved against the notebook's current working directory.
data = pd.read_csv("insurance.csv")

In [4]:
# Show the loaded frame (the rendered output is truncated by pandas).
data

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.900,0,yes,southwest,16884.92400
1,18,male,33.770,1,no,southeast,1725.55230
2,28,male,33.000,3,no,southeast,4449.46200
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.880,0,no,northwest,3866.85520
...,...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest,10600.54830
1334,18,female,31.920,0,no,northeast,2205.98080
1335,18,female,36.850,0,no,southeast,1629.83350
1336,21,female,25.800,0,no,southwest,2007.94500


In [5]:
# Convert every string (object-dtype) column to the pandas "category" dtype.
for col in data.select_dtypes(include="object").columns:
    data[col] = data[col].astype("category")

# Encode the target as a 0/1 flag and drop the raw "sex" column.
# The guard keeps this cell re-runnable: without it, a second execution raises
# KeyError because "sex" has already been dropped from `data`.
if "sex" in data.columns:
    data["is_female"] = (data["sex"] == "female").astype(int)
    data = data.drop(columns=["sex"])

# Display the data
data

Unnamed: 0,age,bmi,children,smoker,region,charges,is_female
0,19,27.900,0,yes,southwest,16884.92400,1
1,18,33.770,1,no,southeast,1725.55230,0
2,28,33.000,3,no,southeast,4449.46200,0
3,33,22.705,0,no,northwest,21984.47061,0
4,32,28.880,0,no,northwest,3866.85520,0
...,...,...,...,...,...,...,...
1333,50,30.970,3,no,northwest,10600.54830,0
1334,18,31.920,0,no,northeast,2205.98080,1
1335,18,36.850,0,no,southeast,1629.83350,1
1336,21,25.800,0,no,southwest,2007.94500,1


In [6]:
# Modelling configuration: the binary label, the treatment column, and the
# list of model inputs (note that the treatment column is also a feature).
target = "is_female"  # target
T = "charges"  # Treatment as price change
features = ["age", "bmi", "children", "smoker", "region", "charges"]

In [7]:
from sklearn.model_selection import train_test_split
seed = 42  # random state reused by the split and by both models

# Separate the model inputs from the label.
X = data[features]
y = data[target]

# Hold-out split stratified on the label; no test size is passed, so the
# library default applies.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    stratify=y,
    random_state=seed,
)

In [8]:
from lightgbm import LGBMClassifier
import lightgbm as lgb

# One constraint flag per feature: 1 for the treatment column T, 0 otherwise.
monotone_constraints = [int(col == T) for col in features]

# Early stopping with a patience of 10 rounds, with evaluation logging disabled.
# This list is reused when the reduced model is fitted later on.
callbacks = [
    lgb.early_stopping(10, verbose=0),
    lgb.log_evaluation(period=0),
]

# Fit the baseline classifier on all features, validating on the hold-out set.
model = LGBMClassifier(
    n_estimators=100,
    monotone_constraints=monotone_constraints,
    random_state=seed,
    verbose=-1,
)
model.fit(
    X_train,
    y_train,
    eval_set=[(X_valid, y_valid)],
    callbacks=callbacks,
)

# Class probabilities for the validation rows.
pred_valid = model.predict_proba(X_valid)

In [9]:
# Validation AUC of the baseline model, using the second probability column.
auc_original = roc_auc_score(y_valid, pred_valid[:, 1])
print(f"The original prediction has an AUC of {auc_original}")

The original prediction has an AUC of 0.5312254936907392


In [10]:
# Tabulate the fitted model's feature importances, most important first.
feature_names = model.feature_name_
feature_importance_df = pd.DataFrame(
    {"Feature": feature_names, "Importance": model.feature_importances_}
).sort_values("Importance", ascending=False)

print(feature_importance_df)

    Feature  Importance
1       bmi          56
0       age          49
2  children          20
5   charges          19
3    smoker           5
4    region           1


In [11]:
from shap import TreeExplainer

# Build a SHAP tree explainer for the baseline model and compute SHAP values
# for the validation rows.
shap_explainer = TreeExplainer(model)
shap_valid = shap_explainer.shap_values(X_valid)
# NOTE(review): expected_value is read after shap_values() has run; keep this
# order unless it is confirmed that the attribute does not depend on that call.
shap_expected_value = shap_explainer.expected_value



In [12]:
# Wrap the raw SHAP output in a DataFrame that shares the validation set's
# row index and column labels.
shaps_values = pd.DataFrame(
    data=shap_valid,
    index=X_valid.index,
    columns=X_valid.columns,
)

In [13]:
# Show the per-row, per-feature SHAP values computed directly with shap.
shaps_values

Unnamed: 0,age,bmi,children,smoker,region,charges
563,-0.025771,-0.000257,0.035974,0.050899,-0.010099,0.036391
1327,-0.058785,-0.087476,-0.002521,0.038555,0.006270,0.024491
1114,0.169415,-0.033496,0.059202,0.027443,-0.003710,-0.068301
678,0.011816,0.015024,0.032997,0.056120,-0.006188,0.070275
490,0.249472,0.100665,0.048708,0.054452,0.001997,-0.370703
...,...,...,...,...,...,...
1225,-0.084233,-0.104990,0.025577,0.031098,-0.010582,-0.015548
956,-0.057537,-0.084549,0.006919,-0.151000,0.000211,0.067590
189,-0.041293,-0.089155,-0.030936,0.027893,-0.004784,-0.029952
265,0.027805,-0.039058,-0.012704,-0.108263,-0.002953,0.095409


In [14]:
# Wrap the validation features and labels in an Aprofs object.
apos_test = code.Aprofs(X_valid, y_valid)

In [15]:
# Show the object's repr; at this point it reports that Shapley values have
# not been calculated yet.
apos_test

Aprofs(current_data shape =(335, 6), target_column =[0 1], link=logistic)
  Shapley values have not been calculated!

In [16]:
# Compute and store the SHAP values for the baseline model on the Aprofs object.
apos_test.calculate_shaps(model)




In [17]:
# Show the repr again; it now includes shap_mean and the shap_values shape.
apos_test

Aprofs(current_data shape =(335, 6), target_column =[0 1], link=logistic), shap_mean=0.012810903840487525, shap_values.shape=(335, 6)

In [18]:
# SHAP values stored on the Aprofs object; the displayed rows match the
# `shaps_values` frame built directly with shap earlier in the notebook.
apos_test.shap_values

Unnamed: 0,age,bmi,children,smoker,region,charges
563,-0.025771,-0.000257,0.035974,0.050899,-0.010099,0.036391
1327,-0.058785,-0.087476,-0.002521,0.038555,0.006270,0.024491
1114,0.169415,-0.033496,0.059202,0.027443,-0.003710,-0.068301
678,0.011816,0.015024,0.032997,0.056120,-0.006188,0.070275
490,0.249472,0.100665,0.048708,0.054452,0.001997,-0.370703
...,...,...,...,...,...,...
1225,-0.084233,-0.104990,0.025577,0.031098,-0.010582,-0.015548
956,-0.057537,-0.084549,0.006919,-0.151000,0.000211,0.067590
189,-0.041293,-0.089155,-0.030936,0.027893,-0.004784,-0.029952
265,0.027805,-0.039058,-0.012704,-0.108263,-0.002953,0.095409


In [19]:
# Performance score obtained when all features are used.
# NOTE(review): presumably an AUC (the value is close to the model AUC printed
# earlier) — confirm against the aprofs documentation.
perf = apos_test.get_feature_performance(features)
perf

0.5315819490981678

In [20]:
# Search feature combinations and keep the best-scoring one; the progress
# output reports 63 combinations for these 6 features and the winner's AUC.
best_solution = apos_test.brute_force_selection(features)
# The result is displayed as a tuple; keep a list copy for column indexing.
best_solution_feature = list(best_solution)

Processing 63 combinations: 100%|██████████| 63/63 [00:00<00:00, 530.46it/s]

the best list is ('smoker', 'region', 'charges') with auc 0.5942111641833606





In [21]:
#best_solution_feature= ['bmi','age','children']

In [22]:
# Constraint flags for the reduced feature set: 1 for the treatment column T,
# 0 for every other selected column.
selected_columns = X_train[best_solution_feature].columns
monotone_constraints = [int(col == T) for col in selected_columns]
monotone_constraints

[0, 0, 1]

In [23]:
# Rebuild the constraint flags so this cell does not rely on the preview cell.
X_train_best = X_train[best_solution_feature]
X_valid_best = X_valid[best_solution_feature]
monotone_constraints = [int(col == T) for col in X_train_best.columns]

# Refit the same classifier configuration on the selected features only,
# reusing the callbacks defined for the baseline model.
model_best = LGBMClassifier(
    n_estimators=100,
    monotone_constraints=monotone_constraints,
    random_state=seed,
    verbose=-1,
)
model_best.fit(
    X_train_best,
    y_train,
    eval_set=[(X_valid_best, y_valid)],
    callbacks=callbacks,
)

# Class probabilities of the reduced model for the validation rows.
pred_valid_best = model_best.predict_proba(X_valid_best)

In [24]:
# Validation AUC of the reduced model, using the second probability column.
auc_best = roc_auc_score(y_valid, pred_valid_best[:, 1])
print(f"The new prediction have an AUC of {auc_best}")

The new prediction have an AUC of 0.5496542382547943


In [25]:
# Second Aprofs object restricted to the selected features, with SHAP values
# computed from the reduced model.
apos_test_best = code.Aprofs(X_valid[best_solution_feature], y_valid)
apos_test_best.calculate_shaps(model_best)



In [26]:
# Show the repr of the reduced-feature Aprofs object.
apos_test_best

Aprofs(current_data shape =(335, 3), target_column =[0 1], link=logistic), shap_mean=-0.028112216240915356, shap_values.shape=(335, 3)

In [27]:
# Recall which feature combination the brute-force search selected.
best_solution

('smoker', 'region', 'charges')

In [28]:
# Per-feature p-values from the full-feature Aprofs object; the result is a
# frame with "Feature" and "p-value_shap" columns.
# NOTE(review): how the p-value is computed is not visible here — see aprofs.
shap_p_values = apos_test.get_shap_p_value(features=features)
shap_p_values

100%|██████████| 6/6 [00:03<00:00,  1.69it/s]


Unnamed: 0,Feature,p-value_shap
0,age,0.356
1,bmi,0.766
2,children,0.93
3,smoker,0.074
4,region,0.148
5,charges,0.0


In [29]:
# Join the SHAP p-values with the model's feature importances on the feature
# name, ordered from most to least important.
merged_df_shap = (
    shap_p_values
    .merge(feature_importance_df, on="Feature")
    .sort_values("Importance", ascending=False)
)

# Define a function to apply color formatting
def color_format(val):
    """Return the CSS background rule for a single p-value cell.

    Values below 0.05 map to green, values above 0.3 map to red, and any
    other value (for which both comparisons are false) maps to gray.
    """
    if val < 0.05:
        return "background-color: green"
    if val > 0.3:
        return "background-color: red"
    return "background-color: gray"


# Apply color formatting to the p-value column only.
# Styler.applymap is deprecated since pandas 2.1 (calling it emits a
# FutureWarning) in favour of Styler.map; fall back to applymap on older
# pandas versions where Styler.map does not exist.
styler = merged_df_shap.style
apply_elementwise = getattr(styler, "map", None) or styler.applymap
styled_df_shap = apply_elementwise(color_format, subset=["p-value_shap"])

# Display the styled dataframe
styled_df_shap

  styled_df_shap = merged_df_shap.style.applymap(color_format, subset=["p-value_shap"])


Unnamed: 0,Feature,p-value_shap,Importance
1,bmi,0.766,56
0,age,0.356,49
2,children,0.93,20
5,charges,0.0,19
3,smoker,0.074,5
4,region,0.148,1


In [30]:
# Visualise the "children" feature on its own, using 10 bins.
apos_test.visualize_feature(main_feature="children",other_features=None, nbins=10)

In [31]:
# Visualise the "smoker" feature with the method's default settings.
apos_test.visualize_feature(main_feature="smoker")

In [32]:
# Visualise the "charges" feature with 10 bins and type_bins="cut".
# NOTE(review): "cut" presumably selects equal-width bins (as in pandas.cut) — confirm.
apos_test.visualize_feature(main_feature="charges",other_features=None, nbins=10, type_bins="cut")

In [33]:
# Per-row comparison table for "charges" between the full-feature object and
# the reduced-feature object (SHAP values and their *_prob counterparts).
utils.temp_plot_compare_data(apos_test,apos_test_best,"charges")

Unnamed: 0,target,charges,charges_shap,charges_shap_prob,charges_shap_compare,charges_shap_prob_compare,shap_model,shap_prob_model
563,0,9058.7303,0.049201,0.512298,0.031309,0.507827,0.192122,0.547883
1327,0,9377.9047,0.037302,0.509324,0.031309,0.507827,0.192122,0.547883
1114,0,2396.0959,-0.055490,0.486131,-0.012833,0.496792,-0.015292,0.496177
678,0,12363.5470,0.083085,0.520759,0.120951,0.530201,0.335903,0.583195
490,1,1748.7740,-0.357892,0.411470,-0.538483,0.368541,-0.345065,0.414580
...,...,...,...,...,...,...,...,...
1225,1,4795.6568,-0.002737,0.499316,0.031309,0.507827,0.192122,0.547883
956,0,41999.5200,0.080401,0.520089,0.120168,0.530006,-0.220810,0.445021
189,1,4922.9159,-0.017141,0.495715,0.041131,0.510281,0.053149,0.513284
265,0,46151.1245,0.108220,0.527029,0.124845,0.531171,-0.213057,0.446936


In [35]:
# Compare the "charges" feature between the full-feature and reduced-feature
# Aprofs objects.
apos_test.compare_feature(apos_test_best,"charges")