# Dataset Properties

The following cells can be used to compute feature importances, default values, and other general dataset properties.

Basic setup:

In [4]:
from setup import *
from helpers import *

# Load the cleaned dataset (helper presumably provided by the star imports above).
df = df_fetch_and_cleanup()

# Split the frame into the label vector and the feature matrix, both as NumPy arrays.
y_train = df[label_col].to_numpy()
X_train = df.drop(columns=label_col).to_numpy()

# Feature names: every column except the label, in the frame's column order,
# so that columns[i] names column i of X_train.
columns = [*df.columns]
columns.remove(label_col)

print("Loaded data from", data_filepath)

Loaded data from Datasets/diabetes_binned.csv


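Compute the gain-based importance of the individual features (XGBoost's `gain` importance type, i.e. the average gain of the splits that use each feature):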

In [2]:
import xgboost as xgb

# Fit a gradient-boosted classifier with library-default hyperparameters.
xgb_cl = xgb.XGBClassifier()
xgb_cl.fit(X_train, y_train)

# X_train is a bare NumPy array, so the column names are attached to the fitted
# booster afterwards; this makes the scores below keyed by feature name.
booster = xgb_cl.get_booster()
booster.feature_names = columns

# 'gain' importance per feature. Per the XGBoost docs, features that were never
# used in a split are omitted, so this dict may have fewer entries than `columns`.
importances_gain = booster.get_score(importance_type='gain')

# Rank the (feature, gain) pairs from most to least important.
sorted_gain_importances = sorted(
    importances_gain.items(),
    key=lambda name_and_gain: name_and_gain[1],
    reverse=True,
)
print("sorted importances_gain", sorted_gain_importances, sep="\n")

sorted importances_gain
[('HighBP', 185.56321716308594), ('GenHlth', 45.60859298706055), ('HighChol', 23.22873878479004), ('CholCheck', 15.0233793258667), ('HvyAlcoholConsump', 12.258978843688965), ('Age', 10.940640449523926), ('HeartDiseaseorAttack', 9.211546897888184), ('BMI', 8.456352233886719), ('DiffWalk', 8.234146118164062), ('Sex', 6.14013147354126), ('Stroke', 4.1117353439331055), ('Income', 4.1084370613098145), ('Education', 3.1805970668792725), ('NoDocbcCost', 3.051327705383301), ('Veggies', 3.0505924224853516), ('PhysHlth', 3.0102550983428955), ('Smoker', 2.8902156352996826), ('MentHlth', 2.869657278060913), ('PhysActivity', 2.7924416065216064), ('Fruits', 2.7762556076049805), ('AnyHealthcare', 2.662822961807251)]


Compute the mode (most frequent value) of each feature; these values serve as the defaults for the simple models. The output is formatted so that it can be copied directly into the dictionary in `setup.py`.

In [3]:
# Emit one "'<feature>': <mode>," line per feature, ready to paste into a dict literal.
for name in columns:
    # mode() returns a Series of the most frequent value(s); take the first one.
    # dropna=False lets a missing value count as a candidate for the mode.
    most_common = df[name].mode(dropna=False)[0]
    print(f"'{name}': {most_common},")

'HighBP': 1,
'HighChol': 1,
'CholCheck': 1,
'BMI': 27,
'Smoker': 0,
'Stroke': 0,
'HeartDiseaseorAttack': 0,
'PhysActivity': 1,
'Fruits': 1,
'Veggies': 1,
'HvyAlcoholConsump': 0,
'AnyHealthcare': 1,
'NoDocbcCost': 0,
'GenHlth': 3,
'MentHlth': 0,
'PhysHlth': 0,
'DiffWalk': 0,
'Sex': 0,
'Age': 10,
'Education': 6,
'Income': 8,
'BMI_binned': 1,
'MentHlth_binned': 0,
'PhysHlth_binned': 0,
'Age_binned': 0,
'Education_binned': 2,
'Income_binned': 2,
