In [None]:
import mlflow
import pyspark.pandas as ps
import pandas as pd

# Label column that the classifier is trained to predict
target_col = "never_married"

# Record every run from this notebook under a single MLflow experiment.
# Replace the "<your username>" placeholder with a real workspace user before running.
mlflow.set_experiment(f"/Users/<your username>/census_prediction")

In [None]:
# Read the whole feature table with pandas-on-Spark, then collect it into a local pandas DataFrame
census_features_query = "SELECT * FROM census_data.census_adult_income_features"
census_features_ps = ps.sql(census_features_query)
df_loaded = census_features_ps.to_pandas()

# Preview the first five rows
df_loaded.head(5)

In [None]:
from databricks.automl_runtime.sklearn.column_selector import ColumnSelector
# Explicit list of feature columns handed to the model. The target column
# ("never_married") is not part of this list.
supported_cols = ["workclass_Local-gov", "occupation_Protective-serv", "occupation_Farming-fishing", "occupation_Other-service", "education_7th-8th", "occupation_Adm-clerical", "education_Bachelors", "education_HS-grad", "hours_per_week", "occupation_Sales", "native_country_Haiti", "race_Other", "workclass_Unknown", "education_Some-college", "native_country_Germany", "education_Prof-school", "education_Assoc-voc", "education_Assoc-acdm", "education_12th", "occupation_Handlers-cleaners", "workclass_State-gov", "native_country_Unknown", "occupation_Priv-house-serv", "occupation_Craft-repair", "race_White", "workclass_Private", "native_country_India", "native_country_Laos", "sex_Female", "native_country_Puerto-Rico", "native_country_Guatemala", "capital_loss", "race_Asian-Pac-Islander", "workclass_Self-emp-not-inc", "education_9th", "native_country_Philippines", "occupation_Exec-managerial", "occupation_Tech-support", "education_Masters", "native_country_South", "workclass_Self-emp-inc", "education_11th", "workclass_Federal-gov", "occupation_Unknown", "native_country_United-States", "native_country_Cuba", "native_country_Mexico", "native_country_Canada", "native_country_Columbia", "education_Preschool", "occupation_Machine-op-inspct", "race_Amer-Indian-Eskimo", "native_country_Italy", "native_country_Jamaica", "occupation_Prof-specialty", "native_country_El-Salvador", "income_bracket_gt_50k", "native_country_Yugoslavia", "education_Doctorate", "occupation_Transport-moving", "sex_Male", "native_country_China", "age_bins", "education_5th-6th", "capital_gain", "education_1st-4th", "education_10th", "race_Black", "native_country_Cambodia"]
# Pipeline step built from the list above; presumably it restricts the input
# DataFrame to exactly these columns (behaviour is defined by databricks.automl_runtime — verify).
col_selector = ColumnSelector(supported_cols)

## Preprocessors

In [None]:
# Accumulator for (name, transformer, columns) triples; filled in by the cells that follow
transformers = list()

### Numerical columns

Missing values in numerical columns are imputed with the column mean.

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer

# Columns treated as numerical features
numerical_cols = ["hours_per_week", "capital_gain", "capital_loss"]

# Step 1: coerce each column to a numeric dtype; values that cannot be parsed become NaN
to_numeric_step = FunctionTransformer(lambda df: df.apply(pd.to_numeric, errors="coerce"))
# Step 2: replace NaN with the column mean
mean_imputer_step = SimpleImputer(strategy="mean")

numerical_pipeline = Pipeline(
    steps=[
        ("converter", to_numeric_step),
        ("imputer", mean_imputer_step),
    ]
)

# Register the pipeline so the ColumnTransformer applies it to the numerical columns only
numerical_entry = ("numerical", numerical_pipeline, numerical_cols)
transformers.append(numerical_entry)

### Categorical columns

In [None]:
from sklearn.compose import ColumnTransformer

# Apply the registered transformers to their columns; every other column is
# passed through untouched, and the combined output is always dense.
preprocessor = ColumnTransformer(
    transformers=transformers,
    remainder="passthrough",
    sparse_threshold=0,
)

### Feature standardization
Scale all feature columns to be centered around zero with unit variance.

In [None]:
from sklearn.preprocessing import StandardScaler

# StandardScaler with default settings; used as the "standardizer" step of the
# model pipeline, after the column preprocessor.
standardizer = StandardScaler()

## Train - Validation - Test Split
Split the input data into 3 sets:
- Train (60% of the dataset used to train the model)
- Validation (20% of the dataset used to tune the hyperparameters of the model)
- Test (20% of the dataset used to report the true performance of the model on an unseen dataset)

In [None]:
from sklearn.model_selection import train_test_split

# Separate the label from the feature columns
split_y = df_loaded[target_col]
split_X = df_loaded.drop(columns=[target_col])

# First cut: 60% of the rows go to training, stratified on the label
X_train, split_X_rem, y_train, split_y_rem = train_test_split(
    split_X,
    split_y,
    train_size=0.6,
    random_state=566255445,
    stratify=split_y,
)

# Second cut: halve the remaining 40% into validation and test, again stratified
X_val, X_test, y_val, y_test = train_test_split(
    split_X_rem,
    split_y_rem,
    test_size=0.5,
    random_state=566255445,
    stratify=split_y_rem,
)

## Train classification model
- Log relevant metrics to MLflow to track runs

In [None]:
from lightgbm import LGBMClassifier

# Print the LGBMClassifier documentation as a reference for the hyperparameters set below.
# This cell is informational only; nothing later depends on it.
help(LGBMClassifier)

In [None]:
import mlflow
import sklearn
from sklearn import set_config
from sklearn.pipeline import Pipeline

# Render estimators as interactive diagrams when they are displayed in a cell
set_config(display="diagram")

# Hyperparameters for the LightGBM classifier.
# NOTE(review): learning_rate is above 1.0, which is unusually high — confirm it is intended.
lgbmc_params = dict(
    colsample_bytree=0.7537070084014925,
    lambda_l1=3.849236121050169,
    lambda_l2=4.738457166263764,
    learning_rate=1.105459115849592,
    max_bin=180,
    max_depth=6,
    min_child_samples=27,
    n_estimators=121,
    num_leaves=33,
    path_smooth=5.820351078666985,
    subsample=0.7347509917731021,
    random_state=566255445,
)
lgbmc_classifier = LGBMClassifier(**lgbmc_params)

# Full pipeline: select columns -> preprocess -> standardize -> classify
model_steps = [
    ("column_selector", col_selector),
    ("preprocessor", preprocessor),
    ("standardizer", standardizer),
    ("classifier", lgbmc_classifier),
]
model = Pipeline(model_steps)

# Create a separate, preprocessing-only pipeline to transform the validation dataset.
# The transformed validation set is passed to LightGBM as the early-stopping eval set.
# These steps are the same objects used by `model`, so `model.fit` refits them on X_train later.
pipeline = Pipeline([
    ("column_selector", col_selector),
    ("preprocessor", preprocessor),
    ("standardizer", standardizer),
])

# Turn autologging off so this preprocessing-only fit is not recorded by MLflow
mlflow.sklearn.autolog(disable=True)

# Fit the preprocessing on the TRAINING data and only transform the validation data.
# Fitting on X_val would impute and scale the eval set with validation statistics, which
# differ from the statistics the classifier is trained with, and would skew early stopping.
pipeline.fit(X_train, y_train)
X_val_processed = pipeline.transform(X_val)

# Display the full model pipeline as the cell output
model

In [None]:
def _strip_prefix(metric_name, prefix):
    """Return metric_name without a leading prefix; names lacking the prefix are returned unchanged."""
    if metric_name.startswith(prefix):
        return metric_name[len(prefix):]
    return metric_name


# Enable automatic logging of input samples, metrics, parameters, and models
mlflow.sklearn.autolog(log_input_examples=True, silent=True)

with mlflow.start_run(run_name="lightgbm") as mlflow_run:
    # Train with early stopping against the pre-transformed validation set.
    # NOTE(review): the `early_stopping_rounds` / `verbose` fit arguments are deprecated in
    # recent LightGBM releases in favour of callbacks — verify against the installed version.
    model.fit(X_train, y_train, classifier__early_stopping_rounds=5, classifier__eval_set=[(X_val_processed,y_val)], classifier__verbose=False)

    # Training metrics are logged by MLflow autologging
    # Log metrics for the validation set
    # NOTE(review): `mlflow.sklearn.eval_and_log_metrics` is deprecated in newer MLflow
    # releases — verify against the installed version.
    lgbmc_val_metrics = mlflow.sklearn.eval_and_log_metrics(model, X_val, y_val, prefix="val_")

    # Log metrics for the test set
    lgbmc_test_metrics = mlflow.sklearn.eval_and_log_metrics(model, X_test, y_test, prefix="test_")

    # Display the logged metrics side by side. Only the leading prefix is removed, so a
    # metric name that happens to contain "val_" or "test_" elsewhere is left intact.
    lgbmc_val_metrics = {_strip_prefix(k, "val_"): v for k, v in lgbmc_val_metrics.items()}
    lgbmc_test_metrics = {_strip_prefix(k, "test_"): v for k, v in lgbmc_test_metrics.items()}
    display(pd.DataFrame([lgbmc_val_metrics, lgbmc_test_metrics], index=["validation", "test"]))

In [None]:
# Patch requisite packages to the model environment YAML for model serving
import os
import shutil
import uuid
import yaml

None

import lightgbm
from mlflow.tracking import MlflowClient

# Download the logged model's conda.yaml, add a pinned lightgbm requirement to its pip
# section, and re-upload it over the original artifact.

# Scratch directory with a short random name under the path given by SPARK_LOCAL_DIRS
lgbmc_temp_dir = os.path.join(os.environ["SPARK_LOCAL_DIRS"], str(uuid.uuid4())[:8])
os.makedirs(lgbmc_temp_dir)
try:
    lgbmc_client = MlflowClient()
    lgbmc_model_env_path = lgbmc_client.download_artifacts(mlflow_run.info.run_id, "model/conda.yaml", lgbmc_temp_dir)

    # Read the file inside a context manager so the handle is always closed
    with open(lgbmc_model_env_path) as f:
        lgbmc_model_env_str = f.read()
    # NOTE(review): the file is an artifact written by this run; yaml.safe_load would
    # also be sufficient for a plain conda environment file.
    lgbmc_parsed_model_env_str = yaml.load(lgbmc_model_env_str, Loader=yaml.FullLoader)

    # The pip requirements are expected in the last entry of "dependencies".
    # Skip the append if the pin is already present so re-running the cell is idempotent.
    lgbmc_requirement = f"lightgbm=={lightgbm.__version__}"
    lgbmc_pip_requirements = lgbmc_parsed_model_env_str["dependencies"][-1]["pip"]
    if lgbmc_requirement not in lgbmc_pip_requirements:
        lgbmc_pip_requirements.append(lgbmc_requirement)

    with open(lgbmc_model_env_path, "w") as f:
        f.write(yaml.dump(lgbmc_parsed_model_env_str))
    lgbmc_client.log_artifact(run_id=mlflow_run.info.run_id, local_path=lgbmc_model_env_path, artifact_path="model")
finally:
    # Remove the scratch directory even if the download, parse, or upload failed
    shutil.rmtree(lgbmc_temp_dir)

## Feature importance

SHAP is a game-theoretic approach to explain machine learning models, providing a summary plot
of the relationship between features and model output. Features are ranked in descending order of
importance, and impact/color describe the correlation between the feature and the target variable.
- Generating SHAP feature importance is a very memory intensive operation, so to ensure that AutoML can run trials without
  running out of memory, we disable SHAP by default.<br />
  You can set the flag defined below to `shap_enabled = True` and re-run this notebook to see the SHAP plots.
- To reduce the computational overhead of each trial, a single example is sampled from the validation set to explain.<br />
  For more thorough results, increase the sample size of explanations, or provide your own examples to explain.
- SHAP cannot explain models using data with nulls; if your dataset has any, both the background data and
  examples to explain will be imputed using the mode (most frequent values). This affects the computed
  SHAP values, as the imputed samples may not match the actual data distribution.

For more information on how to read Shapley values, see the [SHAP documentation](https://shap.readthedocs.io/en/latest/example_notebooks/overviews/An%20introduction%20to%20explainable%20AI%20with%20Shapley%20values.html).

In [None]:
# Controls whether the SHAP cell below runs. It is currently enabled; set it to False
# to skip the memory-intensive SHAP computation.
shap_enabled = True

In [None]:
if shap_enabled:
    from shap import KernelExplainer, summary_plot

    # Seed the row sampling (same seed as the data split and the classifier) so the
    # background rows and the explained example are the same on every run.
    shap_seed = 566255445

    # Sample background data for SHAP Explainer. Increase the sample size to reduce variance.
    train_sample = X_train.sample(n=min(100, len(X_train.index)), random_state=shap_seed)

    # Sample a single example from the validation set to explain. Increase the sample size and rerun for more thorough results.
    example = X_val.sample(n=1, random_state=shap_seed)

    def predict(x):
        """Predict with the fitted pipeline, restoring the training column names on x first."""
        return model.predict(pd.DataFrame(x, columns=X_train.columns))

    # Use Kernel SHAP to explain feature importance on the example from the validation set.
    explainer = KernelExplainer(predict, train_sample, link="identity")
    shap_values = explainer.shap_values(example, l1_reg=False)
    summary_plot(shap_values, example, class_names=model.classes_)

In [None]:
# URI of the model artifact logged under this notebook's MLflow run
model_uri = f"runs:/{mlflow_run.info.run_id}/model"

# Echo the URI so it is visible in the cell output
print(model_uri)

In [None]:
# End the notebook, passing the model URI as its exit value.
# `dbutils` is not imported in this file; presumably it is injected by the Databricks runtime — verify.
dbutils.notebook.exit(model_uri)