In [0]:
%pip install mlflow
%pip install imblearn
%pip install xgboost

Python interpreter will be restarted.
Python interpreter will be restarted.
Python interpreter will be restarted.
Python interpreter will be restarted.
Python interpreter will be restarted.
Collecting xgboost
  Downloading xgboost-1.7.3-py3-none-manylinux2014_x86_64.whl (193.6 MB)
Installing collected packages: xgboost
Successfully installed xgboost-1.7.3
Python interpreter will be restarted.


In [0]:
fdf = spark.read.table("fraud_demo_txn_data").select("_id","is_fraud")
fdf.drop("is_fraud")

Out[62]: DataFrame[_id: string]

In [0]:
from databricks.feature_store import feature_table
from databricks.feature_store import FeatureStoreClient
from databricks.feature_store import FeatureLookup

feature_table_name = "default.bfsi_txn_features"

feature_names = list(fdf.columns)

feature_lookups = [
    FeatureLookup(
        table_name=feature_table_name,
        lookup_key="_id",
    ),
]
fs = FeatureStoreClient()
training_set = fs.create_training_set(
    fdf,
    feature_lookups=feature_lookups,
    exclude_columns=["_id"],
    label="is_fraud",
)
training_df = training_set.load_df().toPandas()

In [0]:
training_df = training_df.drop(['cc_num',"age", "gender_id"],axis=1)
training_df

Unnamed: 0,amt,cat_id,city_pop,day,hour,lat,long,merch_lat,merch_long,month,zip,is_fraud
0,32.89,12,566,17,6,42.2770,-97.1926,43.211392,-96.208756,4,68723,0
1,116.64,2,863,17,6,38.8089,-78.7776,37.909870,-79.598710,4,22810,0
2,5.88,1,34496,17,6,40.6729,-73.5365,40.027418,-72.829470,4,11710,0
3,151.14,10,75903,17,7,41.2639,-80.8164,41.737472,-80.901010,4,44483,0
4,19.61,10,69,17,12,44.6087,-74.9732,44.106922,-75.142707,4,13647,0
...,...,...,...,...,...,...,...,...,...,...,...,...
1296670,41.33,7,242,21,6,39.7562,-98.4462,40.311743,-98.975228,6,66941,0
1296671,124.97,2,24840,21,8,42.8223,-83.2829,43.431790,-83.541949,6,48371,0
1296672,4.80,12,2258,21,10,41.4575,-74.1659,40.721641,-74.056874,6,12575,0
1296673,35.11,12,1442,21,10,39.3391,-95.0999,39.997240,-95.977897,6,66020,0


In [0]:
import mlflow
import mlflow.sklearn
import mlflow.pyfunc
import mlflow.spark
from mlflow.models.signature import infer_signature
from mlflow.utils.environment import _mlflow_conda_env

import sklearn
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import f1_score,accuracy_score, classification_report,roc_auc_score
import xgboost as xgb
import numpy as np                   # array, vector, matrix calculations
import pandas as pd                  # DataFrame handling

import os
import time


In [0]:
X = training_df.drop("is_fraud", axis=1)
y = training_df["is_fraud"]
X_train,X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state=42)

In [0]:
class XGBModelWrapper(mlflow.pyfunc.PythonModel):
    def __init__(self, model):
        self.model = model

    def predict(self, context, model_input):
        return self.model.predict_proba(model_input)[:,1]

In [0]:
with mlflow.start_run(run_name='fraud_xgb_model_train'):
    n_estimators = 100
    model = xgb.XGBClassifier(n_estimators=n_estimators)
    model.fit(X_train,y_train)

    # predict_proba returns [prob_negative, prob_positive], so slice the output with [:, 1]
    predictions_test = model.predict_proba(X_test)[:,1]
    auc_score = roc_auc_score(y_test, predictions_test)
    mlflow.log_param('n_estimators', n_estimators)
    # Use the area under the ROC curve as a metric.
    mlflow.log_metric('auc', auc_score)
    wrappedModel = XGBModelWrapper(model)
    # Log the model with a signature that defines the schema of the model's inputs and outputs. 
    # When the model is deployed, this signature will be used to validate inputs.
    signature = infer_signature(X_train, wrappedModel.predict(None, X_train))

    # MLflow contains utilities to create a conda environment used to serve models.
    # The necessary dependencies are added to a conda.yaml file which is logged along with the model.
    conda_env =  _mlflow_conda_env(
        additional_conda_deps=None,
        additional_pip_deps=["scikit-learn=={}".format(sklearn.__version__), "xgboost=={}".format(xgb.__version__), "imblearn"],
        additional_conda_channels=None,
    )
    mlflow.pyfunc.log_model("fraud_xgb_model_1", python_model=wrappedModel, conda_env=conda_env, signature=signature)

  inputs = _infer_schema(model_input)


In [0]:
feature_importances = pd.DataFrame(model.feature_importances_, index=X_train.columns.tolist(), columns=['importance'])
feature_importances.sort_values('importance', ascending=False)

Unnamed: 0,importance
hour,0.288997
cat_id,0.277802
amt,0.228918
city_pop,0.035261
lat,0.02881
long,0.028226
month,0.026134
zip,0.02563
day,0.025004
merch_long,0.018958


In [0]:
y_pred = np.vstack(model.predict_proba(X_test))
auc = roc_auc_score(y_test,y_pred[:,1]>0.3)
print(classification_report(y_test,y_pred[:,1]>0.3))
print(f'AUC: {auc}')

              precision    recall  f1-score   support

           0       1.00      1.00      1.00    257866
           1       0.86      0.83      0.84      1469

    accuracy                           1.00    259335
   macro avg       0.93      0.91      0.92    259335
weighted avg       1.00      1.00      1.00    259335

AUC: 0.9138318112746692


In [0]:
run_id = mlflow.search_runs(filter_string='tags.mlflow.runName = "fraud_xgb_model_train"').iloc[0].run_id
model_name = "fraud_xgb_model_1"
model_version = mlflow.register_model(f"runs:/{run_id}/fraud_xgb_model_1", model_name)

# Registering the model takes a few seconds, so add a small delay
time.sleep(15)

Registered model 'fraud_xgb_model_1' already exists. Creating a new version of this model...
2023/02/01 14:43:34 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: fraud_xgb_model_1, version 3
Created version '3' of model 'fraud_xgb_model_1'.


In [0]:
if(auc>0.91):
    print(f"Publishing the {model_name}-{run_id} to Staging env")
    from mlflow.tracking import MlflowClient 
    client = MlflowClient()
    client.transition_model_version_stage(
      name=model_name,
      version=model_version.version,
      stage="Staging",
    )

Publishing the fraud_xgb_model_1-1323e29e91804319a77f7ca96a34b011 to Staging env


In [0]:
model = mlflow.pyfunc.load_model(f"models:/{model_name}/staging")
y_pred = model.predict(X_test)
# Sanity-check: This should match the AUC logged by MLflow
print(f'AUC: {roc_auc_score(y_test,y_pred)}')
print(classification_report(y_test,y_pred>0.5))

 - scikit-learn (current: 1.2.1, required: scikit-learn==0.24.2)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.
AUC: 0.9981890544709959
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    257866
           1       0.91      0.77      0.83      1469

    accuracy                           1.00    259335
   macro avg       0.96      0.88      0.92    259335
weighted avg       1.00      1.00      1.00    259335



In [0]:
import mlflow.pyfunc
 
apply_model_udf = mlflow.pyfunc.spark_udf(spark, f"models:/{model_name}/staging")


 - scikit-learn (current: 1.2.1, required: scikit-learn==0.24.2)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.
2023/02/01 14:49:17 INFO mlflow.models.flavor_backend_registry: Selected backend for flavor 'python_function'


In [0]:
from pyspark.sql.functions import struct
 
mip = spark.read.table("fraud_demo_txn_data")
# Apply the model to the new data
udf_inputs = struct(*(X_train.columns.tolist()))
 
new_data = mip.withColumn(
  "prediction",
  apply_model_udf(udf_inputs)
)

In [0]:
new_data.filter("is_fraud like 1").show(10,False)

+------------------------+----------------+-------+-----+-------+------------------+--------+------------------+-------------------+---+----+---+-----+--------+------+---------+--------------------+
|_id                     |cc_num          |amt    |zip  |lat    |long              |city_pop|merch_lat         |merch_long         |age|hour|day|month|is_fraud|cat_id|gender_id|prediction          |
+------------------------+----------------+-------+-----+-------+------------------+--------+------------------+-------------------+---+----+---+-----+--------+------+---------+--------------------+
|63d0c5a4c9e4437035d0d3e3|30234966027947  |287.26 |53061|43.9446|-88.0911          |5196    |43.381248         |-88.065085         |41 |10  |17 |4    |1       |2     |1        |0.9385543465614319  |
|63d0c5a4c9e4437035d0d58c|60422928733     |7.13   |29860|33.6028|-81.9748          |46944   |32.98808          |-81.866199         |77 |16  |17 |4    |1       |12    |1        |0.013526007533073425|
|63d0