In [1]:
import pandas as pd
import numpy as np
import mlflow
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score

In [2]:
df = pd.read_csv('../data/credit_filtered.csv')
df.head(3)

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,0,2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
1,1,3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
2,65,165,1,CASH_OUT,344464.4,C793293778,0.0,0.0,C766572210,1133312.56,0.0,0,0


In [3]:
X = df[['type','amount','nameOrig','oldbalanceOrg','newbalanceOrig','nameDest','oldbalanceDest','newbalanceDest']]
y = df['isFraud']

le = LabelEncoder()
X['type_encoded'] = le.fit_transform(X['type'])
X = X.drop(['type','nameOrig','nameDest'],axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['type_encoded'] = le.fit_transform(X['type'])


In [5]:
mlflow.set_tracking_uri('sqlite:///mlflow.db')
mlflow.set_experiment('credit_fraud')

2023/05/10 15:26:46 INFO mlflow.store.db.utils: Creating initial MLflow database tables...
2023/05/10 15:26:46 INFO mlflow.store.db.utils: Updating database tables
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
INFO  [alembic.runtime.migration] Running upgrade  -> 451aebb31d03, add metric step
INFO  [alembic.runtime.migration] Running upgrade 451aebb31d03 -> 90e64c465722, migrate user column to tags
INFO  [alembic.runtime.migration] Running upgrade 90e64c465722 -> 181f10493468, allow nulls for metric values
INFO  [alembic.runtime.migration] Running upgrade 181f10493468 -> df50e92ffc5e, Add Experiment Tags Table
INFO  [alembic.runtime.migration] Running upgrade df50e92ffc5e -> 7ac759974ad8, Update run tags with larger limit
INFO  [alembic.runtime.migration] Running upgrade 7ac759974ad8 -> 89d4b8295536, create latest metrics table
INFO  [89d4b8295536_create_latest_metrics_table_py] Migration complete!
INFO  

<Experiment: artifact_location='./mlruns/1', creation_time=1683757606991, experiment_id='1', last_update_time=1683757606991, lifecycle_stage='active', name='credit_fraud', tags={}>

In [6]:
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from sklearn.model_selection import cross_val_score, train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True)

In [7]:
with mlflow.start_run():
    # log parameters and log metrics
    # parameters: hyperparameters
    # metrics: model performance metrics

    mlflow.set_tags({"Model":"decision-tree", "Train Data": "all-data"})

    tree_depth = 5
    dt = DecisionTreeClassifier(max_depth=tree_depth)
    dt.fit(X_train, y_train)
    acc = accuracy_score(y_test, dt.predict(X_test))

    mlflow.log_param("max_depth", tree_depth)
    mlflow.log_metric("accuracy", acc)

mlflow.end_run()

In [9]:
#using hyper-opt and RF

def objective(params):
    with mlflow.start_run():
        clf = RandomForestClassifier(**params)
        acc = cross_val_score(clf, X, y).mean()
        mlflow.set_tag("Model", 'RF')
        mlflow.log_params(params)
        mlflow.log_metric("accuracy", acc)

        return {'loss': -acc, 'status': STATUS_OK}

search_space = hp.choice('classifier_type', [
    
    {
        'n_estimators': hp.randint('rf_n_estimators', 20, 500),
        'max_features': hp.randint('rf_max_features', 3,5),
        'criterion': hp.choice('criterion', ['gini', 'entropy'])
    },
])

algo = tpe.suggest
trials = Trials()

best_result = fmin(
        fn=objective, 
        space=search_space,
        algo=algo,
        max_evals=15,
        trials=trials)

best_result

100%|██████████| 15/15 [24:21<00:00, 97.42s/trial, best loss: -0.9954267294845863] 


{'classifier_type': 0,
 'criterion': 1,
 'rf_max_features': 4,
 'rf_n_estimators': 184}

In [None]:
# using auto-log from sklearn so you do not need to manually specify which things to log

with mlflow.start_run():
    mlflow.sklearn.autolog()
    tree_depth = 7
    dt = DecisionTreeClassifier(max_depth=tree_depth)
    dt.fit(X_train, y_train)
    mlflow.sklearn.autolog(disable=True)

mlflow.end_run()

# artifacts tracking - data, plots etc

In [11]:
X_test.to_parquet('save_data/x_test.parquet')

In [None]:
import os 

os.makedirs('save_data', exist_ok = True)

X_train.to_parquet('save_data/x_train.parquet')
X_test.to_parquet('save_data/x_test.parquet')


mlflow.log_artifact('save_data/x_train.parquet')
mlflow.log_artifacts('save_data/')

mlflow.end_run()

# model registry - save models

In [15]:
mlflow.end_run()
mlflow.set_experiment('credit_fraud')
def objective(params):
    with mlflow.start_run():
        classifier_type = params['type']
        del params['type']
        if classifier_type == 'dt':
            clf = DecisionTreeClassifier(**params)
        elif classifier_type == 'rf':
            clf = RandomForestClassifier(**params)        
        else:
            return 0
        acc = cross_val_score(clf, X, y).mean()

        mlflow.set_tag("Model", classifier_type)
        mlflow.log_params(params)
        mlflow.log_metric("accuracy", acc)
        mlflow.sklearn.log_model(clf, artifact_path = 'better_models')

        return {'loss': -acc, 'status': STATUS_OK}
search_space = hp.choice('classifier_type', [
    {
        'type': 'dt',
        'criterion': hp.choice('dtree_criterion', ['gini', 'entropy']),
        'max_depth': hp.choice('dtree_max_depth', [None, hp.randint('dtree_max_depth_int', 1,10)]),
        'min_samples_split': hp.randint('dtree_min_samples_split', 2,10)
    },
    {
        'type': 'rf',
        'n_estimators': hp.randint('rf_n_estimators', 20, 500),
        'max_features': hp.randint('rf_max_features', 2,9),
        'criterion': hp.choice('criterion', ['gini', 'entropy'])
    },
])

algo = tpe.suggest
trials = Trials()
best_result = fmin(
        fn=objective, 
        space=search_space,
        algo=algo,
        max_evals=3,
        trials=trials)

100%|██████████| 3/3 [02:49<00:00, 56.57s/trial, best loss: -0.99595222907422] 


# loading saved models

In [21]:
logged_model = 'runs:/079ad56c3f514184ad436502751fafb4/better_models'

# Load model as a PyFuncModel.
# loaded_model = mlflow.pyfunc.load_model(logged_model)
# loaded_model

sklearn_model = mlflow.sklearn.load_model(logged_model)
sklearn_model

In [22]:
sklearn_model.fit(X_train, y_train)
preds = sklearn_model.predict(X_test)
preds[:5]

array([0, 0, 0, 1, 0])