In [1]:
import pandas as pd
import numpy as np
import mlflow
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score

In [2]:
df = pd.read_csv('../data/credit_filtered.csv')

In [3]:
df['isFraud'].value_counts()

0    62196
1     8213
Name: isFraud, dtype: int64

In [4]:
mlflow.set_tracking_uri('sqlite:///mlflow.db')
mlflow.set_experiment('day1')

2023/04/04 11:21:28 INFO mlflow.tracking.fluent: Experiment with name 'day1' does not exist. Creating a new experiment.


<Experiment: artifact_location='./mlruns/2', creation_time=1680632488292, experiment_id='2', last_update_time=1680632488292, lifecycle_stage='active', name='day1', tags={}>

In [5]:
df.head(3)
X = df[['type','amount','nameOrig','oldbalanceOrg','newbalanceOrig','nameDest','oldbalanceDest','newbalanceDest']]
y = df['isFraud']
X.shape, y.shape

((70409, 8), (70409,))

In [6]:
le = LabelEncoder()
X['type_encoded'] = le.fit_transform(X['type'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['type_encoded'] = le.fit_transform(X['type'])


In [7]:
X.columns

Index(['type', 'amount', 'nameOrig', 'oldbalanceOrg', 'newbalanceOrig',
       'nameDest', 'oldbalanceDest', 'newbalanceDest', 'type_encoded'],
      dtype='object')

In [8]:
X1 = X.drop(['type','nameOrig','nameDest'],axis=1)

In [9]:
X1

Unnamed: 0,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,type_encoded
0,181.00,181.00,0.0,0.00,0.00,1
1,181.00,181.00,0.0,21182.00,0.00,0
2,344464.40,0.00,0.0,1133312.56,0.00,0
3,57279.11,0.00,0.0,127206.90,64106.18,0
4,71991.42,0.00,0.0,81682.58,557537.26,0
...,...,...,...,...,...,...
70404,339682.13,339682.13,0.0,0.00,339682.13,0
70405,6311409.28,6311409.28,0.0,0.00,0.00,1
70406,6311409.28,6311409.28,0.0,68488.84,6379898.11,0
70407,850002.52,850002.52,0.0,0.00,0.00,1


In [10]:
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from sklearn.model_selection import cross_val_score, train_test_split

X_train, X_test, y_train, y_test = train_test_split(X1, y, test_size=0.2, shuffle=True)

In [11]:
with mlflow.start_run():
    # log parameters and log metrics
    # parameters: hyperparameters
    # metrics: model performance metrics

    mlflow.set_tags({"Model":"decision-tree", "Train Data": "all-data"})

    tree_depth = 5
    dt = DecisionTreeClassifier(max_depth=tree_depth)
    dt.fit(X_train, y_train)
    acc = accuracy_score(y_test, dt.predict(X_test))

    mlflow.log_param("max_depth", tree_depth)
    mlflow.log_metric("accuracy", acc)

mlflow.end_run()

In [12]:
rf = RandomForestClassifier()
rf.fit(X_train,y_train)

In [13]:
preds = rf.predict(X_test)
confusion_matrix(y_test, preds)

array([[12411,    53],
       [   52,  1566]])

In [14]:
f1_score(y_test, preds)

0.9675625579240037

In [15]:
def objective(params):
    with mlflow.start_run():
        clf = RandomForestClassifier(**params)
        acc = cross_val_score(clf, X1, y).mean()
        mlflow.set_tag("Model", 'RF')
        mlflow.log_params(params)
        mlflow.log_metric("accuracy", acc)

        return {'loss': -acc, 'status': STATUS_OK}

search_space = hp.choice('classifier_type', [
    
    {
        'n_estimators': hp.randint('rf_n_estimators', 20, 500),
        'max_features': hp.randint('rf_max_features', 3,5),
        'criterion': hp.choice('criterion', ['gini', 'entropy'])
    },
])

algo = tpe.suggest
trials = Trials()

In [16]:
best_result = fmin(
        fn=objective, 
        space=search_space,
        algo=algo,
        max_evals=15,
        trials=trials)

best_result

100%|██████████| 15/15 [24:13<00:00, 96.92s/trial, best loss: -0.9953699183637559] 


{'classifier_type': 0,
 'criterion': 1,
 'rf_max_features': 4,
 'rf_n_estimators': 470}

In [None]:
mlflow.end_run()

In [17]:
import os 

os.makedirs('../save_data', exist_ok = True)

X_train.to_parquet('../save_data/x_train.parquet')

mlflow.log_artifact('../save_data/x_train.parquet')

In [18]:
X_test.to_parquet('../save_data/x_test.parquet')

mlflow.log_artifacts('../save_data/')

In [25]:
mlflow.sklearn.log_model(rf, artifact_path = 'better_models')



<mlflow.models.model.ModelInfo at 0x7fe64335e2f0>

In [21]:
def objective(params):
    with mlflow.start_run():
        clf = RandomForestClassifier(**params)
        acc = cross_val_score(clf, X1, y).mean()
        mlflow.set_tag("Model", 'RF')
        mlflow.log_params(params)
        mlflow.log_metric("accuracy", acc)

        return {'loss': -acc, 'status': STATUS_OK}

search_space = hp.choice('classifier_type', [
    
    {
        'n_estimators': hp.randint('rf_n_estimators', 20, 500),
        'max_features': hp.randint('rf_max_features', 3,5),
        'criterion': hp.choice('criterion', ['gini', 'entropy'])
    },
])

algo = tpe.suggest
trials = Trials()

In [22]:
best_result = fmin(
        fn=objective, 
        space=search_space,
        algo=algo,
        max_evals=15,
        trials=trials)

best_result

100%|██████████| 15/15 [25:55<00:00, 103.69s/trial, best loss: -0.9953557168443364]


{'classifier_type': 0,
 'criterion': 1,
 'rf_max_features': 4,
 'rf_n_estimators': 384}

In [26]:
logged_model = './mlruns/2/392fa4ee579c4f19966aae3590aa72bb/artifacts/better_models'

# Load model as a PyFuncModel.
loaded_model = mlflow.pyfunc.load_model(logged_model)
loaded_model

mlflow.pyfunc.loaded_model:
  artifact_path: better_models
  flavor: mlflow.sklearn
  run_id: 392fa4ee579c4f19966aae3590aa72bb