In [15]:
import pandas as pd
import numpy as np
from sklearn import pipeline
from sklearn import preprocessing
from sklearn import compose
from sklearn import metrics
from sklearn import model_selection
from sklearn import ensemble
import xgboost as xgb
import optuna

In [16]:
df = pd.read_csv('/home/ubuntu/mlops-project/data/heart_failure_clinical_records_dataset.csv')
y = df['DEATH_EVENT']
X = df.drop(['DEATH_EVENT'], axis=1)

X_train, X_val, y_train, y_val = model_selection.train_test_split(X, y, random_state=42, test_size=0.2)

In [24]:
numeric_transformer = pipeline.Pipeline(steps=[
        ('outliers', preprocessing.RobustScaler(quantile_range=(5,95))),
        ('scale', preprocessing.MinMaxScaler())
    ])

categorical_tranformer = pipeline.Pipeline(steps=[
        ('encode', preprocessing.OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=99999))
    ])

preprocess_pipeline = compose.ColumnTransformer(
        transformers=
        [
            ('num', numeric_transformer, compose.make_column_selector(dtype_exclude='object')), 
            ('cat', categorical_tranformer, compose.make_column_selector(dtype_include='object')), 
        ],

        remainder='passthrough')


clf = pipeline.Pipeline(steps=[
        ('preprocessor', preprocess_pipeline),      
        ('model', ensemble.RandomForestClassifier(random_state=42))
        ]
    )

In [27]:
def objective(trial, clf, X_train, y_train):
    params = {
        #'model__eta': trial.suggest_float('eta', 0.2, 0.8),
        'model__max_depth': trial.suggest_int('max_depth', 2, 10)
    }
    clf.train(X_train, y_train)
    return metrics.accuracy_score()

In [28]:
sampler = optuna.samplers.TPESampler(seed=10)
study = optuna.create_study(sampler = sampler, direction='maximize')

func = lambda trial: objective(trial, clf, X_train, y_train)
study.optimize(func, n_trials=20)

[32m[I 2022-08-31 14:45:55,249][0m A new study created in memory with name: no-name-b46fecd4-a29a-4799-9290-344aec954bec[0m
[32m[I 2022-08-31 14:45:55,578][0m Trial 0 finished with value: 0.8619747899159664 and parameters: {'max_depth': 8}. Best is trial 0 with value: 0.8619747899159664.[0m
[32m[I 2022-08-31 14:45:55,891][0m Trial 1 finished with value: 0.8619747899159664 and parameters: {'max_depth': 2}. Best is trial 0 with value: 0.8619747899159664.[0m
[32m[I 2022-08-31 14:45:56,215][0m Trial 2 finished with value: 0.8619747899159664 and parameters: {'max_depth': 7}. Best is trial 0 with value: 0.8619747899159664.[0m
[32m[I 2022-08-31 14:45:56,535][0m Trial 3 finished with value: 0.8619747899159664 and parameters: {'max_depth': 8}. Best is trial 0 with value: 0.8619747899159664.[0m
[32m[I 2022-08-31 14:45:56,847][0m Trial 4 finished with value: 0.8619747899159664 and parameters: {'max_depth': 6}. Best is trial 0 with value: 0.8619747899159664.[0m
[32m[I 2022-08-31