In [1]:
!python -V

Python 3.10.13


In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn import metrics 
from sklearn.metrics import classification_report, accuracy_score
from sklearn.tree import plot_tree

from joblib import dump

import seaborn as sns
import matplotlib.pyplot as plt

from hyperopt import fmin, tpe, hp, Trials, STATUS_OK
from hyperopt.pyll.base import scope

import boto3

In [3]:
import mlflow
import os

In [4]:
random_state = 42

# Configuring Mlflow

In [5]:
os.environ["AWS_PROFILE"] = "mlflow-profile"
TRACKING_SERVER_HOST = "ec2-18-222-74-4.us-east-2.compute.amazonaws.com"
mlflow.set_tracking_uri(f"http://{TRACKING_SERVER_HOST}:5000")
mlflow.set_experiment("diabetes-uci")

KeyboardInterrupt: 

In [None]:
print(f"tracking URI: '{mlflow.get_tracking_uri()}'")

tracking URI: 'http://ec2-18-118-50-175.us-east-2.compute.amazonaws.com:5000'


# Loading data

In [6]:
data_path = '../data/raw/diabetes.csv'
def load_data(data_path):
    diabetes_df = pd.read_csv(data_path)
    
    diabetes_df[['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']] = diabetes_df[['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']].replace(0, np.NaN)

    diabetes_df['Glucose'] = diabetes_df['Glucose'].fillna(diabetes_df['Glucose'].mean())
    diabetes_df['BloodPressure'] = diabetes_df['BloodPressure'].fillna(diabetes_df['BloodPressure'].mean())
    diabetes_df['SkinThickness'] = diabetes_df['SkinThickness'].fillna(diabetes_df['SkinThickness'].median())
    diabetes_df['Insulin'] = diabetes_df['Insulin'].fillna(diabetes_df['Insulin'].median())
    diabetes_df['BMI'] = diabetes_df['BMI'].fillna(diabetes_df['BMI'].median())

    return diabetes_df


# Training Random Forest

In [7]:
def train(data_path, max_depth, max_features, n_estimators):

    data_df = load_data(data_path)

    X = data_df.drop('Outcome', axis=1)
    y = data_df['Outcome']
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                        random_state=random_state,
                                                       stratify=y)
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_std = scaler.transform(X_test)

    rf_model = RandomForestClassifier(
        max_depth=max_depth, max_features=max_features, n_estimators=n_estimators
    )
    print(X_train_scaled)
    rf_model.fit(X_train_scaled, y_train)
    
    y_pred = rf_model.predict(X_test_std)

    acc = accuracy_score(y_test, y_pred)

    mlparams = {
        'max_depth': max_depth,
        'max_features': max_features,
        'n_estimators': n_estimators
    }
    mlflow.log_params(mlparams)
    mlflow.set_tag('model', 'random-forest')

    mlmetrics = {'accuracy': acc}
    print(f'accuracy: {acc}')
    mlflow.log_metrics(mlmetrics)

    return rf_model

In [9]:
max_depth = 8
max_features = 0.75
n_estimators = 200

with mlflow.start_run():
    train(data_path, max_depth, max_features, n_estimators)

[[-0.85135507 -1.0575301  -0.82797293 ... -0.76947697  0.31079384
  -0.79216928]
 [ 0.35657564  0.14339202  0.47653222 ... -0.41749769 -0.11643851
   0.56103382]
 [-0.5493724  -0.55714588 -1.15409922 ...  0.3597899  -0.76486207
  -0.70759409]
 ...
 [-0.85135507 -0.82401747 -0.17572035 ...  0.82909561 -0.78607218
  -0.28471812]
 [ 1.86648903 -0.3569922  -0.17572035 ... -0.72547956 -1.01938346
   0.56103382]
 [ 0.05459296  0.74385309 -1.15409922 ... -0.43216349 -0.57700104
   0.30730824]]


MlflowException: API request to http://ec2-18-118-50-175.us-east-2.compute.amazonaws.com:5000/api/2.0/mlflow/runs/create failed with timeout exception HTTPConnectionPool(host='ec2-18-118-50-175.us-east-2.compute.amazonaws.com', port=5000): Max retries exceeded with url: /api/2.0/mlflow/runs/create (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x7304c1600790>, 'Connection to ec2-18-118-50-175.us-east-2.compute.amazonaws.com timed out. (connect timeout=120)')). To increase the timeout, set the environment variable MLFLOW_HTTP_REQUEST_TIMEOUT (default: 120, type: int) to a larger value.

# Hyperparameter Optimization

In [11]:
data_df = load_data(data_path)

In [12]:
def objective(params):

    with mlflow.start_run():
        mlflow.set_tag("developer", "alexis")
        mlflow.set_tag("model", "random_forest")
        mlflow.log_param('train_data_path', f'{data_path}')
        
        X = data_df.drop('Outcome', axis=1)
        y = data_df['Outcome']
        
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                            random_state=random_state,
                                                            stratify=y)
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)
    
        rf_model = RandomForestClassifier(**params)
        rf_model.fit(X_train_scaled, y_train)
        
        y_pred = rf_model.predict(X_test_scaled)
        acc = accuracy_score(y_test, y_pred)

        mlmetrics = {'accuracy': acc}
        print(f'accuracy: {acc}')
        mlflow.log_metrics(mlmetrics)
    
    return {'loss': -acc, 'status': STATUS_OK, 'params': params}

In [13]:
# Define the search space
search_space = {
    'max_depth': scope.int(hp.choice('max_depth', [10, 20, 30])),
    'max_features': hp.choice('max_features', ['sqrt','log2']),
    'n_estimators': scope.int(hp.quniform('n_estimators', 50, 300, 50)),
}

In [14]:
trials = Trials()
best = fmin(fn=objective,
            space=search_space,
            algo=tpe.suggest,
            max_evals=20,
            trials=trials)

# Extract the best parameters
best_params = trials.best_trial['result']['params']

  0%|          | 0/20 [00:00<?, ?trial/s, best loss=?]

accuracy: 0.7402597402597403                          
accuracy: 0.7337662337662337                                                     
accuracy: 0.7402597402597403                                                     
accuracy: 0.7597402597402597                                                     
accuracy: 0.7467532467532467                                                     
accuracy: 0.7597402597402597                                                     
accuracy: 0.7272727272727273                                                     
accuracy: 0.7272727272727273                                                     
accuracy: 0.7272727272727273                                                     
accuracy: 0.7467532467532467                                                     
accuracy: 0.7337662337662337                                                      
accuracy: 0.7337662337662337                                                      
accuracy: 0.7402597402597403             

# Save best model

In [15]:
artifact_path = '../models/random_forest.joblib'

with mlflow.start_run():
    mlflow.log_params(best_params)
    mlflow.set_tag("developer", "alexis")
    mlflow.set_tag("model", "random_forest")
    mlflow.log_param('train_data_path', f'{data_path}')
    
    print(f'Best params: {best_params}')
    # Retrain the model with the best parameters
    data_df = load_data(data_path)
    X = data_df.drop('Outcome', axis=1)
    y = data_df['Outcome']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                        random_state=random_state,
                                                        stratify=y)
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    best_model = RandomForestClassifier(**best_params)
    best_model.fit(X_train_scaled, y_train)

    y_pred = best_model.predict(X_test_scaled)
    acc = accuracy_score(y_test, y_pred)

    mlmetrics = {'accuracy': acc}
    print(f'accuracy: {acc}')
    mlflow.log_metrics(mlmetrics)
    
    mlflow.sklearn.log_model(
            best_model,
            artifact_path=artifact_path,
        )

Best params: {'max_depth': 30, 'max_features': 'log2', 'n_estimators': 250}
accuracy: 0.7402597402597403
