In [1]:
# Cell 1: Setup
import mlflow
import mlflow.sklearn
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, label_binarize
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, ConfusionMatrixDisplay, roc_curve, auc, RocCurveDisplay
import matplotlib.pyplot as plt
import numpy as np
import os
import time
import uuid
import sys
print(sys.version)

from hdfs import InsecureClient

# --- Configuration ----------------------------------------------------------
# Endpoints default to the values this notebook was written against, but can be
# overridden through environment variables so the notebook runs on another
# host without editing code.
HDFS_URL = os.environ.get("HDFS_URL", "http://10.0.2.15:9870")
HDFS_USER = os.environ.get("HDFS_USER", "gadet")
HDFS_ARTIFACT_ROOT = "/user/gadet/mlflow/artifacts"
MLFLOW_TRACKING_URI = os.environ.get("MLFLOW_TRACKING_URI", "http://0.0.0.0:5001")

# Connect to HDFS via HTTP
client = InsecureClient(HDFS_URL, user=HDFS_USER)

# Smoke test: list the artifact root so a connection problem surfaces here
# rather than in the middle of a training run.
print(client.list(HDFS_ARTIFACT_ROOT))

# Point MLflow at the tracking server and select the experiment
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
mlflow.set_experiment("01 - Sklearn")

# Load dataset and create a fixed (seeded) 80/20 train/test split
digits = load_digits()
X, y = digits.data, digits.target
n_classes = len(np.unique(y))
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Function to log artifacts to HDFS
def log_artifact_hdfs(local_path, hdfs_dir):
    """
    Log a local file to the active MLflow run and upload a copy to HDFS.

    Args:
        local_path: Path of the local file to log and upload.
        hdfs_dir: HDFS directory; the file is written beneath it under its
            own base name, overwriting any existing file at that path.
    """
    # HDFS paths always use "/" as the separator. os.path.join would insert
    # "\" when the notebook runs on Windows, so join with posixpath instead.
    import posixpath

    mlflow.log_artifact(local_path)

    file_name = os.path.basename(local_path)
    hdfs_path = posixpath.join(hdfs_dir, file_name)
    client.upload(hdfs_path, local_path, overwrite=True)
    
def log_confusion_matrix(y_true, y_pred, run_name):
    """
    Plot the confusion matrix for a set of predictions and log it as a PNG.

    The image is written to a temporary file in the working directory, logged
    through ``log_artifact_hdfs`` and then removed.

    Args:
        y_true: True class labels.
        y_pred: Predicted class labels.
        run_name: Label used in the plot title, the PNG file name and the
            HDFS sub-directory the image is uploaded to.
    """
    artifact_path = f"{run_name}_confusion_matrix.png"
    class_labels = list(range(n_classes))

    # Pin the label set so the matrix is always n_classes x n_classes, even if
    # a class is missing from both y_true and y_pred; otherwise the matrix
    # would not match display_labels.
    cm = confusion_matrix(y_true, y_pred, labels=class_labels)
    disp = ConfusionMatrixDisplay(cm, display_labels=class_labels)

    fig, ax = plt.subplots()
    try:
        disp.plot(ax=ax, cmap="Blues", xticks_rotation=45)
        ax.set_title(f"Confusion Matrix: {run_name}")
        fig.savefig(artifact_path, bbox_inches="tight")
        log_artifact_hdfs(artifact_path, f"/user/gadet/mlflow/artifacts/{run_name}")
    finally:
        # Release the figure and the temp file even if saving or upload fails.
        plt.close(fig)
        if os.path.exists(artifact_path):
            os.remove(artifact_path)
    
def log_roc_curve(y_true, y_score, run_name):
    """
    Plot one-vs-rest ROC curves (one per class) and log the figure as a PNG.

    The image is written to a temporary file in the working directory, logged
    through ``log_artifact_hdfs`` and then removed.

    Args:
        y_true: True class labels.
        y_score: 2-D array of per-class scores; column ``i`` is used as the
            score for class ``i``.
        run_name: Label used in the plot title, the PNG file name and the
            HDFS sub-directory the image is uploaded to.
    """
    artifact_path = f"{run_name}_roc_curve.png"

    # Binarize labels for multiclass: column i marks membership of class i
    y_bin = label_binarize(y_true, classes=range(n_classes))

    fig, ax = plt.subplots()
    try:
        for i in range(n_classes):
            fpr, tpr, _ = roc_curve(y_bin[:, i], y_score[:, i])
            roc_auc = auc(fpr, tpr)
            ax.plot(fpr, tpr, lw=2, label=f"Class {i} (AUC={roc_auc:.2f})")
        # Diagonal reference line (chance level)
        ax.plot([0, 1], [0, 1], color="navy", lw=1, linestyle="--")
        ax.set_xlim([0.0, 1.0])
        ax.set_ylim([0.0, 1.05])
        ax.set_xlabel("False Positive Rate")
        ax.set_ylabel("True Positive Rate")
        ax.set_title(f"ROC Curve: {run_name}")
        ax.legend(loc="lower right", fontsize="small")
        fig.savefig(artifact_path, bbox_inches="tight")
        log_artifact_hdfs(artifact_path, f"/user/gadet/mlflow/artifacts/{run_name}")
    finally:
        # Release the figure and the temp file even if saving or upload fails.
        plt.close(fig)
        if os.path.exists(artifact_path):
            os.remove(artifact_path)
    
def log_feature_importance(model, run_name):
    """
    Log a bar chart of ``model.feature_importances_`` as a PNG.

    Does nothing when the model has no ``feature_importances_`` attribute.
    The image is written to a temporary file in the working directory, logged
    through ``log_artifact_hdfs`` and then removed.

    Args:
        model: Fitted estimator.
        run_name: Label used in the plot title, the PNG file name and the
            HDFS sub-directory the image is uploaded to.
    """
    if not hasattr(model, "feature_importances_"):
        return

    importances = model.feature_importances_
    artifact_path = f"{run_name}_feature_importance.png"

    fig, ax = plt.subplots(figsize=(10, 5))
    try:
        ax.bar(range(len(importances)), importances)
        ax.set_title(f"Feature Importance: {run_name}")
        ax.set_xlabel("Feature index")
        ax.set_ylabel("Importance")
        fig.savefig(artifact_path, bbox_inches="tight")
        log_artifact_hdfs(artifact_path, f"/user/gadet/mlflow/artifacts/{run_name}")
    finally:
        # Release the figure and the temp file even if saving or upload fails.
        plt.close(fig)
        if os.path.exists(artifact_path):
            os.remove(artifact_path)

3.10.18 (main, Jun  4 2025, 08:56:00) [GCC 9.4.0]
['40', 'Baseline_LogReg', 'GradientBoosting', 'LogReg_PCA', 'Log_Reg_PCA', 'RandomForest', 'SVC', 'testfile']


In [2]:
# Cell 2: Baseline Logistic Regression
# Trains on the raw (unscaled) features and logs params, metrics, the model
# and diagnostic plots to a single MLflow run.
with mlflow.start_run(run_name="Baseline Logistic Regression"):
    model = LogisticRegression(max_iter=1000)

    # perf_counter is monotonic, so the interval cannot be skewed by
    # wall-clock adjustments.
    start = time.perf_counter()
    model.fit(X_train, y_train)
    end = time.perf_counter()

    preds = model.predict(X_test)
    try:
        y_score = model.predict_proba(X_test)
    except AttributeError:
        # Fallback only for estimators without predict_proba. Any other error
        # (and KeyboardInterrupt) now propagates instead of being swallowed.
        # Note that an all-zero score matrix yields a chance-level ROC plot.
        y_score = np.zeros((len(y_test), n_classes))

    acc = accuracy_score(y_test, preds)
    f1 = f1_score(y_test, preds, average="weighted")

    mlflow.log_param("model", "LogisticRegression")
    # Read the value back from the estimator so the logged parameter cannot
    # drift from what was actually used.
    mlflow.log_param("max_iter", model.max_iter)
    mlflow.log_metric("accuracy", acc)
    mlflow.log_metric("f1", f1)
    mlflow.log_metric("train_time_sec", end - start)

    mlflow.sklearn.log_model(model, "model")
    log_confusion_matrix(y_test, preds, "Baseline_LogReg")
    log_roc_curve(y_test, y_score, "Baseline_LogReg")

print("Baseline Logistic Regression -> acc:", acc, "f1:", f1)



Baseline Logistic Regression -> acc: 0.975 f1: 0.9751092427666409


In [3]:
# Cell 3: Logistic Regression + StandardScaler + PCA
# The scaler and PCA are fitted on the training split only and then applied to
# the test split, so no test information leaks into the transforms.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
# random_state pins the result when PCA selects a randomized SVD solver,
# matching the seeding used for the split and the tree models.
pca = PCA(n_components=30, random_state=42)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

with mlflow.start_run(run_name="LogReg + Scaler + PCA"):
    model = LogisticRegression(max_iter=1000)

    # perf_counter is monotonic, so the interval cannot be skewed by
    # wall-clock adjustments.
    start = time.perf_counter()
    model.fit(X_train_pca, y_train)
    end = time.perf_counter()

    preds = model.predict(X_test_pca)
    try:
        y_score = model.predict_proba(X_test_pca)
    except AttributeError:
        # Fallback only for estimators without predict_proba. Any other error
        # (and KeyboardInterrupt) now propagates instead of being swallowed.
        y_score = np.zeros((len(y_test), n_classes))

    acc = accuracy_score(y_test, preds)
    f1 = f1_score(y_test, preds, average="weighted")

    mlflow.log_param("model", "LogisticRegression")
    # Read the value back from the transformer so the logged parameter cannot
    # drift from what was actually used.
    mlflow.log_param("pca_components", pca.n_components)
    mlflow.log_metric("accuracy", acc)
    mlflow.log_metric("f1", f1)
    mlflow.log_metric("train_time_sec", end - start)

    # NOTE(review): only the classifier is logged; the fitted scaler and PCA
    # are not part of the stored model, so it expects PCA-transformed input.
    mlflow.sklearn.log_model(model, "model")
    log_confusion_matrix(y_test, preds, "LogReg_PCA")
    log_roc_curve(y_test, y_score, "LogReg_PCA")

    # PCA explained variance
    artifact_path = "pca_variance.png"
    fig, ax = plt.subplots()
    try:
        ax.plot(range(1, len(pca.explained_variance_ratio_) + 1), pca.explained_variance_ratio_, marker='o')
        ax.set_title("PCA Explained Variance Ratio")
        ax.set_xlabel("Principal Component")
        ax.set_ylabel("Variance Ratio")
        fig.savefig(artifact_path, bbox_inches="tight")
        log_artifact_hdfs(artifact_path, "/user/gadet/mlflow/artifacts/LogReg_PCA")
    finally:
        # Release the figure and the temp file even if saving or upload fails.
        plt.close(fig)
        if os.path.exists(artifact_path):
            os.remove(artifact_path)

print("LogReg + PCA -> acc:", acc, "f1:", f1)

LogReg + PCA -> acc: 0.9611111111111111 f1: 0.9611570476091673


In [4]:
# Cell 4: SVC with StandardScaler
# Uses X_train_scaled / X_test_scaled produced by the StandardScaler in Cell 3.
with mlflow.start_run(run_name="SVC (RBF Kernel)"):
    # probability=True fits an internal cross-validated calibration that
    # shuffles the data; random_state makes the resulting predict_proba output
    # (and therefore the ROC plot) reproducible, like the other seeded models.
    model = SVC(kernel="rbf", C=5, gamma="scale", probability=True, random_state=42)

    # perf_counter is monotonic, so the interval cannot be skewed by
    # wall-clock adjustments.
    start = time.perf_counter()
    model.fit(X_train_scaled, y_train)
    end = time.perf_counter()

    preds = model.predict(X_test_scaled)
    y_score = model.predict_proba(X_test_scaled)

    acc = accuracy_score(y_test, preds)
    f1 = f1_score(y_test, preds, average="weighted")

    mlflow.log_param("model", "SVC")
    # Read hyperparameters back from the estimator so the logged values cannot
    # drift from what was actually used.
    mlflow.log_param("kernel", model.kernel)
    mlflow.log_param("C", model.C)
    mlflow.log_param("gamma", model.gamma)
    mlflow.log_metric("accuracy", acc)
    mlflow.log_metric("f1", f1)
    mlflow.log_metric("train_time_sec", end - start)

    mlflow.sklearn.log_model(model, "model")
    log_confusion_matrix(y_test, preds, "SVC")
    log_roc_curve(y_test, y_score, "SVC")

print("SVC -> acc:", acc, "f1:", f1)

SVC -> acc: 0.9805555555555555 f1: 0.9804738636447198


In [5]:
# Cell 5: Random Forest
# Trains on the raw features and additionally logs a feature-importance plot.
with mlflow.start_run(run_name="RandomForest"):
    model = RandomForestClassifier(n_estimators=200, max_depth=15, random_state=42)

    # perf_counter is monotonic, so the interval cannot be skewed by
    # wall-clock adjustments.
    start = time.perf_counter()
    model.fit(X_train, y_train)
    end = time.perf_counter()

    preds = model.predict(X_test)
    try:
        y_score = model.predict_proba(X_test)
    except AttributeError:
        # Fallback only for estimators without predict_proba. Any other error
        # (and KeyboardInterrupt) now propagates instead of being swallowed.
        y_score = np.zeros((len(y_test), n_classes))

    acc = accuracy_score(y_test, preds)
    f1 = f1_score(y_test, preds, average="weighted")

    mlflow.log_param("model", "RandomForest")
    # Read hyperparameters back from the estimator so the logged values cannot
    # drift from what was actually used.
    mlflow.log_param("n_estimators", model.n_estimators)
    mlflow.log_param("max_depth", model.max_depth)
    mlflow.log_metric("accuracy", acc)
    mlflow.log_metric("f1", f1)
    mlflow.log_metric("train_time_sec", end - start)

    mlflow.sklearn.log_model(model, "model")
    log_confusion_matrix(y_test, preds, "RandomForest")
    log_roc_curve(y_test, y_score, "RandomForest")
    log_feature_importance(model, "RandomForest")

print("RandomForest -> acc:", acc, "f1:", f1)

RandomForest -> acc: 0.9694444444444444 f1: 0.9694520995149207


In [6]:
# Cell 6: Gradient Boosting
# Trains on the raw features and additionally logs a feature-importance plot.

# NOTE(review): this unique name is never used - the run below is started with
# the fixed name "GradientBoosting". It is kept so the notebook namespace is
# unchanged; pass it to start_run(run_name=...) if unique run names are wanted.
run_name = f"GradientBoosting-{uuid.uuid4()}"

with mlflow.start_run(run_name="GradientBoosting"):
    model = GradientBoostingClassifier(n_estimators=300, learning_rate=0.1, max_depth=3, random_state=42)

    # perf_counter is monotonic, so the interval cannot be skewed by
    # wall-clock adjustments.
    start = time.perf_counter()
    model.fit(X_train, y_train)
    end = time.perf_counter()

    preds = model.predict(X_test)
    # Per-class probabilities, one column per class, used for the ROC curves
    y_score = model.predict_proba(X_test)

    acc = accuracy_score(y_test, preds)
    f1 = f1_score(y_test, preds, average="weighted")

    mlflow.log_param("model", "GradientBoosting")
    # Read hyperparameters back from the estimator so the logged values cannot
    # drift from what was actually used. max_depth is logged as well, matching
    # the RandomForest run.
    mlflow.log_param("n_estimators", model.n_estimators)
    mlflow.log_param("learning_rate", model.learning_rate)
    mlflow.log_param("max_depth", model.max_depth)
    mlflow.log_metric("accuracy", acc)
    mlflow.log_metric("f1", f1)
    mlflow.log_metric("train_time_sec", end - start)

    mlflow.sklearn.log_model(model, "model")
    log_confusion_matrix(y_test, preds, "GradientBoosting")
    log_roc_curve(y_test, y_score, "GradientBoosting")
    log_feature_importance(model, "GradientBoosting")

print("GradientBoosting -> acc:", acc, "f1:", f1)

GradientBoosting -> acc: 0.975 f1: 0.9750752888378074
