# Model Training and MLflow Logging

This notebook trains a logistic regression model on the diabetes dataset.  
Model parameters and performance metrics are logged to MLflow for tracking and comparison purposes.

In [1]:
# Core libraries
import pandas as pd
import numpy as np

# Scikit-learn
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# MLflow
import mlflow
import mlflow.sklearn

In [2]:
# Load the cleaned dataset
df = pd.read_csv("../data/diabetes.csv")

# Define features and target
X = df.drop("Outcome", axis=1)
y = df["Outcome"]

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [3]:
# Start a run to log parameters, metrics, and the model
with mlflow.start_run():

    # Train model
    model = LogisticRegression(max_iter=1000)
    model.fit(X_train, y_train)

    # Predictions
    y_pred = model.predict(X_test)

    # Metrics
    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred)
    rec = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    # Log params & metrics
    mlflow.log_param("model_type", "LogisticRegression")
    mlflow.log_param("max_iter", 1000)

    mlflow.log_metric("accuracy", acc)
    mlflow.log_metric("precision", prec)
    mlflow.log_metric("recall", rec)
    mlflow.log_metric("f1_score", f1)

    # Save model
    mlflow.sklearn.log_model(model, "model")

    print("✅ Run successfully logged to MLflow.")



✅ Run successfully logged to MLflow.


In [4]:
# RandomForest Model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# RandomForest Evaluation
rf_pred = rf_model.predict(X_test)
print("Random Forest Results:")
print(classification_report(y_test, rf_pred))

# RandomForest Logging
with mlflow.start_run(run_name="random_forest"):
    mlflow.log_param("n_estimators", 100)
    mlflow.log_param("model_type", "RandomForestClassifier")

    acc_rf = accuracy_score(y_test, rf_pred)
    f1_rf = f1_score(y_test, rf_pred)
    mlflow.log_metric("accuracy", acc_rf)
    mlflow.log_metric("f1_score", f1_rf)

    mlflow.sklearn.log_model(rf_model, "model")
    print("✅ RandomForest logged to MLflow.")


Random Forest Results:
              precision    recall  f1-score   support

           0       0.79      0.78      0.78        99
           1       0.61      0.62      0.61        55

    accuracy                           0.72       154
   macro avg       0.70      0.70      0.70       154
weighted avg       0.72      0.72      0.72       154





✅ RandomForest logged to MLflow.


In [5]:
# KNN Model
knn_model = KNeighborsClassifier(n_neighbors=5)
knn_model.fit(X_train, y_train)

# KNN Evaluation
knn_pred = knn_model.predict(X_test)
print("KNN Results:")
print(classification_report(y_test, knn_pred))

# KNN Logging
with mlflow.start_run(run_name="knn"):
    mlflow.log_param("n_neighbors", 5)
    mlflow.log_param("model_type", "KNeighborsClassifier")

    acc_knn = accuracy_score(y_test, knn_pred)
    f1_knn = f1_score(y_test, knn_pred)
    mlflow.log_metric("accuracy", acc_knn)
    mlflow.log_metric("f1_score", f1_knn)

    mlflow.sklearn.log_model(knn_model, "model")
    print("✅ KNN logged to MLflow.")

KNN Results:
              precision    recall  f1-score   support

           0       0.75      0.71      0.73        99
           1       0.52      0.58      0.55        55

    accuracy                           0.66       154
   macro avg       0.64      0.64      0.64       154
weighted avg       0.67      0.66      0.67       154





✅ KNN logged to MLflow.


In [6]:
print("\nModel Performance Comparison:")
print(f"Logistic Regression - Accuracy: {acc:.2f}, F1: {f1:.2f}")
print(f"RandomForest - Accuracy: {acc_rf:.2f}, F1: {f1_rf:.2f}")
print(f"KNN - Accuracy: {acc_knn:.2f}, F1: {f1_knn:.2f}")



Model Performance Comparison:
Logistic Regression - Accuracy: 0.75, F1: 0.65
RandomForest - Accuracy: 0.72, F1: 0.61
KNN - Accuracy: 0.66, F1: 0.55


In [7]:
with mlflow.start_run() as run:
    model = LogisticRegression()
    model.fit(X_train, y_train)
    mlflow.sklearn.log_model(model, "model")  # Bu satır önemli!
    print("Run ID:", run.info.run_id)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Run ID: f3a5bdd1596a4ce2b09e3626cdc5cfeb
