### M2: Process and Tooling

#### 1. Experiment Tracking:
* Use MLflow to track experiments for a machine learning project - Diabetes Dataset.
* Record metrics, parameters, and results of at least three different model training runs.

In [7]:
# Imports for the MLflow experiment-tracking cell (the dataset CSV is read further down in this cell)
import mlflow
import mlflow.sklearn
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import pandas as pd

# Settings for this tracking session, gathered in one place
TRACKING_URI = "./m2_logs"            # local directory, relative to the working directory
EXPERIMENT_NAME = "diabetes_experiment"
DATASET_CSV = 'dataset/diabetes_dataset_v0.csv'

# Point MLflow at the local store and select the experiment (created if it does not exist yet)
mlflow.set_tracking_uri(TRACKING_URI)
mlflow.set_experiment(EXPERIMENT_NAME)

# Read the versioned dataset that train_model() works on
df = pd.read_csv(DATASET_CSV)

def train_model(n_estimators, max_depth, random_state=42):
    """Train a random-forest regressor on the notebook-level ``df`` and log it to MLflow.

    The frame is split 80/20 into train and test sets (split seed fixed at 42),
    a ``RandomForestRegressor`` is fitted on the training part, and the test-set
    mean squared error is logged together with the hyperparameters and the
    fitted model. A one-line summary of the run is printed.

    Args:
        n_estimators: Number of trees passed to ``RandomForestRegressor``.
        max_depth: Maximum tree depth passed to ``RandomForestRegressor``.
        random_state: Seed passed to ``RandomForestRegressor``. Defaults to 42 so
            that repeated runs with the same hyperparameters give the same
            metric; pass ``None`` for the previous unseeded behaviour.

    Returns:
        None.
    """
    # Split data: every column except 'target' is a feature
    X = df.drop(columns=['target'])
    y = df['target']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Train model. The forest is seeded so that differences between runs come
    # from the hyperparameters being compared, not from random tree construction.
    model = RandomForestRegressor(
        n_estimators=n_estimators,
        max_depth=max_depth,
        random_state=random_state,
    )
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    mse = mean_squared_error(y_test, predictions)

    # Log parameters, metrics, and model
    mlflow.log_param("n_estimators", n_estimators)
    mlflow.log_param("max_depth", max_depth)
    mlflow.log_param("random_state", random_state)  # recorded so the run can be reproduced
    mlflow.log_metric("mean_squared_error", mse)
    mlflow.sklearn.log_model(model, "model")

    print(f"Run with n_estimators={n_estimators}, max_depth={max_depth}, mean_squared_error={mse}")

# Run experiments: one MLflow run per (n_estimators, max_depth) pair, in this order
for n_trees, depth in [(10, 3), (50, 5), (100, 7)]:
    with mlflow.start_run():
        train_model(n_estimators=n_trees, max_depth=depth)


2025/01/25 16:49:57 INFO mlflow.tracking.fluent: Experiment with name 'diabetes_experiment' does not exist. Creating a new experiment.


Run with n_estimators=10, max_depth=3, mean_squared_error=2906.4925059558755




Run with n_estimators=50, max_depth=5, mean_squared_error=2964.4844789896415




Run with n_estimators=100, max_depth=7, mean_squared_error=2974.3046746678074


In [None]:
# Check results on mlflow UI
# NOTE: this is a shell command, not Python. Run it in a terminal, or prefix it
# with `!` to launch it from a notebook cell.
# ./m2_logs is the same relative path given to mlflow.set_tracking_uri() in the
# tracking cell, so start the UI from the same working directory.
mlflow ui --backend-store-uri ./m2_logs

#### 2. Data Versioning:
* Use DVC (Data Version Control) to version control a dataset used in your project.
* Show how to revert to a previous version of the dataset.


In [None]:
# Step 1: Initialize DVC
# NOTE: these are shell commands, not Python. Run them in a terminal, or prefix
# each with `!` to run them from a notebook cell.
dvc init
# Stage .dvc and .gitignore, then record the DVC setup in Git
git add .dvc .gitignore
git commit -m "Initialize DVC"

In [None]:
# Step 2: Save the initial version of the Diabetes dataset to a CSV file
from sklearn.datasets import load_diabetes
import pandas as pd

# Fetch scikit-learn's Diabetes data and assemble features plus target in one frame
data = load_diabetes()
feature_frame = pd.DataFrame(data.data, columns=data.feature_names)
df = feature_frame.assign(target=data.target)

# Persist this frame as version 1 of the dataset (row index not written)
V1_CSV_PATH = '../dataset/diabetes_dataset_v1.csv'
df.to_csv(V1_CSV_PATH, index=False)


In [None]:
# Step 3: Add the first version of the dataset to DVC
# NOTE: these are shell commands, not Python. Run them in a terminal, or prefix
# each with `!` to run them from a notebook cell.
# NOTE(review): Step 2 writes the CSV to ../dataset/diabetes_dataset_v1.csv, while
# the paths below are relative to the current directory. Presumably these are
# meant to be run from inside that dataset directory; confirm, or adjust the paths.
dvc add diabetes_dataset_v1.csv
# Commit the .dvc file and .gitignore to Git
git add diabetes_dataset_v1.csv.dvc .gitignore
git commit -m "Add version 1 of diabetes dataset to DVC"

In [None]:
# Step 4: Derive version 2 of the dataset from the frame built in Step 2
# New feature: the 'bmi' column squared, stored on the same frame as 'BMI_squared'
bmi_squared = df['bmi'].pow(2)
df['BMI_squared'] = bmi_squared

# Write the extended frame out as version 2 (row index not written)
V2_CSV_PATH = '../dataset/diabetes_dataset_v2.csv'
df.to_csv(V2_CSV_PATH, index=False)

In [None]:
# Step 5: Add the second version of the dataset to DVC
# NOTE: these are shell commands, not Python. Run them in a terminal, or prefix
# each with `!` to run them from a notebook cell.
# NOTE(review): Step 4 writes the CSV to ../dataset/diabetes_dataset_v2.csv, while
# the paths below are relative to the current directory. Presumably these are
# meant to be run from inside that dataset directory; confirm, or adjust the paths.
dvc add diabetes_dataset_v2.csv
# Commit the .dvc file and .gitignore to Git
git add diabetes_dataset_v2.csv.dvc .gitignore
git commit -m "Add version 2 of diabetes dataset to DVC"

In [None]:
# Step 6: Revert to a Previous Version
# NOTE: these are shell commands, not Python. Run them in a terminal, or prefix
# each with `!` to run them from a notebook cell.
# List the commits to find the hash of the commit with the first version
git log
# Checkout the commit with the first version
# (<commit_hash_of_version_1> is a placeholder: substitute the hash of the
# "Add version 1 of diabetes dataset to DVC" commit shown by `git log`)
git checkout <commit_hash_of_version_1>
# Revert the dataset to the version at the checked-out commit
# NOTE(review): versions 1 and 2 are tracked under two different file names
# (Steps 3 and 5), so this presumably restores a workspace containing only the
# v1 file rather than rolling a single file back; confirm that is the intent.
dvc checkout