In [39]:
import kagglehub
import os
import shutil

import sys
from pathlib import Path
from utils.etl import run_new_etl 

# Kaggle dataset identifier; the slug after the slash doubles as the local folder name.
dataset_id = "uciml/red-wine-quality-cortez-et-al-2009"
local_dataset_name = dataset_id.split('/')[-1] # Uses the last part of the ID

base_data_dir = Path("/home/jovyan/data") # Standard for jovyan user
# base_data_dir = project_root / "data" # Alternative for local non-Docker runs
dataset_specific_base_path = base_data_dir / local_dataset_name
raw_data_target_dir = dataset_specific_base_path / "raw"
print(f"Target dataset base path: {dataset_specific_base_path}")

# Same location as dataset_specific_base_path, as a plain string. Derived from
# base_data_dir so the two can never drift apart.
destination_path = str(dataset_specific_base_path)

# Skip the download only when the raw directory itself contains something.
# Listing the parent directory instead would always be non-empty once "raw"
# exists, so an empty "raw" folder would wrongly count as "already downloaded".
raw_data_present = raw_data_target_dir.is_dir() and any(raw_data_target_dir.iterdir())

if raw_data_present:
    print(f"Raw dataset already exists at {raw_data_target_dir}")
    print("Skipping download...")
else:
    print(f"Dataset not found locally. Downloading {dataset_id}...")
    download_path = kagglehub.dataset_download(dataset_id)
    print(f"Dataset downloaded to temporary path in container: {download_path}")
    print(f"Copying dataset to persistent raw data volume: {raw_data_target_dir}")
    # Ensure the destination directory exists
    os.makedirs(raw_data_target_dir, exist_ok=True)
    for item in os.listdir(download_path):
        source_item = os.path.join(download_path, item)
        target_item = os.path.join(raw_data_target_dir, item)
        if os.path.isdir(source_item):
            # dirs_exist_ok lets a re-run merge into a partially copied tree.
            shutil.copytree(source_item, target_item, symlinks=False, ignore=None, dirs_exist_ok=True)
        else:
            shutil.copy2(source_item, target_item)

    print(f"Dataset '{dataset_id}' successfully copied to {raw_data_target_dir} in shared volume.")
print(f"Raw dataset saved to persistent volume at: {raw_data_target_dir}")


Target dataset base path: /home/jovyan/data/red-wine-quality-cortez-et-al-2009
Dataset not found locally. Downloading uciml/red-wine-quality-cortez-et-al-2009...
Dataset downloaded to temporary path in container: /home/jovyan/.cache/kagglehub/datasets/uciml/red-wine-quality-cortez-et-al-2009/versions/2
Copying dataset to persistent raw data volume: /home/jovyan/data/red-wine-quality-cortez-et-al-2009/raw
Dataset 'uciml/red-wine-quality-cortez-et-al-2009' successfully copied to /home/jovyan/data/red-wine-quality-cortez-et-al-2009/raw in shared volume.
Raw dataset saved to persistent volume at: /home/jovyan/data/red-wine-quality-cortez-et-al-2009/raw


In [43]:
# --- Cell 2: Import Libraries and Load Data ---

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

import os
import cProfile # For basic profiling
import pstats # For processing profiling results
import io # For capturing profiling output

# Show what ended up in the raw data directory. os.walk() silently yields
# nothing for a missing directory, so the existence check has to be explicit.
print(f"Listing files in {raw_data_target_dir}:")
if not os.path.isdir(raw_data_target_dir):
    raise FileNotFoundError(
        f"Directory not found: {raw_data_target_dir}. Please ensure the dataset was downloaded and copied correctly."
    )
for root, dirs, files in os.walk(raw_data_target_dir):
    level = root.replace(str(raw_data_target_dir), '').count(os.sep)
    indent = ' ' * 4 * (level)
    print(f'{indent}{os.path.basename(root)}/')
    subindent = ' ' * 4 * (level + 1)
    for f in files:
        print(f'{subindent}{f}')

# Pick the CSV to load. Sorting makes "the first one" deterministic, because
# os.listdir() returns entries in arbitrary order.
csv_files = sorted(f for f in os.listdir(raw_data_target_dir) if f.endswith('.csv'))
if not csv_files:
    raise FileNotFoundError(f"No CSV files found in {raw_data_target_dir}")
if len(csv_files) > 1:
    print(f"Warning: Multiple CSV files found. Using the first one: {csv_files[0]}")

# Load the data
data_file_path = os.path.join(raw_data_target_dir, csv_files[0])
print(f"\nLoading data from: {data_file_path}")
df = pd.read_csv(data_file_path)

# Display available columns so a target can be chosen
print("\nAvailable columns in the dataset:")
for i, col in enumerate(df.columns):
    print(f"{i}: {col}")

# The chosen target is persisted next to the dataset so a restarted kernel can
# pick it up again without the variable being redefined by hand.
target_flag_file = os.path.join(dataset_specific_base_path, '.target_selected')
# target_column = 'quality'

saved_target = ""
if os.path.exists(target_flag_file):
    with open(target_flag_file, 'r') as f:
        saved_target = f.read().strip()

# A value set in the kernel wins over the persisted one.
selected_target = globals().get("target_column") or saved_target
if not selected_target:
    print("\nPlease set target_column variable above and run this cell again.")
    print("Example: target_column = 'column_name'")
    raise SystemExit("Waiting for target column selection...")
if selected_target not in df.columns:
    raise ValueError(f"Selected target column '{selected_target}' not found in dataset columns")

target_column = selected_target
with open(target_flag_file, 'w') as f:
    f.write(target_column)
print(f"\nTarget column '{target_column}' has been saved.")

# Data preparation. Errors are allowed to propagate: swallowing them here would
# leave X_train / X_test undefined (or stale from an earlier run) for later cells.
print("Dataset loaded successfully.")
print("Dataset shape:", df.shape)
print("Dataset columns:", df.columns.tolist())
print("Dataset head:\n", df.head())

X = df.drop(target_column, axis=1)
y = df[target_column]
print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(f"Data split into training ({X_train.shape[0]} samples) and testing ({X_test.shape[0]} samples).")


Listing files in /home/jovyan/data/red-wine-quality-cortez-et-al-2009/raw:
raw/
    winequality-red.csv

Loading data from: /home/jovyan/data/red-wine-quality-cortez-et-al-2009/raw/winequality-red.csv

Available columns in the dataset:
0: fixed acidity
1: volatile acidity
2: citric acid
3: residual sugar
4: chlorides
5: free sulfur dioxide
6: total sulfur dioxide
7: density
8: pH
9: sulphates
10: alcohol
11: quality

Target column 'quality' has been saved.
Dataset loaded successfully.
Dataset shape: (1599, 12)
Dataset columns: ['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar', 'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol', 'quality']
Dataset head:
    fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  \
0            7.4              0.70         0.00             1.9      0.076   
1            7.8              0.88         0.00             2.6      0.098   
2            7.8              0.76 

In [45]:
# --- MLflow and W&B Setup, Profiling Start ---
import mlflow
import mlflow.sklearn
# os.environ["WANDB_BASE_URL"] = "http://host.docker.internal:8082"
os.environ["WANDB_BASE_URL"] = "http://wandb:8080"
os.environ["WANDB_DEBUG"] = "true"
os.environ["WANDB_DEBUG_LOG_PATH"] = "/tmp/wandb_debug.log"
import wandb
# Ensure MLflow tracking URI is set (should be from environment variable)
# mlflow.set_tracking_uri("http://mlflow:5000") # This should be set by docker-compose env var

# Set the MLflow experiment name
mlflow_experiment_name = f"{local_dataset_name}_Decision_Tree"
print(f"\nSetting MLflow experiment: {mlflow_experiment_name}")
mlflow.set_experiment(mlflow_experiment_name)

# Make sure a run exists BEFORE logging anything. mlflow.log_param() creates a
# run implicitly when none is active, which would make this check pointless.
active_run = mlflow.active_run()
if active_run is None:
    mlflow_run = mlflow.start_run()
    print(f"Started new MLflow run with ID: {mlflow_run.info.run_id}")
else:
    print(f"MLflow run already active with ID: {active_run.info.run_id}")

# Dataset metadata for the run
mlflow.log_param("dataset", local_dataset_name)
mlflow.log_param("dataset_description", "Red wine quality dataset from Cortez et al., 2009")
mlflow.log_param("dataset_version", "v1.0")

# W&B: Initialize a new run
# The project name helps organize runs in the W&B UI
# The WANDB_DIR environment variable in docker-compose.yml ensures data goes to the shared volume
wandb_project_name = f"kaggle_{local_dataset_name}"
wandb_run_name = "decision-tree-training"
print("Get wandb API key from localhost:8082/authorize")
print(f"Initializing W&B run: Project='{wandb_project_name}', Name='{wandb_run_name}'")
# NOTE(review): the login host differs from WANDB_BASE_URL ("http://wandb:8080")
# set above; confirm both addresses reach the same W&B server.
wandb.login(relogin=True, host="http://host.docker.internal:8082")
print("logged in")
wandb.init(project=wandb_project_name, name=wandb_run_name)
print(f"Started W&B run with ID: {wandb.run.id}")

# Define model parameters
max_depth = 4 # Example hyperparameter
random_state = 20

# Log parameters to MLflow and W&B
print("Logging parameters to MLflow and W&B...")
mlflow.log_param("max_depth", max_depth)
mlflow.log_param("random_state", random_state)
wandb.config.max_depth = max_depth
wandb.config.random_state = random_state
print("Parameters logged.")

# --- Start Profiling ---
# Profiling the training process to understand where time is spent.
# The profiler stays enabled until pr.disable() is called further down.
print("Starting profiling...")
pr = cProfile.Profile()
pr.enable()



# --- Cell 4: Model Training ---

# Fit the decision tree with the hyperparameters chosen during setup.
print("Training Decision Tree model...")
model = DecisionTreeClassifier(random_state=random_state, max_depth=max_depth)
model.fit(X_train, y_train)
print("Model training complete.")


# --- Cell 5: Profiling Stop and Processing ---

# Turn the profiler off now that the fit has finished.
print("Stopping profiling...")
pr.disable()
print("Profiling stopped.")

# Render the collected statistics, ordered by cumulative time, into a string.
print("Processing profiling results...")
stats_buffer = io.StringIO()
profile_stats = pstats.Stats(pr, stream=stats_buffer)
profile_stats.sort_stats('cumulative')
profile_stats.print_stats()
profiling_output = stats_buffer.getvalue()
print("Profiling results processed.")

# Show only the beginning of the report (header plus the first few rows).
snippet_lines = profiling_output.splitlines()[:15]
print("\n--- Profiling Snippet (Top 10 by Cumulative Time) ---")
print('\n'.join(snippet_lines))
print("----------------------------------------------------")


# --- Cell 6: Model Evaluation and Metric Logging ---

# Predict on the held-out split.
print("Making predictions on the test set...")
y_pred = model.predict(X_test)
print("Predictions made.")

# Score the predictions; the per-class metrics use a weighted average because
# the target is multi-class.
print("Calculating evaluation metrics...")
weighted_kwargs = {"average": "weighted", "zero_division": 0}
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, **weighted_kwargs)
recall = recall_score(y_test, y_pred, **weighted_kwargs)
f1 = f1_score(y_test, y_pred, **weighted_kwargs)
for metric_label, metric_value in (
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
    ("F1 Score", f1),
):
    print(f"{metric_label}: {metric_value:.4f}")
print("Metrics calculated.")

# Send the same four values to both trackers.
print("Logging metrics to MLflow and W&B...")
logged_metrics = {
    "accuracy": accuracy,
    "precision": precision,
    "recall": recall,
    "f1_score": f1,
}
for metric_name, metric_value in logged_metrics.items():
    mlflow.log_metric(metric_name, metric_value)

wandb.log(logged_metrics)
print("Metrics logged.")

# --- Cell 7: Model and Artifact Logging ---

# Log the trained model to MLflow
print("Logging model with MLflow...")
# The model will be saved under the 'artifacts' directory of the MLflow run
mlflow.sklearn.log_model(model, "decision_tree_model")
print("Model logged to MLflow.")

# Log profiling results as an artifact to MLflow and W&B
print("Logging profiling results as artifacts...")
profiling_output_filename = "profiling_results.txt"
with open(profiling_output_filename, "w") as f:
    f.write(profiling_output)

mlflow.log_artifact(profiling_output_filename)
wandb.save(profiling_output_filename)

print(f"Profiling results logged as artifact: {profiling_output_filename}")


# --- Cell 8: End Runs ---

# End the MLflow run
print("Ending MLflow run...")
mlflow.end_run()
print("MLflow run ended.")

# End the W&B run
print("Ending W&B run...")
wandb.finish()
print("W&B run finished.")

# Clean up the temporary profiling file only now: wandb.save() syncs the file in
# the background, so deleting it before wandb.finish() has flushed the run can
# remove it before it was uploaded.
os.remove(profiling_output_filename)
print(f"Temporary profiling file removed: {profiling_output_filename}")

print("\nExperiment complete. Check MLflow UI at http://localhost:5000 and W&B UI at http://localhost:8082")


Setting MLflow experiment: red-wine-quality-cortez-et-al-2009_Decision_Tree


[34m[1mwandb[0m: You can find your API key in your browser here: http://wandb:8080/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter:

MLflow run already active with ID: 040dca790add4ad4b16591e41ad8e0fb
Get wandb API key from localhost:8082/authorize
Initializing W&B run: Project='kaggle_red-wine-quality-cortez-et-al-2009', Name='decision-tree-training'


  ········


[34m[1mwandb[0m: Appending key for host.docker.internal:8082 to your netrc file: /home/jovyan/.netrc


logged in


Started W&B run with ID: yeb8ztxn
Logging parameters to MLflow and W&B...
Parameters logged.
Starting profiling...
Training Decision Tree model...
Model training complete.
Stopping profiling...
Profiling stopped.
Processing profiling results...
Profiling results processed.

--- Profiling Snippet (Top 10 by Cumulative Time) ---
         3939 function calls (3889 primitive calls) in 0.009 seconds

   Ordered by: cumulative time

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        6    0.000    0.000    0.009    0.002 /opt/conda/lib/python3.11/site-packages/IPython/core/interactiveshell.py:3490(run_code)
        6    0.000    0.000    0.009    0.002 {built-in method builtins.exec}
        1    0.000    0.000    0.009    0.009 /opt/conda/lib/python3.11/site-packages/sklearn/base.py:1372(wrapper)
        1    0.000    0.000    0.008    0.008 /opt/conda/lib/python3.11/site-packages/sklearn/tree/_classes.py:993(fit)
        1    0.006    0.006    0.008    0.008 /op



Model logged to MLflow.
Logging profiling results as artifacts...
Profiling results logged as artifact: profiling_results.txt
Temporary profiling file removed: profiling_results.txt
Ending MLflow run...
🏃 View run omniscient-conch-714 at: http://mlflow:5000/#/experiments/212578242154088023/runs/040dca790add4ad4b16591e41ad8e0fb
🧪 View experiment at: http://mlflow:5000/#/experiments/212578242154088023
MLflow run ended.
Ending W&B run...


0,1
accuracy,▁
f1_score,▁
precision,▁
recall,▁

0,1
accuracy,0.55312
f1_score,0.5409
precision,0.53197
recall,0.55312


W&B run finished.

Experiment complete. Check MLflow UI at http://localhost:5000 and W&B UI at http://localhost:8082


In [None]:
import os
import cProfile, io, pstats
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    mean_squared_error, mean_absolute_error, r2_score
)
import numpy as np
from sklearn.base import BaseEstimator
import mlflow
import mlflow.sklearn
os.environ["WANDB_BASE_URL"] = "http://wandb:8080"
os.environ["WANDB_DEBUG"] = "true"
os.environ["WANDB_DEBUG_LOG_PATH"] = "/tmp/wandb_debug.log"
import wandb
from wandb.sdk.wandb_settings import Settings

def setup_tracking(dataset_name: str, model_name: str, config: dict):
    """Prepare MLflow and W&B tracking for one experiment.

    Selects the MLflow experiment ``"<dataset_name>_<model_name>"``, starts an
    MLflow run unless one is already active, logs every entry of ``config`` as
    an MLflow parameter, and then opens a W&B run in the project
    ``"kaggle_<dataset_name>"``.

    Args:
        dataset_name: Dataset label used in the experiment and project names.
        model_name: Model label used in the experiment and W&B run names.
        config: Mapping of parameter names to values; logged to MLflow and
            passed to W&B as the run config.

    Returns:
        The object returned by ``wandb.init``.
    """
    # --- MLflow ---
    mlflow.set_experiment(f"{dataset_name}_{model_name}")
    run_already_open = mlflow.active_run() is not None
    if not run_already_open:
        mlflow.start_run()
        print(f"MLflow run started: {mlflow.active_run().info.run_id}")

    for param_name, param_value in config.items():
        mlflow.log_param(param_name, param_value)

    # --- W&B ---
    return wandb.init(
        project=f"kaggle_{dataset_name}",
        name=f"{model_name}-training",
        config=config,
        settings=Settings(init_timeout=60),
    )

def _infer_task_type(y_true, max_classes=12):
    """Guess whether ``y_true`` is a regression or a classification target.

    A numeric (integer or float) target with more than ``max_classes`` distinct
    values is treated as regression; everything else as classification.
    """
    # np.asarray lets plain lists through as well as arrays / pandas Series.
    labels = np.asarray(y_true)
    if labels.dtype.kind in "if" and np.unique(labels).size > max_classes:
        return "regression"
    return "classification"


def evaluate_and_log(model, X_test, y_test, task_type=None):
    """Score ``model`` on the test split and log the metrics to MLflow and W&B.

    Args:
        model: Fitted estimator exposing ``predict(X_test)``.
        X_test: Test features passed to ``model.predict``.
        y_test: True targets for the test split.
        task_type: ``"classification"``, ``"regression"`` or ``"llm"``. When
            ``None`` the type is inferred from ``y_test``.

    Returns:
        dict mapping metric name to value.

    Raises:
        ValueError: If ``task_type`` is not one of the supported values.
    """
    y_pred = model.predict(X_test)

    # Try to infer task type if not provided
    if task_type is None:
        task_type = _infer_task_type(y_test)

    if task_type == "classification":
        # Weighted averages so multi-class targets are handled.
        metrics = {
            "accuracy": accuracy_score(y_test, y_pred),
            "precision": precision_score(y_test, y_pred, average="weighted", zero_division=0),
            "recall": recall_score(y_test, y_pred, average="weighted", zero_division=0),
            "f1_score": f1_score(y_test, y_pred, average="weighted", zero_division=0)
        }
    elif task_type == "regression":
        metrics = {
            "mse": mean_squared_error(y_test, y_pred),
            "mae": mean_absolute_error(y_test, y_pred),
            "r2": r2_score(y_test, y_pred)
        }
    elif task_type == "llm":
        # Placeholder for LLM-style task evaluation: these are constant dummy
        # values, not real scores.
        metrics = {
            "bleu": 1, #your_bleu_fn(y_test, y_pred),
            "rouge": 1, #your_rouge_fn(y_test, y_pred),
            "BERTScore" : 1,
            # Add more as needed
        }
    else:
        raise ValueError(f"Unsupported task type: {task_type}")

    print(f"Task type: {task_type}")
    for k, v in metrics.items():
        print(f"{k}: {v:.4f}")
        mlflow.log_metric(k, v)
    # One W&B call keeps all metrics of this evaluation on the same step;
    # logging them one by one would advance the step for every metric.
    wandb.log(metrics)

    return metrics

def profile_and_log(func, filename="profiling_results.txt"):
    """Run ``func`` under cProfile and log the report to MLflow and W&B.

    Args:
        func: Zero-argument callable to profile.
        filename: Name of the report file. While a W&B run is active only its
            base name is used and the file is written into the run directory.

    Returns:
        Whatever ``func`` returns.

    Raises:
        Any exception raised by ``func``; no report is written in that case.
    """
    profiler = cProfile.Profile()
    profiler.enable()
    try:
        result = func()
    finally:
        # Always switch the profiler off, so a failing callable does not leave
        # profiling enabled for the rest of the session.
        profiler.disable()

    report_buffer = io.StringIO()
    pstats.Stats(profiler, stream=report_buffer).sort_stats('cumulative').print_stats()
    profiling_output = report_buffer.getvalue()

    active_wandb_run = wandb.run
    if active_wandb_run is not None:
        # Write straight into the W&B run directory. wandb.save() syncs in the
        # background, so a file deleted right after the call can disappear
        # before it was uploaded; here nothing has to be deleted.
        report_path = os.path.join(active_wandb_run.dir, os.path.basename(filename))
        with open(report_path, "w") as report_file:
            report_file.write(profiling_output)
        mlflow.log_artifact(report_path)
        wandb.save(report_path, base_path=active_wandb_run.dir)
    else:
        # No W&B run: keep the plain temporary-file flow, but never leave the
        # file behind if one of the logging calls fails.
        with open(filename, "w") as report_file:
            report_file.write(profiling_output)
        try:
            mlflow.log_artifact(filename)
            wandb.save(filename)
        finally:
            os.remove(filename)

    return result

def run_experiment(
    model_class: "type[BaseEstimator]",
    model_params: dict,
    X_train, y_train, X_test, y_test,
    dataset_name="red-wine-quality-cortez-et-al-2009",
    etl_version="v1"
):
    """Train, evaluate and track one model as a single MLflow / W&B run.

    Args:
        model_class: Estimator class; instantiated as ``model_class(**model_params)``.
        model_params: Constructor keyword arguments, also logged as run config.
        X_train, y_train: Training split.
        X_test, y_test: Test split used for evaluation.
        dataset_name: Dataset label forwarded to ``setup_tracking``.
        etl_version: ETL tag added to the logged config.

    Raises:
        Any exception raised during training, evaluation or logging. Both
        tracking runs are closed (marked as failed) before it propagates.
    """
    # Add ETL version and other metadata (copy, so the caller's dict is untouched)
    config = model_params.copy()
    config["etl_version"] = etl_version

    setup_tracking(dataset_name, model_class.__name__, config)

    def train_model():
        """Fit, evaluate and log the model; runs inside the profiler."""
        print("Training model...")
        model = model_class(**model_params)
        model.fit(X_train, y_train)
        print("Model trained.")
        evaluate_and_log(model, X_test, y_test, task_type="classification")
        mlflow.sklearn.log_model(model, f"{model_class.__name__}_model")

    try:
        profile_and_log(train_model)
    except BaseException:
        # Close both runs even on failure (or a kernel interrupt). Otherwise the
        # next call would find the stale MLflow run still active and log into it.
        mlflow.end_run(status="FAILED")
        wandb.finish(exit_code=1)
        raise

    # Finish runs
    mlflow.end_run()
    wandb.finish()
    print("Experiment complete.")



In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# Data splits and ETL tag shared by every run in this comparison.
shared_run_kwargs = {
    "X_train": X_train,
    "y_train": y_train,
    "X_test": X_test,
    "y_test": y_test,
    "etl_version": "2025-05-cleaned",
}

# One entry per model to train: (estimator class, constructor parameters).
experiment_grid = [
    (DecisionTreeClassifier, {"max_depth": 10, "random_state": 42}),
    (RandomForestClassifier, {"n_estimators": 100, "max_depth": 8, "random_state": 42}),
]

for estimator_class, estimator_params in experiment_grid:
    run_experiment(
        model_class=estimator_class,
        model_params=estimator_params,
        **shared_run_kwargs,
    )