In [2]:
import kagglehub
import os
import shutil



dataset_id = "uciml/red-wine-quality-cortez-et-al-2009"


local_dataset_name = dataset_id.split('/')[-1] # Uses the last part of the ID

# Check if dataset already exists in destination
destination_path = f"/home/jovyan/data/{local_dataset_name}"
if os.path.exists(destination_path) and os.listdir(destination_path):
    print(f"Dataset already exists at {destination_path}")
    print("Skipping download...")
else:
    print(f"Dataset not found locally. Downloading {dataset_id}...")


    print(f"Downloading dataset: {dataset_id}")

    # downloads to a cache location inside the container
    download_path = kagglehub.dataset_download(dataset_id)
    print(f"Dataset downloaded to temporary path in container: {download_path}")

    # destination path in mounted data volume - inside the 'data' folder on local 
    destination_path = f"/home/jovyan/data/{local_dataset_name}"
    print(f"Copying dataset to shared data volume: {destination_path}")

    # Ensure the destination directory exists 
    os.makedirs(destination_path, exist_ok=True)

    # Copy the contents of the downloaded dataset directory to data volume
    # This makes the data persistent and accessible from host and other services
    for item in os.listdir(download_path):
        s = os.path.join(download_path, item)
        d = os.path.join(destination_path, item)
        if os.path.isdir(s):
            shutil.copytree(s, d, symlinks=False, ignore=None, dirs_exist_ok=True)
        else:
            shutil.copy2(s, d)

    print(f"Dataset '{dataset_id}' successfully copied to {destination_path} in your shared volume.")
print(f"You can now access the dataset files from: {destination_path}")


Dataset already exists at /home/jovyan/data/red-wine-quality-cortez-et-al-2009
Skipping download...
You can now access the dataset files from: /home/jovyan/data/red-wine-quality-cortez-et-al-2009


In [5]:
# --- Cell 2: Import Libraries and Load Data ---

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import mlflow
import mlflow.sklearn
import wandb
import os
import cProfile # For basic profiling
import pstats # For processing profiling results
import io # For capturing profiling output

# Define the name of the dataset directory in your shared data volume
# This should match the local_dataset_name used in the download step
# Use the local_dataset_name from cell 1 - no need to redefine it here
# local_dataset_name = "red-wine-quality-cortez-et-al-2009" # Example dataset name
dataset_path = f"/home/jovyan/data/{local_dataset_name}"

# Define the name of the dataset file within that directory
# You might need to check the contents of the downloaded dataset to find the correct file name
# For the red wine quality dataset, let's assume it's 'winequality-red.csv' or similar.
# You might need to adjust this based on the actual dataset structure.
# Let's list files in the downloaded directory to be sure
print(f"Listing files in {dataset_path}:")
try:
    for root, dirs, files in os.walk(dataset_path):
        level = root.replace(dataset_path, '').count(os.sep)
        indent = ' ' * 4 * (level)
        print(f'{indent}{os.path.basename(root)}/')
        subindent = ' ' * 4 * (level + 1)
        for f in files:
            print(f'{subindent}{f}')
except FileNotFoundError:
    print(f"Error: Directory not found: {dataset_path}. Please ensure the dataset was downloaded and copied correctly.")
    # Exit or handle the error appropriately if the directory is not found

# Assuming the data file is named 'winequality-red.csv' within the downloaded directory
# Adjust this path based on the actual file name and structure
# Find the first CSV file in the dataset directory
csv_files = [f for f in os.listdir(dataset_path) if f.endswith('.csv')]
if not csv_files:
    raise FileNotFoundError(f"No CSV files found in {dataset_path}")
if len(csv_files) > 1:
    print(f"Warning: Multiple CSV files found. Using the first one: {csv_files[0]}")

# Load the data
data_file_path = os.path.join(dataset_path, csv_files[0])
print(f"\nLoading data from: {data_file_path}")
df = pd.read_csv(data_file_path)

# Display available columns and prompt for target
print("\nAvailable columns in the dataset:")
for i, col in enumerate(df.columns):
    print(f"{i}: {col}")

# Create a flag file to check if target has been selected
target_flag_file = os.path.join(dataset_path, '.target_selected')
target_column = 'quality'
if not os.path.exists(target_flag_file):
    print("\nPlease set target_column variable above and run this cell again.")
    print("Example: target_column = 'column_name'")
    # Create an empty flag file to indicate we need target selection
    with open(target_flag_file, 'w') as f:
        pass
    raise SystemExit("Waiting for target column selection...")

# If target_column is defined and valid, save it to the flag file
try:
    if target_column in df.columns:
        with open(target_flag_file, 'w') as f:
            f.write(target_column)
        print(f"\nTarget column '{target_column}' has been saved.")
    else:
        os.remove(target_flag_file)  # Remove flag file if target is invalid
        raise ValueError(f"Selected target column '{target_column}' not found in dataset columns")
except NameError:
    os.remove(target_flag_file)  # Remove flag file if target_column not defined
    raise NameError("target_column variable not defined. Please define it and run again.")



# data_file_path = os.path.join(dataset_path, 'winequality-red.csv') # ** ADJUST THIS **
# Check if target_column is defined and valid
try:
    with open(target_flag_file, 'r') as f:
        target_column = f.read().strip()
    if not target_column:
        raise ValueError("Target column not found in flag file")
    
    # Load the dataset
    df = pd.read_csv(data_file_path)
    print("Dataset loaded successfully.")
    print("Dataset shape:", df.shape)
    print("Dataset columns:", df.columns.tolist())
    print("Dataset head:\n", df.head())

    # Verify target column exists
    if target_column not in df.columns:
        raise ValueError(f"Target column '{target_column}' not found in the dataset.")

    X = df.drop(target_column, axis=1)
    y = df[target_column]
    print(f"Features shape: {X.shape}")
    print(f"Target shape: {y.shape}")

    # Split data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    print(f"Data split into training ({X_train.shape[0]} samples) and testing ({X_test.shape[0]} samples).")

except FileNotFoundError:
    print(f"Error: Data file not found at {data_file_path}. Please check the file name and path.")
except ValueError as e:
    print(f"Error during data preparation: {e}")
except Exception as e:
    print(f"An unexpected error occurred during data loading or preparation: {e}")


Listing files in /home/jovyan/data/red-wine-quality-cortez-et-al-2009:
red-wine-quality-cortez-et-al-2009/
    winequality-red.csv

Loading data from: /home/jovyan/data/red-wine-quality-cortez-et-al-2009/winequality-red.csv

Available columns in the dataset:
0: fixed acidity
1: volatile acidity
2: citric acid
3: residual sugar
4: chlorides
5: free sulfur dioxide
6: total sulfur dioxide
7: density
8: pH
9: sulphates
10: alcohol
11: quality

Please set target_column variable above and run this cell again.
Example: target_column = 'column_name'


SystemExit: Waiting for target column selection...

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [None]:
# --- Cell 3: MLflow and W&B Setup, Profiling Start ---

# Ensure MLflow tracking URI is set (should be from environment variable)
# mlflow.set_tracking_uri("http://mlflow:5000") # This should be set by docker-compose env var

# Set the MLflow experiment name
mlflow_experiment_name = f"{local_dataset_name}_Decision_Tree"
print(f"\nSetting MLflow experiment: {mlflow_experiment_name}")
mlflow.set_experiment(mlflow_experiment_name)

# Start a new MLflow run
mlflow_run = mlflow.start_run()
print(f"Started MLflow run with ID: {mlflow_run.info.run_id}")

# W&B: Initialize a new run
# The project name helps organize runs in the W&B UI
# The WANDB_DIR environment variable in docker-compose.yml ensures data goes to the shared volume
wandb_project_name = f"kaggle_{local_dataset_name}"
wandb_run_name = "decision-tree-training"
print(f"Initializing W&B run: Project='{wandb_project_name}', Name='{wandb_run_name}'")
wandb.init(project=wandb_project_name, name=wandb_run_name)
print(f"Started W&B run with ID: {wandb.run.id}")


# Define model parameters
max_depth = 10 # Example hyperparameter
random_state = 42

# Log parameters to MLflow and W&B
print("Logging parameters to MLflow and W&B...")
mlflow.log_param("max_depth", max_depth)
mlflow.log_param("random_state", random_state)
wandb.config.max_depth = max_depth
wandb.config.random_state = random_state
print("Parameters logged.")

# --- Start Profiling ---
# Profiling the training process to understand where time is spent
print("Starting profiling...")
pr = cProfile.Profile()
pr.enable()


# --- Cell 4: Model Training ---

# Create and train the Decision Tree model
print("Training Decision Tree model...")
model = DecisionTreeClassifier(max_depth=max_depth, random_state=random_state)
model.fit(X_train, y_train)
print("Model training complete.")


# --- Cell 5: Profiling Stop and Processing ---

# --- Stop Profiling ---
print("Stopping profiling...")
pr.disable()
print("Profiling stopped.")

# Process profiling results
print("Processing profiling results...")
s = io.StringIO()
sortby = 'cumulative' # Sort results by cumulative time
ps = pstats.Stats(pr, stream=s).sort_stats(sortby)
ps.print_stats()
profiling_output = s.getvalue()
print("Profiling results processed.")

# Print a snippet of profiling results (optional)
print("\n--- Profiling Snippet (Top 10 by Cumulative Time) ---")
print('\n'.join(profiling_output.splitlines()[:15])) # Print header and top few lines
print("----------------------------------------------------")


# --- Cell 6: Model Evaluation and Metric Logging ---

# Make predictions on the test set
print("Making predictions on the test set...")
y_pred = model.predict(X_test)
print("Predictions made.")

# Calculate evaluation metrics
print("Calculating evaluation metrics...")
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted', zero_division=0) # Use weighted average for multi-class
recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
print("Metrics calculated.")

# Log metrics to MLflow and W&B
print("Logging metrics to MLflow and W&B...")
mlflow.log_metric("accuracy", accuracy)
mlflow.log_metric("precision", precision)
mlflow.log_metric("recall", recall)
mlflow.log_metric("f1_score", f1)

wandb.log({
    "accuracy": accuracy,
    "precision": precision,
    "recall": recall,
    "f1_score": f1
})
print("Metrics logged.")

# --- Cell 7: Model and Artifact Logging ---

# Log the trained model to MLflow
print("Logging model with MLflow...")
# The model will be saved under the 'artifacts' directory of the MLflow run
mlflow.sklearn.log_model(model, "decision_tree_model")
print("Model logged to MLflow.")

# Log profiling results as an artifact to MLflow and W&B
print("Logging profiling results as artifacts...")
profiling_output_filename = "profiling_results.txt"
with open(profiling_output_filename, "w") as f:
    f.write(profiling_output)

mlflow.log_artifact(profiling_output_filename)
wandb.save(profiling_output_filename)

print(f"Profiling results logged as artifact: {profiling_output_filename}")

# Clean up the temporary profiling file
os.remove(profiling_output_filename)
print(f"Temporary profiling file removed: {profiling_output_filename}")


# --- Cell 8: End Runs ---

# End the MLflow run
print("Ending MLflow run...")
mlflow.end_run()
print("MLflow run ended.")

# End the W&B run
print("Ending W&B run...")
wandb.finish()
print("W&B run finished.")

print("\nExperiment complete. Check MLflow UI at http://localhost:5000 and W&B UI at http://localhost:8082")
