In [8]:
# # Abalone Age Prediction - ML Modeling Pipeline

# This notebook contains the complete machine learning pipeline for predicting abalone age from physical measurements.

# ## Objective
# Build a robust ML pipeline with:
# - Data preprocessing functions
# - Model training (Linear Regression, Random Forest, Decision Tree) tracked with MLflow
# - Comprehensive evaluation metrics
# - Prediction pipeline for new data

# **Target**: Predict the number of rings (age indicator) from physical measurements

In [9]:
# Abalone Age Prediction - Simple MLflow Pipeline
import warnings
from typing import Any, Dict

import matplotlib.pyplot as plt
import mlflow
import mlflow.sklearn
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor

# Blanket filter: every warning category is ignored for the rest of the session,
# including deprecation warnings raised by the libraries used below.
warnings.filterwarnings('ignore')
# Use matplotlib's default style and seaborn's "husl" colour palette for all plots.
plt.style.use('default')
sns.set_palette("husl")

# Start-up banner.
print("üéØ Simple Abalone Age Prediction with MLflow")
print("=" * 50)

ü§ñ Abalone Age Prediction - ML Pipeline


## 1. Data Loading and Inspection Functions

In [10]:
# Setup MLflow: select "abalone_age_prediction" as the active experiment, so the
# runs started by train_and_log_model further down are recorded under it.
mlflow.set_experiment("abalone_age_prediction")

def inspect_data(df: pd.DataFrame) -> Dict[str, Any]:
    """
    Print and return a structural summary of a dataset.

    Args:
        df (pd.DataFrame): Dataset to inspect. It is not modified.

    Returns:
        Dict[str, Any]: Inspection summary with the keys
            - 'shape': ``df.shape``
            - 'memory_usage': deep memory footprint in MB (float)
            - 'missing_values': {column: count} for columns with at least one null
            - 'duplicates': number of fully duplicated rows (int)
            - 'dtypes': {column: dtype}
            - 'numerical_summary': ``describe()`` of the numeric columns as a dict
              (empty dict when the frame has no numeric columns)
            - 'categorical_summary': per object-dtype column, its unique values
              and value counts
    """
    print("\nüîç Data Inspection Report")
    print("=" * 30)

    inspection = {}

    # Basic info
    inspection['shape'] = df.shape
    # Cast to a plain float so the summary does not carry NumPy scalars around.
    inspection['memory_usage'] = float(df.memory_usage(deep=True).sum() / 1024**2)  # MB

    # Missing values: keep only the columns that actually have gaps
    missing = df.isnull().sum()
    inspection['missing_values'] = missing[missing > 0].to_dict()

    # Duplicates (plain int rather than numpy.int64)
    inspection['duplicates'] = int(df.duplicated().sum())

    # Data types
    inspection['dtypes'] = df.dtypes.to_dict()

    # Basic statistics for numerical columns.
    # describe() raises "Cannot describe a DataFrame without columns" when the
    # selection is empty, so an all-categorical frame gets an empty summary instead.
    numerical_cols = df.select_dtypes(include=[np.number]).columns
    if len(numerical_cols) > 0:
        inspection['numerical_summary'] = df[numerical_cols].describe().to_dict()
    else:
        inspection['numerical_summary'] = {}

    # Categorical columns
    categorical_cols = df.select_dtypes(include=['object']).columns
    inspection['categorical_summary'] = {
        col: {
            'unique_values': df[col].unique().tolist(),
            'value_counts': df[col].value_counts().to_dict()
        }
        for col in categorical_cols
    }

    # Print summary
    print(f"üìä Shape: {inspection['shape']}")
    print(f"üíæ Memory usage: {inspection['memory_usage']:.2f} MB")
    print(f"‚ùì Missing values: {len(inspection['missing_values'])} columns affected")
    print(f"üîÑ Duplicate rows: {inspection['duplicates']}")

    if inspection['missing_values']:
        print("Missing data details:")
        for col, count in inspection['missing_values'].items():
            print(f"  - {col}: {count} missing")

    return inspection

# Load and inspect the data.
# `load_data` is only defined in a later cell, so calling it here raises NameError
# on a fresh kernel (Restart & Run All); read the CSV directly instead.
df = pd.read_csv('../data/abalone.csv')
data_inspection = inspect_data(df)

# Display first few rows
print("\nüìã First 5 rows:")
display(df.head())

## 2. Data Preprocessing Pipeline

In [11]:
# Load and explore the data
def load_data(file_path):
    """Load the abalone dataset from CSV and print a short overview.

    Args:
        file_path: Path or file-like object, passed straight to ``pd.read_csv``.

    Returns:
        pd.DataFrame: The loaded data, unmodified.
    """
    df = pd.read_csv(file_path)
    print(f"üìÇ Dataset loaded: {df.shape[0]} rows, {df.shape[1]} columns")
    print(f"Columns: {list(df.columns)}")

    # Unlabelled files (e.g. new samples to score) carry no 'Rings' column; skip the
    # target summary for them instead of failing with a KeyError after the read.
    if 'Rings' in df.columns:
        rings = df['Rings']
        print(f"\nTarget variable (Rings) summary:")
        print(f"Min: {rings.min()}, Max: {rings.max()}, Mean: {rings.mean():.2f}")
    else:
        print("\nTarget variable (Rings) not present - summary skipped")
    return df

# Load the data
df = load_data('../data/abalone.csv')

# Display basic information
print(f"\n? First 5 rows:")
display(df.head())

print(f"\nüìä Dataset info:")
# DataFrame.info() writes its report to stdout and returns None; wrapping it in
# print() only appended a stray "None" line to the cell output.
df.info()

# Apply preprocessing to the loaded data
print("\n" + "="*50)
print("üîÑ PREPROCESSING PIPELINE")
print("="*50)

# NOTE(review): clean_data, encode_categorical_features and feature_engineering are
# not defined or imported in any visible cell of this notebook, so the calls below
# raise NameError on a fresh kernel (Restart & Run All) - confirm where they should
# come from, or drop this section in favour of preprocess_data.

# Clean the data
df_clean = clean_data(df)

# Encode categorical features
# Presumably `encoders` maps a column name to its fitted encoder (it is indexed with
# 'Sex' in preprocess_new_data) - verify against the helper.
df_encoded, encoders = encode_categorical_features(df_clean)

# Engineer features
df_features = feature_engineering(df_encoded)

print(f"\n‚úÖ Preprocessing complete!")
print(f"   Original columns: {len(df.columns)}")
print(f"   Final columns: {len(df_features.columns)}")
print(f"   Final dataset shape: {df_features.shape}")

# Display the processed data
print("\nüìã Processed data sample:")
display(df_features.head())

## 3. Data Splitting Functions

In [12]:
# Simple data preprocessing
def preprocess_data(df):
    """Label-encode ``Sex`` and split the frame into features and target.

    LabelEncoder numbers the classes in sorted order, so for the values
    F / I / M the mapping is F -> 0, I -> 1, M -> 2.

    Args:
        df: Frame containing at least the 'Sex' and 'Rings' columns. It is
            not modified.

    Returns:
        tuple: ``(X, y, le)`` - the feature frame (every column except 'Sex'
        and 'Rings', plus 'Sex_encoded' as the last column), the 'Rings'
        series, and the fitted LabelEncoder.
    """
    sex_encoder = LabelEncoder()
    sex_codes = sex_encoder.fit_transform(df['Sex'])

    # Swap the text column for its integer codes; the new column lands last.
    encoded = df.drop(columns=['Sex']).assign(Sex_encoded=sex_codes)

    class_to_code = dict(zip(sex_encoder.classes_, sex_encoder.transform(sex_encoder.classes_)))
    print(f"‚úÖ Encoded Sex column: {class_to_code}")

    # Everything except the ring count is a predictor.
    features = encoded.drop(columns=['Rings'])
    target = encoded['Rings']

    print(f"üìä Features shape: {features.shape}")
    print(f"üéØ Target shape: {target.shape}")

    return features, target, sex_encoder

# Preprocess data
X, y, label_encoder = preprocess_data(df)

# Split data
# 80/20 hold-out split; the fixed random_state makes the shuffle reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print(f"\nüìä Data split:")
print(f"Training: {X_train.shape[0]} samples")
print(f"Testing: {X_test.shape[0]} samples")

# Scale the features
# NOTE(review): scale_features is not defined or imported in any visible cell, so
# these calls raise NameError on a fresh kernel. From the call sites it presumably
# returns (scaled_frame, scaler) and reuses `scaler` when fit_scaler=False - TODO
# confirm. The models in the training cell are fitted on the unscaled X_train/X_test.
X_train_scaled, feature_scaler = scale_features(X_train, fit_scaler=True)
X_test_scaled, _ = scale_features(X_test, fit_scaler=False, scaler=feature_scaler)

print(f"\n‚úÖ Data preparation complete!")
print(f"   Training features: {X_train_scaled.shape}")
print(f"   Test features: {X_test_scaled.shape}")

## 4. Model Training Pipeline

In [13]:
# Model training with MLflow tracking
def _regression_scores(y_true, y_pred):
    """Return ``(rmse, mae, r2)`` for one set of predictions."""
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    return rmse, mean_absolute_error(y_true, y_pred), r2_score(y_true, y_pred)


def train_and_log_model(model, model_name, X_train, X_test, y_train, y_test, **params):
    """Fit a model inside its own MLflow run and log parameters, metrics and the model.

    Args:
        model: Unfitted estimator exposing ``fit`` and ``predict``; it is fitted
            in place.
        model_name: Used as the MLflow run name and in the printed report.
        X_train, X_test: Feature sets for fitting and for hold-out evaluation.
        y_train, y_test: Matching targets.
        **params: Logged verbatim as MLflow run parameters. They are NOT applied
            to ``model``, so keep them in sync with the estimator's constructor.

    Returns:
        tuple: ``(model, metrics)`` where ``metrics`` holds 'train_rmse',
        'test_rmse', 'test_mae' and 'test_r2'.
    """
    with mlflow.start_run(run_name=model_name):
        # Log parameters
        mlflow.log_params(params)

        # Train model
        print(f"\nü§ñ Training {model_name}...")
        model.fit(X_train, y_train)

        # Score both splits with the same helper so the two cannot drift apart
        train_rmse, train_mae, train_r2 = _regression_scores(y_train, model.predict(X_train))
        test_rmse, test_mae, test_r2 = _regression_scores(y_test, model.predict(X_test))

        # Log metrics
        mlflow.log_metrics({
            "train_rmse": train_rmse,
            "test_rmse": test_rmse,
            "train_mae": train_mae,
            "test_mae": test_mae,
            "train_r2": train_r2,
            "test_r2": test_r2,
            # Generalisation gap: positive when the hold-out error exceeds the
            # training error. This was train - test before, which made the metric
            # *more negative* the more a model overfitted; runs logged with the
            # old sign are not directly comparable.
            "overfitting_rmse": test_rmse - train_rmse
        })

        # Log model
        mlflow.sklearn.log_model(model, "model")

        # Print results
        print(f"  üìä Results for {model_name}:")
        print(f"    Train RMSE: {train_rmse:.4f}")
        print(f"    Test RMSE:  {test_rmse:.4f}")
        print(f"    Test MAE:   {test_mae:.4f}")
        print(f"    Test R¬≤:    {test_r2:.4f}")

        return model, {
            'train_rmse': train_rmse,
            'test_rmse': test_rmse,
            'test_mae': test_mae,
            'test_r2': test_r2
        }

# Train multiple models
# The estimators are built up front; each one is then fitted and logged through
# train_and_log_model. Keyword arguments after the data splits are only recorded
# as MLflow run parameters and mirror the constructor arguments above them.
lr_model = LinearRegression()
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
dt_model = DecisionTreeRegressor(max_depth=10, random_state=42)

# 1. Linear Regression
lr_trained, lr_metrics = train_and_log_model(
    lr_model, "Linear_Regression", X_train, X_test, y_train, y_test,
    model_type="Linear Regression",
)

# 2. Random Forest
rf_trained, rf_metrics = train_and_log_model(
    rf_model, "Random_Forest", X_train, X_test, y_train, y_test,
    model_type="Random Forest", n_estimators=100, random_state=42,
)

# 3. Decision Tree
dt_trained, dt_metrics = train_and_log_model(
    dt_model, "Decision_Tree", X_train, X_test, y_train, y_test,
    model_type="Decision Tree", max_depth=10, random_state=42,
)

# Metrics per model, keyed by display name (insertion order is the training order).
models_results = {
    'Linear Regression': lr_metrics,
    'Random Forest': rf_metrics,
    'Decision Tree': dt_metrics,
}

# Display top feature importances
# `training_info` is not defined in any visible cell of this notebook (NameError on
# a fresh kernel), so the importances are read from the fitted Random Forest instead.
print(f"\nüîù Top 10 Feature Importances:")
top_features = (
    pd.DataFrame({
        'feature': X_train.columns,
        'importance': rf_trained.feature_importances_,
    })
    .sort_values('importance', ascending=False)
    .head(10)
    .reset_index(drop=True)
)
display(top_features)

## 5. Model Evaluation Functions

In [14]:
# Model comparison and visualization
def plot_model_comparison(models_results):
    """Tabulate and chart the hold-out metrics of every trained model.

    Args:
        models_results: Mapping of model name -> metrics dict. Each metrics
            dict must provide 'test_rmse', 'test_mae' and 'test_r2'.

    Returns:
        pd.DataFrame: One row per model and one column per metric (unrounded).
    """
    # Comparison table: models as rows
    summary = pd.DataFrame(models_results).T
    print("üìä Model Comparison Table:")
    display(summary.round(4))

    fig, axes = plt.subplots(1, 3, figsize=(15, 5))

    names = list(models_results.keys())
    bar_colors = ['skyblue', 'lightgreen', 'salmon']
    # One panel per metric: (metrics key, panel title, y-axis label)
    panels = (
        ('test_rmse', 'Test RMSE Comparison', 'RMSE'),
        ('test_mae', 'Test MAE Comparison', 'MAE'),
        ('test_r2', 'Test R¬≤ Comparison', 'R¬≤ Score'),
    )
    values = {
        metric: [models_results[name][metric] for name in names]
        for metric, _, _ in panels
    }

    for ax, (metric, title, ylabel) in zip(axes, panels):
        ax.bar(names, values[metric], color=bar_colors)
        ax.set_title(title)
        ax.set_ylabel(ylabel)
        ax.tick_params(axis='x', rotation=45)

    plt.tight_layout()
    plt.show()

    # The winner is the model with the lowest hold-out RMSE
    winner = min(models_results.keys(), key=lambda name: models_results[name]['test_rmse'])
    winner_rmse = models_results[winner]['test_rmse']

    print(f"\nüèÜ Best Model: {winner}")
    print(f"   Best Test RMSE: {winner_rmse:.4f}")

    return summary

# Compare models
comparison_df = plot_model_comparison(models_results)

# NOTE(review): everything below this point depends on names that no visible cell
# defines - `model`, evaluate_model, plot_evaluation_charts and
# calculate_feature_importance_permutation - and on X_train_scaled / X_test_scaled,
# which come from the likewise undefined scale_features. On a fresh kernel this
# raises NameError; confirm the intended source of these helpers.

# Comprehensive evaluation
evaluation_metrics = evaluate_model(
    model=model,
    X_test=X_test_scaled,
    y_test=y_test,
    X_train=X_train_scaled,
    y_train=y_train
)

# Create evaluation visualizations
plot_evaluation_charts(
    model=model,
    X_test=X_test_scaled,
    y_test=y_test,
    X_train=X_train_scaled,
    y_train=y_train,
    feature_names=X_train_scaled.columns.tolist()
)

# Calculate permutation importance
perm_importance = calculate_feature_importance_permutation(
    model=model,
    X_test=X_test_scaled,
    y_test=y_test,
    random_state=42
)

print(f"\nüîù Top 10 Permutation Feature Importances:")
display(perm_importance.head(10))

## 6. Prediction Pipeline

In [15]:
# Predictions and final evaluation
def make_predictions_plot(model, X_test, y_test, model_name):
    """Scatter a fitted model's predictions against the actual ring counts.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test: Features to predict on.
        y_test: Actual targets; must support ``.min()`` / ``.max()``.
        model_name: Shown in the plot title.

    Returns:
        None. The figure is shown via ``plt.show()``.
    """
    predicted = model.predict(X_test)
    low, high = y_test.min(), y_test.max()

    fig, ax = plt.subplots(figsize=(8, 6))
    ax.scatter(y_test, predicted, alpha=0.6, color='blue')
    # Dashed diagonal = perfect prediction
    ax.plot([low, high], [low, high], 'r--', lw=2)
    ax.set_xlabel('Actual Rings')
    ax.set_ylabel('Predicted Rings')
    ax.set_title(f'Actual vs Predicted - {model_name}')
    ax.grid(True, alpha=0.3)

    # Annotate the fit quality in the top-left corner (axes coordinates)
    score = r2_score(y_test, predicted)
    ax.text(0.05, 0.95, f'R¬≤ = {score:.3f}', transform=ax.transAxes,
            bbox=dict(boxstyle='round', facecolor='white', alpha=0.8))

    plt.show()

def preprocess_new_data(data: pd.DataFrame,
                       encoders: Dict,
                       scaler: StandardScaler) -> pd.DataFrame:
    """
    Preprocess new data for prediction using fitted artifacts.

    Args:
        data (pd.DataFrame): New data to preprocess. It is not modified.
        encoders (Dict): Fitted encoders; ``encoders['Sex']`` is used when present
            and must expose ``transform`` and ``classes_``.
        scaler (StandardScaler): Fitted scaler, handed to ``scale_features``.

    Returns:
        pd.DataFrame: Preprocessed data ready for prediction

    Raises:
        Whatever ``encoders['Sex'].transform`` raises for values it has not seen
        during fitting (this includes a missing 'Sex' value, which is not imputed).
    """
    print("üîß Preprocessing new data for prediction...")

    data_processed = data.copy()

    # Apply the same cleaning as training data (but don't remove outliers for new data)
    # Just handle missing values
    if data_processed.isnull().any().any():
        print("   Warning: Found missing values in new data")
        # numeric_only=True: the frame still holds the text 'Sex' column at this
        # point, and DataFrame.median() raises TypeError on non-numeric columns in
        # pandas >= 2.0.
        # NOTE(review): these are medians of the new batch, not of the training data.
        data_processed = data_processed.fillna(data_processed.median(numeric_only=True))

    # Apply categorical encoding
    if 'Sex' in data_processed.columns and 'Sex' in encoders:
        sex_encoder = encoders['Sex']

        # Encode Sex column
        data_processed['Sex_encoded'] = sex_encoder.transform(data_processed['Sex'])

        # Create dummy variables over the encoder's full class list, so a batch that
        # lacks a category (e.g. a single 'M' row) still gets every Sex_* column,
        # filled with zeros, and the feature layout does not depend on the batch.
        sex_dtype = pd.CategoricalDtype(categories=list(sex_encoder.classes_))
        sex_dummies = pd.get_dummies(data_processed['Sex'].astype(sex_dtype), prefix='Sex')
        data_processed = pd.concat([data_processed, sex_dummies], axis=1)
        data_processed = data_processed.drop('Sex', axis=1)

    # Apply feature engineering (same as training)
    data_processed = feature_engineering(data_processed)

    # Remove target column if it exists (for prediction on new data)
    if 'Rings' in data_processed.columns:
        data_processed = data_processed.drop('Rings', axis=1)

    # Apply scaling with the already-fitted scaler
    data_scaled, _ = scale_features(data_processed, fit_scaler=False, scaler=scaler)

    print(f"   Preprocessed {len(data_processed)} samples with {len(data_processed.columns)} features")

    return data_scaled

def predict_abalone_age(model: RandomForestRegressor,
                       data: pd.DataFrame,
                       encoders: Dict,
                       scaler: StandardScaler,
                       return_confidence: bool = False) -> Dict:
    """
    Predict ring counts and ages for raw abalone measurements.

    Args:
        model: Trained model
        data: New data (raw format)
        encoders: Fitted encoders, forwarded to ``preprocess_new_data``
        scaler: Fitted scaler, forwarded to ``preprocess_new_data``
        return_confidence: When True and the model has ``estimators_``, add the
            2.5th / 97.5th percentiles of the per-estimator predictions

    Returns:
        Dict: 'predictions', 'num_samples', 'features_used', 'age_years'
        (predictions + 1.5) and, if requested and available,
        'confidence_lower', 'confidence_upper', 'confidence_width'.
    """
    print(f"\nüîÆ Making predictions for {len(data)} samples...")

    # Bring the raw rows into the layout the model was trained on
    features = preprocess_new_data(data, encoders, scaler)
    rings = model.predict(features)

    results = {
        'predictions': rings,
        'num_samples': len(data),
        'features_used': list(features.columns)
    }

    wants_interval = return_confidence and hasattr(model, 'estimators_')
    if wants_interval:
        # The spread across the ensemble members gives a rough 95% band
        per_estimator = np.array([member.predict(features) for member in model.estimators_])
        band_low = np.percentile(per_estimator, 2.5, axis=0)
        band_high = np.percentile(per_estimator, 97.5, axis=0)

        results['confidence_lower'] = band_low
        results['confidence_upper'] = band_high
        results['confidence_width'] = band_high - band_low

    # Convert to age in years (rings + 1.5)
    ages = rings + 1.5
    results['age_years'] = ages

    print(f"   ‚úÖ Predictions complete!")
    print(f"   Predicted rings range: {rings.min():.1f} - {rings.max():.1f}")
    print(f"   Predicted age range: {ages.min():.1f} - {ages.max():.1f} years")

    return results

def create_sample_prediction_data() -> pd.DataFrame:
    """
    Build a three-row demo frame (one row each for Sex M, F and I).

    Returns:
        pd.DataFrame: Sample abalone measurements without the 'Rings' target
    """
    column_names = [
        'Sex', 'Length', 'Diameter', 'Height',
        'Whole weight', 'Shucked weight', 'Viscera weight', 'Shell weight',
    ]
    # One tuple per specimen, in the column order above
    specimens = [
        ('M', 0.455, 0.365, 0.095, 0.514, 0.2245, 0.101, 0.15),
        ('F', 0.53, 0.42, 0.135, 0.677, 0.2565, 0.1415, 0.21),
        ('I', 0.33, 0.255, 0.08, 0.205, 0.0895, 0.0395, 0.055),
    ]
    return pd.DataFrame(specimens, columns=column_names)

# Demonstrate prediction pipeline
print("\n" + "="*50)
print("üîÆ PREDICTION PIPELINE DEMO")
print("="*50)

# Create sample data for prediction
sample_data = create_sample_prediction_data()
print("Sample data for prediction:")
display(sample_data)

# Make predictions
# NOTE(review): `model` is not assigned in any visible cell, and `encoders` /
# `feature_scaler` are produced by helpers (encode_categorical_features,
# scale_features) that are not defined in view - on a fresh kernel this call
# raises NameError. Confirm which trained model and artifacts are meant here.
prediction_results = predict_abalone_age(
    model=model,
    data=sample_data,
    encoders=encoders,
    scaler=feature_scaler,
    return_confidence=True
)

# Display results
# The confidence columns fall back to None when the result holds no interval.
predictions_df = pd.DataFrame({
    'Sex': sample_data['Sex'],
    'Length': sample_data['Length'],
    'Predicted_Rings': prediction_results['predictions'],
    'Predicted_Age_Years': prediction_results['age_years'],
    'Confidence_Lower': prediction_results.get('confidence_lower', [None]*len(sample_data)),
    'Confidence_Upper': prediction_results.get('confidence_upper', [None]*len(sample_data))
})

print(f"\nüéØ Prediction Results:")
display(predictions_df.round(2))

## 7. Complete ML Pipeline

In [16]:
# Save the best model and create summary
import pickle
from pathlib import Path

# Find best model based on test RMSE
best_model_name = min(models_results.keys(), key=lambda x: models_results[x]['test_rmse'])
best_metrics = models_results[best_model_name]

# Explicit name -> fitted estimator lookup. An unexpected name now raises KeyError
# instead of silently falling through to the decision tree.
trained_models = {
    "Linear Regression": lr_trained,
    "Random Forest": rf_trained,
    "Decision Tree": dt_trained,
}
best_model = trained_models[best_model_name]

print(f"üèÜ Best performing model: {best_model_name}")
print(f"üìä Best model metrics:")
for metric, value in best_metrics.items():
    print(f"  {metric}: {value:.4f}")

# Save the best model
models_dir = Path('../models')
models_dir.mkdir(parents=True, exist_ok=True)

# Save model and preprocessor.
# NOTE: pickle files execute code when loaded - only load these from trusted locations.
with open(models_dir / 'best_model.pkl', 'wb') as f:
    pickle.dump(best_model, f)

with open(models_dir / 'label_encoder.pkl', 'wb') as f:
    pickle.dump(label_encoder, f)

print(f"\nüíæ Best model saved to: {models_dir}/best_model.pkl")

# Final summary
print(f"\n" + "="*60)
print(f"? EXPERIMENT SUMMARY")
print(f"="*60)
# X holds the predictors only; df.shape[1] also counted the 'Rings' target column,
# which overstated the feature count by one.
print(f"üìä Dataset: {df.shape[0]} samples, {X.shape[1]} features")
print(f"üîß Preprocessing: Simple label encoding of 'Sex' column")
# Derived from models_results so the line stays correct when models are added/removed.
print(f"ü§ñ Models trained: {len(models_results)} ({', '.join(models_results)})")
print(f"üèÜ Best model: {best_model_name}")
print(f"üìà Best Test RMSE: {best_metrics['test_rmse']:.4f}")
print(f"üìà Best Test R¬≤: {best_metrics['test_r2']:.4f}")
print(f"\nüí° To view experiments in MLflow UI:")
print(f"   1. Run: mlflow ui")
print(f"   2. Open: http://localhost:5000")
print(f"   3. Browse the 'abalone_age_prediction' experiment")
print(f"="*60)