In [0]:
# Install dependencies
!pip install -r /Workspace/Users/ashish.kamboj@tigeranalytics.com/home-credit-hyperpersonalization/requirements.txt

# Install Databricks Feature Engineering for Feature Store support
%pip install databricks-feature-engineering --quiet

print("✅ All dependencies installed")

[43mNote: you may need to restart the kernel using %restart_python or dbutils.library.restartPython() to use updated packages.[0m
[43mNote: you may need to restart the kernel using %restart_python or dbutils.library.restartPython() to use updated packages.[0m
✅ All dependencies installed


In [0]:
# Fix typing_extensions version issue
# The Sentinel class was added in typing_extensions 4.6.0
%pip install --upgrade typing_extensions>=4.6.0 pydantic>=2.0.0 --quiet

# Restart Python kernel to use updated packages
dbutils.library.restartPython()

[43mNote: you may need to restart the kernel using %restart_python or dbutils.library.restartPython() to use updated packages.[0m


In [0]:
import sys
import os
import logging
import importlib

# Add the parent of the current working directory to the import path so the
# `utils` package imported below can be found (presumably it lives there).
sys.path.append(os.path.abspath('../'))

from utils.common_utils import load_config, setup_logging, print_section_header, Timer, get_spark_session
from utils.data_loader import load_data_from_source
from utils import model_training
# Reload the module so edits made to it since the first import are picked up
# without restarting the kernel.
importlib.reload(model_training)
# NOTE: `prepare_training_data` imported here is shadowed by the notebook-local
# definition of the same name in the following cell.
from utils.model_training import (
    prepare_training_data, get_model, train_model, evaluate_model,
    get_feature_importance, plot_feature_importance, log_model_to_mlflow
)
import mlflow
import pandas as pd
import numpy as np

In [0]:
# Override prepare_training_data to fix the bug and keep interest rates as numeric
from typing import Dict, Any, Tuple
import pandas as pd
import logging
from sklearn.model_selection import train_test_split

def prepare_training_data(features_df: pd.DataFrame,
                         target_df: pd.DataFrame,
                         config: Dict[str, Any]) -> Tuple[pd.DataFrame, pd.DataFrame, pd.Series, pd.Series]:
    """
    Join features with targets, encode categoricals, and split into train/test.

    Notebook-local replacement for ``utils.model_training.prepare_training_data``:
    1. Object-typed columns whose name contains ``INTEREST_RATE`` are converted
       to numeric (unparseable values become NaN) instead of being one-hot encoded.
    2. The feature column list is rebuilt after one-hot encoding so the dummy
       columns are included and the encoded source columns are not referenced.

    Args:
        features_df: DataFrame with a ``CUSTOMERID`` column plus feature columns.
        target_df: DataFrame with ``CUSTOMERID`` and ``NEXT_PRODUCT_ID`` columns.
        config: Configuration dictionary. Reads ``config['model']['target_variable']``
            and ``config['model']['train_test_split']`` (``test_size``,
            ``random_state``, and optional ``stratify`` which defaults to True).

    Returns:
        Tuple of (X_train, X_test, y_train, y_test)

    Raises:
        ValueError: If no row has a non-null ``NEXT_PRODUCT_ID`` after the join.
    """
    logging.info("Preparing training data...")

    # Inner join: only customers present in both frames can be used.
    df = features_df.merge(target_df, on='CUSTOMERID', how='inner')

    # Remove customers without target (didn't purchase next product)
    df = df[df['NEXT_PRODUCT_ID'].notna()].copy()

    logging.info(f"Training dataset: {len(df)} samples with target")

    if df.empty:
        # Fail early with an actionable message instead of an opaque error
        # from the splitter further down.
        raise ValueError(
            "No training samples: no customer has both features and a non-null NEXT_PRODUCT_ID."
        )

    # Expose the label under the configured target name
    target_col = config['model']['target_variable']
    df[target_col] = df['NEXT_PRODUCT_ID']

    # Identifier and label columns must never be used as features
    id_cols = {'CUSTOMERID', 'PARTYID', 'NEXT_PRODUCT_ID', target_col}
    feature_cols = [col for col in df.columns if col not in id_cols]

    # Candidate columns for one-hot encoding
    categorical_cols = df[feature_cols].select_dtypes(include=['object', 'category']).columns.tolist()

    # Interest rates arrive as object dtype but are quantities: coerce to numeric
    interest_rate_cols = [col for col in categorical_cols if 'INTEREST_RATE' in col]
    if interest_rate_cols:
        logging.info(f"Converting {len(interest_rate_cols)} interest rate columns to numeric: {interest_rate_cols}")
        print(f"\n🔧 Converting {len(interest_rate_cols)} interest rate columns to numeric: {interest_rate_cols}")
        for col in interest_rate_cols:
            df[col] = pd.to_numeric(df[col], errors='coerce')
        # Remove from categorical list
        categorical_cols = [col for col in categorical_cols if col not in interest_rate_cols]

    if categorical_cols:
        logging.info(f"Encoding {len(categorical_cols)} categorical columns: {categorical_cols}")
        print(f"🔄 One-hot encoding {len(categorical_cols)} categorical columns: {categorical_cols}")
        df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

        # get_dummies replaces the encoded columns with new dummy columns, so
        # the feature list has to be rebuilt from the resulting frame.
        feature_cols = [col for col in df.columns if col not in id_cols]
        print(f"✅ Features after encoding: {len(feature_cols)} columns")

    X = df[feature_cols]
    y = df[target_col]

    print(f"\n✅ Final feature count: {X.shape[1]} (interest rates kept as numeric)")

    # Train-test split
    split_config = config['model']['train_test_split']
    test_size = split_config['test_size']
    random_state = split_config['random_state']
    # `stratify` is optional and defaults to True, matching how the calling
    # cell reads it (`.get('stratify', True)`); hard-indexing raised KeyError
    # when the key was omitted from the config.
    stratify = y if split_config.get('stratify', True) else None

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=stratify
    )

    logging.info(f"Train set: {len(X_train)} samples")
    logging.info(f"Test set: {len(X_test)} samples")
    logging.info(f"Number of features: {X_train.shape[1]}")
    logging.info(f"Number of classes: {y.nunique()}")

    return X_train, X_test, y_train, y_test

# Confirm that the notebook-local override is now the active definition.
for status_line in (
    "✅ Fixed prepare_training_data function loaded",
    "   - Interest rates will be kept as numeric (not one-hot encoded)",
    "   - Bug fix: feature_cols properly updated after encoding",
):
    print(status_line)

✅ Fixed prepare_training_data function loaded
   - Interest rates will be kept as numeric (not one-hot encoded)
   - Bug fix: feature_cols properly updated after encoding


In [0]:
from pathlib import Path

config = load_config('../config/config.yaml')

# Select the MLflow backend according to the configured environment mode.
if config['environment']['mode'] == 'databricks':
    # No tracking URI is set in this branch; only the experiment is selected.
    experiment_name = config['mlflow']['databricks']['experiment_name']
    mlflow.set_experiment(experiment_name)
    print('Using Databricks MLflow tracking.')
    print(f'Experiment: {experiment_name}')
else:
    # Store runs in an `mlruns` directory under the parent of the working directory.
    workspace_root = os.path.abspath('..')
    mlflow_tracking_dir = os.path.join(workspace_root, 'mlruns')

    # Build the file URI with pathlib so it is well-formed on every OS:
    # `file:///C:/...` on Windows and `file:///home/...` on POSIX. Prefixing
    # 'file:///' by hand produced `file:////...` for POSIX absolute paths.
    mlflow_tracking_uri = Path(mlflow_tracking_dir).as_uri()

    mlflow.set_tracking_uri(mlflow_tracking_uri)

    experiment_name = config['mlflow']['local'].get('experiment_name', 'default')
    mlflow.set_experiment(experiment_name)

    print(f"📂 MLflow tracking URI: {mlflow_tracking_uri}")
    print(f"📂 MLflow runs will be saved to: {mlflow_tracking_dir}")
    print(f"Experiment: {experiment_name}")

setup_logging(config)

print_section_header("Model Training")

# `mlflow_client` is the mlflow module itself (not an MlflowClient instance);
# it is passed to log_model_to_mlflow later in the notebook.
mlflow_client = mlflow

Using Databricks MLflow tracking.
Experiment: /Users/ashish.kamboj@tigeranalytics.com/next-best-product-recommendation

                                 Model Training                                 



## 1. Load Features and Target

In [0]:
use_unity_catalog = config['data_source']['type'] == 'unity_catalog'

# A Spark session is only needed when reading Unity Catalog tables.
spark = get_spark_session(config) if use_unity_catalog else None

if use_unity_catalog:
    uc_config = config['data_source']['unity_catalog']
    catalog = uc_config['catalog']
    output_schema = uc_config['output_schema']

    feature_store_table = f"{catalog}.{output_schema}.customer_features_fs"
    features_table = f"{catalog}.{output_schema}.customer_features"
    target_table = f"{catalog}.{output_schema}.customer_targets"

    # Reset on every run so a stale frame from an earlier execution of this
    # cell can never be mistaken for a successful Feature Store read.
    features_df = None

    # Only the Feature Store read is guarded. Previously the target-table read
    # was inside the same `try`, so a failure there was reported as a missing
    # Feature Store table and the same failing read was simply retried.
    try:
        from databricks.feature_engineering import FeatureEngineeringClient

        fe = FeatureEngineeringClient()

        print(f"📊 Loading features from Feature Store: {feature_store_table}")
        print(f"📂 Loading targets from: {target_table}")

        features_df = fe.read_table(name=feature_store_table).toPandas()
    except ImportError:
        # Feature Store client package is not installed
        print("⚠️ Feature Store client not available, using regular Unity Catalog tables")
    except Exception as e:
        # Deliberate best-effort fallback: any Feature Store failure (missing
        # table, permissions, client init) falls back to the regular table.
        print(f"⚠️ Feature Store read failed ({type(e).__name__}): {e}")
        print("   Using regular Unity Catalog tables instead")

    loaded_from_feature_store = features_df is not None

    if not loaded_from_feature_store:
        print(f"📂 Loading features from: {features_table}")
        print(f"📂 Loading targets from: {target_table}")
        features_df = spark.table(features_table).toPandas()

    # Targets always come from the regular table; errors here propagate as-is.
    target_df = spark.table(target_table).toPandas()

    if loaded_from_feature_store:
        print("✅ Using Databricks Feature Store for training")
else:
    # CSV mode
    features_path = os.path.abspath('../data/processed/customer_features.csv')
    target_path = os.path.abspath('../data/processed/customer_targets.csv')
    print(f"📂 Loading features from: {features_path}")
    print(f"📂 Loading targets from: {target_path}")

    features_df = pd.read_csv(features_path)
    target_df = pd.read_csv(target_path)

logging.info(f"Loaded features: {features_df.shape}")
logging.info(f"Loaded targets: {target_df.shape}")

print(f"\n✅ Features loaded: {features_df.shape}")
print(f"✅ Targets loaded: {target_df.shape}")

📊 Loading features from Feature Store: datafabric_catalog.ml_outputs.customer_features_fs
📂 Loading targets from: datafabric_catalog.ml_outputs.customer_targets
✅ Using Databricks Feature Store for training

✅ Features loaded: (1000, 43)
✅ Targets loaded: (467, 2)


## 2. Prepare Training Data

In [0]:
# Check if data is valid
print(f"Features shape: {features_df.shape}")
print(f"Targets shape: {target_df.shape}")
print(f"Customers with features: {features_df['CUSTOMERID'].nunique()}")
print(f"Customers with targets: {target_df['CUSTOMERID'].nunique()}")
print(f"Targets with NEXT_PRODUCT_ID: {target_df['NEXT_PRODUCT_ID'].notna().sum()}")

# Check for common customers
common_customers = set(features_df['CUSTOMERID']).intersection(set(target_df['CUSTOMERID']))
print(f"Common customers: {len(common_customers)}")

if len(common_customers) == 0:
    raise ValueError("No common customers between features and targets! Please run 02_feature_engineering.ipynb first.")

# Count classes only over targets that will survive the inner join on
# CUSTOMERID inside prepare_training_data: a class with two targets but a
# single joinable customer would still break a stratified split.
joinable_targets = target_df[target_df['CUSTOMERID'].isin(common_customers)]
target_counts = joinable_targets['NEXT_PRODUCT_ID'].value_counts()
classes_with_few_samples = target_counts[target_counts < 2].index.tolist()

if len(classes_with_few_samples) > 0:
    print(f"\n⚠️ Warning: Removing {len(classes_with_few_samples)} classes with < 2 samples: {classes_with_few_samples}")
    # Filter out these classes from target_df
    target_df = target_df[~target_df['NEXT_PRODUCT_ID'].isin(classes_with_few_samples)]
    print(f"Remaining targets: {target_df['NEXT_PRODUCT_ID'].notna().sum()}")

# Below this many labelled, joinable samples stratification is switched off.
MIN_SAMPLES_FOR_STRATIFY = 50
labelled_sample_count = (
    target_df.loc[target_df['CUSTOMERID'].isin(common_customers), 'NEXT_PRODUCT_ID'].notna().sum()
)

# Temporarily disable stratification if we still have too few samples
original_stratify = config['model']['train_test_split'].get('stratify', True)
if labelled_sample_count < MIN_SAMPLES_FOR_STRATIFY:
    print("\n⚠️ Warning: Too few samples for stratification. Disabling stratify.")
    config['model']['train_test_split']['stratify'] = False

try:
    X_train, X_test, y_train, y_test = prepare_training_data(features_df, target_df, config)
finally:
    # Always restore the shared config, even if data preparation raises, so a
    # failed run cannot leave stratification silently disabled for later cells.
    config['model']['train_test_split']['stratify'] = original_stratify

print(f"\n✅ Training set: {X_train.shape}")
print(f"✅ Test set: {X_test.shape}")

Features shape: (1000, 43)
Targets shape: (467, 2)
Customers with features: 1000
Customers with targets: 467
Targets with NEXT_PRODUCT_ID: 284
Common customers: 467

🔧 Converting 2 interest rate columns to numeric: ['MIN_INTEREST_RATE', 'MAX_INTEREST_RATE']
🔄 One-hot encoding 2 categorical columns: ['AGE_GROUP', 'TENURE_GROUP']
✅ Features after encoding: 46 columns

✅ Final feature count: 46 (interest rates kept as numeric)

✅ Training set: (227, 46)
✅ Test set: (57, 46)


## 3. Train Model

In [0]:
import re

with Timer("Model Training"):
    # Build the estimator described by the config
    model = get_model(config)

    # Fit on the training split
    model = train_model(model, X_train, y_train)

    # Score on the held-out split
    metrics = evaluate_model(model, X_test, y_test, config)

    # Feature importance, labelled with the training column names
    feature_importance = get_feature_importance(model, X_train.columns.tolist())

    # Save feature importance plot as PNG
    if not feature_importance.empty:
        # The models directory is derived from the EDA output path by swapping
        # the `eda` path component for `models`. Only whole path components
        # are replaced; a plain str.replace would also rewrite "eda" inside
        # unrelated names (e.g. a user directory such as ".../freda/...").
        eda_output_path = config.get('eda', {}).get('output_path', './outputs/models')
        models_output_path = re.sub(r'(?<![^/\\])eda(?![^/\\])', 'models', eda_output_path)
        if not os.path.isabs(models_output_path):
            # Relative paths are resolved against the parent of the working directory
            models_output_path = os.path.abspath(os.path.join('..', models_output_path))
        os.makedirs(models_output_path, exist_ok=True)

        plot_path = os.path.join(models_output_path, 'feature_importance.png')
        plot_feature_importance(feature_importance, plot_path, top_n=20, algorithm=config['model']['algorithm'])
        print(f"📊 Feature importance plot saved to: {plot_path}")

print("\n✅ Model training completed!")
print(f"\n📊 Model trained with {X_train.shape[1]} features (interest rates as numeric)")

📊 Feature importance plot saved to: /Workspace/Users/ashish.kamboj@tigeranalytics.com/home-credit-hyperpersonalization/outputs/models/feature_importance.png

✅ Model training completed!

📊 Model trained with 46 features (interest rates as numeric)


## 4. Log to MLflow

In [0]:
# Record the fitted model, its metrics and feature importances in MLflow;
# the helper returns the id of the run, which is printed below.
mlflow_log_args = (model, X_train, metrics, feature_importance, config, mlflow_client)
run_id = log_model_to_mlflow(*mlflow_log_args)

print(f"✅ Model logged to MLflow. Run ID: {run_id}")

🔗 View Logged Model at: https://adb-1364099644588382.2.azuredatabricks.net/ml/experiments/2220209055559251/models/m-b371b339c7294134826d5382f4029091?o=1364099644588382


📊 Feature importance saved to: /Workspace/Users/ashish.kamboj@tigeranalytics.com/home-credit-hyperpersonalization/outputs/models/feature_importance.csv
✅ Model logged to MLflow. Run ID: b5b7dcadd50b4c23aad9ace5dc793cf4


## 5. Model Summary

In [0]:
print_section_header("Model Training Summary")

# Headline facts about the run, printed as one block (leading and trailing
# blank lines included).
summary_lines = [
    "",
    f"Algorithm: {config['model']['algorithm']}",
    f"Training samples: {len(X_train)}",
    f"Test samples: {len(X_test)}",
    f"Number of features: {X_train.shape[1]}",
    f"Number of classes: {len(np.unique(y_train))}",
    "",
    "Model Performance Metrics:",
    "",
]
print("\n".join(summary_lines))

# Metrics whose name starts with 'top_' are reported in their own section.
top_k_metric_names = [name for name in metrics if name.startswith('top_')]

# NOTE(review): in the recorded run, top_1_accuracy (0.0000) disagrees with
# accuracy (0.1053) although both should measure exact-match rate — possibly a
# label-encoding mismatch inside evaluate_model; verify before trusting these.

# Display standard metrics
print("\n📊 Classification Metrics:")
for name, score in metrics.items():
    if name.startswith('top_') or score is None:
        continue
    print(f"  {name}: {score:.4f}")

# Display Top-K metrics separately
if top_k_metric_names:
    print("\n🎯 Top-K Accuracy (Recommendation Metrics):")
    print("  (How often is the true product in top-K recommendations?)")
    for name in top_k_metric_names:
        score = metrics[name]
        if score is None:
            continue
        # Metric names look like 'top_<k>_accuracy'; the second token is k.
        k_value = name.split('_')[1]
        print(f"  {name}: {score:.4f} ({score*100:.1f}% - true product in top-{k_value})")

print("\n📈 Feature Importance:")
print("Top 10 Important Features:")
print(feature_importance.head(10).to_string(index=False))

print(f"\nMLflow Run ID: {run_id}")

print("\n✅ Model training completed!")


                             Model Training Summary                             


Algorithm: xgboost
Training samples: 227
Test samples: 57
Number of features: 46
Number of classes: 10

Model Performance Metrics:


📊 Classification Metrics:
  accuracy: 0.1053
  precision_weighted: 0.1155
  recall_weighted: 0.1053
  f1_weighted: 0.1057
  roc_auc_ovr: 0.4963

🎯 Top-K Accuracy (Recommendation Metrics):
  (How often is the true product in top-K recommendations?)
  top_1_accuracy: 0.0000 (0.0% - true product in top-1)
  top_3_accuracy: 0.0000 (0.0% - true product in top-3)
  top_5_accuracy: 0.0000 (0.0% - true product in top-5)

📈 Feature Importance:
Top 10 Important Features:
                   feature  importance
  TXN_AMOUNT_STD_LONG_TERM    0.048479
 TXN_AMOUNT_MEAN_LONG_TERM    0.047423
  TXN_AMOUNT_MAX_LONG_TERM    0.036901
     TXN_COUNT_MEDIUM_TERM    0.033900
TXN_AMOUNT_MIN_MEDIUM_TERM    0.027192
  TXN_AMOUNT_SUM_LONG_TERM    0.024215
            TOTAL_ACCOUNTS    0.024166
TXN_A