In [0]:
# Fix typing_extensions version issue first
# The Sentinel class was added in typing_extensions 4.6.0
%pip install --upgrade typing_extensions>=4.6.0 pydantic>=2.0.0 --quiet

# Install Azure dependencies for Unity Catalog
%pip install azure-storage-file-datalake azure-identity azure-core --quiet

# Install Databricks Feature Engineering for Feature Store
%pip install databricks-feature-engineering --quiet

# Install project requirements
!pip install -r /Workspace/Users/ashish.kamboj@tigeranalytics.com/home-credit-hyperpersonalization/requirements.txt

# Restart Python kernel to use updated packages
dbutils.library.restartPython()

[43mNote: you may need to restart the kernel using %restart_python or dbutils.library.restartPython() to use updated packages.[0m
[43mNote: you may need to restart the kernel using %restart_python or dbutils.library.restartPython() to use updated packages.[0m
[43mNote: you may need to restart the kernel using %restart_python or dbutils.library.restartPython() to use updated packages.[0m
[43mNote: you may need to restart the kernel using %restart_python or dbutils.library.restartPython() to use updated packages.[0m


In [0]:
import sys
import os
import logging
import pandas as pd

# Dynamically determine project root
project_root = os.path.dirname(os.getcwd()) if os.getcwd().endswith('notebooks') else os.getcwd()
if project_root not in sys.path:
    sys.path.insert(0, project_root)

# Import utils modules
from utils.common_utils import load_config, setup_logging, get_spark_session, print_section_header, Timer
from utils.data_loader import load_data_from_source
from utils.model_deployment import load_model_from_registry, batch_predict, save_predictions

import mlflow

print("✅ All modules imported successfully")

✅ All modules imported successfully


In [0]:
config = load_config('../config/config.yaml')
setup_logging(config)

print_section_header("Batch Inference")


                                Batch Inference                                 



## 1. Load Production Model

In [0]:
model_name = config['mlflow']['databricks']['registered_model_name'] if config['environment']['mode'] == 'databricks' else config['mlflow']['local']['registered_model_name']

# Set MLflow tracking URI to correct location (workspace root, not notebooks folder)
import os
env_mode = config['environment']['mode']
if env_mode == 'databricks':
    tracking_uri = config['mlflow']['databricks']['tracking_uri']
else:
    # For local mode, resolve to workspace root
    tracking_uri_relative = config['mlflow']['local']['tracking_uri']
    cwd = os.getcwd()
    if cwd.endswith('notebooks'):
        workspace_root = os.path.dirname(cwd)
    else:
        workspace_root = cwd
    tracking_path = os.path.join(workspace_root, tracking_uri_relative.lstrip('./'))
    tracking_uri = 'file:///' + tracking_path.replace('\\', '/')

mlflow.set_tracking_uri(tracking_uri)
print(f"🔍 MLflow tracking URI: {tracking_uri}")

# Create MLflow client
mlflow_client = mlflow.MlflowClient()

if env_mode == 'databricks':
    # Unity Catalog: use aliases instead of stages
    print(f"\nChecking Unity Catalog model: {model_name}")
    
    # Try to get model version by 'production' alias
    try:
        production_version = mlflow_client.get_model_version_by_alias(model_name, 'production')
        print(f"✅ Found model with 'production' alias: Version {production_version.version}")
        
        # Load model using sklearn loader to get predict_proba method
        model_uri = f"models:/{model_name}@production"
        model = mlflow.sklearn.load_model(model_uri)
        print(f"✅ Model loaded successfully from alias 'production' (sklearn model)")
        
    except Exception as e:
        # If no production alias, show available versions and aliases
        print(f"❌ No model found with 'production' alias")
        print(f"\nAvailable model versions:")
        
        all_versions = mlflow_client.search_model_versions(f"name='{model_name}'")
        for v in sorted(all_versions, key=lambda x: int(x.version)):
            version_details = mlflow_client.get_model_version(model_name, v.version)
            aliases = version_details.aliases if hasattr(version_details, 'aliases') else []
            print(f"  Version {v.version}: Aliases={aliases}, Status={v.status}")
        
        error_msg = f"""
❌ No model found with 'production' alias!
   
   To set the 'production' alias:
   1. Go to 04_model_deployment.ipynb
   2. Run cell 15 (manual promotion) to set 'production' alias
   OR
   3. Run cell 6 above in this notebook to quick-promote
   
   Batch inference REQUIRES a 'production' alias for safety and governance.
"""
        print(error_msg)
        raise ValueError("No Production model available. Please set 'production' alias first.")
        
else:
    # Local MLflow: use traditional stages
    versions = mlflow_client.get_latest_versions(model_name)
    production_versions = [v for v in versions if v.current_stage == 'Production']
    
    print(f"\nAvailable model versions:")
    for v in versions:
        print(f"  Version {v.version}: Stage={v.current_stage}")
    
    if not production_versions:
        error_msg = f"""
❌ No model found in Production stage!
   
   Available versions: {[(v.version, v.current_stage) for v in versions]}
   
   To promote a model to Production:
   1. Go to 04_model_deployment.ipynb
   2. Run the manual promotion cell (section 3c)
   OR
   3. Run cell 6 above in this notebook to quick-promote from Staging
   
   Batch inference REQUIRES a Production model for safety and governance.
"""
        print(error_msg)
        raise ValueError("No Production model available. Please promote a model to Production first.")
    
    print(f"\n✅ Found model in Production stage (Version {production_versions[0].version})")
    model = load_model_from_registry(model_name, stage='Production', config=config)
    print(f"✅ Model loaded successfully from registry")

🔍 MLflow tracking URI: databricks

Checking Unity Catalog model: datafabric_catalog.customer_hc_silver.next_best_product_model
✅ Found model with 'production' alias: Version 5


Downloading artifacts:   0%|          | 0/12 [00:00<?, ?it/s]

✅ Model loaded successfully from alias 'production' (sklearn model)


## 2. Load Customer Features and Filter for Inference

In [0]:
spark = None
if config['data_source']['type'] == 'unity_catalog':
    spark = get_spark_session(config)

# Load features from Feature Store and targets from Unity Catalog
if config['data_source']['type'] == 'unity_catalog':
    uc_config = config['data_source']['unity_catalog']
    catalog = uc_config['catalog']
    output_schema = uc_config['output_schema']
    
    # Load features from Feature Store
    feature_store_table = f"{catalog}.{output_schema}.customer_features_fs"
    target_table = f"{catalog}.{output_schema}.customer_targets"
    
    print(f"📊 Loading features from Feature Store: {feature_store_table}")
    print(f"📂 Loading targets from: {target_table}")
    
    try:
        from databricks.feature_engineering import FeatureEngineeringClient
        
        fe = FeatureEngineeringClient()
        
        # Read features from Feature Store
        features_spark_df = fe.read_table(name=feature_store_table)
        features_df = features_spark_df.toPandas()
        
        print(f"✅ Loaded features from Feature Store")
        
    except ImportError:
        print(f"⚠️ Feature Engineering client not available, falling back to direct table read")
        features_table = f"{catalog}.{output_schema}.customer_features"
        features_df = spark.table(features_table).toPandas()
        print(f"✅ Loaded features from Unity Catalog table: {features_table}")
    
    except Exception as e:
        print(f"⚠️ Error reading from Feature Store: {str(e)}")
        print(f"   Falling back to direct table read")
        features_table = f"{catalog}.{output_schema}.customer_features"
        features_df = spark.table(features_table).toPandas()
        print(f"✅ Loaded features from Unity Catalog table: {features_table}")
    
    # Load targets from Unity Catalog
    target_df = spark.table(target_table).toPandas()
    
else:
    # CSV mode
    features_path = os.path.abspath('../data/processed/customer_features.csv')
    target_path = os.path.abspath('../data/processed/customer_targets.csv')
    print(f"📂 Loading features from: {features_path}")
    print(f"📂 Loading targets from: {target_path}")
    
    features_df = pd.read_csv(features_path)
    target_df = pd.read_csv(target_path)

print(f"✅ Loaded features for {len(features_df)} customers")

# Filter to ONLY customers who DON'T have a target (never purchased next product)
# These customers were NOT used in model training - true inference scenario
customers_with_target = target_df[target_df['NEXT_PRODUCT_ID'].notna()]['CUSTOMERID'].unique()
features_df = features_df[~features_df['CUSTOMERID'].isin(customers_with_target)].copy()

print(f"✅ Filtered to {len(features_df)} customers WITHOUT target (true inference set)")
print(f"   These customers were NEVER used in model training")

📊 Loading features from Feature Store: datafabric_catalog.ml_outputs.customer_features_fs
📂 Loading targets from: datafabric_catalog.ml_outputs.customer_targets
✅ Loaded features from Feature Store
✅ Loaded features for 1000 customers
✅ Filtered to 716 customers WITHOUT target (true inference set)
   These customers were NEVER used in model training


## 3. Generate Predictions

In [0]:
# Prepare features (match training format)
customer_ids = features_df['CUSTOMERID']
X = features_df.drop(['CUSTOMERID', 'PARTYID'], axis=1, errors='ignore')

# Handle categorical encoding (same as training)
categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
if categorical_cols:
    X = pd.get_dummies(X, columns=categorical_cols, drop_first=True)

print(f"Features after encoding: {X.shape}")

# Align features with training data
# Get expected feature names from the model
if hasattr(model, 'feature_names_in_'):
    expected_features = model.feature_names_in_
    print(f"Model expects {len(expected_features)} features")
    
    # Add missing columns with 0 values
    missing_cols = set(expected_features) - set(X.columns)
    if missing_cols:
        print(f"Adding {len(missing_cols)} missing columns with 0 values")
        for col in missing_cols:
            X[col] = 0
    
    # Remove extra columns not in training
    extra_cols = set(X.columns) - set(expected_features)
    if extra_cols:
        print(f"Removing {len(extra_cols)} extra columns not in training")
        X = X.drop(columns=list(extra_cols))
    
    # Reorder columns to match training
    X = X[expected_features]
    print(f"Features aligned: {X.shape}")
else:
    print("⚠️ Warning: Model doesn't have feature_names_in_ attribute. Proceeding without alignment.")

# Generate top-K predictions
with Timer("Batch Prediction"):
    top_k = config['deployment']['batch_inference']['top_k_recommendations']
    predictions_df = batch_predict(model, X, customer_ids, config, top_k=top_k)

print(f"\n✅ Generated {len(predictions_df)} recommendations for {len(customer_ids)} customers")

Features after encoding: (716, 298)
Model expects 46 features
Adding 2 missing columns with 0 values
Removing 254 extra columns not in training
Features aligned: (716, 46)

✅ Generated 2148 recommendations for 716 customers


In [0]:
# Load products table
if config['data_source']['type'] == 'unity_catalog':
    products_df = load_data_from_source(config, 'banking_product', spark)
else:
    products_path = os.path.abspath('../data/raw/banking_product.csv')
    products_df = pd.read_csv(products_path)

products_df.columns = products_df.columns.str.upper()

# Create mapping: class label (0-9) -> actual product ID (101-110)
unique_product_ids = sorted(target_df['NEXT_PRODUCT_ID'].dropna().unique())
class_to_product_id = {i: int(pid) for i, pid in enumerate(unique_product_ids)}

# Map predicted class labels to actual product IDs
predictions_df['PREDICTED_PRODUCT_ID'] = predictions_df['PREDICTED_PRODUCT_ID'].map(class_to_product_id)

# Merge with products to get product names
predictions_df = predictions_df.merge(
    products_df[['PRODUCTID', 'PRODUCTNAME']], 
    left_on='PREDICTED_PRODUCT_ID', 
    right_on='PRODUCTID', 
    how='left'
).drop(columns=['PRODUCTID'])

# Reorder columns
column_order = ['CUSTOMERID', 'RANK', 'PREDICTED_PRODUCT_ID', 'PRODUCTNAME', 'PROBABILITY', 'PREDICTION_TIMESTAMP']
predictions_df = predictions_df[column_order]

print(f"✅ Product names added to {len(predictions_df)} predictions")
print(f"\nSample recommendations:")
print(predictions_df.head(9).to_string(index=False))

✅ Product names added to 2148 predictions

Sample recommendations:
 CUSTOMERID  RANK  PREDICTED_PRODUCT_ID            PRODUCTNAME  PROBABILITY       PREDICTION_TIMESTAMP
       1001     1                   103     60-Month Auto Loan     0.270500 2025-12-02 15:18:13.150230
       1001     2                   108           RV/Boat Loan     0.180267 2025-12-02 15:18:13.150230
       1001     3                   107 Student Refinance Loan     0.162612 2025-12-02 15:18:13.150230
       1002     1                   101 30-Year Fixed Mortgage     0.213071 2025-12-02 15:18:13.150230
       1002     2                   103     60-Month Auto Loan     0.186766 2025-12-02 15:18:13.150230
       1002     3                   107 Student Refinance Loan     0.147714 2025-12-02 15:18:13.150230
       1003     1                   108           RV/Boat Loan     0.663767 2025-12-02 15:18:13.150230
       1003     2                   107 Student Refinance Loan     0.083175 2025-12-02 15:18:13.150230
      

## 4. Save Predictions

In [0]:
# Save predictions to outputs/predictions folder
import os
from utils.data_loader import save_data_to_destination

# Resolve to workspace root
workspace_root = os.path.dirname(os.getcwd()) if os.getcwd().endswith('notebooks') else os.getcwd()
predictions_output_path = os.path.join(workspace_root, 'outputs', 'predictions')
os.makedirs(predictions_output_path, exist_ok=True)

# Save predictions
if config['data_source']['type'] == 'unity_catalog':
    save_predictions(predictions_df, config, spark)
    print(f"✅ Predictions saved to Unity Catalog")
else:
    save_data_to_destination(predictions_df, config, 'product_recommendations', 
                             spark=None, mode='overwrite', output_path=predictions_output_path)
    print(f"✅ Predictions saved to: {os.path.join(predictions_output_path, 'product_recommendations.csv')}")

✅ Predictions saved to Unity Catalog


## 5. Prediction Summary

In [0]:
print_section_header("Batch Inference Summary")

print(f"""
Total Customers Scored: {len(customer_ids)}
Total Recommendations: {len(predictions_df)}
Top-K per Customer: {top_k}

Sample Recommendations:
""")

print(predictions_df.head(10).to_string(index=False))

print(f"\n✅ Batch inference completed!")


                            Batch Inference Summary                             


Total Customers Scored: 716
Total Recommendations: 2148
Top-K per Customer: 3

Sample Recommendations:

 CUSTOMERID  RANK  PREDICTED_PRODUCT_ID            PRODUCTNAME  PROBABILITY       PREDICTION_TIMESTAMP
       1001     1                   103     60-Month Auto Loan     0.270500 2025-12-02 15:18:13.150230
       1001     2                   108           RV/Boat Loan     0.180267 2025-12-02 15:18:13.150230
       1001     3                   107 Student Refinance Loan     0.162612 2025-12-02 15:18:13.150230
       1002     1                   101 30-Year Fixed Mortgage     0.213071 2025-12-02 15:18:13.150230
       1002     2                   103     60-Month Auto Loan     0.186766 2025-12-02 15:18:13.150230
       1002     3                   107 Student Refinance Loan     0.147714 2025-12-02 15:18:13.150230
       1003     1                   108           RV/Boat Loan     0.663767 2025-12-02 15:1