In [0]:
!pip install -r /Workspace/Users/ashish.kamboj@tigeranalytics.com/home-credit-hyperpersonalization/requirements.txt
dbutils.library.restartPython()

Collecting pyarrow==20.0.0
  Downloading pyarrow-20.0.0-cp310-cp310-manylinux_2_28_x86_64.whl (42.3 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 42.3/42.3 MB 28.9 MB/s eta 0:00:00
Collecting mlflow==3.0.1
  Downloading mlflow-3.0.1-py3-none-any.whl (24.6 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 24.6/24.6 MB 19.6 MB/s eta 0:00:00
Installing collected packages: pyarrow, mlflow
  Attempting uninstall: pyarrow
    Found existing installation: pyarrow 22.0.0
    Uninstalling pyarrow-22.0.0:
      Successfully uninstalled pyarrow-22.0.0
  Attempting uninstall: mlflow
    Found existing installation: mlflow 3.6.0
    Uninstalling mlflow-3.6.0:
      Successfully uninstalled mlflow-3.6.0
Successfully installed mlflow-3.0.1 pyarrow-20.0.0
[43mNote: you may need to restart the kernel using %restart_python or dbutils.library.restartPython() to use updated packages.[0m


In [0]:
import sys
import os
import logging
sys.path.append(os.path.abspath('../'))

from utils.common_utils import (load_config, setup_logging, get_spark_session,
                                print_section_header, Timer, get_feature_store_client)
from utils.data_loader import load_all_tables, save_data_to_destination
from utils.feature_engineering import *

In [0]:
# Load the project configuration; the path is relative to this notebook's directory.
config = load_config('../config/config.yaml')
# Configure logging from the loaded configuration before any other step logs.
setup_logging(config)

# Print a banner marking the start of this notebook's output.
print_section_header("Feature Engineering")


                              Feature Engineering                               



## 1. Load Data

In [0]:
# A Spark session is only created for the Unity Catalog source; otherwise `spark` stays None.
source_type = config['data_source']['type']
spark = get_spark_session(config) if source_type == 'unity_catalog' else None

# For CSV input, temporarily point the loader at the raw-data folder resolved to an
# absolute path (relative to this notebook's directory).
if source_type == 'csv':
    csv_config = config['data_source']['csv']
    original_input_path = csv_config['input_path']
    csv_config['input_path'] = os.path.abspath('../data/raw')
    print(f"📂 Loading data from: {csv_config['input_path']}")

try:
    with Timer("Loading Data"):
        tables = load_all_tables(config, spark)
finally:
    # Always restore the configured path, even when loading fails. Otherwise a re-run
    # of this cell would capture the overridden path as the "original" one.
    if source_type == 'csv':
        csv_config['input_path'] = original_input_path

print(f"\n✅ Loaded {sum(t is not None for t in tables.values())} tables")


✅ Loaded 6 tables


In [0]:
# Create a wrapper function to handle optional address_df and column name conversion
from utils.feature_engineering import create_customer_demographic_features as _original_create_customer_demographic_features
import pandas as pd

def create_customer_demographic_features_wrapper(party_df, customer_df, address_df, reference_date):
    """
    Build per-customer demographic features from the party, customer and optional address tables.

    Column names of every input are upper-cased on a new frame, so the caller's
    DataFrames are left untouched.

    Args:
        party_df: DataFrame providing PARTYID and DATEOFBIRTH (any letter case).
        customer_df: DataFrame providing CUSTOMERID, PARTYID and CUSTOMERESTABLISHEDDATE;
            PRIMARYADDRESSID is also required when address_df is given.
        address_df: Optional DataFrame joined on ADDRESSID; pass None to skip address columns.
        reference_date: Anything accepted by pd.to_datetime; age and tenure are measured up to it.

    Returns:
        DataFrame with CUSTOMERID, PARTYID, AGE (rounded years, nullable Int64),
        CUSTOMER_TENURE_DAYS, AGE_GROUP and TENURE_GROUP. When address_df is given and the
        merged frame has a CITY column, whichever of CITY / STATE / POSTALCODE exist are
        appended.
    """
    # rename() returns a new frame, so the inputs keep their original column names.
    party = party_df.rename(columns=str.upper)
    customers = customer_df.rename(columns=str.upper)
    addresses = address_df.rename(columns=str.upper) if address_df is not None else None

    # One row per customer record, enriched with party (and optionally address) attributes.
    df = customers.merge(party, on='PARTYID', how='left')
    if addresses is not None:
        df = df.merge(addresses, left_on='PRIMARYADDRESSID', right_on='ADDRESSID', how='left')

    ref_date = pd.to_datetime(reference_date)

    # Age in years, rounded to the nearest integer; Int64 keeps missing birth dates as <NA>.
    df['DATEOFBIRTH'] = pd.to_datetime(df['DATEOFBIRTH'])
    age_years = (ref_date - df['DATEOFBIRTH']).dt.days / 365.25
    df['AGE'] = age_years.round().astype('Int64')

    # Tenure in whole days since the customer relationship was established.
    df['CUSTOMERESTABLISHEDDATE'] = pd.to_datetime(df['CUSTOMERESTABLISHEDDATE'])
    df['CUSTOMER_TENURE_DAYS'] = (ref_date - df['CUSTOMERESTABLISHEDDATE']).dt.days

    # The last bin is open-ended so that ages above 100 land in '65+' instead of
    # silently becoming NaN.
    df['AGE_GROUP'] = pd.cut(
        df['AGE'],
        bins=[0, 25, 35, 45, 55, 65, float('inf')],
        labels=['18-25', '26-35', '36-45', '46-55', '56-65', '65+'],
    )

    # Lower edge of -1 puts a tenure of 0 days in the first bucket; the upper edge is open-ended.
    df['TENURE_GROUP'] = pd.cut(
        df['CUSTOMER_TENURE_DAYS'],
        bins=[-1, 180, 365, 730, 1825, float('inf')],
        labels=['0-6M', '6M-1Y', '1-2Y', '2-5Y', '5Y+'],
    )

    feature_cols = ['CUSTOMERID', 'PARTYID', 'AGE', 'CUSTOMER_TENURE_DAYS',
                    'AGE_GROUP', 'TENURE_GROUP']

    # Address columns are optional: add only the ones actually present so that an
    # address table lacking STATE or POSTALCODE no longer raises a KeyError.
    if addresses is not None and 'CITY' in df.columns:
        feature_cols.extend(
            col for col in ('CITY', 'STATE', 'POSTALCODE') if col in df.columns
        )

    features = df[feature_cols].copy()

    logging.info(f"Created {len(feature_cols)} demographic features for {len(features)} customers")

    return features

# Rebind the public name so that subsequent cells call the notebook-local wrapper
# defined above rather than the function star-imported from utils.feature_engineering.
create_customer_demographic_features = create_customer_demographic_features_wrapper

print("✅ Wrapper function created for demographic features with column name conversion")

✅ Wrapper function created for demographic features with column name conversion


In [0]:
# Create wrapper functions for account and transaction features
from utils.feature_engineering import (
    create_account_features as _original_create_account_features,
    create_transaction_features as _original_create_transaction_features
)

def create_account_features_wrapper(customer_account_df, product_df, channel_df, reference_date):
    """
    Normalise the account inputs, then delegate to the imported create_account_features.

    Every input is copied with upper-cased column names, so the caller's frames are not
    modified. An integer ORIGINATIONDATE column is interpreted as Unix epoch seconds and
    converted to datetimes.

    Args:
        customer_account_df: Account table; must contain ORIGINATIONDATE (any letter case).
        product_df: Product table.
        channel_df: Channel table.
        reference_date: Passed through unchanged to the underlying function.

    Returns:
        Whatever the underlying create_account_features returns.
    """
    accounts = customer_account_df.copy()
    accounts.columns = accounts.columns.str.upper()

    # is_integer_dtype also matches nullable Int64, unsigned and narrower integer columns,
    # which the previous `dtype in ['int32', 'int64']` comparison silently skipped.
    if pd.api.types.is_integer_dtype(accounts['ORIGINATIONDATE']):
        accounts['ORIGINATIONDATE'] = pd.to_datetime(accounts['ORIGINATIONDATE'], unit='s')

    products = product_df.copy()
    products.columns = products.columns.str.upper()

    channels = channel_df.copy()
    channels.columns = channels.columns.str.upper()

    return _original_create_account_features(accounts, products, channels, reference_date)

def create_transaction_features_wrapper(transaction_df, customer_account_df, reference_date, lookback_periods):
    """
    Upper-case the column names of both inputs, then delegate to the imported
    create_transaction_features.

    The inputs are copied first, so the caller's DataFrames keep their original
    column names. reference_date and lookback_periods are passed through unchanged.
    """
    # set_axis returns a relabelled copy, leaving the caller's frame as it was.
    txn_upper = transaction_df.copy().set_axis(transaction_df.columns.str.upper(), axis=1)
    acct_upper = customer_account_df.copy().set_axis(customer_account_df.columns.str.upper(), axis=1)

    return _original_create_transaction_features(txn_upper, acct_upper, reference_date, lookback_periods)

# Rebind the public names so that subsequent cells call the notebook-local wrappers
# defined above rather than the functions star-imported from utils.feature_engineering.
create_account_features = create_account_features_wrapper
create_transaction_features = create_transaction_features_wrapper

print("✅ Wrapper functions created for account and transaction features")

✅ Wrapper functions created for account and transaction features


## 2. Create Features

In [0]:
# Feature-engineering settings shared by all feature builders below.
feature_settings = config['feature_engineering']
reference_date = feature_settings['reference_date']
lookback_periods = feature_settings['lookback_periods']

# Demographics: the address table is deliberately not supplied (None).
party_table, customer_table = tables['party'], tables['customer']
demo_features = create_customer_demographic_features(party_table, customer_table, None, reference_date)

# Account-level features from accounts, products and channels.
account_features = create_account_features(
    tables['customer_account'],
    tables['banking_product'],
    tables['channel'],
    reference_date,
)

# Transaction features over the configured look-back periods.
transaction_features = create_transaction_features(
    tables['transaction'],
    tables['customer_account'],
    reference_date,
    lookback_periods,
)

print("✅ Feature creation completed (excluding communication and document features)")

✅ Feature creation completed (excluding communication and document features)


## 3. Merge All Features

In [0]:
# Combine the three feature groups built above (demographic, account, transaction).
feature_frames = [demo_features, account_features, transaction_features]
all_features = merge_all_features(feature_frames)

# Fill gaps using the numeric / categorical strategies named in the config.
missing_strategy = config['feature_engineering']['missing_value_strategy']
all_features = handle_missing_values(
    all_features,
    missing_strategy['numeric'],
    missing_strategy['categorical'],
)

print(f"\n✅ Final feature set: {len(all_features)} customers, {len(all_features.columns)} features")


✅ Final feature set: 1000 customers, 43 features


  df[col] = df[col].fillna(mode_val[0])


## 4. Create Target Variable

In [0]:
# Reload utils.feature_engineering so that edits made to the file since the first import
# are picked up when this cell is re-run. A failed reload is logged and the previously
# imported create_target_variable stays in use.
import importlib
import inspect
try:
    import utils.feature_engineering as fe
    importlib.reload(fe)
    from utils.feature_engineering import create_target_variable
    logging.info("Reloaded utils.feature_engineering module successfully")
except Exception as e:
    logging.warning(f"Could not reload feature_engineering module: {e}")

# Work on an upper-cased copy so the loaded table itself is left untouched.
customer_account_upper = tables['customer_account'].copy()
customer_account_upper.columns = customer_account_upper.columns.str.upper()

# Integer ORIGINATIONDATE values are interpreted as Unix epoch seconds.
# is_integer_dtype also matches nullable Int64, unsigned and narrower integer columns,
# which the previous `dtype in ['int32', 'int64']` comparison silently skipped.
if pd.api.types.is_integer_dtype(customer_account_upper['ORIGINATIONDATE']):
    customer_account_upper['ORIGINATIONDATE'] = pd.to_datetime(customer_account_upper['ORIGINATIONDATE'], unit='s')

# Inspect the signature to find out whether this version of create_target_variable
# accepts training_ratio; if inspection fails, the legacy call is used.
supports_training_ratio = False
try:
    sig = inspect.signature(create_target_variable)
    supports_training_ratio = 'training_ratio' in sig.parameters
    logging.info(f"create_target_variable signature: {sig}")
except Exception as e:
    logging.warning(f"Could not inspect create_target_variable signature: {e}")

if supports_training_ratio:
    target_df = create_target_variable(
        customer_account_upper,
        reference_date,
        prediction_window_days=365,
        training_ratio=config['feature_engineering']['training_data_ratio']
    )
else:
    logging.warning("training_ratio parameter not found; falling back to legacy call without sampling control")
    # NOTE(review): this fallback uses a 90-day prediction window while the branch above
    # uses 365 days, so the target definition differs between branches - confirm intended.
    target_df = create_target_variable(
        customer_account_upper,
        reference_date,
        prediction_window_days=90
    )

logging.info(f"Target variable created for {len(target_df)} customers")



## 5. Save Features

In [0]:
# Install Databricks Feature Engineering package
# NOTE(review): the package version is not pinned, so the installed version can differ
# between runs; consider pinning it alongside the other requirements.
%pip install databricks-feature-engineering --quiet

# No automatic restart happens in this cell; the second message tells the reader how to
# restart Python manually if the new package is not picked up.
print("✅ databricks-feature-engineering installed")
print("⚠️ Note: Restart Python manually if needed: dbutils.library.restartPython()")

[43mNote: you may need to restart the kernel using %restart_python or dbutils.library.restartPython() to use updated packages.[0m
✅ databricks-feature-engineering installed
⚠️ Note: Restart Python manually if needed: dbutils.library.restartPython()


In [0]:
# Persist the feature and target frames through the project's save helper.
print("💾 Saving features and targets...")

save_data_to_destination(all_features, config, 'customer_features', spark)
print("✅ Features saved")

save_data_to_destination(target_df, config, 'customer_targets', spark)
print("✅ Targets saved")

# Register the features as a Databricks feature table (Unity Catalog + live Spark session only).
if config['data_source']['type'] == 'unity_catalog' and spark:
    try:
        from databricks.feature_engineering import FeatureEngineeringClient

        fe = FeatureEngineeringClient()

        uc_config = config['data_source']['unity_catalog']
        catalog = uc_config['catalog']
        output_schema = uc_config['output_schema']
        feature_store_table = f"{catalog}.{output_schema}.customer_features_fs"

        print(f"\n📊 Feature Store: {feature_store_table}")

        # Ask the catalog explicitly whether the table exists. The previous bare `except:`
        # treated every failure of spark.table() (permissions, connectivity, Ctrl-C) as
        # "table missing" and then attempted to create it. Any such failure now reaches
        # the generic handler below instead.
        if spark.catalog.tableExists(feature_store_table):
            existing = spark.table(feature_store_table)
            print("  ✅ Feature table already exists")
            print(f"  Rows: {existing.count()}")
            # NOTE(review): an existing feature table is not refreshed with all_features
            # here; the summary below reports the in-memory frame, not the stored table.
        else:
            print("  Creating new feature table...")
            features_spark_df = spark.createDataFrame(all_features)

            fe.create_table(
                name=feature_store_table,
                primary_keys=['CUSTOMERID'],
                df=features_spark_df,
                description="Customer features for next best product recommendation"
            )
            print("  ✅ Feature table created")

        print("\n✅ Feature Store ready!")
        print(f"   Table: {feature_store_table}")
        print("   Primary Key: CUSTOMERID")
        print(f"   Features: {len(all_features.columns)} columns")
        print(f"   Rows: {len(all_features)} customers")

    except ImportError:
        print("\n⚠️ databricks-feature-engineering not installed")
        print("   Run cell 14.5 to install the package")
    except Exception as e:
        # Best effort: the saves above already ran, so report the problem and carry on.
        print(f"\n⚠️ Feature Store error: {str(e)}")
        print("   Features are still saved to Unity Catalog tables")
else:
    print("\n⚠️ Feature Store only available for Unity Catalog")

print("\n✅ Feature engineering completed!")

💾 Saving features and targets...
✅ Features saved
✅ Targets saved

📊 Feature Store: datafabric_catalog.ml_outputs.customer_features_fs
  ✅ Feature table already exists
  Rows: 1000

✅ Feature Store ready!
   Table: datafabric_catalog.ml_outputs.customer_features_fs
   Primary Key: CUSTOMERID
   Features: 43 columns
   Rows: 1000 customers

✅ Feature engineering completed!
