In [0]:
!pip install -r /Workspace/Users/ashish.kamboj@tigeranalytics.com/home-credit-hyperpersonalization/requirements.txt
dbutils.library.restartPython()


Collecting pyarrow==20.0.0
  Downloading pyarrow-20.0.0-cp310-cp310-manylinux_2_28_x86_64.whl (42.3 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 42.3/42.3 MB 53.6 MB/s eta 0:00:00
Collecting mlflow==3.0.1
  Downloading mlflow-3.0.1-py3-none-any.whl (24.6 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 24.6/24.6 MB 101.9 MB/s eta 0:00:00
Collecting mlflow-skinny==3.0.1
  Downloading mlflow_skinny-3.0.1-py3-none-any.whl (1.8 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.8/1.8 MB 97.4 MB/s eta 0:00:00
Installing collected packages: pyarrow, mlflow-skinny, mlflow
  Attempting uninstall: pyarrow
    Found existing installation: pyarrow 22.0.0
    Uninstalling pyarrow-22.0.0:
      Successfully uninstalled pyarrow-22.0.0
  Attempting uninstall: mlflow-skinny
    Found existing installation: mlflow-skinny 3.6.0
    Uninstalling mlflow-skinny-3.6.0:
      Successfully uninstalled mlflow-skinny-3.6.0
  Attempting uninstall: mlflow
    Found existing installation: mlflow 3.6.0
    Un

## 1. Setup and Configuration

In [0]:
import sys
import os
import logging
import pandas as pd
import numpy as np

sys.path.append(os.path.abspath('../'))

from utils.common_utils import load_config, setup_logging, get_spark_session, print_section_header, Timer
from utils.data_loader import load_all_tables
from utils.eda_utils import (
    create_data_summary_report, analyze_missing_values, analyze_numeric_columns,
    plot_numeric_distributions, analyze_categorical_columns, plot_correlation_matrix,
    analyze_target_variable, create_eda_summary
)
from utils.feature_engineering import create_target_variable

In [0]:
# Read the project configuration and initialise logging from it.
config = load_config('../config/config.yaml')
setup_logging(config)

print_section_header("Exploratory Data Analysis")

# Every EDA artefact is written below `output_path`. The relative path is
# resolved against the current working directory, so this assumes the notebook
# runs from a folder one level below the project root.
output_path = os.path.abspath('../outputs/eda')
os.makedirs(output_path, exist_ok=True)

# Announce the destination once on stdout and once in the log.
saved_to_message = f"EDA outputs will be saved to: {output_path}"
print(f"📂 {saved_to_message}")
logging.info(saved_to_message)


                           Exploratory Data Analysis                            

📂 EDA outputs will be saved to: /Workspace/Users/ashish.kamboj@tigeranalytics.com/home-credit-hyperpersonalization/outputs/eda


## 2. Load Data

In [0]:
# Backend the loader reads from, as declared in the configuration.
source_type = config['data_source']['type']

# A Spark session is only created for the Unity Catalog source; the CSV path
# passes None through to the loader.
spark = get_spark_session(config) if source_type == 'unity_catalog' else None

# For CSV input, temporarily point the loader at data/raw resolved against the
# current working directory (one level up from the notebook folder).
if source_type == 'csv':
    original_input_path = config['data_source']['csv']['input_path']
    config['data_source']['csv']['input_path'] = os.path.abspath('../data/raw')
    print(f"📂 Loading data from: {config['data_source']['csv']['input_path']}")

try:
    # Load all tables
    with Timer("Loading Data"):
        tables = load_all_tables(config, spark)
finally:
    # Always put the configured path back, even when loading fails, so a
    # re-run of this or a later cell never sees the temporary override.
    if source_type == 'csv':
        config['data_source']['csv']['input_path'] = original_input_path

# Only tables that were actually returned (not None) are counted.
print(f"\n✅ Loaded {sum(t is not None for t in tables.values())} tables")


✅ Loaded 6 tables


## 3. Overall Data Summary

In [0]:
# Build the per-table summary frame; the helper also receives the EDA output
# directory (presumably to persist the report there - confirm in eda_utils).
with Timer("Creating Data Summary"):
    summary_df = create_data_summary_report(tables, output_path)

# Plain-text rendering without the index column.
summary_text = summary_df.to_string(index=False)
print("\n📊 Data Summary:", summary_text, sep="\n")


📊 Data Summary:
      table_name  num_rows  num_columns  memory_mb  missing_cells  missing_percentage
           party      1000            8   0.345090              0                 0.0
        customer      1000            5   0.073555              0                 0.0
customer_account       950           10   0.241652              0                 0.0
 banking_product        10            7   0.001625              0                 0.0
         channel         7            4   0.001083              0                 0.0
     transaction      1000            6   0.077370              0                 0.0


## 4. Missing Value Analysis

In [0]:
print_section_header("Missing Value Analysis")

# One per-column statistics frame per loaded table; merged into a single CSV below.
all_missing_stats = []

for table_name, df in tables.items():
    # Tables that were not loaded are represented as None and skipped.
    if df is None:
        continue

    # Null count per column, reused for both the count and the percentage.
    null_counts = df.isnull().sum()
    missing_stats = pd.DataFrame({
        'table': table_name,
        'column': df.columns,
        'missing_count': null_counts.values,
        'missing_percentage': (null_counts / len(df) * 100).values,
        'dtype': df.dtypes.values
    })
    all_missing_stats.append(missing_stats)

    # Only columns that actually contain nulls are listed on screen.
    has_missing = missing_stats.loc[missing_stats['missing_percentage'] > 0]
    if has_missing.empty:
        print(f"\n{table_name.upper()}: No missing values")
    else:
        print(f"\n{table_name.upper()}:")
        print(has_missing[['column', 'missing_count', 'missing_percentage']].to_string(index=False))

# Persist a single report holding only the columns with nulls, ordered by table
# and then by descending missing percentage. With no nulls anywhere the CSV
# contains just the header row.
if all_missing_stats:
    consolidated_missing = (
        pd.concat(all_missing_stats, ignore_index=True)
        .loc[lambda stats: stats['missing_percentage'] > 0]
        .sort_values(['table', 'missing_percentage'], ascending=[True, False])
    )

    os.makedirs(output_path, exist_ok=True)
    missing_path = os.path.join(output_path, 'all_missing_values.csv')
    consolidated_missing.to_csv(missing_path, index=False)
    print("\n✅ Saved consolidated missing values report to: all_missing_values.csv")


                             Missing Value Analysis                             


PARTY: No missing values

CUSTOMER: No missing values

CUSTOMER_ACCOUNT: No missing values

BANKING_PRODUCT: No missing values

CHANNEL: No missing values

TRANSACTION: No missing values

✅ Saved consolidated missing values report to: all_missing_values.csv


## 5. Numeric Column Analysis

In [0]:
print_section_header("Numeric Column Analysis")

# describe() output per table, one row per numeric column; merged into one CSV below.
all_numeric_stats = []

# Fixed column order for both the on-screen preview and the saved report.
cols = ['table', 'column', 'count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max']

for table_name, df in tables.items():
    # Tables that were not loaded are represented as None and skipped.
    if df is None:
        continue

    numeric_cols = df.select_dtypes(include=[np.number]).columns
    if len(numeric_cols) == 0:
        continue

    # Transpose describe() so each numeric column becomes a row, move the
    # column name out of the index, and tag every row with its table.
    stats = (
        df[numeric_cols]
        .describe()
        .T
        .rename_axis('column')
        .reset_index()
        .assign(table=table_name)
    )[cols]

    all_numeric_stats.append(stats)

    # The preview is simply the first five numeric columns in table order;
    # no ranking is applied despite the "Top 5" label.
    print(f"\n{table_name.upper()} - Top 5 Numeric Columns:")
    print(stats.loc[:, ['column', 'mean', 'std', 'min', 'max']].head().to_string(index=False))

# Write every table's statistics into one CSV.
if all_numeric_stats:
    consolidated_numeric = pd.concat(all_numeric_stats, ignore_index=True)

    os.makedirs(output_path, exist_ok=True)
    numeric_path = os.path.join(output_path, 'all_numeric_statistics.csv')
    consolidated_numeric.to_csv(numeric_path, index=False)
    print("\n✅ Saved consolidated numeric statistics to: all_numeric_statistics.csv")


                            Numeric Column Analysis                             


PARTY - Top 5 Numeric Columns:
          column   mean        std   min    max
         PartyId 800.50 288.819436 301.0 1300.0
PrimaryAddressId 510.46   5.861293 501.0  520.0

CUSTOMER - Top 5 Numeric Columns:
        column     mean        std    min    max
    CustomerId 1500.500 288.819436 1001.0 2000.0
       PartyId  800.500 288.819436  301.0 1300.0
CustomerTypeId    2.463   1.089871    1.0    4.0

CUSTOMER_ACCOUNT - Top 5 Numeric Columns:
           column         mean          std          min          max
CustomerAccountId 5.501469e+03 2.881161e+02       5001.0       6000.0
       CustomerId 1.498945e+03 2.904275e+02       1002.0       2000.0
        ProductId 1.054368e+02 2.875299e+00        101.0        110.0
        ChannelId 3.952632e+00 2.015971e+00          1.0          7.0
  OriginationDate 1.662607e+09 2.334919e+07 1621306802.0 1702465338.0

BANKING_PRODUCT - Top 5 Numeric Columns:
     

## 6. Categorical Column Analysis

In [0]:
print_section_header("Categorical Column Analysis")

# One dict per categorical column across all tables; saved as a single CSV below.
all_categorical_stats = []

for table_name, df in tables.items():
    # Tables that were not loaded are represented as None and skipped.
    if df is None:
        continue

    categorical_cols = df.select_dtypes(include=['object', 'category']).columns
    if len(categorical_cols) == 0:
        continue

    n_rows = len(df)
    # Rows for this table only, so the preview needs no re-scan of the global list.
    table_rows = []

    for col in categorical_cols:
        modes = df[col].mode()
        # value_counts() drops nulls by default, so it is empty for a column
        # that is entirely null even though the column itself has rows.
        # Guarding on the counts (not on the column length) avoids an
        # IndexError from .iloc[0] in that case.
        counts = df[col].value_counts()
        top_freq = counts.iloc[0] if len(counts) > 0 else 0

        table_rows.append({
            'table': table_name,
            'column': col,
            'unique_values': df[col].nunique(),
            'top_value': modes.iloc[0] if len(modes) > 0 else None,
            'top_frequency': top_freq,
            'top_percentage': (top_freq / n_rows * 100) if n_rows > 0 else 0
        })

    all_categorical_stats.extend(table_rows)

    print(f"\n{table_name.upper()}:")
    table_stats = pd.DataFrame(table_rows)
    print(table_stats[['column', 'unique_values', 'top_value', 'top_frequency']].to_string(index=False))

# Save consolidated categorical statistics
if all_categorical_stats:
    consolidated_categorical = pd.DataFrame(all_categorical_stats)

    os.makedirs(output_path, exist_ok=True)
    cat_path = os.path.join(output_path, 'all_categorical_statistics.csv')
    consolidated_categorical.to_csv(cat_path, index=False)
    print("\n✅ Saved consolidated categorical statistics to: all_categorical_statistics.csv")


                          Categorical Column Analysis                           


PARTY:
        column  unique_values          top_value  top_frequency
     LegalName            999     Michael Watson              2
     PartyType              2         Individual            771
  PrimaryEmail            999 contact@johnsonorg              2
  PrimaryPhone           1000       003-478-0603              1

CUSTOMER:
        column  unique_values top_value  top_frequency

CUSTOMER_ACCOUNT:
        column  unique_values top_value  top_frequency
  InterestRate            141    0.0540             13
 AccountStatus              3    Active            790

BANKING_PRODUCT:
        column  unique_values              top_value  top_frequency
   ProductName             10 15-Year Fixed Mortgage              1

CHANNEL:
        column  unique_values               top_value  top_frequency
   ChannelName              7 Auto Dealership Partner              1

TRANSACTION:
        column  unique_

## 7. Correlation Analysis

In [0]:
print_section_header("Correlation Analysis")

# Plotting libraries used only by this section.
import matplotlib.pyplot as plt
import seaborn as sns

# Upper bound on the rows taken from each table. Rows are taken with head(),
# i.e. the first rows in stored order - this is a cap, not a random sample.
sample_size = 10000

# One frame of numeric columns per table that actually has numeric columns.
all_data = []

for table_name, df in tables.items():
    # Tables that were not loaded are represented as None and skipped.
    if df is None:
        continue

    numeric_cols = df.select_dtypes(include=[np.number]).columns
    if len(numeric_cols) == 0:
        continue

    df_sample = df[numeric_cols].head(sample_size).copy()
    # Prefix with the table name so identically named columns stay distinct.
    df_sample.columns = [f"{table_name}_{col}" for col in df_sample.columns]
    all_data.append(df_sample)

# NOTE(review): axis=1 concatenation lines rows up by index label only, so a
# coefficient between columns of two different tables compares rows that merely
# share an index value and is limited to the rows both tables have. Unless the
# tables are row-aligned by construction, treat cross-table coefficients with
# caution (joining on the key columns first would be the rigorous alternative).
if all_data:
    combined_df = pd.concat(all_data, axis=1)
    # Report the number of tables that contributed columns, not len(tables),
    # which would also count unloaded tables and tables without numeric columns.
    print(f"📊 Total numeric features collected: {len(combined_df.columns)} from {len(all_data)} tables")
else:
    combined_df = pd.DataFrame()
    print("⚠️ No numeric columns found")

n_features = len(combined_df.columns)

if n_features >= 2:
    corr_matrix = combined_df.corr()

    # Figure grows with the number of features, clamped to 14-30 wide and 12-25 tall.
    fig_width = max(14, n_features * 0.5)
    fig_height = max(12, n_features * 0.4)
    fig, ax = plt.subplots(figsize=(min(fig_width, 30), min(fig_height, 25)))

    # Cell annotations become unreadable on large matrices, so they are only
    # drawn for 20 or fewer features; grid lines are dropped above 30.
    annot = n_features <= 20

    sns.heatmap(corr_matrix, annot=annot, cmap='coolwarm', center=0,
                square=True, linewidths=0.5 if n_features <= 30 else 0,
                cbar_kws={"shrink": 0.8}, fmt='.2f',
                xticklabels=True, yticklabels=True, ax=ax)

    ax.set_title(f'Combined Correlation Matrix - All Numeric Features ({n_features} features)',
                 fontsize=14, pad=20)
    plt.setp(ax.get_xticklabels(), rotation=90, ha='right', fontsize=8)
    plt.setp(ax.get_yticklabels(), rotation=0, fontsize=8)
    fig.tight_layout()

    # Save the heatmap, then release the figure explicitly.
    os.makedirs(output_path, exist_ok=True)
    plot_path = os.path.join(output_path, 'combined_correlation_matrix.png')
    fig.savefig(plot_path, dpi=150, bbox_inches='tight')
    plt.close(fig)

    # Also save correlation matrix as CSV for detailed analysis
    corr_csv_path = os.path.join(output_path, 'correlation_matrix.csv')
    corr_matrix.to_csv(corr_csv_path)

    print(f"\n✅ Created combined correlation matrix with {n_features} features")
    print("📊 Heatmap saved to: combined_correlation_matrix.png")
    print("📊 Correlation values saved to: correlation_matrix.csv")

    # Rank feature pairs by absolute correlation. Only the strict upper
    # triangle is read, so each unordered pair appears once and the diagonal
    # (self-correlation) is excluded.
    print("\n🔝 Top 10 Strongest Correlations (excluding self-correlation):")
    upper_i, upper_j = np.triu_indices(len(corr_matrix.columns), k=1)
    corr_df = pd.DataFrame({
        'Feature 1': corr_matrix.columns[upper_i],
        'Feature 2': corr_matrix.columns[upper_j],
        'Correlation': corr_matrix.to_numpy()[upper_i, upper_j],
    })
    corr_df['Abs_Correlation'] = corr_df['Correlation'].abs()
    top_corr = corr_df.nlargest(10, 'Abs_Correlation')[['Feature 1', 'Feature 2', 'Correlation']]
    print(top_corr.to_string(index=False))
else:
    print("⚠️ Not enough numeric columns for correlation analysis")


                              Correlation Analysis                              

📊 Total numeric features collected: 22 from 6 tables

✅ Created combined correlation matrix with 22 features
📊 Heatmap saved to: combined_correlation_matrix.png
📊 Correlation values saved to: correlation_matrix.csv

🔝 Top 10 Strongest Correlations (excluding self-correlation):
                           Feature 1                            Feature 2  Correlation
                 customer_CustomerId                     customer_PartyId     1.000000
banking_product_ProductMinimumAmount banking_product_ProductMaximumAmount     0.986068
banking_product_ProductMinimumAmount   banking_product_ProductMinimumTerm     0.889574
  banking_product_ProductMinimumTerm   banking_product_ProductMaximumTerm     0.884947
banking_product_ProductMaximumAmount   banking_product_ProductMinimumTerm     0.861037
    customer_account_PrincipalAmount   banking_product_ProductMaximumTerm    -0.776752
          customer_account_Cha

## 8. Target Variable Analysis

In [0]:
# Scratch check: with no `unit` argument the bare integer is read as nanoseconds
# since the Unix epoch, so an epoch-seconds value lands about 1.68 s after
# 1970-01-01 (see the output below). This is why the conversion in the next
# cell passes unit='s'.
pd.to_datetime(1680037204)

Timestamp('1970-01-01 00:00:01.680037204')

In [0]:
print("Target Variable Analysis")

# Work on a copy of the accounts table with upper-cased column names (the
# spelling the feature_engineering helpers are called with here) and with
# ORIGINATIONDATE converted from epoch seconds to datetimes.
customer_account_upper = (
    tables['customer_account']
    .rename(columns=str.upper)
    .assign(ORIGINATIONDATE=lambda accounts: pd.to_datetime(accounts['ORIGINATIONDATE'], unit='s'))
)

# Derive the target using the configured reference date and a 90-day
# prediction window.
target_df = create_target_variable(
    customer_account_upper,
    reference_date=config['feature_engineering']['reference_date'],
    prediction_window_days=90
)

# Only rows that have a NEXT_PRODUCT_ID value take part in the distribution.
has_next_product = target_df['NEXT_PRODUCT_ID'].notna()
target_distribution = analyze_target_variable(
    target_df[has_next_product],
    'NEXT_PRODUCT_ID',
    output_path
)

print("\nTarget Variable Distribution:")
print(target_distribution.to_string(index=False))



Target Variable Analysis

Target Variable Distribution:
 product_id  count  percentage
      102.0     15   15.151515
      106.0     12   12.121212
      109.0     11   11.111111
      108.0     11   11.111111
      101.0     10   10.101010
      103.0     10   10.101010
      107.0     10   10.101010
      104.0      7    7.070707
      110.0      7    7.070707
      105.0      6    6.060606


## 9. Key Insights and Summary

In [0]:
print_section_header("EDA Completed")

# Build the summary document for the output directory; the helper's return
# value is kept for inspection (presumably the path of the file it wrote -
# confirm in eda_utils).
summary_file = create_eda_summary(output_path)

# Closing message.
# NOTE(review): the deliverables list is static text; it is not derived from
# the files this run actually produced, so verify it against the directory.
completion_message = f"""
✅ EDA Analysis Complete!

📁 All outputs saved to: {output_path}

Key Deliverables:
- Data summary reports (CSV)
- Missing value analysis (CSV + PNG)
- Numeric statistics (CSV)
- Distribution plots (PNG)
- Correlation matrices (PNG)
- Target variable analysis (CSV + PNG)
- EDA summary document

Next Steps:
1. Review all outputs in {output_path}
2. Document key findings
3. Proceed to Feature Engineering (02_feature_engineering.py)
"""
print(completion_message)


                                 EDA Completed                                  


✅ EDA Analysis Complete!

📁 All outputs saved to: /Workspace/Users/ashish.kamboj@tigeranalytics.com/home-credit-hyperpersonalization/outputs/eda

Key Deliverables:
- Data summary reports (CSV)
- Missing value analysis (CSV + PNG)
- Numeric statistics (CSV)
- Distribution plots (PNG)
- Correlation matrices (PNG)
- Target variable analysis (CSV + PNG)
- EDA summary document

Next Steps:
1. Review all outputs in /Workspace/Users/ashish.kamboj@tigeranalytics.com/home-credit-hyperpersonalization/outputs/eda
2. Document key findings
3. Proceed to Feature Engineering (02_feature_engineering.py)

