In [3]:
# Import libraries
# Data handling
import pandas as pd
import numpy as np
# scikit-learn utilities for splitting, imputation and scaling
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import StandardScaler, RobustScaler
# Persistence (joblib.dump) and class counting (Counter) used by later cells
import joblib
from collections import Counter

print("Libraries imported successfully")

Libraries imported successfully


In [4]:
# Load raw data
# The flat credit-model table is read from a parquet snapshot with the
# fastparquet engine; everything downstream starts from `df`.
raw_table_path = r"C:\Users\Asus\Documents\GitHub\Credit-Scoring\data\data-processing\flat_table\flat_credit_model_20251027_143321.parquet"
df = pd.read_parquet(raw_table_path, engine="fastparquet")

section_rule = "=" * 80
print(section_rule)
print("DATA LOADING")
print(section_rule)

# Basic size report: shape, column count, deep memory footprint in MiB.
memory_mb = df.memory_usage(deep=True).sum() / 1024**2
print(f"Raw data shape: {df.shape}")
print(f"Columns: {len(df.columns)}")
print(f"Memory usage: {memory_mb:.2f} MB")

# Check target distribution: absolute counts first, then proportions.
target_column = df['TARGET']
print("\nTarget distribution:")
for as_proportion in (False, True):
    print(target_column.value_counts(normalize=as_proportion))

DATA LOADING
Raw data shape: (307511, 66)
Columns: 66
Memory usage: 139.89 MB

Target distribution:
TARGET
False    282686
True      24825
Name: count, dtype: int64
TARGET
False    0.919271
True     0.080729
Name: proportion, dtype: float64


In [5]:
# Data Quality Analysis
section_rule = "=" * 80
print(section_rule)
print("DATA QUALITY ANALYSIS")
print(section_rule)

# Missing values analysis
# Null counts are computed once and reused for both the count and the
# percentage column. The resulting frame stays indexed by column name and is
# sorted with the most-missing columns first.
null_counts = df.isnull().sum()
null_percentages = (null_counts / len(df) * 100).round(2)

missing_df = pd.DataFrame({
    'Column': df.columns,
    'Missing_Count': null_counts,
    'Missing_Percentage': null_percentages,
}).sort_values('Missing_Percentage', ascending=False)

over_half_missing = missing_df['Missing_Percentage'] > 50
over_ninety_missing = missing_df['Missing_Percentage'] > 90
high_missing = missing_df[over_half_missing]

print("\nColumns with >50% missing values:")
print(high_missing.head(20).to_string(index=False))

print(f"\nTotal columns with >50% missing: {len(high_missing)}")
print(f"Total columns with >90% missing: {over_ninety_missing.sum()}")

# Check for duplicate rows
duplicate_row_count = df.duplicated().sum()
print(f"\nDuplicate rows: {duplicate_row_count}")

# Check for constant features
# nunique() ignores NaN, so a column holding a single distinct non-null value
# (or nothing but NaN) counts as constant.
distinct_counts = df.nunique()
constant_features = distinct_counts[distinct_counts <= 1].index.tolist()
print(f"Constant features: {len(constant_features)}")

DATA QUALITY ANALYSIS

Columns with >50% missing values:
                   Column  Missing_Count  Missing_Percentage
         cc_payment_ratio         250066               81.32
     cc_raw_total_payment         246451               80.14
    cc_active_month_ratio         246202               80.06
       cc_avg_utilization         221475               72.02
       cc_max_utilization         221475               72.02
       cc_raw_balance_avg         220606               71.74
    cc_raw_total_drawings         220606               71.74
cc_raw_invalid_limit_flag         220606               71.74
          cc_total_months         220606               71.74
           raw_cc_records         220606               71.74
           cc_raw_max_dpd         220606               71.74
      cc_has_overdue_flag         220606               71.74
         cc_raw_limit_avg         220606               71.74
    cc_raw_overdue_months         220606               71.74

Total columns with >50% mis

In [6]:
# Remove Low-Quality Features
section_rule = "=" * 80
print(section_rule)
print("FEATURE CLEANING")
print(section_rule)

# Columns whose missing percentage exceeds this threshold are dropped.
threshold = 90
is_mostly_missing = missing_df['Missing_Percentage'] > threshold
mostly_missing_cols = set(missing_df.loc[is_mostly_missing, 'Column'])
constant_cols = set(constant_features)

# Union of: mostly-missing columns, constant columns, and the row identifier.
# A set removes duplicates; TARGET is always protected from removal.
drop_candidates = mostly_missing_cols | constant_cols
if 'SK_ID_CURR' in df.columns:
    drop_candidates.add('SK_ID_CURR')
drop_candidates.discard('TARGET')
cols_to_drop = list(drop_candidates)

# Breakdown of why each column is being dropped.
n_high_missing = len(drop_candidates & mostly_missing_cols)
n_constant = len(drop_candidates & constant_cols)
id_label = 'SK_ID_CURR' if 'SK_ID_CURR' in drop_candidates else 'None'

print(f"Dropping {len(cols_to_drop)} columns:")
print(f"  - {n_high_missing} high missing")
print(f"  - {n_constant} constant")
print(f"  - {id_label} (ID column)")

df_cleaned = df.drop(columns=cols_to_drop)
print(f"\n‚úÖ Shape after cleaning: {df_cleaned.shape}")

FEATURE CLEANING
Dropping 7 columns:
  - 0 high missing
  - 6 constant
  - SK_ID_CURR (ID column)

‚úÖ Shape after cleaning: (307511, 59)


In [7]:
# Feature Engineering
# Adds derived columns to df_cleaned in place. Every feature is guarded by a
# column-existence check, so missing inputs simply skip that feature.
print("="*80)
print("FEATURE ENGINEERING")
print("="*80)

# Small constant added to every denominator so a zero denominator cannot
# produce a division by zero.
RATIO_EPSILON = 1e-6

# Create ratio features (domain knowledge): (new column, numerator, denominator)
ratio_specs = [
    ('income_to_credit_ratio', 'raw_income_total', 'raw_credit_amt'),
    ('payment_burden', 'raw_annuity_amt', 'raw_income_total'),
    ('credit_utilization', 'AMT_CREDIT_SUM_DEBT', 'AMT_CREDIT_SUM'),
]
for new_col, numerator_col, denominator_col in ratio_specs:
    if numerator_col in df_cleaned.columns and denominator_col in df_cleaned.columns:
        df_cleaned[new_col] = df_cleaned[numerator_col] / (df_cleaned[denominator_col] + RATIO_EPSILON)
        print(f"‚úÖ Created: {new_col}")

# Create aggregation features
if {'dpd_mean', 'total_utilization'}.issubset(df_cleaned.columns):
    df_cleaned['dpd_utilization_interaction'] = df_cleaned['dpd_mean'] * df_cleaned['total_utilization']
    print("‚úÖ Created: dpd_utilization_interaction")

# Age groups (if age exists)
# pd.cut uses right-closed bins by default: (0, 25], (25, 35], ... (55, 100].
# NOTE(review): a NaN or out-of-range age becomes NaN in the categorical and
# makes .astype(int) raise — this assumes age_years is complete and within
# (0, 100]; confirm against the source table.
if 'age_years' in df_cleaned.columns:
    df_cleaned['age_group'] = pd.cut(
        df_cleaned['age_years'],
        bins=[0, 25, 35, 45, 55, 100],
        labels=[0, 1, 2, 3, 4]
    ).astype(int)
    print("‚úÖ Created: age_group")

# Column count of the cleaned frame before engineering. cols_to_drop is a
# de-duplicated list of columns that were all dropped from df, so plain
# arithmetic gives the same number as re-dropping them, without copying the
# whole raw frame just to read its shape.
n_columns_before = df.shape[1] - len(cols_to_drop)

print(f"\n‚úÖ Final shape: {df_cleaned.shape}")
print(f"   Added {df_cleaned.shape[1] - n_columns_before} new features")

FEATURE ENGINEERING
‚úÖ Created: income_to_credit_ratio
‚úÖ Created: payment_burden
‚úÖ Created: dpd_utilization_interaction
‚úÖ Created: age_group

‚úÖ Final shape: (307511, 63)
   Added 4 new features


In [None]:
# Handle Missing Values Intelligently
# Builds X (features) and y (integer target) from df_cleaned and fills every
# NaN in X. The filling is tiered by how much of a column is missing.
#
# NOTE(review): both imputers are fitted on all rows of X, i.e. before the
# train/test split performed in a later cell, so test rows influence the
# imputed values. Fixing that requires splitting first; flagged here rather
# than silently restructured.
print("="*80)
print("HANDLING MISSING VALUES")
print("="*80)

# Separate features by type
X = df_cleaned.drop(columns=['TARGET'])
y = df_cleaned['TARGET'].astype(int)

# Identify numeric vs categorical (if any)
numeric_features = X.select_dtypes(include=[np.number]).columns.tolist()
categorical_features = X.select_dtypes(exclude=[np.number]).columns.tolist()

print(f"Numeric features: {len(numeric_features)}")
print(f"Categorical features: {len(categorical_features)}")

# Strategy 1: Simple median imputation for features with <20% missing
# Strategy 2: KNN imputation for features with 20-50% missing
# Everything else that still has NaNs (>50% missing, or non-numeric) is
# zero-filled by the final fillna(0) below.

missing_pct = X.isnull().sum() / len(X) * 100

# Median and KNN imputation only work on numeric data, so both buckets are
# restricted to numeric columns; a non-numeric column with NaNs falls through
# to the zero fill instead of making the imputer raise.
numeric_lookup = set(numeric_features)
is_low = (missing_pct > 0) & (missing_pct < 20)
is_medium = (missing_pct >= 20) & (missing_pct <= 50)

low_missing = [c for c in missing_pct[is_low].index if c in numeric_lookup]
medium_missing = [c for c in missing_pct[is_medium].index if c in numeric_lookup]

# Columns with NaNs that neither imputer will touch. Named distinctly from
# the `high_missing` frame built in the data-quality cell.
imputed_lookup = set(low_missing) | set(medium_missing)
zero_filled_cols = [c for c in missing_pct[missing_pct > 0].index if c not in imputed_lookup]

print(f"\nImputation strategy:")
print(f"  Low missing (<20%): {len(low_missing)} columns ‚Üí Median")
print(f"  Medium missing (20-50%): {len(medium_missing)} columns ‚Üí KNN")
print(f"  Other columns with NaNs (>50% missing or non-numeric): {len(zero_filled_cols)} columns -> 0 fill")

# Apply simple imputation first
simple_imputer = SimpleImputer(strategy='median')
if len(low_missing) > 0:
    X[low_missing] = simple_imputer.fit_transform(X[low_missing])

# Apply KNN imputation for medium missing (optional - can be slow)
# Neighbours are searched using only the medium-missing columns themselves.
if 0 < len(medium_missing) < 50:  # Only if manageable
    print(f"\nüîÑ Applying KNN imputation (this may take a while)...")
    knn_imputer = KNNImputer(n_neighbors=5, weights='distance')
    X[medium_missing] = knn_imputer.fit_transform(X[medium_missing])
    print("‚úÖ KNN imputation complete")
elif len(medium_missing) > 0:
    # Fallback to median for medium missing if too many columns. A separate
    # imputer is used so the one fitted on the low-missing columns above is
    # not overwritten.
    fallback_imputer = SimpleImputer(strategy='median')
    X[medium_missing] = fallback_imputer.fit_transform(X[medium_missing])
    print(f"‚ö†Ô∏è Too many columns for KNN, using median imputation instead")

# Final check: fill any remaining NaNs with 0
X = X.fillna(0)

print(f"\n‚úÖ Missing values handled")
print(f"   Remaining NaNs: {X.isnull().sum().sum()}")

HANDLING MISSING VALUES


Numeric features: 62
Categorical features: 0

Imputation strategy:
  Low missing (<20%): 37 columns ‚Üí Median
  Medium missing (20-50%): 2 columns ‚Üí KNN

üîÑ Applying KNN imputation (this may take a while)...


In [None]:
# Remove Outliers (Conservative Approach)
# Drops rows whose value in any candidate column lies far outside a
# percentile fence. X and y are filtered together so they stay aligned.
#
# NOTE(review): this cell reassigns X and y, so re-running it on its own
# recomputes the percentiles on already-filtered data; re-run from the
# imputation cell (which rebuilds X and y) to reproduce the result.
print("="*80)
print("OUTLIER HANDLING")
print("="*80)

# NOTE(review): zscore is not used in this cell; the import is kept in case a
# later cell relies on it. It belongs in the import cell at the top.
from scipy.stats import zscore

# Identify high-value features that might have outliers
outlier_candidates = ['raw_income_total', 'raw_credit_amt', 'raw_annuity_amt']
outlier_candidates = [col for col in outlier_candidates if col in X.columns]

for col in outlier_candidates:
    # Percentile fence (not a classic IQR): the spread between the 1st and
    # 99th percentiles is used, and only values more than three spreads
    # beyond either percentile are treated as extreme.
    p01 = X[col].quantile(0.01)
    p99 = X[col].quantile(0.99)
    spread = p99 - p01

    lower_bound = p01 - 3 * spread
    upper_bound = p99 + 3 * spread

    keep_mask = (X[col] >= lower_bound) & (X[col] <= upper_bound)
    removed = int((~keep_mask).sum())

    if removed > 0:
        # .copy() detaches the filtered frame from its parent, so the
        # in-place column assignment in the scaling cell does not raise
        # SettingWithCopyWarning.
        X = X.loc[keep_mask].copy()
        y = y.loc[keep_mask]
        print(f"  {col}: removed {removed} extreme outliers")

total_removed = df_cleaned.shape[0] - len(X)
print(f"\n‚úÖ Final data shape: {X.shape}")
print(f"   Removed {total_removed} total outliers ({total_removed/df_cleaned.shape[0]*100:.2f}%)")

After SMOTE:
TARGET
0    0.5
1    0.5
Name: proportion, dtype: float64


In [None]:
# Feature Scaling (Optional for Tree-based, but helps with stability)
# Scales the large-magnitude amount columns of X in place with a RobustScaler
# and saves the fitted scaler for production.
#
# NOTE(review): X[scale_features] is overwritten in place, so re-running this
# cell on its own would fit a new scaler on already-scaled data and overwrite
# the saved file; re-run from the imputation cell instead.
print("="*80)
print("FEATURE SCALING")
print("="*80)

# Use RobustScaler for high-value features (resistant to outliers)
scale_features = ['raw_income_total', 'raw_credit_amt', 'raw_annuity_amt',
                  'AMT_CREDIT_SUM', 'AMT_CREDIT_SUM_DEBT']
scale_features = [col for col in scale_features if col in X.columns]

if len(scale_features) > 0:
    # Fit on training rows only, so test rows do not leak into the scaler
    # that is shipped to production. The train/test split happens in the next
    # cell; train_test_split is deterministic for a given number of rows,
    # stratify labels and random_state, so replaying it here on row positions
    # selects exactly the rows that will become X_train.
    # These arguments MUST mirror the split cell (test_size=0.2,
    # random_state=42, stratify=y); keep both in sync.
    train_positions, holdout_positions = train_test_split(
        np.arange(len(X)),
        test_size=0.2,
        random_state=42,
        stratify=y,
    )

    scaler = RobustScaler()
    scaler.fit(X[scale_features].iloc[train_positions])

    # Transform every row (train and test) with the train-fitted statistics.
    X[scale_features] = scaler.transform(X[scale_features])
    print(f"‚úÖ Scaled {len(scale_features)} high-value features")

    # Save scaler for production
    joblib.dump(scaler, r"C:\Users\Asus\Documents\GitHub\Credit-Scoring\output\models\robust_scaler.pkl")
    print("‚úÖ Scaler saved for production use")
else:
    print("‚ö†Ô∏è No features to scale")

Processed data saved successfully.


In [None]:
# Train/Test Split (STRATIFIED)
# 80/20 split with a fixed seed; stratifying on y keeps the default rate the
# same in both partitions.
section_rule = "=" * 80
print(section_rule)
print("TRAIN/TEST SPLIT")
print(section_rule)

split_options = dict(
    test_size=0.2,
    random_state=42,
    stratify=y,  # CRITICAL: Ensures same distribution
)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print(f"Train size: {X_train.shape}")
print(f"Test size: {X_test.shape}")

# Same three-line class-balance report for each partition; the label counts
# are taken once per partition and reused.
for split_name, split_labels in (("Training", y_train), ("Testing", y_test)):
    label_counts = Counter(split_labels)
    n_negative = label_counts[0]
    n_positive = label_counts[1]
    n_total = len(split_labels)

    print(f"\nClass distribution ({split_name}):")
    print(f"  Class 0 (No Default): {n_negative:,} ({n_negative/n_total*100:.2f}%)")
    print(f"  Class 1 (Default): {n_positive:,} ({n_positive/n_total*100:.2f}%)")
    print(f"  Imbalance Ratio: {n_negative/n_positive:.2f}:1")

In [None]:
# Save Processed Data (WITHOUT SMOTE)
# Persists the train/test split and a metadata dictionary describing how the
# data was produced, then prints a short summary.
print("="*80)
print("SAVING PROCESSED DATA")
print("="*80)

# IMPORTANT: Save ORIGINAL split, NOT resampled data
# SMOTE should be applied during training only!

save_path = r"C:\Users\Asus\Documents\GitHub\Credit-Scoring\output\models\processed_data_lgbm_v2.pkl"

joblib.dump(
    (X_train, X_test, y_train, y_test),
    save_path
)

print(f"‚úÖ Processed data saved to:")
print(f"   {save_path}")

# Label counts are taken once per partition instead of once per lookup.
train_label_counts = Counter(y_train)
test_label_counts = Counter(y_test)

# Engineered features = columns in df_cleaned beyond what remained after the
# cleaning step. cols_to_drop is a de-duplicated list of columns that were all
# dropped from df, so arithmetic on the counts equals re-dropping them,
# without copying the whole raw frame.
n_features_engineered = df_cleaned.shape[1] - (df.shape[1] - len(cols_to_drop))
n_outliers_removed = df_cleaned.shape[0] - len(X)

# Save metadata
metadata = {
    'original_shape': df.shape,
    'cleaned_shape': df_cleaned.shape,
    'final_shape': X.shape,
    'features_removed': len(cols_to_drop),
    'features_engineered': n_features_engineered,
    'outliers_removed': n_outliers_removed,
    'train_shape': X_train.shape,
    'test_shape': X_test.shape,
    # Negatives per positive in each partition.
    'class_imbalance_train': train_label_counts[0] / train_label_counts[1],
    'class_imbalance_test': test_label_counts[0] / test_label_counts[1],
    'feature_names': X.columns.tolist(),
    'scaled_features': list(scale_features)
}

joblib.dump(metadata, r"C:\Users\Asus\Documents\GitHub\Credit-Scoring\output\models\data_metadata_v2.pkl")

print(f"\n‚úÖ Metadata saved")
print(f"\nüìä Data Processing Summary:")
print(f"   Original features: {df.shape[1]}")
print(f"   Features removed: {len(cols_to_drop)}")
print(f"   Features engineered: {metadata['features_engineered']}")
print(f"   Final features: {X.shape[1]}")
print(f"   Samples removed (outliers): {metadata['outliers_removed']}")
print(f"   Train samples: {X_train.shape[0]:,}")
print(f"   Test samples: {X_test.shape[0]:,}")

In [None]:
# Validation - Check Data Quality
# Final gate before training: reports missing, infinite and duplicate values
# plus near-constant features. Missing or infinite values are blocking and
# raise ValueError, so the "ready" message is only printed for data that
# actually passed; duplicates and low variance are reported as information.
print("="*80)
print("DATA QUALITY VALIDATION")
print("="*80)

# Blocking checks: any non-zero count here fails validation.
blocking_checks = [
    ("Missing values in train", int(X_train.isnull().sum().sum())),
    ("Missing values in test", int(X_test.isnull().sum().sum())),
    ("Infinite values in train", int(np.isinf(X_train).sum().sum())),
    ("Infinite values in test", int(np.isinf(X_test).sum().sum())),
]
# Informational checks: reported but never blocking.
info_checks = [
    ("Duplicate rows in train", int(X_train.duplicated().sum())),
    ("Duplicate rows in test", int(X_test.duplicated().sum())),
]

# Check for any remaining issues
print("‚úì Checks:")
for check_label, check_count in blocking_checks + info_checks:
    print(f"  {check_label}: {check_count}")

# Check feature variance
low_variance_features = X_train.columns[X_train.var() < 0.01].tolist()
if len(low_variance_features) > 0:
    print(f"\n‚ö†Ô∏è Warning: {len(low_variance_features)} features have very low variance")
    print(f"   Consider removing: {low_variance_features[:5]}")
else:
    print(f"\n‚úÖ All features have sufficient variance")

# Fail loudly instead of announcing success over broken data.
failed_checks = {label: count for label, count in blocking_checks if count > 0}
if failed_checks:
    raise ValueError(f"Data quality validation failed: {failed_checks}")

print(f"\n‚úÖ Data quality validation complete!")
print(f"   Ready for model training")