In [1]:
import pandas as pd
import numpy as np
import joblib
import xgboost as xgb
from pathlib import Path

In [2]:
ARTIFACTS_DIR = Path('../../output/models/alt_models')
PROCESS_DIR = Path('../../output/data')

print("Loading model and artifacts...")

Loading model and artifacts...


In [3]:
model = joblib.load(ARTIFACTS_DIR / 'xgb_alt_data_model.pkl')
print("Model loaded.")

Model loaded.


In [4]:
model_features = joblib.load(ARTIFACTS_DIR / 'model_features_list.pkl')
print(f"Feature list loaded ({len(model_features)} features).")

Feature list loaded (23 features).


In [9]:
encoders = joblib.load(PROCESS_DIR / 'label_encoders.pkl')
print("Label Encoders loaded.")

Label Encoders loaded.


In [10]:
def process_single_input(input_dict, encoders, feature_columns):
    # 1. Convert to DataFrame
    df = pd.DataFrame([input_dict])
    
    # 2. Fill NaN (Numeric -> 0, Categorical -> Unknown)
    numeric_cols = [
        'telco_account_age_days', 'telco_avg_revenue_mean', 
        'telco_recharge_count_mean', 'telco_mobile_data_mb_mean',
        'wallet_txn_count', 'wallet_avg_amount', 
        'wallet_large_txn_ratio', 'wallet_failure_rate'
    ]
    for col in numeric_cols:
        if col in df.columns:
            df[col] = df[col].fillna(0)
            
    cat_cols = ['user_region', 'edu_highest_level', 'edu_gpa_band', 
                'edu_graduation_status', 'edu_institution_tier', 'edu_major_group']
    for col in cat_cols:
        if col in df.columns:
            df[col] = df[col].fillna("Unknown").astype(str)

    # 3. FICO-Derived Features Calculation (Copy logic từ NB 01)
    # Payment Reliability
    df['fico_payment_reliability'] = df.get('telco_recharge_count_mean', 0) - (df.get('wallet_failure_rate', 0) * 100)
    
    # Financial Capacity
    df['fico_financial_capacity'] = df.get('telco_avg_revenue_mean', 0) + df.get('wallet_avg_amount', 0)
    
    # History Tenure
    df['fico_history_tenure'] = df.get('telco_account_age_days', 0) + (df.get('wallet_txn_count', 0) * 10)
    
    # New Credit Intensity
    df['fico_new_credit_intensity'] = df.get('wallet_large_txn_ratio', 0)
    
    # Credit Mix Score
    # Helper: Check Degree
    has_degree = 1 if df.get('edu_highest_level', [''])[0] in ['university', 'master', 'phd'] else 0
    df['fico_credit_mix_score'] = (
        df.get('has_telco_data', 0) + 
        df.get('has_ewallet_data', 0) + 
        has_degree
    )
    
    # 4. Encoders & Mapping
    # Tier Mapping
    tier_map = {'Unknown': 0, 'tier_3': 1, 'tier_2': 2, 'tier_1': 3}
    if 'edu_institution_tier' in df.columns:
        df['edu_institution_tier_encoded'] = df['edu_institution_tier'].map(tier_map).fillna(0)
    
    # Label Encoding
    if encoders:
        for col, le in encoders.items():
            encoded_col_name = f"{col}_encoded"
            if col in df.columns:
                # Handle unseen labels (gán về class đầu tiên hoặc mode)
                val = df[col].iloc[0]
                if val in le.classes_:
                    df[encoded_col_name] = le.transform([val])
                else:
                    df[encoded_col_name] = 0 # Default fallback
    
    # Các biến phụ (tùy vào feature list của bạn có hay không)
    # Ví dụ: age_group, is_high_potential... nếu NB 01 có tạo thì ở đây cũng phải tạo.
    # Để đơn giản, ta gán 0 cho các biến thiếu nếu không quan trọng
    for col in feature_columns:
        if col not in df.columns:
            df[col] = 0
            
    # 5. Return đúng thứ tự cột
    return df[feature_columns]

In [11]:
def calculate_score(pd_proba):
    # Anchor: 600 @ 50:1 Odds, PDO=20
    # Công thức: Score = Offset + Factor * ln(Odds)
    # Vì model predict Bad (1), nên Odds = Good/Bad = (1-p)/p
    pdo = 20
    target_score = 600
    target_odds = 50
    
    factor = pdo / np.log(2)
    offset = target_score - (factor * np.log(target_odds))
    
    pd_proba = np.clip(pd_proba, 0.0001, 0.9999)
    odds = (1 - pd_proba) / pd_proba
    score = offset + (factor * np.log(odds))
    return int(score)

In [12]:
# Case A
good_customer = {
    'user_age': 35,
    'user_region': 'HN',
    # Telco
    'has_telco_data': 1,
    'telco_account_age_days': 3600, # 10 năm
    'telco_avg_revenue_mean': 500000,
    'telco_recharge_count_mean': 5, # Nạp đều
    'telco_recharge_amount_mean': 500000,
    'telco_mobile_data_mb_mean': 10000,
    # Wallet
    'has_ewallet_data': 1,
    'wallet_txn_count': 50,
    'wallet_avg_amount': 5000000, # Số dư 5tr
    'wallet_failure_rate': 0.0,   # Không lỗi
    'wallet_large_txn_ratio': 0.1,
    # Edu
    'has_academic_data': 1,
    'edu_highest_level': 'university',
    'edu_institution_tier': 'tier_1'
}
# Case B
bad_customer = {
    'user_age': 22,
    'user_region': 'Unknown',
    # Telco
    'has_telco_data': 1,
    'telco_account_age_days': 30, # Mới mua sim
    'telco_avg_revenue_mean': 10000,
    'telco_recharge_count_mean': 0,
    # Wallet
    'has_ewallet_data': 1,
    'wallet_txn_count': 2,
    'wallet_avg_amount': 0,
    'wallet_failure_rate': 0.8, # Lỗi 80%
    'wallet_large_txn_ratio': 0.0,
    # Edu
    'has_academic_data': 0
}

In [13]:
print("\n--- TEST KẾT QUẢ ---")

for name, data in [("Good Customer", good_customer), ("Bad Customer", bad_customer)]:
    # 1. Process
    X_input = process_single_input(data, encoders, model_features)
    
    # 2. Predict Prob (Bad Rate)
    pd_val = model.predict_proba(X_input)[0, 1]
    
    # 3. Calculate Score
    score = calculate_score(pd_val)
    
    # 4. Display
    print(f"\nProfile: {name}")
    print(f" -> Xác suất nợ xấu (PD): {pd_val:.2%}")
    print(f" -> Điểm tín dụng (FICO Alt): {score}")
    
    if score >= 600:
        print(" -> Đánh giá: ĐỦ ĐIỀU KIỆN VAY")
    else:
        print(" -> Đánh giá: RỦI RO CAO - TỪ CHỐI")


--- TEST KẾT QUẢ ---

Profile: Good Customer
 -> Xác suất nợ xấu (PD): 0.00%
 -> Điểm tín dụng (FICO Alt): 752
 -> Đánh giá: ĐỦ ĐIỀU KIỆN VAY

Profile: Bad Customer
 -> Xác suất nợ xấu (PD): 99.71%
 -> Điểm tín dụng (FICO Alt): 318
 -> Đánh giá: RỦI RO CAO - TỪ CHỐI
