In [12]:
import pandas as pd
import numpy as np
import joblib
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# --- Configuration ---
# Shared root for the alternative-data artifacts. The path is relative, so it is
# resolved against the notebook's current working directory.
ALT_DATA_DIR = Path('../../data/data-processing/alternative_data')

# Input: the unified flat table.
DATA_PATH = ALT_DATA_DIR / 'flat_user_credit_scoring.csv'

# Output: folder for the engineered features (created up front, parents included,
# and left untouched when it already exists).
OUTPUT_DIR = ALT_DATA_DIR / 'processed' / 'alt_data'
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

print(f"Reading data from: {DATA_PATH.resolve()}")

Reading data from: D:\Credit-Scoring\data\data-processing\alternative_data\flat_user_credit_scoring.csv


In [6]:
# Load the unified flat table and report its dimensions.
df = pd.read_csv(DATA_PATH)
n_rows, n_cols = df.shape
print(f"Input Shape: {(n_rows, n_cols)}")

# Preview the first rows as the cell's rich output.
df.head()

Input Shape: (100000, 24)


Unnamed: 0,user_id,user_age,user_region,telco_account_age_days,telco_avg_revenue_mean,telco_avg_revenue_std,telco_recharge_count_mean,telco_recharge_amount_mean,telco_mobile_data_mb_mean,edu_highest_level,...,wallet_txn_count,wallet_avg_amount,wallet_max_amount,wallet_total_amount,wallet_large_txn_ratio,wallet_failure_rate,has_ewallet_data,has_telco_data,has_academic_data,target
0,USR_SI278N6MBX,56,OTHER,611.0,171500.0,117389.664508,2.75,575000.0,4944.25,,...,8.0,282625.0,864000.0,2261000.0,0.0,0.0,1.0,1,0,0
1,USR_FUMA2AOFZY,46,OTHER,3122.0,97000.0,62513.998432,2.75,487500.0,4389.5,university,...,,,,,,,0.0,1,1,1
2,USR_HKN9ZLWHG5,32,HN,,,,,,,university,...,6.0,323166.7,1028000.0,1939000.0,0.0,0.0,1.0,0,1,1
3,USR_6MWUPTHFN2,60,HN,263.0,105000.0,101924.808887,3.5,487500.0,6027.75,,...,16.0,1057938.0,6798000.0,16927000.0,0.125,0.0625,1.0,1,0,0
4,USR_E4ZBSVBNL5,25,HCM,2896.0,144750.0,111735.923797,3.25,512500.0,3961.5,,...,,,,,,,0.0,1,0,1


In [15]:
# Missing-value policy: each feature family gets its own sentinel.

# 1. Capacity & history features (numeric) are filled with 0.
#    Rationale: without wallet/telco data, capacity and tenure are treated as 0.
numeric_fill_zero = [
    'telco_account_age_days',
    'telco_avg_revenue_mean',
    'telco_recharge_count_mean',
    'telco_mobile_data_mb_mean',
    'wallet_txn_count',
    'wallet_avg_amount',
    'wallet_large_txn_ratio',
    'wallet_failure_rate',
]

# 2. Ecosystem features (categorical) are filled with the label 'Unknown'.
categorical_fill_unknown = [
    'edu_highest_level',
    'edu_institution_tier',
    'user_region',
]

# Apply each (column group, sentinel) pair in turn; only the listed columns are touched.
for columns, sentinel in (
    (numeric_fill_zero, 0),
    (categorical_fill_unknown, 'Unknown'),
):
    df[columns] = df[columns].fillna(sentinel)

print("Imputation complete based on FICO logic.")

Imputation complete based on FICO logic.


In [16]:
# Adapted FICO factors built from alternative (telco / e-wallet / education) data.
# Each factor is written to `df` as a new `fico_*` column. The weights quoted in the
# section headers are descriptive only; nothing in this cell applies them.

# Multiplier that brings the wallet failure rate onto a scale comparable to the
# mean recharge count.
FAILURE_RATE_SCALE = 100
# Assumption: each wallet transaction adds a slight weight (this many "days") to
# the history depth.
WALLET_TXN_TENURE_DAYS = 10


# --- Factor 1: Payment History (Weight 35%) ---
# Logic: the wallet failure rate is a proxy for default (bad), while the mean
# recharge count implies stability (good). The composite is recharge count minus
# the scaled failure rate.
# No epsilon is added to the failure rate: it is only multiplied and subtracted,
# so a value of 0 is safe.
df['fico_payment_reliability'] = (
    df['telco_recharge_count_mean']
    - (df['wallet_failure_rate'] * FAILURE_RATE_SCALE)
)


# --- Factor 2: Amounts Owed / Capacity (Weight 30%) ---
# Logic: unlike traditional credit where debt = risk, here revenue/balance =
# capacity (good), i.e. the sign of the weight is the opposite of the classic factor.
df['fico_financial_capacity'] = (
    df['telco_avg_revenue_mean'] + df['wallet_avg_amount']
)


# --- Factor 3: Length of Credit History (Weight 15%) ---
# Logic: "a SIM used for 10 years is as trustworthy as a credit card used for 10 years".
# Telco tenure is the base, boosted by wallet transaction depth.
df['fico_history_tenure'] = (
    df['telco_account_age_days']
    + (df['wallet_txn_count'] * WALLET_TXN_TENURE_DAYS)
)


# --- Factor 4: New Credit / Activity (Weight 10%) ---
# Logic: sudden spikes in wallet transaction behaviour.
# Proxy: 'wallet_large_txn_ratio' is copied through unchanged as the intensity signal.
df['fico_new_credit_intensity'] = df['wallet_large_txn_ratio']


# --- Factor 5: Credit Mix (Weight 10%) ---
# Logic: ecosystem breadth (degree + wallet + telco).
# Score: 1 point for telco data, 1 for wallet data, 1 for a higher-education degree.

# Binary degree flag: 1 when the highest level is university, master or phd, else 0
# (this includes the 'Unknown' placeholder and any other label).
has_degree = df['edu_highest_level'].isin(['university', 'master', 'phd']).astype(int)

# A missing availability flag is counted as "no data" (0). Without this, one NaN
# flag would turn the whole credit-mix score into NaN.
ecosystem_flags = df[['has_telco_data', 'has_ewallet_data']].fillna(0)

df['fico_credit_mix_score'] = (
    ecosystem_flags['has_telco_data']
    + ecosystem_flags['has_ewallet_data']
    + has_degree
)

print("Constructed 5 Adapted FICO Factors:")
print(["fico_payment_reliability", "fico_financial_capacity", "fico_history_tenure", "fico_new_credit_intensity", "fico_credit_mix_score"])

Constructed 5 Adapted FICO Factors:
['fico_payment_reliability', 'fico_financial_capacity', 'fico_history_tenure', 'fico_new_credit_intensity', 'fico_credit_mix_score']


In [None]:
# Ordinal encoding for institution tiers: 'Unknown' -> 0, tier_3 -> 1, tier_2 -> 2,
# tier_1 -> 3 (rank preserved, presumably tier_1 being the top quality tier).
tier_map = {'Unknown': 0, 'tier_3': 1, 'tier_2': 2, 'tier_1': 3}
tier_codes = df['edu_institution_tier'].map(tier_map)

# Series.map returns NaN for any label missing from tier_map. Instead of letting
# those NaNs leak silently into the feature matrix, report the offending labels
# and encode them with the 'Unknown' code.
unmapped_tiers = df.loc[tier_codes.isna(), 'edu_institution_tier'].unique().tolist()
if unmapped_tiers:
    print(f"WARNING: unmapped edu_institution_tier values encoded as 'Unknown': {unmapped_tiers}")
df['edu_institution_tier_encoded'] = tier_codes.fillna(tier_map['Unknown']).astype('int64')

# Label encoding for nominals.
# NOTE(review): `le` is fitted on the full table before the train/test split and is
# not persisted in this cell, so the region -> code mapping cannot be reproduced
# later from saved artifacts - confirm whether it should be saved.
le = LabelEncoder()
df['user_region_encoded'] = le.fit_transform(df['user_region'].astype(str))

# Final feature selection: remove the identifier, the label, and the raw
# categorical columns. Columns absent from `df` are skipped rather than raising.
features_to_drop = [
    'user_id', 'target',
    'edu_highest_level', 'edu_institution_tier', 'user_region',
    'edu_gpa_band', 'edu_major_group', 'edu_graduation_status'
]

X = df.drop(columns=features_to_drop, errors='ignore')
y = df['target']

print(f"Final Feature Set ({X.shape[1]} features):")
print(X.columns.tolist())

Final Feature Set (33 features):
['user_age', 'telco_account_age_days', 'telco_avg_revenue_mean', 'telco_avg_revenue_std', 'telco_recharge_count_mean', 'telco_recharge_amount_mean', 'telco_mobile_data_mb_mean', 'wallet_txn_count', 'wallet_avg_amount', 'wallet_max_amount', 'wallet_total_amount', 'wallet_large_txn_ratio', 'wallet_failure_rate', 'has_ewallet_data', 'has_telco_data', 'has_academic_data', 'telco_stability_index', 'telco_data_intensity', 'wallet_ticket_size', 'is_high_potential', 'age_group', 'edu_institution_tier_encoded', 'edu_gpa_band_encoded', 'edu_graduation_status_encoded', 'age_group_encoded', 'user_region_encoded', 'edu_major_group_encoded', 'edu_highest_level_encoded', 'fico_payment_reliability', 'fico_financial_capacity', 'fico_history_tenure', 'fico_new_credit_intensity', 'fico_credit_mix_score']


In [None]:
# Stratified hold-out split: 20% test, class proportions of `y` preserved,
# fixed seed so the partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# 1. Parquet export (best for Python/pandas): each split's features with its
#    label column re-attached by index.
for split_name, split_features, split_labels in (
    ('train', X_train, y_train),
    ('test', X_test, y_test),
):
    split_features.join(split_labels).to_parquet(
        OUTPUT_DIR / f'{split_name}_engineered.parquet'
    )

# 2. CSV export of the training split only, with an upper-case TARGET column and a
#    synthetic sequential SK_ID_CURR identifier (0 .. n-1).
CSV_PATH = Path('../../data/train_feature_engineered.csv')
full_train_export = X_train.assign(
    TARGET=y_train,
    SK_ID_CURR=range(len(X_train)),
)
full_train_export.to_csv(CSV_PATH, index=False)

print("Feature Engineering Complete.")
print(f"Train/Test Parquet saved to: {OUTPUT_DIR}")
print(f"Modeling CSV saved to: {CSV_PATH}")

Feature Engineering Complete.
Train/Test Parquet saved to: ..\..\data\data-processing\alternative_data\processed\alt_data
Modeling CSV saved to: ..\..\data\train_feature_engineered.csv
