# 1- Imports

In [12]:
import numpy as np
import pandas as pd
from datetime import datetime
import yaml
import sys
import os
import re
import json
from tqdm import tqdm
import warnings

from pyspark.sql import functions as F
from pyspark.sql.types import DateType, ArrayType, StringType
from pyspark.sql.functions import udf, broadcast


from optbinning import OptimalBinning, ContinuousOptimalBinning
import category_encoders as ce
from typing import List, Dict
import shap
import lightgbm as lgb
import scipy
from scipy.stats import spearmanr, pearsonr, kendalltau, pointbiserialr, f_oneway
from statsmodels.stats.outliers_influence import variance_inflation_factor

from sklearn.model_selection import StratifiedKFold, train_test_split, KFold
from sklearn.decomposition import PCA
from sklearn.feature_selection import chi2
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, MinMaxScaler
from sklearn.metrics import roc_auc_score, precision_recall_curve, mean_squared_error, mean_absolute_error, mean_absolute_percentage_error

import seaborn as sns
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import plotly.figure_factory as ff
import joblib

import numpy as np
from scipy.stats import logistic, norm
from pathlib import Path

In [13]:
# Project directories, all resolved relative to the notebook's working directory
cwd_path = Path.cwd()
data_path = cwd_path / 'data/'
objects_path = cwd_path / 'objects/'

# Put the parent of the working directory on sys.path before importing `utils`
# (presumably that is where the `utils` module lives - TODO confirm).
sys.path.append(str(cwd_path.parent))

# NOTE(review): the wildcard import hides which names come from `utils`.
# This notebook relies on at least return_catalog, build_spark_session and
# calculate_income_zones being provided by it.
from utils import *

data_catalog, models_catalog = return_catalog()

In [14]:
# Point PySpark's Python workers at the interpreter that runs this kernel.
# sys.executable is the real interpreter path on every platform, whereas
# <sys.prefix>/python only names an existing file on Windows-style layouts
# (POSIX environments keep the binary under <prefix>/bin/).
conda_env_path = sys.prefix  # kept so any later cell that reads it still works
os.environ['PYSPARK_PYTHON'] = sys.executable or os.path.join(conda_env_path, 'python')

# build spark session
spark = build_spark_session()
spark

# 2- CC Params

In [15]:
# Accept/reject cut-off on the predicted PD, and the income-zone thresholds
# that are forwarded to calculate_income_zones.
PD_THRESHOLD = 0.325
GREEN_INCOME_ZONE_THRESHOLD = 0.1
RED_INCOME_ZONE_THRESHOLD = 0.5

# PD bin edges in descending order, with one CWF value and one tier label per
# bin: consecutive edges (pd_in[j + 1], pd_in[j]] map to cwf_map[j] and
# risk_seg_map[j].
PD_BINS_TO_CWF = {
    'pd_in': [
        PD_THRESHOLD,
        0.23,
        0.1,
        0.04,
        0.01,
        0,
    ],
    'cwf_map': [
        1.5,
        2.3,
        4,
        5,
        7,
    ],
    'risk_seg_map': [
        'Tier-5',
        'Tier-4',
        'Tier-3',
        'Tier-2',
        'Tier-1',
    ],
}

# Credit-limit bounds.
# NOTE(review): neither bound is referenced by the cells visible in this
# notebook - confirm where they are meant to be applied.
CL_MAX_LIMIT = 500_000
CL_MIN_LIMIT = 1_000


# 3- Fetch


In [16]:
# Snapshot date and model versions that identify which files are loaded
data_date = '20240810'
income_model_version = 'v0.7'
pd_model_version = 'v0.7'
rule_based_version = 'v0.1'

# Processed feature table shared by all models
df_to_all_models = pd.read_parquet(
    data_path / f"features_store/{data_date}_L1_processed_features.parquet"
)

# Income model output; its 'predics' column is renamed to 'income_predics'
income_predics = (
    pd.read_parquet(
        data_path / f'models_preds/{data_date}_income_{income_model_version}_predics.parquet'
    )
    .rename(columns={'predics': 'income_predics'})
)

# PD model output; its 'cali_predics' column is renamed to 'pd_predics'
pd_predics = (
    pd.read_parquet(
        data_path / f'models_preds/{data_date}_pd_{pd_model_version}_predics.parquet'
    )
    .rename(columns={'cali_predics': 'pd_predics'})
)

# Rule-based output; score and PD columns get a 'rule_based_' prefix
rule_based_predics = (
    pd.read_parquet(
        data_path / f'models_preds/{data_date}_rule_based_{rule_based_version}_predics.parquet'
    )
    .rename(
        columns={
            'predics_score': 'rule_based_predics_score',
            'predics_pd': 'rule_based_predics_pd',
        }
    )
)

# 4- Select Features


In [18]:
# Columns carried over from the processed feature table into the simulation
base_columns = [
    'client_id',
    'net_income_inflated',
    'fo_par90_flag',
    'OpenAccounts_InstallmentAmount_sum',
    'net_burden_inflated',
]

# Independent copy so later column assignments never touch df_to_all_models
base_df = df_to_all_models.loc[:, base_columns].copy()

In [19]:
# Attach each model's predictions to the base features by client_id.
# validate='many_to_one' makes pandas raise MergeError when a prediction table
# holds more than one row for a client_id; without it the left join would
# silently duplicate that client's row in full_df.
full_df = (
    base_df
    .merge(income_predics, on='client_id', how='left', validate='many_to_one')
    .merge(pd_predics, on='client_id', how='left', validate='many_to_one')
    .merge(rule_based_predics, on='client_id', how='left', validate='many_to_one')
)

# 6- Simulation Flow

### PD Accept/Reject Rule

In [20]:
# 'accept' where the predicted PD is at or below PD_THRESHOLD; every row for
# which that comparison is not True is labelled 'reject'.
pd_within_threshold = full_df['pd_predics'] <= PD_THRESHOLD
full_df['pd_AR'] = np.where(pd_within_threshold, 'accept', 'reject')

### Income Zones

In [21]:
# Ratio of net_income_inflated to the income model's prediction
full_df['income_ratio'] = full_df['net_income_inflated'].div(full_df['income_predics'])

# calculate_income_zones (from utils) yields two results, stored as the
# final income and the income zone columns.
final_income_col, income_zone_col = calculate_income_zones(
    full_df,
    'net_income_inflated',
    'income_predics',
    blogic='all_not_verify',
    GREEN_INCOME_ZONE_THRESHOLD=GREEN_INCOME_ZONE_THRESHOLD,
    RED_INCOME_ZONE_THRESHOLD=RED_INCOME_ZONE_THRESHOLD,
)
full_df['final_income'] = final_income_col
full_df['income_zone'] = income_zone_col

### Get Creditworthiness scores

In [None]:
def get_cwf_and_segment(pd_series, ar_series, pd_bins_to_cwf=None):
    """
    Map each customer to a Creditworthiness Factor (CWF) and a risk tier.

    Rejected customers (``ar_series`` value equal to 'reject', case-insensitive)
    get ``cwf = 0`` and segment 'Rejected'. Every other customer is placed in the
    bin ``pd_in[j + 1] < PD <= pd_in[j]`` and receives ``cwf_map[j]`` and
    ``risk_seg_map[j]``. A PD above the first edge maps to the first tier; a PD
    at or below the last edge (including a PD exactly equal to it) maps to the
    last tier.

    Parameters:
    pd_series (pd.Series): Probability of default (PD) values.
    ar_series (pd.Series): 'accept' or 'reject' per customer, same length and
        order as pd_series.
    pd_bins_to_cwf (dict, optional): Dictionary with keys 'pd_in' (descending
        bin edges), 'cwf_map' and 'risk_seg_map' (one entry per bin). When
        omitted, the module-level PD_BINS_TO_CWF is looked up at call time.

    Returns:
    pd.Series, pd.Series: cwf and risk_segment, named 'cwf' and 'risk_segment'.
        Both carry the index of pd_series when it is a Series, so assigning
        them back to the originating DataFrame aligns row by row.

    Raises:
    ValueError: If the inputs differ in length, if an accepted customer has a
        missing PD, or if a PD inside the edge range matches no bin.
    """
    # Resolve the default lazily so a re-run of the params cell is picked up
    # without redefining this function.
    if pd_bins_to_cwf is None:
        pd_bins_to_cwf = PD_BINS_TO_CWF

    edges = pd_bins_to_cwf['pd_in']
    cwf_map = pd_bins_to_cwf['cwf_map']
    seg_map = pd_bins_to_cwf['risk_seg_map']

    pd_values = list(pd_series)
    ar_flags = list(ar_series)
    if len(pd_values) != len(ar_flags):
        raise ValueError(
            f"pd_series and ar_series must have the same length "
            f"(got {len(pd_values)} and {len(ar_flags)})"
        )

    cwf_out = []
    seg_out = []
    for pd_value, ar_flag in zip(pd_values, ar_flags):
        # Rejected customers never reach the tier lookup
        if ar_flag.lower() == 'reject':
            cwf_out.append(0)
            seg_out.append('Rejected')
            continue

        # Every comparison against NaN is False, so a missing PD would match
        # no bin; fail with a clear message instead.
        if pd.isna(pd_value):
            raise ValueError("PD is missing for an accepted customer; cannot assign a tier")

        if pd_value > edges[0]:
            # Above the highest edge: riskiest tier
            tier = 0
        elif pd_value <= edges[-1]:
            # At or below the lowest edge: safest tier. The comparison is
            # inclusive so a PD exactly equal to the lowest edge is mapped.
            tier = -1
        else:
            tier = next(
                (j for j in range(len(edges) - 1) if edges[j + 1] < pd_value <= edges[j]),
                None,
            )
            if tier is None:
                raise ValueError(
                    f"PD value {pd_value!r} does not fall in any bin; "
                    f"'pd_in' edges must be in descending order"
                )

        cwf_out.append(cwf_map[tier])
        seg_out.append(seg_map[tier])

    # Reuse the caller's index so column assignment aligns on the right rows
    index = pd_series.index if isinstance(pd_series, pd.Series) else None
    cwf = pd.Series(cwf_out, index=index, name='cwf',
                    dtype=None if cwf_out else float)
    risk_segment = pd.Series(seg_out, index=index, name='risk_segment',
                             dtype=None if seg_out else object)
    return cwf, risk_segment


full_df['cwf'], full_df['cwf_segment'] = get_cwf_and_segment(
    full_df['pd_predics'], full_df['pd_AR'], PD_BINS_TO_CWF
)

# Re-assert the label on rejected rows (get_cwf_and_segment already returns
# 'Rejected' for them, so this is a defensive no-op).
full_df.loc[full_df['pd_AR'] == 'reject', 'cwf_segment'] = 'Rejected'

# Ordered categorical running from 'Rejected' through the tiers in
# risk_seg_map order. The dtype is built from the full label list rather than
# via reorder_categories, which raises ValueError whenever a label is absent
# from the data (e.g. a run with no rejected customers or an empty tier).
cat_order = ['Rejected'] + PD_BINS_TO_CWF['risk_seg_map']
full_df['cwf_segment'] = full_df['cwf_segment'].astype(
    pd.CategoricalDtype(categories=cat_order, ordered=True)
)

### Calculate Credit Limit

In [23]:
# Sum of the two obligation columns; a missing value in either counts as zero
existing_obligations = (
    full_df['OpenAccounts_InstallmentAmount_sum'].fillna(0)
    + full_df['net_burden_inflated'].fillna(0)
)

# Income left after obligations, floored at zero
full_df['final_net_income'] = (full_df['final_income'] - existing_obligations).clip(lower=0)

# Credit limit = net income scaled by the creditworthiness factor, rounded to
# whole units.
# NOTE(review): CL_MIN_LIMIT / CL_MAX_LIMIT from the params cell are not
# applied here - confirm whether credit_limit is meant to be bounded.
full_df['credit_limit'] = (full_df['final_net_income'] * full_df['cwf']).round(0)

In [24]:
# Display-only cell: list the columns assembled in full_df so far
full_df.columns

Index(['client_id', 'net_income_inflated', 'fo_par90_flag',
       'OpenAccounts_InstallmentAmount_sum', 'net_burden_inflated',
       'income_predics', 'pd_predics', 'rule_based_predics_score',
       'rule_based_predics_pd', 'pd_AR', 'income_ratio', 'final_income',
       'income_zone', 'cwf', 'cwf_segment', 'final_net_income',
       'credit_limit'],
      dtype='object')

# 7- Exports

In [25]:
# Columns written to the results file
selected_features = [
    'client_id',
    'pd_AR',
    'income_ratio',
    'final_income',
    'income_zone',
    'cwf',
    'cwf_segment',
    'final_net_income',
    'credit_limit',
]

# Keep only those columns and renumber the rows from zero
df_out = full_df.loc[:, selected_features].reset_index(drop=True)

In [26]:
# Version tag embedded in the output file name
version = 'v0.2'

# Results go next to the model predictions, keyed by snapshot date and version
results_path = data_path / f'models_preds/{data_date}_calculation_center_{version}_results.parquet'
df_out.to_parquet(results_path)