# **6. Scaling**
---

Here we will:
- Create the scorecards.
- Generate the points map dictionary
- Predict the credit score from an input

## **6.1 Create Scorecards**
---

Assign a score to each attribute by specifying the following scaling references:
- Odds of good of 30:1 at 300 points score, and
- 20 PDO (points to double the odds of good).

Thus, we can calculate the offset and factor:
- $\text{Factor} = \text{PDO} / \ln(2)$
- $\text{Offset} = \text{Score} - \text{Factor} \times \ln(\text{Odds of good})$

In [3]:
# Import library
import pandas as pd
import numpy as np

# Load configuration
import src.utils as utils

Update the config file to define the scaling references (`pdo`, `score_ref`, `odds_ref`) and the path where the scorecards are dumped (`scorecards_path`), then reload it.

In [4]:
# Load the configuration through utils.config_load() (run this after editing
# the config file) and display it for inspection
CONFIG_DATA = utils.config_load()
CONFIG_DATA

{'raw_dataset_path': 'data/raw/credit_risk_dataset.csv',
 'dataset_path': 'data/output/data.pkl',
 'predictors_set_path': 'data/output/predictors.pkl',
 'response_set_path': 'data/output/response.pkl',
 'train_path': ['data/output/X_train.pkl', 'data/output/y_train.pkl'],
 'test_path': ['data/output/X_test.pkl', 'data/output/y_test.pkl'],
 'data_train_path': 'data/output/data_train.pkl',
 'data_train_binned_path': 'data/output/data_train_binned.pkl',
 'crosstab_list_path': 'data/output/crosstab_list.pkl',
 'WOE_table_path': 'data/output/WOE_table.pkl',
 'IV_table_path': 'data/output/IV_table.pkl',
 'WOE_map_dict_path': 'data/output/WOE_map_dict.pkl',
 'X_train_woe_path': 'data/output/X_train_woe.pkl',
 'response_variable': 'loan_status',
 'test_size': 0.3,
 'num_columns': ['person_age',
  'person_income',
  'person_emp_length',
  'loan_amnt',
  'loan_int_rate',
  'loan_percent_income',
  'cb_person_cred_hist_length'],
 'cat_columns': ['person_home_ownership',
  'loan_intent',
  'loan_g

In [5]:
# Function to convert the model's output into score points
def scaling():
    """Build the scorecards table by assigning score points to each attribute.

    Everything is read from the module-level ``CONFIG_DATA``:

    - ``pdo``, ``score_ref``, ``odds_ref``: the scaling references.
    - ``best_model_path``, ``WOE_table_path``, ``best_model_summary_path``:
      pickled inputs loaded with ``utils.pickle_load``.
    - ``num_columns``: characteristics whose name in the WOE table carries a
      ``'_bin'`` suffix.
    - ``scorecards_path``: where the resulting table is dumped.

    The points of an attribute are computed as::

        Points = round(offset/n - factor * (b0/n + beta * WOE))

    with ``factor = pdo / ln(2)`` and ``offset = score - factor * ln(odds)``.

    Returns
    -------
    pandas.DataFrame
        The WOE table left-merged with the model estimates, plus an integer
        ``Points`` column.

    Raises
    ------
    ValueError
        If the points of any attribute cannot be computed (for example a
        characteristic of the WOE table has no estimate in the model summary).
    """

    # Define the references: score, odds, pdo
    pdo = CONFIG_DATA['pdo']
    score = CONFIG_DATA['score_ref']
    odds = CONFIG_DATA['odds_ref']

    # Load the best model
    best_model = utils.pickle_load(CONFIG_DATA['best_model_path'])

    # Load the WOE table
    WOE_table = utils.pickle_load(CONFIG_DATA['WOE_table_path'])

    # Load the best model's estimates table
    best_model_summary = utils.pickle_load(CONFIG_DATA['best_model_summary_path'])

    # Calculate Factor and Offset
    factor = pdo / np.log(2)
    offset = score - (factor * np.log(odds))

    print('===================================================')
    print(f"Odds of good of {odds}:1 at {score} points score.")
    print(f"{pdo} PDO (points to double the odds of good).")
    print(f"Offset = {offset:.2f}")
    print(f"Factor = {factor:.2f}")
    print('===================================================')

    # Define n = number of characteristics.
    # NOTE(review): the "- 1" presumably discounts an intercept row in the
    # summary table -- confirm against the code that builds the summary.
    n = best_model_summary.shape[0] - 1

    # Define b0
    b0 = best_model.intercept_[0]

    # Align the characteristic names with the WOE table: numerical columns
    # are suffixed with '_bin'. Only the 'Characteristic' column is touched
    # and the loaded summary table itself is left unmodified.
    num_cols = CONFIG_DATA['num_columns']
    binned_names = best_model_summary['Characteristic'].map(
        lambda name: name + '_bin' if name in num_cols else name
    )
    best_model_summary = best_model_summary.assign(Characteristic=binned_names)

    # Merge tables to get beta/parameter estimate for each characteristic
    scorecards = pd.merge(left=WOE_table,
                          right=best_model_summary,
                          how='left',
                          on=['Characteristic'])

    # Define beta and WOE
    beta = scorecards['Estimate']
    WOE = scorecards['WOE']

    # Calculate the score point for each attribute
    raw_points = (offset / n) - factor * ((b0 / n) + (beta * WOE))

    # The integer cast below cannot represent NaN; fail with a message that
    # names the affected characteristics instead of an opaque casting error.
    undefined = raw_points.isna()
    if undefined.any():
        affected = scorecards.loc[undefined, 'Characteristic'].unique().tolist()
        raise ValueError(
            f"Cannot compute points (missing Estimate or WOE) for: {affected}"
        )

    scorecards['Points'] = raw_points.round().astype('int')

    # Validate
    print('Scorecards table shape : ', scorecards.shape)

    # Dump the scorecards
    scorecards_path = CONFIG_DATA['scorecards_path']
    utils.pickle_dump(scorecards, scorecards_path)

    return scorecards

In [6]:
# Run scaling() once: prints the references, dumps the scorecards and
# displays the returned table
scaling()

Odds of good of 30:1 at 300 points score.
20 PDO (points to double the odds of good).
Offset = 201.86
Factor = 28.85
Scorecards table shape :  (49, 5)


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


Unnamed: 0,Characteristic,Attribute,WOE,Estimate,Points
0,person_age_bin,"(19.999, 23.0]",-0.119722,-0.057122,18
1,person_age_bin,"(23.0, 26.0]",0.026104,-0.057122,19
2,person_age_bin,"(26.0, 30.0]",0.047449,-0.057122,19
3,person_age_bin,"(30.0, 144.0]",0.068969,-0.057122,19
4,person_income_bin,"(3999.999, 38524.75]",-0.862418,-0.977773,-6
5,person_income_bin,"(38524.75, 55000.0]",0.024109,-0.977773,19
6,person_income_bin,"(55000.0, 79635.0]",0.31118,-0.977773,27
7,person_income_bin,"(79635.0, 6000000.0]",1.026844,-0.977773,47
8,person_emp_length_bin,"(-0.001, 2.0]",-0.28411,-0.249803,16
9,person_emp_length_bin,"(2.0, 4.0]",0.065751,-0.249803,19


## **6.2 Predict the Credit Score**
---

Here we need to generate the points map dictionary to predict the credit score from an input.

Update the config file with the paths used to dump the points map dictionary (`points_map_dict_path`) and the credit score (`score_path`), then reload it.

In [7]:
# Load the configuration through utils.config_load() (run this after editing
# the config file) and display it for inspection
CONFIG_DATA = utils.config_load()
CONFIG_DATA

{'raw_dataset_path': 'data/raw/credit_risk_dataset.csv',
 'dataset_path': 'data/output/data.pkl',
 'predictors_set_path': 'data/output/predictors.pkl',
 'response_set_path': 'data/output/response.pkl',
 'train_path': ['data/output/X_train.pkl', 'data/output/y_train.pkl'],
 'test_path': ['data/output/X_test.pkl', 'data/output/y_test.pkl'],
 'data_train_path': 'data/output/data_train.pkl',
 'data_train_binned_path': 'data/output/data_train_binned.pkl',
 'crosstab_list_path': 'data/output/crosstab_list.pkl',
 'WOE_table_path': 'data/output/WOE_table.pkl',
 'IV_table_path': 'data/output/IV_table.pkl',
 'WOE_map_dict_path': 'data/output/WOE_map_dict.pkl',
 'X_train_woe_path': 'data/output/X_train_woe.pkl',
 'response_variable': 'loan_status',
 'test_size': 0.3,
 'num_columns': ['person_age',
  'person_income',
  'person_emp_length',
  'loan_amnt',
  'loan_int_rate',
  'loan_percent_income',
  'cb_person_cred_hist_length'],
 'cat_columns': ['person_home_ownership',
  'loan_intent',
  'loan_g

In [8]:
# Generate the Points map dict function
def get_points_map_dict():
    """Build, dump and return the points mapping dictionary.

    The scorecards table is loaded from ``CONFIG_DATA['scorecards_path']``.
    The result has one key per characteristic, mapping each of its attributes
    to its points, plus a ``'Missing'`` key mapping every characteristic to
    the points of its ``'Missing'`` attribute (``np.nan`` when the
    characteristic has no such attribute).

    The dictionary is dumped to ``CONFIG_DATA['points_map_dict_path']``.

    Returns
    -------
    dict
        ``{'Missing': {characteristic: points_or_nan},
        characteristic: {attribute: points}, ...}``
    """
    # Load the Scorecards table
    scorecards = utils.pickle_load(CONFIG_DATA['scorecards_path'])

    # Initialize the dictionary
    points_map_dict = {'Missing': {}}

    # unique() keeps first-appearance order, so the key order is deterministic
    for char in scorecards['Characteristic'].unique():
        # Get the Attribute & Points info for this characteristic
        current_data = scorecards.loc[scorecards['Characteristic'] == char,
                                      ['Attribute', 'Points']]

        points_map_dict[char] = {}

        # Default first: it is only replaced when a 'Missing' attribute exists.
        # Setting the default inside the attribute loop would discard the
        # 'Missing' points whenever that row is not the last one.
        points_map_dict['Missing'][char] = np.nan

        # Get the mapping
        for attribute, points in zip(current_data['Attribute'].to_numpy(),
                                     current_data['Points'].to_numpy()):
            if attribute == 'Missing':
                points_map_dict['Missing'][char] = points
            else:
                points_map_dict[char][attribute] = points

    # Validate data
    print('Number of key : ', len(points_map_dict.keys()))

    # Dump
    utils.pickle_dump(points_map_dict, CONFIG_DATA['points_map_dict_path'])

    return points_map_dict
    

In [9]:
# Run get_points_map_dict() once: dumps the dictionary and displays it
get_points_map_dict()

Number of key :  12


{'Missing': {'loan_percent_income_bin': nan,
  'loan_amnt_bin': nan,
  'loan_intent': nan,
  'person_income_bin': nan,
  'cb_person_default_on_file': nan,
  'person_emp_length_bin': 15,
  'person_home_ownership': nan,
  'loan_grade': nan,
  'loan_int_rate_bin': 19,
  'cb_person_cred_hist_length_bin': nan,
  'person_age_bin': nan},
 'loan_percent_income_bin': {Interval(-0.001, 0.09, closed='right'): 35,
  Interval(0.09, 0.15, closed='right'): 33,
  Interval(0.15, 0.23, closed='right'): 23,
  Interval(0.23, 0.83, closed='right'): -7},
 'loan_amnt_bin': {Interval(499.999, 5000.0, closed='right'): 20,
  Interval(5000.0, 8000.0, closed='right'): 27,
  Interval(8000.0, 12250.0, closed='right'): 22,
  Interval(12250.0, 35000.0, closed='right'): 8},
 'loan_intent': {'DEBTCONSOLIDATION': 7,
  'EDUCATION': 29,
  'HOMEIMPROVEMENT': 10,
  'MEDICAL': 8,
  'PERSONAL': 22,
  'VENTURE': 35},
 'person_income_bin': {Interval(3999.999, 38524.75, closed='right'): -6,
  Interval(38524.75, 55000.0, closed='

Next, transform the raw input data into score points.

In [10]:
def transform_points(raw_data=None, type=None, CONFIG_DATA=None):
    """Translate feature values into scorecard points.

    Parameters
    ----------
    raw_data : pandas.DataFrame, optional
        Data to convert. Replaced by the saved set when ``type`` is given.
    type : str, optional
        When not None, the input is loaded from the first entry of
        ``CONFIG_DATA['<type>_path']`` and the result is dumped to
        ``CONFIG_DATA['X_<type>_points_path']``.
    CONFIG_DATA : dict
        Configuration providing ``'num_columns'``, ``'points_map_dict_path'``
        and, when ``type`` is given, the two paths mentioned above.

    Returns
    -------
    pandas.DataFrame
        A copy of the input in which every value is replaced by its points.
    """
    # Names of the numerical columns
    numeric_features = CONFIG_DATA['num_columns']

    # Points mapping dictionary
    points_lookup = utils.pickle_load(CONFIG_DATA['points_map_dict_path'])

    # Optionally work on a saved data set instead of the given one
    use_saved_set = type is not None
    if use_saved_set:
        raw_data = utils.pickle_load(CONFIG_DATA[f'{type}_path'][0])

    def lookup_key(column):
        # Numerical columns are keyed by their binned name in the mapping
        return column + '_bin' if column in numeric_features else column

    points_data = raw_data.copy()

    # First pass: replace each value by the points of its attribute
    for column in points_data.columns:
        column_points = points_lookup[lookup_key(column)]
        points_data[column] = points_data[column].map(column_points)

    # Second pass: values left unmapped (missing or out of range) receive the
    # points stored under 'Missing' for that characteristic
    for column in points_data.columns:
        fallback = points_lookup['Missing'][lookup_key(column)]
        points_data[column] = points_data[column].fillna(value=fallback)

    # Dump data
    if use_saved_set:
        utils.pickle_dump(points_data, CONFIG_DATA[f'X_{type}_points_path'])

    return points_data

In [11]:
# Check the function on the train set: with type='train' the input is loaded
# from CONFIG_DATA['train_path'][0] and the result is also dumped to
# CONFIG_DATA['X_train_points_path']
X_train_points = transform_points(type='train', CONFIG_DATA=CONFIG_DATA)

X_train_points

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
11491,19,27,8,16.0,7,30,22,18.0,23,18,17
3890,18,19,33,19.0,29,22,20,18.0,33,18,19
17344,19,-6,8,16.0,7,49,22,18.0,-7,18,17
13023,19,47,8,16.0,10,30,22,18.0,33,18,17
29565,19,-6,8,19.0,8,30,8,19.0,-7,18,19
...,...,...,...,...,...,...,...,...,...,...,...
22095,19,-6,8,19.0,35,49,27,18.0,23,18,20
18736,19,27,8,16.0,8,22,8,18.0,-7,18,20
1663,18,-6,8,20.0,29,49,20,18.0,35,18,17
18257,19,47,8,19.0,10,-44,8,19.0,23,19,20


Then, add a function to calculate the credit score.

Update the config file to dump the credit score.

In [18]:
# Load the configuration through utils.config_load() (run this after editing
# the config file) and display it for inspection
CONFIG_DATA = utils.config_load()
CONFIG_DATA

{'raw_dataset_path': 'data/raw/credit_risk_dataset.csv',
 'dataset_path': 'data/output/data.pkl',
 'predictors_set_path': 'data/output/predictors.pkl',
 'response_set_path': 'data/output/response.pkl',
 'train_path': ['data/output/X_train.pkl', 'data/output/y_train.pkl'],
 'test_path': ['data/output/X_test.pkl', 'data/output/y_test.pkl'],
 'data_train_path': 'data/output/data_train.pkl',
 'data_train_binned_path': 'data/output/data_train_binned.pkl',
 'crosstab_list_path': 'data/output/crosstab_list.pkl',
 'WOE_table_path': 'data/output/WOE_table.pkl',
 'IV_table_path': 'data/output/IV_table.pkl',
 'WOE_map_dict_path': 'data/output/WOE_map_dict.pkl',
 'X_train_woe_path': 'data/output/X_train_woe.pkl',
 'response_variable': 'loan_status',
 'test_size': 0.3,
 'num_columns': ['person_age',
  'person_income',
  'person_emp_length',
  'loan_amnt',
  'loan_int_rate',
  'loan_percent_income',
  'cb_person_cred_hist_length'],
 'cat_columns': ['person_home_ownership',
  'loan_intent',
  'loan_g

In [16]:
# Function to predict the credit score
def predict_score(raw_data, CONFIG_DATA):
    """Predict the credit score of a single applicant.

    The input is converted to points with ``transform_points`` and the points
    of the row are summed. The score is dumped to ``CONFIG_DATA['score_path']``
    and returned. No approve/reject decision is made here.

    Parameters
    ----------
    raw_data : pandas.DataFrame
        Exactly one row holding the applicant's values.
    CONFIG_DATA : dict
        Configuration forwarded to ``transform_points``; must also provide
        ``'score_path'``.

    Returns
    -------
    int
        The credit score (sum of the points, truncated to an integer).

    Raises
    ------
    TypeError
        If ``raw_data`` does not contain exactly one row.
    """

    points = transform_points(raw_data = raw_data,
                              type = None,
                              CONFIG_DATA = CONFIG_DATA)

    # One total per input row
    row_totals = points.sum(axis=1)

    # int(Series) is deprecated in recent pandas, so the single total is
    # extracted explicitly. TypeError is kept because that is what the
    # int(Series) conversion raised for inputs that are not a single row.
    if len(row_totals) != 1:
        raise TypeError(
            f"predict_score expects exactly one row, got {len(row_totals)}"
        )
    score = int(row_totals.iloc[0])

    utils.pickle_dump(score, CONFIG_DATA['score_path'])

    return score


In [15]:
# Check the function with a single applicant's values.
# NOTE: the numerical keys already carry the '_bin' suffix, so they are not
# found in num_columns and transform_points uses them directly as keys of the
# points map dictionary.
tes_input = {
    'person_age_bin': 23,
    'person_income_bin': 55000,
    'person_emp_length_bin': 2,
    'loan_amnt_bin': 9000,
    'loan_int_rate_bin': 8,
    'loan_percent_income_bin': 0.21,
    'cb_person_cred_hist_length_bin': 4,
    'person_home_ownership': 'RENT',
    'loan_intent': 'MEDICAL',
    'loan_grade': 'B',
    'cb_person_default_on_file': 'N'
}

# One-row frame, as predict_score converts the row total to a single int
tes = pd.DataFrame(tes_input, index=[0])

tes

Unnamed: 0,person_age_bin,person_income_bin,person_emp_length_bin,loan_amnt_bin,loan_int_rate_bin,loan_percent_income_bin,cb_person_cred_hist_length_bin,person_home_ownership,loan_intent,loan_grade,cb_person_default_on_file
0,23,55000,2,9000,8,0.21,4,RENT,MEDICAL,B,N


In [19]:
# Predict the credit score of the one-row test input; the score is also
# dumped to CONFIG_DATA['score_path']
predict_score(raw_data=tes, CONFIG_DATA=CONFIG_DATA)

199