# **Scaling**
---

## **Create Scorecard**

In [9]:
#import library
import pandas as pd
import numpy as np

#import library for modeling
from sklearn.model_selection import cross_validate
from sklearn.linear_model import LogisticRegression

import sys
sys.path.append("../src")
#load configuration
import utils

In [10]:
#load the project configuration through the local utils module
config_data = utils.config_load()
#display the loaded configuration
config_data

{'raw_dataset_path': '../dataset/raw/credit_risk.csv',
 'dataset_path': '../dataset/output/data.pkl',
 'predictors_set_path': '../dataset/output/predictors.pkl',
 'response_set_path': '../dataset/output/response.pkl',
 'train_path': ['../dataset/output/X_train.pkl',
  '../dataset/output/y_train.pkl'],
 'test_path': ['../dataset/output/X_test.pkl', '../dataset/output/y_test.pkl'],
 'data_train_path': '../dataset/output/training_data.pkl',
 'data_train_binned_path': '../dataset/output/bin_training_data.pkl',
 'crosstab_list_path': '../dataset/output/list_crosstab.pkl',
 'WOE_table_path': '../dataset/output/WOE_table.pkl',
 'IV_table_path': '../dataset/output/IV_table.pkl',
 'WOE_map_dict_path': '../dataset/output/WOE_map_dict.pkl',
 'X_train_woe_path': '../dataset/output/X_train_woe.pkl',
 'response_variable': 'loan_status',
 'test_size': 0.2,
 'num_columns': ['person_age',
  'person_income',
  'person_emp_length',
  'loan_amnt',
  'loan_int_rate',
  'loan_percent_income',
  'cb_person_c

In [11]:
def scaling():
    """
    Assign score points to each attribute based on the best model's output

    Returns
    -------
    pandas.DataFrame: The WOE table merged with the model estimates, plus a 'Points' column

    Raises
    ------
    ValueError: If the points of at least one attribute are NaN (for instance when a characteristic of the WOE table has no estimate in the model summary)

    The references (PDO, reference score, reference odds), the best model, the WOE table and the model summary are all read through 'config_data'
    The references are turned into
        Factor = pdo / ln(2)
        Offset = score_ref - Factor * ln(odds_ref)
    and every attribute receives
        Points = Offset/n - Factor * (b0/n + beta * WOE)
    where n is the number of characteristics, b0 the model intercept and beta the estimate of the attribute's characteristic
    The resulting table is dumped to config_data['scorecards_path'] and returned
    """

    #define the references: score, odds, pdo
    pdo = config_data['pdo']
    score = config_data['score_ref']
    odds = config_data['odds_ref']

    #load the best model
    best_model_path = config_data['best_model_path']
    best_model = utils.pickle_load(best_model_path)

    #load the WOE table
    WOE_table_path = config_data['WOE_table_path']
    WOE_table = utils.pickle_load(WOE_table_path)

    #load the best model's estimates table
    best_model_summary_path = config_data['best_model_summary_path']
    best_model_summary = utils.pickle_load(best_model_summary_path)

    #calculate Factor and Offset
    factor = pdo/np.log(2)
    offset = score-(factor*np.log(odds))

    print('===================================================')
    print(f"Odds of good of {odds}:1 at {score} points score.")
    print(f"{pdo} PDO (points to double the odds of good).")
    print(f"Offset = {offset:.2f}")
    print(f"Factor = {factor:.2f}")
    print('===================================================')

    #define n = number of characteristics
    #presumably the summary holds one extra row for the intercept - TODO confirm
    n = best_model_summary.shape[0] - 1

    #define b0
    b0 = best_model.intercept_[0]

    print(f"n = {n}")
    print(f"b0 = {b0:.4f}")

    #adjust characteristic name in best_model_summary table:
    #numeric characteristics are named '<column>_bin' in the WOE table
    #only the 'Characteristic' column is rewritten, and on a new frame, so the
    #table is neither mutated while being iterated nor touched in other columns
    numeric_cols = config_data['num_columns']
    renamed_characteristics = [
        col + '_bin' if col in numeric_cols else col
        for col in best_model_summary['Characteristic']
    ]
    model_estimates = best_model_summary.assign(Characteristic = renamed_characteristics)

    #merge tables to get beta/parameter estimate for each characteristic
    scorecards = pd.merge(left = WOE_table,
                          right = model_estimates,
                          how = 'left',
                          on = ['Characteristic'])

    #define beta and WOE
    beta = scorecards['Estimate']
    WOE = scorecards['WOE']

    #calculate the score point for each attribute
    points = (offset/n) - factor*((b0/n) + (beta*WOE))

    #fail with an explicit message instead of an opaque integer casting error
    no_points = points.isna()
    if no_points.any():
        unmatched = scorecards.loc[no_points, 'Characteristic'].unique().tolist()
        raise ValueError(
            f"Cannot compute points (missing WOE or Estimate) for characteristics: {unmatched}"
        )

    #NOTE(review): astype('int') truncates toward zero (9.94 -> 9, -14.6 -> -14)
    #it is kept as is so existing scores do not change; confirm rounding is not intended
    scorecards['Points'] = points.astype('int')

    #validate
    print('Scorecards table shape : ', scorecards.shape)

    #dump the scorecards
    scorecards_path = config_data['scorecards_path']
    utils.pickle_dump(scorecards, scorecards_path)

    return scorecards

In [12]:
#check the function: build the scorecards table and display it
scorecards = scaling()
scorecards

Odds of good of 30:1 at 200 points score.
20 PDO (points to double the odds of good).
Offset = 101.86
Factor = 28.85
n = 10
b0 = -0.0492
Scorecards table shape :  (49, 5)


Unnamed: 0,Characteristic,Attribute,WOE,Estimate,Points
0,person_age_bin,"(19.999, 23.0]",-0.121524,-0.110589,9
1,person_age_bin,"(23.0, 25.0]",0.006519,-0.110589,10
2,person_age_bin,"(25.0, 27.0]",0.049261,-0.110589,10
3,person_age_bin,"(27.0, 32.0]",0.078916,-0.110589,10
4,person_age_bin,"(32.0, 144.0]",0.054565,-0.110589,10
5,person_income_bin,"(3999.999, 35000.0]",-1.007237,-0.848596,-14
6,person_income_bin,"(35000.0, 48996.0]",-0.052875,-0.848596,9
7,person_income_bin,"(48996.0, 63000.0]",0.175419,-0.848596,14
8,person_income_bin,"(63000.0, 86000.0]",0.492046,-0.848596,22
9,person_income_bin,"(86000.0, 6000000.0]",1.030836,-0.848596,35


In [13]:
#lowest and highest attribute points inside every characteristic
grouped_char = scorecards.groupby('Characteristic')
grouped_points = grouped_char.agg(min = ('Points', 'min'),
                                  max = ('Points', 'max'))
grouped_points

Unnamed: 0_level_0,min,max
Characteristic,Unnamed: 1_level_1,Unnamed: 2_level_1
cb_person_cred_hist_length_bin,9,11
cb_person_default_on_file,4,11
loan_amnt_bin,0,16
loan_int_rate_bin,-30,43
loan_intent,-1,28
loan_percent_income_bin,-22,28
person_age_bin,9,10
person_emp_length_bin,5,13
person_home_ownership,-1,36
person_income_bin,-14,35


In [14]:
#add up the per-characteristic extremes to get the score range of the scorecards
total_points = grouped_points.sum()
min_score, max_score = total_points['min'], total_points['max']

print(f"The lowest credit score = {min_score}")
print(f"The highest credit score = {max_score}")

The lowest credit score = -41
The highest credit score = 231


## **Predict the Credit Score**

In [15]:
def get_points_map_dict():
    """
    Get the Points mapping dictionary

    Returns:
    dict: A dictionary containing the points mapping for each attribute and characteristic

    The scorecards table is loaded from config_data['scorecards_path'] and turned into
        {'Missing': {characteristic: points of its 'Missing' attribute, or NaN if it has none},
         characteristic: {attribute: points, ...},
         ...}
    The resulting dictionary is dumped to config_data['points_map_dict_path'] and returned
    """
    #load the Scorecards table
    scorecards = utils.pickle_load(config_data['scorecards_path'])

    #initialize the dictionary
    points_map_dict = {'Missing': {}}
    missing_points = points_map_dict['Missing']

    #unique() keeps the order of first appearance, so the key order is reproducible
    for char in scorecards['Characteristic'].unique():
        #get the Attribute & Points info for each characteristic
        is_current = scorecards['Characteristic'] == char
        current_data = scorecards.loc[is_current, ['Attribute', 'Points']]

        #get the mapping
        points_map_dict[char] = {}

        #NaN is only the default: it must never overwrite the points of a
        #'Missing' attribute, whatever the position of that row in the table
        missing_points.setdefault(char, np.nan)

        attributes = current_data['Attribute'].to_numpy()
        attribute_points = current_data['Points'].to_numpy()
        for attribute, points in zip(attributes, attribute_points):
            if attribute == 'Missing':
                missing_points[char] = points
            else:
                points_map_dict[char][attribute] = points

    #validate data
    print('Number of key : ', len(points_map_dict.keys()))

    #dump
    utils.pickle_dump(points_map_dict, config_data['points_map_dict_path'])

    return points_map_dict

In [16]:
#check the function: build, dump and display the points mapping dictionary
get_points_map_dict()

Number of key :  11


{'Missing': {'loan_amnt_bin': nan,
  'cb_person_cred_hist_length_bin': nan,
  'person_home_ownership': nan,
  'loan_int_rate_bin': 11,
  'loan_intent': nan,
  'cb_person_default_on_file': nan,
  'person_emp_length_bin': 5,
  'loan_percent_income_bin': nan,
  'person_age_bin': nan,
  'person_income_bin': nan},
 'loan_amnt_bin': {Interval(499.999, 4400.0, closed='right'): 11,
  Interval(4400.0, 6800.0, closed='right'): 16,
  Interval(6800.0, 10000.0, closed='right'): 14,
  Interval(10000.0, 14500.0, closed='right'): 9,
  Interval(14500.0, 35000.0, closed='right'): 0},
 'cb_person_cred_hist_length_bin': {Interval(1.999, 3.0, closed='right'): 9,
  Interval(3.0, 4.0, closed='right'): 10,
  Interval(4.0, 5.0, closed='right'): 11,
  Interval(5.0, 9.0, closed='right'): 10,
  Interval(9.0, 30.0, closed='right'): 10},
 'person_home_ownership': {'MORTGAGE': 24, 'OTHER': -1, 'OWN': 36, 'RENT': 0},
 'loan_int_rate_bin': {Interval(5.419, 7.51, closed='right'): 43,
  Interval(7.51, 10.25, closed='rig

In [17]:
def transform_points(raw_data=None, type=None, config_data=None):
    """
    Replace data value with points

    Args
    ----
    raw_data (DataFrame, optional): The raw data to be transformed. Ignored when 'type' is given
    type (str, optional): The type of data to be transformed (e.g., 'train', 'test'). When given, the data is loaded from the first path of config_data['<type>_path']. If None, 'raw_data' must be provided
    config_data (dict): Configuration data containing file paths and settings. Required despite the None default

    Returns
    -------
    DataFrame: The transformed data with values replaced by points

    Raises
    ------
    ValueError: If 'config_data' is missing, or if neither 'raw_data' nor 'type' is provided

    Every column is mapped through points_map_dict; columns listed in config_data['num_columns'] use the '<column>_bin' entry, the other columns use the entry named after the column
    Values left unmapped (NaN) are filled with the 'Missing' points of the characteristic, which may itself be NaN
    If a 'type' is specified, the transformed data is also dumped to config_data['X_<type>_points_path']
    """
    #fail early with a clear message instead of an opaque TypeError / AttributeError
    if config_data is None:
        raise ValueError("config_data must be provided")
    if type is None and raw_data is None:
        raise ValueError("either raw_data or type must be provided")

    #load the numerical columns
    numeric_cols = config_data['num_columns']

    #load the points_map_dict
    points_map_dict = utils.pickle_load(config_data['points_map_dict_path'])
    missing_points = points_map_dict['Missing']

    #load the saved data if type is not None
    if type is not None:
        raw_data = utils.pickle_load(config_data[f'{type}_path'][0])

    #map the data, then fill what could not be mapped
    #(missing value or out of range value) with the 'Missing' points
    points_data = raw_data.copy()
    for col in points_data.columns:
        map_col = col + '_bin' if col in numeric_cols else col

        #presumably numeric values are matched to the Interval key containing them - TODO confirm
        mapped = points_data[col].map(points_map_dict[map_col])
        points_data[col] = mapped.fillna(value=missing_points[map_col])

    #dump data
    if type is not None:
        utils.pickle_dump(points_data, config_data[f'X_{type}_points_path'])

    return points_data

In [18]:
#check the function on the train set: the data is loaded and the result dumped
#through the paths of config_data, then displayed
X_train_points = transform_points(type='train', config_data=config_data)

X_train_points

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
15884,10,35,24,11.0,20,0,43.0,28,11,10
15138,9,-14,0,11.0,14,11,12.0,28,11,10
7474,10,14,24,13.0,0,0,12.0,-22,11,9
18212,10,-14,36,5.0,0,16,-30.0,-22,11,10
6493,10,14,24,9.0,28,14,29.0,14,11,9
...,...,...,...,...,...,...,...,...,...,...
14621,10,35,24,13.0,1,0,-30.0,-22,11,10
18736,10,22,0,7.0,0,0,12.0,14,11,10
1663,9,-14,0,12.0,20,11,29.0,28,11,9
18257,10,35,0,11.0,1,0,-30.0,14,4,10


In [19]:
def predict_score(raw_data, config_data):
    """
    Predict the credit score for a single applicant.

    Args
    ----
    raw_data (DataFrame): The raw data for which to predict the credit score. Must contain exactly one row.
    config_data (dict): Configuration data containing file paths and settings.

    Returns
    -------
    int: The predicted credit score.

    Raises
    ------
    ValueError: If 'raw_data' does not contain exactly one row.

    This function transforms the raw data into points using the 'transform_points' function and sums the points of the row to get the credit score
    The recommendation APPROVE is printed when the score is strictly greater than config_data['cutoff_score'], REJECT otherwise
    The predicted score is dumped to config_data['score_path'] and returned
    """

    points = transform_points(raw_data = raw_data,
                              type = None,
                              config_data = config_data)

    #one score per row; NaN points are skipped by the sum (pandas default)
    row_scores = points.sum(axis=1)

    #int() on a Series is deprecated in recent pandas and never worked for
    #several rows: check the shape and extract the scalar explicitly
    if len(row_scores) != 1:
        raise ValueError(
            f"predict_score expects exactly one row, got {len(row_scores)}"
        )
    score = int(row_scores.iloc[0])

    cutoff_score = config_data['cutoff_score']

    if score > cutoff_score:
        print("Recommendation : APPROVE")
    else:
        print("Recommendation : REJECT")

    utils.pickle_dump(score, config_data['score_path'])

    return score

In [26]:
# Check the function with raw data input: a single applicant as a one-row DataFrame
# NOTE(review): the keys of the numeric features carry the '_bin' suffix, i.e. they
# match the keys of points_map_dict rather than the raw column names listed in
# config_data['num_columns'] - confirm this is intended
tes_input = {
    'person_age_bin': 25,
    'person_income_bin': 60000,
    'person_emp_length_bin': 1,
    'loan_amnt_bin': 8000,
    'loan_int_rate_bin': 10,
    'loan_percent_income_bin': 0.25,
    'cb_person_cred_hist_length_bin': 2,
    'person_home_ownership': 'RENT',
    'loan_intent': 'MEDICAL',
    'cb_person_default_on_file': 'N'
}

tes = pd.DataFrame(tes_input, index=[0])

tes

Unnamed: 0,person_age_bin,person_income_bin,person_emp_length_bin,loan_amnt_bin,loan_int_rate_bin,loan_percent_income_bin,cb_person_cred_hist_length_bin,person_home_ownership,loan_intent,cb_person_default_on_file
0,25,60000,1,8000,10,0.25,2,RENT,MEDICAL,N


In [27]:
#predict the credit score of the test applicant; the recommendation is printed
#and the score is displayed as the cell output
predict_score(raw_data=tes, config_data = config_data)

Recommendation : REJECT


108