### Imports

In [15]:
import numpy as np
import pandas as pd

### Weight of evidence (WOE) and Information value (IV)

In [16]:
df = pd.DataFrame({'Rating': np.random.choice(['A', 'B', 'C', 'D'], size=(100)),
                   'Income1': 100 + 10 * np.random.randn(100),
                   'Income2': 100 + 10 * np.random.randn(100),
                   'Default': np.random.choice([0,1], size=(100))})

df.head()

Unnamed: 0,Rating,Income1,Income2,Default
0,B,95.136781,96.819795,0
1,C,109.999996,103.394892,1
2,A,96.90948,89.676331,1
3,B,99.478898,101.700365,0
4,D,83.762867,112.293057,1


In [17]:
feature,target = 'Rating', 'Default'
pd.crosstab(df[feature], df[target], normalize='columns')

Default,0,1
Rating,Unnamed: 1_level_1,Unnamed: 2_level_1
A,0.368421,0.27907
B,0.298246,0.232558
C,0.157895,0.162791
D,0.175439,0.325581


In [18]:
df_woe_iv = (pd.crosstab(df[feature], df[target], normalize='columns')
             .assign(woe=lambda dfx: np.log(dfx[1] / dfx[0]))
             .assign(iv=lambda dfx: np.sum(dfx['woe']*(dfx[1]-dfx[0]))))

df_woe_iv

Default,0,1,woe,iv
Rating,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
A,0.368421,0.27907,-0.277765,0.134146
B,0.298246,0.232558,-0.248777,0.134146
C,0.157895,0.162791,0.030537,0.134146
D,0.175439,0.325581,0.618323,0.134146


In [31]:
# Calculate information value
def calc_iv(df, list_features, target):
    
    df_ivs = pd.DataFrame()
    
    for feature in list_features:

        lst = []

        for i in range(df[feature].nunique()):
            val = list(df[feature].unique())[i]
            lst.append([feature, val, df[df[feature] == val].count()[feature], df[(df[feature] == val) & (df[target] == 1)].count()[feature]])

        data = pd.DataFrame(lst, columns=['Variable', 'Value', 'All', 'Bad'])
        data = data[data['Bad'] > 0]

        data['Share'] = data['All'] / data['All'].sum()
        data['Bad Rate'] = data['Bad'] / data['All']
        data['Distribution Good'] = (data['All'] - data['Bad']) / (data['All'].sum() - data['Bad'].sum())
        data['Distribution Bad'] = data['Bad'] / data['Bad'].sum()
        data['WoE'] = np.log(data['Distribution Good'] / data['Distribution Bad'])
        data['IV'] = (data['WoE'] * (data['Distribution Good'] - data['Distribution Bad'])).sum()

        data = data.sort_values(by=['Variable', 'Value'], ascending=True)

        iv = data['IV'].values[0]
        s = pd.Series([feature, iv])
        df_ivs = df_ivs.append(s, ignore_index=True)
    
    df_ivs.columns = ['Feature', 'IV']
    df_ivs = df_ivs.sort_values(by='IV', ascending=False)
    return df_ivs

In [32]:
calc_iv(df, ['Income1', 'Income2'], 'Default')

Unnamed: 0,Feature,IV
0,Income1,0.0
1,Income2,0.0
