In [1]:
import pandas as pd 
import numpy as np 
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
import copy 

This notebook uses Information Value (IV) to select the best variables and transforms each selected feature with Weight of Evidence (WoE). The transformed data is then used to train a logistic regression model that includes second-order interaction variables (pairwise products of features).

Important
*--- There is data leakage: the WoE mapping is computed on the full training set and is then used to train the LR models ---*

# Load data

In [2]:
# Read the three competition files in a fixed order: train, test, greeks.
train, test, greeks = (
    pd.read_csv(csv_path)
    for csv_path in (
        'icr-identify-age-related-conditions/train.csv',
        'icr-identify-age-related-conditions/test.csv',
        'icr-identify-age-related-conditions/greeks.csv',
    )
)

# Remove every space character from the column labels of train and test
# so both frames expose the same feature names.
for frame in (train, test):
    frame.columns = frame.columns.str.replace(' ', '')

# Greeks will be used in the stratified k-fold strategy

In [3]:
# Build the stratification key 'k' by adding the four greeks columns
# left to right (Alpha + Beta + Gamma + Delta).
stratum = greeks['Alpha']
for greek_col in ('Beta', 'Gamma', 'Delta'):
    stratum = stratum + greeks[greek_col]
greeks['k'] = stratum

# Attach 'k' to the training rows; 'k' and 'Id' become the leading columns.
train = greeks[['k', 'Id']].merge(train, on='Id')

# Feature columns used throughout the notebook.
names = [
    'AB', 'AF', 'AH', 'AM', 'AR', 'AX', 'AY', 'AZ',
    'BC', 'BD', 'BN', 'BP', 'BQ', 'BR', 'BZ',
    'CB', 'CC', 'CD', 'CF', 'CH', 'CL', 'CR', 'CS', 'CU', 'CW',
    'DA', 'DE', 'DF', 'DH', 'DI', 'DL', 'DN', 'DU', 'DV', 'DY',
    'EB', 'EE', 'EG', 'EH', 'EJ', 'EL', 'EP', 'EU',
    'FC', 'FD', 'FE', 'FI', 'FL', 'FR', 'FS',
    'GB', 'GE', 'GF', 'GH', 'GI', 'GL',
]
# Name of the binary target column.
target_name = 'Class'

# Data Cleaning

In [4]:
# Encode the EJ column as an integer flag: 'A' -> 1, any other value -> 0.
# NOTE(review): this mapping is not idempotent - once EJ has been replaced by
# numbers, re-running the cell no longer finds 'A' in the column.
train['EJ'] = pd.Series(np.where(train.EJ.values == 'A', 1, 0), train.index)
test['EJ'] = pd.Series(np.where(test.EJ.values == 'A', 1, 0), test.index)

# Fill missing values with the column means of the training data
# (the test frame is also filled with the *training* means).
train[names] = train[names].fillna(train[names].mean())
test[names] = test[names].fillna(train[names].mean())
# Clip every test feature to the [min, max] range observed in train.
# From here on `test` only contains the `names` columns.
test = test[names].clip(train[names].min(axis=0).values,train[names].max(axis=0).values, axis=1)

# Standardise the features before building the interactions (products);
# the scaler is fitted on train only and then applied to test.
scaler = StandardScaler()

# Work on copies so that `train` / `test` keep the unscaled values.
train2 = copy.copy(train)
teste2 = copy.copy(test)

vals = scaler.fit_transform(train[names])
vals_test = scaler.transform(test[names])

# Overwrite the feature columns of the copies with the scaled values.
train2[names] = vals
teste2[names] = vals_test

# Defining 2 order interactions

In [5]:
# def multiply and make a array of all interactions
def mab(df,nome1,nome2):
    """Return the element-wise product of two columns divided by its maximum.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding both columns.
    nome1, nome2 : str
        Labels of the two columns to multiply.

    Returns
    -------
    pandas.Series
        ``df[nome1] * df[nome2]`` divided by the largest product value.

    Notes
    -----
    The divisor comes from ``df`` itself, so calling this on two different
    frames scales each one by its own maximum.
    A maximum of zero yields inf/NaN, and a negative maximum flips the sign
    of every value; neither case is guarded here.
    """
    a = df[nome1] * df[nome2]
    # Series.max() is vectorised and skips NaN. The builtin max() walks the
    # values in Python and returns NaN whenever the first element is NaN,
    # which would turn the whole result into NaN.
    return a / a.max()

# Every unordered pair of distinct features (n1 before n2 in `names`).
feature_pairs = [
    (n1, n2)
    for pos, n1 in enumerate(names, start=1)
    for n2 in names[pos:]
]

# One normalised product Series per pair, named '<n1>_mul_<n2>'.
h = [mab(train2, n1, n2).rename(n1+'_mul_'+n2) for n1, n2 in feature_pairs]
ht = [mab(teste2, n1, n2).rename(n1+'_mul_'+n2) for n1, n2 in feature_pairs]

# A list of Series becomes one *row* per interaction; the frames are
# therefore interaction-by-sample at this point.
newF = pd.DataFrame(h)
newF_test = pd.DataFrame(ht)

Information Value (IV) and Weight of Evidence (WoE) are widely used in credit-scoring models built on logistic regression. IV is a bivariate metric that ranks the predictive power of the variables in relation to the target. WoE transforms the data into a monotonic pattern, which is fundamental for the logistic regression model to perform well.


In [6]:
#https://lucastiagooliveira.github.io/datascience/iv/woe/python/2020/12/15/iv_woe.html
def iv_woe(data, target, bins=10, show_woe=False):
    """Compute Information Value (IV) and Weight of Evidence (WoE) per column.

    Every column of ``data`` except ``target`` is processed. Numeric columns
    (dtype kind in 'bifc') with more than 10 distinct values are quantile
    binned with ``pd.qcut``; all other columns are grouped on their raw values.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the candidate variables and the target column.
    target : str
        Label of the target column; its per-group sum is used as the number
        of events.
    bins : int, default 10
        Number of quantile bins requested from ``pd.qcut`` (duplicate edges
        are dropped, so fewer bins may result).
    show_woe : bool, default False
        When true, print the per-group table of every variable.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        * IV table with columns ``Variable`` and ``IV`` (one row per variable).
        * WoE table with columns ``Variable``, ``Cutoff``, ``N``, ``Events``,
          ``% of Events``, ``Non-Events``, ``% of Non-Events``, ``WoE``, ``IV``
          (one row per group of every variable).
        Both are empty frames when ``data`` has no column besides ``target``.
    """
    # Collect the per-variable pieces and concatenate once at the end:
    # growing a DataFrame with pd.concat inside the loop is quadratic and
    # concatenating onto an empty frame is deprecated in recent pandas.
    iv_parts, woe_parts = [], []

    # Extract column names
    cols = data.columns

    # Run WoE and IV on all the independent variables
    for ivars in cols[~cols.isin([target])]:
        if (data[ivars].dtype.kind in 'bifc') and (len(np.unique(data[ivars])) > 10):
            binned_x = pd.qcut(data[ivars], bins, duplicates='drop')
            d0 = pd.DataFrame({'x': binned_x, 'y': data[target]})
        else:
            d0 = pd.DataFrame({'x': data[ivars], 'y': data[target]})

        # Number of rows and number of events in each group (bin).
        # observed=False pins the historical default for categorical groupers
        # so the result does not depend on the installed pandas version.
        d = d0.groupby("x", as_index=False, observed=False).agg({"y": ["count", "sum"]})
        d.columns = ['Cutoff', 'N', 'Events']

        # Share of all events that falls in each group; counts are floored at
        # 0.5 so that an empty group does not produce log(0).
        d['% of Events'] = np.maximum(d['Events'], 0.5) / d['Events'].sum()

        # Non-events per group and their share, floored the same way.
        d['Non-Events'] = d['N'] - d['Events']
        d['% of Non-Events'] = np.maximum(d['Non-Events'], 0.5) / d['Non-Events'].sum()

        # WoE = ln(% of events / % of non-events)
        d['WoE'] = np.log(d['% of Events'] / d['% of Non-Events'])
        d['IV'] = d['WoE'] * (d['% of Events'] - d['% of Non-Events'])
        d.insert(loc=0, column='Variable', value=ivars)

        iv_parts.append(
            pd.DataFrame({"Variable": [ivars], "IV": [d['IV'].sum()]}, columns=["Variable", "IV"])
        )
        woe_parts.append(d)

        # Show WoE table
        if show_woe:
            print(d)

    if not iv_parts:
        # Nothing to rank: mirror the empty frames of the incremental version.
        return pd.DataFrame(), pd.DataFrame()

    newDF = pd.concat(iv_parts, axis=0)
    woeDF = pd.concat(woe_parts, axis=0)
    return newDF, woeDF

In [7]:
# First IV/WoE pass over the scaled training frame. `a` receives the IV table
# and `b` the per-group WoE table. Every column except the target is scored,
# so the identifier columns 'k' and 'Id' are ranked as well.
a,b = iv_woe(train2, target_name, bins=10, show_woe=False)

In [8]:
# Column names ordered by IV, highest first. The identifier columns 'k' and
# 'Id' show up at the top of this list because they were scored along with
# the features; they are not candidate predictors.
a.sort_values(by='IV',ascending=False).Variable.values 

array(['k', 'Id', 'DU', 'GL', 'FL', 'CR', 'DA', 'AF', 'AB', 'BQ', 'DI',
       'EB', 'FD', 'EE', 'EH', 'FR', 'CD', 'DE', 'CC', 'BN', 'FI', 'FE',
       'DH', 'EU', 'GF', 'DF', 'BC', 'DL', 'AM', 'BP', 'AH', 'AR', 'GH',
       'DN', 'CS', 'GB', 'DY', 'CF', 'CB', 'GI', 'BD', 'FC', 'BR', 'CU',
       'FS', 'AZ', 'EL', 'EJ', 'CW', 'AX', 'GE', 'AY', 'EG', 'EP', 'CH',
       'CL', 'BZ', 'DV'], dtype=object)

In [9]:
# Rank every scored column by IV once (highest first) and reuse the ordering.
iv_ranked = a.sort_values(by='IV',ascending=False).Variable.values

# Reorder the training frame so the columns with the highest IV come first.
# .copy() yields an independent frame, so adding the target column below does
# not write into a slice of `train` (no SettingWithCopyWarning).
trainE = train[iv_ranked].copy()
trainE[target_name] = train[target_name]
# 'k' and 'Id' are identifiers, not features, so they are left out of the test
# selection by name instead of assuming they occupy the first two positions.
testeE = test[[col for col in iv_ranked if col not in ('k', 'Id')]]

# Join the original variables with the interactions between them
# (newF / newF_test hold one interaction per row, hence the transpose).
ff = pd.concat([trainE,newF.T],axis=1)
ff_teste = pd.concat([testeE,newF_test.T],axis=1)

# Second IV/WoE pass, now including the interaction columns.
a,b = iv_woe(ff, target_name, bins=10, show_woe=False)

# Discard every variable whose IV is not above 0.05.
a = a.loc[a['IV']> 0.05]

allNames = a.sort_values(by='IV',ascending=False).Variable.values
# Only the interaction columns are taken from the IV-filtered list ...
crossNames = [x for x in allNames if '_mul_' in x]

# ... while all original columns are kept, followed by the surviving interactions.
nomes2 = list(trainE)+crossNames
nomes2.remove(target_name)

In [10]:
# Absolute correlation above this value counts as "correlated".
threshold = 0.3

# Pairwise correlation of the candidate columns; the first two entries of
# `nomes2` are skipped.
cc = ff[nomes2[2:]].corr()

# Boolean matrix: True where |correlation| exceeds the threshold.
mat_x = cc.abs().gt(threshold).to_numpy()

In [11]:
# Keep the variables with low correlation to everything ranked before them
# (there are roughly 70 of them), stopping once `max_vars` have been collected.
max_vars = 100
nomes = list(cc)

# The first column is always kept.
var1 = [nomes[0]]
count = len(var1)

for n in range(1, len(nomes)):
    # Row n restricted to columns 0..n. A total of exactly one flag means no
    # earlier column exceeds the threshold - presumably the single hit is the
    # column's own diagonal entry.
    flagged = mat_x[n, :n + 1].sum()
    if flagged != 1:
        continue

    var1.append(nomes[n])
    count += 1

    if count == max_vars:
        break

In [12]:
# 'CW', 'AZ', 'FS', 'BR', 'FE', 'BN', 'DE', 'AF', 'CR',

# Interaction columns removed by hand from the selected variables.
features_to_drop = [
    'CR_mul_DE', 'BQ_mul_FE', 'CR_mul_GE',
    'EE_mul_GF', 'CR_mul_FE', 'BQ_mul_FC',
    'DE_mul_DL', 'AZ_mul_GL', 'CW_mul_DL',
    'BN_mul_CR', 'DN_mul_FI', 'AZ_mul_FE',
    'CW_mul_EL', 'AZ_mul_CU', 'CW_mul_DY',
    'DH_mul_DL', 'AX_mul_CU', 'BN_mul_DE',
    'BN_mul_CW', 'AZ_mul_EL', 'AZ_mul_DE',
]

# Preserve the order of `var1` while leaving out the listed columns.
_excluded = set(features_to_drop)
var1 = list(filter(lambda v: v not in _excluded, var1))

In [13]:
# Build one {Cutoff: WoE} lookup per selected variable, in the order of `var1`.
list_dics = []

for var in var1:
    # Rows of the WoE table that belong to this variable
    woe_rows = b.loc[b['Variable'] == var]
    # Pair every cutoff with its WoE value; a repeated cutoff keeps the last value
    list_dics.append(dict(zip(woe_rows['Cutoff'], woe_rows['WoE'])))

In [14]:
# Train and test data restricted to the selected variables. The training frame
# also keeps the target and the stratification key 'k'.
# .copy() makes both frames independent of `ff` / `ff_teste`, so the in-place
# column assignments that follow operate on real frames rather than on slices
# (which is what triggers pandas' SettingWithCopyWarning).
df_original = ff[var1+[target_name]+ ['k'] ].copy()
df_test2 = ff_teste[var1].copy()
# From here on `names` refers to the selected variables (same list object as `var1`).
names = var1

In [15]:
# pandas is emitting many warnings that clutter the output, so they are silenced.
# NOTE(review): called without a category, this filter hides *every* warning
# from every library for the rest of the session, not only the pandas ones -
# consider narrowing it to the specific warning class.
import warnings
warnings.filterwarnings('ignore')


In [16]:
# In this part there is some data leakage, as the WoE lookups were built
# from the full dataset.
# Replace every selected column by its WoE value, in train and in test;
# `list_dics` is aligned position by position with `var1`.
for var, woe_lookup in zip(var1, list_dics):
    df_original.loc[:, var] = df_original[var].map(woe_lookup)
    df_test2.loc[:, var] = df_test2[var].map(woe_lookup)

In [17]:
# Values that found no entry in the WoE lookup are NaN after the mapping step;
# replace them with the column mean of the WoE-encoded training data.
df_original.loc[:,names] = df_original[names].fillna(df_original[names].mean())
# The test frame is filled with the *training* means as well.
df_test2.loc[:,names] = df_test2[names].fillna(df_original[names].mean())

In [18]:
#https://www.kaggle.com/competitions/icr-identify-age-related-conditions/discussion/409691
from sklearn.utils import class_weight

def balanced_log_loss(y_true, y_pred):
    """Balanced logarithmic loss for a binary target.

    The log loss is averaged separately over the observations of class 0 and
    of class 1, and the two class averages are then averaged, so both classes
    weigh the same regardless of their size.

    Parameters
    ----------
    y_true : array-like of {0, 1}
        True labels.
    y_pred : array-like of float
        Predicted probability of class 1, in the same order as ``y_true``.

    Returns
    -------
    float
        The balanced log loss. A class with no observations contributes 0.
    """
    # Work on plain arrays: this accepts lists, and it prevents pandas from
    # aligning two Series on their index labels (which silently produces NaN
    # products, and therefore a wrong loss, when the indexes differ).
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred, dtype=float)

    # Number of observations in each class
    N_1 = np.sum(y_true == 1, axis=0)
    N_0 = np.sum(y_true == 0, axis=0)

    N_inv_0 = 1/N_0 if N_0 > 0 else 0
    N_inv_1 = 1/N_1 if N_1 > 0 else 0

    # Keep p inside [1e-15, 1 - 1e-15] to avoid the extremes of the log function
    y_pred = np.clip(y_pred, 1e-15, 1 - 1e-15)

    # Mean log loss of each class
    loss_0 = -N_inv_0 * np.sum((1 - y_true) * np.log(1 - y_pred))
    loss_1 = -N_inv_1 * np.sum(y_true * np.log(y_pred))

    return (loss_0 + loss_1) / 2

In [22]:
n_splits = 10

# Running sums over every (seed, fold) model; divided by the model count below.
predictions_LR = 0
cv_score_LR = 0
train_score_LR = 0

# Seeds of the repeated stratified K-fold: each one gives a different partition.
rr = [42, 21, 100, 45, 1, 228]

# Loop-invariant: the test matrix is the same for every model. It is passed as
# an ndarray because the models are fitted on ndarrays as well (mixing a
# DataFrame in at predict time makes scikit-learn warn about feature names).
X_submit = df_test2[names].to_numpy()

for v_fold in rr:
    print(v_fold)
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=v_fold)
    # The first argument of split() only provides the number of rows; the folds
    # are stratified on the greeks-based key 'k', not on the target.
    for train_index, test_index in skf.split(df_original[target_name], df_original['k']):

        model = LogisticRegression(random_state=0, C=0.1, n_jobs=-1, max_iter=2000, class_weight='balanced')

        df_train = df_original.iloc[train_index]
        df_test = df_original.iloc[test_index]

        df_train1 = df_train[names].to_numpy()
        df_test1 = df_test[names].to_numpy()

        model.fit(df_train1, df_train[target_name])

        # Out-of-fold score (probability of class 1)
        y_hat_teste_LR = model.predict_proba(df_test1)[:,1]
        val = balanced_log_loss(df_test[target_name], y_hat_teste_LR)
        print(val)
        cv_score_LR += val

        # In-fold score, reported to compare against the CV score
        y_hat_train_LR = model.predict_proba(df_train1)[:,1]
        train_score_LR += balanced_log_loss(df_train[target_name], y_hat_train_LR)

        # Accumulate both class probabilities for the test set
        predictions_LR += model.predict_proba(X_submit)

# Average over all fitted models
n_models = n_splits*len(rr)
print('Train :', train_score_LR/n_models )
print('CV score: ', cv_score_LR/n_models )
predictions_LR = predictions_LR/n_models


42
0.24280263471097113
0.13642566102885578
0.14156763496868685
0.1793804218790115
0.19496722447133227
0.14730339957833405
0.13584909662873615
0.09828051839798833
0.13543981745162145
0.36578212067475663
21
0.2450706588681173
0.09270907267188531
0.12540697391728722
0.1135619390823405
0.1941062647471148
0.22449607693783752
0.12270661135463531
0.22601885617284784
0.2972048581555248
0.1562382273409431
100
0.08929361224045272
0.23580141316254477
0.20375731451269444
0.13245838063725
0.2234422585489368
0.14705897315903718
0.27020893835493137
0.10698643899002926
0.16886928545583238
0.12298801873969596
45
0.21032641066574131
0.11078042447855894
0.2991066403949558
0.08749303785239407
0.34922175774079733
0.09496181589985117
0.12321482215272472
0.13246330764589648
0.16087548618657038
0.16424358955522916
1
0.1921066020326651
0.15856892361524655
0.14791719032398493
0.13207819149880284
0.18799761649311703
0.2773032463536865
0.15374101777027327
0.08832960667135527
0.26613424180760636
0.1495172992267387

In [20]:
# Train : 0.1403696989231827
# CV score:  0.169496932984689

Maybe number is 31

011001    1.29

001010    1.23