In [1]:
# Target leakage:
# the training data contains features whose values only become known AFTER the moment of prediction,
# so they will not be available when the model is actually used

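# Train-test contamination:
# validation data influences preprocessing, e.g. the imputer is fitted on data
# that includes the validation set instead of on the training data only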

In [2]:
import pandas as pd

# Load the credit card dataset, parsing 'yes'/'no' strings as booleans.
data = pd.read_csv(
    './data/AER_credit_card_data.csv',
    true_values=['yes'],
    false_values=['no'],
)

# `card` is the prediction target; every remaining column is a feature.
y = data['card']
X = data.drop(columns=['card'])

# Show the feature-matrix dimensions, then preview the first rows.
print(X.shape)
X.head()

(1319, 11)


Unnamed: 0,reports,age,income,share,expenditure,owner,selfemp,dependents,months,majorcards,active
0,0,37.66667,4.52,0.03327,124.9833,True,False,3,54,1,12
1,0,33.25,2.42,0.005217,9.854167,False,False,3,34,1,13
2,0,33.66667,4.5,0.004156,15.0,True,False,4,58,1,5
3,0,30.5,2.54,0.065214,137.8692,False,False,0,25,1,7
4,0,32.16667,9.7867,0.067051,546.5033,True,False,2,64,1,5


In [5]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline

# Baseline model: a random forest with no preprocessing steps, wrapped in a
# pipeline so it can be passed directly to cross-validation.
pipeline = make_pipeline(
    RandomForestClassifier(n_estimators=100, random_state=0)
)

# 5-fold cross-validated accuracy using every feature column.
cv_scores = cross_val_score(pipeline, X, y, cv=5, scoring='accuracy')

# Average accuracy across the folds.
cv_scores.mean()

0.9802915082382764

In [6]:
# Near-perfect cross-validated accuracy is suspicious and may indicate leakage.
# Take another look at the feature columns (names, dtypes, non-null counts).

X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1319 entries, 0 to 1318
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   reports      1319 non-null   int64  
 1   age          1319 non-null   float64
 2   income       1319 non-null   float64
 3   share        1319 non-null   float64
 4   expenditure  1319 non-null   float64
 5   owner        1319 non-null   bool   
 6   selfemp      1319 non-null   bool   
 7   dependents   1319 non-null   int64  
 8   months       1319 non-null   int64  
 9   majorcards   1319 non-null   int64  
 10  active       1319 non-null   int64  
dtypes: bool(2), float64(4), int64(5)
memory usage: 95.4 KB


In [11]:
# Split the expenditure column by the boolean target:
# rows where `card` is True vs. rows where it is False.
expenditures_cardholders = X.loc[y, 'expenditure']
expenditures_noncholders = X.loc[~y, 'expenditure']

# Fraction of cardholders whose expenditure is exactly zero.
expenditures_cardholders.eq(0).mean()

0.020527859237536656

In [12]:
# Fraction of non-cardholders whose expenditure is exactly zero.
expenditures_noncholders.eq(0).mean()

1.0

In [15]:
# Every non-cardholder has zero expenditure, so `expenditure` leaks the target.
# Compare the per-class mean of each column to spot other suspicious features.

(
    data
    .groupby('card')
    .agg(['mean'])
)

Unnamed: 0_level_0,reports,age,income,share,expenditure,owner,selfemp,dependents,months,majorcards,active
Unnamed: 0_level_1,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean
card,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2
False,1.587838,33.202984,3.068509,0.000477,0.0,0.304054,0.094595,1.077703,55.300676,0.739865,6.054054
True,0.129032,33.216031,3.451273,0.088482,238.602421,0.479961,0.061584,0.969697,55.258065,0.839687,7.269795


In [17]:
# `share` is partly derived from `expenditure`, so it leaks the target too and should be excluded
# `active` and `majorcards` also carry information about whether a card is held, so exclude them as well

In [18]:
# Columns suspected of leaking the target.
potential_leaks = ['expenditure', 'share', 'active', 'majorcards']

# Feature matrix with the suspect columns removed.
X2 = X.drop(columns=potential_leaks)

# Repeat the same 5-fold accuracy evaluation on the reduced feature set.
cv_scores = cross_val_score(pipeline, X2, y, cv=5, scoring='accuracy')

# Average accuracy across the folds.
cv_scores.mean()

0.829410070284595