# Tuning Isolation Forest

In [1]:
import numpy  as np
import pandas as pd
from collections import Counter

In [2]:
df_trn = pd.read_csv('../data/training_le.csv')
df_tst = pd.read_csv('../data/test_le.csv')

In [3]:
X_trn = df_trn.drop(columns=['FraudResult'], axis=1)
y_trn = df_trn['FraudResult']

X_tst = df_tst

In [4]:
columns4drop = [
    'BatchId',
    'SubscriptionId',
    'CustomerId',
#     'AccountId',
#     'ProviderId',
#     'ProductId',
#     'ProductCategory',
#     'ChannelId',
#     'Amount',
#     'Value',
    'TransactionStartTime',
#     'PricingStrategy',
]

In [5]:
X_trn_cut = X_trn.drop(columns=columns4drop, axis=1)

In [6]:
X_trn_cut.head()

Unnamed: 0,AccountId,ProviderId,ProductId,ProductCategory,ChannelId,Amount,Value,PricingStrategy
0,3956,5,9,0,2,1000.0,1000,2
1,4840,3,5,2,1,-20.0,20,2
2,4228,5,0,0,2,500.0,500,2
3,647,0,20,9,2,20000.0,21800,2
4,4840,3,5,2,1,-644.0,644,2


In [7]:
from sklearn.ensemble import IsolationForest

In [12]:
# %%timeit
# alg = IsolationForest(behaviour='new', max_samples=100, random_state=rng, contamination='auto')
alg = IsolationForest(behaviour='new', contamination=0.0015)
alg.fit(X_trn_cut)
y_pred_trn = alg.predict(X_trn_cut)

In [13]:
print('True Train Label Distribution: {}'.format(Counter(y_trn)))
print('IF   Train Label Distribution: {}'.format(Counter(y_pred_trn)))

True Train Label Distribution: Counter({0: 95469, 1: 193})
IF   Train Label Distribution: Counter({1: 95520, -1: 142})


### Grid Search CV

In [14]:
X1 = X_trn_cut[ : df_trn.shape[0]]
y1 = y_trn.astype(np.int8)
y1[y_trn==1] = -1
y1[y_trn==0] = 1

In [15]:
from sklearn.model_selection import StratifiedKFold

In [16]:
skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=0)
folds = list(skf.split(X1, y1))

In [17]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score, make_scorer

In [18]:
IF = IsolationForest()
params = {
#     'n_estimators' : range(50, 150, 5),                       # 20
#     'max_samples'  : [128, 256, 512, 1024],                   # 4
    'contamination': np.linspace(0.0010, 0.0030, num=201),    # 101
#     'max_features' : np.linspace(0.1, 1.0, num=11),           # 11
#     'bootstrap'    : ['False', 'True']                        # 2
}                                                             # 20*4*101*11*2=177760*1.7s=84h
f1score = make_scorer(f1_score)
grdsrch = GridSearchCV(estimator=IF, param_grid=params, scoring=f1score, cv=folds, n_jobs=-1)
grdsrch.fit(X1, y1);



In [19]:
df_res = pd.DataFrame(grdsrch.cv_results_)
df_res[['mean_test_score', 'params']].sort_values(by=['mean_test_score'], ascending=False).head()

Unnamed: 0,mean_test_score,params
34,0.999031,{'contamination': 0.00134}
3,0.999021,{'contamination': 0.00103}
10,0.999,{'contamination': 0.0011}
47,0.998995,{'contamination': 0.00147}
26,0.998974,{'contamination': 0.00126}


In [20]:
grdsrch.best_estimator_

IsolationForest(behaviour='old', bootstrap=False, contamination=0.00134,
                max_features=1.0, max_samples='auto', n_estimators=100,
                n_jobs=None, random_state=None, verbose=0, warm_start=False)

In [21]:
IF_opt = grdsrch.best_estimator_

In [22]:
X_tst_cut = X_tst.drop(columns=columns4drop, axis=1)

In [23]:
X_tst_cut.head()

Unnamed: 0,AccountId,ProviderId,ProductId,ProductCategory,ChannelId,Amount,Value,PricingStrategy
0,2440,4,2,0,2,1000.0,1000,3
1,3438,4,14,2,2,2000.0,2000,2
2,4840,3,5,2,1,-50.0,50,2
3,2684,4,9,0,2,3000.0,3000,3
4,4840,3,5,2,1,-60.0,60,2


In [25]:
y_pred = IF_opt.predict(X_tst_cut)
y_pred = [ 1 if x == -1 else 0 for x in y_pred ]
Counter(y_pred)



Counter({0: 45000, 1: 19})

In [28]:
frames = [X_trn_cut, X_tst_cut]
X_uni_cut = pd.concat(frames)
X_uni_cut.shape

(140681, 8)

In [29]:
X_uni_cut.head()

Unnamed: 0,AccountId,ProviderId,ProductId,ProductCategory,ChannelId,Amount,Value,PricingStrategy
0,3956,5,9,0,2,1000.0,1000,2
1,4840,3,5,2,1,-20.0,20,2
2,4228,5,0,0,2,500.0,500,2
3,647,0,20,9,2,20000.0,21800,2
4,4840,3,5,2,1,-644.0,644,2


In [42]:
y_pred = IsolationForest(contamination=0.0020).fit(X_tst_cut).predict(X_tst_cut)
y_pred = [ 1 if x == -1 else 0 for x in y_pred ]
Counter(y_pred)



Counter({0: 44957, 1: 62})

In [43]:
df_sbm = pd.read_csv('../data/sample_submission.csv')
df_sbm['FraudResult'] = y_pred
df_sbm.to_csv('../submitted/AlBo0722_IF_le.csv', encoding='utf-8', index=False)