# Тьюнинг EllipticEnvelope

In [1]:
import numpy  as np
import pandas as pd

In [2]:
df_trn = pd.read_csv('../data/training_le.csv')
df_tst = pd.read_csv('../data/test_le.csv')
df_sbm = pd.read_csv('../data/sample_submission.csv')

In [3]:
X_trn = df_trn.drop(columns=['FraudResult'], axis=1)
y_trn = df_trn['FraudResult']

X_tst = df_tst

In [4]:
from collections import Counter
print('Train Label Distribution: {}'.format(Counter(y_trn)))

Train Label Distribution: Counter({0: 95469, 1: 193})


In [5]:
num_transactions = y_trn.shape[0]
num_otliers = y_trn.value_counts()[1]
num_inliers = y_trn.value_counts()[0]
outliers_fraction = num_otliers / num_transactions
print('Train outliers fraction:', round(outliers_fraction, 3))

Train outliers fraction: 0.002


In [6]:
frames = [X_trn, X_tst]
X_uni = pd.concat(frames)
X_uni.shape

(140681, 12)

In [7]:
from sklearn.covariance import EllipticEnvelope

In [8]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

In [9]:
def running_EE(X, y_eta_trn):
    print(' '*37, 'Confusion-Matrix')
    print('Algorithm', ' '*30, 'TN       FP')
    print(' '*17, 'Frd_tst  Frd_trn       FN       TP     Precision  Recall     F1-score')
    print('='*100)    

    algorithm = EllipticEnvelope(contamination=outliers_fraction)
    y_pred = algorithm.fit(X).predict(X)

    y_pred = [ 1 if x == -1 else 0 for x in y_pred ]
    y_pred_trn = y_pred[ : len(y_trn)]
    y_pred_tst = y_pred[len(y_trn) : ]

    n_frauds_trn      = Counter(y_pred_trn)[1]
    n_frauds_tst      = Counter(y_pred_tst)[1]
    tn, fp, fn, tp    = confusion_matrix(y_eta_trn, y_pred_trn).ravel()
    precision         = precision_score(y_eta_trn, y_pred_trn)
    recall            = recall_score(y_eta_trn, y_pred_trn)
    f1                = f1_score(y_eta_trn, y_pred_trn)

    print('%s %8i %8i' % (' '*34, tn, fp))
    print('%s %8i %8i %8i %8i %10.4f %10.4f %10.4f' % 
          ('EE    '+' '*10, n_frauds_tst, n_frauds_trn, fn, tp, precision, recall, f1))
        
    print('-'*100)

### `Amount`

In [10]:
X_trn.head()

Unnamed: 0,BatchId,AccountId,SubscriptionId,CustomerId,ProviderId,ProductId,ProductCategory,ChannelId,Amount,Value,TransactionStartTime,PricingStrategy
0,36122,3956,886,4405,5,9,0,2,1000.0,1000,2018-11-15 02:18:49,2
1,15641,4840,3828,4405,3,5,2,1,-20.0,20,2018-11-15 02:19:08,2
2,53940,4228,221,4682,5,0,0,2,500.0,500,2018-11-15 02:44:21,2
3,102362,647,2184,987,0,20,9,2,20000.0,21800,2018-11-15 03:32:55,2
4,38779,4840,3828,987,3,5,2,1,-644.0,644,2018-11-15 03:34:21,2


In [11]:
columns4drop = [
    'BatchId',
    'AccountId',
    'SubscriptionId',
    'CustomerId',
    'ProviderId',
    'ProductId',
    'ProductCategory',
    'ChannelId',
    'Value',
    'TransactionStartTime',
    'PricingStrategy'
]
X_amount = X_trn.drop(columns=columns4drop, axis=1)
X_amount.head()

Unnamed: 0,Amount
0,1000.0
1,-20.0
2,500.0
3,20000.0
4,-644.0


In [12]:
algorithm = EllipticEnvelope(contamination=outliers_fraction)
y_pred = algorithm.fit(X_amount).predict(X_amount)
y_pred = [ 1 if x == -1 else 0 for x in y_pred ]
print('f1-score=%.5f' % f1_score(y_trn, y_pred))

f1-score=0.81132


### `Value`

In [13]:
columns4drop = [
    'BatchId',
    'AccountId',
    'SubscriptionId',
    'CustomerId',
    'ProviderId',
    'ProductId',
    'ProductCategory',
    'ChannelId',
    'Amount',
    'TransactionStartTime',
    'PricingStrategy'
]
X_value = X_trn.drop(columns=columns4drop, axis=1)
X_value.head()

Unnamed: 0,Value
0,1000
1,20
2,500
3,21800
4,644


In [14]:
algorithm = EllipticEnvelope(contamination=outliers_fraction)
y_pred = algorithm.fit(X_value).predict(X_value)
y_pred = [ 1 if x == -1 else 0 for x in y_pred ]
print('f1-score=%.5f' % f1_score(y_trn, y_pred))

f1-score=0.65775


### `Amount` and `Value`

In [15]:
columns4drop = [
    'BatchId',
    'AccountId',
    'SubscriptionId',
    'CustomerId',
    'ProviderId',
    'ProductId',
    'ProductCategory',
    'ChannelId',
    'TransactionStartTime',
    'PricingStrategy'
]
X_am_val = X_trn.drop(columns=columns4drop, axis=1)
X_am_val.head()

Unnamed: 0,Amount,Value
0,1000.0,1000
1,-20.0,20
2,500.0,500
3,20000.0,21800
4,-644.0,644


In [16]:
algorithm = EllipticEnvelope(contamination=outliers_fraction)
y_pred = algorithm.fit(X_am_val).predict(X_am_val)
y_pred = [ 1 if x == -1 else 0 for x in y_pred ]
print('f1-score=%.5f' % f1_score(y_trn, y_pred))

f1-score=0.31755


### `Amount` scalers

In [17]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import minmax_scale
from sklearn.preprocessing import MaxAbsScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import QuantileTransformer
from sklearn.preprocessing import PowerTransformer

In [18]:
def running_EE_scaler(name, X, y_eta_trn):
    algorithm = EllipticEnvelope(contamination=outliers_fraction, support_fraction=1.0)
    y_pred = algorithm.fit(X).predict(X)

    y_pred = [ 1 if x == -1 else 0 for x in y_pred ]
    y_pred_trn = y_pred[ : len(y_trn)]
    y_pred_tst = y_pred[len(y_trn) : ]

    n_frauds_trn      = Counter(y_pred_trn)[1]
    n_frauds_tst      = Counter(y_pred_tst)[1]
    tn, fp, fn, tp    = confusion_matrix(y_eta_trn, y_pred_trn).ravel()
    precision         = precision_score(y_eta_trn, y_pred_trn)
    recall            = recall_score(y_eta_trn, y_pred_trn)
    f1                = f1_score(y_eta_trn, y_pred_trn)

    print('%s %8i %8i' % (' '*56, tn, fp))
    print('%s %8i %8i %8i %8i %10.4f %10.4f %10.4f' % 
          (name, n_frauds_tst, n_frauds_trn, fn, tp, precision, recall, f1))

In [19]:
print(' '*59, 'Confusion-Matrix')
print('Scaler', ' '*55, 'TN       FP')
print(' '*39, 'Frd_tst  Frd_trn       FN       TP     Precision  Recall     F1-score')
print('='*110)    

X = X_amount
running_EE_scaler('Unscaled data                         ', X, y_trn)
running_EE_scaler('Standard scaling                      ', StandardScaler().fit_transform(X), y_trn)
running_EE_scaler('Min-max scaling                       ', MinMaxScaler().fit_transform(X), y_trn)
running_EE_scaler('Max-abs scaling                       ', MaxAbsScaler().fit_transform(X), y_trn)
running_EE_scaler('Robust scaling (without params)       ', RobustScaler().fit_transform(X), y_trn)
running_EE_scaler('Robust scaling (quantile=(25, 75))    ', RobustScaler(quantile_range=(25, 75)).fit_transform(X), y_trn)
running_EE_scaler('Power transformation (Yeo-Johnson)    ', PowerTransformer(method='yeo-johnson').fit_transform(X), y_trn)
# running_EE_scaler('Power transformation (Box-Cox)        ', PowerTransformer(method='box-cox').fit_transform(X), y_trn)
running_EE_scaler('Quantile transformation (gaussian pdf)', QuantileTransformer(output_distribution='normal').fit_transform(X), y_trn)
running_EE_scaler('Quantile transformation (uniform pdf) ', QuantileTransformer(output_distribution='uniform').fit_transform(X), y_trn)
# running_EE_scaler('Sample-wise L2 normalizing            ', Normalizer().fit_transform(X), y_trn)

print('-'*110)

                                                            Confusion-Matrix
Scaler                                                         TN       FP
                                        Frd_tst  Frd_trn       FN       TP     Precision  Recall     F1-score
                                                            95411       58
Unscaled data                                 0      180       71      122     0.6778     0.6321     0.6542
                                                            95411       58
Standard scaling                              0      180       71      122     0.6778     0.6321     0.6542
                                                            95411       58
Min-max scaling                               0      180       71      122     0.6778     0.6321     0.6542
                                                            95410       59
Max-abs scaling                               0      231       21      172     0.7446     0.8912     0.8113
      

### `Amount` and `ProviderId`

In [20]:
columns4drop = [
    'BatchId',
    'AccountId',
    'SubscriptionId',
    'CustomerId',
#     'ProviderId',
    'ProductId',
    'ProductCategory',
    'ChannelId',
#     'Amount',
    'Value',
    'TransactionStartTime',
    'PricingStrategy'
]
X = X_trn.drop(columns=columns4drop, axis=1)
print(X[:5])

algorithm = EllipticEnvelope(contamination=outliers_fraction)
y_pred = algorithm.fit(X).predict(X)
y_pred = [ 1 if x == -1 else 0 for x in y_pred ]
print('f1-score=%.5f' % f1_score(y_trn, y_pred))

   ProviderId   Amount
0           5   1000.0
1           3    -20.0
2           5    500.0
3           0  20000.0
4           3   -644.0
f1-score=0.64324


### `Amount` and `ProductId`

In [21]:
columns4drop = [
    'BatchId',
    'AccountId',
    'SubscriptionId',
    'CustomerId',
    'ProviderId',
#     'ProductId',
    'ProductCategory',
    'ChannelId',
#     'Amount',
    'Value',
    'TransactionStartTime',
    'PricingStrategy'
]
X = X_trn.drop(columns=columns4drop, axis=1)
print(X[:5])

algorithm = EllipticEnvelope(contamination=outliers_fraction)
y_pred = algorithm.fit(X).predict(X)
y_pred = [ 1 if x == -1 else 0 for x in y_pred ]
print('f1-score=%.5f' % f1_score(y_trn, y_pred))

   ProductId   Amount
0          9   1000.0
1          5    -20.0
2          0    500.0
3         20  20000.0
4          5   -644.0
f1-score=0.65775


### `Amount` and `ProductCategory`

In [22]:
columns4drop = [
    'BatchId',
    'AccountId',
    'SubscriptionId',
    'CustomerId',
    'ProviderId',
    'ProductId',
#     'ProductCategory',
    'ChannelId',
#     'Amount',
    'Value',
    'TransactionStartTime',
    'PricingStrategy'
]
X = X_trn.drop(columns=columns4drop, axis=1)
print(X[:5])

algorithm = EllipticEnvelope(contamination=outliers_fraction)
y_pred = algorithm.fit(X).predict(X)
y_pred = [ 1 if x == -1 else 0 for x in y_pred ]
print('f1-score=%.5f' % f1_score(y_trn, y_pred))

   ProductCategory   Amount
0                0   1000.0
1                2    -20.0
2                0    500.0
3                9  20000.0
4                2   -644.0
f1-score=0.78554


### `Amount` and `ChannelId`

In [23]:
columns4drop = [
    'BatchId',
    'AccountId',
    'SubscriptionId',
    'CustomerId',
    'ProviderId',
    'ProductId',
    'ProductCategory',
#     'ChannelId',
#     'Amount',
    'Value',
    'TransactionStartTime',
    'PricingStrategy'
]
X = X_trn.drop(columns=columns4drop, axis=1)
print(X[:5])

algorithm = EllipticEnvelope(contamination=outliers_fraction)
y_pred = algorithm.fit(X).predict(X)
y_pred = [ 1 if x == -1 else 0 for x in y_pred ]
print('f1-score=%.5f' % f1_score(y_trn, y_pred))

   ChannelId   Amount
0          2   1000.0
1          1    -20.0
2          2    500.0
3          2  20000.0
4          1   -644.0
f1-score=0.81132


### `Amount` and `PricingStrategy`

In [24]:
columns4drop = [
    'BatchId',
    'AccountId',
    'SubscriptionId',
    'CustomerId',
    'ProviderId',
    'ProductId',
    'ProductCategory',
    'ChannelId',
#     'Amount',
    'Value',
    'TransactionStartTime',
#     'PricingStrategy'
]
X = X_trn.drop(columns=columns4drop, axis=1)
print(X[:5])

algorithm = EllipticEnvelope(contamination=outliers_fraction)
y_pred = algorithm.fit(X).predict(X)
y_pred = [ 1 if x == -1 else 0 for x in y_pred ]
print('f1-score=%.5f' % f1_score(y_trn, y_pred))

    Amount  PricingStrategy
0   1000.0                2
1    -20.0                2
2    500.0                2
3  20000.0                2
4   -644.0                2
f1-score=0.69430


### `ProviderId`, `ProductId`, `ProductCategory`, `ChannelId`, `Amount`, `PricingStrategy`

In [25]:
columns4drop = [
    'BatchId',
    'AccountId',
    'SubscriptionId',
    'CustomerId',
#     'ProviderId',
#     'ProductId',
#     'ProductCategory',
#     'ChannelId',
#     'Amount',
    'Value',
    'TransactionStartTime',
#     'PricingStrategy'
]
X = X_trn.drop(columns=columns4drop, axis=1)
print(X[:5])

algorithm = EllipticEnvelope(contamination=outliers_fraction, support_fraction=1)
y_pred = algorithm.fit(X).predict(X)
y_pred = [ 1 if x == -1 else 0 for x in y_pred ]
print('f1-score=%.5f' % f1_score(y_trn, y_pred))

   ProviderId  ProductId  ProductCategory  ChannelId   Amount  PricingStrategy
0           5          9                0          2   1000.0                2
1           3          5                2          1    -20.0                2
2           5          0                0          2    500.0                2
3           0         20                9          2  20000.0                2
4           3          5                2          1   -644.0                2
f1-score=0.70951


## Grid Search

### `Amount`

In [27]:
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score, make_scorer

In [28]:
X1 = X_amount
y1 = y_trn.astype(np.int8)
y1[y_trn==1] = -1
y1[y_trn==0] = 1

In [29]:
skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=0)
folds = list(skf.split(X1, y1))

In [30]:
ee = EllipticEnvelope()
params = {
    'support_fraction': np.linspace(0.70, 0.99, num=30),
    'contamination'   : np.linspace(0.0015, 0.0025, num=101)
}
f1score = make_scorer(f1_score)
grdsrch = GridSearchCV(estimator=ee, param_grid=params, scoring=f1score, cv=folds, n_jobs=-1)
grdsrch.fit(X1, y1);

In [31]:
df_res = pd.DataFrame(grdsrch.cv_results_)
df_res[['mean_test_score', 'params']].sort_values(by=['mean_test_score'], ascending=False).head()

Unnamed: 0,mean_test_score,params
1468,0.999581,"{'contamination': 0.00198, 'support_fraction':..."
1498,0.999581,"{'contamination': 0.00199, 'support_fraction':..."
2779,0.999576,"{'contamination': 0.0024200000000000003, 'supp..."
2803,0.999576,"{'contamination': 0.00243, 'support_fraction':..."
2790,0.999576,"{'contamination': 0.00243, 'support_fraction':..."


In [32]:
grdsrch.best_estimator_

EllipticEnvelope(assume_centered=False, contamination=0.00198,
                 random_state=None, store_precision=True,
                 support_fraction=0.98)

### `ProviderId`, `ProductId`, `ProductCategory`, `ChannelId`, `Amount`, `PricingStrategy`

In [44]:
columns4drop = [
    'BatchId',
    'AccountId',
    'SubscriptionId',
    'CustomerId',
#     'ProviderId',
#     'ProductId',
#     'ProductCategory',
#     'ChannelId',
#     'Amount',
    'Value',
    'TransactionStartTime',
#     'PricingStrategy'
]
X = X_trn.drop(columns=columns4drop, axis=1)

In [34]:
X1 = X
y1 = y_trn.astype(np.int8)
y1[y_trn==1] = -1
y1[y_trn==0] = 1

In [35]:
skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=0)
folds = list(skf.split(X1, y1))

In [37]:
ee = EllipticEnvelope()
params = {
#     'support_fraction': np.linspace(0.70,   0.99,   num=30),
    'contamination'   : np.linspace(0.0015, 0.0025, num=101)
}
f1score = make_scorer(f1_score)
grdsrch = GridSearchCV(estimator=ee, param_grid=params, scoring=f1score, cv=folds, n_jobs=-1)
grdsrch.fit(X1, y1);











In [38]:
df_res = pd.DataFrame(grdsrch.cv_results_)
df_res[['mean_test_score', 'params']].sort_values(by=['mean_test_score'], ascending=False).head()

Unnamed: 0,mean_test_score,params
93,0.99957,{'contamination': 0.00243}
92,0.99957,{'contamination': 0.0024200000000000003}
88,0.999565,{'contamination': 0.00238}
84,0.99956,{'contamination': 0.00234}
86,0.99956,{'contamination': 0.00236}


In [39]:
grdsrch.best_estimator_

EllipticEnvelope(assume_centered=False, contamination=0.0024200000000000003,
                 random_state=None, store_precision=True,
                 support_fraction=None)

In [45]:
X_tst_drop = X_tst.drop(columns=columns4drop, axis=1)

In [46]:
y_pred = EllipticEnvelope(
    assume_centered=False,
    contamination=0.0024200000000000003,
    random_state=None,
    store_precision=True,
    support_fraction=None
).fit(X_tst_drop).predict(X_tst_drop)
y_pred = [ 1 if x == -1 else 0 for x in y_pred ]
Counter(y_pred)











Counter({0: 44909, 1: 110})

In [47]:
df_sbm = pd.read_csv('../data/sample_submission.csv')
df_sbm['FraudResult'] = y_pred
df_sbm.to_csv('../submitted/AlBo0722_EE_Amount_etc.csv', encoding='utf-8', index=False)

`Results: 0.617283950617284`