# Тьюнинг EllipticEnvelope

In [1]:
import numpy  as np
import pandas as pd

In [2]:
df_trn = pd.read_csv('../data/training_le.csv')
df_tst = pd.read_csv('../data/test_le.csv')

In [3]:
y_trn = df_trn['FraudResult']

**Frauds**

In [4]:
from collections import Counter
print('Train Label Distribution: {}'.format(Counter(y_trn)))

Train Label Distribution: Counter({0: 95469, 1: 193})


In [5]:
num_transactions = df_trn.shape[0]
num_otliers = df_trn['FraudResult'].value_counts()[1]
num_inliers = df_trn['FraudResult'].value_counts()[0]
outliers_fraction = num_otliers / num_transactions
print('Train outliers fraction:', round(outliers_fraction, 3))

Train outliers fraction: 0.002


In [7]:
print('Number of outliers on union dataset =', round((df_trn.shape[0]+df_tst.shape[0])*outliers_fraction, 0))

Number of outliers on union dataset = 284.0


**Union dataframes**

In [8]:
df_trn = df_trn.drop(columns='FraudResult', axis=1)

In [9]:
columns4drop = [
    'BatchId',
    'SubscriptionId',
    'CustomerId',
    'TransactionStartTime'
]

In [10]:
df_trn = df_trn.drop(columns=columns4drop, axis=1)
df_tst = df_tst.drop(columns=columns4drop, axis=1)
print(df_trn.shape, df_tst.shape)

(95662, 8) (45019, 8)


In [11]:
frames = [df_trn, df_tst]
df_union = pd.concat(frames)
df_union.shape

(140681, 8)

In [12]:
df_union.head()

Unnamed: 0,AccountId,ProviderId,ProductId,ProductCategory,ChannelId,Amount,Value,PricingStrategy
0,3956,5,9,0,2,1000.0,1000,2
1,4840,3,5,2,1,-20.0,20,2
2,4228,5,0,0,2,500.0,500,2
3,647,0,20,9,2,20000.0,21800,2
4,4840,3,5,2,1,-644.0,644,2


In [13]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

In [24]:
def running_EE(X, y_eta_trn):
    print(' '*27, 'Confusion-Matrix')
    print('Algorithm', ' '*20, 'TN       FP')
    print(' '*7, 'Frd_tst  Frd_trn       FN       TP     Precision  Recall     F1-score')
    print('='*100)

    y_pred = algorithm.fit(X).predict(X)
            
    y_pred = [ 1 if x == -1 else 0 for x in y_pred ]
    y_pred_trn = y_pred[ : len(y_trn)]
    y_pred_tst = y_pred[len(y_trn) : ]
        
    n_frauds_trn      = Counter(y_pred_trn)[1]
    n_frauds_tst      = Counter(y_pred_tst)[1]
    tn, fp, fn, tp    = confusion_matrix(y_eta_trn, y_pred_trn).ravel()
    precision         = precision_score(y_eta_trn, y_pred_trn)
    recall            = recall_score(y_eta_trn, y_pred_trn)
    f1                = f1_score(y_eta_trn, y_pred_trn)
        
    print('%s %8i %8i' % (' '*24, tn, fp))
    print('%s %8i %8i %8i %8i %10.4f %10.4f %10.4f' % 
          ('EE    ', n_frauds_tst, n_frauds_trn, fn, tp, precision, recall, f1))
        
    print('-'*100)
    return

In [16]:
from sklearn.covariance import EllipticEnvelope

In [17]:
algorithm = EllipticEnvelope(contamination=outliers_fraction*1.15)

### Only `Amount`

In [18]:
columns4drop = [
    'AccountId',
    'ProviderId',
    'ProductId',
    'ProductCategory',
    'ChannelId',
    'Value',
    'PricingStrategy',
]

In [19]:
df = df_union.drop(columns=columns4drop, axis=1)

In [20]:
df.head()

Unnamed: 0,Amount
0,1000.0
1,-20.0
2,500.0
3,20000.0
4,-644.0


In [25]:
running_EE(df, y_trn)

                            Confusion-Matrix
Algorithm                      TN       FP
        Frd_tst  Frd_trn       FN       TP     Precision  Recall     F1-score
                            95408       61
EE           94      233       21      172     0.7382     0.8912     0.8075
----------------------------------------------------------------------------------------------------


In [None]:
# support_fraction=1.,contamination=0.261

**Grid Search CV**

In [29]:
X1 = df[ : df_trn.shape[0]]
y1 = y_trn.astype(np.int8)
y1[y_trn==1] = -1
y1[y_trn==0] = 1

In [26]:
from sklearn.model_selection import StratifiedKFold

In [30]:
skf = StratifiedKFold(n_splits=3)
folds = list(skf.split(X1, y1))

In [31]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score, make_scorer

In [72]:
ee = EllipticEnvelope()
params = {
    'support_fraction': np.linspace(0.50, 0.99, num=50),
    'contamination'   : np.linspace(0.0015, 0.0025, num=101)
}
f1score = make_scorer(f1_score)
grdsrch = GridSearchCV(estimator=ee, param_grid=params, scoring=f1score, cv=folds, n_jobs=-1)
grdsrch.fit(X1, y1);

In [73]:
df_res = pd.DataFrame(grdsrch.cv_results_)
df_res[['mean_test_score', 'params']].sort_values(by=['mean_test_score'], ascending=False).head()

Unnamed: 0,mean_test_score,params
3773,0.999581,"{'contamination': 0.0022500000000000003, 'supp..."
3732,0.999581,"{'contamination': 0.0022400000000000002, 'supp..."
3682,0.999581,"{'contamination': 0.00223, 'support_fraction':..."
3683,0.999581,"{'contamination': 0.00223, 'support_fraction':..."
3583,0.999581,"{'contamination': 0.00221, 'support_fraction':..."


In [74]:
grdsrch.best_estimator_

EllipticEnvelope(assume_centered=False, contamination=0.0022, random_state=None,
                 store_precision=True, support_fraction=0.61)

**Make predictions with our optimized envelope fit**

In [75]:
ee_opt = grdsrch.best_estimator_
X_tst = df[df_trn.shape[0] : ]
y_pred = ee_opt.predict(X_tst)
y_pred = [ 1 if x == -1 else 0 for x in y_pred ]
Counter(y_pred)

Counter({0: 44925, 1: 94})

**Submission**

In [76]:
df_sbm = pd.read_csv('../data/sample_submission.csv')
df_sbm['FraudResult'] = y_pred
df_sbm.to_csv('../submitted/AlBo0719_EE_Amount.csv', encoding='utf-8', index=False)

**Result:** `0.684931506849315`

## Scaling

In [78]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import minmax_scale
from sklearn.preprocessing import MaxAbsScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import QuantileTransformer
from sklearn.preprocessing import PowerTransformer

In [79]:
def scaleColumns(data, cols_to_scale, scaler):
    for col in cols_to_scale:
        data[col] = pd.DataFrame(scaler.fit_transform(pd.DataFrame(data[col])))
    return data

### Standard Scaler

In [82]:
df_ss = scaleColumns(df, ['Amount'], StandardScaler())

In [83]:
df_ss.head()

Unnamed: 0,Amount
0,-0.043799
1,-0.052644
2,-0.048135
3,0.12097
4,-0.058056


**Grid Search CV**

In [84]:
X1 = df_ss[ : df_trn.shape[0]]
y1 = y_trn.astype(np.int8)
y1[y_trn==1] = -1
y1[y_trn==0] = 1

In [85]:
skf = StratifiedKFold(n_splits=3)
folds = list(skf.split(X1, y1))

In [86]:
ee = EllipticEnvelope()
params = {
    'support_fraction': np.linspace(0.50, 0.99, num=50),
    'contamination'   : np.linspace(0.0015, 0.0025, num=101)
}
f1score = make_scorer(f1_score)
grdsrch = GridSearchCV(estimator=ee, param_grid=params, scoring=f1score, cv=folds, n_jobs=-1)
grdsrch.fit(X1, y1);

In [87]:
df_res = pd.DataFrame(grdsrch.cv_results_)
df_res[['mean_test_score', 'params']].sort_values(by=['mean_test_score'], ascending=False).head()

Unnamed: 0,mean_test_score,params
3780,0.999581,"{'contamination': 0.0022500000000000003, 'supp..."
3768,0.999581,"{'contamination': 0.0022500000000000003, 'supp..."
3608,0.999581,"{'contamination': 0.00222, 'support_fraction':..."
3783,0.999581,"{'contamination': 0.0022500000000000003, 'supp..."
3782,0.999581,"{'contamination': 0.0022500000000000003, 'supp..."


In [88]:
grdsrch.best_estimator_

EllipticEnvelope(assume_centered=False, contamination=0.00222,
                 random_state=None, store_precision=True,
                 support_fraction=0.58)

**Make predictions with our optimized envelope fit**

In [89]:
ee_opt = grdsrch.best_estimator_
X_tst = df[df_trn.shape[0] : ]
y_pred = ee_opt.predict(X_tst)
y_pred = [ 1 if x == -1 else 0 for x in y_pred ]
Counter(y_pred)

Counter({0: 44950, 1: 69})

# Field `Value`

In [100]:
columns4drop = [
    'AccountId',
    'ProviderId',
    'ProductId',
    'ProductCategory',
    'ChannelId',
    'Amount',
    'PricingStrategy',
]

**Ищем наилучший Scaler**

In [178]:
X = df_union.drop(columns=columns4drop, axis=1)

In [179]:
distributions = [
    ('Unscaled data                         ', X),
    ('Standard scaling                      ', StandardScaler().fit_transform(X)),
    ('Min-max scaling                       ', MinMaxScaler().fit_transform(X)),
    ('Max-abs scaling                       ', MaxAbsScaler().fit_transform(X)),
    ('Robust scaling                        ', RobustScaler(quantile_range=(25, 75)).fit_transform(X)),
    ('Power transformation (Yeo-Johnson)    ', PowerTransformer(method='yeo-johnson').fit_transform(X)),
    ('Power transformation (Box-Cox)        ', PowerTransformer(method='box-cox').fit_transform(X)),
    ('Quantile transformation (gaussian pdf)', QuantileTransformer(output_distribution='normal').fit_transform(X)),
    ('Quantile transformation (uniform pdf) ', QuantileTransformer(output_distribution='uniform').fit_transform(X)),
#     ('Sample-wise L2 normalizing            ', Normalizer().fit_transform(X)),
]

In [180]:
def running_EE_scaler(name, X, y_eta_trn):
    algorithm = EllipticEnvelope(contamination=0.0022, support_fraction=1.0)
    y_pred = algorithm.fit(X).predict(X)
            
    y_pred = [ 1 if x == -1 else 0 for x in y_pred ]
    y_pred_trn = y_pred[ : len(y_trn)]
    y_pred_tst = y_pred[len(y_trn) : ]
        
    n_frauds_trn      = Counter(y_pred_trn)[1]
    n_frauds_tst      = Counter(y_pred_tst)[1]
    tn, fp, fn, tp    = confusion_matrix(y_eta_trn, y_pred_trn).ravel()
    precision         = precision_score(y_eta_trn, y_pred_trn)
    recall            = recall_score(y_eta_trn, y_pred_trn)
    f1                = f1_score(y_eta_trn, y_pred_trn)
        
    print('%s %8i %8i' % (' '*56, tn, fp))
    print('%s %8i %8i %8i %8i %10.4f %10.4f %10.4f' % 
          (name, n_frauds_tst, n_frauds_trn, fn, tp, precision, recall, f1))
    
    return

In [181]:
print(' '*58, 'Confusion-Matrix')
print('Scaler', ' '*55, 'TN       FP')
print(' '*39, 'Frd_tst  Frd_trn       FN       TP     Precision  Recall     F1-score')
print('='*110)

for item_idx in range(len(distributions)):
    X = df_union.drop(columns=columns4drop, axis=1)
    title, X = distributions[item_idx]
    running_EE_scaler(title, X, y_trn)

print('-'*110)

                                                           Confusion-Matrix
Scaler                                                         TN       FP
                                        Frd_tst  Frd_trn       FN       TP     Precision  Recall     F1-score
                                                            95411       58
Unscaled data                                55      181       70      123     0.6796     0.6373     0.6578
                                                            95411       58
Standard scaling                             55      181       70      123     0.6796     0.6373     0.6578
                                                            95411       58
Min-max scaling                              55      181       70      123     0.6796     0.6373     0.6578
                                                            95410       59
Max-abs scaling                              94      231       21      172     0.7446     0.8912     0.8113
       

- **Max-abs Scaling**
- **Grid Search CV**