# Тьюнинг EllipticEnvelope

## Field `Value`

In [1]:
import numpy  as np
import pandas as pd

In [2]:
df_trn = pd.read_csv('../data/training_le.csv')
df_tst = pd.read_csv('../data/test_le.csv')

In [3]:
X_trn = df_trn.drop(columns=['FraudResult'], axis=1)
y_trn = df_trn['FraudResult']

X_tst = df_tst

**Frauds**

In [4]:
from collections import Counter
print('Train Label Distribution: {}'.format(Counter(y_trn)))

Train Label Distribution: Counter({0: 95469, 1: 193})


In [5]:
num_transactions = df_trn.shape[0]
num_otliers = df_trn['FraudResult'].value_counts()[1]
num_inliers = df_trn['FraudResult'].value_counts()[0]
outliers_fraction = num_otliers / num_transactions
print('Train outliers fraction:', round(outliers_fraction, 3))

Train outliers fraction: 0.002


In [6]:
print('Number of outliers on union dataset =', round((df_trn.shape[0]+df_tst.shape[0])*outliers_fraction, 0))

Number of outliers on union dataset = 284.0


In [8]:
df_trn.head()

Unnamed: 0,BatchId,AccountId,SubscriptionId,CustomerId,ProviderId,ProductId,ProductCategory,ChannelId,Amount,Value,TransactionStartTime,PricingStrategy,FraudResult
0,36122,3956,886,4405,5,9,0,2,1000.0,1000,2018-11-15 02:18:49,2,0
1,15641,4840,3828,4405,3,5,2,1,-20.0,20,2018-11-15 02:19:08,2,0
2,53940,4228,221,4682,5,0,0,2,500.0,500,2018-11-15 02:44:21,2,0
3,102362,647,2184,987,0,20,9,2,20000.0,21800,2018-11-15 03:32:55,2,0
4,38779,4840,3828,987,3,5,2,1,-644.0,644,2018-11-15 03:34:21,2,0


**Union dataframes**

In [7]:
columns4drop = [
    'BatchId',
    'SubscriptionId',
    'CustomerId',
    'TransactionStartTime'
]

In [9]:
df_trn_cut = df_trn.drop(columns=columns4drop, axis=1)
df_tst_cut = df_tst.drop(columns=columns4drop, axis=1)

df_trn_cut = df_trn_cut.drop(columns='FraudResult', axis=1)

print(df_trn_cut.shape, df_tst_cut.shape)

(95662, 8) (45019, 8)


In [10]:
frames = [df_trn_cut, df_tst_cut]
df_union = pd.concat(frames)
df_union.shape

(140681, 8)

In [11]:
df_union.head()

Unnamed: 0,AccountId,ProviderId,ProductId,ProductCategory,ChannelId,Amount,Value,PricingStrategy
0,3956,5,9,0,2,1000.0,1000,2
1,4840,3,5,2,1,-20.0,20,2
2,4228,5,0,0,2,500.0,500,2
3,647,0,20,9,2,20000.0,21800,2
4,4840,3,5,2,1,-644.0,644,2


## Only `Value`

In [12]:
columns4drop = [
    'AccountId',
    'ProviderId',
    'ProductId',
    'ProductCategory',
    'ChannelId',
    'Amount',
    'PricingStrategy',
]

In [13]:
df_union_value = df_union.drop(columns=columns4drop, axis=1)

In [14]:
df_union_value.head()

Unnamed: 0,Value
0,1000
1,20
2,500
3,21800
4,644


## Running

In [15]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

In [16]:
from sklearn.covariance import EllipticEnvelope

In [17]:
def running_EE(X, y_eta_trn):
    print(' '*37, 'Confusion-Matrix')
    print('Algorithm', ' '*30, 'TN       FP')
    print(' '*17, 'Frd_tst  Frd_trn       FN       TP     Precision  Recall     F1-score')
    print('='*100)    

    algorithm = EllipticEnvelope(contamination=outliers_fraction)
    y_pred = algorithm.fit(X).predict(X)

    y_pred = [ 1 if x == -1 else 0 for x in y_pred ]
    y_pred_trn = y_pred[ : len(y_trn)]
    y_pred_tst = y_pred[len(y_trn) : ]

    n_frauds_trn      = Counter(y_pred_trn)[1]
    n_frauds_tst      = Counter(y_pred_tst)[1]
    tn, fp, fn, tp    = confusion_matrix(y_eta_trn, y_pred_trn).ravel()
    precision         = precision_score(y_eta_trn, y_pred_trn)
    recall            = recall_score(y_eta_trn, y_pred_trn)
    f1                = f1_score(y_eta_trn, y_pred_trn)

    print('%s %8i %8i' % (' '*34, tn, fp))
    print('%s %8i %8i %8i %8i %10.4f %10.4f %10.4f' % 
          ('EE    '+' '*10, n_frauds_tst, n_frauds_trn, fn, tp, precision, recall, f1))
        
    print('-'*100)
    return

In [20]:
running_EE(X_trn, y_trn)
running_EE(df_union_value, y_trn)

                                      Confusion-Matrix
Algorithm                                TN       FP
                  Frd_tst  Frd_trn       FN       TP     Precision  Recall     F1-score


ValueError: could not convert string to float: '2018-11-15 02:18:49'

## Choosing Scaler

In [None]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import minmax_scale
from sklearn.preprocessing import MaxAbsScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import QuantileTransformer
from sklearn.preprocessing import PowerTransformer

In [None]:
def running_EE_scaler(name, X, y_eta_trn):
    algorithm = EllipticEnvelope(contamination=outliers_fraction, support_fraction=1.0)
    y_pred = algorithm.fit(X).predict(X)

    y_pred = [ 1 if x == -1 else 0 for x in y_pred ]
    y_pred_trn = y_pred[ : len(y_trn)]
    y_pred_tst = y_pred[len(y_trn) : ]

    n_frauds_trn      = Counter(y_pred_trn)[1]
    n_frauds_tst      = Counter(y_pred_tst)[1]
    tn, fp, fn, tp    = confusion_matrix(y_eta_trn, y_pred_trn).ravel()
    precision         = precision_score(y_eta_trn, y_pred_trn)
    recall            = recall_score(y_eta_trn, y_pred_trn)
    f1                = f1_score(y_eta_trn, y_pred_trn)

    print('%s %8i %8i' % (' '*56, tn, fp))
    print('%s %8i %8i %8i %8i %10.4f %10.4f %10.4f' % 
          (name, n_frauds_tst, n_frauds_trn, fn, tp, precision, recall, f1))
        
    return

In [None]:
print(' '*59, 'Confusion-Matrix')
print('Scaler', ' '*55, 'TN       FP')
print(' '*39, 'Frd_tst  Frd_trn       FN       TP     Precision  Recall     F1-score')
print('='*110)    

X = df_value
running_EE_scaler('Unscaled data                         ', X, y_trn)
running_EE_scaler('Standard scaling                      ', StandardScaler().fit_transform(X), y_trn)
running_EE_scaler('Min-max scaling                       ', MinMaxScaler().fit_transform(X), y_trn)
running_EE_scaler('Max-abs scaling                       ', MaxAbsScaler().fit_transform(X), y_trn)
running_EE_scaler('Robust scaling (without params)       ', RobustScaler().fit_transform(X), y_trn)
running_EE_scaler('Robust scaling (quantile=(25, 75))    ', RobustScaler(quantile_range=(25, 75)).fit_transform(X), y_trn)
running_EE_scaler('Power transformation (Yeo-Johnson)    ', PowerTransformer(method='yeo-johnson').fit_transform(X), y_trn)
running_EE_scaler('Power transformation (Box-Cox)        ', PowerTransformer(method='box-cox').fit_transform(X), y_trn)
running_EE_scaler('Quantile transformation (gaussian pdf)', QuantileTransformer(output_distribution='normal').fit_transform(X), y_trn)
running_EE_scaler('Quantile transformation (uniform pdf) ', QuantileTransformer(output_distribution='uniform').fit_transform(X), y_trn)
# running_EE_scaler('Sample-wise L2 normalizing            ', Normalizer().fit_transform(X), y_trn)

print('-'*110)

## Max-abs Scaling

In [None]:
def scaleColumns(data, cols_to_scale, scaler):
    for col in cols_to_scale:
        data[col] = pd.DataFrame(scaler.fit_transform(pd.DataFrame(data[col])))
    return data

In [None]:
df_value_max = scaleColumns(df_value, ['Value'], MaxAbsScaler())

In [None]:
df_value_max.head()

In [None]:
df_value_max.shape

## Grid Search CV

In [None]:
X1 = df_value_max[ : df_trn.shape[0]]
y1 = y_trn.astype(np.int8)
y1[y_trn==1] = -1
y1[y_trn==0] = 1

In [None]:
from sklearn.model_selection import StratifiedKFold

In [None]:
skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=0)
folds = list(skf.split(X1, y1))

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score, make_scorer

In [None]:
ee = EllipticEnvelope()
params = {
    'support_fraction': np.linspace(0.70, 0.99, num=30),
    'contamination'   : np.linspace(0.0015, 0.0025, num=101)
}
f1score = make_scorer(f1_score)
grdsrch = GridSearchCV(estimator=ee, param_grid=params, scoring=f1score, cv=folds, n_jobs=-1)
grdsrch.fit(X1, y1);

In [None]:
df_res = pd.DataFrame(grdsrch.cv_results_)
df_res[['mean_test_score', 'params']].sort_values(by=['mean_test_score'], ascending=False).head()

In [None]:
grdsrch.best_estimator_

**Make predictions with our optimized envelope fit**

In [None]:
ee_opt = grdsrch.best_estimator_
X_tst = df_value[df_trn.shape[0] : ]
y_pred = ee_opt.predict(X_tst)
y_pred = [ 1 if x == -1 else 0 for x in y_pred ]
Counter(y_pred)

**Submission**

In [None]:
df_sbm = pd.read_csv('../data/sample_submission.csv')
df_sbm['FraudResult'] = y_pred
df_sbm.to_csv('../submitted/AlBo0720_EE_Value.csv', encoding='utf-8', index=False)

**Result:** `0.0`

### Dummy submitted

In [None]:
df_tst = pd.read_csv('../data/test_le.csv')

In [None]:
columns4drop = [
    'BatchId',
    'SubscriptionId',
    'CustomerId',
    'AccountId',
    'ProviderId',
    'ProductId',
    'ProductCategory',
    'ChannelId',
    'Amount',
    'TransactionStartTime',
    'PricingStrategy',
]

In [None]:
df_tst_cut = df_tst.drop(columns=columns4drop, axis=1)

In [None]:
df_tst_cut.head()

In [None]:
X_tst = df_tst_cut
y_pred = EllipticEnvelope(contamination=outliers_fraction).fit(X_tst).predict(X_tst)
y_pred = [ 1 if x == -1 else 0 for x in y_pred ]
Counter(y_pred)

In [None]:
df_sbm = pd.read_csv('../data/sample_submission.csv')
df_sbm['FraudResult'] = y_pred
df_sbm.to_csv('../submitted/AlBo0720_EE_Value.csv', encoding='utf-8', index=False)

**Result:** `0.581818181818182`

max-abs scaler

In [None]:
df_tst_cut_max = scaleColumns(df_tst_cut, ['Value'], MaxAbsScaler())

In [None]:
df_tst_cut_max.head()

In [None]:
X_tst = df_tst_cut_max
y_pred = EllipticEnvelope(contamination=0.00201, support_fraction=0.79).fit(X_tst).predict(X_tst)
y_pred = [ 1 if x == -1 else 0 for x in y_pred ]
Counter(y_pred)

In [None]:
df_sbm = pd.read_csv('../data/sample_submission.csv')
df_sbm['FraudResult'] = y_pred
df_sbm.to_csv('../submitted/AlBo0720_EE_Value.csv', encoding='utf-8', index=False)

**Result:** `0.684931506849315`