# Оптимизация методов бинарной классификации

- Fields: `Amount` и `Value`
- Robust Scaler

In [1]:
import numpy  as np
import pandas as pd

In [2]:
df_trn    = pd.read_csv('../data/training_le.csv')
df_tst    = pd.read_csv('../data/test_le.csv')
df_sbm    = pd.read_csv('../data/sample_submission.csv')

In [3]:
df_trn.head()

Unnamed: 0,BatchId,AccountId,SubscriptionId,CustomerId,ProviderId,ProductId,ProductCategory,ChannelId,Amount,Value,TransactionStartTime,PricingStrategy,FraudResult
0,36122,3956,886,4405,5,9,0,2,1000.0,1000,2018-11-15 02:18:49,2,0
1,15641,4840,3828,4405,3,5,2,1,-20.0,20,2018-11-15 02:19:08,2,0
2,53940,4228,221,4682,5,0,0,2,500.0,500,2018-11-15 02:44:21,2,0
3,102362,647,2184,987,0,20,9,2,20000.0,21800,2018-11-15 03:32:55,2,0
4,38779,4840,3828,987,3,5,2,1,-644.0,644,2018-11-15 03:34:21,2,0


In [4]:
list(df_trn.columns)

['BatchId',
 'AccountId',
 'SubscriptionId',
 'CustomerId',
 'ProviderId',
 'ProductId',
 'ProductCategory',
 'ChannelId',
 'Amount',
 'Value',
 'TransactionStartTime',
 'PricingStrategy',
 'FraudResult']

In [5]:
columns4drop = [
    'BatchId', 
    'AccountId', 
    'SubscriptionId', 
    'CustomerId', 
    'ProviderId',
    'ProductId',
    'ProductCategory',
    'ChannelId',
    'TransactionStartTime',
    'PricingStrategy']

In [6]:
df_trn = df_trn.drop(columns4drop, axis=1)
df_tst = df_tst.drop(columns4drop, axis=1)

**Scaling**

In [7]:
from sklearn.preprocessing import RobustScaler

In [8]:
def scaleColumns(data, cols_to_scale, scaler):
    for col in cols_to_scale:
        data[col] = pd.DataFrame(scaler.fit_transform(pd.DataFrame(data[col])))
    return data

In [9]:
df_trn = scaleColumns(df_trn, ['Amount', 'Value'], RobustScaler())
df_tst = scaleColumns(df_tst, ['Amount', 'Value'], RobustScaler())

In [10]:
df_trn.head()

Unnamed: 0,Amount,Value,FraudResult
0,0.0,0.0,0
1,-0.357895,-0.207407,0
2,-0.175439,-0.10582,0
3,6.666667,4.402116,0
4,-0.576842,-0.075344,0


In [11]:
X = df_trn.drop('FraudResult', axis=1)
y = df_trn['FraudResult']

In [12]:
from sklearn.model_selection import train_test_split

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=24, shuffle=y)

In [14]:
from collections import Counter

In [15]:
print('Train Label Distribution: {}'.format(Counter(y_train)))
print('Test  Label Distribution: {}'.format(Counter(y_test)))

Train Label Distribution: Counter({0: 76380, 1: 149})
Test  Label Distribution: Counter({0: 19089, 1: 44})


In [16]:
# Classifier Libraries
from sklearn.naive_bayes import GaussianNB

from sklearn.neighbors import KNeighborsClassifier

from sklearn.tree import DecisionTreeClassifier

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
# explicitly require this experimental feature
from sklearn.experimental import enable_hist_gradient_boosting  # noqa
# now you can import normally from ensemble
from sklearn.ensemble import HistGradientBoostingClassifier

from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

from sklearn.neural_network import MLPClassifier

In [17]:
classifiers = {
    'Naive Bayes                       ': GaussianNB(),
    'k-Nearest Neighbors               ': KNeighborsClassifier(3),
    'Decision Tree Classifier          ': DecisionTreeClassifier(max_depth=5),
    'Logisitic Regression              ': LogisticRegression(),
    'AdaBoost Classifier               ': AdaBoostClassifier(),
    'Bagging Classifier                ': BaggingClassifier(),
    'Extra-Trees Classifier            ': ExtraTreesClassifier(),
    'Gradient Boosting                 ': GradientBoostingClassifier(),
    'Random Forest                     ': RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    'Histogram-based GradBoostClassTree': HistGradientBoostingClassifier(),
    'Linear Discriminant Analysis      ': LinearDiscriminantAnalysis(),
    'Quadratic Discriminant Analysis   ': QuadraticDiscriminantAnalysis(),
    'Multilayer Perceptron             ': MLPClassifier(alpha=1, max_iter=1000)
}
#     'Voting Classifier              ': VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('gnb', clf3)], voting='soft'),

In [18]:
from time import time
import warnings
warnings.filterwarnings("ignore")

In [19]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, accuracy_score, classification_report

In [20]:
classifier =  KNeighborsClassifier(3)

In [26]:
classifier.fit(X_train.values, y_train.values)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=3, p=2,
                     weights='uniform')

In [27]:
cv_results = cross_validate(classifier, X_train.values, y_train.values, scoring=('precision', 'recall', 'f1'), cv=5, n_jobs=-1)

In [28]:
name = 'k-Nearest Neighbors'
print(name,
          round(cv_results['test_precision'].mean(), 5), ' \t',
          round(cv_results['test_recall'].mean(),    5), ' \t',
          round(cv_results['test_f1'].mean(),        5), ' \t',
          round(cv_results['fit_time'].sum(),        2))

k-Nearest Neighbors 0.92856  	 0.78414  	 0.84204  	 1.54


In [29]:
from hpsklearn import HyperoptEstimator, knn

In [30]:
estim = HyperoptEstimator(classifier=knn('myKNN'))

In [31]:
estim.fit(X_train.values, y_train.values)

100%|██████████| 1/1 [00:01<00:00,  1.18s/it, best loss: 0.0015680125441003279]
100%|██████████| 1/1 [00:00<00:00,  1.38it/s, best loss: 0.0007840062720502194]
100%|██████████| 1/1 [00:01<00:00,  1.98s/it, best loss: 0.000653338560041794]
100%|██████████| 1/1 [00:00<00:00,  1.42it/s, best loss: 0.000653338560041794]
100%|██████████| 1/1 [00:01<00:00,  1.20s/it, best loss: 0.000653338560041794]
100%|██████████| 1/1 [00:01<00:00,  1.21s/it, best loss: 0.000653338560041794]
100%|██████████| 1/1 [00:00<00:00,  1.41it/s, best loss: 0.000653338560041794]
100%|██████████| 1/1 [00:00<00:00,  1.58it/s, best loss: 0.000653338560041794]
100%|██████████| 1/1 [00:00<00:00,  1.23it/s, best loss: 0.000653338560041794]
100%|██████████| 1/1 [00:00<00:00,  1.42it/s, best loss: 0.000653338560041794]


In [36]:
predict = estim.predict(X_test.values)

In [37]:
f1_score(y_test.values, predict)

0.6582278481012659

In [38]:
from sklearn.model_selection import GridSearchCV

In [42]:
knn = 

In [63]:
#create a dictionary of all values we want to test for n_neighbors
knn_GS_params = {
    'n_neighbors': np.arange(1, 25),
    'weights'    : ['uniform', 'distance'],
    'metric'     : ['euclidean', 'manhattan', 'minkowski']
}

In [64]:
#use gridsearch to test all values for n_neighbors
knn_gs = GridSearchCV(KNeighborsClassifier(), knn_GS_params, verbose=1, cv=5, n_jobs=-1)

In [65]:
#fit model to data
knn_gs_results = knn_gs.fit(X_train.values, y_train.values)

Fitting 5 folds for each of 144 candidates, totalling 720 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=-1)]: Done  38 tasks      | elapsed:    5.7s
[Parallel(n_jobs=-1)]: Done 188 tasks      | elapsed:   23.7s
[Parallel(n_jobs=-1)]: Done 438 tasks      | elapsed:   59.4s
[Parallel(n_jobs=-1)]: Done 720 out of 720 | elapsed:  1.6min finished


In [66]:
knn_gs_results.best_score_

0.9995165231480877

In [67]:
knn_gs_results.best_estimator_

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='euclidean',
                     metric_params=None, n_jobs=None, n_neighbors=15, p=2,
                     weights='distance')

In [68]:
knn_gs_results.best_params_

{'metric': 'euclidean', 'n_neighbors': 15, 'weights': 'distance'}

In [69]:
knn_opt = knn_gs_results.best_estimator_

In [70]:
predict = knn_opt.predict(X_test)

In [71]:
f1_score(y_test, predict)

0.9268292682926829

In [72]:
knn_3 = KNeighborsClassifier(3).fit(X_train.values, y_train.values)

In [73]:
predict = knn_3.predict(X_test)

In [74]:
f1_score(y_test, predict)

0.9318181818181818

In [None]:
from hpsklearn import HyperoptEstimator, svc

In [None]:
estim = HyperoptEstimator(classifier=svc('mySVC'))

In [None]:
estim.fit(X_train, y_train)

In [None]:
# iterate over classifiers
print('Classifiers \t\t\t Precision \t Recall \t F1-score \t Fit-time')
for name, classifier in classifiers.items():
    classifier.fit(X_train, y_train)
    cv_results = cross_validate(classifier, X_train.values, y_train.values, scoring=('precision', 'recall', 'f1'), cv=5, n_jobs=-1)
    print(name,
          round(cv_results['test_precision'].mean(), 5), ' \t',
          round(cv_results['test_recall'].mean(),    5), ' \t',
          round(cv_results['test_f1'].mean(),        5), ' \t',
          round(cv_results['fit_time'].sum(),        2))

In [None]:
# iterate over classifiers
print('Classifiers \t\t\t Precision \t Recall \t F1-score \t Fit-time')
for name, classifier in classifiers.items():
    classifier.fit(X_train, y_train)
    cv_results = cross_validate(classifier, X_train, y_train,
                                scoring=('precision', 'recall', 'f1'), cv=5, n_jobs=-1)

    print(name,
          round(cv_results['test_precision'].mean(), 4), ' \t',
          round(cv_results['test_recall'].mean(),    4), ' \t',
          round(cv_results['test_f1'].mean(),        4), ' \t',
          round(cv_results['fit_time'].sum(),        2))

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix

$Precision = \frac{TP}{TP+FP}$

$Recall = \frac{TP}{TP+FN}$

$F1 = 2\cdot\frac{Precision\cdot{Recall}}{Precision+Recall}$

In [None]:
# iterate over classifiers
print('Classifiers \t\t\t     TN     FP    FN    TP   Precision    Recall     F1-score')
print('-' * 95)
for name, classifier in classifiers.items():
    classifier.fit(X_train, y_train)
    predict = classifier.predict(X_test)

    tn, fp, fn, tp = confusion_matrix(y_test, predict).ravel()
    precision      = precision_score(y_test, predict)
    recall         = recall_score(y_test, predict)
    f1             = f1_score(y_test, predict)

    print('%s %5i %5i %5i %5i %10.4f %10.4f %10.4f' % (name, tn, fp, fn, tp, precision, recall, f1))

**Submitting**

In [None]:
X.head()

In [None]:
df_tst.head()

In [None]:
classifiers = {
#     'Naive Bayes                       ': GaussianNB(),
    'k-Nearest Neighbors               ': KNeighborsClassifier(3),
    'Decision Tree Classifier          ': DecisionTreeClassifier(max_depth=5),
    'Logisitic Regression              ': LogisticRegression(),
    'AdaBoost Classifier               ': AdaBoostClassifier(),
    'Bagging Classifier                ': BaggingClassifier(),
    'Extra-Trees Classifier            ': ExtraTreesClassifier(),
    'Gradient Boosting                 ': GradientBoostingClassifier(),
    'Random Forest                     ': RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
#     'Histogram-based GradBoostClassTree': HistGradientBoostingClassifier(),
    'Linear Discriminant Analysis      ': LinearDiscriminantAnalysis(),
#     'Quadratic Discriminant Analysis   ': QuadraticDiscriminantAnalysis(),
    'Multilayer Perceptron             ': MLPClassifier(alpha=1, max_iter=1000)
}

In [None]:
for name, classifier in classifiers.items():
    classifier.fit(X, y)
    predict = classifier.predict(df_tst)

    print(name, ':', Counter(predict))
    df_sbm['FraudResult'] = predict
    df_sbm.to_csv('../submitted/AlBo0713_' + name.rstrip().replace(' ', '_') + '.csv', encoding='utf-8', index=False)

Results on ZINDI
- 'k-Nearest Neighbors               ': 0.549019607843137
- 'Decision Tree Classifier          ': 0.583333333333333
- 'Logisitic Regression              ': 0.528301886792453
- 'AdaBoost Classifier               ': 0.6
- 'Bagging Classifier                ': 0.688524590163934
- 'Extra-Trees Classifier            ': 0.592592592592593
- 'Gradient Boosting                 ': 0.4
- 'Random Forest                     ': 0.694444444444444 === 1-st place ===
- 'Linear Discriminant Analysis      ': 0.555555555555556
- 'Multilayer Perceptron             ': 0.512820512820513

In [None]:
classifiers_sc = {
    'Support Vector Classifier, Linear': SVC(kernel="linear", C=0.025),
    'Support Vector Classifier, RBF   ': SVC(kernel="rbf", gamma=2, C=1),
}

In [None]:
# iterate over classifiers_sc
print('Classifiers \t\t Cross_val_score \t Time')
for name, classifier in classifiers_sc.items():
    tac = time()
    classifier.fit(X_sc_train, y_sc_train)
    training_score = cross_val_score(classifier, X_sc_train, y_sc_train, cv=5)
    tic = time()
    print(name, round(training_score.mean(), 5), ' \t', round(tic-tac, 2))

In [None]:
from sklearn.metrics import make_scorer

In [None]:
def tn(y_true, y_pred): return confusion_matrix(y_true, y_pred)[0, 0]
def fp(y_true, y_pred): return confusion_matrix(y_true, y_pred)[0, 1]
def fn(y_true, y_pred): return confusion_matrix(y_true, y_pred)[1, 0]
def tp(y_true, y_pred): return confusion_matrix(y_true, y_pred)[1, 1]
scoring = {'tp': make_scorer(tp),
           'tn': make_scorer(tn),
           'fp': make_scorer(fp),
           'fn': make_scorer(fn)}

In [None]:
cv_results = cross_validate(svm.fit(X, y), X, y, scoring=scoring, cv=5)
# Getting the test set true positive scores
print(cv_results['test_tp'])  

# Getting the test set false negative scores
print(cv_results['test_fn'])  

In [None]:
# We will undersample during cross validating
undersample_X = df.drop('Class', axis=1)
undersample_y = df['Class']

for train_index, test_index in sss.split(undersample_X, undersample_y):
    print("Train:", train_index, "Test:", test_index)
    undersample_Xtrain, undersample_Xtest = undersample_X.iloc[train_index], undersample_X.iloc[test_index]
    undersample_ytrain, undersample_ytest = undersample_y.iloc[train_index], undersample_y.iloc[test_index]
    
undersample_Xtrain = undersample_Xtrain.values
undersample_Xtest = undersample_Xtest.values
undersample_ytrain = undersample_ytrain.values
undersample_ytest = undersample_ytest.values 

undersample_accuracy = []
undersample_precision = []
undersample_recall = []
undersample_f1 = []
undersample_auc = []

# Implementing NearMiss Technique 
# Distribution of NearMiss (Just to see how it distributes the labels we won't use these variables)
X_nearmiss, y_nearmiss = NearMiss().fit_sample(undersample_X.values, undersample_y.values)
print('NearMiss Label Distribution: {}'.format(Counter(y_nearmiss)))
# Cross Validating the right way

for train, test in sss.split(undersample_Xtrain, undersample_ytrain):
    undersample_pipeline = imbalanced_make_pipeline(NearMiss(sampling_strategy='majority'), log_reg) # SMOTE happens during Cross Validation not before..
    undersample_model = undersample_pipeline.fit(undersample_Xtrain[train], undersample_ytrain[train])
    undersample_prediction = undersample_model.predict(undersample_Xtrain[test])
    
    undersample_accuracy.append(undersample_pipeline.score(original_Xtrain[test], original_ytrain[test]))
    undersample_precision.append(precision_score(original_ytrain[test], undersample_prediction))
    undersample_recall.append(recall_score(original_ytrain[test], undersample_prediction))
    undersample_f1.append(f1_score(original_ytrain[test], undersample_prediction))
    undersample_auc.append(roc_auc_score(original_ytrain[test], undersample_prediction))