# Выбор метода бинарной классификации на всех полях,
# кроме `TransactionStartTime`

- drop `TransactionStartTime`
- `Label Encoding` для всех полей, кроме `Amount`, `Value`
- `RobustScaler` для `Amount`, `Value`; `MinMaxScaler` для остальных полей (в масштабированных копиях `df_trn_sc` / `df_tst_sc`)

In [1]:
import numpy  as np
import pandas as pd

In [2]:
# Load the training table, the test table and the submission template.
df_trn, df_tst, df_sbm = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/training_le.csv',
        '../data/test_le.csv',
        '../data/sample_submission.csv',
    )
)

# Independent second copies of train/test; later cells scale these.
df_trn_sc, df_tst_sc = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/training_le.csv', '../data/test_le.csv')
)

In [3]:
# Remove the timestamp column from all four frames (raw and to-be-scaled).
df_trn, df_tst = (
    frame.drop(columns='TransactionStartTime') for frame in (df_trn, df_tst)
)
df_trn_sc, df_tst_sc = (
    frame.drop(columns='TransactionStartTime') for frame in (df_trn_sc, df_tst_sc)
)

**Scaling**

In [4]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import minmax_scale
from sklearn.preprocessing import MaxAbsScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import QuantileTransformer
from sklearn.preprocessing import PowerTransformer

In [5]:
# Columns that the notebook casts to float and min-max scales.
columns4MinMaxScaler = (
    'BatchId AccountId SubscriptionId CustomerId ProviderId '
    'ProductId ProductCategory ChannelId PricingStrategy'
).split()

In [6]:
# int --> float: cast every column in columns4MinMaxScaler, one astype call
# per frame instead of a per-column loop.
df_trn_sc = df_trn_sc.astype({col: float for col in columns4MinMaxScaler})
df_tst_sc = df_tst_sc.astype({col: float for col in columns4MinMaxScaler})

In [7]:
def scaleColumns(data, cols_to_scale, scaler):
    """Scale the given columns of ``data`` in place, one column at a time.

    Args:
        data: DataFrame whose listed columns are overwritten with scaled values.
        cols_to_scale: Iterable of column labels present in ``data``.
        scaler: Object exposing ``fit_transform`` (e.g. a scikit-learn scaler).
            It is re-fitted on every column separately.

    Returns:
        The same ``data`` object that was passed in.
    """
    for col in cols_to_scale:
        # fit_transform expects 2-D input, hence the one-column frame.
        scaled = scaler.fit_transform(data[[col]])
        # Assign a flat array so the write is positional. Wrapping the result
        # in a new DataFrame would align on a fresh 0..n-1 index and fill the
        # column with NaN whenever ``data`` carries any other index.
        data[col] = np.asarray(scaled).reshape(-1)
    return data

In [8]:
# Fit each MinMaxScaler on the training column only and reuse it for the
# matching test column, so the same raw value maps to the same scaled value
# in both frames. Re-fitting on the test frame would give it its own min/max.
for col in columns4MinMaxScaler:
    minmax = MinMaxScaler()
    df_trn_sc[col] = minmax.fit_transform(df_trn_sc[[col]]).ravel()
    df_tst_sc[col] = minmax.transform(df_tst_sc[[col]]).ravel()

In [9]:
# Fit each RobustScaler on the training column only and apply the fitted
# statistics to the test column, so both frames share one scale.
for col in ['Amount', 'Value']:
    robust = RobustScaler()
    df_trn_sc[col] = robust.fit_transform(df_trn_sc[[col]]).ravel()
    df_tst_sc[col] = robust.transform(df_tst_sc[[col]]).ravel()

In [10]:
# Preview the first rows of the unscaled training frame.
df_trn.head()

Unnamed: 0,BatchId,AccountId,SubscriptionId,CustomerId,ProviderId,ProductId,ProductCategory,ChannelId,Amount,Value,PricingStrategy,FraudResult
0,36122,3956,886,4405,5,9,0,2,1000.0,1000,2,0
1,15641,4840,3828,4405,3,5,2,1,-20.0,20,2,0
2,53940,4228,221,4682,5,0,0,2,500.0,500,2,0
3,102362,647,2184,987,0,20,9,2,20000.0,21800,2,0
4,38779,4840,3828,987,3,5,2,1,-644.0,644,2,0


In [11]:
# Preview the first rows of the scaled training frame.
df_trn_sc.head()

Unnamed: 0,BatchId,AccountId,SubscriptionId,CustomerId,ProviderId,ProductId,ProductCategory,ChannelId,Amount,Value,PricingStrategy,FraudResult
0,0.258949,0.817355,0.183078,0.589061,1.0,0.346154,0.0,0.5,0.0,0.0,0.666667,0
1,0.112122,1.0,0.791684,0.589061,0.6,0.192308,0.222222,0.25,-0.357895,-0.207407,0.666667,0
2,0.386684,0.873554,0.045511,0.626103,1.0,0.0,0.0,0.5,-0.175439,-0.10582,0.666667,0
3,0.733818,0.133678,0.451593,0.131987,0.0,0.769231,1.0,0.5,6.666667,4.402116,0.666667,0
4,0.277996,1.0,0.791684,0.131987,0.6,0.192308,0.222222,0.25,-0.576842,-0.075344,0.666667,0


In [12]:
# Separate the feature matrix from the FraudResult target, for the raw frame
# and for the scaled frame.
X, y = df_trn.drop(columns='FraudResult'), df_trn['FraudResult']
X_sc, y_sc = df_trn_sc.drop(columns='FraudResult'), df_trn_sc['FraudResult']

In [13]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the rows for evaluation. Both calls stratify on the label
# and share one seed, so reruns are reproducible and the raw and scaled splits
# should hold out the same rows (same row count, same labels).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42)

X_sc_train, X_sc_test, y_sc_train, y_sc_test = train_test_split(
    X_sc, y_sc, test_size=0.2, stratify=y_sc, random_state=42)

In [None]:
# Replace each pandas object by its underlying array before it is handed to
# the classifiers.
X_train, X_test, y_train, y_test = (
    part.values for part in (X_train, X_test, y_train, y_test)
)

X_sc_train, X_sc_test, y_sc_train, y_sc_test = (
    part.values for part in (X_sc_train, X_sc_test, y_sc_train, y_sc_test)
)

In [None]:
# Show the dimensions of the hold-out feature array.
X_test.shape

In [None]:
# Show the dimensions of the hold-out label array.
y_test.shape

In [None]:
from collections import Counter

# Count how often each label occurs in the two training partitions.
# print() joins its arguments with a single space, matching the old format.
print('Train Label Distribution:', Counter(y_train))
print('Train-Scaled Label Distribution:', Counter(y_sc_train))

In [None]:
# Classifier Libraries
from sklearn.naive_bayes import GaussianNB

from sklearn.neighbors import KNeighborsClassifier

from sklearn.tree import DecisionTreeClassifier

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
# explicitly require this experimental feature
from sklearn.experimental import enable_hist_gradient_boosting  # noqa
# now you can import normally from ensemble
from sklearn.ensemble import HistGradientBoostingClassifier

from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

from sklearn.neural_network import MLPClassifier

In [None]:
# Candidate models keyed by display name; every name is padded to the same
# width. Insertion order is the order in which later cells iterate.
classifiers = dict([
    ('Naive Bayes                       ', GaussianNB()),
    ('k-Nearest Neighbors               ', KNeighborsClassifier(n_neighbors=3)),
    ('Decision Tree Classifier          ', DecisionTreeClassifier(max_depth=5)),
    ('Logisitic Regression              ', LogisticRegression()),
    ('AdaBoost Classifier               ', AdaBoostClassifier()),
    ('Bagging Classifier                ', BaggingClassifier()),
    ('Extra-Trees Classifier            ', ExtraTreesClassifier()),
    ('Gradient Boosting                 ', GradientBoostingClassifier()),
    ('Random Forest                     ', RandomForestClassifier(n_estimators=10, max_depth=5, max_features=1)),
    ('Histogram-based GradBoostClassTree', HistGradientBoostingClassifier()),
    ('Linear Discriminant Analysis      ', LinearDiscriminantAnalysis()),
    ('Quadratic Discriminant Analysis   ', QuadraticDiscriminantAnalysis()),
    ('Multilayer Perceptron             ', MLPClassifier(alpha=1, max_iter=1000)),
])
# Left out on purpose (previously commented out):
#   GaussianProcessClassifier(kernel=1.0 * RBF(1.0))
#   VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('gnb', clf3)], voting='soft')

In [None]:
from time import time
import warnings
# Silence every warning from here on.
# NOTE(review): this also hides anything the estimators in later cells may
# warn about — confirm that is intended.
warnings.filterwarnings("ignore")

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, accuracy_score, classification_report

In [None]:
# Single-model check: fit 3-nearest-neighbours on the training part and show
# its F1 on the hold-out part.
classifier = KNeighborsClassifier(n_neighbors=3).fit(X_train, y_train)
holdout_prediction = classifier.predict(X_test)
testing_score = f1_score(y_test, holdout_prediction)
testing_score

In [None]:
# Compare every candidate on the unscaled split: mean 5-fold CV F1 on the
# training part next to F1 on the hold-out part, plus wall-clock time.
print('Classifiers \t\t\t F1-score(train,CV) \t F1-score(test) \t Time')
for name, classifier in classifiers.items():
    tac = time()
    classifier.fit(X_train, y_train)
    # scoring='f1' is the same binary, positive-class F1 that f1_score()
    # reports below. The previous 'f1_macro' also averaged in the other class,
    # so the two printed columns were not comparable.
    training_score = cross_val_score(classifier, X_train, y_train, cv=5, scoring='f1')
    testing_score = f1_score(y_test, classifier.predict(X_test))
    tic = time()
    print(name, round(training_score.mean(), 5), ' \t\t', round(testing_score, 5), '\t\t', round(tic-tac, 2))

In [None]:
# Support-vector models; a later cell fits them on the scaled split.
classifiers_sc = dict([
    ('Support Vector Classifier, Linear', SVC(C=0.025, kernel="linear")),
    ('Support Vector Classifier, RBF   ', SVC(C=1, gamma=2, kernel="rbf")),
])

In [None]:
# Fit each support-vector model on the scaled training part and report its
# mean 5-fold cross-validation score (the estimator's default scorer) together
# with the elapsed wall-clock time.
print('Classifiers \t\t Cross_val_score \t Time')
for name, classifier in classifiers_sc.items():
    started = time()
    classifier.fit(X_sc_train, y_sc_train)
    cv_scores = cross_val_score(classifier, X_sc_train, y_sc_train, cv=5)
    elapsed = time() - started
    print(name, round(cv_scores.mean(), 5), ' \t', round(elapsed, 2))

### Submission

In [None]:
# Refit every candidate on the full training set, predict the submission rows
# and write one CSV per model.
print('Classifiers \t\t\t Num-Fraud-on-Test \t\t Time')
for name, classifier in classifiers.items():
    tac = time()
    classifier.fit(X.values, y.values)
    fraud_predict = classifier.predict(df_tst.values)
    tic = time()

    # The models are trained on FraudResult, so predict() already returns
    # those labels; only cast them to plain ints. The previous
    # `1 if x == -1 else 0` mapping suits outlier detectors that emit -1 and
    # turned every prediction here into 0.
    fraud_predict = [int(label) for label in fraud_predict]
    df_sbm['FraudResult'] = fraud_predict
    # Strip the alignment padding so file names do not end in spaces.
    df_sbm.to_csv('../submitted/AlBo_07_12_' + name.strip() + '.csv', encoding='utf-8', index=False)
    print(name, Counter(fraud_predict), ' \t', round(tic-tac, 2))

In [None]:
# Display the training feature frame for inspection.
X

In [None]:
# Display the test frame for inspection.
df_tst

### Submission - AdaBoost

In [None]:
# Fit AdaBoost with default settings on the training part and print its F1 on
# the hold-out part.
classifier = AdaBoostClassifier().fit(X_train, y_train)
predict = classifier.predict(X_test)
print(f1_score(y_true=y_test, y_pred=predict))

In [None]:
# Show how many hold-out rows were assigned to each predicted label.
Counter(predict)

In [None]:
# Keep the submission features under their own name. Rebinding X_test here
# would break the later f1_score(y_test, classifier.predict(X_test)) hold-out
# check, because y_test still belongs to the train/test split.
X_submission = df_tst.values

In [None]:
# Predict labels for the test-frame rows with the classifier fitted above.
FraudPred_AB = classifier.predict(df_tst.values)

In [None]:
# The classifier was trained on FraudResult, so its predictions are already
# those labels; only cast them to plain ints. The previous
# `1 if x == -1 else 0` mapping suits outlier detectors that emit -1 and
# turned every prediction here into 0.
FraudPred_AB = [int(label) for label in FraudPred_AB]

In [None]:
# Show how many submission rows fall under each predicted label.
Counter(FraudPred_AB)

In [None]:
# Write the predictions into the submission frame's FraudResult column.
df_sbm['FraudResult'] = FraudPred_AB

In [None]:
# Report the label counts that will be submitted; print() inserts the single
# space that separates the caption from the counter.
print('Test Label Distribution:', Counter(df_sbm['FraudResult']))

In [None]:
# Persist the AdaBoost submission without the index column.
df_sbm.to_csv(path_or_buf='../submitted/AlBo_07_11_AdaBoost.csv', index=False, encoding='utf-8')

### Submission - Decision Tree

In [None]:
# Fit a depth-5 decision tree on the training part and print its F1 on the
# hold-out part.
classifier = DecisionTreeClassifier(max_depth=5).fit(X_train, y_train)
predict = classifier.predict(X_test)
print(f1_score(y_true=y_test, y_pred=predict))

In [None]:
# Rebind X_test to the test-frame features used for the submission.
# NOTE(review): after this line X_test no longer matches y_test from the
# train/test split.
X_test = df_tst.values

In [None]:
# Predict labels for X_test with the decision tree fitted above.
# NOTE(review): the variable keeps the "_AB" suffix from the AdaBoost section.
FraudPred_AB = classifier.predict(X_test)

In [None]:
# The tree was trained on FraudResult, so its predictions are already those
# labels; only cast them to plain ints. The previous `1 if x == -1 else 0`
# mapping suits outlier detectors that emit -1 and turned every prediction
# here into 0.
FraudPred_AB = [int(label) for label in FraudPred_AB]

In [None]:
# Write the predictions into the submission frame's FraudResult column.
df_sbm['FraudResult'] = FraudPred_AB

In [None]:
# Report the label counts that will be submitted; print() inserts the single
# space that separates the caption from the counter.
print('Test Label Distribution:', Counter(df_sbm['FraudResult']))

In [None]:
# Write the decision-tree submission to its own file. The previous path reused
# the AdaBoost file name and overwrote that submission.
df_sbm.to_csv('../submitted/AlBo_07_11_DecisionTree.csv', encoding='utf-8', index=False)

# Undersample

In [None]:
# Undersample inside cross-validation rather than before it.
# NOTE(review): `df`, `sss`, `NearMiss`, `imbalanced_make_pipeline`, `log_reg`,
# `original_Xtrain` and `original_ytrain` are not defined in the visible part
# of this notebook, and the target column is called 'Class' here but
# 'FraudResult' in the cells above — confirm before running.
undersample_X = df.drop('Class', axis=1)
undersample_y = df['Class']

# Only the train/test partition of the last split survives this loop.
for train_index, test_index in sss.split(undersample_X, undersample_y):
    print("Train:", train_index, "Test:", test_index)
    undersample_Xtrain, undersample_Xtest = undersample_X.iloc[train_index], undersample_X.iloc[test_index]
    undersample_ytrain, undersample_ytest = undersample_y.iloc[train_index], undersample_y.iloc[test_index]
    
# Switch from pandas objects to their underlying arrays.
undersample_Xtrain = undersample_Xtrain.values
undersample_Xtest = undersample_Xtest.values
undersample_ytrain = undersample_ytrain.values
undersample_ytest = undersample_ytest.values 

# Per-fold metric accumulators filled by the loop below.
undersample_accuracy = []
undersample_precision = []
undersample_recall = []
undersample_f1 = []
undersample_auc = []

# Apply NearMiss to the whole data set once, only to print the resulting
# label distribution; X_nearmiss / y_nearmiss are not used afterwards.
# NOTE(review): newer imbalanced-learn releases name this method
# `fit_resample` — verify against the installed version.
X_nearmiss, y_nearmiss = NearMiss().fit_sample(undersample_X.values, undersample_y.values)
print('NearMiss Label Distribution: {}'.format(Counter(y_nearmiss)))
# Cross-validate with the resampling step inside the pipeline.

for train, test in sss.split(undersample_Xtrain, undersample_ytrain):
    undersample_pipeline = imbalanced_make_pipeline(NearMiss(sampling_strategy='majority'), log_reg) # resampling (NearMiss here) is part of the pipeline fitted on each fold, not applied beforehand
    undersample_model = undersample_pipeline.fit(undersample_Xtrain[train], undersample_ytrain[train])
    undersample_prediction = undersample_model.predict(undersample_Xtrain[test])
    
    # NOTE(review): predictions come from undersample_Xtrain[test] but are
    # scored against original_Xtrain / original_ytrain — confirm both refer to
    # the same rows.
    undersample_accuracy.append(undersample_pipeline.score(original_Xtrain[test], original_ytrain[test]))
    undersample_precision.append(precision_score(original_ytrain[test], undersample_prediction))
    undersample_recall.append(recall_score(original_ytrain[test], undersample_prediction))
    undersample_f1.append(f1_score(original_ytrain[test], undersample_prediction))
    undersample_auc.append(roc_auc_score(original_ytrain[test], undersample_prediction))