In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_validate, cross_val_score
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.base import TransformerMixin, BaseEstimator

import matplotlib.pyplot as plt
from matplotlib import ticker
# unused but required import for doing 3d projections with matplotlib < 3.2
import mpl_toolkits.mplot3d  # noqa: F401
from sklearn import manifold

from category_encoders.target_encoder import TargetEncoder
from imblearn.under_sampling import NearMiss
from sklearn.metrics import f1_score

from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

from sklearn.feature_selection import SelectFromModel

from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score, roc_auc_score
from sklearn.metrics import confusion_matrix, RocCurveDisplay, PrecisionRecallDisplay, classification_report

import optuna
from optuna.samplers import TPESampler
from optuna.pruners import HyperbandPruner

import pickle
random_state = 42

import warnings
warnings.filterwarnings("ignore")

In [4]:
raw_data = pd.merge(pd.read_csv('train_transaction.csv'), pd.read_csv('train_identity.csv'),
                    left_on='TransactionID', right_on='TransactionID', how='left')

In [5]:
pd.set_option('display.max_columns', None)

### Разделим данные на train и test

In [6]:
X_train, X_test = train_test_split(raw_data, test_size=0.2, random_state=random_state)
X_train.reset_index(drop=True, inplace=True)
y_train = X_train['isFraud']
X_train.drop(columns=['isFraud', 'TransactionID'], inplace=True)

X_test.reset_index(drop=True, inplace=True)
y_test = X_test['isFraud']
X_test.drop(columns=['isFraud', 'TransactionID'], inplace=True)

In [7]:
del raw_data

### Колонки с большим количеством пропусков

NanFeatureSelector удаляет колонки, в которых доля пропусков больше max_nan_rate (в нашем случае зададим max_nan_rate = 85%)

In [8]:
class NanFeatureSelector(TransformerMixin, BaseEstimator):

    def __init__(self, max_nan_rate):
        self.cols_to_remove = []
        self.max_nan_rate = max_nan_rate

    def fit(self, X, y=None):
        nan_stat = self.get_share_of_NaN(X)
        for i in range(len(nan_stat)):
            column = nan_stat.loc[i]
            if(column['Share_of_NaN'] > self.max_nan_rate):
                self.cols_to_remove.append(column['Name'])
        return self

    def transform(self, X):
        return X.drop(columns=self.cols_to_remove)
    
    def get_share_of_NaN(self, df):
        '''
        Рассчитывает количество пропусков в каждой колонке, а также долю пропусков в каждой колонке
        Параметры:
        df - датафрейм
        Возвращает датафрейм, содержащий информацию о пропусках в каждой колонке датафрейма df
        '''
        result = pd.DataFrame(columns=['Name', 'Number_of_NaN', 'Share_of_NaN'])
        colcount = df.count()
        length = len(df)
        for col_name in colcount.keys():
            result.loc[len(result)] = [col_name, length-colcount[col_name], (length-colcount[col_name])/length]
        return result

### Колонки с большой корреляцией

CorrFeatureSelector удаляет колонки, у которых корреляция больше max_corr (в нашем случае зададим max_corr = 0.9)


In [9]:
class CorrFeatureSelector(TransformerMixin, BaseEstimator):

    def __init__(self, max_corr):
        self.cols_to_remove = set()
        self.max_corr = max_corr

    def fit(self, X, y=None):
        corrs = X.corr()
        cols = corrs.columns
        for i in range(len(cols)):
            col_name_1 = cols[i]
            if col_name_1 in {'TransactionID', 'isFraud', 'TransactionDT'} or col_name_1 in self.cols_to_remove:
                continue
            for j in range(i+1, len(cols)):
                col_name_2 = cols[j]
                if abs(corrs[col_name_1][col_name_2]) > self.max_corr:
                    self.cols_to_remove.add(col_name_2)
        return self

    def transform(self, X):
        return X.drop(columns=self.cols_to_remove)

### Заполнение пропусков

In [10]:
class CustomImputer(TransformerMixin, BaseEstimator):
    
    def __init__(self, strategy='most_frequent', fill_value=None):
        self.__imputer = SimpleImputer(strategy=strategy, fill_value=fill_value)
        self.strategy = strategy
        self.fill_value = fill_value
        
    def fit(self, X, y=None):
        self.__imputer.fit(X)
        return self
    def transform(self, X, y=None):
        X = X.copy()
        X[X.columns] = self.__imputer.transform(X[X.columns])
        
        return X

### Кодирование категориальных признаков
Признаки, у которых количество уникальных значений больше, чем ohe_limit, закодируем с помощью TargetEncoder. Признаки, у которых количество уникальных значений меньше либо равно ohe_limit, закодируем с помощью pd.get_dummies (One-Hot Encoding).
В нашем случае зададим ohe_limit=10

In [11]:
class ObjectEncoder:
    def __init__(self, ohe_limit, obj_cols):
        self.__ohe_limit = ohe_limit
        self.__obj_cols = obj_cols
        self.__encoded_columns = None
        self.__ohe_cols = []
        self.__targ_enc_cols = []
        self.__targ_enc = None
        
    
    def divide_columns(self, df):
        for col in self.__obj_cols:
            if col in df.columns:
                if col in ['P_emaildomain', 'R_emaildomain'] or df[col].unique().shape[0] <= self.__ohe_limit:
                    self.__ohe_cols.append(col)
                else:
                    self.__targ_enc_cols.append(col)
                
    
    def encode_ohe_cols(self, df):
        # use OHE for columns from self.__ohe_cols only
        if self.__encoded_columns is not None:
            df = pd.get_dummies(df, drop_first=False, columns=self.__ohe_cols)

            removed_cols = []
            for col in df.columns:
                if col not in self.__encoded_columns:
                    removed_cols.append(col)
            df.drop(columns=removed_cols, inplace=True)

            for i in range(len(self.__encoded_columns)):
                col = self.__encoded_columns[i]
                if col not in df:
                    df.insert(i, col, 0)

        else:
            df = pd.get_dummies(df, drop_first=True, columns=self.__ohe_cols)
            self.__encoded_columns = df.columns
            
        return df

    def fit(self, X, y):
        self.divide_columns(X)
        self.__targ_enc = TargetEncoder(cols=self.__targ_enc_cols)
        self.__targ_enc.fit(X, y)
    
    def transform(self, X):
        X = self.__targ_enc.transform(X)
        return self.encode_ohe_cols(X)
    
    def fit_transform(self, X, y):
        self.divide_columns(X)
        self.__targ_enc = TargetEncoder(cols=self.__targ_enc_cols)
        X = self.__targ_enc.fit_transform(X, y)
        return self.encode_ohe_cols(X)

### Масштабирование числовых признаков

In [12]:
class CustomScaler(TransformerMixin, BaseEstimator):
    
    def __init__(self, cols, scaler=None):
        self.cols = cols
        self.scaler = scaler or preprocessing.MinMaxScaler()
        
    def fit(self, X, y=None):
        self.cols = list(set(self.cols).intersection(set(X.columns)))
        num_cols = X[self.cols]
        self.scaler.fit(num_cols)
        return self
    def transform(self, X, y=None):
        X_res = X.copy()
        X_res[self.cols] = self.scaler.transform(X_res[self.cols])
        return X_res

### Категориальные признаки

In [13]:
cat_cols = ['ProductCD', 'card1', 'card2', 'card3', 'card4', 'card5', 'card6',
            'addr1', 'addr2', 'P_emaildomain', 'R_emaildomain',
            'M1', 'M2', 'M3', 'M4', 'M5', 'M6', 'M7', 'M8', 'M9',
            'DeviceType', 'DeviceInfo', 'id_12', 'id_13', 'id_14', 'id_15', 'id_16', 'id_17', 'id_18', 'id_19', 'id_20'
            'id_21', 'id_22', 'id_23', 'id_24', 'id_25', 'id_26', 'id_27', 'id_28', 'id_29', 'id_30', 'id_31', 'id_32'
            'id_33', 'id_34', 'id_35', 'id_36', 'id_37', 'id_38']

### Числовые признаки

In [14]:
real_cols = list(X_train.columns)

for col in cat_cols:
    if col in real_cols:
        real_cols.remove(col)

### Пайплайн предобработки данных

In [15]:
# Пайплайн удаляет признаки, у которых более 85% пропусков, а также признаки, у которых корреляция больше 0.9,
# заполняет пропуски, кодирует категориальные признаки, масштабирует числовые признаки
prep_data_pipe = Pipeline([
    ('nan_feature_selector_', NanFeatureSelector(0.85)),
    ('corr_feature_selector_', CorrFeatureSelector(0.9)),
    ('imputer_', CustomImputer(strategy='constant', fill_value=-999)),
    ('encoder_', ObjectEncoder(10, cat_cols)),
    ('scaler_', CustomScaler(real_cols, preprocessing.MinMaxScaler()))
])

In [16]:
%%time
X_train = prep_data_pipe.fit_transform(X_train, y_train)

CPU times: user 2min 24s, sys: 36.6 s, total: 3min
Wall time: 3min


## Бустинги с гиперпараметрами, подобранными с помощью optuna

### Функция для расчета ROC AUC по кросс-валидации

In [17]:
def cross_val_modelling(kfold, X_train, y_train, model):

    skf = StratifiedKFold(n_splits=kfold, shuffle=True, random_state=42)
    roc_aucs = []

    for train_idx, valid_idx in skf.split(X_train, y_train):
        trn_X = X_train.iloc[train_idx]
        trn_y = y_train.iloc[train_idx]
        val_X = X_train.iloc[valid_idx]
        val_y = y_train.iloc[valid_idx]
        
        model.fit(trn_X, trn_y, verbose=False)
        score = roc_auc_score(val_y, model.predict_proba(val_X)[:,1])
        
        roc_aucs.append(score)

    return roc_aucs

### LGBMClassifier с гиперпараметрами, подобранными с помощью optuna

In [18]:
lgbm_clf = LGBMClassifier(random_state=42,
                      n_estimators=2500,
                      num_leaves=25,
                      learning_rate=0.05,
                      reg_lambda=5,
                      min_child_samples=45,
                      class_weight='balanced',
                      objective = 'binary',
                      verbose=0)

In [19]:
%%time
lgbm_cv_res = cross_val_modelling(3, X_train, y_train, lgbm_clf)

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
CPU times: user 12min 15s, sys: 4.01 s, total: 12min 19s
Wall time: 1min 4s


In [20]:
np.mean(lgbm_cv_res)

0.9666829824707924

In [44]:
%%time
lgbm_cv_sk_5 = cross_val_score(lgbm_clf, X_train, y_train, cv=5)
np.mean(lgbm_cv_sk_5)

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
CPU times: user 22min 10s, sys: 5.47 s, total: 22min 16s
Wall time: 1min 54s


0.9603371492684971

### CatBoostClassifier с гиперпараметрами, подобранными с помощью optuna

In [21]:
cb_clf = CatBoostClassifier(n_estimators=8000,
                            depth=7,
                            task_type="GPU",
                            learning_rate=0.1,
                            loss_function='Logloss',
                            verbose=False)


In [22]:
%%time
cb_cv_res = cross_val_modelling(3, X_train, y_train, cb_clf)
np.mean(cb_cv_res)

CPU times: user 3min 45s, sys: 1min 37s, total: 5min 22s
Wall time: 2min 49s


0.9672983908881488

In [42]:
%%time
cb_cv_sk_5 = cross_val_score(cb_clf, X_train, y_train, cv=5)
np.mean(cb_cv_sk_5)

CPU times: user 6min 44s, sys: 2min 55s, total: 9min 39s
Wall time: 5min 8s


0.9874479282297086

In [23]:
%%time
cb_clf.fit(X_train, y_train)
with open('CB_CLF.pkl', 'wb') as f:
    pickle.dump(cb_clf, f)

CPU times: user 1min 31s, sys: 40 s, total: 2min 11s
Wall time: 1min 11s


### XGBClassifier с гиперпараметрами, подобранными с помощью optuna

In [24]:
xgb_clf = XGBClassifier(learning_rate=0.479,
                        n_estimators=1499,
                        tree_method='gpu_hist',
                        gamma=0.203,
                        max_depth=12)

In [25]:
%%time
xgb_cv_res = cross_val_modelling(3, X_train, y_train, xgb_clf)
np.mean(xgb_cv_res)

CPU times: user 1min 36s, sys: 1.67 s, total: 1min 38s
Wall time: 41.4 s


0.9655899969062128

In [43]:
%%time
xgb_cv_sk_5 = cross_val_score(xgb_clf, X_train, y_train, cv=5)
np.mean(xgb_cv_sk_5)

CPU times: user 2min 58s, sys: 3.46 s, total: 3min 1s
Wall time: 1min 19s


0.9863557070708943

In [26]:
%%time
xgb_clf.fit(X_train, y_train)
with open('XGB_CLF.pkl', 'wb') as f:
    pickle.dump(xgb_clf, f)

CPU times: user 40.1 s, sys: 596 ms, total: 40.7 s
Wall time: 19 s


### StackingClassifier

In [27]:
estimators = [('lgbm', lgbm_clf), ('cb', cb_clf), ('xgb', xgb_clf)]
stack_clf = StackingClassifier(estimators=estimators)

In [33]:
import sklearn
from sklearn.model_selection import cross_val_score

In [34]:
%%time
# use sklearn crossval score 
# 
stack_cv_res = cross_val_score(stack_clf, X_train, y_train, cv=3)
np.mean(stack_cv_res)

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
You can set `force_row_w

0.9869166349046471

In [35]:
%%time
stack_clf.fit(X_train, y_train)
with open('STACK_CLF.pkl', 'wb') as f:
    pickle.dump(stack_clf, f)

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
CPU times: user 39min 59s, sys: 3min 50s, total: 43min 49s
Wall time: 10min 25s


In [36]:
X_test = prep_data_pipe.transform(X_test)

In [37]:
stack_pred_proba = stack_clf.predict_proba(X_test)

In [39]:
roc_auc_score(y_test, stack_pred_proba[:,1])

0.9651440211205354

In [41]:
%%time
stack_cv_res_5 = cross_val_score(stack_clf, X_train, y_train, cv=5)
np.mean(stack_cv_res_5)

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
You can set `force_row_w

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
CPU times: user 2h 48min 24s, sys: 17min 1s, total: 3h 5min 26s
Wall time: 44min 53s


0.9876151479862413