In [1]:
from typing import Tuple, List
import warnings

warnings.simplefilter('ignore')

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
from sklearn.model_selection import train_test_split, validation_curve, learning_curve

In [2]:
data = pd.read_csv('./ieee-fraud-detection/data.csv')
print('data shape = rows {}, columns {}.'.format(*data.shape))
data.head(n=2)

data shape = rows 100000, columns 394.


Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,V330,V331,V332,V333,V334,V335,V336,V337,V338,V339
0,2987000,0,86400,68.5,W,13926,,150.0,discover,142.0,...,,,,,,,,,,
1,2987001,0,86401,29.0,W,2755,404.0,150.0,mastercard,102.0,...,,,,,,,,,,


In [3]:
columns = ['isFraud', 'TransactionAmt', 'card1', 'C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C7', 'C8', 'C9', 'C10', 'C11', 'C12', 'C13', 'C14', 'D1']

In [4]:
data_hw = data[columns]
data_hw.head(n=2)

Unnamed: 0,isFraud,TransactionAmt,card1,C1,C2,C3,C4,C5,C6,C7,C8,C9,C10,C11,C12,C13,C14,D1
0,0,68.5,13926,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,2.0,0.0,1.0,1.0,14.0
1,0,29.0,2755,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0


In [5]:
x_train, x_valid = train_test_split(
    data_hw.drop(['isFraud'], axis=1), train_size=0.7, shuffle=True, random_state=1
)

y_train, y_valid = train_test_split(
    data_hw['isFraud'], train_size=0.7, shuffle=True, random_state=1
)

print('X_train = rows {}, columns {}.'.format(*x_train.shape))
print('X_valid = rows {}, columns {}.'.format(*x_valid.shape))

X_train = rows 70000, columns 17.
X_valid = rows 30000, columns 17.


In [6]:
forest = RandomForestClassifier(random_state=27)
forest.fit(x_train, y_train)

RandomForestClassifier(random_state=27)

In [7]:
train_score = roc_auc_score(y_train, forest.predict_proba(x_train)[:,1])
valid_score = roc_auc_score(y_valid, forest.predict_proba(x_valid)[:,1])

print('Train_score: {}, valid_score: {}'.format(train_score, round(valid_score,4)))

Train_score: 0.9997712635828636, valid_score: 0.8984


In [None]:
train_scores, valid_scores = validation_curve(
    X = x_train,
    y = y_train,
    estimator=RandomForestClassifier(random_state=27),
    param_range = range(2,20),
    param_name = 'max_depth',
    scoring = 'roc_auc',
    n_jobs = -1,
    cv=3
)

In [None]:
plt.plot(
    range(train_scores.shape[0]),
    np.mean(train_scores, axis=1),
    label = 'train',
    linewidth = 3,
    marker = 's',
)

plt.fill_between(
    x = range(train_scores.shape[0]),
    y1 = np.mean(train_scores, axis=1) - np.std(train_scores, axis=1),
    y2 = np.mean(train_scores, axis=1) + np.std(train_scores, axis=1),
    alpha = 0.25
)

plt.plot(
    range(train_scores.shape[0]),
    np.mean(valid_scores, axis=1),
    label='valid',
    linewidth=3,
    marker='s',
)

plt.fill_between(
    x = range(train_scores.shape[0]),
    y1 = np.mean(valid_scores, axis=1) - np.std(valid_scores, axis=1),
    y2 = np.mean(valid_scores, axis=1) + np.std(valid_scores, axis=1),
    alpha = .25,
)

plt.title('Validation Curve: max_depth')
plt.legend(loc='best', fontsize=14)
plt.ylabel('roc_auc')

In [8]:
forest = RandomForestClassifier(max_depth=14, random_state=27)

forest.fit(x_train, y_train)

RandomForestClassifier(max_depth=14, random_state=27)

In [9]:
train_pred = roc_auc_score(y_train, forest.predict_proba(x_train)[:,1])
valid_pred = roc_auc_score(y_valid, forest.predict_proba(x_valid)[:,1])

print('train_pred: {}, valid_pred: {}'.format(train_pred, valid_pred))

train_pred: 0.9713558213535896, valid_pred: 0.9078281513001842


In [11]:
def create_bootstrap_samples(data: np.array, n_samples: int = 1000) -> np.array:
    """
    Создание бутстреп-выборок.

    Parameters
    ----------
    data: np.array
        Исходная выборка, которая будет использоваться для
        создания бутстреп выборок.

    n_samples: int, optional, default = 1000
        Количество создаваемых бутстреп выборок.
        Опциональный параметр, по умолчанию, равен 1000.

    Returns
    -------
    bootstrap_idx: np.array
        Матрица индексов, для создания бутстреп выборок.

    """
    bootstrap_idx = np.random.randint(
        low=0, high=len(data), size=(n_samples, len(data))
    )
    return bootstrap_idx


def create_bootstrap_metrics(y_true: np.array,
                             y_pred: np.array,
                             metric: callable,
                             n_samlpes: int = 1000) -> List[float]:
    """
    Вычисление бутстреп оценок.

    Parameters
    ----------
    y_true: np.array
        Вектор целевой переменной.

    y_pred: np.array
        Вектор прогнозов.

    metric: callable
        Функция для вычисления метрики.
        Функция должна принимать 2 аргумента: y_true, y_pred.

    n_samples: int, optional, default = 1000
        Количество создаваемых бутстреп выборок.
        Опциональный параметр, по умолчанию, равен 1000.

    Returns
    -------
    bootstrap_metrics: List[float]
        Список со значениями метрики качества на каждой бустреп выборке.

    """
    scores = []

    if isinstance(y_true, pd.Series):
        y_true = y_true.values

    bootstrap_idx = create_bootstrap_samples(y_true)
    for idx in bootstrap_idx:
        y_true_bootstrap = y_true[idx]
        y_pred_bootstrap = y_pred[idx]

        score = metric(y_true_bootstrap, y_pred_bootstrap)
        scores.append(score)

    return scores


def calculate_confidence_interval(scores: list, conf_interval: float = 0.95) -> Tuple[float]:
    """
    Вычисление доверительного интервала.

    Parameters
    ----------
    scores: List[float / int]
        Список с оценками изучаемой величины.

    conf_interval: float, optional, default = 0.95
        Уровень доверия для построения интервала.
        Опциональный параметр, по умолчанию, равен 0.95.

    Returns
    -------
    conf_interval: Tuple[float]
        Кортеж с границами доверительного интервала.

    """
    left_bound = np.percentile(
        scores, ((1 - conf_interval) / 2) * 100
    )
    right_bound = np.percentile(
        scores, (conf_interval + ((1 - conf_interval) / 2)) * 100
    )

    return left_bound, right_bound

In [15]:
scores = create_bootstrap_metrics(y_valid, forest.predict_proba(x_valid)[:,1], roc_auc_score)

In [16]:
calculate_confidence_interval(scores)

(0.8932036561117737, 0.922418576048409)

In [17]:
round(0.8932036561117737 - 0.922418576048409, 4)

-0.0292