In [1]:
import warnings
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
warnings.simplefilter("ignore")
%matplotlib inline

In [2]:
# Read the transactions table from disk and print a structural summary
# (row/column counts, dtype breakdown, memory usage).
df = pd.read_csv(filepath_or_buffer="data.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Columns: 394 entries, TransactionID to V339
dtypes: float64(376), int64(4), object(14)
memory usage: 300.6+ MB


In [3]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,V330,V331,V332,V333,V334,V335,V336,V337,V338,V339
0,2987000,0,86400,68.5,W,13926,,150.0,discover,142.0,...,,,,,,,,,,
1,2987001,0,86401,29.0,W,2755,404.0,150.0,mastercard,102.0,...,,,,,,,,,,
2,2987002,0,86469,59.0,W,4663,490.0,150.0,visa,166.0,...,,,,,,,,,,
3,2987003,0,86499,50.0,W,18132,567.0,150.0,mastercard,117.0,...,,,,,,,,,,
4,2987004,0,86506,50.0,H,4497,514.0,150.0,mastercard,102.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [4]:
# Count missing entries per column, then collect the names of the columns
# that contain no missing values at all.
missing_values = df.isna().sum()
features_without_null = [
    column for column, n_missing in missing_values.items() if n_missing == 0
]
len(features_without_null)

75

In [5]:
# Columns used for modelling: the target (`isFraud`) followed by the predictors.
features = ['isFraud', 'TransactionAmt', 'card1', 'C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C7', 'C8', 'C9', 'C10', 'C11', 'C12', 'C13', 'C14', 'D1']

# Every selected column must be free of missing values: the modelling pipeline
# below only scales and fits, it does not impute. Fail loudly (KeyError, as the
# original two-step selection did) if a column with nulls sneaks in.
columns_with_nulls = [column for column in features if column not in features_without_null]
if columns_with_nulls:
    raise KeyError(f"Selected columns contain missing values: {columns_with_nulls}")

# Select directly by `features` instead of first narrowing to
# `features_without_null`: the intermediate step made this cell fail with a
# KeyError when re-run, because `df` no longer holds all null-free columns
# after the first execution. `.copy()` detaches the result from the raw frame.
df = df[features].copy()
df.head()

Unnamed: 0,isFraud,TransactionAmt,card1,C1,C2,C3,C4,C5,C6,C7,C8,C9,C10,C11,C12,C13,C14,D1
0,0,68.5,13926,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,2.0,0.0,1.0,1.0,14.0
1,0,29.0,2755,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0
2,0,59.0,4663,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0
3,0,50.0,18132,2.0,5.0,0.0,0.0,0.0,4.0,0.0,0.0,1.0,0.0,1.0,0.0,25.0,1.0,112.0
4,0,50.0,4497,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0


## Отложенная выборка

In [6]:
# Hold-out split: 70% of the rows for training, the remaining 30% for testing.
# Features and target are passed to a single call so that both are cut with
# the same row indices by construction, rather than relying on two separate
# calls happening to share identical arguments. The partition is the same as
# before: it depends only on the number of rows and `random_state`.
x_train, x_test, y_train, y_test = train_test_split(
    df.drop(columns=["isFraud"]),
    df["isFraud"],
    train_size=0.7,
    shuffle=True,
    random_state=1,
)

In [7]:
# Two-stage model: standardise the features, then fit a logistic regression
# with default hyper-parameters.
pipeline_steps = [
    ("scaling", StandardScaler()),
    ("model", LogisticRegression()),
]
pipeline = Pipeline(steps=pipeline_steps)

# Fit both stages on the training part of the hold-out split.
pipeline.fit(X=x_train, y=y_train)

In [8]:
# Score the test rows; column 1 of `predict_proba` is the probability of the
# second class (presumably isFraud == 1 — the target looks binary 0/1).
proba = pipeline.predict_proba(x_test)[:, 1]
roc_auc = roc_auc_score(y_true=y_test, y_score=proba)
roc_auc

0.7121705958222013

## Валидация на отложенной и тестовой выборках

In [9]:
# Three-way split: 50% train, 25% validation, 25% test.
# Features and target go through the same call at each stage, so their rows
# are aligned by construction instead of depending on four independent calls
# sharing identical arguments. The resulting partitions are unchanged.

# Stage 1: half of the rows for training, the other half held back.
x_train, x_holdout, y_train, y_holdout = train_test_split(
    df.drop(columns=["isFraud"]),
    df["isFraud"],
    train_size=0.5,
    shuffle=True,
    random_state=1,
)

# Stage 2: cut the held-back half evenly into validation and test parts.
x_valid, x_test, y_valid, y_test = train_test_split(
    x_holdout,
    y_holdout,
    train_size=0.5,
    shuffle=True,
    random_state=1,
)

# Sanity checks: no rows lost or duplicated, and features/target stay aligned.
assert len(x_train) + len(x_valid) + len(x_test) == len(df)
assert x_train.index.equals(y_train.index)
assert x_valid.index.equals(y_valid.index)
assert x_test.index.equals(y_test.index)

for name, part in (("x_train", x_train), ("x_valid", x_valid), ("x_test", x_test)):
    print("{}.shape = {} rows, {} cols".format(name, *part.shape))

x_train.shape = 50000 rows, 17 cols
x_valid.shape = 25000 rows, 17 cols
x_test.shape = 25000 rows, 17 cols


In [10]:
# Refit the same pipeline object on the 50% training part of the three-way split.
pipeline.fit(X=x_train, y=y_train)

In [11]:
# ROC AUC of the fitted pipeline on each part of the three-way split,
# evaluated in train -> valid -> test order.
train_score, valid_score, test_score = (
    roc_auc_score(target, pipeline.predict_proba(data)[:, 1])
    for data, target in ((x_train, y_train), (x_valid, y_valid), (x_test, y_test))
)

print(f"Train-score: {train_score}, Valid-score: {valid_score}, Test-score: {test_score}")

Train-score: 0.7064504287132019, Valid-score: 0.7056096778146643, Test-score: 0.7173168128431394


## Кросс-валидация

In [12]:
# Plain (non-stratified) 5-fold cross-validation with shuffled, seeded folds.
kfold = KFold(n_splits=5, shuffle=True, random_state=27)

# One ROC AUC value per fold; the pipeline is refitted on each training fold.
cv = cross_val_score(
    pipeline,
    df.drop(columns=["isFraud"]),
    df["isFraud"],
    scoring="roc_auc",
    cv=kfold,
)

# Summarise as mean +/- standard deviation across the folds.
cv_mean = round(cv.mean(), 4)
cv_spread = round(cv.std(), 3)
print(f"CV-results: {cv_mean} +/- {cv_spread}")

CV-results: 0.6936 +/- 0.021
