In [8]:
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
warnings.simplefilter("ignore")
%matplotlib inline

In [9]:
# Read the transactions table from the working directory and preview its first rows.
data = pd.read_csv("data.csv")
data.head(n=5)

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,V330,V331,V332,V333,V334,V335,V336,V337,V338,V339
0,2987000,0,86400,68.5,W,13926,,150.0,discover,142.0,...,,,,,,,,,,
1,2987001,0,86401,29.0,W,2755,404.0,150.0,mastercard,102.0,...,,,,,,,,,,
2,2987002,0,86469,59.0,W,4663,490.0,150.0,visa,166.0,...,,,,,,,,,,
3,2987003,0,86499,50.0,W,18132,567.0,150.0,mastercard,117.0,...,,,,,,,,,,
4,2987004,0,86506,50.0,H,4497,514.0,150.0,mastercard,102.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
# Number of missing values in every column.
data.isna().sum()

TransactionID         0
isFraud               0
TransactionDT         0
TransactionAmt        0
ProductCD             0
                  ...  
V335              68229
V336              68229
V337              68229
V338              68229
V339              68229
Length: 394, dtype: int64

In [11]:
# Share of missing values in every column (mean of the boolean missing-value mask).
data.isna().mean()

TransactionID     0.00000
isFraud           0.00000
TransactionDT     0.00000
TransactionAmt    0.00000
ProductCD         0.00000
                   ...   
V335              0.68229
V336              0.68229
V337              0.68229
V338              0.68229
V339              0.68229
Length: 394, dtype: float64

In [12]:
# Names of the columns that have no missing values at all.
without_missing = data.columns[data.notna().all()]
without_missing

Index(['TransactionID', 'isFraud', 'TransactionDT', 'TransactionAmt',
       'ProductCD', 'card1', 'C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C7', 'C8',
       'C9', 'C10', 'C11', 'C12', 'C13', 'C14', 'D1', 'V95', 'V96', 'V97',
       'V98', 'V99', 'V100', 'V101', 'V102', 'V103', 'V104', 'V105', 'V106',
       'V107', 'V108', 'V109', 'V110', 'V111', 'V112', 'V113', 'V114', 'V115',
       'V116', 'V117', 'V118', 'V119', 'V120', 'V121', 'V122', 'V123', 'V124',
       'V125', 'V126', 'V127', 'V128', 'V129', 'V130', 'V131', 'V132', 'V133',
       'V134', 'V135', 'V136', 'V137', 'V281', 'V282', 'V283', 'V288', 'V289',
       'V296', 'V300', 'V301', 'V313', 'V314', 'V315'],
      dtype='object')

In [13]:
# How many columns are fully populated (no missing values).
int(data.notna().all().sum())

75

In [14]:
# Feature matrix: TransactionAmt, card1, the counters C1..C14 and D1
# (all of them appear in the list of columns without missing values above).
feature_columns = ["TransactionAmt", "card1", *(f"C{i}" for i in range(1, 15)), "D1"]
X = data[feature_columns]

# Binary target column.
y = data["isFraud"]

In [15]:
# Hold-out split: 30% of the rows are set aside as the test part (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1,
)

In [16]:
# Report the size of each part of the hold-out split.
for label, frame in (("x_train", X_train), ("x_test", X_test)):
    n_rows, n_cols = frame.shape
    print(f"{label}.shape = {n_rows} rows, {n_cols} cols")

x_train.shape = 70000 rows, 17 cols
x_test.shape = 30000 rows, 17 cols


In [17]:
# Fit the scaler on the training part only, then apply the same
# transformation to both the training and the test part.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [18]:
# Baseline classifier with default settings, trained on the scaled training part.
model = LogisticRegression()
model.fit(X=X_train_scaled, y=y_train)

In [19]:
# Keep only column 1 of the predicted probabilities for the scaled test part.
test_probabilities = model.predict_proba(X_test_scaled)
y_pred_proba = test_probabilities[:, 1]

In [20]:
# ROC-AUC of the baseline model on the hold-out test part.
roc_auc = roc_auc_score(y_true=y_test, y_score=y_pred_proba)
print("ROC-AUC: {:.4f}".format(roc_auc))

ROC-AUC: 0.7107


In [21]:
# First stage of the three-way split: half of the rows become the training part,
# the other half is kept aside to be divided further.
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size=0.5,
    random_state=1,
)

In [22]:
# Second stage: divide the set-aside half equally into valid / test.
X_valid, X_test, y_valid, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    random_state=1,
)

In [23]:
# Report the size of each part of the train / valid / test split.
for label, frame in (("x_train", X_train), ("x_valid", X_valid), ("x_test", X_test)):
    n_rows, n_cols = frame.shape
    print(f"{label}.shape = {n_rows} rows, {n_cols} cols")

x_train.shape = 50000 rows, 17 cols
x_valid.shape = 25000 rows, 17 cols
x_test.shape = 25000 rows, 17 cols


In [24]:
# Scaling and the classifier bundled into one estimator, so the scaler is
# fitted together with the model on whatever data the pipeline is fitted on.
pipeline = Pipeline(
    steps=[
        ("scaling", StandardScaler()),
        ("model", LogisticRegression(random_state=27)),
    ]
)

In [25]:
# Fit the whole pipeline (scaler + model) on the training part.
pipeline.fit(X=X_train, y=y_train)

In [26]:
# ROC-AUC of the fitted pipeline on each part of the split, in the order
# train, valid, test; column 1 of predict_proba is used as the score.
train_score, valid_score, test_score = (
    roc_auc_score(target, pipeline.predict_proba(features)[:, 1])
    for features, target in (
        (X_train, y_train),
        (X_valid, y_valid),
        (X_test, y_test),
    )
)

summary = (
    f"Train-score: {round(train_score, 4)}, "
    f"Valid-score: {round(valid_score, 4)}, "
    f"Test-score: {round(test_score, 4)}"
)
print(summary)

Train-score: 0.7117, Valid-score: 0.7073, Test-score: 0.7199


In [28]:
# Gap between the validation and the test ROC-AUC (valid minus test).
# Both scores come from roc_auc_score, so the value is a ROC-AUC difference,
# not an accuracy difference; the variable name is kept unchanged so that any
# later cell that reads it keeps working.
accuracy_difference = valid_score - test_score
print(f"ROC-AUC difference (valid - test): {round(accuracy_difference, 4)}")

Accuracy Difference: -0.0126


In [29]:
# NOTE(review): these names are already imported by the notebook's first cell;
# the repeated imports are kept so that this cell carries its own dependencies.
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score

# Assumes X and y have already been defined by an earlier cell.
# Split into train / valid / test in a 50% / 25% / 25% ratio.
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.5, random_state=1)
X_valid, X_test, y_valid, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=1)

# Build the pipeline: scaling followed by logistic regression.
pipeline = Pipeline(steps=[('scaling', StandardScaler()), ('model', LogisticRegression(random_state=27))])

# Train the model on the training part.
pipeline.fit(X_train, y_train)

# Evaluate the model: ROC-AUC on each part, using column 1 of predict_proba as the score.
train_score = roc_auc_score(y_train, pipeline.predict_proba(X_train)[:, 1])
valid_score = roc_auc_score(y_valid, pipeline.predict_proba(X_valid)[:, 1])
test_score = roc_auc_score(y_test, pipeline.predict_proba(X_test)[:, 1])

# Print the results.
print(f"Train-score: {round(train_score, 4)}, Valid-score: {round(valid_score, 4)}, Test-score: {round(test_score, 4)}")

# Difference between the validation and the test score. Both are ROC-AUC values,
# so the label says ROC-AUC rather than accuracy; the variable name is kept
# unchanged for any later cell that reads it.
accuracy_difference = valid_score - test_score
print(f"ROC-AUC difference (valid - test): {round(accuracy_difference, 4)}")


Train-score: 0.7117, Valid-score: 0.7073, Test-score: 0.7199
Accuracy Difference: -0.0126


In [32]:
# определяем стратегию для проведения кросс-валидации
kfold = KFold(n_splits=5, shuffle=True, random_state=27)

# проводим кросс-валидацию
cv = cross_val_score(
    estimator=pipeline,
    X = data[['TransactionAmt', 'card1', 'C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C7', 'C8', 'C9', 'C10', 'C11', 'C12', 'C13', 'C14', 'D1']],
    y = data['isFraud'],
    scoring="roc_auc",
    cv=kfold
)

# Считаем среднее значение метрики на каждом фолде и выводим среднее значение
print(f"CV-results: {round(np.mean(cv), 4)} +/- {round(np.std(cv), 4)}")

CV-results: 0.6916 +/- 0.0216
