In [1]:
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
warnings.simplefilter("ignore")
%matplotlib inline

In [2]:
df = pd.read_csv("data.csv", nrows=60000
)

df.head(n=2)

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,V330,V331,V332,V333,V334,V335,V336,V337,V338,V339
0,2987000,0,86400,68.5,W,13926,,150.0,discover,142.0,...,,,,,,,,,,
1,2987001,0,86401,29.0,W,2755,404.0,150.0,mastercard,102.0,...,,,,,,,,,,


In [3]:
# Посчитать количество пропусков в признаках
missing_values_count = df.isnull().sum()

# Выбрать признаки без пропущенных значений
non_missing_features = missing_values_count[missing_values_count == 0].index

# Подсчитать количество признаков без пропущенных значений
num_non_missing_features = len(non_missing_features)

print("Количество признаков без пропущенных значений:")
print(num_non_missing_features)

Количество признаков без пропущенных значений:
75


In [4]:
df.fillna(0, inplace=True)

In [5]:
df.replace('XNA', np.nan, inplace=True)

In [6]:
#это все колонки, которые имеют количественные значения или числовые (например бинарные 0,1)
numeric_features = df.dtypes[(df.dtypes == np.float64) | (df.dtypes == np.int64)].keys().tolist()
#также выберем заинетерсовавшие нас категориальные признаки
cat_features = ['TransactionAmt', 'card1', 'C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C7', 'C8', 'C9', 'C10', 'C11', 'C12', 'C13', 'C14', 'D1']

In [7]:
df = df[numeric_features + cat_features]
df.head(n=2)

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,card1,card2,card3,card5,addr1,addr2,...,C6,C7,C8,C9,C10,C11,C12,C13,C14,D1
0,2987000,0,86400,68.5,13926,0.0,150.0,142.0,315.0,87.0,...,1.0,0.0,0.0,1.0,0.0,2.0,0.0,1.0,1.0,14.0
1,2987001,0,86401,29.0,2755,404.0,150.0,102.0,325.0,87.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0


In [8]:
# Разбиваем данные на обучающую и тестовую часть
X_train, X_test, y_train, y_test = train_test_split(df[cat_features], df['isFraud'], test_size=0.7, random_state=1)

# Масштабируем признаки
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Обучаем модель линейной регрессии
model = LinearRegression()
model.fit(X_train, y_train)

# Оцениваем модель на тестовой части
y_pred = model.predict(X_test)
roc_auc = roc_auc_score(y_test, y_pred)
print('ROC-AUC:', roc_auc)

ROC-AUC: 0.7061197582262433


In [12]:
pipeline = Pipeline(steps=[('scaling', StandardScaler()), ('model', LogisticRegression(random_state=27))])

In [13]:
pipeline.fit(x_train, y_train)

In [14]:
# Разобьем данные на train / valid

x_train, x_valid = train_test_split(
    df.drop("isFraud", axis=1), train_size=0.5, shuffle=True, random_state=1,
)
y_train, y_valid = train_test_split(
    df["isFraud"], train_size=0.5, shuffle=True, random_state=1,
)

# Разобьем данные на valid / test

x_valid, x_test = train_test_split(
    x_valid, train_size=0.25, shuffle=True, random_state=1
)
y_valid, y_test = train_test_split(
    y_valid, train_size=0.25, shuffle=True, random_state=1
)

print("x_train.shape = {} rows, {} cols".format(*x_train.shape))
print("x_valid.shape = {} rows, {} cols".format(*x_valid.shape))
print("x_test.shape = {} rows, {} cols".format(*x_test.shape))

x_train.shape = 30000 rows, 396 cols
x_valid.shape = 7500 rows, 396 cols
x_test.shape = 22500 rows, 396 cols


In [15]:
train_score = roc_auc_score(y_train, pipeline.predict_proba(x_train)[:, 1])
valid_score = roc_auc_score(y_valid, pipeline.predict_proba(x_valid)[:, 1])
test_score = roc_auc_score(y_test, pipeline.predict_proba(x_test)[:, 1])

print(f"Train-score: {round(train_score, 3)}, Valid-score: {round(valid_score, 3)}, Test-score: {round(test_score, 3)}")

difference = valid_score - test_score
print("Количество признаков без пропущенных значений:")
print(difference)

Train-score: 0.872, Valid-score: 0.854, Test-score: 0.837
Количество признаков без пропущенных значений:
0.017012155997670364


In [16]:
accuracy_difference = valid_score - test_score
print(f"Accuracy Difference: {round(accuracy_difference, 4)}")


Accuracy Difference: 0.017
