-
Notifications
You must be signed in to change notification settings - Fork 0
/
main.py
130 lines (111 loc) · 5.05 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import tensorflow as tf
from keras.utils import to_categorical
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import recall_score, f1_score, accuracy_score, roc_auc_score
# Silence every warning category so library messages do not clutter the output
import warnings
warnings.simplefilter('ignore', Warning)
# Configure pandas printing: show all columns, allow 500-character lines,
# and render floats with two decimals
display_options = {
    'display.max_columns': None,
    'display.width': 500,
    'display.float_format': lambda x: '%.2f' % x,
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)
# Read the dataset from disk; the first CSV column is used as the row index
DATA_PATH = "fraud.csv"
df = pd.read_csv(DATA_PATH, index_col=0)
"""
user_id amount device_type is_fraud age income debt credit_score
0 32fe1d6f-2c2e-476f-981b-b254a7bca753 4701.31 Mobile 1 29 139977.00 10069.32 437
1 d684726b-775b-451c-8e2a-7a5bfbdc840c 1904.93 Tablet 1 23 137934.91 3623.21 303
2 778f102c-7932-48b3-80e3-1375a152b791 381.68 Desktop 0 44 47439.27 44627.27 663
3 30458cfd-f9b7-4efb-afce-cc4470d5baf1 2262.44 Tablet 1 58 44551.91 16533.34 479
4 21076278-c8fa-4446-b002-ced7e8b870da 15.91 Desktop 0 32 88902.17 3719.38 695
... ... ... ... ... ... ... ...
1920275 d1efd401-7cf3-461d-9936-18fdca061f17 233.76 Desktop 0 24 86110.04 25129.20 474
1920276 3c3be93c-cc06-40df-ac08-c0c663db97ee 589.03 Desktop 0 22 65111.03 44359.28 471
1920277 f5e7b397-4cb3-4716-a6b2-ba480477b8ea 782.52 Desktop 0 33 30342.53 47478.12 636
1920278 5e5b10c9-a915-4448-952d-8d2fac51ff7f 2429.14 Tablet 1 53 90436.71 3405.45 469
1920279 db3248c8-9432-4eff-a43e-478fc8a13533 235.08 Desktop 0 21 96415.34 33382.39 592
"""
# 'user_id' is only an identifier and carries no signal for modeling, so remove it in place
df.drop(columns=["user_id"], inplace=True)
# Count the missing values in every remaining column and report them
missing_per_column = df.isnull().sum()
print("Missing values:\n", missing_per_column)
"""
amount 0
device_type 0
is_fraud 0
age 0
income 0
debt 0
credit_score 0
"""
# Boxplot visualization for outliers.
# Only numeric feature columns are plotted: the target 'is_fraud' is excluded,
# and the text column 'device_type' is skipped because a boxplot needs numbers.
num_cols = [
    col
    for col in df.select_dtypes(include="number").columns
    if col != "is_fraud"
]
for col in num_cols:
    sns.boxplot(x=df[col])
    # Blocking show: each column gets its own figure, which must be closed
    # before the next one is drawn.
    plt.show(block=True)
# Observation recorded by the author: no outliers.
# Encode categorical feature 'device_type' using LabelEncoder
# NOTE(review): integer codes impose an arbitrary order on the device types;
# one-hot encoding may suit the network better - confirm before changing.
le = LabelEncoder()
df["device_type"] = le.fit_transform(df["device_type"])

# Modeling
# Feature matrix: every column except the target, in the frame's column order.
# It is materialized as float so standardized values can be written back into
# the same array without being truncated.
features = df.drop("is_fraud", axis=1)
X = features.to_numpy(dtype=float)
y = df["is_fraud"].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)

# Scale numerical features using StandardScaler
# The split happens BEFORE scaling on purpose: the scaler learns its mean and
# standard deviation from the training rows only and is then merely applied to
# the held-out rows, so no test-set statistics leak into preprocessing.
scale_cols = [col for col in df.columns if col not in ["device_type", "is_fraud"]]
scale_idx = [features.columns.get_loc(col) for col in scale_cols]
ss = StandardScaler()
X_train[:, scale_idx] = ss.fit_transform(X_train[:, scale_idx])
X_test[:, scale_idx] = ss.transform(X_test[:, scale_idx])
# Define a neural network model: two hidden ReLU layers (64 and 32 units)
# followed by a 2-unit softmax layer that outputs one probability per class.
model = tf.keras.Sequential([
    tf.keras.layers.Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(2, activation='softmax'),
])
# Compile the model
# categorical_crossentropy is the loss that matches a softmax output trained
# against one-hot targets; binary_crossentropy is meant for a single sigmoid
# unit (or independent per-label outputs).
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])
# Convert target variable to categorical (one-hot) vectors.
# tf.keras.utils is used so the encoding comes from the same Keras that builds
# the model, and num_classes pins the width to the model's 2 output units even
# if one of the splits happened to contain a single class.
y_train_categorical = tf.keras.utils.to_categorical(y_train, num_classes=2)
y_test_categorical = tf.keras.utils.to_categorical(y_test, num_classes=2)
# Train the model; the last 20% of the training rows are held out by Keras
# for per-epoch validation.
model.fit(X_train, y_train_categorical, epochs=2, batch_size=32, validation_split=0.2)
# Run the network on the held-out rows; column 1 of the softmax output is
# taken as the score for class 1
class_probabilities = model.predict(X_test)
y_pred_proba = class_probabilities[:, 1]
# Rounding the score turns it into a hard 0/1 label
y_pred_classes = y_pred_proba.round()
# Ranking quality is measured on the raw scores, the rest on the hard labels
roc_auc = roc_auc_score(y_test, y_pred_proba)
accuracy = accuracy_score(y_test, y_pred_classes)
# Recall and F1 are macro-averaged: the unweighted mean over both classes
macro_avg = {"average": 'macro'}
f1 = f1_score(y_test, y_pred_classes, **macro_avg)
recall = recall_score(y_test, y_pred_classes, **macro_avg)
# Assemble the evaluation report and print it
report = f"""
Recall Score: {recall}
F1 Score: {f1}
Accuracy Score: {accuracy}
ROC-AUC Score: {roc_auc}
"""
print(report)
"""
Recall Score: 0.9982765651295072
F1 Score: 0.9983030732345686
Accuracy Score: 0.9983058026260059
ROC-AUC Score: 0.9999883558011051
"""