In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/creditcardfraud/creditcard.csv


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Load dataset
df = pd.read_csv("/kaggle/input/creditcardfraud/creditcard.csv")

# Define features and target
X = df.drop(columns=['Class'])
y = df['Class']

# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42, stratify=y)

# Verify shapes
print("X_train shape:", X_train.shape)  # (Samples, Features)
print("X_test shape:", X_test.shape)    # (Samples, Features)

X_train shape: (227845, 30)
X_test shape: (56962, 30)


# 1. CNN for Credit Card Fraud Detection

In [3]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, Flatten, Dense, Dropout, MaxPooling1D

# Reshape input for CNN (Add channel dimension)
X_train_cnn = X_train.reshape(X_train.shape[0], X_train.shape[1], 1)
X_test_cnn = X_test.reshape(X_test.shape[0], X_test.shape[1], 1)

# Build CNN model
cnn_model = Sequential([
    Conv1D(filters=32, kernel_size=3, activation='relu', input_shape=(X_train.shape[1], 1)),
    MaxPooling1D(pool_size=2),
    Conv1D(filters=64, kernel_size=3, activation='relu'),
    MaxPooling1D(pool_size=2),
    Flatten(),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])
cnn_model.summary()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [4]:
from sklearn.metrics import classification_report

In [5]:
cnn_model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

# Train CNN model
cnn_model.fit(X_train_cnn, y_train, epochs=10, batch_size=32, validation_data=(X_test_cnn, y_test))

# Evaluate CNN Model
y_pred_cnn = (cnn_model.predict(X_test_cnn) > 0.5).astype("int32")
print("CNN Classification Report:\n", classification_report(y_test, y_pred_cnn))

Epoch 1/10
[1m7121/7121[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 2ms/step - accuracy: 0.9977 - loss: 0.0171 - val_accuracy: 0.9992 - val_loss: 0.0037
Epoch 2/10
[1m7121/7121[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 2ms/step - accuracy: 0.9993 - loss: 0.0041 - val_accuracy: 0.9993 - val_loss: 0.0045
Epoch 3/10
[1m7121/7121[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 2ms/step - accuracy: 0.9993 - loss: 0.0038 - val_accuracy: 0.9992 - val_loss: 0.0035
Epoch 4/10
[1m7121/7121[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 2ms/step - accuracy: 0.9994 - loss: 0.0043 - val_accuracy: 0.9994 - val_loss: 0.0031
Epoch 5/10
[1m7121/7121[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 2ms/step - accuracy: 0.9994 - loss: 0.0037 - val_accuracy: 0.9992 - val_loss: 0.0052
Epoch 6/10
[1m7121/7121[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 2ms/step - accuracy: 0.9994 - loss: 0.0031 - val_accuracy: 0.9995 - val_loss: 0.0029
Epoch 7/10

## Why CNN?
-  Detects fraud patterns using filters & convolutions
-  Works well with high-dimensional tabular data

#  2. RNN (LSTM) for Fraud Detection

In [6]:
from tensorflow.keras.layers import LSTM

# Reshape input for LSTM (time-step = 1)
X_train_lstm = X_train.reshape((X_train.shape[0], 1, X_train.shape[1]))
X_test_lstm = X_test.reshape((X_test.shape[0], 1, X_test.shape[1]))

# Build LSTM Model
lstm_model = Sequential([
    LSTM(64, return_sequences=True, input_shape=(1, X_train.shape[1])),
    Dropout(0.2),
    LSTM(32, return_sequences=False),
    Dropout(0.3),
    Dense(1, activation='sigmoid')
])
lstm_model.summary()

  super().__init__(**kwargs)


In [7]:
lstm_model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

# Train LSTM model
lstm_model.fit(X_train_lstm, y_train, epochs=10, batch_size=32, validation_data=(X_test_lstm, y_test))

# Evaluate LSTM Model
y_pred_lstm = (lstm_model.predict(X_test_lstm) > 0.5).astype("int32")
print("LSTM Classification Report:\n", classification_report(y_test, y_pred_lstm))

Epoch 1/10
[1m7121/7121[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 5ms/step - accuracy: 0.9949 - loss: 0.0469 - val_accuracy: 0.9993 - val_loss: 0.0029
Epoch 2/10
[1m7121/7121[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 5ms/step - accuracy: 0.9994 - loss: 0.0031 - val_accuracy: 0.9994 - val_loss: 0.0030
Epoch 3/10
[1m7121/7121[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 5ms/step - accuracy: 0.9995 - loss: 0.0024 - val_accuracy: 0.9993 - val_loss: 0.0029
Epoch 4/10
[1m7121/7121[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 5ms/step - accuracy: 0.9995 - loss: 0.0025 - val_accuracy: 0.9994 - val_loss: 0.0029
Epoch 5/10
[1m7121/7121[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 5ms/step - accuracy: 0.9995 - loss: 0.0021 - val_accuracy: 0.9993 - val_loss: 0.0028
Epoch 6/10
[1m7121/7121[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 5ms/step - accuracy: 0.9994 - loss: 0.0019 - val_accuracy: 0.9993 - val_loss: 0.0028
Epoch 7/10

## Why LSTM?
-  Works well with sequential data (like transaction histories)
-  Captures long-term dependencies in transactions

# 3. Autoencoder for Anomaly Detection

In [8]:
from tensorflow.keras.layers import Input
from sklearn.metrics import mean_squared_error

# Define Autoencoder Model
input_dim = X_train.shape[1]

autoencoder = Sequential([
    Dense(32, activation='relu', input_shape=(input_dim,)),
    Dense(16, activation='relu'),
    Dense(8, activation='relu'),
    Dense(16, activation='relu'),
    Dense(32, activation='relu'),
    Dense(input_dim, activation='sigmoid')  # Reconstruct input
])
autoencoder.summary()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [11]:
# Compile Autoencoder
autoencoder.compile(optimizer="adam", loss="mse")

# Train Autoencoder only on normal transactions (y == 0)
X_train_normal = X_train[y_train == 0]
autoencoder.fit(X_train_normal, X_train_normal, epochs=10, batch_size=32, shuffle=True)

# Compute reconstruction error on test data
X_test_pred = autoencoder.predict(X_test)
reconstruction_error = mean_squared_error(X_test, X_test_pred, multioutput='raw_values')

# Set threshold for anomaly detection
threshold = np.percentile(reconstruction_error, 95)
y_pred_autoencoder = (reconstruction_error > threshold).astype("int32")

# ✅ Removed Classification Report
print("Threshold for anomaly detection:", threshold)
print("Number of predicted fraud cases:", np.sum(y_pred_autoencoder))


Epoch 1/10
[1m7108/7108[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 1ms/step - loss: 0.7339
Epoch 2/10
[1m7108/7108[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 1ms/step - loss: 0.7452
Epoch 3/10
[1m7108/7108[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 1ms/step - loss: 0.7352
Epoch 4/10
[1m7108/7108[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 1ms/step - loss: 0.7312
Epoch 5/10
[1m7108/7108[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 1ms/step - loss: 0.7395
Epoch 6/10
[1m7108/7108[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 1ms/step - loss: 0.7414
Epoch 7/10
[1m7108/7108[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 1ms/step - loss: 0.7301
Epoch 8/10
[1m7108/7108[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 1ms/step - loss: 0.7421
Epoch 9/10
[1m7108/7108[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 1ms/step - loss: 0.7420
Epoch 10/10
[1m7108/7108[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1

In [12]:
from sklearn.metrics import classification_report
import numpy as np

# Compute reconstruction error (Mean Squared Error)
reconstruction_error = np.mean(np.abs(X_test - y_pred_autoencoder), axis=1)

# Set a threshold for anomaly detection (can be fine-tuned)
threshold = np.percentile(reconstruction_error, 95)  # Top 5% anomalies

# Convert reconstruction error into binary classification
y_pred_autoencoder = (reconstruction_error > threshold).astype(int)

# ✅ Now y_pred_autoencoder has the correct shape (56962,)
print("Autoencoder Classification Report:\n", classification_report(y_test, y_pred_autoencoder))

Autoencoder Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.95      0.97     56864
           1       0.03      0.86      0.06        98

    accuracy                           0.95     56962
   macro avg       0.51      0.90      0.52     56962
weighted avg       1.00      0.95      0.97     56962



## Why Autoencoder?
-  Learns normal transaction behavior
-   Detects anomalies (fraud) as high reconstruction errors