#  Reinforcement Learning for Fraud Detection
This notebook uses **Stable-Baselines3** and **RLlib** to train a full RL agent to detect online fraud based on reward feedback. 

In [8]:
!pip install stable-baselines3[extra] ray[rllib]

Collecting ray[rllib]
  Using cached ray-2.10.0-cp38-cp38-win_amd64.whl.metadata (13 kB)
Collecting msgpack<2.0.0,>=1.0.0 (from ray[rllib])
  Downloading msgpack-1.1.0-cp38-cp38-win_amd64.whl.metadata (8.6 kB)
Collecting aiosignal (from ray[rllib])
  Downloading aiosignal-1.3.1-py3-none-any.whl.metadata (4.0 kB)
Collecting frozenlist (from ray[rllib])
  Downloading frozenlist-1.5.0-cp38-cp38-win_amd64.whl.metadata (14 kB)
Collecting tensorboardX>=1.9 (from ray[rllib])
  Downloading tensorboardX-2.6.2.2-py2.py3-none-any.whl.metadata (5.8 kB)
Collecting dm-tree (from ray[rllib])
  Downloading dm_tree-0.1.8-cp38-cp38-win_amd64.whl.metadata (2.0 kB)
INFO: pip is looking at multiple versions of ray[rllib] to determine which version is compatible with other requirements. This could take a while.
Collecting ray[rllib]
  Using cached ray-2.9.3-cp38-cp38-win_amd64.whl.metadata (14 kB)
Collecting pyarrow<7.0.0,>=6.0.1 (from ray[rllib])
  Downloading pyarrow-6.0.1-cp38-cp38-win_amd64.whl.metadata

## Step 2: Prepare Environment and Dataset

In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import gymnasium as gym
from gymnasium import spaces
import numpy as np

# Load and prepare data
df = pd.read_csv("enhanced_online_fraud_dataset.csv")
features = ['amount', 'balance_diff_org', 'balance_diff_dest', 'txn_ratio', 'is_large_txn', 'is_receiver_zero_before']
X = df[features]
y = df['isFraud']

# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Creating a custom Gym environment
class FraudEnv(gym.Env):
    def __init__(self, X, y):
        super(FraudEnv, self).__init__()
        self.X = X
        self.y = y
        self.current_step = 0
        self.action_space = spaces.Discrete(2)  # 0: not fraud, 1: fraud
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(X.shape[1],), dtype=np.float32)

    def reset(self, seed=None, options=None):
        self.current_step = 0
        obs = self.X[self.current_step].astype(np.float32)
        return obs, {}

    def step(self, action):
        label = self.y[self.current_step]
        reward = 1 if action == label else -1
        self.current_step += 1
        done = self.current_step >= len(self.X)
        obs = self.X[self.current_step].astype(np.float32) if not done else np.zeros_like(self.X[0], dtype=np.float32)
        return obs, reward, done, False, {}

env = FraudEnv(X_train, y_train.values)


## Step 3: Trainin Agent using Stable-Baselines3 (DQN)

In [None]:
from stable_baselines3 import DQN
from stable_baselines3.common.env_checker import check_env

# Checking environment compliance
check_env(env, warn=True)

# Train agent
model = DQN("MlpPolicy", env, verbose=1, learning_rate=1e-3, buffer_size=10000, learning_starts=1000, batch_size=32, train_freq=1, target_update_interval=500)
model.learn(total_timesteps=5000)

# Save model
model.save("dqn_fraud_model")
print("✅ DQN model saved as 'dqn_fraud_model'")

AssertionError: The observation returned by the `reset()` method does not match the data type (cannot cast) of the given observation space Box(-inf, inf, (6,), float32). Expected: float32, actual dtype: float64

## Step 4: Evaluatin the RL Agent on Test Set

In [None]:
# Evaluating d model
env_test = FraudEnv(X_test, y_test.values)
obs = env_test.reset()
total_reward = 0

for _ in range(len(X_test)):
    action, _ = model.predict(obs, deterministic=True)
    obs, reward, done, _ = env_test.step(action)
    total_reward += reward
    if done:
        break

print(f"🎯 Total Reward on Test Set: {total_reward}")

## Using RLlib for Distributed RL

In [None]:
# import ray
# from ray import tune
# from ray.rllib.agents.dqn import DQNTrainer

# ray.init(ignore_reinit_error=True)

# # Register environment
# from ray.tune.registry import register_env
# def env_creator(_):
#     return FraudEnv(X_train, y_train.values)
# register_env("FraudEnv", env_creator)

# # Train using RLlib DQNTrainer
# analysis = tune.run(
#     "DQN",
#     stop={"training_iteration": 10},
#     config={
#         "env": "FraudEnv",
#         "framework": "torch",
#         "lr": 1e-3,
#         "num_workers": 1,
#     },
#     verbose=1
# )