In [1]:
import pandas as pd
import numpy as np
import seaborn as sb
import matplotlib.pyplot as plt

In [2]:
data = pd.read_csv("/kaggle/input/paysim1/PS_20174392719_1491204439457_log.csv")
data

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.00,160296.36,M1979787155,0.00,0.00,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.00,19384.72,M2044282225,0.00,0.00,0,0
2,1,TRANSFER,181.00,C1305486145,181.00,0.00,C553264065,0.00,0.00,1,0
3,1,CASH_OUT,181.00,C840083671,181.00,0.00,C38997010,21182.00,0.00,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.00,29885.86,M1230701703,0.00,0.00,0,0
...,...,...,...,...,...,...,...,...,...,...,...
6362615,743,CASH_OUT,339682.13,C786484425,339682.13,0.00,C776919290,0.00,339682.13,1,0
6362616,743,TRANSFER,6311409.28,C1529008245,6311409.28,0.00,C1881841831,0.00,0.00,1,0
6362617,743,CASH_OUT,6311409.28,C1162922333,6311409.28,0.00,C1365125890,68488.84,6379898.11,1,0
6362618,743,TRANSFER,850002.52,C1685995037,850002.52,0.00,C2080388513,0.00,0.00,1,0


In [3]:
data.describe()

Unnamed: 0,step,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
count,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0
mean,243.3972,179861.9,833883.1,855113.7,1100702.0,1224996.0,0.00129082,2.514687e-06
std,142.332,603858.2,2888243.0,2924049.0,3399180.0,3674129.0,0.0359048,0.001585775
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,156.0,13389.57,0.0,0.0,0.0,0.0,0.0,0.0
50%,239.0,74871.94,14208.0,0.0,132705.7,214661.4,0.0,0.0
75%,335.0,208721.5,107315.2,144258.4,943036.7,1111909.0,0.0,0.0
max,743.0,92445520.0,59585040.0,49585040.0,356015900.0,356179300.0,1.0,1.0


In [4]:
Q1 = data['amount'].quantile(0.25)
Q3 = data['amount'].quantile(0.75)

# Calculate IQR
IQR = Q3 - Q1

# Define outliers
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
data_clean = data[(data['amount'] >= lower_bound) & (data['amount'] <= upper_bound)]

data_clean = data_clean.drop('isFlaggedFraud', axis=1)
data_clean = data_clean.drop('step', axis=1)
data_clean = data_clean.drop(['nameOrig','nameDest'], axis=1)
data_clean = data_clean[(data_clean['type']=='TRANSFER') | (data_clean['type']=='CASH_OUT')]

from sklearn import preprocessing 
label_encoder = preprocessing.LabelEncoder()
data_clean['type']= label_encoder.fit_transform(data_clean['type'])

data_clean = data_clean.drop('isFraud', axis=1)
data_clean = data_clean[:4350]

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
data_clean = scaler.fit_transform(data_clean.values)

data_clean

array([[ 1.69617374, -1.36092024, -0.20493388, -0.1312186 , -0.45896354,
        -0.57421594],
       [-0.58956225, -1.36092024, -0.20493388, -0.1312186 , -0.45194604,
        -0.57421594],
       [-0.58956225,  0.44024546, -0.15108933, -0.1312186 , -0.45727957,
        -0.56080362],
       ...,
       [-0.58956225, -1.31690549, -0.20557743, -0.1312186 , -0.43286494,
        -0.55220118],
       [-0.58956225, -0.36946241, -0.20557743, -0.1312186 , -0.24910185,
        -0.31710762],
       [-0.58956225, -0.37926434, -0.20557743, -0.1312186 ,  0.88336574,
         0.92002473]])

In [5]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from torch.utils.data import DataLoader, Dataset
from torch.autograd import Variable, grad

In [6]:
# Generator Network
class Generator(nn.Module):
    def __init__(self):
        super(Generator, self).__init__()
        self.model = nn.Sequential(
            nn.Linear(100, 128),
            nn.ReLU(),
            nn.Linear(128, 256),
            nn.ReLU(),
            nn.Linear(256, 512),
            nn.ReLU(),
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Linear(256, 6),
            nn.Tanh()  # Output values will be normalized between -1 and 1
        )

    def forward(self, x):
        return self.model(x)

# Discriminator Network
class Critic(nn.Module):
    def __init__(self):
        super(Critic, self).__init__()
        self.fc = nn.Sequential(
            nn.Linear(6, 512),
            nn.LeakyReLU(0.2, inplace=True),
            nn.Linear(512, 256),
            nn.LeakyReLU(0.2, inplace=True),
            nn.Linear(256, 128),
            nn.LeakyReLU(0.2, inplace=True),
            nn.Linear(128, 1)  # Outputs a real-valued score
        )

    def forward(self, x):
        score = self.fc(x)
        return score

In [7]:
def compute_gradient_penalty(critic, real_samples, fake_samples):
    alpha = torch.rand(real_samples.size(0), 1, 1, 1).to(real_samples.device)
    interpolates = (alpha * real_samples + (1 - alpha) * fake_samples).requires_grad_(True)

    scores = critic(interpolates)

    # Compute gradients of the critic output with respect to the interpolated samples
    gradients = grad(
        outputs=scores, # The critic scores on the interpolated samples
        inputs=interpolates, # The interpolated samples themselves
        grad_outputs=torch.ones(scores.size()).to(real_samples.device),  # Gradients w.r.t. critic output
        create_graph=True,  # Retain computational graph for higher order derivatives
        retain_graph=True,  # Retain the graph for backpropagation
        only_inputs=True    # Only compute gradients for interpolates
    )[0]

     # Compute the L2 norm of the gradients for each sample
    gradients = gradients.view(gradients.size(0), -1)    # Flatten the gradients
    gradient_penalty = ((gradients.norm(2, dim=1) - 1) ** 2).mean()   #  Compute the gradient penalty (||grad||_2 - 1)^2

    return gradient_penalty

In [8]:
lambda_gp = 10  
critic_iters = 5  

generator = Generator()
critic = Critic()


# Set device to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
generator.to(device)
critic.to(device)


Critic(
  (fc): Sequential(
    (0): Linear(in_features=6, out_features=512, bias=True)
    (1): LeakyReLU(negative_slope=0.2, inplace=True)
    (2): Linear(in_features=512, out_features=256, bias=True)
    (3): LeakyReLU(negative_slope=0.2, inplace=True)
    (4): Linear(in_features=256, out_features=128, bias=True)
    (5): LeakyReLU(negative_slope=0.2, inplace=True)
    (6): Linear(in_features=128, out_features=1, bias=True)
  )
)

In [9]:
# Binary Cross Entropy Loss
adversarial_loss = nn.BCELoss()

# Optimizers for Generator and Discriminator
optimizer_G = optim.Adam(generator.parameters(), lr=0.001 ,betas=(0.5, 0.9))
optimizer_C = optim.Adam(critic.parameters(), lr=0.001,betas=(0.5, 0.9))

In [10]:
# Training Parameters
n_epochs = 500
batch_size = 58

# Sample real transaction data for training
class FraudDataset(Dataset):
    def __init__(self, data):
        self.data = torch.tensor(data, dtype=torch.float32)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]

# Load your real transaction data here (after preprocessing)
# real_transactions = ...  # Placeholder for real transaction data (e.g., from Kaggle dataset)
dataset = FraudDataset(data_clean)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

for epoch in range(n_epochs):
    for i , (real_data) in enumerate(dataloader):
        real_data = real_data.to(device)

        # ---------------------
        #  Train Discriminator
        # ---------------------
        optimizer_C.zero_grad()

        # Generate fake transactions
        z = torch.randn(batch_size, 100).to(device)
        fake_data = generator(z)

        # Real transactions as labels = 1
        real_labels = torch.ones(real_data.size(0), 1).to(device)
        # Fake transactions as labels = 0
        fake_labels = torch.zeros(fake_data.size(0), 1).to(device)

        # Discriminator loss for real and fake data
        real_score = critic(real_data)
        fake_score = critic(fake_data)
        gradient_penalty = compute_gradient_penalty(critic, real_data, fake_data)
        loss_C = fake_score.mean() - real_score.mean() + lambda_gp * gradient_penalty
        loss_C.backward()
        optimizer_C.step()

         # Train generator every critic_iters iterations
        if i % critic_iters == 0:
            # ---------------------
            #  Train Generator
            # ---------------------
            optimizer_G.zero_grad()

            # Generate fake images
            z = torch.randn(batch_size, 100).to(device)
            fake_data = generator(z)

            # Generator loss (minimize the critic's score for fake images)
            loss_G = -critic(fake_data).mean()

            loss_G.backward()
            optimizer_G.step()

    # Print progress every few epochs
    if epoch % 10 == 0:
        print(f'Epoch {epoch}, critic Loss: {loss_C.item()}, Generator Loss: {loss_G.item()}')


Epoch 0, critic Loss: -0.06520448625087738, Generator Loss: 0.10125872492790222
Epoch 10, critic Loss: -0.10147793591022491, Generator Loss: -0.16862978041172028
Epoch 20, critic Loss: -0.03902210295200348, Generator Loss: -0.1436454951763153
Epoch 30, critic Loss: -0.09354464709758759, Generator Loss: -0.10645350068807602
Epoch 40, critic Loss: -0.06352050602436066, Generator Loss: -0.18661457300186157
Epoch 50, critic Loss: 0.02928391844034195, Generator Loss: -0.14876045286655426
Epoch 60, critic Loss: -0.08103220164775848, Generator Loss: -0.19962535798549652
Epoch 70, critic Loss: 0.21582457423210144, Generator Loss: -0.26792824268341064
Epoch 80, critic Loss: -0.040969207882881165, Generator Loss: -0.29057979583740234
Epoch 90, critic Loss: -0.15585972368717194, Generator Loss: -0.07562022656202316
Epoch 100, critic Loss: -0.013955190777778625, Generator Loss: -0.02125595510005951
Epoch 110, critic Loss: 0.08650507032871246, Generator Loss: -0.05278163030743599
Epoch 120, critic 

In [11]:
# Generate synthetic fraud transactions
generator.eval()
noise = torch.randn(10, 100).to(device)
synthetic_data = generator(noise).detach().cpu().numpy()
print("Synthetic Fraud Transactions Generated: \n", pd.DataFrame(scaler.inverse_transform(synthetic_data)))


Synthetic Fraud Transactions Generated: 
           0              1             2            3             4  \
0  0.013349  300286.437500 -38221.015625 -7194.257324  9.442073e+05   
1  0.106872   99173.031250  63362.812500 -1737.460571  4.403814e+06   
2  0.012918   46072.472656  -5120.703125 -1557.023071 -4.776933e+04   
3  0.003257  124761.554688    262.648529 -1163.800415  5.149829e+05   
4  0.004812  133287.531250   -566.609314 -1260.433228  8.676694e+05   
5  0.005836   67985.937500   8019.152344   456.031586  4.184739e+05   
6  0.026435   46058.925781 -17492.164062 -3538.312256 -2.745848e+05   
7  0.005743  188694.046875  -1383.472534 -2600.136475  6.677405e+05   
8  0.005212   71834.039062   6739.746094   287.008148  4.461637e+05   
9  0.695328   46059.175781  -6985.242188 -4330.319824 -1.656108e+05   

              5  
0  6.013998e+06  
1  6.046178e+06  
2  6.821845e+04  
3  7.759944e+05  
4  1.903966e+06  
5  1.721898e+06  
6 -4.055610e+05  
7  7.975183e+05  
8  1.559801e+0