### BAF GAN ile Yapay Veri

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, StandardScaler
import matplotlib.pyplot as plt
import time

  from pandas.core.computation.check import NUMEXPR_INSTALLED
  from pandas.core import (


In [2]:
df = pd.read_csv('base.csv')  

categorical_columns = ['income', 'customer_age', 'payment_type', 'employment_status', 'housing_status', 'source', 'device_os', 'device_distinct_emails_8w', 'device_fraud_count', 'month']  # Kendi kategorik sütunlarınızı ekleyin

encoder = OneHotEncoder(sparse=False, drop='first')
encoded_df = pd.DataFrame(encoder.fit_transform(df[categorical_columns]))

encoded_df.columns = encoder.get_feature_names_out(categorical_columns)

df = df.drop(columns=categorical_columns) 
df = pd.concat([df, encoded_df], axis=1)  

In [3]:
df.head()

Unnamed: 0,fraud_bool,name_email_similarity,prev_address_months_count,current_address_months_count,days_since_request,intended_balcon_amount,zip_count_4w,velocity_6h,velocity_24h,velocity_4w,...,device_distinct_emails_8w_0,device_distinct_emails_8w_1,device_distinct_emails_8w_2,month_1,month_2,month_3,month_4,month_5,month_6,month_7
0,0,0.986506,-1,25,0.006735,102.453711,1059,13096.035018,7850.955007,6742.080561,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,0.617426,-1,89,0.010095,-0.849551,1658,9223.283431,5745.251481,5941.664859,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,0.996707,9,14,0.012316,-1.490386,1095,4471.472149,5471.988958,5992.555113,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0,0.4751,11,14,0.006991,-1.863101,3483,14431.993621,6755.344479,5970.336831,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0,0.842307,-1,29,5.742626,47.152498,2339,7601.511579,5124.04693,5940.734212,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
columns_to_scale = df.columns.difference(encoded_df.columns)  # Encode edilmemiş sütunlar

scaler = StandardScaler()
scaled_df = df.copy()
scaled_df[columns_to_scale] = scaler.fit_transform(df[columns_to_scale])

In [6]:
latent_dim = 100
num_epochs = 20
batch_size = 64

class Generator(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(Generator, self).__init__()
        self.model = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Linear(128, output_dim),
            nn.Tanh()
        )

    def forward(self, x):
        return self.model(x)

class Discriminator(nn.Module):
    def __init__(self, input_dim):
        super(Discriminator, self).__init__()
        self.model = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 1),
            nn.Sigmoid()
        )

    def forward(self, x):
        return self.model(x)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

generator = Generator(input_dim=latent_dim, output_dim=scaled_df.shape[1]).to(device)
discriminator = Discriminator(input_dim=scaled_df.shape[1]).to(device)

optimizer_g = optim.Adam(generator.parameters(), lr=0.0002)
optimizer_d = optim.Adam(discriminator.parameters(), lr=0.0002)
criterion = nn.BCELoss()

data_tensor = torch.tensor(scaled_df.values, dtype=torch.float32).to(device)
train_loader = DataLoader(TensorDataset(data_tensor), batch_size=batch_size, shuffle=True)

def train_model():
    start_time = time.time()
    for epoch in range(num_epochs):
        for real_data in train_loader:
            real_data = real_data[0]

            batch_size = real_data.size(0)
            real_labels = torch.ones(batch_size, 1).to(device)
            fake_labels = torch.zeros(batch_size, 1).to(device)

            optimizer_d.zero_grad()
            outputs = discriminator(real_data)
            d_loss_real = criterion(outputs, real_labels)
            d_loss_real.backward()

            z = torch.randn(batch_size, latent_dim).to(device)
            fake_data = generator(z)
            outputs = discriminator(fake_data.detach())
            d_loss_fake = criterion(outputs, fake_labels)
            d_loss_fake.backward()
            optimizer_d.step()

            optimizer_g.zero_grad()
            outputs = discriminator(fake_data)
            g_loss = criterion(outputs, real_labels)
            g_loss.backward()
            optimizer_g.step()

        print(f'Epoch [{epoch+1}/{num_epochs}] | D Loss: {d_loss_real.item() + d_loss_fake.item()} | G Loss: {g_loss.item()}')
    
    end_time = time.time()
    return end_time - start_time

if device.type == 'cuda':
    gpu_time = train_model()
    print(f"GPU eğitim süresi: {gpu_time:.2f} saniye")
else:
    cpu_time = train_model()
    print(f"CPU eğitim süresi: {cpu_time:.2f} saniye")


Epoch [1/20] | D Loss: 0.6731310486793518 | G Loss: 1.508975625038147
Epoch [2/20] | D Loss: 0.30043816566467285 | G Loss: 2.3337020874023438
Epoch [3/20] | D Loss: 0.18001993000507355 | G Loss: 3.4846248626708984
Epoch [4/20] | D Loss: 0.07149004004895687 | G Loss: 3.3612284660339355
Epoch [5/20] | D Loss: 0.03926484752446413 | G Loss: 4.341398239135742
Epoch [6/20] | D Loss: 0.028377607464790344 | G Loss: 4.7615647315979
Epoch [7/20] | D Loss: 0.015369255093901302 | G Loss: 4.975200653076172
Epoch [8/20] | D Loss: 0.026982859708368778 | G Loss: 5.356912612915039
Epoch [9/20] | D Loss: 0.008024647468118928 | G Loss: 6.384058475494385
Epoch [10/20] | D Loss: 0.03985612466931343 | G Loss: 6.252026557922363
Epoch [11/20] | D Loss: 0.011148582596206325 | G Loss: 6.298121929168701
Epoch [12/20] | D Loss: 0.019673853181302547 | G Loss: 6.340665817260742
Epoch [13/20] | D Loss: 0.022295768838375807 | G Loss: 6.545035362243652
Epoch [14/20] | D Loss: 0.008048539748415351 | G Loss: 6.76337003

In [7]:
num_samples = 1000000
noise = torch.randn(num_samples, latent_dim)

generated_data = generator(noise).detach().numpy()

generated_df = pd.DataFrame(generated_data, columns=scaled_df.columns)

generated_df[columns_to_scale] = scaler.inverse_transform(generated_df[columns_to_scale])

categorical_output = pd.DataFrame(encoder.inverse_transform(generated_df[encoded_df.columns]))
categorical_output.columns = categorical_columns

generated_df = generated_df.drop(columns=encoded_df.columns)
generated_df = pd.concat([generated_df, categorical_output], axis=1)

In [8]:
generated_df

Unnamed: 0,fraud_bool,name_email_similarity,prev_address_months_count,current_address_months_count,days_since_request,intended_balcon_amount,zip_count_4w,velocity_6h,velocity_24h,velocity_4w,...,income,customer_age,payment_type,employment_status,housing_status,source,device_os,device_distinct_emails_8w,device_fraud_count,month
0,0.000447,0.204684,2.702811,70.169731,-0.052859,-0.382075,784.069336,8504.684570,3290.570068,4600.003906,...,0.2,30,AB,CB,BB,TELEAPP,windows,1,0,3
1,0.001885,0.782818,3.069206,2.013908,0.079433,-2.157656,2577.997070,8674.651367,6247.224609,5092.886719,...,0.2,20,AC,CD,BB,TELEAPP,windows,1,0,3
2,-0.001922,0.204570,10.759277,97.225716,0.028109,-0.984463,954.362122,8674.675781,6245.492188,4833.170410,...,0.9,30,AB,CB,BB,TELEAPP,other,1,0,3
3,0.002730,0.208721,36.008873,-1.526658,0.195154,-1.759615,786.242981,8674.675781,6052.395020,5068.629395,...,0.2,20,AC,CD,BC,TELEAPP,macintosh,1,0,3
4,0.004260,0.782461,3.433317,131.136383,0.079333,-1.607598,973.947754,2694.678467,5646.514648,4751.785156,...,0.2,30,AC,CD,BB,TELEAPP,macintosh,1,0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999995,0.000591,0.204610,-1.936128,81.806961,0.014792,-1.416410,1068.948975,8666.314453,5816.229004,4812.607910,...,0.2,40,AB,CD,BB,TELEAPP,macintosh,1,0,3
999996,0.003705,0.204936,16.410046,3.609832,0.071203,-1.363992,742.711060,8651.046875,5900.079590,4691.570801,...,0.2,20,AC,CD,BC,TELEAPP,macintosh,1,0,3
999997,0.000912,0.759650,60.764149,4.912353,0.005283,-1.012112,735.335510,2655.919189,5772.518066,4748.659668,...,0.2,20,AC,CD,BC,TELEAPP,other,1,0,3
999998,0.001634,0.204648,2.835705,99.907539,-0.017335,-0.615986,985.256531,8641.280273,5575.520508,4754.000977,...,0.3,40,AB,CB,BB,TELEAPP,macintosh,1,0,3


In [9]:
generated_df['fraud_bool'] =generated_df['fraud_bool'].apply(lambda x: 1 if x > 0.05 else 0)

In [10]:
generated_df['fraud_bool'].value_counts()

fraud_bool
0    1000000
Name: count, dtype: int64

In [5]:
df = pd.read_csv('base.csv')  

In [6]:
features = df.columns[:-1]

for feature in features:
    plt.figure(figsize=(16, 10))
    
    min_val = min(df[feature].min(), generated_df[feature].min())
    max_val = max(df[feature].max(), generated_df[feature].max())
    
    plt.hist(df[feature], bins=30, alpha=0.3, label='Gerçek Veri', color='purple', range=(min_val, max_val))
    
    plt.hist(generated_df[feature], bins=30, alpha=0.5, label='Yapay Veri', color='green', range=(min_val, max_val))
    
    plt.title(f'{feature} - Gerçek ve Yapay Veriler')
    plt.xlabel(feature)
    plt.ylabel('Frekans')
    plt.legend()
    
    plt.tight_layout()
    plt.show()

NameError: name 'generated_df' is not defined

<Figure size 1152x720 with 0 Axes>