### BAF GAN ile Yapay Veri

In [42]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, StandardScaler
import matplotlib.pyplot as plt
import time
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import numpy as np

In [15]:
# Load the raw dataset.
df = pd.read_csv('sample10.csv')

# Columns that are one-hot encoded instead of being used as numeric features.
categorical_columns = [
    'income', 'customer_age', 'payment_type', 'employment_status',
    'housing_status', 'source', 'device_os', 'device_distinct_emails_8w',
    'device_fraud_count', 'month',
]

# scikit-learn 1.2 renamed `sparse` to `sparse_output` and 1.4 removed the old
# keyword; try the current name first and fall back for older installations so
# the encoder returns a dense array either way.
try:
    encoder = OneHotEncoder(sparse_output=False, drop='first')
except TypeError:
    encoder = OneHotEncoder(sparse=False, drop='first')

# Reuse df's index so the concat below aligns row-for-row even if `df` does
# not carry a default RangeIndex.
encoded_df = pd.DataFrame(
    encoder.fit_transform(df[categorical_columns]),
    columns=encoder.get_feature_names_out(categorical_columns),
    index=df.index,
)

# Replace the original categorical columns with their one-hot indicators.
df = df.drop(columns=categorical_columns)
df = pd.concat([df, encoded_df], axis=1)

In [16]:
# Show the first rows of `df` as a quick sanity check.
df.head()

Unnamed: 0,name_email_similarity,prev_address_months_count,current_address_months_count,days_since_request,intended_balcon_amount,zip_count_4w,velocity_6h,velocity_24h,velocity_4w,bank_branch_count_8w,...,device_distinct_emails_8w_0,device_distinct_emails_8w_1,device_distinct_emails_8w_2,month_1,month_2,month_3,month_4,month_5,month_6,month_7
0,0.423184,-1,174,0.022402,3.354768,1140,3659.730906,6041.862379,6791.873595,278,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.246312,-1,60,0.009289,-0.305261,1118,4950.849565,4039.940368,4141.882552,14,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,0.921065,29,148,0.009191,-1.054443,399,574.335478,2493.948454,3726.858015,5,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0.547707,56,8,0.004642,-1.124991,4062,3484.847833,4210.51295,4288.197328,172,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.860882,-1,46,0.005121,-1.067835,1052,14073.211653,5469.019495,4997.056511,11,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [17]:
# Select the columns to standardise: everything except the one-hot indicator
# columns and the `fraud_bool` target. The two exclusions are chained; the
# earlier version assigned `columns_to_scale` twice, so the second assignment
# silently discarded the exclusion of the encoded columns.
columns_to_scale = (
    df.columns
    .difference(encoded_df.columns)
    .difference(['fraud_bool'])
)

# Apply the scaler (fitted once) and store the result in a copy so `df`
# itself stays unscaled.
scaler = StandardScaler()
scaled_df = df.copy()
scaled_df[columns_to_scale] = scaler.fit_transform(df[columns_to_scale])

In [18]:
# GAN hyperparameters.
latent_dim = 100  # length of the noise vector fed to the generator
num_epochs = 20   # number of passes over the training loader
batch_size = 64   # batch size handed to the DataLoader


class Generator(nn.Module):
    """Fully connected generator that maps a latent vector to a feature row.

    Three hidden stages (Linear -> ReLU -> BatchNorm1d) widen the input to
    256, 512 and 1024 units; a final Linear followed by Tanh projects to
    ``output_dim`` values in [-1, 1].

    Args:
        input_dim: Size of the latent input vector.
        output_dim: Number of values produced per sample.
    """

    def __init__(self, input_dim, output_dim):
        super().__init__()
        hidden_widths = (256, 512, 1024)

        stack = []
        in_features = input_dim
        for out_features in hidden_widths:
            stack += [
                nn.Linear(in_features, out_features),
                nn.ReLU(),
                nn.BatchNorm1d(out_features),
            ]
            in_features = out_features
        stack += [nn.Linear(in_features, output_dim), nn.Tanh()]

        # Positional Sequential keeps the submodule names (model.0, model.1, ...)
        # identical to a hand-written layer list.
        self.model = nn.Sequential(*stack)

    def forward(self, x):
        """Return the generated batch for the latent batch ``x``."""
        return self.model(x)


class Discriminator(nn.Module):
    """Small MLP that scores a feature row with a value in [0, 1].

    The network is Linear(input_dim, 128) -> ReLU -> Linear(128, 1) -> Sigmoid.

    Args:
        input_dim: Number of features per input row.
    """

    def __init__(self, input_dim):
        super().__init__()
        hidden_layer = nn.Linear(input_dim, 128)
        output_layer = nn.Linear(128, 1)
        self.model = nn.Sequential(
            hidden_layer,
            nn.ReLU(),
            output_layer,
            nn.Sigmoid(),
        )

    def forward(self, x):
        """Return one sigmoid score per row of ``x``."""
        return self.model(x)

# Use the GPU when PyTorch reports one, otherwise fall back to the CPU.
use_cuda = torch.cuda.is_available()
device = torch.device('cuda') if use_cuda else torch.device('cpu')

# Both networks operate on full rows of `scaled_df`, so they share its width.
n_features = scaled_df.shape[1]
generator = Generator(input_dim=latent_dim, output_dim=n_features).to(device)
discriminator = Discriminator(input_dim=n_features).to(device)

# One Adam optimiser per network, plus binary cross-entropy as the loss.
optimizer_g = optim.Adam(generator.parameters(), lr=0.0002)
optimizer_d = optim.Adam(discriminator.parameters(), lr=0.0002)
criterion = nn.BCELoss()

# Keep the whole table on the target device and serve it in shuffled batches.
data_tensor = torch.tensor(scaled_df.values, dtype=torch.float32).to(device)
real_dataset = TensorDataset(data_tensor)
train_loader = DataLoader(real_dataset, batch_size=batch_size, shuffle=True)

def train_model():
    """Train the module-level GAN and return the elapsed wall-clock seconds.

    Uses the module-level ``generator``, ``discriminator``, ``optimizer_g``,
    ``optimizer_d``, ``criterion``, ``train_loader``, ``device``,
    ``latent_dim`` and ``num_epochs``. For every batch the discriminator is
    updated on real rows (label 1) and generated rows (label 0), then the
    generator is updated so its output is scored as real. After each epoch
    the losses of the last processed batch are printed; ``nan`` is printed
    when no batch was processed in that epoch.

    Returns:
        float: Seconds elapsed between the start and end of training.
    """
    # Make sure BatchNorm layers use batch statistics even if a caller left
    # the networks in eval mode.
    generator.train()
    discriminator.train()

    start_time = time.time()
    for epoch in range(num_epochs):
        last_losses = None

        for (real_data,) in train_loader:
            # Local name differs from the module-level `batch_size` on purpose,
            # so the global is not shadowed inside this function.
            n_rows = real_data.size(0)
            if n_rows < 2:
                # BatchNorm1d cannot normalise a single sample in training
                # mode; skip a trailing batch of size one instead of crashing.
                continue

            # Allocate labels and noise directly on the training device.
            real_labels = torch.ones(n_rows, 1, device=device)
            fake_labels = torch.zeros(n_rows, 1, device=device)

            # --- Discriminator step: real rows, then detached fakes ---
            optimizer_d.zero_grad()
            outputs = discriminator(real_data)
            d_loss_real = criterion(outputs, real_labels)
            d_loss_real.backward()

            z = torch.randn(n_rows, latent_dim, device=device)
            fake_data = generator(z)
            # detach() keeps this backward pass out of the generator.
            outputs = discriminator(fake_data.detach())
            d_loss_fake = criterion(outputs, fake_labels)
            d_loss_fake.backward()
            optimizer_d.step()

            # --- Generator step: fakes should be classified as real ---
            optimizer_g.zero_grad()
            outputs = discriminator(fake_data)
            g_loss = criterion(outputs, real_labels)
            g_loss.backward()
            optimizer_g.step()

            last_losses = (d_loss_real, d_loss_fake, g_loss)

        # Report the last batch of the epoch; guard against an epoch in which
        # no batch was processed (empty loader or only size-one batches).
        if last_losses is None:
            d_loss_value = g_loss_value = float('nan')
        else:
            d_loss_value = last_losses[0].item() + last_losses[1].item()
            g_loss_value = last_losses[2].item()
        print(f'Epoch [{epoch+1}/{num_epochs}] | D Loss: {d_loss_value} | G Loss: {g_loss_value}')

    end_time = time.time()
    return end_time - start_time

# Train once and report the wall-clock duration under the name that matches
# the device in use (`gpu_time` on CUDA, `cpu_time` otherwise).
elapsed_seconds = train_model()
if device.type != 'cuda':
    cpu_time = elapsed_seconds
    print(f"CPU eğitim süresi: {cpu_time:.2f} saniye")
else:
    gpu_time = elapsed_seconds
    print(f"GPU eğitim süresi: {gpu_time:.2f} saniye")

Epoch [1/20] | D Loss: 1.4370484948158264 | G Loss: 0.5966274738311768
Epoch [2/20] | D Loss: 1.3731122016906738 | G Loss: 0.6637866497039795
Epoch [3/20] | D Loss: 1.3599011301994324 | G Loss: 0.6741238832473755
Epoch [4/20] | D Loss: 1.3609206080436707 | G Loss: 0.6939775943756104
Epoch [5/20] | D Loss: 1.3819621801376343 | G Loss: 0.6746560335159302
Epoch [6/20] | D Loss: 1.3383379578590393 | G Loss: 0.7056883573532104
Epoch [7/20] | D Loss: 1.3922966122627258 | G Loss: 0.6882023811340332
Epoch [8/20] | D Loss: 1.302218496799469 | G Loss: 0.7321239113807678
Epoch [9/20] | D Loss: 1.3130157589912415 | G Loss: 0.6974195241928101
Epoch [10/20] | D Loss: 1.3692587614059448 | G Loss: 0.694636344909668
Epoch [11/20] | D Loss: 1.3156582713127136 | G Loss: 0.7417076230049133
Epoch [12/20] | D Loss: 1.305370569229126 | G Loss: 0.7377785444259644
Epoch [13/20] | D Loss: 1.2382065057754517 | G Loss: 0.8276582956314087
Epoch [14/20] | D Loss: 1.2781305313110352 | G Loss: 0.7399060130119324
Epoc

In [19]:
# Re-read the CSV so `df` holds the file contents as stored on disk.
df = pd.read_csv('sample10.csv')  

In [49]:
# Create the noise vectors on the generator's device; noise left on the CPU
# would fail against a generator that was moved to CUDA.
num_samples = 10000
noise = torch.randn(num_samples, latent_dim, device=device)

# Produce the synthetic data. Inference runs in eval mode so BatchNorm uses
# its running statistics and does not update them, and without autograd
# bookkeeping. The previous train/eval mode is restored afterwards so a later
# training run is unaffected. `.cpu()` is required before `.numpy()` when the
# result lives on the GPU.
was_training = generator.training
generator.eval()
with torch.no_grad():
    generated_data = generator(noise).cpu().numpy()
generator.train(was_training)

# Convert the synthetic data to a DataFrame with the training column layout.
generated_df = pd.DataFrame(generated_data, columns=scaled_df.columns)


# Re-read the raw dataset.
df = pd.read_csv('sample10.csv')

# Columns that are one-hot encoded instead of being used as numeric features.
categorical_columns = [
    'income', 'customer_age', 'payment_type', 'employment_status',
    'housing_status', 'source', 'device_os', 'device_distinct_emails_8w',
    'device_fraud_count', 'month',
]

# scikit-learn 1.2 renamed `sparse` to `sparse_output` and 1.4 removed the old
# keyword; try the current name first and fall back for older installations so
# the encoder returns a dense array either way.
try:
    encoder = OneHotEncoder(sparse_output=False, drop='first')
except TypeError:
    encoder = OneHotEncoder(sparse=False, drop='first')

# Build the encoded frame with its feature names and df's row index.
encoded_df = pd.DataFrame(
    encoder.fit_transform(df[categorical_columns]),
    columns=encoder.get_feature_names_out(categorical_columns),
    index=df.index,
)

Training Data Classification Report:
              precision    recall  f1-score   support

           0       0.99      1.00      0.99      9890
           1       0.00      0.00      0.00       110

    accuracy                           0.99     10000
   macro avg       0.49      0.50      0.50     10000
weighted avg       0.98      0.99      0.98     10000

Generated Data Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      6135
           1       1.00      1.00      1.00      3865

    accuracy                           1.00     10000
   macro avg       1.00      1.00      1.00     10000
weighted avg       1.00      1.00      1.00     10000



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [28]:
# Count how often each `fraud_bool` value occurs in the generated data.
generated_data_with_target['fraud_bool'].value_counts()

fraud_bool
0    6196
1    3804
Name: count, dtype: int64

In [None]:
# Map the scaled columns back through the fitted scaler.
scaled_block = generated_data_with_target[columns_to_scale]
generated_data_with_target[columns_to_scale] = scaler.inverse_transform(scaled_block)

# Turn the one-hot indicator columns back into one column per categorical
# feature using the fitted encoder.
one_hot_columns = encoded_df.columns
decoded_categories = encoder.inverse_transform(generated_data_with_target[one_hot_columns])
categorical_output = pd.DataFrame(decoded_categories, columns=categorical_columns)

# Swap the indicator columns for the decoded categorical columns.
generated_data_with_target = pd.concat(
    [generated_data_with_target.drop(columns=one_hot_columns), categorical_output],
    axis=1,
)

In [None]:
# Display the current contents of `generated_df`.
generated_df

In [None]:
# Count how often each `fraud_bool` value occurs in `generated_df`.
generated_df['fraud_bool'].value_counts()

In [None]:
# Re-read the CSV so `df` holds the file contents as stored on disk.
df = pd.read_csv("sample10.csv")

In [None]:
# Compare real and generated data feature by feature: one histogram panel per
# column of `df`, laid out on a grid that is three panels wide.
features = df.columns
generated_df = generated_data_with_target
num_features = len(features)
num_columns = 3
num_rows = (num_features + num_columns - 1) // num_columns  # ceiling division

# squeeze=False keeps `axes` two-dimensional even when there is only one row;
# otherwise plt.subplots returns a 1-D array and 2-D indexing fails.
fig, axes = plt.subplots(
    num_rows, num_columns, figsize=(20, num_rows * 5), squeeze=False
)
flat_axes = axes.ravel()

for ax, feature in zip(flat_axes, features):
    real_values = df[feature]
    synthetic_values = generated_df[feature]

    # Use one shared range so both histograms get identical bin edges.
    min_val = min(real_values.min(), synthetic_values.min())
    max_val = max(real_values.max(), synthetic_values.max())

    ax.hist(real_values, bins=30, alpha=0.5, label='Gerçek Veri', color='blue', range=(min_val, max_val))
    ax.hist(synthetic_values, bins=30, alpha=0.5, label='Yapay Veri', color='yellow', range=(min_val, max_val))

    ax.set_title(f'{feature} - Gerçek ve Yapay Veriler')
    ax.set_xlabel(feature)
    ax.set_ylabel('Frekans')
    ax.legend()

# Remove the unused panels at the end of the grid.
for unused_ax in flat_axes[num_features:]:
    fig.delaxes(unused_ax)

# Run the layout pass after titles and labels exist so their extents are
# taken into account.
fig.tight_layout(pad=5.0)

plt.show()