<a href="https://colab.research.google.com/github/bsong75/brendensong.github.io/blob/main/2024_10_01_WGAN_methods.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install tensorflow

In [2]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Input, LeakyReLU, BatchNormalization, Embedding, Flatten, Concatenate
from tensorflow.keras.optimizers import RMSprop
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import classification_report

In [3]:
# Step 1: Create an imbalanced dataset
def generate_fake_data(n_samples=10000, imbalance_ratio=0.1):
    X = np.random.randn(n_samples, 10)  # 10 continuous features
    y = np.random.choice([0, 1], size=n_samples, p=[1-imbalance_ratio, imbalance_ratio])
    return pd.DataFrame(X, columns=[f'feature_{i}' for i in range(10)]), pd.Series(y, name='target')

# Generate dataset
X, y = generate_fake_data()
print("Original class distribution:", y.value_counts())

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

Original class distribution: target
0    8983
1    1017
Name: count, dtype: int64


In [6]:
# Step 2: Build WGAN components
latent_dim = 10
num_features = X.shape[1]

# Generator
def build_generator():
    noise_input = Input(shape=(latent_dim,))
    label_input = Input(shape=(1,))
    label_embedding = Flatten()(Embedding(2, latent_dim)(label_input))
    combined_input = Concatenate()([noise_input, label_embedding])

    x = Dense(32)(combined_input)
    x = LeakyReLU(alpha=0.2)(x)
    x = BatchNormalization()(x)
    x = Dense(num_features, activation='tanh')(x)
    return Model([noise_input, label_input], x, name='Generator')

# Discriminator (Critic)
def build_critic():
    feature_input = Input(shape=(num_features,))
    label_input = Input(shape=(1,))
    label_embedding = Flatten()(Embedding(2, num_features)(label_input))
    combined_input = Concatenate()([feature_input, label_embedding])

    x = Dense(32)(combined_input)
    x = LeakyReLU(alpha=0.2)(x)
    x = Dense(1)(x)  # No activation function for WGAN
    return Model([feature_input, label_input], x, name='Critic')

# Build and compile WGAN
generator = build_generator()
critic = build_critic()
critic.compile(loss='mse', optimizer=RMSprop(learning_rate=0.00005))

# WGAN Model
noise_input = Input(shape=(latent_dim,))
label_input = Input(shape=(1,))
generated_sample = generator([noise_input, label_input])
critic.trainable = False
validity = critic([generated_sample, label_input])
wgan = Model([noise_input, label_input], validity)
wgan.compile(loss='mse', optimizer=RMSprop(learning_r=0.00005))

# Step 3: Train WGAN
def train_wgan(epochs=5000, batch_size=64):
    half_batch = batch_size // 2
    for epoch in range(epochs):
        idx = np.random.randint(0, X_train.shape[0], half_batch)
        real_samples, real_labels = X_train.iloc[idx], y_train.iloc[idx]

        noise = np.random.randn(half_batch, latent_dim)
        fake_labels = np.ones((half_batch, 1))
        fake_samples = generator.predict([noise, fake_labels])

        d_loss_real = critic.train_on_batch([real_samples, real_labels], np.ones((half_batch, 1)))
        d_loss_fake = critic.train_on_batch([fake_samples, fake_labels], -np.ones((half_batch, 1)))
        d_loss = 0.5 * np.add(d_loss_real, d_loss_fake)

        noise = np.random.randn(batch_size, latent_dim)
        misleading_labels = np.ones((batch_size, 1))
        g_loss = wgan.train_on_batch([noise, misleading_labels], np.ones((batch_size, 1)))

        if epoch % 1000 == 0:
            print(f"Epoch {epoch}: Critic Loss = {d_loss}, Generator Loss = {g_loss}")

train_wgan(epochs=500)

# Step 4: Generate synthetic samples
num_synthetic_samples = sum(y_train == 0) - sum(y_train == 1)
noise = np.random.randn(num_synthetic_samples, latent_dim)
synthetic_labels = np.ones((num_synthetic_samples, 1))
X_synthetic = generator.predict([noise, synthetic_labels])
y_synthetic = np.ones(num_synthetic_samples)

# Combine with real data
X_balanced = pd.concat([X_train, pd.DataFrame(X_synthetic, columns=X_train.columns)])
y_balanced = pd.concat([y_train, pd.Series(y_synthetic, name='target')])
print("Balanced class distribution:", y_balanced.value_counts())

# Step 5: Train XGBoost
xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
xgb.fit(X_balanced, y_balanced)
y_pred = xgb.predict(X_test)
print(classification_report(y_test, y_pred))




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 851ms/step




Epoch 0: Critic Loss = 0.9302552342414856, Generator Loss = 0.8176910877227783
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s

Parameters: { "use_label_encoder" } are not used.



              precision    recall  f1-score   support

           0       0.90      0.94      0.92      1797
           1       0.06      0.03      0.04       203

    accuracy                           0.85      2000
   macro avg       0.48      0.49      0.48      2000
weighted avg       0.81      0.85      0.83      2000



In [7]:
# Step 4: Generate synthetic samples
num_synthetic_samples = sum(y_train == 0) - sum(y_train == 1)
noise = np.random.randn(num_synthetic_samples, latent_dim)
synthetic_labels = np.ones((num_synthetic_samples, 1))
X_synthetic = generator.predict([noise, synthetic_labels])
y_synthetic = np.ones(num_synthetic_samples)

# Combine with real data
X_balanced = pd.concat([X_train, pd.DataFrame(X_synthetic, columns=X_train.columns)])
y_balanced = pd.concat([y_train, pd.Series(y_synthetic, name='target')])
print("Balanced class distribution:", y_balanced.value_counts())

# Step 5: Train XGBoost
xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
xgb.fit(X_balanced, y_balanced)
y_pred = xgb.predict(X_test)
print(classification_report(y_test, y_pred))

[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step
Balanced class distribution: target
0.0    7186
1.0    7186
Name: count, dtype: int64


Parameters: { "use_label_encoder" } are not used.



              precision    recall  f1-score   support

           0       0.90      0.94      0.92      1797
           1       0.05      0.02      0.03       203

    accuracy                           0.85      2000
   macro avg       0.47      0.48      0.48      2000
weighted avg       0.81      0.85      0.83      2000

