<a href="https://colab.research.google.com/github/bsong75/brendensong.github.io/blob/main/2024_0901_CGAN_methods.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install tensorflow

In [4]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Input, LeakyReLU, BatchNormalization, Embedding, Flatten, Concatenate
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import classification_report

In [6]:
# Step 1: Create an imbalanced dataset
def generate_fake_data(n_samples=10000, imbalance_ratio=0.1):
    X = np.random.randn(n_samples, 10)  # 10 continuous features
    y = np.random.choice([0, 1], size=n_samples, p=[1-imbalance_ratio, imbalance_ratio])
    return pd.DataFrame(X, columns=[f'feature_{i}' for i in range(10)]), pd.Series(y, name='target')

# Generate dataset
X, y = generate_fake_data()
print("Original class distribution:", y.value_counts())

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Step 2: Build CGAN components
latent_dim = 10
num_features = X.shape[1]

# Generator
def build_generator():
    noise_input = Input(shape=(latent_dim,))
    label_input = Input(shape=(1,))
    label_embedding = Flatten()(Embedding(2, latent_dim)(label_input))
    combined_input = Concatenate()([noise_input, label_embedding])

    x = Dense(32)(combined_input)
    x = LeakyReLU(alpha=0.2)(x)
    x = BatchNormalization()(x)
    x = Dense(num_features, activation='tanh')(x)
    return Model([noise_input, label_input], x, name='Generator')

# Discriminator
def build_discriminator():
    feature_input = Input(shape=(num_features,))
    label_input = Input(shape=(1,))
    label_embedding = Flatten()(Embedding(2, num_features)(label_input))
    combined_input = Concatenate()([feature_input, label_embedding])

    x = Dense(32)(combined_input)
    x = LeakyReLU(alpha=0.2)(x)
    x = Dense(1, activation='sigmoid')(x)
    return Model([feature_input, label_input], x, name='Discriminator')

# Build and compile GAN
generator = build_generator()
discriminator = build_discriminator()
discriminator.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# GAN Model
noise_input = Input(shape=(latent_dim,))
label_input = Input(shape=(1,))
generated_sample = generator([noise_input, label_input])
discriminator.trainable = False
validity = discriminator([generated_sample, label_input])
gan = Model([noise_input, label_input], validity)
gan.compile(loss='binary_crossentropy', optimizer='adam')

# Step 3: Train CGAN
def train_cgan(epochs=5000, batch_size=64):
    half_batch = batch_size // 2
    for epoch in range(epochs):
        idx = np.random.randint(0, X_train.shape[0], half_batch)
        real_samples, real_labels = X_train.iloc[idx], y_train.iloc[idx]

        noise = np.random.randn(half_batch, latent_dim)
        fake_labels = np.ones((half_batch, 1))
        fake_samples = generator.predict([noise, fake_labels])

        d_loss_real = discriminator.train_on_batch([real_samples, real_labels], np.ones((half_batch, 1)))
        d_loss_fake = discriminator.train_on_batch([fake_samples, fake_labels], np.zeros((half_batch, 1)))
        d_loss = 0.5 * np.add(d_loss_real, d_loss_fake)

        noise = np.random.randn(batch_size, latent_dim)
        misleading_labels = np.ones((batch_size, 1))
        g_loss = gan.train_on_batch([noise, misleading_labels], np.ones((batch_size, 1)))

        if epoch % 1000 == 0:
            print(f"Epoch {epoch}: D Loss = {d_loss[0]}, G Loss = {g_loss}")

train_cgan(epochs=100)




Original class distribution: target
0    8981
1    1019
Name: count, dtype: int64




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 181ms/step




Epoch 0: D Loss = 0.7225111722946167, G Loss = 0.7363081574440002
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/ste

In [7]:
# Step 4: Generate synthetic samples
num_synthetic_samples = sum(y_train == 0) - sum(y_train == 1)
noise = np.random.randn(num_synthetic_samples, latent_dim)
synthetic_labels = np.ones((num_synthetic_samples, 1))
X_synthetic = generator.predict([noise, synthetic_labels])
y_synthetic = np.ones(num_synthetic_samples)

# Combine with real data
X_balanced = pd.concat([X_train, pd.DataFrame(X_synthetic, columns=X_train.columns)])
y_balanced = pd.concat([y_train, pd.Series(y_synthetic, name='target')])
print("Balanced class distribution:", y_balanced.value_counts())

# Step 5: Train XGBoost
xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
xgb.fit(X_balanced, y_balanced)
y_pred = xgb.predict(X_test)
print(classification_report(y_test, y_pred))

[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step
Balanced class distribution: target
0.0    7185
1.0    7185
Name: count, dtype: int64


Parameters: { "use_label_encoder" } are not used.



              precision    recall  f1-score   support

           0       0.90      0.97      0.93      1796
           1       0.19      0.07      0.10       204

    accuracy                           0.88      2000
   macro avg       0.55      0.52      0.52      2000
weighted avg       0.83      0.88      0.85      2000

