<a href="https://colab.research.google.com/github/mrrajatgarg/StackGAN/blob/master/stackgan_stage_I_imp.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Stage I GAN

# Importing Libraries

In [1]:
import os
import pickle
import random
import time

import PIL
import numpy as np
import pandas as pd
import tensorflow as tf
from PIL import Image
from keras import Input, Model
from keras import backend as K
from keras.callbacks import TensorBoard
from keras.layers import Dense, LeakyReLU, BatchNormalization, ReLU, Reshape, UpSampling2D, Conv2D, Activation, \
    concatenate, Flatten, Lambda, Concatenate
from keras.optimizers import Adam
from matplotlib import pyplot as plt

# Loading of Dataset

In [2]:
def load_class_ids(class_info_file_path):
    """
    Read the pickled class ids stored at ``class_info_file_path``.

    The file is opened in binary mode and unpickled with ``encoding='latin1'``;
    the unpickled object is returned unchanged.
    """
    with open(class_info_file_path, 'rb') as pickle_file:
        loaded_ids = pickle.load(pickle_file, encoding='latin1')
    return loaded_ids

In [3]:
def load_embeddings(embeddings_file_path):
    """
    Unpickle the embeddings file and return its content as a NumPy array.

    The array's shape is printed as a quick sanity check before returning.
    """
    with open(embeddings_file_path, 'rb') as pickle_file:
        raw_embeddings = pickle.load(pickle_file, encoding='latin1')

    embedding_array = np.array(raw_embeddings)
    print('embeddings: ', embedding_array.shape)
    return embedding_array

In [4]:
def load_filenames(filenames_file_path):
    """
    Unpickle ``filenames_file_path`` (latin1 encoding) and return its content,
    which is expected to be the list of image file names.
    """
    with open(filenames_file_path, 'rb') as pickle_file:
        return pickle.load(pickle_file, encoding='latin1')

In [5]:
def load_bounding_boxes(dataset_dir):
    """
    Load bounding boxes and return a dictionary of file names and corresponding bounding boxes

    Reads ``bounding_boxes.txt`` and ``images.txt`` from ``dataset_dir``. Both
    files are whitespace-separated without a header; the first column of each
    is skipped and rows are matched by position.

    Returns:
        dict mapping each image path with its last four characters (the
        extension, e.g. ``.jpg``) removed to a list with the remaining
        bounding-box columns cast to ``int``.

    Raises:
        ValueError: if ``bounding_boxes.txt`` has fewer rows than ``images.txt``.
    """
    # Paths
    bounding_boxes_path = os.path.join(dataset_dir, 'bounding_boxes.txt')
    file_paths_path = os.path.join(dataset_dir, 'images.txt')

    # sep=r'\s+' is the documented replacement for the deprecated
    # delim_whitespace=True option.
    df_bounding_boxes = pd.read_csv(bounding_boxes_path,
                                    sep=r'\s+', header=None).astype(int)
    df_file_names = pd.read_csv(file_paths_path, sep=r'\s+', header=None)

    file_names = df_file_names[1].tolist()

    # Drop the leading id column once for the whole frame instead of slicing
    # row by row with iloc.
    boxes = df_bounding_boxes.iloc[:, 1:].values.tolist()

    if len(boxes) < len(file_names):
        raise ValueError(
            'bounding_boxes.txt has {} rows but images.txt lists {} files'.format(
                len(boxes), len(file_names)))

    return {img_file[:-4]: box for img_file, box in zip(file_names, boxes)}


In [6]:
def get_img(img_path, bbox, image_size):
    """
    Open an image as RGB, optionally crop it around ``bbox``, and resize it.

    When ``bbox`` is given it is read as ``[x, y, width, height]``: a square
    window centred on the box, with half-side 0.75 * max(width, height) and
    clipped to the image borders, is cropped first. The result is resized to
    ``image_size`` with bilinear interpolation.
    """
    image = Image.open(img_path).convert('RGB')
    img_width, img_height = image.size

    if bbox is None:
        return image.resize(image_size, PIL.Image.BILINEAR)

    half_side = int(np.maximum(bbox[2], bbox[3]) * 0.75)
    centre_x = int((2 * bbox[0] + bbox[2]) / 2)
    centre_y = int((2 * bbox[1] + bbox[3]) / 2)

    # Clip the square window so it never leaves the image.
    left = np.maximum(0, centre_x - half_side)
    right = np.minimum(img_width, centre_x + half_side)
    top = np.maximum(0, centre_y - half_side)
    bottom = np.minimum(img_height, centre_y + half_side)

    cropped = image.crop([left, top, right, bottom])
    return cropped.resize(image_size, PIL.Image.BILINEAR)

In [7]:
def load_dataset(filenames_file_path, class_info_file_path, cub_dataset_dir, embeddings_file_path, image_size):
    """
    Assemble images, class ids and one text embedding per image.

    For every file name the image is loaded (cropped to its bounding box and
    resized to ``image_size``) and one of its embeddings is picked at random.
    A sample whose loading raises is reported with ``print`` and skipped.

    Returns:
        Tuple ``(X, y, embeddings)`` of NumPy arrays.
    """
    filenames = load_filenames(filenames_file_path)
    class_ids = load_class_ids(class_info_file_path)
    bounding_boxes = load_bounding_boxes(cub_dataset_dir)
    all_embeddings = load_embeddings(embeddings_file_path)

    print("Embeddings shape:", all_embeddings.shape)

    images, labels, picked_embeddings = [], [], []

    for index, filename in enumerate(filenames):
        # Looked up outside the try block: a missing key is not swallowed.
        bounding_box = bounding_boxes[filename]

        try:
            img_path = '{}/images/{}.jpg'.format(cub_dataset_dir, filename)
            img = get_img(img_path, bounding_box, image_size)

            # Every image carries several caption embeddings; keep one at random.
            candidates = all_embeddings[index, :, :]
            choice = random.randint(0, candidates.shape[0] - 1)
            embedding = candidates[choice, :]

            images.append(np.array(img))
            labels.append(class_ids[index])
            picked_embeddings.append(embedding)
        except Exception as e:
            print(e)

    return np.array(images), np.array(labels), np.array(picked_embeddings)

# Model Creation

In [8]:
def generate_c(x):
    """
    Sample the conditioning vector ``c`` with the reparameterization trick.

    ``x`` is a 2-D tensor whose first 128 columns are the mean and whose
    remaining columns are the log standard deviation. Returns
    ``mean + exp(log_sigma) * epsilon`` with ``epsilon ~ N(0, 1)``.
    """
    mean = x[:, :128]
    log_sigma = x[:, 128:]
    stddev = K.exp(log_sigma)
    # Draw noise with the full (batch, 128) shape so every sample gets its own
    # epsilon; a (128,)-shaped draw would be broadcast and shared by the batch.
    epsilon = K.random_normal(shape=K.shape(mean))
    c = stddev * epsilon + mean
    return c

In [9]:
def build_ca_model():
    """
    Build the conditioning augmentation network.

    A single Dense(256) + LeakyReLU(0.2) stack mapping a (1024,) embedding to
    a (256,) tensor.
    """
    embedding_in = Input(shape=(1024,))
    projected = Dense(256)(embedding_in)
    activated = LeakyReLU(alpha=0.2)(projected)
    return Model(inputs=[embedding_in], outputs=[activated])

In [10]:
def build_embedding_compressor_model():
    """
    Build the network that compresses a (1024,) embedding to (128,) using one
    Dense layer followed by ReLU.
    """
    embedding_in = Input(shape=(1024,))
    compressed = ReLU()(Dense(128)(embedding_in))
    return Model(inputs=[embedding_in], outputs=[compressed])

In [11]:
def build_stage1_generator():
    """
    Build the Stage-I generator.

    Inputs:  a (1024,) text embedding and a (100,) noise vector.
    Outputs: the generated image (tanh activated) and the (256,)
             mean/log-sigma tensor produced from the embedding.
    """
    embedding_in = Input(shape=(1024,))
    mean_logsigma = LeakyReLU(alpha=0.2)(Dense(256)(embedding_in))

    # Reparameterised conditioning vector sampled from mean/log-sigma.
    c = Lambda(generate_c)(mean_logsigma)

    noise_in = Input(shape=(100,))
    gen_input = Concatenate(axis=1)([c, noise_in])

    x = Dense(128 * 8 * 4 * 4, use_bias=False)(gen_input)
    x = ReLU()(x)
    x = Reshape((4, 4, 128 * 8), input_shape=(128 * 8 * 4 * 4,))(x)

    # Four identical stages: double the resolution, then conv + BN + ReLU.
    for filters in (512, 256, 128, 64):
        x = UpSampling2D(size=(2, 2))(x)
        x = Conv2D(filters, kernel_size=3, padding="same", strides=1, use_bias=False)(x)
        x = BatchNormalization()(x)
        x = ReLU()(x)

    x = Conv2D(3, kernel_size=3, padding="same", strides=1, use_bias=False)(x)
    image = Activation(activation='tanh')(x)

    return Model(inputs=[embedding_in, noise_in], outputs=[image, mean_logsigma])


In [12]:
def build_stage1_discriminator():
    """
    Build the Stage-I discriminator.

    Inputs:  a (64, 64, 3) image and a (4, 4, 128) spatially replicated
             compressed embedding.
    The image goes through four stride-2 convolutions, is concatenated with
    the embedding along the channel axis, and a 1x1 conv + Dense(1) + sigmoid
    produces the real/fake probability.
    """
    image_in = Input(shape=(64, 64, 3))

    x = Conv2D(64, (4, 4),
               padding='same', strides=2,
               input_shape=(64, 64, 3), use_bias=False)(image_in)
    x = LeakyReLU(alpha=0.2)(x)

    # Remaining downsampling stages share the conv + BN + LeakyReLU pattern.
    for filters in (128, 256, 512):
        x = Conv2D(filters, (4, 4), padding='same', strides=2, use_bias=False)(x)
        x = BatchNormalization()(x)
        x = LeakyReLU(alpha=0.2)(x)

    embedding_in = Input(shape=(4, 4, 128))
    merged = concatenate([x, embedding_in])

    joint = Conv2D(64 * 8, kernel_size=1,
                   padding="same", strides=1)(merged)
    joint = BatchNormalization()(joint)
    joint = LeakyReLU(alpha=0.2)(joint)
    joint = Flatten()(joint)
    joint = Dense(1)(joint)
    probability = Activation('sigmoid')(joint)

    return Model(inputs=[image_in, embedding_in], outputs=[probability])

In [13]:
def build_adversarial_model(gen_model, dis_model):
    """
    Chain generator and (frozen) discriminator into one trainable model.

    Inputs:  (1024,) embedding, (100,) noise, (4, 4, 128) compressed embedding.
    Outputs: the discriminator score of the generated image and the
             generator's mean/log-sigma tensor.
    """
    embedding_in = Input(shape=(1024,))
    noise_in = Input(shape=(100,))
    compressed_in = Input(shape=(4, 4, 128))

    fake_image, mean_logsigma = gen_model([embedding_in, noise_in])

    # Freeze the discriminator so only the generator is updated through this model.
    dis_model.trainable = False
    validity = dis_model([fake_image, compressed_in])

    return Model(inputs=[embedding_in, noise_in, compressed_in],
                 outputs=[validity, mean_logsigma])

# Defining Loss

In [14]:
def KL_loss(y_true, y_pred):
    """
    KL divergence between N(mean, sigma^2) and the standard normal.

    ``y_pred`` holds the mean in its first 128 columns and the log standard
    deviation in the following columns; ``y_true`` is ignored. Returns the
    mean over all elements of
    ``-logsigma + 0.5 * (exp(2 * logsigma) + mean^2 - 1)``.
    """
    mean = y_pred[:, :128]
    # The log-sigma half starts at column 128; slicing [:128] here would just
    # reuse the mean and make the loss meaningless.
    logsigma = y_pred[:, 128:]
    loss = -logsigma + .5 * (-1 + K.exp(2. * logsigma) + K.square(mean))
    loss = K.mean(loss)
    return loss

In [15]:

def custom_generator_loss(y_true, y_pred):
    """Element-wise binary cross entropy between ``y_true`` and ``y_pred``."""
    bce = K.binary_crossentropy(y_true, y_pred)
    return bce


In [16]:
def save_rgb_img(img, path):
    """
    Render ``img`` on a single axis-less matplotlib subplot titled "Image",
    write the figure to ``path`` and close it.
    """
    figure = plt.figure()
    axes = figure.add_subplot(1, 1, 1)
    axes.imshow(img)
    axes.set_title("Image")
    axes.axis("off")

    plt.savefig(path)
    plt.close()

In [17]:
def write_log(callback, name, loss, batch_no):
    """
    Log one scalar to TensorBoard.

    Builds a ``tf.Summary`` holding ``loss`` under the tag ``name``, adds it
    to ``callback.writer`` at step ``batch_no`` and flushes the writer.
    """
    summary = tf.Summary()
    # add() accepts the field values directly as keyword arguments.
    summary.value.add(tag=name, simple_value=loss)

    writer = callback.writer
    writer.add_summary(summary, batch_no)
    writer.flush()

# Main File

In [22]:
import os
import zipfile
import random
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, optimizers
from keras.callbacks import TensorBoard
import time
from PIL import Image
import pickle


In [23]:
def load_filenames(file_path):
    """Loads filenames from a pickle file.

    The file is unpickled with ``encoding="latin1"``, matching the earlier
    loader in this notebook, so pickles written by Python 2 that contain
    non-ASCII byte strings load instead of raising ``UnicodeDecodeError``.
    Pickles written by Python 3 are unaffected by this option.
    """
    # NOTE: pickle can execute arbitrary code; only load trusted dataset files.
    with open(file_path, "rb") as f:
        filenames = pickle.load(f, encoding="latin1")
    return filenames

def load_class_ids(file_path):
    """Loads class IDs from a pickle file.

    The file is unpickled with ``encoding="latin1"``, matching the earlier
    loader in this notebook, so pickles written by Python 2 load correctly.
    Pickles written by Python 3 are unaffected by this option.
    """
    # NOTE: pickle can execute arbitrary code; only load trusted dataset files.
    with open(file_path, "rb") as f:
        class_ids = pickle.load(f, encoding="latin1")
    return class_ids

def load_bounding_boxes(cub_dataset_dir):
    """
    Placeholder: In real code, you'd load bounding boxes from .txt or .pickle.
    Returns a dict of {filename: (x, y, width, height)} or similar.

    No boxes are loaded here. The returned mapping starts empty and yields
    ``None`` for any filename that is looked up, so code that indexes it with
    ``boxes[filename]`` gets "no bounding box" instead of a ``KeyError``.
    ``cub_dataset_dir`` is currently unused.
    """
    from collections import defaultdict

    # defaultdict is a dict subclass, so existing dict-based callers still work.
    return defaultdict(lambda: None)

def load_embeddings(embeddings_file_path):
    """Loads embeddings from a pickle file.

    The pickle is read with ``encoding="latin1"`` (needed for files written
    by Python 2) and converted to a NumPy array, because the dataset loader
    in this notebook reads ``.shape`` and indexes the result with
    ``[index, :, :]``.
    """
    # NOTE: pickle can execute arbitrary code; only load trusted dataset files.
    with open(embeddings_file_path, "rb") as f:
        all_embeddings = pickle.load(f, encoding="latin1")
    # asarray leaves an existing ndarray untouched and converts nested lists.
    return np.asarray(all_embeddings)

def get_img(img_path, bounding_box, image_size):
    """
    Loads and returns a resized image.
    bounding_box is ignored here, but you could crop the image accordingly if needed.

    The image is converted to RGB and resized to ``image_size`` with the
    Lanczos filter.
    """
    img = Image.open(img_path).convert("RGB")
    # Image.ANTIALIAS was an alias of LANCZOS and was removed in Pillow 10;
    # LANCZOS selects the same filter and exists in old and new releases.
    img = img.resize(image_size, Image.LANCZOS)
    return img


In [24]:
#------------------------------------------------------------------------------
# 1) Conditioning Augmentation (CA) Model
#    Takes an embedding and learns mu/logvar, then reparameterizes to produce c
#------------------------------------------------------------------------------

def build_ca_model(embedding_dim=1024, condition_dim=128):
    """
    Conditioning augmentation encoder.

    Maps a text embedding of size ``embedding_dim`` through a shared
    Dense(256, relu) layer to two linear heads of size ``condition_dim``.
    The model outputs ``[mu, logvar]``; sampling ``c`` from them is left to
    the caller.
    """
    embedding_input = layers.Input(shape=(embedding_dim,))
    hidden = layers.Dense(256, activation="relu")(embedding_input)

    # Two independent linear heads on top of the shared hidden layer.
    mu = layers.Dense(condition_dim)(hidden)
    logvar = layers.Dense(condition_dim)(hidden)

    return keras.Model(inputs=embedding_input, outputs=[mu, logvar])

#------------------------------------------------------------------------------
# 2) KL Divergence Loss for the CA Model
#------------------------------------------------------------------------------

def KL_loss(y_true, y_pred):
    """
    KL divergence of N(mu, exp(logvar)) from the standard normal.

    ``y_pred`` is expected to be ``mu`` and ``logvar`` concatenated along the
    last axis, i.e. shape (batch_size, 2 * condition_dim). ``y_true`` is
    ignored. The divergence is summed per sample and averaged over the batch.
    """
    split_at = y_pred.shape[-1] // 2
    mu, logvar = y_pred[:, :split_at], y_pred[:, split_at:]

    per_sample = -0.5 * tf.reduce_sum(
        1 + logvar - tf.square(mu) - tf.exp(logvar), axis=1
    )
    return tf.reduce_mean(per_sample)

#------------------------------------------------------------------------------
# 3) Embedding Compressor (optional)
#    Sometimes used to reduce embedding dim (e.g., 1024 -> 128)
#------------------------------------------------------------------------------

def build_embedding_compressor_model(embedding_dim=1024, condition_dim=128):
    """
    One fully connected ReLU layer that shrinks an ``embedding_dim`` vector
    to ``condition_dim`` features.
    """
    embedding_input = layers.Input(shape=(embedding_dim,))
    compress = layers.Dense(condition_dim, activation="relu")
    return keras.Model(inputs=embedding_input, outputs=compress(embedding_input))

#------------------------------------------------------------------------------
# 4) Stage 1 Generator
#    Takes random noise z + condition c, outputs a 64x64 image
#------------------------------------------------------------------------------

def build_stage1_generator(z_dim=100, condition_dim=128):
    """
    Example DCGAN-like generator for 64x64 output.
    Input: concatenated [z (100), c (128)]
    Output: 64x64x3

    The input is projected to a 4x4x256 map and upsampled by four stride-2
    transposed convolutions (4 -> 8 -> 16 -> 32 -> 64). The last one has
    3 channels and a tanh activation, so pixel values lie in [-1, 1].
    """
    input_layer = layers.Input(shape=(z_dim + condition_dim,))

    x = layers.Dense(4*4*256, use_bias=False)(input_layer)
    x = layers.Reshape((4, 4, 256))(x)
    x = layers.BatchNormalization()(x)
    x = layers.ReLU()(x)

    # Three hidden upsampling stages: 4x4 -> 8x8 -> 16x16 -> 32x32.
    # With only two such stages the final layer would emit 32x32 images,
    # which do not match the 64x64 input of the discriminator.
    for filters in (128, 64, 32):
        x = layers.Conv2DTranspose(filters, (4,4), strides=(2,2), padding="same", use_bias=False)(x)
        x = layers.BatchNormalization()(x)
        x = layers.ReLU()(x)

    # Final stage: 32x32 -> 64x64 RGB.
    x = layers.Conv2DTranspose(3, (4,4), strides=(2,2), padding="same", use_bias=False)(x)
    x = layers.Activation("tanh")(x)

    model = keras.Model(inputs=input_layer, outputs=x)
    return model

#------------------------------------------------------------------------------
# 5) Stage 1 Discriminator
#    Takes 64x64 image + condition embedding, outputs real/fake
#------------------------------------------------------------------------------

def build_stage1_discriminator(condition_dim=128):
    """
    Discriminator that also takes text embedding.
    For simplicity, we project the embedding to a spatial map and concatenate.

    Inputs:  a (64, 64, 3) image and a (condition_dim,) condition vector.
    Output:  a sigmoid real/fake probability of shape (1,).

    The image is downsampled by four stride-2 convolutions (64 -> 32 -> 16 ->
    8 -> 4) so its feature map matches the 4x4 condition map it is
    concatenated with.
    """
    # Image input
    image_input = layers.Input(shape=(64, 64, 3))
    # Condition input
    cond_input = layers.Input(shape=(condition_dim,))

    # Downsample image: 64 -> 32 (no batch norm on the first layer)
    x_img = layers.Conv2D(64, (4,4), strides=(2,2), padding="same")(image_input)
    x_img = layers.LeakyReLU(alpha=0.2)(x_img)

    # 32 -> 16 -> 8 -> 4. The last stage is required: stopping at 8x8 makes
    # the concatenation with the 4x4 condition map fail on a shape mismatch.
    for filters in (128, 256, 512):
        x_img = layers.Conv2D(filters, (4,4), strides=(2,2), padding="same")(x_img)
        x_img = layers.BatchNormalization()(x_img)
        x_img = layers.LeakyReLU(alpha=0.2)(x_img)

    # Project condition to 4x4
    cond = layers.Dense(4*4, activation="relu")(cond_input)
    cond = layers.Reshape((4, 4, 1))(cond)

    # Concatenate condition map with image features along the channel axis
    x = layers.Concatenate(axis=-1)([x_img, cond])

    x = layers.Conv2D(256, (3,3), strides=(1,1), padding="same")(x)
    x = layers.BatchNormalization()(x)
    x = layers.LeakyReLU(alpha=0.2)(x)

    # Flatten & final output
    x = layers.Flatten()(x)
    x = layers.Dense(1, activation="sigmoid")(x)

    model = keras.Model(inputs=[image_input, cond_input], outputs=x)
    return model

#------------------------------------------------------------------------------
# 6) Adversarial Model
#    Wires up Generator + Discriminator for generator training
#------------------------------------------------------------------------------

def build_adversarial_model(gen_model, dis_model, ca_model, z_dim=100, condition_dim=128):
    """
    The 'adversarial model' is used to train the generator end-to-end:
     - Input: [noise (z), text embedding]
     - Output: [discriminator score, KL info (mu+logvar)]

    Args:
        gen_model: generator taking the concatenated ``[z, c]`` vector.
        dis_model: discriminator taking ``[image, c]``; it is frozen here so
            that training this model only updates the generator and CA model.
        ca_model: conditioning augmentation model returning ``[mu, logvar]``.
        z_dim: size of the noise vector.
        condition_dim: size of ``mu``/``logvar``; kept for signature
            compatibility, the actual size comes from ``ca_model``.

    Returns:
        A ``keras.Model`` mapping ``[z, embedding]`` to
        ``[validity, concat(mu, logvar)]``; the second output has
        ``2 * condition_dim`` features, the layout ``KL_loss`` expects.
    """
    # Noise + text embedding as input
    embedding_input = layers.Input(shape=(1024,))
    z_input = layers.Input(shape=(z_dim,))

    # CA model -> mu, logvar
    mu, logvar = ca_model(embedding_input)

    def _reparameterize(args):
        # c = mu + sigma * eps with sigma = exp(logvar / 2), eps ~ N(0, I).
        mean, log_variance = args
        eps = tf.random.normal(tf.shape(mean))
        return mean + tf.exp(0.5 * log_variance) * eps

    c = layers.Lambda(_reparameterize)([mu, logvar])

    # Generator consumes [z, c] concatenated, in that order.
    gen_input = layers.Concatenate(axis=1)([z_input, c])
    fake_img = gen_model(gen_input)

    # Freeze the discriminator inside the combined model.
    dis_model.trainable = False
    validity = dis_model([fake_img, c])

    # Pack mu and logvar side by side for the KL loss.
    mu_logvar = layers.Concatenate(axis=1)([mu, logvar])

    model = keras.Model(inputs=[z_input, embedding_input], outputs=[validity, mu_logvar])
    return model


In [None]:
if __name__ == "__main__":
    # Stage-I training script: loads the data, builds the models, alternates
    # discriminator / generator updates and saves the weights.

    # ----------------------------
    # 1) Setup & Hyperparameters
    # ----------------------------
    data_dir = "./data/coco/"
    train_dir = os.path.join(data_dir, "train")
    test_dir = os.path.join(data_dir, "val")  # Using val as test

    # Example: extract train2014 if train_dir is missing or empty.
    # The isdir check keeps os.listdir from raising on a missing directory.
    if not os.path.isdir(train_dir) or not os.listdir(train_dir):
        with zipfile.ZipFile(os.path.join(data_dir, "train2014"), "r") as zip_ref:
            zip_ref.extractall(train_dir)

    # Create results directory if it doesn't exist
    os.makedirs("results", exist_ok=True)

    image_size = (64, 64)
    batch_size = 32
    z_dim = 100
    epochs = 10
    embedding_dim = 1024
    condition_dim = 128

    # File paths for training
    embeddings_file_path_train = os.path.join(train_dir, "char-CNN-RNN-embeddings.pickle")
    filenames_file_path_train = os.path.join(train_dir, "filenames.pickle")
    class_info_file_path_train = os.path.join(train_dir, "class_info.pickle")

    # Similarly for test data if needed
    embeddings_file_path_test = os.path.join(test_dir, "char-CNN-RNN-embeddings.pickle")
    filenames_file_path_test = os.path.join(test_dir, "filenames.pickle")
    class_info_file_path_test = os.path.join(test_dir, "class_info.pickle")

    # Optimizers
    dis_optimizer = optimizers.Adam(learning_rate=0.0002, beta_1=0.5, beta_2=0.999)
    gen_optimizer = optimizers.Adam(learning_rate=0.0002, beta_1=0.5, beta_2=0.999)

    # ----------------------------
    # 2) Load Training Data
    # ----------------------------
    # load_dataset requires class_info_file_path; omitting it is a TypeError.
    print("Loading training data...")
    X_train, y_train, embeddings_train = load_dataset(
        filenames_file_path=filenames_file_path_train,
        class_info_file_path=class_info_file_path_train,
        cub_dataset_dir=train_dir,
        embeddings_file_path=embeddings_file_path_train,
        image_size=image_size,
    )

    print("Loading test data...")
    X_test, y_test, embeddings_test = load_dataset(
        filenames_file_path=filenames_file_path_test,
        class_info_file_path=class_info_file_path_test,
        cub_dataset_dir=test_dir,
        embeddings_file_path=embeddings_file_path_test,
        image_size=image_size,
    )

    # The generator ends in tanh, so real images must live in [-1, 1] as well;
    # otherwise the discriminator can separate real from fake by value range.
    X_train = (X_train.astype(np.float32) - 127.5) / 127.5
    X_test = (X_test.astype(np.float32) - 127.5) / 127.5

    # ----------------------------
    # 3) Build & Compile Models
    # ----------------------------
    print("Building models...")
    ca_model = build_ca_model(embedding_dim=embedding_dim, condition_dim=condition_dim)
    embedding_compressor_model = build_embedding_compressor_model(
        embedding_dim=embedding_dim, condition_dim=condition_dim
    )
    stage1_gen = build_stage1_generator(z_dim=z_dim, condition_dim=condition_dim)
    stage1_dis = build_stage1_discriminator(condition_dim=condition_dim)

    # Compile Discriminator
    stage1_dis.compile(
        loss="binary_crossentropy", optimizer=dis_optimizer, metrics=["accuracy"]
    )

    # Adversarial model (for training Generator)
    adversarial_model = build_adversarial_model(
        gen_model=stage1_gen,
        dis_model=stage1_dis,
        ca_model=ca_model,
        z_dim=z_dim,
        condition_dim=condition_dim,
    )
    # We'll use the custom KL_loss for the second output
    adversarial_model.compile(
        loss=["binary_crossentropy", KL_loss],
        loss_weights=[1.0, 2.0],
        optimizer=gen_optimizer,
    )

    # ----------------------------
    # 4) Training Loop
    # ----------------------------
    # One-sided label smoothing: real targets are 0.9 instead of 1.0.
    real_labels = np.ones((batch_size, 1), dtype=np.float32) * 0.9
    fake_labels = np.zeros((batch_size, 1), dtype=np.float32)

    # We'll assume embeddings_train.shape[0] == X_train.shape[0]
    num_batches = X_train.shape[0] // batch_size

    for epoch in range(epochs):
        print(f"================== Epoch {epoch+1}/{epochs} ==================")
        # Fresh sample order every epoch.
        indices = np.arange(X_train.shape[0])
        np.random.shuffle(indices)

        for batch_i in range(num_batches):
            # ---------------------------
            # 4.1) Get real batch
            # ---------------------------
            batch_indices = indices[batch_i * batch_size : (batch_i + 1) * batch_size]
            real_imgs = X_train[batch_indices]
            real_embeddings = embeddings_train[batch_indices]

            # ---------------------------
            # 4.2) Sample random noise
            # ---------------------------
            z_noise = np.random.normal(0, 1, (batch_size, z_dim))

            # ---------------------------
            # 4.3) Generate fake images
            # ---------------------------
            # Pass embeddings through CA to get c, then generate
            mu, logvar = ca_model.predict_on_batch(real_embeddings)
            epsilon = np.random.normal(0, 1, (batch_size, condition_dim))
            c = mu + np.exp(logvar / 2) * epsilon  # reparameterize

            gen_input = np.concatenate([z_noise, c], axis=1)
            fake_imgs = stage1_gen.predict_on_batch(gen_input)

            # ---------------------------
            # 4.4) Train Discriminator
            # ---------------------------
            # 4.4.1) Train on real
            d_loss_real = stage1_dis.train_on_batch([real_imgs, c], real_labels)

            # 4.4.2) Train on fake
            d_loss_fake = stage1_dis.train_on_batch([fake_imgs, c], fake_labels)

            d_loss = 0.5 * np.add(d_loss_real, d_loss_fake)

            # ---------------------------
            # 4.5) Train Generator
            # ---------------------------
            # We want the generator to produce images that the discriminator
            # classifies as real (label=1.0), plus we want to minimize KL divergence
            # The adversarial_model takes [z, embedding] -> [D_out, mu_logvar]
            # So we must feed real_embeddings again (for CA) plus z_noise
            valid_y = np.ones(
                (batch_size, 1), dtype=np.float32
            )  # generator wants them real
            # KL_loss ignores its target, so a zero array of the right shape
            # (batch_size, condition_dim * 2) is enough.
            dummy_kl = np.zeros((batch_size, condition_dim * 2), dtype=np.float32)

            g_loss = adversarial_model.train_on_batch(
                [z_noise, real_embeddings], [valid_y, dummy_kl]
            )

            # Print every few batches. For a two-output model train_on_batch
            # returns [total, loss_output_1, loss_output_2], so the KL term
            # is at index 2 (index 1 is the adversarial BCE).
            if batch_i % 50 == 0:
                print(
                    f"Batch {batch_i}/{num_batches} | D loss: {d_loss[0]:.4f} | G loss: {g_loss[0]:.4f} (KL: {g_loss[2]:.4f})"
                )

        # ---------------------------
        # 4.6) Save Weights Periodically
        # ---------------------------
        # Keras 3 only accepts weight files ending in ".weights.h5"; older
        # versions treat the same name as an HDF5 weights file.
        if (epoch + 1) % 2 == 0:
            stage1_gen.save_weights(f"stage1_gen_epoch_{epoch+1}.weights.h5")
            stage1_dis.save_weights(f"stage1_dis_epoch_{epoch+1}.weights.h5")

    # ----------------------------
    # 5) Final Save
    # ----------------------------
    stage1_gen.save_weights("stage1_gen_final.weights.h5")
    stage1_dis.save_weights("stage1_dis_final.weights.h5")
    print("Training complete!")

Loading training data...


FileNotFoundError: [Errno 2] No such file or directory: './data/coco/train\\class_info.pickle'