In [19]:
import tensorflow as tf
import matplotlib.pyplot as plt
from IPython.display import display
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
import collections
import random
import re
import numpy as np
import os
import time
import json
from glob import glob
from PIL import Image
import pickle

In [20]:
# Download caption annotation files
annotation_folder = '/annotations/'
if not os.path.exists(os.path.abspath('.') + annotation_folder):
  annotation_zip = tf.keras.utils.get_file('captions.zip',
                                          cache_subdir=os.path.abspath('.'),
                                          origin = 'http://images.cocodataset.org/annotations/annotations_trainval2014.zip',
                                          extract = True)
  annotation_file = os.path.dirname(annotation_zip)+'/annotations/captions_train2014.json'
  os.remove(annotation_zip)

# Download image files
image_folder = '/train2014/'
if not os.path.exists(os.path.abspath('.') + image_folder):
  image_zip = tf.keras.utils.get_file('train2014.zip',
                                      cache_subdir=os.path.abspath('.'),
                                      origin = 'http://images.cocodataset.org/zips/train2014.zip',
                                      extract = True)
  PATH = os.path.dirname(image_zip) + image_folder
  os.remove(image_zip)
else:
  PATH = os.path.abspath('.') + image_folder

Downloading data from http://images.cocodataset.org/annotations/annotations_trainval2014.zip


In [21]:
with open(annotation_file, 'r') as f:
    annotations = json.load(f)

In [22]:
# Group all captions together having the same image ID.
image_path_to_caption = collections.defaultdict(list)
for val in annotations['annotations']:
  caption = f"<start> {val['caption']} <end>"
  image_path = PATH + 'COCO_train2014_' + '%012d.jpg' % (val['image_id'])
  image_path_to_caption[image_path].append(caption)

In [23]:
image_paths = list(image_path_to_caption.keys())
random.shuffle(image_paths)

# Select the first 6000 image_paths from the shuffled set.
# Approximately each image id has 5 captions associated with it, so that will 
# lead to 30,000 examples.
train_image_paths = image_paths[:6000]
print(len(train_image_paths))

6000


In [24]:
train_captions = []
img_name_vector = []

for image_path in train_image_paths:
  caption_list = image_path_to_caption[image_path]
  train_captions.extend(caption_list)
  img_name_vector.extend([image_path] * len(caption_list))

In [25]:
print(f'===> number of images: {len(img_name_vector)}\n===> number of captions: {len(train_captions)}')

===> number of images: 30013
===> number of captions: 30013


In [26]:
top_k = 5000
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=top_k,
                                                  oov_token="<unk>",
                                                  filters='!"#$%&()*+.,-/:;=?@[\]^_`{|}~ ')
tokenizer.fit_on_texts(train_captions)
train_seqs = tokenizer.texts_to_sequences(train_captions)

tokenizer.word_index['<pad>'] = 0
tokenizer.index_word[0] = '<pad>'

train_seqs = tokenizer.texts_to_sequences(train_captions)

cap_vector = tf.keras.preprocessing.sequence.pad_sequences(train_seqs, padding='post')

In [27]:
def decode_sequence(encoded_text):
  lst = []
  for i in encoded_text[0]:
    lst.append(tokenizer.index_word[i])
  return ' '.join(lst)

In [28]:
def tokenize_sequence(sequence):
  tokezied =  tokenizer.texts_to_sequences(sequence)

In [29]:
import tensorflow as tf
from tensorflow.keras.layers import Dense,Reshape,Dropout,LeakyReLU,Flatten,BatchNormalization,Conv2D,Conv2DTranspose,Input
from tensorflow.keras.models import Sequential

In [115]:
width = 32 
height = 32

generator = Sequential()
generator.add(Input(shape=(49,)))
generator.add(Dense(128,activation='tanh'))
generator.add(Dense(16 * width * height, use_bias=False))
generator.add(BatchNormalization())
generator.add(LeakyReLU())

generator.add(Reshape((width, height, 16)))

generator.add(Conv2DTranspose(32, (5, 5), strides=(1, 1), padding='same', use_bias=False))
generator.add(BatchNormalization())
generator.add(LeakyReLU())


generator.add(Conv2DTranspose(1, (5, 5), strides=(2, 2), padding='same', use_bias=False, activation='tanh'))

In [116]:
discriminator = Sequential()
discriminator.add(Conv2D(64, kernel_size=5, strides=2, padding="same",
                        activation=LeakyReLU(0.3),
                        input_shape=[32, 32, 1]))
discriminator.add(Dropout(0.5))
discriminator.add(Conv2D(128, kernel_size=5, strides=2, padding="same",
                        activation=LeakyReLU(0.3)))
discriminator.add(Dropout(0.5))
discriminator.add(Flatten())
discriminator.add(Dense(1, activation="sigmoid"))

In [117]:
GAN = Sequential([generator, discriminator])

In [118]:
cross_entropy = tf.keras.losses.BinaryCrossentropy(from_logits=True)
def discriminator_loss(real_output, fake_output):
    real_loss = cross_entropy(tf.ones_like(real_output), real_output)
    fake_loss = cross_entropy(tf.zeros_like(fake_output), fake_output)
    total_loss = real_loss + fake_loss
    return total_loss

In [119]:
discriminator.compile(loss=discriminator_loss, optimizer="adam")
discriminator.trainable = False

In [120]:
GAN.compile(loss="binary_crossentropy", optimizer="adam")


In [121]:
from PIL import Image

epochs = 1
batch_size = 1000
WIDTH, HEIGHT = 32, 32

device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
    raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

with tf.device('/device:GPU:0'):
  for epoch in range(1,epochs + 1):
    print(f'Epoch {epoch}/{epochs}')
    batch_no = 1
    train_captions, train_images = list(), list()
    import time as t
    t1 = t.time()
    for batch_captions, batch_images in zip(cap_vector, img_name_vector):
      if len(train_captions) < batch_size and len(train_images) < batch_size:
        train_captions.append(batch_captions)
        img = Image.open(batch_images).convert('L')
        resized_img = img.resize((WIDTH, HEIGHT))
        normalized_img = np.array(resized_img) / 255
        train_images.append(normalized_img.reshape(WIDTH, HEIGHT,1))

      else:
        captions = np.array(train_captions)
        print(captions.shape)
        print(f'bacth number : {batch_no}')

        gen_images = generator(captions)

        X_fake_vs_real = tf.concat([gen_images, tf.dtypes.cast(train_images,tf.float32)], axis=0)
        
        y1 = tf.constant([[0.]] * batch_size + [[1.]] * batch_size)
        
        discriminator.trainable = True
        
        discriminator.train_on_batch(X_fake_vs_real, y1)
        
        y2 = tf.constant([[1.]] * batch_size)
        
        discriminator.trainable = False
        
        GAN.train_on_batch(captions, y2)

        train_captions.clear()
        train_images.clear()
        print(len(train_captions),len(train_images))
        batch_no += 1
        t2 = t.time()
        print(t2-t1)

Found GPU at: /device:GPU:0
Epoch 1/1
(1000, 49)
bacth number : 1


ResourceExhaustedError: ignored

In [None]:
plt.imshow(train_images[4].reshape(28,28))
print(decode_sequence([train_captions[4]]))

In [None]:
import tensorflow as tf
from tensorflow.keras.layers import Dense,Reshape,Dropout,Input,LeakyReLU,Flatten,BatchNormalization,Conv2D,Conv2DTranspose
from tensorflow.keras.models import Sequential
import tensorflow as tf
import matplotlib.pyplot as plt
from IPython.display import display
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
import collections
import random
import re
import numpy as np
import os
import time
import json
from glob import glob
from PIL import Image
import pickle

class Stage1(object):

	def __init__(self):
		
		# *********GENERATOR*********
		self.generator = Sequential()
		self.generator.add(Input(shape=(46,)))
		self.generator.add(Dense(1024,activation='tanh'))
		self.generator.add(Dense(7*7*256, use_bias=False))
		self.generator.add(BatchNormalization())
		self.generator.add(LeakyReLU())
		self.generator.add(Reshape((7, 7, 256)))
		self.generator.add(Conv2DTranspose(128, (5, 5), strides=(1, 1), padding='same', use_bias=False))
		self.generator.add(BatchNormalization())
		self.generator.add(LeakyReLU())
		self.generator.add(Conv2DTranspose(64, (5, 5), strides=(2, 2), padding='same', use_bias=False))
		self.generator.add(BatchNormalization())
		self.generator.add(LeakyReLU())
		self.generator.add(Conv2DTranspose(1, (5, 5), strides=(2, 2), padding='same', use_bias=False, activation='tanh'))

		# *********DISCRIMINATOR*********
		self.discriminator = Sequential()
		self.discriminator.add(Conv2D(64, kernel_size=5, strides=2, padding="same", activation=LeakyReLU(0.3), input_shape=[28, 28, 1]))
		self.discriminator.add(Dropout(0.5))
		self.discriminator.add(Conv2D(128, kernel_size=5, strides=2, padding="same", activation=LeakyReLU(0.3)))
		self.discriminator.add(Dropout(0.5))
		self.discriminator.add(Flatten())
		self.discriminator.add(Dense(1, activation="sigmoid"))

		self.GAN = Sequential([self.generator, self.discriminator])
		
		self.discriminator.compile(loss=self.discriminator_loss, optimizer="adam")
		self.discriminator.trainable = False

		self.GAN.compile(loss="binary_crossentropy", optimizer="adam")



	def fit(self,X_captions,X_img_names,epochs=10,batch_size=10000,WIDTH=28, HEIGHT=28, cuda=False):
		if cuda:
			device = '/device:GPU:0'
		else:
			device = '/cpu:0'
			
		with tf.device(device):
			for epoch in range(1,epochs + 1):

				print(f'Epoch {epoch}/{epochs}')
				batch_number = 1

				train_captions, train_images = list(), list()

				for batch_captions, batch_images in zip(X_captions, X_img_names):
					if len(train_captions) < batch_size and len(train_images) < batch_size:
						train_captions.append(batch_captions)
						img = Image.open(batch_images).convert('L')
						resized_img = img.resize((WIDTH, HEIGHT))
						normalized_img = np.array(resized_img) / 255
						train_images.append(normalized_img.reshape(WIDTH, HEIGHT,1))

					else:
						captions = np.array(train_captions)
						print(captions.shape)
						print(f'  bacth number : {batch_number}')

						gen_images = self.generator(captions)

						X_fake_vs_real = tf.concat([gen_images, tf.dtypes.cast(train_images,tf.float32)], axis=0)

						y1 = tf.constant([[0.]] * batch_size + [[1.]] * batch_size)

						self.discriminator.trainable = True

						self.discriminator.train_on_batch(X_fake_vs_real, y1,verbose=1)

						y2 = tf.constant([[1.]] * batch_size)

						self.discriminator.trainable = False

						self.GAN.train_on_batch(captions, y2, verbose=1)

						train_captions.clear()
						train_images.clear()
						print(len(train_captions),len(train_images))
						batch_number += 1

	
	def discriminator_loss(self,real_output, fake_output):
		cross_entropy = tf.keras.losses.BinaryCrossentropy(from_logits=True)
		real_loss = cross_entropy(tf.ones_like(real_output), real_output)
		fake_loss = cross_entropy(tf.zeros_like(fake_output), fake_output)
		total_loss = real_loss + fake_loss
		return total_loss

	def load_weights(self,PATH_G,PATH_D):
		if os.path.exists(PATH):
			self.generator.load_weights(PATH_G)
			self.discriminator.load_weights(PATH_D)
		else:
			raise FileNotFoundError

	def predict(self, caption):
		results = self.generator(caption)
		return results


In [None]:
gan = Stage1()

In [None]:
gan.fit(cap_vector,img_name_vector,batch_size=10000,cuda=True)

Epoch 1/10
(10000, 46)
  bacth number : 1


TypeError: ignored

In [None]:
inpt = np.array([  3,   2,  38, 368, 253,  77, 214,   8,   2,  29,  36,   4,   0, 0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0, 0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0, 0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0])

In [None]:
inpt = np.expand_dims(inpt,axis = 0)

In [None]:
inpt

array([[  3,   2,  38, 368, 253,  77, 214,   8,   2,  29,  36,   4,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0]])

In [None]:
gan.predict(inpt)

<tf.Tensor: shape=(1, 28, 28, 1), dtype=float32, numpy=
array([[[[ 4.87837829e-02],
         [-4.27461267e-01],
         [ 2.22369701e-01],
         [-4.53597784e-01],
         [ 5.07472716e-02],
         [-3.82371485e-01],
         [ 1.80752620e-01],
         [-4.00502622e-01],
         [ 7.09283054e-02],
         [-4.37214822e-01],
         [ 1.52660385e-01],
         [-3.42121661e-01],
         [ 8.78570750e-02],
         [-4.11942571e-01],
         [ 1.60549477e-01],
         [-3.45751286e-01],
         [ 9.23416838e-02],
         [-3.70180100e-01],
         [ 1.47758603e-01],
         [-3.30483556e-01],
         [ 9.74254981e-02],
         [-3.65562797e-01],
         [ 1.51099533e-01],
         [-3.49080563e-01],
         [ 8.00035596e-02],
         [-3.43351901e-01],
         [ 1.63460329e-01],
         [-2.98675895e-01]],

        [[-1.95036512e-02],
         [-1.53174385e-01],
         [-5.23104407e-02],
         [-1.00565515e-01],
         [ 1.15321562e-01],
         [-1.03601

In [None]:
cap_vector[1]

array([  3,   2,  38, 368, 253,  77, 214,   8,   2,  29,  36,   4,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0], dtype=int32)