https://towardsdatascience.com/image-captioning-using-deep-learning-fe0d929cf337




In [21]:
!pip install graphviz 

Collecting graphviz
  Downloading graphviz-0.20-py3-none-any.whl (46 kB)
Installing collected packages: graphviz
Successfully installed graphviz-0.20


In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import tensorflow as tf
import cv2
import pickle
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import time
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, LSTM, Input, Embedding, Conv2D, Concatenate, Flatten, Add, Dropout, GRU
import random
import datetime
from nltk.translate.bleu_score import sentence_bleu

In [2]:
def get_random_set():
    obj = reqd[np.random.randint(len(reqd))]
    for i in range(len(obj["imgs"])):
        plt.subplot(1, len(obj["imgs"]),i+1)
        plt.axis('off')
        img = plt.imread(get_img_path(obj["imgs"][i]))
        plt.imshow(img)
    print(obj["desc"])

def get_random_set2(df, k):
    obj = df[k]
    for i in range(len(obj["imgs"])):
        plt.subplot(1, len(obj["imgs"]),i+1)
        plt.axis('off')
        img = plt.imread(get_img_path(obj["imgs"][i]))
        plt.imshow(img)
    print(obj["desc"])
    
def get_img_path(x):
    return "./imgs/"+x+".png"

from PIL import Image
def load_image(img_name):
    '''Function to load the image'''
    image = Image.open(img_name)
    image = image.resize((224,224))
    image_array = np.asarray(image.convert("RGB"))
    image_array = image_array / 255.
    
    X = np.expand_dims(image_array, axis=0)
    X = np.asarray(X) 
    return X

In [3]:
train_dataset = pd.read_csv('df_train.csv')
test_dataset = pd.read_csv('df_test.csv')
cv_dataset = pd.read_csv('df_cv.csv')

In [4]:
train_dataset

Unnamed: 0.1,Unnamed: 0,desc,img1,img2,img3,img4,img5
0,0,startseq right upper lobe airspace disease con...,CXR3069_IM-1432-1001,CXR3069_IM-1432-2001,,,
1,1,startseq mediastinal contours are normal lungs...,CXR2629_IM-1116-1001,CXR2629_IM-1116-2001,,,
2,2,startseq cardiomediastinal contours are unchan...,CXR1888_IM-0576-1001,CXR1888_IM-0576-4004,,,
3,3,startseq heart size normal the lungs are clear...,CXR2248_IM-0844-1001,CXR2248_IM-0844-1002,,,
4,4,startseq the cardiomediastinal contours are wi...,CXR3408_IM-1648-1001,CXR3408_IM-1648-1002,,,
...,...,...,...,...,...,...,...
2063,2331,startseq the heart normal size the mediastinum...,CXR2243_IM-0840-1001,CXR2243_IM-0840-2001,CXR2243_IM-0840-3001,CXR2243_IM-0840-4001,
2064,2332,startseq normal heart size normal mediastinal ...,CXR1356_IM-0231-1001,CXR1356_IM-0231-2001,,,
2065,2333,startseq frontal and lateral views the chest s...,CXR1883_IM-0572-1001,CXR1883_IM-0572-2001,,,
2066,2334,startseq normal heart size and mediastinal con...,CXR2622_IM-1110-1001,CXR2622_IM-1110-1002,,,


In [5]:
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        # Currently, memory growth needs to be the same across GPUs
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as e:
        # Memory growth must be set before GPUs have been initialized
        print(e)

In [6]:
gpus

[]

In [7]:
tokenizer = Tokenizer(filters='!"#$%&()*+,-/:;<=>?@[\\]^_`{|}~\t\n')
tokenizer.fit_on_texts(train_dataset.desc)

In [8]:
vocab_size = len(tokenizer.word_counts) + 1
vocab_size

1313

In [9]:
f = open('Image_features_attention.pickle','rb') # contains the features from chexNet
Xnet_Features = pickle.load(f)
f.close()

In [10]:
try:
    with open(r"embedding_matrix.pickle", "rb") as output_file:
        embedding_matrix = pickle.load(output_file)
    print("loaded")
except:
    embedding_matrix = np.zeros((vocab_size,300))
    filepath = r"./utils/glove.6B/glove.6B.300d.txt"
    with open(filepath, encoding="utf8") as f:
        for line in f:
            word, *vector = line.split()
            if word in tokenizer.word_index.keys():
                idx = tokenizer.word_index[word]
                embedding_matrix[idx] = np.array(
                    vector, dtype=np.float32)[:300]
    with open(r"embedding_matrix.pickle", "wb") as output_file:
        pickle.dump(embedding_matrix, output_file)

loaded


In [11]:
BATCH_SIZE = 14

In [59]:
def load_image(id_, report):
    '''Loads the Image Features with their corresponding Ids'''
#     test_2827 train_2335
    if id_ > 2827:
        img_feature = Xnet_Features["cv_"+str(id_)][0]
    elif id_ > 2335:
        img_feature = Xnet_Features["test_"+str(id_)][0]
    else:
        img_feature = Xnet_Features["train_"+str(id_)][0]
    return img_feature, report

In [60]:
def create_dataset(img_name, caption):
    dataset = tf.data.Dataset.from_tensor_slices((img_name, caption))
    # Use map to load the numpy files in parallel
    dataset = dataset.map(lambda item1, item2: tf.numpy_function(load_image, [item1, item2], [tf.float32, tf.string]),
                          num_parallel_calls=tf.data.experimental.AUTOTUNE)
    # Shuffle and batch
    dataset = dataset.shuffle(500).batch(BATCH_SIZE).prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
    return dataset

In [61]:
train_generator = create_dataset(train_dataset.iloc[:,0], train_dataset.desc)
train_generator

<PrefetchDataset shapes: (<unknown>, <unknown>), types: (tf.float32, tf.string)>

In [62]:
cv_generator = create_dataset(cv_dataset.iloc[:,0], cv_dataset.desc)

In [63]:
input1 = Input(shape=(2048), name='Image_1')
dense1 = Dense(256, kernel_initializer=tf.keras.initializers.glorot_uniform(seed = 56), name='dense_encoder')(input1)

input2 = Input(shape=(153), name='Text_Input')
emb_layer = Embedding(input_dim = vocab_size, output_dim = 300, input_length=153, mask_zero=True, trainable=False, 
                weights=[embedding_matrix], name="Embedding_layer")
emb = emb_layer(input2)

LSTM1 = LSTM(units=256, activation='tanh', recurrent_activation='sigmoid', use_bias=True, 
            kernel_initializer=tf.keras.initializers.glorot_uniform(seed=23),
            recurrent_initializer=tf.keras.initializers.orthogonal(seed=7),
            bias_initializer=tf.keras.initializers.zeros(), return_sequences=True, name="LSTM1")(emb)
#LSTM1_output = LSTM1(emb)

LSTM2 = LSTM(units=256, activation='tanh', recurrent_activation='sigmoid', use_bias=True, 
            kernel_initializer=tf.keras.initializers.glorot_uniform(seed=23),
            recurrent_initializer=tf.keras.initializers.orthogonal(seed=7),
            bias_initializer=tf.keras.initializers.zeros(), name="LSTM2")
LSTM2_output = LSTM2(LSTM1)

dropout1 = Dropout(0.5, name='dropout1')(LSTM2_output)

dec =  tf.keras.layers.Add()([dense1, dropout1])

fc1 = Dense(256, activation='relu', kernel_initializer=tf.keras.initializers.he_normal(seed = 63), name='fc1')
fc1_output = fc1(dec)
dropout2 = Dropout(0.4, name='dropout2')(fc1_output)
output_layer = Dense(vocab_size, activation='softmax', name='Output_layer')
output = output_layer(dropout2)

encoder_decoder = Model(inputs = [input1, input2], outputs = output)
encoder_decoder.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Text_Input (InputLayer)         [(None, 153)]        0                                            
__________________________________________________________________________________________________
Embedding_layer (Embedding)     (None, 153, 300)     393900      Text_Input[0][0]                 
__________________________________________________________________________________________________
LSTM1 (LSTM)                    (None, 153, 256)     570368      Embedding_layer[0][0]            
__________________________________________________________________________________________________
Image_1 (InputLayer)            [(None, 2048)]       0                                            
____________________________________________________________________________________________

In [64]:
tf.keras.utils.plot_model(encoder_decoder)

('You must install pydot (`pip install pydot`) and install graphviz (see instructions at https://graphviz.gitlab.io/download/) ', 'for plot_model/model_to_dot to work.')


In [65]:
optimizer = tf.keras.optimizers.Adam(0.001)

In [66]:
loss_function = tf.keras.losses.CategoricalCrossentropy(from_logits=False, reduction='auto')

def maskedLoss(y_true, y_pred):
    #getting mask value
    mask = tf.math.logical_not(tf.math.equal(y_true, 0))
    
    #calculating the loss
    loss_ = loss_function(y_true, y_pred)
    
    #converting mask dtype to loss_ dtype
    mask = tf.cast(mask, dtype=loss_.dtype)
    
    #applying the mask to loss
    loss_ = loss_*mask
    
    #getting mean over all the values
    loss_ = tf.reduce_mean(loss_)
    return loss_

In [67]:
embedding_matrix.shape

(1313, 300)

In [68]:
encoder_decoder.compile(optimizer, loss = maskedLoss)

In [69]:
def bytes_to_string(arr):
    '''The generator gives provides data in bytes. This function converts them back to strings for manipulation'''
    for i in range(len(arr)):
        arr[i] = arr[i].decode('utf-8')
    return arr

In [70]:
def convert(images, reports):
    '''This function takes the batch of data and converts them into a new dataset(A WORD BY WORD DATASET)'''
    imgs = []
    in_reports = []
    out_reports = []
    for i in range(len(images)):
        sequence = [tokenizer.word_index[e] for e in reports[i].split() if e in tokenizer.word_index.keys()]
      #  print(sequence)
        for j in range(1,len(sequence)):
            in_seq = sequence[:j]
            out_seq = sequence[j]
            out_seq = tf.keras.utils.to_categorical(out_seq, num_classes=vocab_size)
            imgs.append(images[i])
          #  print(in_seq)
            in_reports.append(in_seq)
           # print(out_seq)
            out_reports.append(out_seq)
    return np.array(imgs), np.array(in_reports), np.array(out_reports)

In [71]:
EPOCH = 20

In [72]:
current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
train_log_dir = 'Tensorboard/logs_m1/fit3/' + current_time + '/train'
val_log_dir = 'Tensorboard/logs_m1/fit3/' + current_time + '/test'
train_summary_writer = tf.summary.create_file_writer(train_log_dir)
val_summary_writer = tf.summary.create_file_writer(val_log_dir)

In [74]:
epoch_train_loss = []
epoch_val_loss = []

for epoch in range(20):
    print('EPOCH : ',epoch+1)
    start = time.time()
    batch_loss_tr = 0
    batch_loss_vl = 0
    
    for img, report in train_generator:
        r1 = bytes_to_string(report.numpy())
        img_input, rep_input, output_word = convert(img.numpy(), r1)
        rep_input = pad_sequences(rep_input, maxlen=153, padding='post')
        results = encoder_decoder.train_on_batch([img_input, rep_input], output_word)

        batch_loss_tr += results
    
    train_loss = batch_loss_tr/(len(train_dataset)//14)
 #   print('Saving Tensorboard')
    with train_summary_writer.as_default():
        tf.summary.scalar('loss', train_loss, step = epoch)
    
    for img, report in cv_generator:
        
        r1 = bytes_to_string(report.numpy())
        img_input, rep_input, output_word = convert(img.numpy(), r1)
        rep_input = pad_sequences(rep_input, maxlen=153, padding='post')
        results = encoder_decoder.test_on_batch([img_input, rep_input], output_word)
        batch_loss_vl += results
    
    val_loss = batch_loss_vl/(len(cv_dataset)//14)
    
    with val_summary_writer.as_default():
        tf.summary.scalar('loss', val_loss, step = epoch)

    epoch_train_loss.append(train_loss)

    epoch_val_loss.append(val_loss)
    
    print('Training Loss: {},  Val Loss: {}'.format(train_loss, val_loss))
    print('Time Taken for this Epoch : {} sec'.format(time.time()-start))   
    encoder_decoder.save_weights('Weights_re/encoder_decoder_epoch_'+ str(epoch+1) + '.h5')

EPOCH :  1


  return np.array(imgs), np.array(in_reports), np.array(out_reports)


Training Loss: 0.0033988541603519194,  Val Loss: 0.0030873050132105427
Time Taken for this Epoch : 392.8474745750427 sec
EPOCH :  2
Training Loss: 0.0029646353669413902,  Val Loss: 0.002589465021305988
Time Taken for this Epoch : 413.4568979740143 sec
EPOCH :  3
Training Loss: 0.0025642738902472515,  Val Loss: 0.0022834883176631503
Time Taken for this Epoch : 339.41609168052673 sec
EPOCH :  4
Training Loss: 0.0022942615596919643,  Val Loss: 0.0020640030570869003
Time Taken for this Epoch : 331.6139030456543 sec
EPOCH :  5
Training Loss: 0.0021123053081536372,  Val Loss: 0.0019294517415185128
Time Taken for this Epoch : 330.2767524719238 sec
EPOCH :  6
Training Loss: 0.0019868127241426583,  Val Loss: 0.0018830453045666218
Time Taken for this Epoch : 331.7625858783722 sec
EPOCH :  7
Training Loss: 0.0018867052644871327,  Val Loss: 0.001786282627783235
Time Taken for this Epoch : 3036.2138550281525 sec
EPOCH :  8
Training Loss: 0.0018009650980306117,  Val Loss: 0.001722438294710892
Time T

In [49]:
train_dataset

Unnamed: 0.1,Unnamed: 0,desc,img1,img2,img3,img4,img5
0,0,startseq right upper lobe airspace disease con...,CXR3069_IM-1432-1001,CXR3069_IM-1432-2001,,,
1,1,startseq mediastinal contours are normal lungs...,CXR2629_IM-1116-1001,CXR2629_IM-1116-2001,,,
2,2,startseq cardiomediastinal contours are unchan...,CXR1888_IM-0576-1001,CXR1888_IM-0576-4004,,,
3,3,startseq heart size normal the lungs are clear...,CXR2248_IM-0844-1001,CXR2248_IM-0844-1002,,,
4,4,startseq the cardiomediastinal contours are wi...,CXR3408_IM-1648-1001,CXR3408_IM-1648-1002,,,
...,...,...,...,...,...,...,...
2063,2331,startseq the heart normal size the mediastinum...,CXR2243_IM-0840-1001,CXR2243_IM-0840-2001,CXR2243_IM-0840-3001,CXR2243_IM-0840-4001,
2064,2332,startseq normal heart size normal mediastinal ...,CXR1356_IM-0231-1001,CXR1356_IM-0231-2001,,,
2065,2333,startseq frontal and lateral views the chest s...,CXR1883_IM-0572-1001,CXR1883_IM-0572-2001,,,
2066,2334,startseq normal heart size and mediastinal con...,CXR2622_IM-1110-1001,CXR2622_IM-1110-1002,,,
