<a href="https://colab.research.google.com/github/ananya-carpediem08/AI-Ml2023/blob/main/ISOC_23.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import string
import os
import tensorflow as tf
from PIL import Image
from tqdm import tqdm
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.applications.inception_v3 import InceptionV3, preprocess_input
from keras.models import Model
from keras.layers import Input, Dense, Embedding, LSTM, add
from keras.callbacks import ModelCheckpoint
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import warnings


In [None]:
import os
# Tell the Kaggle CLI which directory to read its configuration from
# (presumably where kaggle.json lives on the mounted Drive — confirm).
os.environ['KAGGLE_CONFIG_DIR']='/content/drive/MyDrive'
# Notebook shell escape: download the Flickr8k dataset archive with the Kaggle CLI.
!kaggle datasets download -d adityajn105/flickr8k

Downloading flickr8k.zip to /content
100% 1.03G/1.04G [00:28<00:00, 44.4MB/s]
100% 1.04G/1.04G [00:28<00:00, 39.2MB/s]


The following cell raises an error because I cancelled the operation midway: the dataset had already been downloaded and extracted to my mounted Drive.


In [None]:
import os
import zipfile

# Path to the downloaded ZIP archive
zip_file_path = "/content/flickr8k.zip"  # Update this with the actual path

# Directory that receives the contents of the ZIP archive
extract_to_directory = "/content/drive/MyDrive"  # Update this with the desired extraction path

if os.path.isfile(zip_file_path):
    # Unzip the file and read the member list while the archive is still open.
    with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
        zip_ref.extractall(extract_to_directory)
        extracted_files = zip_ref.namelist()  # List of extracted files
    print("Extracted files:", extracted_files)
else:
    # Re-running the notebook after the archive is gone should not abort the
    # run with FileNotFoundError; report it and carry on.
    extracted_files = []
    print(
        f"Archive not found at {zip_file_path}; skipping extraction "
        f"(expecting the dataset to already be in {extract_to_directory})."
    )


FileNotFoundError: ignored

# Pre-processing the images
This step uses the Inception V3 model to convert the images into feature vectors, and includes exception handling for images in unsupported formats.


In [None]:
# making a function for loading the image and pre-processing the image

def load_image_features(directory):
    """Extract an InceptionV3 feature array for every image in *directory*.

    Each file is opened with PIL, converted to RGB, resized to 299x299,
    passed through ``preprocess_input`` and fed to InceptionV3 truncated at
    its second-to-last layer.

    Args:
        directory: Path of a folder whose entries are treated as image files.

    Returns:
        dict mapping the file name up to its first ``.`` to the array returned
        by ``model.predict`` for that image (one leading batch axis of size 1).
        Files that cannot be read as images are skipped with a warning.
    """
    # Imported locally: the file only does ``from PIL import Image``, so the
    # bare name ``PIL`` is not bound and ``PIL.UnidentifiedImageError`` would
    # raise NameError the first time an image failed to load.
    from PIL import UnidentifiedImageError

    base_model = InceptionV3(weights='imagenet')
    # Drop the final classification layer; the layer before it is the feature output.
    model = Model(inputs=base_model.inputs, outputs=base_model.layers[-2].output)
    features = {}
    for img_name in tqdm(os.listdir(directory)):
        filename = os.path.join(directory, img_name)
        try:
            # The context manager closes the file handle once pixels are loaded.
            with Image.open(filename) as img:
                # Force three channels so grayscale/RGBA/palette images have
                # the channel layout the network expects.
                image = img.convert('RGB').resize((299, 299))
            image = np.expand_dims(image, axis=0)
            image = preprocess_input(image)
            feature = model.predict(image, verbose=0)
        except (OSError, UnidentifiedImageError) as e:
            warnings.warn(f"Error processing {filename}: {e}")
            continue  # Skip to the next image
        image_id = img_name.split('.')[0]
        features[image_id] = feature
    return features

Pre-Processing the captions


In [None]:
def preprocess_captions(captions):
    """Clean every caption in place and return the same mapping.

    For each caption the words are lowercased, stripped of ASCII punctuation,
    and then dropped when they are a single character long or contain
    anything other than letters. The surviving words are re-joined with
    single spaces.

    Args:
        captions: dict mapping an image id to a mutable list of caption strings.

    Returns:
        The same ``captions`` object, with each list entry replaced by its
        cleaned version.
    """
    strip_punct = str.maketrans('', '', string.punctuation)
    for caption_list in captions.values():
        for idx, raw in enumerate(caption_list):
            normalised = (token.lower().translate(strip_punct) for token in raw.split())
            kept = [token for token in normalised if len(token) > 1 and token.isalpha()]
            caption_list[idx] = ' '.join(kept)
    return captions

In [None]:
# Creating tokens for the captions
def create_tokenizer(captions):
    """Fit a Keras ``Tokenizer`` on every caption in *captions*.

    Args:
        captions: dict mapping an image id to a list of caption strings.

    Returns:
        A ``Tokenizer`` fitted on the flattened list of all captions.
    """
    # Flatten the per-image caption lists into one list of texts.
    all_captions = [
        text
        for caption_list in captions.values()
        for text in caption_list
    ]
    fitted = Tokenizer()
    fitted.fit_on_texts(all_captions)
    return fitted

*Since the datapoints in the caption file are of the format (image_name.jpg,caption), it is necessary to separate the captions from the image names.*

In [None]:
def load_annotations(file_path):
    """Read an ``image_name,caption`` file into a dict of caption lists.

    Each line is split at its FIRST comma only, so captions that themselves
    contain commas are kept intact rather than discarded. Lines without a
    comma (including blank lines) are skipped, and a leading
    ``image,caption`` header row is not treated as data.

    Args:
        file_path: Path of the annotations text file.

    Returns:
        dict mapping each image name to the list of its captions, in file order.
    """
    image_captions = {}
    with open(file_path, 'r', encoding='utf-8') as file:
        for line_no, line in enumerate(file):
            image_name, separator, caption = line.strip().partition(',')
            if not separator:
                continue  # No comma: the line has no image name / caption pair
            image_name = image_name.strip()
            caption = caption.strip()
            # Skip the column header instead of storing it as an "image".
            if line_no == 0 and (image_name.lower(), caption.lower()) == ('image', 'caption'):
                continue
            image_captions.setdefault(image_name, []).append(caption)

    return image_captions


In [None]:
# Create sequences of images, input sequences, and output words for training the model
def create_sequences(tokenizer, captions, image_features, vocab_size, max_length):
    """Build (image feature, partial caption, next word) training triples.

    Every caption is turned into an integer sequence and expanded into one
    sample per prefix: the prefix (padded to ``max_length``) is the text
    input and the following word, one-hot encoded over ``vocab_size``
    classes, is the target.

    Args:
        tokenizer: Object providing ``texts_to_sequences``.
        captions: dict mapping an image id to a list of caption strings.
        image_features: dict mapping an image id to an indexable whose first
            element is the feature vector.
        vocab_size: Number of classes for the one-hot target.
        max_length: Length the input sequences are padded to.

    Returns:
        Tuple ``(X1, X2, y)`` of NumPy arrays: image features, padded input
        sequences and one-hot targets. Images with no stored features are
        skipped.
    """
    X1, X2, y = [], [], []
    for img_id, caption_list in captions.items():
        # Caption ids may carry a file extension ("name.jpg") while
        # load_image_features keys its dict by the name up to the first ".",
        # so try the id as-is and then the extension-less form.
        entry = image_features.get(img_id)
        if entry is None:
            entry = image_features.get(img_id.split('.')[0])
        if entry is None:
            continue  # No features were extracted for this image
        image_feature = entry[0]
        for caption in caption_list:
            seq = tokenizer.texts_to_sequences([caption])[0]
            for i in range(1, len(seq)):
                in_seq, out_seq = seq[:i], seq[i]
                in_seq = pad_sequences([in_seq], maxlen=max_length)[0]
                out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]
                X1.append(image_feature)
                X2.append(in_seq)
                y.append(out_seq)
    return np.array(X1), np.array(X2), np.array(y)

In [None]:
# Define the image captioning model
def define_model(vocab_size, max_length):
    """Build and compile the merge-style captioning network.

    An image branch (2048-d input -> Dense 256) and a text branch
    (Embedding -> LSTM 256) are summed, passed through a Dense 256 layer and
    a softmax over ``vocab_size`` classes.

    Args:
        vocab_size: Size of the embedding input and of the softmax output.
        max_length: Length of the integer caption sequences.

    Returns:
        A compiled Keras ``Model`` taking ``[image_features, sequence]``.
    """
    # Image branch
    image_input = Input(shape=(2048,))
    image_dense = Dense(256, activation='relu')(image_input)

    # Text branch; index 0 is treated as padding by the embedding mask
    caption_input = Input(shape=(max_length,))
    caption_embedded = Embedding(vocab_size, 256, mask_zero=True)(caption_input)
    caption_encoded = LSTM(256)(caption_embedded)

    # Decoder: element-wise sum of both branches, then next-word distribution
    merged = add([image_dense, caption_encoded])
    hidden = Dense(256, activation='relu')(merged)
    next_word = Dense(vocab_size, activation='softmax')(hidden)

    captioner = Model(inputs=[image_input, caption_input], outputs=next_word)
    captioner.compile(loss='categorical_crossentropy', optimizer='adam')
    return captioner

In [None]:
# Load Flickr 8k dataset annotations
annotations_path = '/content/drive/MyDrive/captions.txt'
# Map each image name to its list of raw captions.
captions = load_annotations(annotations_path)
# Clean the captions (lowercase, strip punctuation, drop one-character and
# non-alphabetic words); the lists are modified in place.
captions = preprocess_captions(captions)
# Bare expression so the notebook displays the resulting dict.
captions

{'image': ['caption'],
 '1000268201_693b08cb0e.jpg': ['child in pink dress is climbing up set of stairs in an entry way',
  'girl going into wooden building',
  'little girl climbing into wooden playhouse',
  'little girl climbing the stairs to her playhouse',
  'little girl in pink dress going into wooden cabin'],
 '1001773457_577c3a7d70.jpg': ['black dog and spotted dog are fighting',
  'black dog and tricolored dog playing with each other on the road',
  'black dog and white dog with brown spots are staring at each other in the street',
  'two dogs of different breeds looking at each other on the road',
  'two dogs on pavement moving toward each other'],
 '1002674143_1b742ab4b8.jpg': ['little girl covered in paint sits in front of painted rainbow with her hands in bowl',
  'little girl is sitting in front of large painted rainbow',
  'small girl in the grass plays with fingerpaints in front of white canvas with rainbow on it',
  'there is girl with pigtails sitting in front of rainb

In [None]:
# Folder on the mounted Drive that holds the dataset images.
image_features_directory = '/content/drive/MyDrive/Images'
# Run every image through InceptionV3; returns a dict of image id -> feature array.
image_features = load_image_features(image_features_directory)
# Bare expression so the notebook displays the resulting dict.
image_features

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/inception_v3/inception_v3_weights_tf_dim_ordering_tf_kernels.h5


100%|██████████| 8089/8089 [15:37<00:00,  8.63it/s]


{'1926129518_4350f4f552': array([[0.21642402, 0.6197052 , 0.07396493, ..., 0.12976904, 0.7567191 ,
         0.28012124]], dtype=float32),
 '1924234308_c9ddcf206d': array([[0.25118873, 0.06160817, 1.0082335 , ..., 0.07204767, 0.9761707 ,
         0.78691566]], dtype=float32),
 '1925434818_2949a8f6d8': array([[0.4691009 , 0.15489951, 0.48743623, ..., 0.35811326, 0.44115573,
         0.9695739 ]], dtype=float32),
 '1928319708_ccf1f4ee72': array([[0.08152099, 0.3943622 , 0.059411  , ..., 0.17081073, 0.73480487,
         0.9359209 ]], dtype=float32),
 '1931690777_897a7d8ab6': array([[0.86859936, 1.0595572 , 0.6533682 , ..., 0.7826594 , 0.80521756,
         0.07662986]], dtype=float32),
 '1932314876_9cc46fd054': array([[0.7451909 , 0.04939948, 0.44436955, ..., 0.1633572 , 0.37904403,
         0.23047443]], dtype=float32),
 '1932161768_996eadac87': array([[0.02780378, 0.05399302, 0.08157939, ..., 0.7927498 , 0.40521774,
         0.07050139]], dtype=float32),
 '1936215201_d03a75cbba': array([[

In [None]:
# Creating tokenizer and further preprocessing captions
tokenizer = create_tokenizer(captions)
# +1 because the tokenizer's word indices start at 1; index 0 is left for padding.
vocab_size = len(tokenizer.word_index) + 1
# Longest caption, measured in whitespace-separated words.
max_length = max(len(caption.split()) for caption_list in captions.values() for caption in caption_list)

In [None]:
# Creating sequences for training the model
def create_sequences(tokenizer, captions, image_features, vocab_size, max_length):
    """Build (image feature, partial caption, next word) training triples.

    Every caption is turned into an integer sequence and expanded into one
    sample per prefix: the prefix (padded to ``max_length``) is the text
    input and the following word, one-hot encoded over ``vocab_size``
    classes, is the target. Images without stored features are skipped.

    Args:
        tokenizer: Object providing ``texts_to_sequences``.
        captions: dict mapping an image id to a list of caption strings.
        image_features: dict mapping an image id to its feature array.
        vocab_size: Number of classes for the one-hot target.
        max_length: Length the input sequences are padded to.

    Returns:
        Tuple ``(X1, X2, y)`` of NumPy arrays: image features, padded input
        sequences and one-hot targets.
    """
    X1, X2, y = [], [], []
    for img_id, caption_list in captions.items():
        # Caption ids may carry a file extension ("name.jpg") while
        # load_image_features keys its dict by the name up to the first ".",
        # so try the id as-is and then the extension-less form. Without the
        # fallback every lookup misses and the training set comes out empty.
        image_feature = image_features.get(img_id)
        if image_feature is None:
            image_feature = image_features.get(img_id.split('.')[0])
        if image_feature is None:
            continue  # No features were extracted for this image

        # Features stored straight from model.predict keep a leading batch
        # axis of size 1; drop it so each sample is a plain vector.
        image_feature = np.asarray(image_feature)
        if image_feature.ndim == 2 and image_feature.shape[0] == 1:
            image_feature = image_feature[0]

        for caption in caption_list:
            seq = tokenizer.texts_to_sequences([caption])[0]
            for i in range(1, len(seq)):
                in_seq, out_seq = seq[:i], seq[i]
                in_seq = pad_sequences([in_seq], maxlen=max_length)[0]
                out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]
                X1.append(image_feature)
                X2.append(in_seq)
                y.append(out_seq)
    return np.array(X1), np.array(X2), np.array(y)




# Training the model

In [None]:
# Expand every caption into (image feature, padded prefix, one-hot next word) samples.
X1train, X2train, ytrain = create_sequences(tokenizer, captions, image_features, vocab_size, max_length)


In [None]:
# Defining the model
# define_model returns the network already compiled (categorical cross-entropy, Adam).
model = define_model(vocab_size, max_length)

In [None]:
def data_generator(X1, X2, y, vocab_size, max_length, batch_size):
    """Yield training batches forever, cycling over the dataset.

    Args:
        X1: Sequence of image feature vectors.
        X2: Sequence of integer input sequences (padded or not).
        y: Targets, either integer word indices or rows that are already
            one-hot encoded with ``vocab_size`` columns.
        vocab_size: Number of classes for the one-hot target.
        max_length: Length the input sequences are padded to.
        batch_size: Number of samples per yielded batch.

    Yields:
        ``([X1_batch, X2_batch], y_batch)`` with ``y_batch`` one-hot encoded.

    Raises:
        ValueError: If ``X1`` is empty; the endless loop would otherwise spin
            without ever yielding a batch.
    """
    if len(X1) == 0:
        raise ValueError("data_generator received an empty dataset")

    while True:
        for start in range(0, len(X1), batch_size):
            stop = start + batch_size
            X1_batch = np.array(X1[start:stop])
            X2_batch = pad_sequences(X2[start:stop], maxlen=max_length)
            y_batch = np.array(y[start:stop])

            # create_sequences already returns one-hot targets; encoding those
            # again would add a spurious extra axis. Only encode integer labels.
            already_one_hot = y_batch.ndim == 2 and y_batch.shape[1] == vocab_size
            if not already_one_hot:
                y_batch = to_categorical(y_batch, num_classes=vocab_size)

            yield ([X1_batch, np.array(X2_batch)], y_batch)


In [None]:
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

# Assuming you have already loaded X1train, X2train, ytrain, tokenizer, max_length, vocab_size

# Pad input sequences (a no-op for sequences already padded to max_length)
X2train_padded = pad_sequences(X2train, maxlen=max_length)

# create_sequences already returns one-hot targets. Passing them through
# to_categorical again would yield an (N, vocab_size, vocab_size) array that
# does not match the model output, so only encode plain integer labels.
ytrain_encoded = np.asarray(ytrain)
if ytrain_encoded.ndim == 1:
    ytrain_encoded = to_categorical(ytrain_encoded, num_classes=vocab_size)

# Train the model
epochs = 15
batch_size = 64


history = model.fit(
    [np.array(X1train), X2train_padded],
    ytrain_encoded,
    epochs=epochs,
    batch_size=batch_size,
    verbose=1
)

# Save the model
model.save('image_captioning_model.h5')


KeyboardInterrupt: ignored