In [335]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
from keras.applications import EfficientNetV2S
from keras.utils import load_img, img_to_array, to_categorical, pad_sequences
from keras.preprocessing.image import ImageDataGenerator
from keras.preprocessing.text import Tokenizer
from keras.models import Model, Sequential
from keras.layers import MaxPooling2D, Flatten, Dense, Dropout, Conv2D
from sklearn.model_selection import train_test_split
from os import listdir
import os
import shutil

In [318]:
imagefolder = 'flickr8k/images/'

In [319]:
df1 = pd.read_pickle('caption1')
df1.head()

caption_order,image,1
0,1000268201_693b08cb0e.jpg,A child in a pink dress is climbing up a set o...
1,1001773457_577c3a7d70.jpg,A black dog and a spotted dog are fighting
2,1002674143_1b742ab4b8.jpg,A little girl covered in paint sits in front o...
3,1003163366_44323f5815.jpg,A man lays on a bench while his dog sits by him .
4,1007129816_e794419615.jpg,A man in an orange hat starring at something .


In [320]:
df1.rename(columns={1:'caption1'},inplace=True)
df1.head()

caption_order,image,caption1
0,1000268201_693b08cb0e.jpg,A child in a pink dress is climbing up a set o...
1,1001773457_577c3a7d70.jpg,A black dog and a spotted dog are fighting
2,1002674143_1b742ab4b8.jpg,A little girl covered in paint sits in front o...
3,1003163366_44323f5815.jpg,A man lays on a bench while his dog sits by him .
4,1007129816_e794419615.jpg,A man in an orange hat starring at something .


In [321]:
X = df1['image']
y = df1['caption1']

In [322]:
y.head()

0    A child in a pink dress is climbing up a set o...
1           A black dog and a spotted dog are fighting
2    A little girl covered in paint sits in front o...
3    A man lays on a bench while his dog sits by him .
4       A man in an orange hat starring at something .
Name: caption1, dtype: object

In [323]:
#sampling down for my potato computer
X_sample, _, y_sample, _ = train_test_split(X, y, train_size=0.2, random_state=2)

In [324]:
X_remainder, X_test, y_remainder, y_test = train_test_split(X_sample, y_sample, test_size=0.2, random_state=2)

In [325]:
print(X_remainder.shape)
print(y_remainder.shape)

(1294,)
(1294,)


In [326]:
X_train, X_val, y_train, y_val = train_test_split(X_remainder, y_remainder, test_size=0.2, random_state=2)

In [327]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(y_train)
train_sequences = tokenizer.texts_to_sequences(y_train)
val_sequences = tokenizer.texts_to_sequences(y_val)
test_sequences = tokenizer.texts_to_sequences(y_test)

In [328]:
print(train_sequences[1])

[1, 10, 8, 5, 1, 351, 8, 104, 4, 3, 44]


In [329]:
traincaptions = tokenizer.sequences_to_texts(train_sequences)

In [330]:
print(traincaptions[1])

a black dog and a grey dog run on the beach


In [331]:
vocab_size = len(tokenizer.word_index) + 1
print(f'Vocabulary Size:  {vocab_size}')

Vocabulary Size:  1561


In [332]:
tokenizer.word_index

{'a': 1,
 'in': 2,
 'the': 3,
 'on': 4,
 'and': 5,
 'is': 6,
 'of': 7,
 'dog': 8,
 'man': 9,
 'black': 10,
 'with': 11,
 'boy': 12,
 'white': 13,
 'girl': 14,
 'are': 15,
 'to': 16,
 'brown': 17,
 'at': 18,
 'an': 19,
 'while': 20,
 'red': 21,
 'water': 22,
 'blue': 23,
 'woman': 24,
 'group': 25,
 'shirt': 26,
 'people': 27,
 'his': 28,
 'two': 29,
 'ball': 30,
 'playing': 31,
 'child': 32,
 'wearing': 33,
 'standing': 34,
 'running': 35,
 'through': 36,
 'down': 37,
 'over': 38,
 'green': 39,
 'dogs': 40,
 'snow': 41,
 'another': 42,
 'grass': 43,
 'beach': 44,
 'jumping': 45,
 'sitting': 46,
 'holding': 47,
 'front': 48,
 'yellow': 49,
 'by': 50,
 'pink': 51,
 'near': 52,
 'up': 53,
 'player': 54,
 'into': 55,
 'walking': 56,
 'children': 57,
 'field': 58,
 'for': 59,
 'out': 60,
 'from': 61,
 'riding': 62,
 'off': 63,
 'crowd': 64,
 'little': 65,
 'camera': 66,
 'dressed': 67,
 'runs': 68,
 'stands': 69,
 'air': 70,
 'person': 71,
 'their': 72,
 'as': 73,
 'jacket': 74,
 'her': 75,

Modeling using Efficientnetv2s

In [333]:
X_train

6770    3665569615_9a71c4b6e4.jpg
2111    2507312812_768b53b023.jpg
134     1142847777_2a0c1c2551.jpg
2686    2698614194_b4e6e11dff.jpg
7175     405615014_03be7ef618.jpg
                  ...            
2682    2697909987_128f11d1b7.jpg
5131    3336759846_5220e27deb.jpg
5014    3315353266_70f0bbb1c3.jpg
2170    2527163162_d0fb802992.jpg
7875     742073622_1206be8f7f.jpg
Name: image, Length: 1035, dtype: object

In [344]:
train_images = []
train_captions = []

# Iterate over the filenames in X_train and corresponding captions
for filename, caption in zip(X_train, train_sequences):
    image_path = os.path.join(imagefolder, filename)  # Create the complete path to each image
    image = load_img(image_path, target_size=(224, 224,3))  # Load and resize the image
    image_array = img_to_array(image) / 255.0  # Convert the image to an array and normalize the pixel values
    train_images.append(image_array)  # Append the image to the list
    train_captions.append(caption)


In [366]:
train_captions = pad_sequences(train_captions, maxlen=vocab_size, padding='post')

In [367]:
train_images = np.array(train_images)
train_captions = np.array(train_captions)

In [372]:
train_captions2 = to_categorical(train_captions, num_classes=vocab_size)

In [365]:
train_captions[1]

array([  1,  10,   8,   5,   1, 351,   8, 104,   4,   3,  44,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   

In [375]:
train_captions.shape

(1035, 1561)

In [359]:
train_images.shape

(1035, 224, 224, 3)

In [348]:
basemodel = EfficientNetV2S(
    weights='imagenet',
    include_preprocessing=True,
    include_top=False,
    input_shape=(224,224,3)
)
basemodel.summary()

Model: "efficientnetv2-s"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_8 (InputLayer)           [(None, 224, 224, 3  0           []                               
                                )]                                                                
                                                                                                  
 rescaling_7 (Rescaling)        (None, 224, 224, 3)  0           ['input_8[0][0]']                
                                                                                                  
 stem_conv (Conv2D)             (None, 112, 112, 24  648         ['rescaling_7[0][0]']            
                                )                                                                 
                                                                                   

In [349]:
for layer in basemodel.layers:
    layer.trainable = False

In [377]:
output = basemodel.output
output = Flatten()(output)
output = Dense(1561, activation='softmax')(output)

model = Model(inputs=basemodel.input, outputs=output)

In [378]:
model.compile(loss = 'categorical_crossentropy', 
                  optimizer = 'Adam', 
                  metrics = ['accuracy'])

In [379]:
model.fit(x = train_images,
          y = train_captions,
              batch_size = 32,
              epochs = 2,
              verbose=1)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x1a919b77880>