In [35]:
import numpy as np
from numpy import array
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
from keras.applications import EfficientNetV2S
from keras.utils import load_img, img_to_array, to_categorical, pad_sequences
from keras.preprocessing.image import ImageDataGenerator
from keras.preprocessing.text import Tokenizer
from keras.models import Model, Sequential
from keras.layers import MaxPooling2D, Flatten, Dense, Dropout, Conv2D, LSTM, Embedding, Add, Input
from sklearn.model_selection import train_test_split
from os import listdir
import os
import shutil
import re

In [36]:
imagefolder = 'flickr8k/images/'
imagesize = 224

In [37]:
df1 = pd.read_pickle('caption1')
df1.head()

caption_order,image,1
0,1000268201_693b08cb0e.jpg,A child in a pink dress is climbing up a set o...
1,1001773457_577c3a7d70.jpg,A black dog and a spotted dog are fighting
2,1002674143_1b742ab4b8.jpg,A little girl covered in paint sits in front o...
3,1003163366_44323f5815.jpg,A man lays on a bench while his dog sits by him .
4,1007129816_e794419615.jpg,A man in an orange hat starring at something .


In [38]:
df1.rename(columns={1:'caption1'},inplace=True)
df1.head()

caption_order,image,caption1
0,1000268201_693b08cb0e.jpg,A child in a pink dress is climbing up a set o...
1,1001773457_577c3a7d70.jpg,A black dog and a spotted dog are fighting
2,1002674143_1b742ab4b8.jpg,A little girl covered in paint sits in front o...
3,1003163366_44323f5815.jpg,A man lays on a bench while his dog sits by him .
4,1007129816_e794419615.jpg,A man in an orange hat starring at something .


In [39]:
df1_sample, _, _, _ = train_test_split(df1, df1, train_size=0.2, random_state=2)

In [40]:
df1_sample.shape

(1618, 2)

In [41]:
df1_sample['caption1'] = 'start ' + df1_sample['caption1'] + ' end'
df1_sample.head()

caption_order,image,caption1
6687,3644142276_caed26029e.jpg,start a blond girl and brunette brown hanging ...
6793,3670907052_c827593564.jpg,start A BMX rider in a red and black outfit is...
7916,774009278_8e75b7d498.jpg,start A dog is jumping to play in the water end
3742,3027399066_ca85495775.jpg,start A group of greyhound dogs racing with mu...
2122,2511019188_ca71775f2d.jpg,start A dog with a Frisbee in front of a brown...


In [42]:
df1_sample['caption1'] = df1_sample['caption1'].apply(lambda text: text.lower())
df1_sample['caption1'] = df1_sample['caption1'].apply(lambda text: re.sub(r'[^\w\s]', '', text))
df1_sample['caption1'] = df1_sample['caption1'].apply(lambda text: text.strip())

In [43]:
df1_sample['caption1'][0]

'start a child in a pink dress is climbing up a set of stairs in an entry way  end'

In [44]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df1_sample['caption1'])
df1_sample['tokenized'] = tokenizer.texts_to_sequences(df1_sample['caption1'])

In [45]:
df1_sample.head()

caption_order,image,caption1,tokenized
6687,3644142276_caed26029e.jpg,start a blond girl and brunette brown hanging ...,"[3, 1, 181, 15, 7, 901, 17, 296, 48, 902, 400, 2]"
6793,3670907052_c827593564.jpg,start a bmx rider in a red and black outfit is...,"[3, 1, 484, 202, 4, 1, 28, 7, 13, 254, 8, 41, ..."
7916,774009278_8e75b7d498.jpg,start a dog is jumping to play in the water end,"[3, 1, 9, 8, 41, 20, 105, 4, 5, 22, 2]"
3742,3027399066_ca85495775.jpg,start a group of greyhound dogs racing with mu...,"[3, 1, 26, 10, 255, 42, 203, 12, 626, 627, 93,..."
2122,2511019188_ca71775f2d.jpg,start a dog with a frisbee in front of a brown...,"[3, 1, 9, 12, 1, 161, 4, 47, 10, 1, 17, 9, 2]"


In [46]:
max_length = max(len(d.split()) for d in df1_sample['caption1'])

In [47]:
max_length

33

In [48]:
#x is the image
#x2 is the natural language caption
X1 = df1_sample['image']
y = df1_sample['tokenized']

In [49]:
y

6687    [3, 1, 181, 15, 7, 901, 17, 296, 48, 902, 400, 2]
6793    [3, 1, 484, 202, 4, 1, 28, 7, 13, 254, 8, 41, ...
7916               [3, 1, 9, 8, 41, 20, 105, 4, 5, 22, 2]
3742    [3, 1, 26, 10, 255, 42, 203, 12, 626, 627, 93,...
2122        [3, 1, 9, 12, 1, 161, 4, 47, 10, 1, 17, 9, 2]
                              ...                        
3606           [3, 1, 14, 69, 36, 5, 74, 199, 214, 28, 2]
5704         [3, 1, 11, 127, 1, 437, 1979, 52, 1, 177, 2]
6637    [3, 1, 11, 124, 25, 210, 52, 23, 35, 92, 20, 3...
2575    [3, 1, 762, 1980, 5, 1981, 10, 296, 8, 895, 4,...
7336       [3, 1, 17, 9, 158, 52, 20, 216, 21, 67, 78, 2]
Name: tokenized, Length: 1618, dtype: object

In [50]:
len(y)

1618

In [51]:
vocab_size = len(tokenizer.word_index) + 1
print(f'Vocabulary Size:  {vocab_size}')

Vocabulary Size:  1982


In [52]:
X1_remain, X1_test, y_remain, y_test = train_test_split(X1, y, train_size=0.8, random_state=2)

In [53]:
X1_remain.shape

(1294,)

In [54]:
tokenizer.word_index

{'a': 1,
 'end': 2,
 'start': 3,
 'in': 4,
 'the': 5,
 'on': 6,
 'and': 7,
 'is': 8,
 'dog': 9,
 'of': 10,
 'man': 11,
 'with': 12,
 'black': 13,
 'boy': 14,
 'girl': 15,
 'white': 16,
 'brown': 17,
 'are': 18,
 'at': 19,
 'to': 20,
 'an': 21,
 'water': 22,
 'while': 23,
 'woman': 24,
 'his': 25,
 'group': 26,
 'two': 27,
 'red': 28,
 'people': 29,
 'blue': 30,
 'shirt': 31,
 'ball': 32,
 'playing': 33,
 'wearing': 34,
 'standing': 35,
 'through': 36,
 'child': 37,
 'another': 38,
 'running': 39,
 'grass': 40,
 'jumping': 41,
 'dogs': 42,
 'green': 43,
 'down': 44,
 'over': 45,
 'snow': 46,
 'front': 47,
 'by': 48,
 'sitting': 49,
 'beach': 50,
 'holding': 51,
 'up': 52,
 'near': 53,
 'yellow': 54,
 'from': 55,
 'player': 56,
 'into': 57,
 'walking': 58,
 'field': 59,
 'runs': 60,
 'for': 61,
 'pink': 62,
 'her': 63,
 'children': 64,
 'mouth': 65,
 'bike': 66,
 'orange': 67,
 'stands': 68,
 'jumps': 69,
 'person': 70,
 'crowd': 71,
 'as': 72,
 'street': 73,
 'air': 74,
 'jacket': 75,
 

In [55]:
traindirectory = 'flickr8k/images/test'
for filename in X1_test:
    source_path = os.path.join(imagefolder, filename)
    destination_path = os.path.join(traindirectory, filename)
    shutil.copyfile(source_path, destination_path)

traindirectory = 'flickr8k/images/train'
for filename in X1_remain:
    source_path = os.path.join(imagefolder, filename)
    destination_path = os.path.join(traindirectory, filename)
    shutil.copyfile(source_path, destination_path)

In [56]:
X1_remain.to_pickle('remainpics')
X1_test.to_pickle('testpics')

In [57]:
y_remain.to_pickle('remaintokens')
y_test.to_pickle('testtokens')

In [58]:
remainimages = []

#turning images into arrays
for filename, token in zip(X1_remain, y_remain):
    image_path = os.path.join(imagefolder, filename)  
    image = load_img(image_path, target_size=(224, 224,3))  
    image_array = img_to_array(image) / 255.0 
    remainimages.append(image_array)
array(remainimages)


array([[[[0.40392157, 0.62352943, 0.7294118 ],
         [0.40392157, 0.62352943, 0.7294118 ],
         [0.40392157, 0.62352943, 0.7294118 ],
         ...,
         [0.40392157, 0.6156863 , 0.7254902 ],
         [0.40784314, 0.61960787, 0.7294118 ],
         [0.4117647 , 0.6117647 , 0.7254902 ]],

        [[0.40392157, 0.62352943, 0.7294118 ],
         [0.40392157, 0.62352943, 0.7294118 ],
         [0.40392157, 0.62352943, 0.7294118 ],
         ...,
         [0.38431373, 0.59607846, 0.7058824 ],
         [0.38431373, 0.59607846, 0.7058824 ],
         [0.39607844, 0.59607846, 0.70980394]],

        [[0.40392157, 0.62352943, 0.7294118 ],
         [0.40392157, 0.62352943, 0.7294118 ],
         [0.40392157, 0.62352943, 0.7294118 ],
         ...,
         [0.4       , 0.61960787, 0.7254902 ],
         [0.40392157, 0.62352943, 0.7294118 ],
         [0.4       , 0.6117647 , 0.72156864]],

        ...,

        [[0.7137255 , 0.56078434, 0.30588236],
         [0.68235296, 0.5254902 , 0.28235295]

In [59]:
array(remainimages).shape

(1294, 224, 224, 3)

In [60]:
remainimages = np.array(remainimages)

In [123]:
def generate_sequences(photo, tokens, max_length, vocab_size):
    x1 = []
    x2 = []
    y = []
    for a in tokens:
        for n in range(len(a)):
            inseq, outseq = a[:n], a[n]
            inseq = pad_sequences([inseq], maxlen=max_length, padding='post')
            outseq = to_categorical([outseq], num_classes=vocab_size)
            x1.append(photo)
            x2.append(inseq)
            y.append(outseq)
    return array(x1), array(x2), array(y)

In [124]:
def data_generator(images, tokens, max_length, vocab_size):
    for t in range(len(tokens)):
        photo = images[t]
        inimg, inseq, outword = generate_sequences(photo, tokens, max_length, vocab_size)
        yield [inimg, inseq], outword

In [125]:
generator = data_generator(remainimages, y_remain, max_length, vocab_size)
inputs, outputs = next(generator)
print(inputs[0].shape)
print(inputs[1].shape)
print(outputs.shape)

(16906, 224, 224, 3)
(16906, 1, 33)
(16906, 1, 1982)


## HELP HOW DO I GET THE BATCH SHAPES TO WORK????? OBV ITS NOT DOING THIS PER LINE IN THE REMAIN IMAGES ARRAY!!

In [111]:
type(y_remain)

pandas.core.series.Series

In [91]:
for a in y_remain:
    print(a)
    for n in range(len(a)):
        print(n)
    break

[3, 1, 490, 94, 146, 8, 1717, 48, 38, 2]
0
1
2
3
4
5
6
7
8
9


In [80]:
y_remainlist = y_remain.tolist()

In [83]:
y_remainlist

[[3, 1, 490, 94, 146, 8, 1717, 48, 38, 2],
 [3, 27, 42, 539, 92, 20, 272, 2],
 [3, 29, 289, 4, 1225, 300, 2],
 [3, 1, 1959, 1960, 5, 1961, 6, 25, 83, 2],
 [3, 1, 11, 7, 1, 98, 37, 233, 90, 5, 50, 558, 2],
 [3, 1, 159, 56, 808, 20, 809, 5, 32, 2],
 [3, 1, 15, 391, 55, 1, 22, 387, 2],
 [3, 1, 14, 257, 1, 154, 6, 25, 111, 6, 1, 301, 2],
 [3, 1, 14, 4, 153, 313, 23, 35, 4, 442, 22, 2],
 [3, 1, 181, 15, 35, 4, 47, 10, 1, 87, 737, 569, 2],
 [3, 1, 14, 34, 220, 6, 25, 182, 119, 19, 5, 83, 2],
 [3, 1, 24, 7, 1, 98, 15, 18, 35, 235, 12, 266, 761, 2],
 [3, 82, 126, 18, 415, 52, 1, 1626, 6, 1, 115, 1627, 2],
 [3, 1, 14, 403, 80, 10, 25, 481, 270, 21, 244, 262, 2],
 [3, 1, 14, 4, 13, 166, 7, 43, 425, 69, 207, 12, 25, 111, 2],
 [3, 1, 159, 56, 589, 5, 32, 270, 1, 170, 2],
 [3, 1, 17, 7, 16, 9, 12, 1, 17, 7, 13, 306, 39, 310, 512, 2],
 [3,
  1,
  181,
  15,
  4,
  1,
  43,
  138,
  7,
  1401,
  341,
  757,
  68,
  4,
  47,
  10,
  1,
  811,
  212,
  7,
  1,
  11,
  2],
 [3, 1, 71, 10, 29, 18, 35, 12

In [84]:
y_remain

7432             [3, 1, 490, 94, 146, 8, 1717, 48, 38, 2]
1721                     [3, 27, 42, 539, 92, 20, 272, 2]
2237                        [3, 29, 289, 4, 1225, 300, 2]
1375            [3, 1, 1959, 1960, 5, 1961, 6, 25, 83, 2]
7048     [3, 1, 11, 7, 1, 98, 37, 233, 90, 5, 50, 558, 2]
                              ...                        
2990                     [3, 82, 13, 42, 18, 6, 1, 50, 2]
3335        [3, 1, 241, 234, 41, 4, 5, 74, 45, 1, 273, 2]
767     [3, 27, 42, 18, 39, 36, 22, 176, 1, 204, 7, 28...
6411           [3, 1, 11, 8, 33, 12, 1, 9, 4, 134, 22, 2]
3803    [3, 1, 11, 107, 52, 21, 796, 561, 23, 222, 7, ...
Name: tokenized, Length: 1294, dtype: object

Modeling using Efficientnetv2s

In [126]:
basemodel = EfficientNetV2S(
    weights='imagenet',
    include_preprocessing=True,
    include_top=False,
)
basemodel.summary()

Model: "efficientnetv2-s"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, None, None,  0           []                               
                                 3)]                                                              
                                                                                                  
 rescaling (Rescaling)          (None, None, None,   0           ['input_1[0][0]']                
                                3)                                                                
                                                                                                  
 stem_conv (Conv2D)             (None, None, None,   648         ['rescaling[0][0]']              
                                24)                                                

In [127]:
for layer in basemodel.layers:
    layer.trainable = False

In [128]:
inputs1 = Input(shape=(224, 224, 3))
fe1 = Dropout(0.5)(inputs1)
fe2 = Dense(256, activation='relu')(fe1)

inputs2 = Input(shape=(max_length,))
se1 = Embedding(len(tokenizer.word_index) + 1, 256, mask_zero=True)(inputs2)
se2 = Dropout(0.5)(se1)
se3 = LSTM(256)(se2)

decoder1 = Add()([fe2, se3])
decoder2 = Dense(256, activation='relu')(decoder1)
output = Dense(len(tokenizer.word_index) + 1, activation='softmax')(decoder2)

model = Model(inputs = [inputs1, inputs2], outputs = output)

In [129]:
model.compile(loss = 'categorical_crossentropy', 
                  optimizer = 'Adam', 
                  metrics = ['accuracy'],
                  run_eagerly=True)

In [130]:
steps = len(inputs)

In [131]:
model.fit(generator,
            epochs = 2,
            steps_per_epoch = steps,
            verbose=1,
            validation_data=0.2)

MemoryError: Unable to allocate 9.48 GiB for an array with shape (16906, 224, 224, 3) and data type float32