##### Copyright 2018 The TensorFlow Authors.



In [23]:
#@title Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Image captioning with visual attention

<table class="tfo-notebook-buttons" align="left">
  <td>
    <a target="_blank" href="https://www.tensorflow.org/tutorials/text/image_captioning">
    <img src="https://www.tensorflow.org/images/tf_logo_32px.png" />
    View on TensorFlow.org</a>
  </td>
  <td>
    <a target="_blank" href="https://colab.research.google.com/github/tensorflow/docs/blob/master/site/en/tutorials/text/image_captioning.ipynb">
    <img src="https://www.tensorflow.org/images/colab_logo_32px.png" />
    Run in Google Colab</a>
  </td>
  <td>
    <a target="_blank" href="https://github.com/tensorflow/docs/blob/master/site/en/tutorials/text/image_captioning.ipynb">
    <img src="https://www.tensorflow.org/images/GitHub-Mark-32px.png" />
    View source on GitHub</a>
  </td>
  <td>
    <a href="https://storage.googleapis.com/tensorflow_docs/docs/site/en/tutorials/text/image_captioning.ipynb"><img src="https://www.tensorflow.org/images/download_logo_32px.png" />Download notebook</a>
  </td>
</table>

Given an image like the example below, our goal is to generate a caption such as "a surfer riding on a wave".

![Man Surfing](https://tensorflow.org/images/surf.jpg)

*[Image Source](https://commons.wikimedia.org/wiki/Surfing#/media/File:Surfing_in_Hawaii.jpg); License: Public Domain*

To accomplish this, you'll use an attention-based model, which enables us to see what parts of the image the model focuses on as it generates a caption.

![Prediction](https://tensorflow.org/images/imcap_prediction.png)

The model architecture is similar to [Show, Attend and Tell: Neural Image Caption Generation with Visual Attention](https://arxiv.org/abs/1502.03044).

This notebook is an end-to-end example. When you run the notebook, it downloads the [MS-COCO](http://cocodataset.org/#home) dataset, preprocesses and caches a subset of images using Inception V3, trains an encoder-decoder model, and generates captions on new images using the trained model.

In this example, you will train a model on a relatively small amount of data—the first 30,000 captions  for about 20,000 images (because there are multiple captions per image in the dataset).

In [24]:
from __future__ import absolute_import, division, print_function, unicode_literals

In [12]:
try:
  # %tensorflow_version only exists in Colab.
  %tensorflow_version 2.x
except Exception:
    pass
import tensorflow as tf

# You'll generate plots of attention in order to see which parts of an image
# our model focuses on during captioning
import matplotlib.pyplot as plt

# Scikit-learn includes many helpful utilities
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

import re
import numpy as np
import os
import time
import json
from glob import glob
from PIL import Image
import pickle
from tqdm import tqdm

## Download and prepare the MS-COCO dataset

You will use the [MS-COCO dataset](http://cocodataset.org/#home) to train our model. The dataset contains over 82,000 images, each of which has at least 5 different caption annotations. The code below downloads and extracts the dataset automatically.

**Caution: large download ahead**. You'll use the training set, which is a 13GB file.

In [9]:
# annotation_zip = tf.keras.utils.get_file('captions.zip',
#                                           cache_subdir=os.path.abspath('.'),
#                                           origin = 'http://images.cocodataset.org/annotations/annotations_trainval2014.zip',
#                                           extract = True)
# annotation_file = os.path.dirname(annotation_zip)+'/annotations/captions_train2014.json'

# name_of_zip = 'train2014.zip'
# if not os.path.exists(os.path.abspath('.') + '/' + name_of_zip):
#   image_zip = tf.keras.utils.get_file(name_of_zip,
#                                       cache_subdir=os.path.abspath('.'),
#                                       origin = 'http://images.cocodataset.org/zips/train2014.zip',
#                                       extract = True)
#   PATH = os.path.dirname(image_zip)+'/train2014/'
# else:
#   PATH = os.path.abspath('.')+'/train2014/'

annotation_file = 'annotations/captions_train2014.json'
PATH = 'train2014/'

## Optional: limit the size of the training set 
To speed up training for this tutorial, you'll use a subset of 30,000 captions and their corresponding images to train our model. Choosing to use more data would result in improved captioning quality.

In [13]:
# Read the json file
with open(annotation_file, 'r') as f:
    annotations = json.load(f)

# Store captions and image names in vectors
all_captions = []
all_img_name_vector = []

for annot in annotations['annotations']:
    caption = '<start> ' + annot['caption'] + ' <end>'
    image_id = annot['image_id']
    full_coco_image_path = PATH + 'COCO_train2014_' + '%012d.jpg' % (image_id)

    all_img_name_vector.append(full_coco_image_path)
    all_captions.append(caption)

# Shuffle captions and image_names together
# Set a random state
train_captions, img_name_vector = shuffle(all_captions,
                                          all_img_name_vector,
                                          random_state=1)

# Select the first 30000 captions from the shuffled set
num_examples = 60000
train_captions = train_captions[:num_examples]
img_name_vector = img_name_vector[:num_examples]

In [14]:
print(len(train_captions), len(all_captions))
f = open('train_captions.pickle', 'wb')   #SPECIFY PATH TO LOCAL DIRECTORY IS RUNNING LOCALLY
pickle.dump(train_captions, f)
f.close()

60000 414113


In [16]:
for i in train_captions:
    print(i)

<start> A skateboarder performing a trick on a skateboard ramp. <end>
<start> a person soaring through the air on skis <end>
<start> a wood door with some boards laid against it <end>
<start> A Do Not Enter sign is posted along a road leading to a stadium. <end>
<start> Small child in a high chair eating off of a green plate.  <end>
<start> groups of people standing around the toilet area <end>
<start> a hand is holding a black and silver cellphone <end>
<start> People working on laptop computers in a student dining room <end>
<start> two birds near one another in a field  <end>
<start> A man talking on a cell phone in a park. <end>
<start> A group of men standing around a table with microphones while one makes a speech. <end>
<start> A bathroom with a toilet and sprayer attached to the wall. <end>
<start> A woman sitting on a wooden bench talking on a phone. <end>
<start> A woman with a clock on her purse at the market. <end>
<start> The surfer is offering his favorite hand signal. <e

<start> Woman in blue shirt holding a small brown bird in her hands.  <end>
<start> The scrawny black and white cat stood on the tile. <end>
<start> A cat laying on top of a bed next to pillows. <end>
<start> A pink and brown teddy bear are sitting on the bed. <end>
<start> The person looks like a little kid with skis and a snowsuit.  <end>
<start> A sandwich and some delicious looking onion rings decorate this plate. <end>
<start> A plate holds pieces of an orange donut. <end>
<start> Two skiiers and a snowboarder getting ready to ride down a mountain. <end>
<start> A small bathroom with a toilet that has buttons on the side. <end>
<start> The plane is flying up in the sky <end>
<start> A box full of apples addressed to a person named Cortland. <end>
<start> A group of people on a court playing tennis. <end>
<start> Red traffic lights viewed through a windshield on a rainy night. <end>
<start> A wooden tray holding a pizza next to a bowl of salad. <end>
<start> A pug laying on the edg

<start> A bathroom that has two mirrors and two sinks. <end>
<start> Two people riding horses side by side together
 <end>
<start> The woman sits in between two small children as they eat. <end>
<start> A large locomotive engine traveling down a graffiti covered wall. <end>
<start> A small girl wearing a pink dress plays with a frisbee <end>
<start> Hands holding small white objects on person seated one standing <end>
<start> A green apple is surrounded by a group of bananas. <end>
<start> A boat is sailing on water in front of a city. <end>
<start> a male tennis player an alien mouth and a racket <end>
<start> A home computer setup with two laptops on a desk. <end>
<start> A young girl in glasses holding onto a tennis racquet <end>
<start> Several different fruits and vegetables arranged on a counter. <end>
<start> A street sign attached to a pole at an intersection. <end>
<start> A woman laying on top of a surfboard next to a black cat. <end>
<start> A man riding a skateboard with a 

<start> A young man in a kitchen shapes dough into balls. <end>
<start> A plane passing down a runway, near a forest. <end>
<start> A man in white playing croquet on green grass. <end>
<start> Spectators and television cameras watching professional baseball in the sun <end>
<start> A horse wears a harness as it stands in a parking lot. <end>
<start> A man is holding an umbrella while riding an elephant <end>
<start> A restroom tub, sink, and toilet with the lid up <end>
<start> A female surfer runs into the ocean with her yellow surfboard. <end>
<start> some people are walking on the curbs and in the street <end>
<start> A cow with horns in a beautiful green valley with cloudy skies above it. <end>
<start> Patrons dining in an outdoor patio area with a staff member by the entrance. <end>
<start> REMOTE CONTROL SITTING NEXT TO AN OPEN TOILET <end>
<start> Fancily decorated cupcakes are on a cooling rack. <end>
<start> The small dog walks with a Frisbee in it's mouth. <end>
<start> A red

<start> A very elegant bathroom with marble and accented with red. <end>
<start> A stop sign and obstructions stop travelers on a path. <end>
<start> The bird is sitting next to red berries on the branch. <end>
<start> Store window display featuring tennis balls and mannequins wearing lingerie. <end>
<start> Kites being flown from city park in sky. <end>
<start> A close up of a breaded food with vegetables and cheese. <end>
<start> A zebra standing in a field with an impala in the distance. <end>
<start> A baseball player at home plate who has just broken a bat after hitting the ball <end>
<start> A dog is running along side a beach <end>
<start> An elderly gentleman is playing a bowling game. <end>
<start> A white boat traveling across a lake with people and a dog. <end>
<start> Several motorcycles sitting parked in the grass near horses. <end>
<start> A man is wearing a cap and has an orange tie.  <end>
<start> A baseball player at bat waiting for a pitch. <end>
<start> A man stands 

<start> An airplane flying through the air with a man hanging on. <end>
<start> A boy in the air after going up a ramp on a skateboard.  <end>
<start> View of a white church with clutter by the door. <end>
<start> A woman is standing by busses using a cell phone. <end>
<start> A man holds open a box of pizza that's stacked on top of other boxes of pizza. <end>
<start> apples and a cantaloupe in a dish on a table <end>
<start> Someone is hand gliding through the water.  <end>
<start> Women playing partners in tennis shake each others hands on the tennis court. <end>
<start> Raw food is arranged artistically on a plate. <end>
<start> a couple of goats are standing in a field <end>
<start> A train traveling on top of a railway that runs above ground through a city. <end>
<start> there is a piece of chocolate cake on a paper plate <end>
<start> A person riding a horse around an arena. <end>
<start> Women in dresses and men in tuxedos standing in a park.  <end>
<start> A horse-drawn carriag

<start> A group of people are sitting in a meeting room. <end>
<start> A toddler wearing a party hat has cake all over his hands and face.  <end>
<start> A airplane flying high up in the sky. <end>
<start> A plate with rice and some sort of meat smothered in gravy, surrounded bowls of other meals <end>
<start> Two young men playing the Nintendo Wii together. <end>
<start> two people standing near one another with a plate of food  <end>
<start> Two people in coats walk together under an umbrella. <end>
<start> A monitor sitting next to a keyboard, headphones and another monitor. <end>
<start> One giraffe is wrapping it's neck around another giraffe.   <end>
<start> A long mirror over a row of sinks in a rest room <end>
<start> A boy standing holding a wii remote in his hand <end>
<start> small serving cups that have decorated with googly eyes <end>
<start> 2 birds standing on the ground in a field  <end>
<start> An airplane flying over a city and a parking lot <end>
<start> a stuffed an

<start> A herd of horses grazing on the snow. <end>
<start> there are many surfers that are walking along this beach <end>
<start> A street sign for Rockaway Beach protruding from a concrete pole <end>
<start> A man that is in the grass with a frisbee. <end>
<start> a bathroom with a sink and door inside of it  <end>
<start> A cup of coffee and some doughnuts on a table. <end>
<start> a plan taking off that reads transavia.com on the side <end>
<start> Boats are docked at a pier near the ocean. <end>
<start> Large cathedral building with multiple women seats leading towards the front.  <end>
<start> Several people on a beach are flying kites near the ocean. <end>
<start> Young teen plays tennis with friends on the court  <end>
<start> A man with an apron prepares a mixture in a bowl. <end>
<start> A type of animal sits on top of a shoe.  <end>
<start> People are laying on the grass and playing frisbee. <end>
<start> A baseball player slide into the base on a hot day <end>
<start> A lar

<start> an old time picture of a group of skiers <end>
<start> A single bear stands still in a grassy green field. <end>
<start> A large cow laying on top of a sandy surface. <end>
<start> A man holding a baseball bat during a baseball game. <end>
<start> Three men sitting around a table using laptops. <end>
<start> Youths play baseball during a game while fans watch <end>
<start> there are many people riding elephants across the water <end>
<start> traffic lights at an intersection by some businesses  <end>
<start> a small stuffed animal near a banana on a table <end>
<start> Two People playing inside a living room together while other people sit. <end>
<start> Several sheep grazing and roaming in their green pasture <end>
<start> A man flying through the air while riding a skateboard. <end>
<start> A baseball player holding a bat while standing on a field. <end>
<start> two men feeding each other a piece of a cake <end>
<start> a breakfast dish made with eggs and bacon on a plate <en

<start> A man that is on a skateboard grinding on a rail. <end>
<start> A cup of coffee on a coaster next to a mouse <end>
<start> Two men wearing suits smiling and holding small presents. <end>
<start> A baby holds a pick teddy bear and looks at the camera. <end>
<start> Two white horses are looking over a metal fence. <end>
<start> A person that is standing in the street with an umbrella. <end>
<start> A man slicing into a pizza with a knife. <end>
<start> A red fire hydrant on the side of the road with smoke and people in the background <end>
<start> a tennis match with a crowd of people in the stands. <end>
<start> A boy picking toppings off of a very large pizza. <end>
<start> A man sitting in front of a computer desk. <end>
<start> The lady watches the dog on the surfboard.  <end>
<start> A man that is sitting on the back of a motorcycle. <end>
<start> an image of  a bathroom setting with items on sink <end>
<start> Two ships in the ocean sailing during the day.  <end>
<start> ve

<start> A skateboarder getting ready to do a trick on the tar. <end>
<start> A blue motorcycle parked in front of a building. <end>
<start> A bus is parked in between other buses.  <end>
<start> A skiier smiles as he makes his way down the slope. <end>
<start> A baseball player pitching a ball on top of a field. <end>
<start> Someone over the ocean connected to a parachute while people are on the beach.  <end>
<start> A large handicapped toilet area with several assistive devices. <end>
<start> A dog sitting on a bed in a bedroom. <end>
<start> A young child is riding on a snow board in the snow. <end>
<start> A long wire hanging over a  large body of water. <end>
<start> Bikers following a rural trail through the mountains leading towards clouds above  <end>
<start> a man ready playing frisebee and ready to caught it <end>
<start> A motorcycle that is sitting near a helicopter. <end>
<start> A stopped train at a train crossing with people crossing the tracks. <end>
<start> A dog stick

<start> A white cat sits on an office chair in a home. <end>
<start> A cat hanging out inside of a fridge with food <end>
<start> A cute small bird by some very pretty plants. <end>
<start> A person performs a trick on a skateboard. <end>
<start> An airport where an airplane is being prepared for flying <end>
<start> A plate that has some very colorful food on it. <end>
<start> A pair of scissors are laying on a table <end>
<start> Woman with cell phone posing on city street at night. <end>
<start> As others watch, a woman, next to a table with a cake, holds a knife and a bottle. <end>
<start> a truck is parked near a large cactus <end>
<start> A child in mid-jump from jumping on a bed. <end>
<start> A bunch of men dressed for a team sport slapping hands. <end>
<start> A laptop is sitting on a desk with the screen flipped open.  <end>
<start> A market with bins for fruit like oranges, limes, and lemons. <end>
<start> A woman lying on a bed with high heel shoes on. <end>
<start> Three p

<start> An elephant walking on dirt road with trees on each side. <end>
<start> a tray covered in paper has several cakes decorated trains <end>
<start> a tv on a dresser near a desk  <end>
<start> Bunches of bananas are on a branch near flowers. <end>
<start> Woman on cell phone writing down notes in kitchen  <end>
<start> A number of scooters and bicycles parked in an alley. <end>
<start> Surfer kicking up a large wave on green ocean water.  <end>
<start> A red washer sitting in a room under an animal cage. <end>
<start> A young man wearing a catchers mitt next to a red car. <end>
<start> Bunches of ripe and unripened bananas hanging from wooden beams in on the ceiling. <end>
<start> A jet that looks to have been sitting a while on the tarmac of an airport. <end>
<start> A tall red building with a big white clock on the top.  <end>
<start> A bathroom with a long counter near a bathtub.   <end>
<start> A large white bed with pillows in a room. <end>
<start> A man sitting on a curb whi

<start> Three people walk on the sidewalk with rolling luggage bags.  <end>
<start> Modern furniture set with a brightly lit lamp beside picture.
 <end>
<start> a single elephant kicking up some dirt by a fence <end>
<start> A person laying in bed reading a book. <end>
<start> There is a sign that reads "King Harald Street" <end>
<start> A bunch of people are standing around watching a woman cut a piece of cake and there are a bunch of coffee cups and saucers on the table. <end>
<start> Baseball player poised to swing with catcher and referee poised <end>
<start> Silhouette of two birds in a tree with hazy sky in background. <end>
<start> A horse being pulled by a rope held by its handler.  <end>
<start> A group of zebras standing next to a body of water.  <end>
<start> A woman and child sitting together on a brown couch. <end>
<start> Big yellow bus in highway traffic facing into the camera <end>
<start> A blue train is sitting still at a station. <end>
<start> There is a black and wh

<start> A large street sign is pointing out the directions. <end>
<start> Two giraffe grazing on tree leaves under a hazy sky. <end>
<start> a black bear with a white face is in some water <end>
<start> many microwave with price label on the shelves <end>
<start> a transit bus on a city street near a building <end>
<start> A very cute lady kneeling down by a bunch of umbrellas. <end>
<start> Red commuter train on track in rural mountainous region. <end>
<start> A long red couch in a living room <end>
<start> A woman holds a rainbow kite while standing in the middle of a field. <end>
<start> A giraffe sticking its head through a fence with building in background. <end>
<start> There's a tennis player serving a tennis ball at a tennis match. <end>
<start> A living room area with tables and a kitchen in background. <end>
<start> A white toilet sitting next to a white sink. <end>
<start> A meter made standing next to a parking meter. <end>
<start> hundreds of sail boats docked in the water

<start> A bowl of apples and an apple that has been bitten <end>
<start> a home has various plants and seating in front of it. <end>
<start> A single zebra is seen near some giraffes. <end>
<start> A mechanized steel parking machine in front of a wooden fence. <end>
<start> a man in a hat carrying a backup in the forest <end>
<start> a little boy in green and white baseball clothes <end>
<start> Happy woman showing off red frosted doughnut in serving paper. <end>
<start> A man holding a blue camera in front of his face. <end>
<start> Bathroom scene, white toilet on light brown tiled floor. <end>
<start> two horses, a big horse and little horse and a gentleman is sitting on the big horse. <end>
<start> A boy holding a skateboard with images on the bottom. <end>
<start> A crane in front of a building with cones around it. <end>
<start> some people and the male is holding a baseball bat <end>
<start> two woman sitting on the ground one is on a cell phone <end>
<start> Business people havi

<start> Two individuals playing with a Frisbee in the sand. <end>
<start> Tourist busses next to each other on a road <end>
<start> A tennis player in the middle of a play with spectators in the background. <end>
<start> Two people are skiing down hill near some trees.  <end>
<start> A black and white photo of a baby giraffe. <end>
<start> a sheep dog watching a group of sheep in a field <end>
<start> A man laughing in a room cluttered with cloths and stuff. <end>
<start> A herd of sheep standing next to a flock of chickens. <end>
<start> A young man taking a swing at a ball <end>
<start> A woman laying on a beach next to a colorful umbrella. <end>
<start> a room with a bunch of furniture in it <end>
<start> Three stuffed animals sitting next to each other. <end>
<start> A young boy standing on flat area on skis. <end>
<start> A slice of strawberry cheesecake on a plate with a fork <end>
<start> A dog wearing a wooden tie sitting in its master's lap. <end>
<start> a guy in a  uniform r

<start> A herd of sheep walking down a city road following a person. <end>
<start> a small girl with a red shirt is talking on a cell phone <end>
<start> A basket with a sandwich, coleslaw, and onion rings is sitting on the table. <end>
<start> Various street signs including one reading "humped zebra crossing." <end>
<start> Horse drawn carriage on cobblestone road in urban area. <end>
<start> An old photo on skies on top of the snow hill. <end>
<start> A man holding a tennis racquet while standing on a  tennis court. <end>
<start> Adult trying modern commode with drinking spout on display. <end>
<start> Many people ski near the snowboard racks at the bottom of a slop. <end>
<start> A picture of an elephant with huge tusks. <end>
<start> a close up of a table with an ipod headphones and a remote <end>
<start> a Wii box controllers cables instructions and a power adapter <end>
<start> There is a bottle of Gatorade on the table next to a laptop computer.  <end>
<start> A woman standing b

<start> A chrome kitchen with a stove and refrigerator <end>
<start> A lone seagull sitting on a rock with a sunset in the background. <end>
<start> People are in the water outside of the city.  <end>
<start> a number of trains at a station with people near by <end>
<start> A wing flying over a large mountain range. <end>
<start> A man in a suit and hat is riding on a bicycle. <end>
<start> A man is sitting in a chair and watching a child fly a kite. <end>
<start> A kitchen with darkly colored cupboards and a silver oven. <end>
<start> A bunch of stuffed animals sitting near the stairs <end>
<start> A man wearing a large backpack hiking with skis through snow.  <end>
<start> A person with skateboard on a city street. <end>
<start> A salad and some bread on a table by a pool. <end>
<start> A magnetic sign hung on the side of a refrigerator in a kitchen in front of cabinets. <end>
<start> Two people standing next to a skate ramp <end>
<start> A glass of good wine helps the cook do the co

<start> Gourmet pita sandwich with fresh lettuce in hand <end>
<start> A tall animal standing next to a tree. <end>
<start> Two bottles of wine sitting on top of a table. <end>
<start> Someone is standing on a brown skateboard with artwork. <end>
<start> a man is swinging a tennis racket to hit a ball <end>
<start> The truck has a colorful paint pattern on it. <end>
<start> A beer bottle rests on the small counter space beside the modern bathroom sink.  <end>
<start> Two soccer players playing soccer on a soccer field. <end>
<start> A store display with birds and purses in cages <end>
<start> a young child riding skis on a snowy surface <end>
<start> A shot of a cluttered desk and television in an office. <end>
<start> Many people standing and walking on a sidewalk next to motorcycles. <end>
<start> Professional baseball players giving each other a high five by the dugout <end>
<start> Outdoor artsy display of pots and foliage inside neat shop. <end>
<start> Rose in dotted vase on desk

<start> A horse that is pulling a carriage down a street. <end>
<start> A large metal clock high up in a building. <end>
<start> The double decker bus is painted with bright colors. <end>
<start> A giraffe standing next to green plants in a field. <end>
<start> A modern looking area leading to a pool and jacuzzi <end>
<start> A woman and two children are blowing out the candles on the cake. <end>
<start> a huge group of skiers in a marathon <end>
<start> A tennis player on the court in a tennis stadium with a man walking behind him. <end>
<start> A group of people snow skiing in a competition. <end>
<start> Four giraffes are in the cage at the zoo. <end>
<start> A green bus driving filled with passengers on a narrow road. <end>
<start> An animal on a grassy field of some sort. <end>
<start> Two giraffes that are standing up near each other. <end>
<start> A herd of sheep standing on top of a dry grass field. <end>
<start> A police officer on a cell phone standing in a crowd of people. <

<start> A black dog standing on a docked boat <end>
<start> Shiny bits of brown and white food on a red plate <end>
<start> A dog holds a cat down as they wrestle. <end>
<start> A drab colored kitchen with a shiny metal oven. <end>
<start> a bunch of people walk in a parade  <end>
<start> A white surfboard sitting in the corner of a room. <end>
<start> A dog sits inside a car with a pair of fuzzy dice. <end>
<start> a photo of an empty street with a street light <end>
<start> Man skateboarding with protective gear and orange cones <end>
<start> A dessert with a frosting sunflower and purple frosting roses. <end>
<start> Messy row of books books and small teddy bear. <end>
<start> A vintage city street scene with pedestrians and a horse and buggy. <end>
<start> We are looking at a crowded open air market. <end>
<start> A woman with glasses talking on the telephone. <end>
<start> two brown and white birds sitting on a roof <end>
<start> A group of giraffes are standing next to a tree beh

<start> A display of modern at with an orange color scheme. <end>
<start> A gray van parked outside of an orange building. <end>
<start> A man is snowboarding over some barrels.  <end>
<start> A video game of a man playing tennis. <end>
<start> Two elephants walking around park with caretaker nearby <end>
<start> A young man riding a skate board on a skate board ramp. <end>
<start> a toddler walking towards a suitcase to touch it  <end>
<start> A blue bird perched on a branch, looking towards its left. <end>
<start> The giraffe seems calm inside of the fence. <end>
<start> Seating is arranged around a coffee table and rug on a tile floor. <end>
<start> A huge pile of many bananas that are not ripe. <end>
<start> A woman is laying on a bed with a cat. <end>
<start> A plate with hot dogs and condiments laying on a table. <end>
<start> An elephant with a man on its back and people standing nearby. <end>
<start> An owl is sitting on a large log in the wilderness. <end>
<start> Pizza at a r

<start> Three men standing together while on of them handing another one a frisbee. <end>
<start> A side view mirror sitting on the front of a semi truck. <end>
<start> Two bicycles chained to a fence next to a park bench on a winter day. <end>
<start> People standing and sitting down at a carnival. <end>
<start> On a bus or subway a man sitting down listening to music. <end>
<start> a close up of a dog laying on a bed with a ball <end>
<start> A small dog lays on a blanket with a stuffed animal. <end>
<start> Several lined up motorcycles in a city street. <end>
<start> A young boy hiding under a car's passenger seat from a zebra. <end>
<start> a woman sitting on a couch next to another person with both people holding wine glasses. <end>
<start> A man flying through the air on top of a skateboard. <end>
<start> A girl is sitting with an umbrella and some dolls. <end>
<start> a baseball player holding a bat in a batterbox <end>
<start> A group of diners at a table have wine. <end>
<star

<start> A small girl standing in a room with a couple of camping chairs <end>
<start> A jumbo jet on the tarmac of an airport. <end>
<start> A group of man playing a game of soccer. <end>
<start> An adult man helping a youth on a skateboard. <end>
<start> A ladder truck is traveling in the slow lane of the highway. <end>
<start> A speed bump sign at a street intersection. <end>
<start> An three way intersection with a stop sign and a a road sign across the intersection pointing the direction for two highways. <end>
<start> a train on a train track near a station <end>
<start> Two baseball players walking on a baseball field. <end>
<start> Poised to slice into an iced multi-layer cake. <end>
<start> A cup of coffee set next to a computer and mouse. <end>
<start> A flat-bread pizza sitting on a dinner plate. <end>
<start> A young boy standing next to a tree. <end>
<start> A man doing a trick on his skate board. <end>
<start> Two wine glasses lie beside a bottle of wine in straw. <end>
<s

## Preprocess the images using InceptionV3
Next, you will use InceptionV3 (which is pretrained on Imagenet) to classify each image. You will extract features from the last convolutional layer.

First, you will convert the images into InceptionV3's expected format by:
* Resizing the image to 299px by 299px
* [Preprocess the images](https://cloud.google.com/tpu/docs/inception-v3-advanced#preprocessing_stage) using the [preprocess_input](https://www.tensorflow.org/api_docs/python/tf/keras/applications/inception_v3/preprocess_input) method to normalize the image so that it contains pixels in the range of -1 to 1, which matches the format of the images used to train InceptionV3.

In [29]:
def load_image(image_path):
    img = tf.io.read_file(image_path)
    img = tf.image.decode_jpeg(img, channels=3)
    img = tf.image.resize(img, (299, 299))
    img = tf.keras.applications.inception_v3.preprocess_input(img)
    return img, image_path

## Initialize InceptionV3 and load the pretrained Imagenet weights

Now you'll create a tf.keras model where the output layer is the last convolutional layer in the InceptionV3 architecture. The shape of the output of this layer is ```8x8x2048```. You use the last convolutional layer because you are using attention in this example. You don't perform this initialization during training because it could become a bottleneck.

* You forward each image through the network and store the resulting vector in a dictionary (image_name --> feature_vector).
* After all the images are passed through the network, you pickle the dictionary and save it to disk.




In [30]:
image_model = tf.keras.applications.InceptionV3(include_top=False,
                                                weights='imagenet')
new_input = image_model.input
hidden_layer = image_model.layers[-1].output

image_features_extract_model = tf.keras.Model(new_input, hidden_layer)

## Caching the features extracted from InceptionV3

You will pre-process each image with InceptionV3 and cache the output to disk. Caching the output in RAM would be faster but also memory intensive, requiring 8 \* 8 \* 2048 floats per image. At the time of writing, this exceeds the memory limitations of Colab (currently 12GB of memory).

Performance could be improved with a more sophisticated caching strategy (for example, by sharding the images to reduce random access disk I/O), but that would require more code.

The caching will take about 10 minutes to run in Colab with a GPU. If you'd like to see a progress bar, you can: 

1. install [tqdm](https://github.com/tqdm/tqdm):

    `!pip install tqdm`

2. Import tqdm:

    `from tqdm import tqdm`

3. Change the following line:

    `for img, path in image_dataset:`

    to:

    `for img, path in tqdm(image_dataset):`


In [56]:
# Get unique images
encode_train = sorted(set(img_name_vector))

# Feel free to change batch_size according to your system configuration
image_dataset = tf.data.Dataset.from_tensor_slices(encode_train)
image_dataset = image_dataset.map(
  load_image, num_parallel_calls=tf.data.experimental.AUTOTUNE).batch(16)
for img, path in image_dataset:
    batch_features = image_features_extract_model(img)
    print(batch_features)
    batch_features = tf.reshape(batch_features,
                              (batch_features.shape[0], -1, batch_features.shape[3]))

    for bf, p in zip(batch_features, path):
        path_of_feature = p.numpy().decode("utf-8")
        np.save(path_of_feature, bf.numpy())

Tensor("model_1_18/mixed10/concat:0", shape=(?, 8, 8, 2048), dtype=float32)


TypeError: Failed to convert object of type <class 'tuple'> to Tensor. Contents: (Dimension(None), -1, Dimension(2048)). Consider casting elements to a supported type.

## Preprocess and tokenize the captions

* First, you'll tokenize the captions (for example, by splitting on spaces). This gives us a  vocabulary of all of the unique words in the data (for example, "surfing", "football", and so on).
* Next, you'll limit the vocabulary size to the top 5,000 words (to save memory). You'll replace all other words with the token "UNK" (unknown).
* You then create word-to-index and index-to-word mappings.
* Finally, you pad all sequences to be the same length as the longest one.

In [None]:
# Find the maximum length of any caption in our dataset
def calc_max_length(tensor):
    return max(len(t) for t in tensor)

In [None]:
# Choose the top 5000 words from the vocabulary
top_k = 10000
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=top_k,oov_token="<unk>",filters='!"#$%&()*+.,-/:;=?@[\]^_`{|}~ ')
tokenizer.fit_on_texts(train_captions)
train_seqs = tokenizer.texts_to_sequences(train_captions)

In [None]:
tokenizer.word_index['<pad>'] = 0
tokenizer.index_word[0] = '<pad>'

In [None]:
# Create the tokenized vectors
train_seqs = tokenizer.texts_to_sequences(train_captions)

In [None]:
# Pad each vector to the max_length of the captions
# If you do not provide a max_length value, pad_sequences calculates it automatically
cap_vector = tf.keras.preprocessing.sequence.pad_sequences(train_seqs, padding='post')

In [None]:
# Calculates the max_length, which is used to store the attention weights
max_length = calc_max_length(train_seqs)
max_length

## Split the data into training and testing

In [None]:
# Create training and validation sets using an 80-20 split
img_name_train, img_name_val, cap_train, cap_val = train_test_split(img_name_vector,
                                                                    cap_vector,
                                                                    test_size=0.2,
                                                                    random_state=0)

In [None]:
len(img_name_train), len(cap_train), len(img_name_val), len(cap_val)

## Create a tf.data dataset for training



 Our images and captions are ready! Next, let's create a tf.data dataset to use for training our model.

In [None]:
# Feel free to change these parameters according to your system's configuration

BATCH_SIZE = 64
BUFFER_SIZE = 1000
embedding_dim = 256
units = 512
vocab_size = len(tokenizer.word_index) + 1
num_steps = len(img_name_train) // BATCH_SIZE
# Shape of the vector extracted from InceptionV3 is (64, 2048)
# These two variables represent that vector shape
features_shape = 2048
attention_features_shape = 64

In [None]:
# Load the numpy files
def map_func(img_name, cap):
  img_tensor = np.load(img_name.decode('utf-8')+'.npy')
  return img_tensor, cap

In [None]:
dataset = tf.data.Dataset.from_tensor_slices((img_name_train, cap_train))

# Use map to load the numpy files in parallel
dataset = dataset.map(lambda item1, item2: tf.numpy_function(
          map_func, [item1, item2], [tf.float32, tf.int32]),
          num_parallel_calls=tf.data.experimental.AUTOTUNE)

# Shuffle and batch
dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE)
dataset = dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)

## Model

Fun fact: the decoder below is identical to the one in the example for [Neural Machine Translation with Attention](../sequences/nmt_with_attention.ipynb).

The model architecture is inspired by the [Show, Attend and Tell](https://arxiv.org/pdf/1502.03044.pdf) paper.

* In this example, you extract the features from the lower convolutional layer of InceptionV3 giving us a vector of shape (8, 8, 2048).
* You squash that to a shape of (64, 2048).
* This vector is then passed through the CNN Encoder (which consists of a single Fully connected layer).
* The RNN (here GRU) attends over the image to predict the next word.

In [None]:
class BahdanauAttention(tf.keras.Model):
  def __init__(self, units):
    super(BahdanauAttention, self).__init__()
    self.W1 = tf.keras.layers.Dense(units)
    self.W2 = tf.keras.layers.Dense(units)
    self.V = tf.keras.layers.Dense(1)

  def call(self, features, hidden):
    # features(CNN_encoder output) shape == (batch_size, 64, embedding_dim)

    # hidden shape == (batch_size, hidden_size)
    # hidden_with_time_axis shape == (batch_size, 1, hidden_size)
    hidden_with_time_axis = tf.expand_dims(hidden, 1)

    # score shape == (batch_size, 64, hidden_size)
    score = tf.nn.tanh(self.W1(features) + self.W2(hidden_with_time_axis))

    # attention_weights shape == (batch_size, 64, 1)
    # you get 1 at the last axis because you are applying score to self.V
    attention_weights = tf.nn.softmax(self.V(score), axis=1)

    # context_vector shape after sum == (batch_size, hidden_size)
    context_vector = attention_weights * features
    context_vector = tf.reduce_sum(context_vector, axis=1)

    return context_vector, attention_weights

In [None]:
class CNN_Encoder(tf.keras.Model):
    # Since you have already extracted the features and dumped it using pickle
    # This encoder passes those features through a Fully connected layer
    def __init__(self, embedding_dim):
        super(CNN_Encoder, self).__init__()
        # shape after fc == (batch_size, 64, embedding_dim)
        self.fc = tf.keras.layers.Dense(embedding_dim)

    def call(self, x):
        x = self.fc(x)
        x = tf.nn.relu(x)
        return x

In [None]:
class RNN_Decoder(tf.keras.Model):
  def __init__(self, embedding_dim, units, vocab_size):
    super(RNN_Decoder, self).__init__()
    self.units = units

    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    self.gru = tf.keras.layers.GRU(self.units,
                                   return_sequences=True,
                                   return_state=True,
                                   recurrent_initializer='glorot_uniform')
    self.fc1 = tf.keras.layers.Dense(self.units)
    self.fc2 = tf.keras.layers.Dense(vocab_size)

    self.attention = BahdanauAttention(self.units)

  def call(self, x, features, hidden):
    # defining attention as a separate model
    context_vector, attention_weights = self.attention(features, hidden)

    # x shape after passing through embedding == (batch_size, 1, embedding_dim)
    x = self.embedding(x)

    # x shape after concatenation == (batch_size, 1, embedding_dim + hidden_size)
    x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)

    # passing the concatenated vector to the GRU
    output, state = self.gru(x)

    # shape == (batch_size, max_length, hidden_size)
    x = self.fc1(output)

    # x shape == (batch_size * max_length, hidden_size)
    x = tf.reshape(x, (-1, x.shape[2]))

    # output shape == (batch_size * max_length, vocab)
    x = self.fc2(x)

    return x, state, attention_weights

  def reset_state(self, batch_size):
    return tf.zeros((batch_size, self.units))

In [None]:
encoder = CNN_Encoder(embedding_dim)
decoder = RNN_Decoder(embedding_dim, units, vocab_size)

In [None]:
optimizer = tf.keras.optimizers.Adam()
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')

def loss_function(real, pred):
  mask = tf.math.logical_not(tf.math.equal(real, 0))
  loss_ = loss_object(real, pred)

  mask = tf.cast(mask, dtype=loss_.dtype)
  loss_ *= mask

  return tf.reduce_mean(loss_)

## Checkpoint

In [None]:
checkpoint_path = "/content/drive/My Drive/Colab Notebooks"
ckpt = tf.train.Checkpoint(encoder=encoder,
                           decoder=decoder,
                           optimizer = optimizer)
ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=5)

In [None]:
start_epoch = 0
if ckpt_manager.latest_checkpoint:
  start_epoch = int(ckpt_manager.latest_checkpoint.split('-')[-1])

## Training

* You extract the features stored in the respective `.npy` files and then pass those features through the encoder.
* The encoder output, hidden state(initialized to 0) and the decoder input (which is the start token) is passed to the decoder.
* The decoder returns the predictions and the decoder hidden state.
* The decoder hidden state is then passed back into the model and the predictions are used to calculate the loss.
* Use teacher forcing to decide the next input to the decoder.
* Teacher forcing is the technique where the target word is passed as the next input to the decoder.
* The final step is to calculate the gradients and apply it to the optimizer and backpropagate.


In [None]:
# adding this in a separate cell because if you run the training cell
# many times, the loss_plot array will be reset
loss_plot = []
[tokenizer.word_index['<start>']]

In [None]:
@tf.function
def train_step(img_tensor, target):
  loss = 0

  # initializing the hidden state for each batch
  # because the captions are not related from image to image
  hidden = decoder.reset_state(batch_size=target.shape[0])

  dec_input = tf.expand_dims([tokenizer.word_index['<start>']] * BATCH_SIZE, 1)

  with tf.GradientTape() as tape:
      features = encoder(img_tensor)

      for i in range(1, target.shape[1]):
          # passing the features through the decoder
          predictions, hidden, _ = decoder(dec_input, features, hidden)

          loss += loss_function(target[:, i], predictions)

          # using teacher forcing
          dec_input = tf.expand_dims(target[:, i], 1)

  total_loss = (loss / int(target.shape[1]))

  trainable_variables = encoder.trainable_variables + decoder.trainable_variables

  gradients = tape.gradient(loss, trainable_variables)

  optimizer.apply_gradients(zip(gradients, trainable_variables))

  return loss, total_loss

In [None]:
EPOCHS = 1

for epoch in range(start_epoch, EPOCHS):
    start = time.time()
    total_loss = 0

    for (batch, (img_tensor, target)) in enumerate(dataset):
        batch_loss, t_loss = train_step(img_tensor, target)
        total_loss += t_loss

        if batch % 100 == 0:
            print ('Epoch {} Batch {} Loss {:.4f}'.format(
              epoch + 1, batch, batch_loss.numpy() / int(target.shape[1])))
    # storing the epoch end loss value to plot later
    loss_plot.append(total_loss / num_steps)

    if epoch % 5 == 0:
      ckpt_manager.save()

    print ('Epoch {} Loss {:.6f}'.format(epoch + 1,
                                         total_loss/num_steps))
    print ('Time taken for 1 epoch {} sec\n'.format(time.time() - start))

In [None]:
plt.plot(loss_plot)
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.title('Loss Plot')
plt.show()

## Caption!

* The evaluate function is similar to the training loop, except you don't use teacher forcing here. The input to the decoder at each time step is its previous predictions along with the hidden state and the encoder output.
* Stop predicting when the model predicts the end token.
* And store the attention weights for every time step.

In [None]:
def evaluate(image):
    attention_plot = np.zeros((max_length, attention_features_shape))

    hidden = decoder.reset_state(batch_size=1)

    temp_input = tf.expand_dims(load_image(image)[0], 0)
    img_tensor_val = image_features_extract_model(temp_input)
    img_tensor_val = tf.reshape(img_tensor_val, (img_tensor_val.shape[0], -1, img_tensor_val.shape[3]))

    features = encoder(img_tensor_val)

    dec_input = tf.expand_dims([tokenizer.word_index['<start>']], 0)
    result = []

    for i in range(max_length):
        predictions, hidden, attention_weights = decoder(dec_input, features, hidden)

        attention_plot[i] = tf.reshape(attention_weights, (-1, )).numpy()

        predicted_id = tf.argmax(predictions[0]).numpy()
        result.append(tokenizer.index_word[predicted_id])

        if tokenizer.index_word[predicted_id] == '<end>':
            return result, attention_plot

        dec_input = tf.expand_dims([predicted_id], 0)

    attention_plot = attention_plot[:len(result), :]
    return result, attention_plot

In [None]:
def plot_attention(image, result, attention_plot):
    temp_image = np.array(Image.open(image))

    fig = plt.figure(figsize=(10, 10))

    len_result = len(result)
    for l in range(len_result):
        temp_att = np.resize(attention_plot[l], (8, 8))
        ax = fig.add_subplot(len_result//2, len_result//2, l+1)
        ax.set_title(result[l])
        img = ax.imshow(temp_image)
        ax.imshow(temp_att, cmap='gray', alpha=0.6, extent=img.get_extent())

    plt.tight_layout()
    plt.show()

In [None]:
# captions on the validation set
rid = np.random.randint(0, len(img_name_val))
image = img_name_val[rid]
real_caption = ' '.join([tokenizer.index_word[i] for i in cap_val[rid] if i not in [0]])
result, attention_plot = evaluate(image)

print ('Real Caption:', real_caption)
print ('Prediction Caption:', ' '.join(result))
plot_attention(image, result, attention_plot)
# opening the image
Image.open(img_name_val[rid])

## Try it on your own images
For fun, below we've provided a method you can use to caption your own images with the model we've just trained. Keep in mind, it was trained on a relatively small amount of data, and your images may be different from the training data (so be prepared for weird results!)


In [None]:
image_url = 'https://tensorflow.org/images/surf.jpg'
image_extension = image_url[-4:]
image_path = tf.keras.utils.get_file('image'+image_extension,
                                     origin=image_url)

result, attention_plot = evaluate(image_path)
print ('Prediction Caption:', ' '.join(result))
plot_attention(image_path, result, attention_plot)
# opening the image
Image.open(image_path)

# Next steps

Congrats! You've just trained an image captioning model with attention. Next, take a look at this example [Neural Machine Translation with Attention](../sequences/nmt_with_attention.ipynb). It uses a similar architecture to translate between Spanish and English sentences. You can also experiment with training the code in this notebook on a different dataset.