# **Image Caption Generator**


### Import necessary libraries


In [36]:
import tensorflow as tf
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from keras.preprocessing.image import ImageDataGenerator
from PIL import Image 
import cv2
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import re

# Directory containing the image files, relative to the working directory.
path_img='../dataset/Flicker8k_Dataset/'
# NLTK's English stop-word list.
# NOTE(review): needs the NLTK 'stopwords' corpus to be available locally;
# the list is not referenced by the cells visible in this section.
s_words = stopwords.words('english')


### Extract and clean captions from file


In [37]:
def clean_img(x):
    """Return the text of ``x`` that precedes its first ``#``.

    ``x`` is converted with ``str()`` first; when no ``#`` is present the
    whole string is returned unchanged.
    """
    head, _sep, _tail = str(x).partition("#")
    return head

In [38]:
# Load the caption file as two tab-separated columns; the file has no header
# row, so the column names are supplied here.
df = pd.read_csv(
    '../dataset/flickr8k_text/Flickr8k.token.txt',
    sep='\t',
    names=['image', 'caption'],
)

# Keep only the part of each image field before its first '#', so that every
# caption row of the same image carries the same key.
df['image'] = df['image'].map(clean_img)


In [39]:
def to_dict(frame=None):
    """Group captions by image name.

    Args:
        frame: Optional DataFrame with ``image`` and ``caption`` columns.
            When omitted, the module-level ``df`` is used, which keeps the
            original zero-argument call working.

    Returns:
        A dict mapping each image name to the list of its captions, in the
        order the rows appear in the frame.
    """
    if frame is None:
        frame = df

    cap_dict = {}
    # Walk the two columns positionally. Looking rows up with ``column[i]``
    # is label-based, so it fails on any index that is not 0..n-1 and is
    # much slower than plain iteration.
    for img, cap in zip(frame['image'], frame['caption']):
        cap_dict.setdefault(img, []).append(cap)
    return cap_dict



In [40]:
def clean_cap(dic):
    """Normalise every caption in ``dic`` and return the same dict.

    Each caption is processed as follows:
      1. every character that is not an ASCII letter or a space becomes a space;
      2. the text is lower-cased and split on whitespace;
      3. one-character tokens are dropped;
      4. the remaining tokens are Porter-stemmed and re-joined with single spaces.

    The caption lists are replaced inside ``dic`` itself (the argument is
    mutated), and that same object is returned.
    """
    stemmer = PorterStemmer()

    def _normalise(caption):
        # Strip non-letters first, then lower-case and tokenise.
        tokens = re.sub("[^a-z A-Z]", " ", caption).lower().split()
        stems = [stemmer.stem(tok) for tok in tokens if len(tok) > 1]
        return " ".join(stems)

    # Snapshot the keys so the dict can be reassigned while looping.
    for img_name in list(dic):
        cleaned = []
        for caption in dic[img_name]:
            cleaned.append(_normalise(caption))
        dic[img_name] = cleaned
    return dic



    

In [41]:
def extract_voc(dic):
    """Collect the set of distinct whitespace-separated tokens in all captions.

    Args:
        dic: Mapping of image name to a list of caption strings.

    Returns:
        A set with every token that occurs in at least one caption; empty
        when ``dic`` has no captions.
    """
    vocab = set()
    # Plain loops rather than a comprehension: the work here is a side effect
    # on ``vocab``, and a comprehension would build a discarded list of None.
    for captions in dic.values():
        for caption in captions:
            vocab.update(caption.split())
    return vocab


In [42]:
def save_dic_to_text(dic, path):
    """Write captions to ``path`` as ``<image name>\\t<caption>`` lines.

    Args:
        dic: Mapping of image name to a list of caption strings.
        path: Destination file; it is created or overwritten.

    Lines are joined with ``\\n`` and no trailing newline is written, so an
    empty mapping produces an empty file.
    """
    lines = [
        img_name + '\t' + caption
        for img_name, caption_list in dic.items()
        for caption in caption_list
    ]
    data = '\n'.join(lines)
    # The context manager closes the handle even if the write raises.
    with open(path, 'w') as file:
        file.write(data)


In [43]:
# Build the image -> captions mapping and normalise every caption.
# clean_cap rewrites the dict it receives and returns that same object.
cap_dict = clean_cap(to_dict())

# Distinct tokens that remain after cleaning.
vocab = extract_voc(cap_dict)

# Persist the cleaned captions, one "<image>\t<caption>" pair per line.
save_dic_to_text(cap_dict, 'cleaned_descriptions.txt')


### Feature Extraction 

In [44]:
def extract_feature(path):
    