In [16]:
import json
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
import nltk
# from nltk.corpus import stopwords
# nltk.download('stopwords')
from nltk.tokenize import word_tokenize,sent_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import SnowballStemmer
from nltk.stem import WordNetLemmatizer
nltk.download('omw-1.4')

##---- Porter Stemmer---#
stemmer_ps = PorterStemmer()


def sentenceStemmerPS(sentence):
    """Return ``sentence`` with every token replaced by its Porter stem.

    The sentence is split with ``word_tokenize`` first, because the stemmer
    works on individual words; handing it the whole sentence would return
    the sentence unchanged.

    Each stemmed token is followed by a single space, so a non-empty result
    always ends with a trailing space. An input with no tokens yields "".
    """
    stemmed_tokens = (stemmer_ps.stem(token) for token in word_tokenize(sentence))
    # Token + space for every token reproduces the "word word word " layout.
    return "".join(stemmed + " " for stemmed in stemmed_tokens)

####---Lemmatizer----#
lemmatizer = WordNetLemmatizer()


def sentenceLemmatizer(sentence):
    """Return ``sentence`` with every token replaced by its WordNet lemma.

    The sentence is split with ``word_tokenize`` first, because the
    lemmatizer works on individual words; handing it the whole sentence
    would return the sentence unchanged.

    Each lemma is followed by a single space, so a non-empty result always
    ends with a trailing space. An input with no tokens yields "".
    """
    lemmas = map(lemmatizer.lemmatize, word_tokenize(sentence))
    return "".join(f"{lemma} " for lemma in lemmas)

# Path of the captions annotation JSON, relative to the notebook's working directory.
captions_dir = 'dataset/captions_train2014.json'

# Parse the annotations inside a context manager so the file handle is closed
# as soon as loading finishes (the bare `open(...)` call left it open until
# garbage collection).
with open(captions_dir, 'r') as captions_file:
    caption_dataset = json.load(captions_file)

# Sanity check: number of caption annotations loaded.
print(len(caption_dataset['annotations']))



[nltk_data] Downloading package omw-1.4 to /Users/geli/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


414113


In [87]:
# Parallel lists, one entry per processed annotation (same index = same caption):
captionList = []       # token list: lemmatized caption with stop words removed
imageidList = []       # the annotation's 'image_id'
captionSentList = []   # the same tokens joined with single spaces (input for tf-idf)

# Stop words: the '.' token plus one entry per line of the stop-word file.
stop_words = ['.']
stopwords_dir = './dataset/stopwords_en.txt'
if stopwords_dir:  # set stopwords_dir to '' / None to skip the file
    with open(stopwords_dir, "r") as data:
        stop_words.extend(line.strip() for line in data)

# Membership is tested once per token, so use a set instead of scanning the
# whole stop-word list every time. `stop_words` itself stays a list.
stop_word_lookup = frozenset(stop_words)

# Only the first 100 annotations are processed here.
for annotation in caption_dataset['annotations'][:100]:
    caption = annotation['caption'].lower()
    caption_lemmatized = sentenceLemmatizer(caption)
    # The lemmatized sentence is re-tokenized, then filtered against the
    # file-based stop words (NLTK's built-in English list is an alternative).
    caption_without_sw = [
        word
        for word in word_tokenize(caption_lemmatized)
        if word not in stop_word_lookup
    ]

    captionList.append(caption_without_sw)
    captionSentList.append(' '.join(caption_without_sw))
    imageidList.append(annotation['image_id'])

In [88]:
# Display `annotation`, the loop variable left in scope by the preprocessing
# loop, i.e. the last entry of the 100-annotation slice it iterated over.
annotation

{'image_id': 28231,
 'id': 3183,
 'caption': 'A blurry bike rider zooms past a new Mercedes. '}

In [89]:
# `n` is the number of top-weighted tf-idf terms kept per caption; n <= 0
# skips the tf-idf step entirely and leaves `words` / `weight` as None.
# NOTE: `tfidf` starts as this integer and is rebound to the tf-idf matrix
# below when n > 0.
tfidf = 4
n = tfidf
words, weight = None, None
if n > 0:
    print("tf-idf processing")
    vectorizer = CountVectorizer()
    transformer = TfidfTransformer()
    # Term counts -> tf-idf weights, one row per caption string.
    tfidf = transformer.fit_transform(vectorizer.fit_transform(captionSentList))
    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in
    # 1.2; prefer get_feature_names_out() and fall back on older releases.
    feature_names = (
        getattr(vectorizer, "get_feature_names_out", None)
        or vectorizer.get_feature_names
    )
    # Plain Python strings, indexable by column position of `weight`.
    words = [str(name) for name in feature_names()]
    # Dense (captions x vocabulary) array of tf-idf weights.
    weight = tfidf.toarray()

tf-idf processing


In [98]:
# Maps each selected caption word to the list of distinct image ids whose
# captions contain it, in order of first appearance.
cap2ids = {}
num_img = 5  # only referenced by the commented-out padding/sampling code below
for idx, cap in enumerate(captionList):
    imageID = imageidList[idx]
    if n > 0:
        w = weight[idx]
        loc = np.argsort(-w)  # vocabulary indices, highest tf-idf weight first
        # Keep at most n top-weighted terms that actually occur in this
        # caption (weight > 0). Slicing also stays in bounds when n exceeds
        # the vocabulary size.
        top_words = [words[j] for j in loc[:n] if w[j] > 0.0]
        top_cap = [word for word in cap if word.lower() in top_words]
    else:
        # tf-idf filtering disabled: previously `top_cap` was unbound (or
        # stale from an earlier caption) here. Fall back to every word of
        # the caption -- NOTE(review): confirm this is the intended behavior.
        top_cap = cap

    for word in top_cap:
        image_ids = cap2ids.setdefault(word, [])
        if imageID not in image_ids:
            image_ids.append(imageID)

# import random
# for key, value in cap2ids.items():
#     if len(value) < num_img:
#         value.extend([0] * (num_img - len(value)))
#         cap2ids[key] = value
#     else:
#         value = random.sample(value, num_img)
#         cap2ids[key] = value

# pickle.dump(cap2ids,open(cap2image_file,"wb"))

# print("data process finished!")
# print(len(cap2ids))
# print(total_img)

In [99]:
# Display the word -> image-id mapping built in the previous cell.
# NOTE(review): the saved output under this cell shows lists zero-padded to
# length 5, which only the commented-out padding/sampling code produces --
# the output looks stale relative to the active code; re-run to confirm.
cap2ids

{'clean': [318556, 538480, 266366, 0, 0],
 'decorated': [318556, 0, 0, 0, 0],
 'bathroom': [318556, 28149, 266366, 517565, 538480],
 'panoramic': [116100, 0, 0, 0, 0],
 'view': [116100, 318556, 0, 0, 0],
 'kitchen': [476220, 299675, 360334, 122802, 360306],
 'appliance': [116100, 476220, 0, 0, 0],
 'blue': [318556, 538480, 0, 0, 0],
 'butterfly': [318556, 0, 0, 0, 0],
 'themed': [318556, 0, 0, 0, 0],
 'wall': [318556, 538480, 0, 0, 0],
 'photo': [116100, 124567, 0, 0, 0],
 'dining': [116100, 0, 0, 0, 0],
 'sign': [379340, 0, 0, 0, 0],
 'street': [379340, 385716, 31813, 0, 0],
 'vandalized': [379340, 0, 0, 0, 0],
 'beetle': [379340, 0, 0, 0, 0],
 'road': [379340, 0, 0, 0, 0],
 'border': [318556, 0, 0, 0, 0],
 'paint': [318556, 0, 0, 0, 0],
 'angled': [318556, 0, 0, 0, 0],
 'beautifully': [318556, 0, 0, 0, 0],
 'people': [134754, 18691, 302443, 0, 0],
 'walking': [134754, 293605, 385716, 302443, 0],
 'beach': [134754, 31813, 0, 0, 0],
 'sink': [266366, 538480, 32275, 360306, 56972],
 'to