## Link Image Caption to Text in Summary

In [80]:
import re
import numpy as np
import pandas as pd
import gensim
from gensim.models import Word2Vec

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from sklearn.metrics.pairwise import cosine_similarity

from sentence_transformers import SentenceTransformer, util

### Clean text and load data

In [27]:
def clean(txt_lst):
    """Normalize and tokenize every string in ``txt_lst``.

    Each string is lower-cased, stripped of URLs, HTML remnants and
    punctuation, filtered against NLTK's English stop-word list and finally
    split with ``nltk.WordPunctTokenizer``.

    Parameters
    ----------
    txt_lst : iterable of str
        Raw sentences / captions.

    Returns
    -------
    list of list of str
        One token list per input string, in input order.
    """
    # Loop-invariant work: build the stop-word set and the tokenizer once per
    # call instead of once per string.
    stops = set(stopwords.words("english"))
    tokenizer = nltk.WordPunctTokenizer()

    def clean_text(text, remove_stopwords=True):
        """Clean a single string and return its tokens."""
        text = text.lower()
        # Drops a URL together with everything that follows it on the same line.
        text = re.sub(r'https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)
        # HTML remnants must be removed BEFORE the punctuation pass below:
        # that pass rewrites '/' to a space, after which '<br />' can never match.
        text = re.sub(r'\<a href', ' ', text)
        text = re.sub(r'<br />', ' ', text)
        text = re.sub(r'&amp;', '', text)
        text = re.sub(r'[_"\-;%()|+&=*%:#$@\[\]/]', ' ', text)
        text = re.sub(r'[.,!?]', '', text)
        text = re.sub(r'\'', ' ', text)
        if remove_stopwords:
            text = " ".join(w for w in text.split() if w not in stops)
        return tokenizer.tokenize(text)

    return [clean_text(text) for text in txt_lst]

def lemmatize(txt_lst):
    """Lemmatize every token of every token list in ``txt_lst``.

    Parameters
    ----------
    txt_lst : iterable of iterable of str
        Tokenized sentences, e.g. the output of ``clean``.

    Returns
    -------
    list of list of str
        Same nesting as the input, with each token passed through
        ``WordNetLemmatizer.lemmatize`` using its default arguments.
    """
    lemmatizer = nltk.stem.WordNetLemmatizer()
    lemmatized = []
    for tokens in txt_lst:
        lemmatized.append([lemmatizer.lemmatize(token) for token in tokens])
    return lemmatized

In [78]:
# read summary and captions

SUMMARY_PATH = "../data/Albania-summary.txt"

# Explicit encoding: the summary contains non-ASCII place names (e.g. "Kodër-Thumanë"),
# so the platform default encoding may fail or mis-decode on some systems.
# NOTE(review): assumes the file is stored as UTF-8 — confirm.
with open(SUMMARY_PATH, encoding="utf-8") as file:
    # One entry per line: remove the \n line break and skip blank lines, which would
    # otherwise become empty sentences with no tokens to compare or embed.
    text = [line.replace("\n", "") for line in file if line.strip()]

# Cleaning does not need the file handle, so it runs after the file is closed.
# `sentences[i]` is the token list of `text[i]`.
sentences = lemmatize(clean(text))


# figure 3, 4, 6
captions_text = ["An aerial photo taken on Wednesday, November 27, shows emergency crews searching the rubble of a collapsed building in Thumane, Albania, the day after a devastating earthquake struck the region.",
                 "A doorframe remains standing amid the rubble of a collapsed building in Thumane.",
                 "Members of the emergency services work to remove debris from a damaged building in Durres."]
captions = lemmatize(clean(captions_text))

### N-gram overlaps (unigram token-set overlap)

In [90]:
# For each caption, report the summary sentence sharing the most unique tokens with it.
for caption in captions:
    caption_vocab = set(caption)
    # (overlap size, sentence index) for every summary sentence.
    scored = [(len(caption_vocab & set(sent)), idx)
              for idx, sent in enumerate(sentences)]
    # max() returns the first maximal pair, so ties go to the earliest sentence;
    # the default mirrors the "no sentence seen yet" state (-1, None).
    max_ngram, max_ngram_id = max(scored, key=lambda pair: pair[0],
                                  default=(-1, None))

    print(max_ngram)
    print(text[max_ngram_id])
    print()

6
Rescuers in Albania dug through the rubble of collapsed buildings in search of survivors on Tuesday, after a 6.4-magnitude earthquake struck the Balkan nation, killing at least 23 people and injuring 650.

3
Rescuers in Albania dug through the rubble of collapsed buildings in search of survivors on Tuesday, after a 6.4-magnitude earthquake struck the Balkan nation, killing at least 23 people and injuring 650.

3
Rama said on Saturday that preliminary figures showed more than 1,465 buildings in the capital, Tirana, and about 900 in nearby Durres were seriously damaged in Tuesday's 6.4-magnitude predawn earthquake.



### Word embeddings

In [36]:
# Relative path to the pretrained word vectors in binary word2vec format.
# NOTE(review): the file name suggests the 300-dimensional GoogleNews vectors;
# the file is not part of this notebook and must exist at this location.
WORD2VEC_PATH = "../GoogleNews-vectors-negative300.bin"
# Load the vectors through gensim; binary=True selects the binary file format.
# The resulting object is used below for `token in word2vec` and `word2vec[token]`.
word2vec = gensim.models.KeyedVectors.load_word2vec_format(WORD2VEC_PATH, binary=True)

In [72]:
# get caption and summary sentence embedding matrix

def embed_sentence(sentences, model=None):
    """Embed each tokenized sentence as the mean of its word vectors.

    Parameters
    ----------
    sentences : iterable of iterable of str
        Tokenized sentences.
    model : mapping-like, optional
        Word-vector lookup supporting ``token in model``, ``model[token]`` and
        a ``vector_size`` attribute. Defaults to the notebook-level
        ``word2vec`` vectors.

    Returns
    -------
    numpy.ndarray
        One row per sentence. A sentence with no in-vocabulary tokens
        (including an empty one) is mapped to the zero vector instead of NaN,
        so the matrix stays rectangular and finite.
    """
    if model is None:
        model = word2vec

    matrix = []
    for sent in sentences:
        # Out-of-vocabulary tokens are skipped.
        vectors = [model[token] for token in sent if token in model]
        if vectors:
            matrix.append(np.mean(vectors, axis=0))
        else:
            # np.mean([]) would yield a NaN scalar and a RuntimeWarning; use a
            # zero vector instead. float32 is assumed to match the dtype of the
            # loaded gensim vectors — TODO confirm.
            matrix.append(np.zeros(model.vector_size, dtype=np.float32))

    return np.array(matrix)

# One averaged word-vector row per caption / per summary sentence.
caption_matrix = embed_sentence(captions)
summary_matrix = embed_sentence(sentences)
# Sanity check: the row counts should equal len(captions) and len(sentences).
print(caption_matrix.shape, summary_matrix.shape)

(3, 300) (23, 300)


In [88]:
# Pairwise cosine similarity: one row per caption, one column per summary sentence.
cosine = cosine_similarity(caption_matrix, summary_matrix)  # 3 x 23
# Index of the best-scoring summary sentence for every caption.
best_sentence_ids = np.argmax(cosine, axis=1)
for scores, best_id in zip(cosine, best_sentence_ids):
    print(scores[best_id])
    print(text[best_id])
    print()

0.8105717
Rescuers in Albania dug through the rubble of collapsed buildings in search of survivors on Tuesday, after a 6.4-magnitude earthquake struck the Balkan nation, killing at least 23 people and injuring 650.

0.6932562
Videos and pictures shared on social media showed chaotic scenes of residents rummaging through the rubble, or trying to extricate people trapped under collapsed buildings.

0.66305846
He said more than 1,465 buildings in Tirana and about 900 in nearby Durres had been seriously damaged. Durres castle walls damaged by the earthquake In Albania, a large proportion of the earthquake damage has been blamed on corruption, violations of the building code and substandard construction following the demise of communism during the early 1990s.



### SentenceBERT: sentence embeddings

In [75]:
# Load the pretrained SentenceBERT model identified by this name.
sbert_model = SentenceTransformer('bert-base-nli-mean-tokens')
# Encode the raw (uncleaned) summary lines; sentence_embeddings[i] belongs to text[i].
sentence_embeddings = sbert_model.encode(text)

In [89]:
# For each raw caption, print the best cosine score and the matching summary line.
for caption in captions_text:
    # Encode to a NumPy array (encode's default) instead of a tensor: a tensor would
    # stay on the model's device (possibly a GPU), while `sentence_embeddings` is
    # expected to be a host-side array from a default `encode` call in an earlier
    # cell — mixing the two can raise a device-mismatch error.
    caption_embedding = sbert_model.encode(caption)
    cos_scores = util.pytorch_cos_sim(caption_embedding, sentence_embeddings)[0]
    # .cpu() is a no-op on CPU tensors and keeps .numpy() valid if the scores ever
    # end up on an accelerator.
    cos_scores = cos_scores.cpu().numpy()
    print(np.max(cos_scores))
    print(text[np.argmax(cos_scores)])
    print()

0.88861513
Rescuers in Albania dug through the rubble of collapsed buildings in search of survivors on Tuesday, after a 6.4-magnitude earthquake struck the Balkan nation, killing at least 23 people and injuring 650.

0.56913984
Four buildings, including a five-storey apartment block, collapsed in Kodër-Thumanë and the town was hardest hit from the earthquake.

0.73777497
Earthquake damage is being checked by civil engineers from the European Union, United States and local experts to assess whether buildings are structurally sound, unsafe and required demolition or just needed replastering.

