In [None]:
!pip install allennlp

In [None]:
import warnings
import numpy as np
import pandas as pd

import torch
from allennlp.data import Vocabulary, TokenIndexer, Tokenizer
from allennlp.data.fields import TextField
from allennlp.data.token_indexers import (
    ELMoTokenCharactersIndexer,
)
from allennlp.data.tokenizers import WhitespaceTokenizer

from allennlp.modules.text_field_embedders import BasicTextFieldEmbedder
from allennlp.modules.token_embedders import ElmoTokenEmbedder

from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

In [None]:
# Suppress every Python warning for the remainder of the session.
warnings.filterwarnings(action="ignore")

# Read the ArtEmis release CSV from the mounted Drive folder and preview it.
artemis_csv_path = "/content/drive/MyDrive/Lumiere/Dataset/artemis_dataset_release_v0.csv"
artemis_data = pd.read_csv(artemis_csv_path)
artemis_data.head()


Unnamed: 0,art_style,painting,emotion,utterance,repetition
0,Post_Impressionism,vincent-van-gogh_portrait-of-madame-ginoux-l-a...,something else,"She seems very happy in the picture, and you w...",10
1,Post_Impressionism,vincent-van-gogh_portrait-of-madame-ginoux-l-a...,sadness,This woman has really knotty hands which makes...,10
2,Post_Impressionism,vincent-van-gogh_portrait-of-madame-ginoux-l-a...,something else,"When looking at this woman, I am filled with c...",10
3,Post_Impressionism,vincent-van-gogh_portrait-of-madame-ginoux-l-a...,contentment,"A woman looking at ease, peaceful, and satisfi...",10
4,Post_Impressionism,vincent-van-gogh_portrait-of-madame-ginoux-l-a...,awe,She looks like a lady from that past that migh...,10


In [None]:
# Pull the caption column out of the frame as a plain array of utterances.
artemis_captions = artemis_data.loc[:, "utterance"].values

In [None]:
# Wrap each caption in sentence-boundary markers: "<sos> caption <eos>".
artemis_captions_spl = [
    " ".join(("<sos>", caption, "<eos>")) for caption in artemis_captions
]

In [None]:
# Preview the first ten captions to confirm the <sos>/<eos> wrapping.
artemis_captions_spl[:10]

['<sos> She seems very happy in the picture, and you want to know what what is behind the smile. <eos>',
 '<sos> This woman has really knotty hands which makes her look like she has arthritis. <eos>',
 '<sos> When looking at this woman, I am filled with curiosity about what she is thinking about with her elbow on the table and a very emotionless face. <eos>',
 '<sos> A woman looking at ease, peaceful, and satisfied amongst her books makes me feel content. <eos>',
 '<sos> She looks like a lady from that past that might have been a teacher (books).  She looks tired and I wondered how hard it must have been for them back then. <eos>',
 '<sos> The bright colors make a very unique scene for the interesting shapes. <eos>',
 '<sos> The way the image is presented, with large chunks of paint used to depict each of the subjects, makes for a slight amount of confusion and an unsureness on the part of the viewer: what, exactly, was Kandinsky trying to depict during Autumn? <eos>',
 '<sos> the stro

In [None]:
# AllenNLP components shared by the cells below.
tokenizer: Tokenizer = WhitespaceTokenizer()  # splits captions on whitespace
token_indexer: TokenIndexer = ELMoTokenCharactersIndexer()  # character-level indexer for ELMo
vocab = Vocabulary()  # empty vocabulary passed to TextField.index()

# Filled later: maps a caption string to its embedding array.
embeddings_dict = dict()

In [None]:
# ELMo option and weight files hosted by AllenNLP.
# NOTE(review): both URLs point at the `test_fixture` files and the vectors
# printed later in this notebook are 32-dimensional, so this is presumably a
# tiny fixture model rather than full pretrained ELMo — confirm before relying
# on these embeddings downstream.
elmo_options_file = (
    "https://allennlp.s3.amazonaws.com/models/elmo/test_fixture/options.json"
)
elmo_weight_file = (
    "https://allennlp.s3.amazonaws.com/models/elmo/test_fixture/lm_weights.hdf5"
)

# Frozen ELMo token embedder (weights are not fine-tuned).
elmo_embedding = ElmoTokenEmbedder(
    options_file=elmo_options_file, weight_file=elmo_weight_file, requires_grad=False
)
# The embedder is used for feature extraction only. Modules start in training
# mode, where ELMo's dropout randomly zeroes output values; eval mode disables
# dropout so the extracted embeddings are deterministic and complete.
elmo_embedding.eval()

# Wrap the token embedder under the same key used by the TextField indexers.
embedder = BasicTextFieldEmbedder(token_embedders={"elmo_tokens": elmo_embedding})
embedder.eval()

In [None]:
# Use the first <sos>/<eos>-wrapped caption as a worked example.
text = artemis_captions_spl[0]

In [None]:
# Walk the example caption through the pipeline: tokenize, wrap the tokens in
# a TextField, index the field, then convert it to tensors.
tokens = tokenizer.tokenize(text)
example_indexers = {"elmo_tokens": token_indexer}
text_field = TextField(tokens, example_indexers)
text_field.index(vocab)

# Padding lengths are taken from the field itself, so nothing is padded here.
padding_lengths = text_field.get_padding_lengths()
tensor_dict = text_field.as_tensor(padding_lengths)

In [None]:
# Display the tokens produced by the tokenizer for the example caption.
tokens

[<sos>,
 She,
 seems,
 very,
 happy,
 in,
 the,
 picture,,
 and,
 you,
 want,
 to,
 know,
 what,
 what,
 is,
 behind,
 the,
 smile.,
 <eos>]

In [None]:
# Embed every caption with ELMo and store the result in `embeddings_dict`,
# keyed by the caption string (duplicate captions overwrite each other).
#
# NOTE(review): this embeds the raw captions from `artemis_captions`, not the
# <sos>/<eos>-wrapped strings in `artemis_captions_spl` — confirm that is intended.

# Build the embedder once; it does not change between captions.
embedder = BasicTextFieldEmbedder(token_embedders={"elmo_tokens": elmo_embedding})
# Eval mode disables ELMo's dropout, which otherwise randomly zeroes values in
# the extracted embeddings and makes them differ from run to run.
embedder.eval()

# Pure inference: skip autograd bookkeeping for the whole pass.
with torch.no_grad():
    for caption in artemis_captions:
        tokens = tokenizer.tokenize(caption)
        text_field = TextField(tokens, {"elmo_tokens": token_indexer})
        text_field.index(vocab)

        # Convert the single field to tensors, then batch it (batch of one)
        # because the embedder expects batched input.
        token_tensor = text_field.as_tensor(text_field.get_padding_lengths())
        tensor_dict = text_field.batch_tensors([token_tensor])
        embedded_tokens = embedder(tensor_dict)

        # Store as a NumPy array on the CPU so the dict can be saved with np.save.
        embeddings_dict[caption] = embedded_tokens.detach().cpu().numpy()

In [None]:
# Persist the caption -> embedding dict in the current working directory.
np.save(file='artemis_elmo_embeddings.npy', arr=embeddings_dict)

In [None]:
# Reload the saved dict and print the embedding of the first caption as a check.
# The dict was stored as a pickled object array, so `allow_pickle` must be the
# boolean True (the previous string 'TRUE' only worked because it is truthy)
# and `.item()` unwraps the 0-d array back into the dict.
# NOTE: unpickling can execute arbitrary code — only load files you created yourself.
data_dict = np.load('artemis_elmo_embeddings.npy', allow_pickle=True).item()
print(data_dict[artemis_captions[0]])

[[[ 0.          0.6844415  -0.6827564   0.37949494 -0.
   -0.         -0.         -0.          0.         -0.51658684
    0.          0.         -0.          0.45956087 -0.
   -0.          0.          0.80556774 -0.73684406 -0.
   -0.36114234 -1.536783   -0.          0.35063332 -0.99035996
   -0.         -0.         -0.07399112 -1.1687298   0.
   -0.          0.11975098]
  [ 0.          1.2534962  -0.34084865 -0.         -0.
   -0.24923238 -0.         -1.5924509  -0.         -0.
   -0.          0.23717815 -0.645031    0.         -0.
    0.         -0.          0.          0.         -0.43035048
   -1.7940229  -0.6245768   0.          0.4018072  -0.
   -0.09282405  0.         -0.         -1.624832    0.
   -0.6257524  -1.366801  ]
  [ 0.11926934  0.         -0.         -0.         -0.
   -0.         -0.72425073 -0.         -0.         -0.4223647
   -0.          0.         -0.62927216  0.8310913   0.
    0.         -0.21252863  0.          0.21363086 -0.60022426
   -0.         -0.1303918

In [None]:
import os

# Switch the working directory to the Drive dataset folder, then write the
# embeddings file there (same file name as the earlier local save).
drive_dataset_dir = "/content/drive/MyDrive/Lumiere/Dataset/"
os.chdir(drive_dataset_dir)
np.save(file='artemis_elmo_embeddings.npy', arr=embeddings_dict)