In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

#!pip install transformers
import torch
from transformers import BertTokenizer, BertModel
import json
from tqdm import tqdm
import gc

In [2]:
!wget https://cs.stanford.edu/people/karpathy/deepimagesent/flickr30k.zip

In [3]:
!unzip flickr30k.zip

In [4]:
# List the contents of the flickr30k directory.
!ls flickr30k

In [5]:
# Print the readme that ships in the flickr30k directory.
!cat flickr30k/readme.txt

In [6]:
# Parse the dataset description into `data`.
# The encoding is pinned because JSON is UTF-8 by specification, whereas a bare
# open() falls back to the platform's locale encoding.
with open('flickr30k/dataset.json', encoding='utf-8') as jsonfile:
    data = json.load(jsonfile)

In [7]:
# Number of entries under data['images']; displayed as the cell output.
NIMAGES = len(data['images'])
NIMAGES

In [8]:
# Show the first image entry to inspect its structure.
data['images'][0]

In [9]:
# Collect, per image, the raw caption strings and their sentence ids.
# Both lists are indexed by image position, so RAW_SENTENCES[k] and
# SENT_SENTENCES[k] describe the same image.
RAW_SENTENCES  = []
SENT_SENTENCES = []

# Iterate over the image entries directly instead of indexing by position.
# The 'tokens' field is not read: it was collected before but never used.
for img_data in tqdm(data['images']):
    sentences = img_data['sentences']
    RAW_SENTENCES.append([s['raw'] for s in sentences])
    SENT_SENTENCES.append([s['sentid'] for s in sentences])

In [10]:
# Show the raw captions and the sentence ids collected for the first image.
RAW_SENTENCES[0], SENT_SENTENCES[0]

In [11]:
# Both lists get one entry per image, so the two lengths should match.
len(RAW_SENTENCES), len(SENT_SENTENCES)

## BERT example

In [12]:
# Name of the pretrained checkpoint; the tokenizer and the model are both
# loaded from this same name so they stay paired.
MODEL          = 'bert-base-uncased'
bert_tokenizer = BertTokenizer.from_pretrained(MODEL)
bert_model     = BertModel.from_pretrained(MODEL)

In [13]:
# Tokenizer usage reference:
# https://huggingface.co/docs/transformers/preprocessing

sample_text = "Hello world, testing this tokenizer"

# Round trip: text -> token ids -> text, to see what the tokenizer adds.
in_tokens = bert_tokenizer.encode(sample_text)
print(in_tokens)
print(bert_tokenizer.decode(in_tokens))

# Calling the tokenizer returns the keyword inputs the model expects,
# here as PyTorch tensors.
inputs = bert_tokenizer(sample_text, return_tensors="pt")
print(inputs)

# Inference only: disable autograd so no computation graph is built.
with torch.no_grad():
    outputs = bert_model(**inputs)
last_hidden_states = outputs.last_hidden_state
print(last_hidden_states.shape)

## Extract embeddings

In [None]:
# Run each image's group of captions through BERT and keep the results.
EMBD = []    # per image: the model's last_hidden_state for its caption batch
INPUTS = []  # per image: the tokenizer output that was fed to the model

# zip() has no len(), so tqdm is given the total explicitly; otherwise the
# progress bar cannot show a percentage or an ETA.
caption_groups = zip(RAW_SENTENCES, SENT_SENTENCES)

# Inference only. Without no_grad(), every tensor stored in EMBD would keep its
# whole autograd graph (all intermediate activations) alive, so memory would
# grow with each image. Under no_grad() the stored tensors carry no graph,
# which also makes the per-iteration del / gc.collect() unnecessary.
with torch.no_grad():
    # sent_ids is currently unused; it is kept so the pairing with the
    # sentence ids stays visible.
    for sentences, sent_ids in tqdm(caption_groups, total=len(RAW_SENTENCES)):

        # Sanity check: every image is expected to come with five captions.
        assert len(sentences) == 5

        # Pad to the longest caption in the group so they form one batch.
        inputs = bert_tokenizer(sentences, padding=True, truncation=True, return_tensors="pt")
        INPUTS.append(inputs)

        EMBD.append(bert_model(**inputs).last_hidden_state)