In [1]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertModel
import lmdb
import pickle
from tqdm import tqdm

In [2]:
def get_features_and_token_counts(df, tokenizer, model, include_cls=True, max_length=20):
    """Encode the narrations in ``df`` and count their non-padding tokens.

    Args:
        df: DataFrame with a ``'narration'`` column of strings.
        tokenizer: Callable tokenizer; invoked with ``padding='max_length'``,
            ``truncation=True`` and ``return_tensors='pt'``, and expected to
            return a mapping that contains ``'attention_mask'``.
        model: Model exposing ``.device`` whose output has a
            ``last_hidden_state`` attribute.
        include_cls: If True, return the full attention-mask sum per row.
            If False, subtract one from it.
            NOTE(review): only one token is subtracted, so any other special
            token the tokenizer adds (e.g. a trailing [SEP]) is presumably
            still counted -- confirm this is what callers want.
        max_length: Length every narration is padded/truncated to.

    Returns:
        A ``(features, num_tokens)`` tuple. ``features`` is the model's
        ``last_hidden_state`` for the whole batch. ``num_tokens`` is an
        ``int`` when ``df`` has exactly one row, otherwise a ``list`` of ints
        with one entry per row.
    """
    narrations = df['narration'].tolist()
    encoded_input = tokenizer(
        narrations,
        padding='max_length',
        truncation=True,
        max_length=max_length,
        return_tensors='pt',
    )
    # Inputs must live on the same device as the model weights.
    encoded_input = {key: val.to(model.device) for key, val in encoded_input.items()}

    # Pure feature extraction: no autograd graph is needed.
    with torch.no_grad():
        model_output = model(**encoded_input)

    features = model_output.last_hidden_state

    # Attention masks are 1 for real tokens and 0 for padding, so the row sum
    # is the number of tokens before padding.
    token_counts = torch.sum(encoded_input['attention_mask'], dim=1)
    if not include_cls:
        token_counts = token_counts - 1

    # Tensor.item() is only defined for one-element tensors; calling it on a
    # multi-row batch raises. Keep the scalar return for the single-row case
    # and hand back one count per row otherwise.
    if token_counts.numel() == 1:
        num_tokens = token_counts.item()
    else:
        num_tokens = token_counts.tolist()

    return features, num_tokens

In [3]:
# Load the tokenizer and the encoder from the same pretrained checkpoint so
# the vocabulary and the weights are guaranteed to match.
pretrained_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(pretrained_name)
model = BertModel.from_pretrained(pretrained_name)

In [4]:
# Annotation CSVs for the three EPIC splits (train / val / test).
train_csv, val_csv, test_csv = (
    "/private/home/arjunrs1/CliMer/data/epic/EPIC_train.csv",
    "/private/home/arjunrs1/CliMer/data/epic/EPIC_val.csv",
    "/private/home/arjunrs1/CliMer/data/epic/EPIC_test.csv",
)

In [5]:
# Read the three splits in train / val / test order, keep each one under its
# own name, and stack them into a single frame so every narration is processed.
split_frames = [pd.read_csv(csv_path) for csv_path in (train_csv, val_csv, test_csv)]
data_train, data_val, data_test = split_frames
data = pd.concat(split_frames)
# Optional debugging subset, left disabled:
#data = data[data.video_id.isin(['P01_01', 'P01_02'])] #For now, we have restricted to just these video_ids

In [6]:
# Path to LMDB database
lmdb_path = '/private/home/arjunrs1/CliMer/lmdb_bert_features'

# Create or open the LMDB database; map_size is the maximum size in bytes
# the database is allowed to grow to.
env = lmdb.open(lmdb_path, map_size=int(4e9))

# try/finally guarantees the environment is closed even when tokenization,
# inference or a write fails part-way through; otherwise the handle (and its
# lock) would stay open in the running kernel.
try:
    # Single write transaction for the whole pass: used as a context manager
    # it is committed on normal exit and aborted if an exception escapes.
    with env.begin(write=True) as txn:
        for index, row in tqdm(data.iterrows(), total=data.shape[0]):
            # Compute features for each narration (one-row frame per call)
            features, num_tokens = get_features_and_token_counts(pd.DataFrame([row]), tokenizer, model)

            # Prepare the data structure. The tensor is moved to CPU before
            # pickling so the stored record can be loaded on a machine
            # without a GPU (a no-op when the model already runs on CPU).
            to_save = {
                'features': features.detach().cpu(),
                'num_tokens': num_tokens
            }

            # Serialize the data structure
            serialized_data = pickle.dumps(to_save)

            # Put the serialized data in the database with the ASCII-encoded key.
            # A repeated narration_id overwrites the earlier record.
            clip_id = row['narration_id'].encode('ascii')
            txn.put(clip_id, serialized_data)
finally:
    env.close()

100%|██████████| 50167/50167 [16:26<00:00, 50.87it/s]


In [None]:
# Scratch loop: recompute the features one narration at a time without
# storing them anywhere; only the last row's results remain bound afterwards.
for index, row in data.iterrows():
    features, num_tokens = get_features_and_token_counts(
        df=pd.DataFrame([row]),
        tokenizer=tokenizer,
        model=model,
    )