In [1]:
from mmf.common.registry import registry
import torch

In [2]:
model_cls = registry.get_model_class("visual_bert")
model = model_cls.from_pretrained("visual_bert.finetuned.hateful_memes.from_coco")

  + "Switching to CPU version."
See the compact keys issue for more details: https://github.com/omry/omegaconf/issues/152


In [3]:
print(model)

VisualBERT(
  (model): VisualBERTForClassification(
    (bert): VisualBERTBase(
      (embeddings): BertVisioLinguisticEmbeddings(
        (word_embeddings): Embedding(30522, 768, padding_idx=0)
        (position_embeddings): Embedding(512, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (token_type_embeddings_visual): Embedding(2, 768)
        (position_embeddings_visual): Embedding(512, 768)
        (projection): Linear(in_features=2048, out_features=768, bias=True)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0): BertLayer(
            (attention): BertAttention(
              (self): BertSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_featur

In [4]:
object_methods = [method_name for method_name in dir(model)
                  if callable(getattr(model, method_name))]

print(object_methods)

['__call__', '__class__', '__delattr__', '__dir__', '__eq__', '__format__', '__ge__', '__getattr__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__setstate__', '__sizeof__', '__str__', '__subclasshook__', '_apply', '_get_name', '_load_from_state_dict', '_named_members', '_register_load_state_dict_pre_hook', '_register_state_dict_hook', '_replicate_for_data_parallel', '_save_to_state_dict', '_slow_forward', 'add_custom_params', 'add_module', 'apply', 'bfloat16', 'buffers', 'build', 'children', 'config_path', 'cpu', 'cuda', 'double', 'eval', 'extra_repr', 'flatten', 'flatten_for_bert', 'float', 'format_for_prediction', 'format_state_key', 'forward', 'from_pretrained', 'get_optimizer_parameters', 'half', 'init_losses', 'load_requirements', 'load_state_dict', 'model', 'modules', 'named_buffers', 'named_children', 'named_modules', 'named_parameters', 'parameters

In [5]:
from bert.tokenization.bert_tokenization import FullTokenizer
tokenizer = FullTokenizer(
      vocab_file='pretrained_bert_model/vocab.txt')

In [6]:
import json

data_dir = '../facebook_challenge_data/'
model_dir = 'models/'

# load data and print sizes
# load data and print sizes
def get_dict(path):
    jsonl_content = open(path, 'r').read()
    data = [json.loads(jline) for jline in jsonl_content.split('\n')]
    return {datum['id'] : datum for datum in data}


train_dict = get_dict(data_dir + 'train.jsonl')
val_dict = get_dict(data_dir + 'dev.jsonl')
test_dict = get_dict(data_dir + 'test.jsonl')

print(len(train_dict))
print(len(val_dict))
print(len(test_dict))

def get_text_data(dictionary):
    return [(datum['text'], datum['label']) for datum in dictionary.values()]

train_data = get_text_data(train_dict)
val_data = get_text_data(val_dict)

8500
500
1000


In [7]:
from tqdm import tqdm
import tensorflow as tf
import numpy as np

# tokenize the sequences
# https://colab.research.google.com/drive/1WQY_XxdiCVFzjMXnDdNfUjDFi0CN5hkT#scrollTo=TApTW_wLxoA9

MAX_SEQ_LEN = 50

def list_to_mats(l, tokenizer, pad_len):
    texts, labels = zip(*l)
    
    sequences = []
    for text in tqdm(texts):
        tokens = ["[CLS]"] + tokenizer.tokenize(text) + ["[SEP]"]
        tokens = [t for t in tokens if t[0]!='<'] # remove twitter specific text
        token_ids = tokenizer.convert_tokens_to_ids(tokens)
        sequences.append(token_ids)
    
    x = tf.keras.preprocessing.sequence.pad_sequences(sequences, maxlen=MAX_SEQ_LEN, padding='post')
    y = np.asarray(labels)
    return x, y

x_train, y_train = list_to_mats(train_data, tokenizer, MAX_SEQ_LEN)
x_val, y_val = list_to_mats(val_data, tokenizer, MAX_SEQ_LEN)
# x_test, y_test = list_to_mats(test_data, tokenizer, MAX_SEQ_LEN)

print('Shape of data tensor:', x_train.shape)
print('Shape of label tensor:', y_train.shape)

100%|██████████| 8500/8500 [00:01<00:00, 4990.34it/s]
100%|██████████| 500/500 [00:00<00:00, 5791.83it/s]

Shape of data tensor: (8500, 50)
Shape of label tensor: (8500,)





In [8]:
# use input mask and segment ids from
# https://colab.research.google.com/github/google-research/bert/blob/master/predicting_movie_reviews_with_bert_on_tf_hub.ipynb#scrollTo=IhJSe0QHNG7U
from mmf.common.sample import Sample, SampleList

input_mask = torch.tensor(np.ones(MAX_SEQ_LEN)).long()
segment_ids = torch.tensor(np.zeros(MAX_SEQ_LEN)).long()

sample_list = [Sample({'input_ids': torch.tensor(x).long(),
                      'input_mask': input_mask,
                      'segment_ids': segment_ids}) for x in x_val]
sample_list = SampleList(sample_list)

In [9]:
import time
t0 = time.time()
preds = model.forward(sample_list)
print(time.time()-t0)

8.94962453842163


In [10]:
print(preds)
pred_list = preds['scores'].tolist()

{'scores': tensor([[ 3.3944, -3.3940],
        [ 4.1372, -4.5100],
        [ 4.0722, -4.3152],
        [ 3.7807, -4.3434],
        [ 4.1213, -4.4178],
        [ 0.6177, -0.1645],
        [ 3.9144, -4.2022],
        [ 3.7445, -4.4305],
        [ 3.9649, -4.4931],
        [ 4.0661, -4.4082],
        [ 4.0079, -4.7269],
        [ 2.8871, -2.6498],
        [ 3.8696, -4.3932],
        [ 3.7421, -4.0838],
        [ 3.7046, -4.1818],
        [ 3.7688, -4.0062],
        [ 3.9317, -4.4949],
        [ 3.6897, -4.1206],
        [ 3.6248, -3.9990],
        [ 4.0347, -4.5704],
        [ 4.0118, -4.5896],
        [ 3.3246, -3.0516],
        [ 3.6051, -3.9703],
        [ 4.0415, -4.4295],
        [ 3.9168, -4.4417],
        [ 3.7617, -4.2302],
        [ 3.6068, -4.1093],
        [ 4.0320, -4.4828],
        [ 3.5725, -3.8096],
        [ 2.4044, -1.8997],
        [ 3.2938, -3.7659],
        [ 4.0818, -4.4964],
        [ 3.9449, -4.2842],
        [ 3.7612, -4.3850],
        [ 3.7821, -3.8485],
        [

In [14]:
softmaxed = torch.nn.functional.softmax(preds['scores'])
print(softmaxed)

predicted = [t[1].item() for t in softmaxed]
print()

tensor([[9.9887e-01, 1.1255e-03],
        [9.9982e-01, 1.7560e-04],
        [9.9977e-01, 2.2767e-04],
        [9.9970e-01, 2.9622e-04],
        [9.9980e-01, 1.9563e-04],
        [6.8615e-01, 3.1385e-01],
        [9.9970e-01, 2.9847e-04],
        [9.9972e-01, 2.8152e-04],
        [9.9979e-01, 2.1215e-04],
        [9.9979e-01, 2.0871e-04],
        [9.9984e-01, 1.6086e-04],
        [9.9608e-01, 3.9233e-03],
        [9.9974e-01, 2.5785e-04],
        [9.9960e-01, 3.9909e-04],
        [9.9962e-01, 3.7569e-04],
        [9.9958e-01, 4.1991e-04],
        [9.9978e-01, 2.1892e-04],
        [9.9959e-01, 4.0538e-04],
        [9.9951e-01, 4.8843e-04],
        [9.9982e-01, 1.8312e-04],
        [9.9982e-01, 1.8380e-04],
        [9.9830e-01, 1.6986e-03],
        [9.9949e-01, 5.1263e-04],
        [9.9979e-01, 2.0940e-04],
        [9.9977e-01, 2.3434e-04],
        [9.9966e-01, 3.3807e-04],
        [9.9955e-01, 4.4542e-04],
        [9.9980e-01, 2.0044e-04],
        [9.9938e-01, 6.2194e-04],
        [9.866

  """Entry point for launching an IPython kernel.


In [18]:
from sklearn.metrics import roc_auc_score, f1_score, precision_score, recall_score, accuracy_score

print('Test AUROC:', roc_auc_score(y_val, predicted))

preds_bin = [1 if i > 0.1 else 0 for i in predicted]
print(preds_bin)
print('Val acc:', accuracy_score(y_val, preds_bin))

Test AUROC: 0.5775520000000001
[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [16]:
print(predicted)

[0.0011254631681367755, 0.00017559983825776726, 0.00022766573238186538, 0.0002962196012958884, 0.00019562843954190612, 0.31385016441345215, 0.0002984744496643543, 0.00028152469894848764, 0.00021215301239863038, 0.00020871087326668203, 0.00016086376854218543, 0.003923343028873205, 0.00025785318575799465, 0.0003990858676843345, 0.0003756936639547348, 0.0004199052054900676, 0.00021891521464567631, 0.0004053784068673849, 0.0004884320078417659, 0.00018312406609766185, 0.00018379895482212305, 0.0016986430855467916, 0.0005126335308887064, 0.0002093981602229178, 0.00023433593742083758, 0.00033807242289185524, 0.0004454247246030718, 0.00020043957920279354, 0.0006219418137334287, 0.013332842849195004, 0.0008582607842981815, 0.00018813024507835507, 0.0002666839864104986, 0.0002897251979447901, 0.0004851285193581134, 0.00029084301786497235, 0.00027539092116057873, 0.0005018523661419749, 0.00023202685406431556, 0.001176820369437337, 0.00030185302603058517, 0.0014672846300527453, 0.00018275078036822

In [23]:
model.train()

VisualBERT(
  (model): VisualBERTForClassification(
    (bert): VisualBERTBase(
      (embeddings): BertVisioLinguisticEmbeddings(
        (word_embeddings): Embedding(30522, 768, padding_idx=0)
        (position_embeddings): Embedding(512, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (token_type_embeddings_visual): Embedding(2, 768)
        (position_embeddings_visual): Embedding(512, 768)
        (projection): Linear(in_features=2048, out_features=768, bias=True)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0): BertLayer(
            (attention): BertAttention(
              (self): BertSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_featur