In [2]:
### HIDDEN STATES ###
import torch
from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM
import pandas as pd

# OPTIONAL: if you want to have more information on what's happening, activate the logger as follows
import logging
logging.basicConfig(level=logging.INFO)

# Load pre-trained model (weights)
model = BertModel.from_pretrained('bert-base-uncased')
# Load pre-trained model tokenizer (vocabulary)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Text
from data_loader import load_data, clean_data

subtask = 'a'
texts = ["Here is the sentence I want embeddings for.",
"After stealing money from the bank vault, the bank robber was seen fishing on the Mississippi river bank."]

train, val, test = load_data(subtask)

#Clean data
X_train, y_train = clean_data(train)
X_val, y_val = clean_data(val)
X_test, y_test = clean_data(test)

max_seq_length = 40
def get_features(texts):

    col_names = ["tokenized_text","tokens_tensor","segments_tensors"]
    features = pd.DataFrame(columns=col_names)

    for text in texts:
        marked_text = "[CLS] " + text + " [SEP]"

        #print("Text: ", text)

        # Tokenization
        tokenized_text = tokenizer.tokenize(marked_text)
        #print("Tokenized text: ", tokenized_text)

        # Convert token to vocabulary indices
        indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)

        # Segement IDs
        segments_ids = [1] * len(tokenized_text)

        # Padding

        padding = [0] * (max_seq_length - len(indexed_tokens))

        indexed_tokens += padding
        segments_ids += padding

        # Convert inputs to PyTorch tensors
        tokens_tensor = torch.tensor([indexed_tokens])
        segments_tensors = torch.tensor([segments_ids])

        input_features = {'tokenized_text': tokenized_text, 'tokens_tensor':tokens_tensor, 'segments_tensors':segments_tensors}
        features.loc[len(features)] = input_features

    return features

features = get_features(X_test)
# val_features = get_features(X_val)
# test_features = get_features(X_test)
#print(features)

### Predict hidden states features for each layer ###
model.eval()

with torch.no_grad():
    # all_encoded_layers = []

    # for i  in range(len(features.index)):
        #print(i)
    encoded_layers, _ = model(features['tokens_tensor'][0], features['segments_tensors'][0])
        # all_encoded_layers.append(encoded_layers)


def get_embedding(encoded_layers):
    token_embeddings = []
    for token_i in range(max_seq_length):

        hidden_layers = []

        for layer_i in range(len(encoded_layers)):
            vec = encoded_layers[layer_i][0][token_i]

            hidden_layers.append(vec)

    token_embeddings.append(hidden_layers)

    ### First Layer ###
    first_layer = torch.mean(encoded_layers[0], 1)
    # print(len(first_layer), len(first_layer[0]))

    ### Second-to-Last Hidden ###
    second_to_last = torch.mean(encoded_layers[11], 1)

    # print(len(second_to_last), len(second_to_last[0]))

    ### Sum Last Four Hidden ###
    # token_last_four_sum = []
    #
    # for token in token_embeddings:
    #     sum_vec = torch.sum(torch.stack(token)[-4:], 0)
    #     token_last_four_sum.append(sum_vec)
    #
    # print(len(token_last_four_sum), len(token_last_four_sum[0]))
    token_last_four_sum = [torch.sum(torch.stack(token)[-4:], 0) for token in token_embeddings]

    ### Concat Last Four Hidden ###
    # token_last_four_cat = []
    # for token in token_embeddings:
    #     cat_vec = torch.cat((token[-1], token[-2], token[-3], token[-4]), 0)
    #     token_last_four_cat.append(cat_vec)
    #
    # print(len(token_last_four_cat), len(token_last_four_cat[0]))
    token_last_four_cat = [torch.cat((token[-1], token[-2], token[-3], token[-4]), 0) for token in token_embeddings]

    ### Sum All 12 Layers ###
    # token_sum_all = []
    #
    # for token in token_embeddings:
    #     sum_vec = torch.sum(torch.stack(token)[0:], 0)
    #     token_sum_all.append(sum_vec)
    #
    # print(len(token_sum_all), len(token_sum_all[0]))
    token_sum_all = [torch.sum(torch.stack(token)[0:], 0) for token in token_embeddings]

    return first_layer, second_to_last, token_last_four_sum, token_last_four_cat, token_sum_all

first, second_to_last, last_four_sum, last_four_cat, sum_all = get_embedding(encoded_layers)

# col_names = ['first', 'second_to_last', 'last_four_sum', 'last_four_cat', 'sum_all']
# embeddings = pd.DataFrame(columns=col_names)

# for i, encoded_layer in enumerate(all_encoded_layers):
#     print(i)
#     first, second_to_last, last_four_sum, last_four_cat, sum_all = get_embedding(encoded_layer)

#     input_embeddings = {'first': first, 'second_to_last':second_to_last, 'last_four_sum':last_four_sum, 'last_four_cat':last_four_cat, 'sum_all':sum_all}
#     embeddings.loc[len(embeddings)] = input_embeddings

# # print(embeddings)

# embeddings.to_csv("data/test_embeddings_bert.csv")


INFO:pytorch_pretrained_bert.modeling:loading archive file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased.tar.gz from cache at /Users/adebruijn/.pytorch_pretrained_bert/9c41111e2de84547a463fd39217199738d1e3deb72d4fec4399e6e241983c6f0.ae3cef932725ca7a30cdcb93fc6e09150a55e2a130ec7af63975a16c153ae2ba
INFO:pytorch_pretrained_bert.modeling:extracting archive file /Users/adebruijn/.pytorch_pretrained_bert/9c41111e2de84547a463fd39217199738d1e3deb72d4fec4399e6e241983c6f0.ae3cef932725ca7a30cdcb93fc6e09150a55e2a130ec7af63975a16c153ae2ba to temp dir /var/folders/_z/cmpd3s213g95yywl93cyfvdc0000gq/T/tmpupqq8o4x
INFO:pytorch_pretrained_bert.modeling:Model config {
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "type_vocab_size": 2,
  "vocab_size": 30522
}

INFO:p

In [3]:
first

tensor([[ 0.3141, -0.4490, -0.0811, -0.4260, -0.1309,  0.1133, -0.1384,  0.3867,
          0.0093, -0.0251,  0.1767,  0.0027, -0.3276,  0.2614, -0.3140,  0.1850,
          0.2464, -0.1266,  0.0128, -0.1044, -0.1372, -0.1791,  0.1078, -0.0380,
          0.2169, -0.0779, -0.0572,  0.0836,  0.0460,  0.2432,  0.2034,  0.1800,
          0.2450,  0.0862, -0.0305,  0.0457, -0.1843,  0.0195,  0.2808, -0.2686,
          0.0086, -0.2583, -0.0541,  0.1873,  0.1133, -0.4559,  0.3318, -0.0180,
          0.3753, -0.3129, -0.4807,  0.1477,  0.1623, -0.7589,  0.2420, -0.1196,
          0.2329, -0.0635,  0.3093, -0.1157,  0.0907,  0.2149,  0.1216, -0.1242,
         -0.5222,  0.0860, -0.0279,  0.2214, -0.3305,  0.1967, -0.1007, -0.0972,
         -0.5381, -0.1814,  0.2152,  0.1256, -0.0513,  0.3066,  0.2815, -0.2962,
          0.0243,  0.1938,  0.2016, -0.0797,  0.0613, -0.0596,  0.0259,  0.0251,
          0.1348,  0.0294,  0.2524, -0.0850,  0.1291,  0.4252, -0.2034,  0.1876,
          0.4001, -0.0131, -

In [16]:
second_to_last

tensor([[ 0.2497, -0.2608,  0.4228,  0.1481,  0.2560, -0.2400,  0.0675,  0.6774,
         -0.2617,  0.2635, -0.0048, -0.1584, -0.6036,  0.4951, -0.0106,  0.8101,
          0.1081,  0.1142, -0.2973,  0.6830,  0.5105, -0.0659,  0.1132,  0.3114,
          0.4098, -0.1282,  0.0599, -0.1679, -0.4782,  0.0365,  0.5590,  0.0988,
         -0.2772, -0.2289,  0.4038,  0.2477, -0.3453,  0.0540,  0.3237,  0.0352,
         -0.4339, -0.7116, -0.4096,  0.5067,  0.3141, -0.3890,  0.4667,  0.1607,
         -0.1111,  0.0586, -0.2740,  0.2439, -0.2019, -0.6187,  0.4109,  0.4841,
         -0.3722, -0.8074, -0.1909, -0.5252,  0.0999, -0.2436,  0.5261, -0.3647,
          0.3348, -0.0462, -0.2086,  0.4089, -0.8180, -0.3386, -0.3974, -0.2547,
         -0.1708,  0.0749, -0.0274, -0.2568, -0.3363,  0.4073, -0.1258, -0.0276,
          0.1476,  0.6075, -0.0536,  0.6096,  0.0131,  0.2635,  0.2509,  0.2679,
          0.0215,  0.6494,  0.1804,  0.0337,  0.6152, -0.1040, -0.0813,  0.0083,
          0.2014, -0.1073, -

In [22]:
torch(last_four_sum[0])

TypeError: 'module' object is not callable

In [19]:
len(last_four_cat[0])

3072

In [32]:
print ("Our final sentence embedding vector of shape:", first[0].shape[0])
print ("Our final sentence embedding vector of shape:", second_to_last[0].shape[0])
print ("Our final sentence embedding vector of shape:", last_four_sum[0].shape[0])
print ("Our final sentence embedding vector of shape:", last_four_cat[0].shape[0])
print ("Our final sentence embedding vector of shape:", sum_all[0].shape[0])


Our final sentence embedding vector of shape: 768
Our final sentence embedding vector of shape: 768
Our final sentence embedding vector of shape: 768
Our final sentence embedding vector of shape: 3072
Our final sentence embedding vector of shape: 768


In [43]:
first.size()

torch.Size([1, 768])

In [44]:
second_to_last.size().

torch.Size([1, 768])

In [53]:
last_four_sum[0].view(1,-1).size()

torch.Size([1, 768])

In [55]:
first.size()

torch.Size([1, 768])

In [42]:
torch.FloatTensor([last_four_sum[0]])

ValueError: only one element tensors can be converted to Python scalars

In [56]:
r1 = torch.Tensor(2, 3)
r1.size()

torch.Size([2, 3])

In [66]:
torch.stack([torch.Tensor(3), torch.Tensor(3)])

tensor([[0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000]])