In [1]:
import torch
from transformers import *

import time
import numpy as np

In [44]:
# Model configuration: choose one (model class, tokenizer class, checkpoint
# name) triple. The loading cell reads exactly these three names.
model_class = RobertaModel
tokenizer_class = RobertaTokenizer
pretrained_weights = 'roberta-base'  # TODO: try 'roberta-large'

# Alternative configuration (BERT) -- uncomment all three lines together:
#model_class = BertModel
#tokenizer_class = BertTokenizer
#pretrained_weights = 'bert-base-uncased'

# Further example triples. These are kept as comments rather than a bare
# string literal, because a string as the last expression of a cell is echoed
# as the cell's output.
#   (XLMModel,        XLMTokenizer,        'xlm-mlm-enfr-1024')
#   (DistilBertModel, DistilBertTokenizer, 'distilbert-base-uncased')
#   (RobertaModel,    RobertaTokenizer,    'roberta-base')

In [45]:
# Instantiate the tokenizer and the model from the configured checkpoint name.
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)

# Ask the model to return attention maps and per-layer hidden states in
# addition to its default outputs.
model = model_class.from_pretrained(
    pretrained_weights,
    output_attentions=True,
    output_hidden_states=True,
)

In [4]:
# Example input text (a paper abstract) that the later cells tokenize and feed
# to the model.
# NOTE(review): the literal keeps its hard line breaks and ends with a stray
# single quote plus a newline before the closing triple quote. All of these are
# part of the string and therefore get tokenized -- confirm whether that is
# intended before cleaning the text, since it changes the token count.
abstract = '''Representation and analysis of complex biological and 
engineered systems as directed networks is useful for understanding 
their global structure/function organization. Enrichment of network motifs, 
which are over-represented subgraphs in real networks, can be used for topological
analysis. Because counting network motifs is computationally expensive, only
characterization of 3- to 5-node motifs has been previously reported.
In this study we used a supercomputer to analyze cyclic motifs made of 3–20 nodes 
for 6 biological and 3 technological networks. Using tools from statistical physics, we developed a theoretical framework for characterizing the ensemble of cyclic motifs in real networks. We have identified a generic property of real complex networks, antiferromagnetic organization, which is characterized by minimal directional coherence of edges along cyclic subgraphs, such that consecutive links tend to have opposing direction. As a consequence, we find that the lack of directional coherence in cyclic motifs leads to depletion in feedback loops, where the number of nodes affected by feedback loops appears to be at a local minimum compared with surrogate shuffled networks. This topology provides more dynamic stability in large networks.'
'''

In [53]:
# Tokenize the abstract, run it through the model once, and keep the final
# hidden state, the per-layer hidden states and the attention maps for the
# cells that follow.

# NOTE(review): hidden states are already requested through
# `output_hidden_states=True` when the model is loaded; this attribute
# assignment is kept unchanged but may be redundant -- confirm against the
# installed transformers version.
model.output_hidden_states=True

# Token ids fed to the model, wrapped in a list to form a batch of one.
# Special tokens are requested explicitly here.
input_ids = torch.tensor([tokenizer.encode(abstract, add_special_tokens=True)])

# Sub-word tokens of the same text, printed for inspection only. This call is
# not asked to add special tokens, so the list is not a one-to-one view of
# `input_ids`.
input_tokens = tokenizer.tokenize(abstract)
print("1. tokens: {}".format(input_tokens))
print()
print("2. token_ids tensor: {}".format(input_ids))

# One forward pass with gradient tracking disabled; every output needed below
# is taken from this single result instead of running the model twice.
with torch.no_grad():
    outputs = model(input_ids)

# First output: final-layer hidden state. Last two outputs: the per-layer
# hidden states and the attention maps requested at load time.
last_hidden_state = outputs[0]
all_hidden_states, all_attentions = outputs[-2:]

print("shape of last hidden states: {}".format(np.shape(last_hidden_state)))

1. tokens: ['Represent', 'ation', 'Ġand', 'Ġanalysis', 'Ġof', 'Ġcomplex', 'Ġbiological', 'Ġand', 'Ġ', 'Ċ', 'engine', 'ered', 'Ġsystems', 'Ġas', 'Ġdirected', 'Ġnetworks', 'Ġis', 'Ġuseful', 'Ġfor', 'Ġunderstanding', 'Ġ', 'Ċ', 'their', 'Ġglobal', 'Ġstructure', '/', 'function', 'Ġorganization', '.', 'ĠEn', 'rich', 'ment', 'Ġof', 'Ġnetwork', 'Ġmotif', 's', ',', 'Ġ', 'Ċ', 'which', 'Ġare', 'Ġover', '-', 'represented', 'Ġsub', 'graph', 's', 'Ġin', 'Ġreal', 'Ġnetworks', ',', 'Ġcan', 'Ġbe', 'Ġused', 'Ġfor', 'Ġtop', 'ological', 'Ċ', 'analysis', '.', 'ĠBecause', 'Ġcounting', 'Ġnetwork', 'Ġmotif', 's', 'Ġis', 'Ġcomput', 'ationally', 'Ġexpensive', ',', 'Ġonly', 'Ċ', 'character', 'ization', 'Ġof', 'Ġ3', '-', 'Ġto', 'Ġ5', '-', 'node', 'Ġmotif', 's', 'Ġhas', 'Ġbeen', 'Ġpreviously', 'Ġreported', '.', 'Ċ', 'In', 'Ġthis', 'Ġstudy', 'Ġwe', 'Ġused', 'Ġa', 'Ġsuper', 'computer', 'Ġto', 'Ġanalyze', 'Ġcycl', 'ic', 'Ġmotif', 's', 'Ġmade', 'Ġof', 'Ġ3', 'âĢĵ', '20', 'Ġnodes', 'Ġ', 'Ċ', 'for', 'Ġ6', 'Ġbiological', 

In [54]:
# Report how many hidden-state tensors were returned, followed by the size of
# the first one (one value per output line).
print(len(all_hidden_states), all_hidden_states[0].shape, sep="\n")

13
torch.Size([1, 249, 768])


In [59]:
# Sanity check: the final entry of `all_hidden_states` should be the same
# tensor content as `last_hidden_state`. Index -1 is used instead of a
# hard-coded 12 so the check still targets the last layer when a checkpoint
# with a different number of layers is configured.
print(last_hidden_state[0][0][0])
print(all_hidden_states[-1][0][0][0])

# Compare the full tensors, not just the single printed element.
assert torch.allclose(last_hidden_state, all_hidden_states[-1]), \
    "last_hidden_state differs from the final entry of all_hidden_states"


tensor(0.0121)
tensor(0.0121)
