In [1]:
import pandas as pd
import numpy as np
import torch
import transformers as ppb # pytorch transformers
from sklearn.base import TransformerMixin

In [2]:
data_file = pd.read_csv(r'COVID19_Dataset-text_labels_only.csv')
tweets = np.asarray(data_file['Tweet'].copy())
targets = np.asarray(data_file['Is_Unreliable'].copy())

for i in range(np.shape(tweets)[0]):
    tweets[i] = tweets[i].lower()

In [3]:
model_class, tokenizer_class, pretrained_weights = (ppb.BertModel, ppb.BertTokenizer, 'bert-base-uncased')
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)

In [4]:
tweets

array(['we are living in scary times in canada. gov’t refuses to protect canadians from coronavirus',
       'just as bad in canada. in fact, our government is now accusing us of racism for blaming the chinese for coronavirus',
       'it was only a matter of time before the mainstream media decided to blame the coronavirus on climate change, i suppose.',
       "russia's taking no chances: foreigners infected with the new chinese coronavirus will be quarantined, isolated &amp; deported, pm",
       'although there is now a presumptive confirmed case of wuhan novel coronavirus, i want you to know that ontario is prepared',
       'hooray, finally there is a propaganda banner telling people they can make babies if getting bored staying home #coronavirus',
       '#russia is pushing propaganda claiming the #coronavirus is a us bio weapon targeting #china because of #trump’s trade',
       'the best defense against disturbing new diseases like #coronavirus is bolstering public health syst

In [40]:
text = tweets[0]
            
tokenized = tokenizer.encode(text.lower(), add_special_tokens=True)
#tokenized = np.array(tokenized)
            
# max length of tweet tokens is 83 (from Saswat's code); pad all vectors
maxi = 83
#padded = list()
#padded.append(np.array(tokenized + [0]*(maxi - len(tokenized))))
padded = np.array(tokenized + [0]*(maxi - len(tokenized)))
            
segment_ids = [1]*len(padded)
            
# create tensors
tokens_tensor = torch.tensor([padded])
segments_tensor = torch.tensor([segment_ids])
            
with torch.no_grad():
    last_hidden_states = model(tokens_tensor, segments_tensor)[0] # pull out only the last hidden state
            
last_hidden_states = last_hidden_states.numpy() # dim: tweets x words x features (where tweets = 1)
            
#word_embeddings = last_hidden_states[0,:,:] # dim: words x features (where features = 768)

text_vec = last_hidden_states[0,0,:] # pull out first document (bc there's only one tweet each time) and first token vector (for [CLS] token); dim: features = 768

text_vec

array([-4.49449480e-01,  4.25321728e-01,  4.94645596e-01,  5.93763530e-01,
       -1.88613594e-01, -3.93148750e-01,  8.63426268e-01, -3.46922539e-02,
        1.96507946e-01, -1.21697433e-01, -2.21357152e-01, -5.00074327e-01,
       -3.14357221e-01,  5.74941218e-01,  3.23405057e-01,  7.09725261e-01,
       -1.66956797e-01,  6.07542813e-01, -2.69534774e-02, -7.32733533e-02,
        4.13252771e-01,  2.17606857e-01,  9.99361157e-01,  2.21240193e-01,
       -1.90895736e-01,  3.86233404e-02, -6.20155334e-02, -9.31205392e-01,
       -4.92887259e-01,  6.54744878e-02, -4.53747272e-01,  7.08430529e-01,
        3.49270850e-01, -2.01172739e-01,  4.07417953e-01, -2.92793214e-01,
       -4.83326316e-01, -2.72580147e-01,  5.25475144e-01,  1.95385650e-01,
        6.45896196e-02, -5.79230845e-01,  7.03862846e-01, -1.64437443e-02,
        1.62406713e-02, -6.55786753e-01, -3.35095239e+00,  4.48106676e-01,
       -5.26748709e-02, -7.03745604e-01, -7.03470707e-02, -3.10750127e-01,
        4.19148266e-01,  

In [23]:
last_hidden_states.shape

(1, 83, 768)

In [30]:
padded_new = tokenized + [0]*(maxi - len(tokenized))

input_ids_new = torch.tensor([np.array(padded_new)])

with torch.no_grad():
    last_hidden_states_new = model(input_ids_new)


In [31]:
last_hidden_states_new[0].shape

torch.Size([1, 83, 768])

In [32]:
last_hidden_states_new[1].shape

torch.Size([1, 768])

In [33]:
len(last_hidden_states_new)

2

In [38]:
np.array_equal(last_hidden_states, last_hidden_states_new[0].numpy())

True