In [25]:
from transformers import BertTokenizer
import torch
from tqdm import tqdm
DEVICE = torch.device('cpu')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
CHECKPOINT_PATH = '/home/student/workspace/Truthseeker/checkpoints/checkpoint_with_maxlength_400_backup'

In [26]:
def encode(sentence):
    return tokenizer.encode_plus(
                        sentence,                      # Sentence to encode.
                        add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                        max_length = 410,           # Pad & truncate all sentences.
                        pad_to_max_length = True,
                        return_attention_mask = True,   # Construct attn. masks.
                        return_tensors = 'pt',     # Return pytorch tensors.
                   )

In [27]:
from transformers import BertForSequenceClassification, AdamW, BertConfig

# Load BertForSequenceClassification, the pretrained BERT model with a single 
# linear classification layer on top. 
model = BertForSequenceClassification.from_pretrained(
    CHECKPOINT_PATH, # Use the 12-layer BERT model, with an uncased vocab.
    num_labels = 2, # The number of output labels--2 for binary classification.
                    # You can increase this for multi-class tasks.   
    output_attentions = False, # Whether the model returns attentions weights.
    output_hidden_states = False, # Whether the model returns all hidden-states.
)

#Loading from statedict
#model.load_state_dict(torch.load('final.ckpt'))

# Tell pytorch to run this model on the GPU.
model.to(DEVICE)
model = model.eval()


In [28]:
import pandas as pd
DATASET_PATH = "/home/student/datasets/TruthSeeker2023/Truth_Seeker_Model_Dataset.csv"
df = pd.read_csv(DATASET_PATH)

print('Number of training sentences: {:,}\n'.format(df.shape[0]))

df = df.sample(frac=1)

Number of training sentences: 134,198



In [29]:
sentences = 'Statement: ' + df['statement'] + '| Tweet: ' +df['tweet']
labels = df["BinaryNumTarget"].values

## Inference

In [80]:
for i in range(1, 5):
    encoded_sentence_dict = encode(sentences[i])
    encoded_input_id = [encoded_sentence_dict['input_ids']]
    encoded_attention_mask = [encoded_sentence_dict['attention_mask']]
    concat_encoded_input_id = torch.cat(encoded_input_id, dim=0)
    concat_attention_mask = torch.cat(encoded_attention_mask, dim=0)
    dataset = TensorDataset(concat_encoded_input_id, concat_attention_mask)
    input_id = torch.unsqueeze(dataset[0][0], dim=0)
    input_mask = torch.unsqueeze(dataset[0][1], dim=0)
    print (input_id.shape)
    print (input_mask.shape)
    with torch.no_grad(): 
        output = model(
                input_id,
                token_type_ids=None, 
                attention_mask=input_mask, return_dict=True)
    print (output.logits, labels[i])

torch.Size([1, 410])
torch.Size([1, 410])
tensor([[-8.3545,  8.3569]]) tensor(0., dtype=torch.float64)
torch.Size([1, 410])
torch.Size([1, 410])
tensor([[-8.3460,  8.3472]]) tensor(1., dtype=torch.float64)
torch.Size([1, 410])
torch.Size([1, 410])
tensor([[-8.3609,  8.3624]]) tensor(0., dtype=torch.float64)
torch.Size([1, 410])
torch.Size([1, 410])
tensor([[-8.3542,  8.3559]]) tensor(0., dtype=torch.float64)


# Inferencing as a dataloader

In [54]:
# Tokenize all of the sentences and map the tokens to thier word IDs.
input_ids = []
attention_masks = []
MAX_SENTENCE_LENGTH = 410

# For every sentence...
for sent in tqdm(sentences[:1000]):
    # `encode_plus` will:
    #   (1) Tokenize the sentence.
    #   (2) Prepend the `[CLS]` token to the start.
    #   (3) Append the `[SEP]` token to the end.
    #   (4) Map tokens to their IDs.
    #   (5) Pad or truncate the sentence to `max_length`
    #   (6) Create attention masks for [PAD] tokens.
    encoded_dict = tokenizer.encode_plus(
                        sent,                      # Sentence to encode.
                        add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                        max_length = MAX_SENTENCE_LENGTH,           # Pad & truncate all sentences.
                        pad_to_max_length = True,
                        return_attention_mask = True,   # Construct attn. masks.
                        return_tensors = 'pt',     # Return pytorch tensors.
                   )
    
    # Add the encoded sentence to the list.    
    input_ids.append(encoded_dict['input_ids'])
    
    # And its attention mask (simply differentiates padding from non-padding).
    attention_masks.append(encoded_dict['attention_mask'])

100%|██████████████████████████████| 1000/1000 [00:01<00:00, 566.24it/s]


In [55]:
import torch
# Convert the lists into tensors.
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(labels)

  labels = torch.tensor(labels)


In [56]:
index = 15
# Print sentence 0, now as a list of IDs.
print('Original: ', sentences[index])
print('Token IDs:', input_ids[index])
print ('Labels:', labels[index])

Original:  Statement: End of eviction moratorium means millions of Americans could lose their housing in the middle of a pandemic.| Tweet: @Riguy_453 @PaulSorrentino3 @POTUS Trump didn't attempt to mandate mask!! Biden wanted to more than the vaccine but you can't get Republican AMEricans to comply 
It was the Supreme Court that knocked the eviction moratorium on the head. The Amy Coney Barratts. Another Trump dirty trick.
Token IDs: tensor([  101,  4861,  1024,  1000,  2017,  2113,  2054,  9733,  3825,  1999,
         2976,  3318,  7773,  2197,  2095,  1029,  5717,  1012,  1000,  1064,
         1056, 28394,  2102,  1024,  2049, 16498,  1010,  2073,  2017,  2123,
         2102,  2707,  2012,  5717,  1012,  1996,  2769,  3310,  2013,  3316,
         2066,  9733,  2040,  3825,  5717,  6363,  1999,  2976,  3318,  7773,
         2197,  2095,  1012,  2049,  2256,  2769,  2000,  4088,  2007,  1012,
          102,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,  

In [87]:
from torch.utils.data import TensorDataset, random_split

# Combine the training inputs into a TensorDataset.
dataset = TensorDataset(input_ids, attention_masks, labels[:1000])

In [88]:
dataset[0][1]

tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [89]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

# The DataLoader needs to know our batch size for training, so we specify it 
# here. For fine-tuning BERT on a specific task, the authors recommend a batch 
# size of 16 or 32.
batch_size = 2


# For validation the order doesn't matter, so we'll just read them sequentially.
validation_dataloader = DataLoader(
            dataset, # The validation samples.
            sampler = RandomSampler(dataset), # Pull out batches sequentially.
            batch_size = batch_size # Evaluate with this batch size.,
        )

In [95]:
output = model(dataset[3:4][0], token_type_ids=None, attention_mask=dataset[3:4][1],labels=None)
loss = output.loss
logits = output.logits
print (logits, dataset[3:4][2])

tensor([[ 8.6795, -8.6905]], grad_fn=<AddmmBackward0>) tensor([0.], dtype=torch.float64)


In [82]:

'''
for INDEX in range(1,20,1):
    with torch.no_grad():
        #output = model(dataset[INDEX:INDEX + 2][1], token_type_ids=None, attention_mask=dataset[INDEX:INDEX + 2][1],labels=None)
        b_input_ids = dataset[INDEX:INDEX + 1][0]
        attention_mask = dataset[INDEX:INDEX + 1][1]
        with torch.no_grad():        

            # Forward pass, calculate logit predictions.
            # token_type_ids is the same as the "segment ids", which 
            # differentiates sentence 1 and 2 in 2-sentence tasks.
            # The documentation for this `model` function is here: 
            # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
            # Get the "logits" output by the model. The "logits" are the output
            # values prior to applying an activation function like the softmax.
            
            output = model(b_input_ids, 
                                   token_type_ids=None, 
                                   attention_mask=b_input_mask,
                                   labels=None)
            loss = output.loss
            logits = output.logits
        print (logits, dataset[INDEX: INDEX+ 1][2])
'''

IndexError: tuple index out of range

In [76]:
for step, batch in enumerate(validation_dataloader):
    if step > 5:
        break
    b_input_ids = batch[0].to(DEVICE)
    b_input_mask = batch[1].to(DEVICE)
    b_labels = batch[2].to(torch.int64).to(DEVICE)
    b_labels_one_hot = torch.nn.functional.one_hot(b_labels, num_classes=2).float()
    
    # Tell pytorch not to bother with constructing the compute graph during
    # the forward pass, since this is only needed for backprop (training).
    with torch.no_grad():        

        # Forward pass, calculate logit predictions.
        # token_type_ids is the same as the "segment ids", which 
        # differentiates sentence 1 and 2 in 2-sentence tasks.
        # The documentation for this `model` function is here: 
        # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
        # Get the "logits" output by the model. The "logits" are the output
        # values prior to applying an activation function like the softmax.
        
        output = model(b_input_ids, 
                               token_type_ids=None, 
                               attention_mask=b_input_mask,
                               labels=None)
        loss = output.loss
        logits = output.logits
        print (logits, b_labels)

tensor([[-8.5272,  8.5553],
        [ 8.6726, -8.6745]]) tensor([1, 0])
tensor([[-8.5503,  8.5755],
        [-8.5136,  8.5264]]) tensor([1, 1])
tensor([[ 8.6494, -8.6570],
        [-8.5335,  8.5487]]) tensor([0, 1])
tensor([[ 8.6742, -8.6542],
        [ 8.6651, -8.6456]]) tensor([0, 0])
tensor([[ 8.6865, -8.6853],
        [-8.5263,  8.5356]]) tensor([0, 1])
tensor([[-8.4601,  8.4752],
        [ 8.6435, -8.6325]]) tensor([1, 0])


In [53]:
len(dataset)

1