### Import and initialize

In [3]:
from transformers import BertTokenizer, BertForSequenceClassification, BertConfig
import torch
from tqdm import tqdm
from torch.utils.data import TensorDataset, random_split
DEVICE = torch.device('cpu')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
CHECKPOINT_PATH = '/home/student/workspace/Truthseeker/Save_dir/bert'

## Loading the model

In [4]:
model = BertForSequenceClassification.from_pretrained(
    CHECKPOINT_PATH, # Use the 12-layer BERT model, with an uncased vocab.
    num_labels = 2, # The number of output labels--2 for binary classification.
                    # You can increase this for multi-class tasks.   
    output_attentions = False, # Whether the model returns attentions weights.
    output_hidden_states = False, # Whether the model returns all hidden-states.
)

#Loading from statedict
#model.load_state_dict(torch.load('final.ckpt'))

# Tell pytorch to run this model on the GPU.
model.to(DEVICE)
model = model.eval()

## Loading the dataset

In [6]:
import pandas as pd
DATASET_PATH = "/home/student/workspace/Truthseeker/dataset/TruthSeeker2023/Truth_Seeker_Model_Dataset.csv"
df = pd.read_csv(DATASET_PATH)

print('Number of training sentences: {:,}\n'.format(df.shape[0]))
#Shuffling the dataset
df = df.sample(frac=1)

Number of training sentences: 134,198



In [7]:
sentences = 'Statement: ' + df['statement'] + '| Tweet: ' +df['tweet']
labels = df["BinaryNumTarget"].values

## Tokenize the dataset

In [8]:
# Tokenize all of the sentences and map the tokens to thier word IDs.
input_ids = []
attention_masks = []
MAX_SENTENCE_LENGTH = 410

# For every sentence...
for i, sent in tqdm(enumerate(sentences[:1000])):
    if i > 300 and i < 310:
        print (sent, labels[i])
    encoded_dict = tokenizer.encode_plus(
                        sent,                     # Sentence to encode.
                        add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                        max_length = MAX_SENTENCE_LENGTH,           # Pad & truncate all sentences.
                        pad_to_max_length = True,
                        return_attention_mask = True,   # Construct attn. masks.
                        return_tensors = 'pt',     # Return pytorch tensors.
                   )
    
    # Add the encoded sentence to the list.    
    input_ids.append(encoded_dict['input_ids'])
    
    # And its attention mask (simply differentiates padding from non-padding).
    attention_masks.append(encoded_dict['attention_mask'])

0it [00:00, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
444it [00:00, 584.18it/s]

Statement: Says Warren Buffett has publicly said his secretary "should not be paying a higher tax rate" than him.| Tweet: @man_integrated It is effectively progressive, and no one will be able to say "Warren Buffet pays a lower tax rate than his secretary" 1.0
Statement: Says the 1986 Vaccine protection act resulted in the biggest jump in shots for children because it gave drug manufacturers exemption for responsibility of killing people.| Tweet: @CNN Next up has to be the - 

National Childhood Vaccine Injury Act of 1986 wrongly provides immunity to #bigpharma. 

What other industry/ company is free of liability  Name ONE! 

Thousands of children &amp; adults get injured every year &amp; we cant hold these companies accountable 0.0
Statement: "DMX received COVID vaccine days before heart attack."| Tweet: Dmx had a heart attack 2 days after taking the vaccine  family saying it had nothing to do with drugs . 0.0
Statement: "On Oct. 7, the Access Hollywood tape comes out. One hour later,

1000it [00:01, 568.24it/s]


In [9]:
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(labels)
# Combine the training inputs into a TensorDataset.
dataset = TensorDataset(input_ids, attention_masks, labels[:1000])

## Inference any single data point

In [11]:
INDEX = 236
with torch.no_grad():
    #output = model(dataset[INDEX:INDEX + 2][1], token_type_ids=None, attention_mask=dataset[INDEX:INDEX + 2][1],labels=None)
    b_input_ids = dataset[INDEX:INDEX + 1][0]
    attention_mask = dataset[INDEX:INDEX + 1][1]
    with torch.no_grad():        

        # Forward pass, calculate logit predictions.
        # token_type_ids is the same as the "segment ids", which 
        # differentiates sentence 1 and 2 in 2-sentence tasks.
        # The documentation for this `model` function is here: 
        # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
        # Get the "logits" output by the model. The "logits" are the output
        # values prior to applying an activation function like the softmax.
        print (f"{b_input_ids.shape} {attention_mask.shape=}")
        output = model(b_input_ids, 
                               token_type_ids=None, 
                               attention_mask=attention_mask,
                               labels=None)
        loss = output.loss
        logits = output.logits
    print ("Prediction", torch.argmax(logits), "| Label:", dataset[INDEX: INDEX+ 1][2])

torch.Size([1, 410]) attention_mask.shape=torch.Size([1, 410])
Prediction tensor(0) | Label: tensor([1.], dtype=torch.float64)


## Inference without the tensor dataset

In [55]:
inference_index = 150
input_id_tensor = input_ids[inference_index]
attention_mask_input_tensor = attention_masks[inference_index]
output_label = labels[inference_index]

In [56]:
input_id_tensor.shape

torch.Size([410])

In [57]:
input_id_tensor_expanded = input_id_tensor.unsqueeze(dim=0)
attention_mask_input_tensor_expanded = attention_mask_input_tensor.unsqueeze(dim=0)

In [58]:
input_id_tensor_expanded.shape, attention_mask_input_tensor_expanded.shape

(torch.Size([1, 410]), torch.Size([1, 410]))

In [59]:
output = model(input_id_tensor_expanded, token_type_ids=None,attention_mask=attention_mask_input_tensor_expanded,labels=None)

In [60]:
print (output.logits.argmax(), output_label)

tensor(1) tensor(1., dtype=torch.float64)


## Inference on a new data point

In [115]:
MAX_SENTENCE_LENGTH = 410
test_sentence = 'Statement: "The majority of Austinites rent" the places they live.| Tweet: .@bcabsalom No, Im not dismissing renters. Austinites that rent and rely on public transit will benefit greatly from rail along Riverside.'

In [116]:
#Encoding the sentence

In [117]:
encoded_dict_for_new_data_point = tokenizer.encode_plus(
                        test_sentence,                      # Sentence to encode.
                        add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                        max_length = MAX_SENTENCE_LENGTH,           # Pad & truncate all sentences.
                        pad_to_max_length = True,
                        return_attention_mask = True,   # Construct attn. masks.
                        return_tensors = 'pt',     # Return pytorch tensors.
                   )
    
# Add the encoded sentence to the list.    
new_data_point_input_id = encoded_dict_for_new_data_point['input_ids']
    
# And its attention mask (simply differentiates padding from non-padding).
new_data_point_attention_mask = encoded_dict_for_new_data_point['attention_mask']

In [118]:
output = model(new_data_point_input_id, token_type_ids=None, attention_mask=new_data_point_attention_mask,labels=None)
print (output.logits.argmax())

tensor(1)
