In [34]:
from transformers import BertTokenizer, BertForSequenceClassification, DistilBertForSequenceClassification, BertConfig
import torch
from tqdm import tqdm
from torch.utils.data import TensorDataset, random_split
device = torch.device('cuda')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
CHECKPOINT_PATH = '/home/student/workspace/Truthseeker/Save_dir/distilbert'

In [35]:
model = DistilBertForSequenceClassification.from_pretrained(
    CHECKPOINT_PATH, # Use the 12-layer BERT model, with an uncased vocab.
    num_labels = 2, # The number of output labels--2 for binary classification.
                    # You can increase this for multi-class tasks.   
    output_attentions = False, # Whether the model returns attentions weights.
    output_hidden_states = False, # Whether the model returns all hidden-states.
)

#Loading from statedict
#model.load_state_dict(torch.load('final.ckpt'))

# Tell pytorch to run this model on the GPU.
model.to(DEVICE)
model = model.eval()

In [36]:
import pandas as pd
DATASET_PATH = "/home/student/workspace/Truthseeker/dataset/TruthSeeker2023/Truth_Seeker_Model_Dataset.csv"
df = pd.read_csv(DATASET_PATH)
gt_df = pd.read_csv('dataset/TruthSeeker2023/Truthfulness.csv')
test_indices = pd.read_csv('dataset/TruthSeeker2023/test_indices.csv')

#Concatenation and filtering
df = pd.concat([df, gt_df], axis=1)
df = df[~df['5_label_majority_answer'].isin(['NO MAJORITY'])]
# print('Number of training sentences: {:,}\n'.format(df.shape[0]))
df = df.iloc[test_indices['index']]

In [37]:
sentences = 'Statement: ' + df['statement'] + '| Tweet: ' + df['tweet']
labels = df["2-way-label"].values
indices = df["Unnamed: 0"].values

In [38]:
indices

array([  1409, 118289,  31886, ...,   9813,  51282,  65465])

In [39]:
# Tokenize all of the sentences and map the tokens to thier word IDs.
input_ids = []
attention_masks = []
MAX_SENTENCE_LENGTH = 410

# For every sentence...
for i, sent in tqdm(enumerate(sentences[:1000])):
    if i > 300 and i < 310:
        print (sent, labels[i])
    encoded_dict = tokenizer.encode_plus(
                        sent,                     # Sentence to encode.
                        add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                        max_length = MAX_SENTENCE_LENGTH,           # Pad & truncate all sentences.
                        pad_to_max_length = True,
                        return_attention_mask = True,   # Construct attn. masks.
                        return_tensors = 'pt',     # Return pytorch tensors.
                   )
    
    # Add the encoded sentence to the list.    
    input_ids.append(encoded_dict['input_ids'])
    
    # And its attention mask (simply differentiates padding from non-padding).
    attention_masks.append(encoded_dict['attention_mask'])

0it [00:00, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
394it [00:00, 537.13it/s]

Statement: Hank Aarons death was caused by a COVID-19 vaccine.| Tweet: @bretts_daddy @IngrahamAngle You DO know its still in the experimental stage and people getting it have a 4.5% death rate where as you have a .07% death rate if you actually get covid.  No thanks. Ill go ahead and let all of you get the vaccine.   Ill take my chances.  Look at Hank Aaron. False
Statement: Melania dug up the WH Rose Garden, removing roses from every First Lady since 1913.| Tweet: @jodaniecarpino @Leslieoo7 @holmes_62 It was destroyed by Jackie Kennedy.  In 1902 Edith Roosevelt designated the area as a colonial garden. In 1913 Ellen Wilson named the  area the Rose Garden and made changes.  In 1962 Jackie Kennedy did her thing.  Now 58 years later Melania Trump did her thing.  Stop the hate. False
Statement: Photo shows Joe Biden doesnt wear a mask on a plane.| Tweet: @RichardGrenell @JoeBiden Joe Biden doesnt wear a mask on a plane in 2019 - but wears one OUTSIDE #FactsMatter False
Statement: "For dec

1000it [00:01, 546.20it/s]


In [40]:
input_ids = torch.cat(input_ids, dim=0).to(device)
attention_masks = torch.cat(attention_masks, dim=0).to(device)
labels = torch.tensor(labels).to(device)
indices = torch.tensor(indices).to(device)

# Combine the training inputs into a TensorDataset.
dataset = TensorDataset(input_ids, attention_masks, labels[:1000], indices[:1000])

In [45]:
from tqdm  import tqdm
results = []
for i in tqdm(range(len(dataset))):
    INDEX = i
    with torch.no_grad():
        #output = model(dataset[INDEX:INDEX + 2][1], token_type_ids=None, attention_mask=dataset[INDEX:INDEX + 2][1],labels=None)
        b_input_ids = dataset[INDEX:INDEX + 1][0]
        attention_mask = dataset[INDEX:INDEX + 1][1]
        with torch.no_grad():        
    
            # Forward pass, calculate logit predictions.
            # token_type_ids is the same as the "segment ids", which 
            # differentiates sentence 1 and 2 in 2-sentence tasks.
            # The documentation for this `model` function is here: 
            # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
            # Get the "logits" output by the model. The "logits" are the output
            # values prior to applying an activation function like the softmax.
            #print (f"{b_input_ids.shape} {attention_mask.shape=}")
            output = model(b_input_ids, 
                                   #token_type_ids=None, 
                                   attention_mask=attention_mask,
                                   labels=None)
            loss = output.loss
            logits = output.logits
        #print ("INDEX " + str(INDEX) + " Prediction", torch.argmax(logits), "| Label:", dataset[INDEX: INDEX+ 1][2])
        results.append((dataset[INDEX: INDEX+ 1][3].item(), 1 - torch.argmax(logits).item(), dataset[INDEX: INDEX+ 1][2].item()))

100%|█████████████████████████████████████████████████████████████| 1000/1000 [00:36<00:00, 27.29it/s]


In [46]:
results

[(1409, 1, True),
 (118289, 0, False),
 (31886, 1, True),
 (127422, 0, False),
 (108102, 0, False),
 (36487, 1, True),
 (105407, 0, False),
 (56423, 0, False),
 (101862, 0, False),
 (116874, 0, False),
 (6699, 1, True),
 (110973, 0, True),
 (53068, 0, False),
 (128912, 0, False),
 (5266, 1, True),
 (65328, 1, True),
 (16650, 1, True),
 (13449, 1, True),
 (43821, 1, True),
 (113423, 0, False),
 (119518, 0, True),
 (98313, 0, False),
 (6997, 1, True),
 (51091, 0, False),
 (29458, 1, True),
 (33163, 1, True),
 (42176, 1, True),
 (78491, 0, False),
 (11493, 1, True),
 (42426, 1, True),
 (85005, 0, False),
 (128843, 0, False),
 (128042, 0, False),
 (30887, 1, True),
 (74628, 1, True),
 (123818, 0, False),
 (46771, 1, True),
 (131200, 0, True),
 (63464, 1, True),
 (55454, 0, False),
 (107232, 0, False),
 (60915, 1, True),
 (97032, 0, False),
 (8696, 1, True),
 (128071, 0, False),
 (84582, 0, False),
 (22780, 1, True),
 (109073, 0, False),
 (84878, 0, False),
 (127555, 0, False),
 (130875, 0,