### Handle Imports

In [15]:
import torch
from tqdm.auto import tqdm
import pandas as pd
import numpy as np
import pprint

### Check if PyTorch recognizes GPU

In [16]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


### Read in dataset

In [17]:
import datasets
import os

columns = [
    'id', 'label', 'claim', 'subject', 'speaker', 'speaker_job_title', 'state_info',
    'party_affiliation', 'barely_true_counts', 'false_counts',
    'half_true_counts', 'mostly_true_counts', 'pants_on_fire_counts', 'context'
]

# read in original LIAR dataset
df_train = pd.read_csv('../data/LIAR/train.tsv', sep='\t', names=columns).dropna()
df_valid = pd.read_csv('../data/LIAR/valid.tsv', sep='\t', names=columns).dropna()
df_test = pd.read_csv('../data/LIAR/test.tsv', sep='\t', names=columns).dropna()

### Tokenize Input Data

In [18]:
def tokenize_liar(samples, labels, tokenizer):
  tokenized = []
  for idx in range(len(samples)):
    tokenized_claim = tokenizer(samples[idx], return_tensors='pt')
    
    n_inst = {
      'claim_token': tokenized_claim,
      'claim_origin': samples[idx],
      'label': labels[idx], 
      'idx': idx
    }
    tokenized.append(n_inst)

  return tokenized

In [19]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')

#LIAR Code:
# obtain training samples/label pairs from dataset
liar_train_samples = np.array(df_train['claim'])
liar_train_labels = np.array(df_train['label'])
liar_test_samples = np.array(df_test['claim'])
liar_test_labels = np.array(df_test['label'])

tokenized_train_dataset = tokenize_liar(liar_train_samples, liar_train_labels, tokenizer)
tokenized_test_dataset = tokenize_liar(liar_test_samples, liar_test_labels, tokenizer)
pprint.pprint(tokenized_train_dataset[0])

{'claim_origin': 'Says the Annies List political group supports '
                 'third-trimester abortions on demand.',
 'claim_token': {'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]),
                 'input_ids': tensor([[  101,  8652,  1116,  1103,  7765,  1116,  5619,  1741,  1372,  6253,
          1503,   118, 13373, 12831, 12030,  1116,  1113,  4555,   119,   102]]),
                 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])},
 'idx': 0,
 'label': 'false'}


### Define PyTorch Datasets (Augmented using SMOTE & Unaugmented)

In [20]:
from torch.utils.data import Dataset
from torch.nn.utils.rnn import pad_sequence
from imblearn.over_sampling import SMOTE

class LiarDataset(Dataset):
    def __init__(self, liar_data):
        self.labels = []
        self.data = []
    
        self.label_map = {
            'pants-fire': 0,
            'false': 0,
            'barely-true': 0,
            'half-true': 0,
            'mostly-true': 1,
            'true': 1
        }

        for idx in range(len(liar_data)):
            self.data.append(liar_data[idx]['claim_token'])
            self.labels.append(self.label_map[liar_data[idx]['label']])

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        claim = {key: torch.tensor(value, dtype=torch.long) for key, value in self.data[idx].items()}
        label = torch.tensor(self.labels[idx], dtype=torch.long)

        return claim, label

### Segment Dataset into training and test portions

In [21]:
from torch.utils.data import random_split
from sklearn.model_selection import train_test_split

X_train = [inst['claim_token'] for inst in tokenized_train_dataset]
y_train = [inst['label'] for inst in tokenized_train_dataset]
X_test = [inst['claim_token'] for inst in tokenized_test_dataset]
y_test = [inst['label'] for inst in tokenized_test_dataset]

# reconstruct dictionaries using training/test sets
train_set = []
for claim_token, label in zip(X_train, y_train):
    train_inst = {
        'claim_token': claim_token,  
        'label': label
    }
    train_set.append(train_inst)

test_set = []
for claim_token, label in zip(X_test, y_test):
    test_inst = {
        'claim_token': claim_token,
        'label': label
    }
    test_set.append(test_inst)

train_dataset = LiarDataset(train_set)
test_dataset = LiarDataset(test_set)

### Define model, dataloaders, loss function, collate function, and optimizer

In [22]:
# need to create collate function to pad variable length sequences for input
from torch.nn.utils.rnn import pad_sequence

def collate_fn(batch):
    # as per cell 6 output, item[0] will look like this:
    # 'tweet_token': {'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1]]),
    #                   'input_ids': tensor([[  101, 21887, 23350,  2003, 19345, 13685,  1012,   102]]),
    #                   'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0]])}}
    # item[1] will be a numeric label according to MisinformationDataset's label_map
    input_ids = [item[0]['input_ids'].squeeze(0) for item in batch]
    attention_masks = [item[0]['attention_mask'].squeeze(0) for item in batch]
    labels = [item[1] for item in batch]

    # pad sequences for input_ids and attention_masks with 0 values
    input_ids = pad_sequence(input_ids, batch_first=True, padding_value=0)
    attention_masks = pad_sequence(attention_masks, batch_first=True, padding_value=0)
    labels = torch.tensor(labels, dtype=torch.long)

    return {
        'input_ids': input_ids,
        'attention_mask': attention_masks,
    }, labels

In [23]:
from transformers import AutoModelForSequenceClassification
from torch.utils.data import DataLoader
from torch.nn import CrossEntropyLoss
from collections import Counter

# change num_labels in accordance with current problem design (binary or multi-class classification)
model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
model = model.to(device)

# freeze base model layers
for param in model.base_model.parameters():
    param.requires_grad = False

# unfreeze last two layers of base model for fine tuning
#for param in model.base_model.encoder.layer[-2:]:
#    param.requires_grad = True

# attempt to use class weights to offset imbalance of dataset
label_counts = Counter(liar_train_labels)
label_map = {
            'pants-fire': 0,
            'false': 0,
            'barely-true': 0,
            'half-true': 0,
            'mostly-true': 1,
            'true': 1
        }
numeric_labels = np.array([label_map[inst['label']] for inst in tokenized_train_dataset])
true_count = np.count_nonzero(numeric_labels == 1)
false_count = np.count_nonzero(numeric_labels == 0)
total_count = len(train_dataset)
true_weight = total_count / true_count
false_weight = total_count / false_count
print(f'Weights: \nFalse: {false_weight}\nTrue: {true_weight}')

class_weights = torch.tensor([false_weight, true_weight]).to(device)

loss_fn = CrossEntropyLoss(weight=class_weights)

optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)
#dev_dataloader = DataLoader(dev_set, batch_size=16, shuffle=True, collate_fn=collate_fn)
test_dataloader = DataLoader(test_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Weights: 
False: 1.5983353151010702
True: 2.6713036565977744


### Define training loop

In [24]:
def train_covid(model, optim, loss_fn, dataloader, epochs):
#def train_covid(model, optim, dataloader, epochs):
  for epoch in range(epochs):
      model.train()
      total_loss = 0

      for batch_idx, batch in enumerate(dataloader):
          optim.zero_grad()
          
          # unpack batch of form (tweets, labels)
          claims, labels = batch
          # send tweets dict's values to device
          claims = {key: value.to(device) for key, value in claims.items()}
          labels = labels.to(device)
          
          # forward pass on BERT
          outputs = model(**claims, labels=labels)
          logits = outputs.logits
          
          # class weighted CrossEntropyLoss
          loss = loss_fn(logits, labels)
          
          # loss provided by model
          #loss = outputs.loss 

          # backwards pass on BERT
          loss.backward()
          optim.step()

          total_loss += loss.item()

          print(f"Epoch {epoch + 1}, Batch {batch_idx + 1}/{len(dataloader)}, Loss: {loss.item()}")

      print(f"Epoch {epoch + 1}, Loss: {total_loss}")

In [25]:
print(model.config.num_labels)

2


### Train model for sequence classification

In [26]:
epochs = 9
train_covid(model, optimizer, loss_fn, train_dataloader, epochs)
#train_covid(model, optimizer, train_dataloader, epochs)

Epoch 1, Batch 1/211, Loss: 0.7164663076400757


  claim = {key: torch.tensor(value, dtype=torch.long) for key, value in self.data[idx].items()}


Epoch 1, Batch 2/211, Loss: 0.748382031917572
Epoch 1, Batch 3/211, Loss: 0.6427400708198547
Epoch 1, Batch 4/211, Loss: 0.7271836996078491
Epoch 1, Batch 5/211, Loss: 0.7478856444358826
Epoch 1, Batch 6/211, Loss: 0.6884716749191284
Epoch 1, Batch 7/211, Loss: 0.7412406802177429
Epoch 1, Batch 8/211, Loss: 0.6987291574478149
Epoch 1, Batch 9/211, Loss: 0.6998040080070496
Epoch 1, Batch 10/211, Loss: 0.7079986929893494
Epoch 1, Batch 11/211, Loss: 0.6642714142799377
Epoch 1, Batch 12/211, Loss: 0.7646110653877258
Epoch 1, Batch 13/211, Loss: 0.6738997101783752
Epoch 1, Batch 14/211, Loss: 0.7221581339836121
Epoch 1, Batch 15/211, Loss: 0.7242905497550964
Epoch 1, Batch 16/211, Loss: 0.6721864342689514
Epoch 1, Batch 17/211, Loss: 0.7252640128135681
Epoch 1, Batch 18/211, Loss: 0.6813012957572937
Epoch 1, Batch 19/211, Loss: 0.7155381441116333
Epoch 1, Batch 20/211, Loss: 0.69950270652771
Epoch 1, Batch 21/211, Loss: 0.6960915923118591
Epoch 1, Batch 22/211, Loss: 0.7300071716308594
Epo

### Evaluate Model

In [27]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def evaluate_model(model, dataloader):
    model.eval()
    all_preds = []
    all_labels = []
    
    with torch.no_grad():
        for batch_idx, batch in enumerate(dataloader):
            tweets, labels = batch
            tweets = {key: value.to(device) for key, value in tweets.items()}
            labels = labels.to(device)
            
            # run sequences through BERT
            outputs = model(**tweets, labels=labels)
            
            # highest energy class is our prediction
            logits = outputs.logits
            preds = torch.argmax(logits, 1)
            
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
    
    class_labels = [0, 1]
    #class_labels = [0, 1, 2, 3, 4, 5]
    per_class_accuracy = {}
    for class_label in class_labels:
        # get indices which match current class_label
        class_indices = np.where(np.array(all_labels) == class_label)[0]
        
        # get predictions of current class label
        class_preds = np.array(all_preds)[class_indices]
        
        # calculate accuracy for current class_label
        correct_class_preds = np.sum(class_preds == class_label)
        total_class_samples = len(class_indices)
        
        per_class_accuracy[class_label] = (correct_class_preds / total_class_samples) * 100
        
    accuracy = 100*accuracy_score(all_labels, all_preds)
    precision = 100*precision_score(all_labels, all_preds, labels=class_labels, average=None, zero_division=0)
    recall = 100*recall_score(all_labels, all_preds, labels=class_labels, average=None, zero_division=0)
    f1 = f1_score(all_labels, all_preds, labels=class_labels, average=None, zero_division=0)

    return accuracy, precision, recall, f1, per_class_accuracy


# evaluate the model on the test set (unaugmented)
accuracy, precision, recall, f1, per_class_accuracy = evaluate_model(model, test_dataloader)

print(f"Test Accuracy: {accuracy:.2f}%")
print(f"Per Class Accuracy: {per_class_accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1: {f1}")

  claim = {key: torch.tensor(value, dtype=torch.long) for key, value in self.data[idx].items()}


Test Accuracy: 48.30%
Per Class Accuracy: {0: 32.95668549905838, 1: 73.6024844720497}
Precision: [67.30769231 39.96627319]
Recall: [32.9566855  73.60248447]
F1: [0.44247788 0.51803279]


### Save Model Weights

In [28]:
torch.save(model.state_dict(), f'./models/model_weights15-{accuracy:.1f}.pth')