### Handle Imports

In [1]:
import torch
from tqdm.auto import tqdm
import pandas as pd
import numpy as np
import pprint

  from .autonotebook import tqdm as notebook_tqdm


### Check if PyTorch recognizes GPU

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


### Read in dataset

In [3]:
df = pd.read_csv('./data/covid_lies.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6591 entries, 0 to 6590
Data columns (total 4 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   misconception_id  6591 non-null   int64 
 1   misconception     6591 non-null   object
 2   tweet_id          6591 non-null   int64 
 3   label             6591 non-null   object
dtypes: int64(2), object(2)
memory usage: 206.1+ KB


### Tokenize Input Data

In [4]:
def tokenize_covid(samples, labels, tokenizer):
  tokenized = []
  for idx in range(len(samples)):
    tokenized_tweet = tokenizer(samples[idx], return_tensors='pt')
    
    n_inst = {
      'tweet_token': tokenized_tweet,
      'tweet_origin': samples[idx],
      'label': labels[idx], 
      'idx': idx
    }
    tokenized.append(n_inst)

  return tokenized

In [5]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("digitalepidemiologylab/covid-twitter-bert")

# obtain samples/label pairs from dataset
covid_samples = np.array(df['misconception'])
covid_labels = np.array(df['label'])

tokenized_dataset = tokenize_covid(covid_samples, covid_labels, tokenizer)
pprint.pprint(tokenized_dataset[0])

{'idx': 0,
 'label': 'na',
 'tweet_origin': 'Coronavirus is genetically engineered.',
 'tweet_token': {'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1]]),
                 'input_ids': tensor([[  101, 21887, 23350,  2003, 19345, 13685,  1012,   102]]),
                 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0]])}}


### Define PyTorch Datasets (Augmented using SMOTE & Unaugmented)

In [6]:
from torch.utils.data import Dataset
from torch.nn.utils.rnn import pad_sequence
from imblearn.over_sampling import SMOTE

class MisinformationAugmentedDataset(Dataset):
    def __init__(self, covid_data):
        self.labels = []
        self.data = []
    
        self.label_map = {
            "pos": 0,
            "neg": 1,
            "na": 2,
        }

        # perform upsampling of underrepresented data in our imbalanced dataset
        input_ids = [inst['tweet_token']['input_ids'].squeeze(0) for inst in covid_data]
        attention_masks = [inst['tweet_token']['attention_mask'].squeeze(0) for inst in covid_data]
        token_type_ids = [inst['tweet_token']['token_type_ids'].squeeze(0) for inst in covid_data]
        numeric_labels = [self.label_map[inst['label']] for inst in covid_data]
        
        padded_input_ids = pad_sequence(input_ids, batch_first=True, padding_value=0).numpy()
        padded_attention_masks = pad_sequence(attention_masks, batch_first=True, padding_value=0).numpy()
        padded_token_type_ids = pad_sequence(token_type_ids, batch_first=True, padding_value=0).numpy()

        smote = SMOTE(random_state=42)
        X_upsampled, y_upsampled = smote.fit_resample(padded_input_ids, numeric_labels)

        # rebuild dictionaries
        for idx in range(len(X_upsampled)):
            input_ids_tensor = torch.tensor(X_upsampled[idx], dtype=torch.long)
            attention_mask_tensor = torch.tensor(
                padded_attention_masks[idx % len(padded_attention_masks)], dtype=torch.long
            )
            token_type_ids_tensor = torch.tensor(
                padded_token_type_ids[idx % len(padded_token_type_ids)], dtype=torch.long
            )
            tweet_token = {
                "input_ids": input_ids_tensor,
                "attention_mask": attention_mask_tensor,
                "token_type_ids": token_type_ids_tensor,
            }

            self.data.append(tweet_token)
            self.labels.append(torch.tensor(y_upsampled[idx], dtype=torch.long))

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        tweet = self.data[idx]
        label = self.labels[idx]

        # adjust code according to received UserWarning:
        # UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() 
        # or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).
        tweet = {key: value.clone().detach() for key, value in tweet.items()}
        label = torch.tensor(label, dtype=torch.long)

        return tweet, label
    
class MisinformationDataset(Dataset):
    def __init__(self, covid_data):
        self.labels = []
        self.data = []
    
        self.label_map = {
            "pos": 0,
            "neg": 1,
            "na": 2,
        }

        # perform upsampling of underrepresented data in our imbalanced dataset
        input_ids = [inst['tweet_token']['input_ids'].squeeze(0) for inst in covid_data]
        attention_masks = [inst['tweet_token']['attention_mask'].squeeze(0) for inst in covid_data]
        token_type_ids = [inst['tweet_token']['token_type_ids'].squeeze(0) for inst in covid_data]
        numeric_labels = [self.label_map[inst['label']] for inst in covid_data]
        
        padded_input_ids = pad_sequence(input_ids, batch_first=True, padding_value=0).numpy()
        padded_attention_masks = pad_sequence(attention_masks, batch_first=True, padding_value=0).numpy()
        padded_token_type_ids = pad_sequence(token_type_ids, batch_first=True, padding_value=0).numpy()

        # rebuild dictionaries
        for idx in range(len(covid_data)):
            input_ids_tensor = torch.tensor(padded_input_ids[idx], dtype=torch.long)
            attention_mask_tensor = torch.tensor(padded_attention_masks[idx], dtype=torch.long)
            token_type_ids_tensor = torch.tensor(padded_token_type_ids[idx], dtype=torch.long)
            tweet_token = {
                "input_ids": input_ids_tensor,
                "attention_mask": attention_mask_tensor,
                "token_type_ids": token_type_ids_tensor,
            }

            self.data.append(tweet_token)
            self.labels.append(torch.tensor(numeric_labels[idx], dtype=torch.long))

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        # Retrieve the tokenized tweet and label
        tweet = self.data[idx]
        label = self.labels[idx]

        return tweet, label

### Segment Dataset into training and test portions

In [7]:
from torch.utils.data import random_split
from sklearn.model_selection import train_test_split

X_vals = [inst['tweet_token'] for inst in tokenized_dataset]
y_vals = [inst['label'] for inst in tokenized_dataset]
X_train, X_test, y_train, y_test = train_test_split(X_vals, y_vals, test_size=0.2, random_state=42)

# reconstruct dictionaries using training/test sets
train_set = []
for tweet_token, label in zip(X_train, y_train):
    train_inst = {
        'tweet_token': tweet_token,  
        'label': label
    }
    train_set.append(train_inst)

test_set = []
for tweet_token, label in zip(X_test, y_test):
    test_inst = {
        'tweet_token': tweet_token,
        'label': label
    }
    test_set.append(test_inst)

train_dataset = MisinformationAugmentedDataset(train_set)
test_dataset = MisinformationDataset(test_set)

### Define model, dataloaders, loss function, collate function, and optimizer

In [8]:
# need to create collate function to pad variable length sequences for input
from torch.nn.utils.rnn import pad_sequence

def collate_fn(batch):
    # as per cell 6 output, item[0] will look like this:
    # 'tweet_token': {'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1]]),
    #                   'input_ids': tensor([[  101, 21887, 23350,  2003, 19345, 13685,  1012,   102]]),
    #                   'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0]])}}
    # item[1] will be a numeric label according to MisinformationDataset's label_map
    input_ids = [item[0]['input_ids'] for item in batch]
    attention_masks = [item[0]['attention_mask'] for item in batch]
    labels = [item[1] for item in batch]

    # pad sequences for input_ids and attention_masks with 0 values
    input_ids = pad_sequence(input_ids, batch_first=True, padding_value=0)
    attention_masks = pad_sequence(attention_masks, batch_first=True, padding_value=0)

    labels = torch.tensor(labels)

    return {
        'input_ids': input_ids,
        'attention_mask': attention_masks,
    }, labels

In [9]:
from transformers import AutoModelForSequenceClassification
from torch.utils.data import DataLoader

''' 
https://huggingface.co/digitalepidemiologylab/covid-twitter-bert-v2

CT-BERT: This model was trained on 97M unique tweets (1.2B training examples) 
collected between January 12 and July 5, 2020 containing at least one of the keywords 
"wuhan", "ncov", "coronavirus", "covid", or "sars-cov-2".  
These tweets were filtered and preprocessed to reach a final sample of 22.5M tweets 
(containing 40.7M sentences and 633M tokens) which were used for training.
'''
model = AutoModelForSequenceClassification.from_pretrained(
    "digitalepidemiologylab/covid-twitter-bert-v2", 
    num_labels=3
)
model = model.to(device)

# attempt to use class weights to offset imbalance of dataset
# pos_count = np.count_nonzero(covid_labels == 'pos')
# neg_count = np.count_nonzero(covid_labels == 'neg')
# na_count = np.count_nonzero(covid_labels == 'na')
# total_count = len(covid_labels)
# pos_weight = total_count / pos_count
# neg_weight = total_count / neg_count
# na_weight = total_count / na_count
# print(f'Weights: \nPos: {pos_weight}\nNeg: {neg_weight}\nNa: {na_weight}')

# class_weights = torch.tensor([1.05*pos_weight, neg_weight, na_weight]).to(device)
# loss_fn = CrossEntropyLoss(weight=class_weights)

# freeze base model layers
for param in model.base_model.parameters():
    param.requires_grad = False

# unfreeze last two layers of base model for fine tuning
#for param in model.base_model.encoder.layer[-2:]:
#    param.requires_grad = True

optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)
#dev_dataloader = DataLoader(dev_set, batch_size=16, shuffle=True, collate_fn=collate_fn)
test_dataloader = DataLoader(test_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at digitalepidemiologylab/covid-twitter-bert-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Define training loop

In [10]:
#def train_covid(model, optim, loss_fn, dataloader, epochs):
def train_covid(model, optim, dataloader, epochs):
  for epoch in range(epochs):
      model.train()
      total_loss = 0

      for batch_idx, batch in enumerate(dataloader):
          optim.zero_grad()
          
          # unpack batch of form (tweets, labels)
          tweets, labels = batch
          # send tweets dict's values to device
          tweets = {key: value.to(device) for key, value in tweets.items()}
          labels = labels.to(device)
          
          # forward pass on CT-BERT
          outputs = model(**tweets, labels=labels)
          #logits = outputs.logits
          
          # class weighted CrossEntropyLoss
          #loss = loss_fn(logits, labels)
          
          # loss provided by model
          loss = outputs.loss 

          # backwards pass on CT-BERT
          loss.backward()
          optim.step()

          total_loss += loss.item()

          print(f"Epoch {epoch + 1}, Batch {batch_idx + 1}/{len(dataloader)}, Loss: {loss.item()}")

      print(f"Epoch {epoch + 1}, Loss: {total_loss}")

### Train model for sequence classification

In [11]:
epochs = 36
#train_covid(model, optimizer, loss_fn, train_dataloader, epochs)
train_covid(model, optimizer, train_dataloader, epochs)

  label = torch.tensor(label, dtype=torch.long)


Epoch 1, Batch 1/462, Loss: 1.0853698253631592
Epoch 1, Batch 2/462, Loss: 1.0650392770767212
Epoch 1, Batch 3/462, Loss: 1.0514171123504639
Epoch 1, Batch 4/462, Loss: 1.0701689720153809
Epoch 1, Batch 5/462, Loss: 1.0448793172836304
Epoch 1, Batch 6/462, Loss: 1.0889793634414673
Epoch 1, Batch 7/462, Loss: 1.0558212995529175
Epoch 1, Batch 8/462, Loss: 1.031874179840088
Epoch 1, Batch 9/462, Loss: 1.0473425388336182
Epoch 1, Batch 10/462, Loss: 1.0855754613876343
Epoch 1, Batch 11/462, Loss: 1.0841615200042725
Epoch 1, Batch 12/462, Loss: 1.0572271347045898
Epoch 1, Batch 13/462, Loss: 0.9914488792419434
Epoch 1, Batch 14/462, Loss: 1.0941407680511475
Epoch 1, Batch 15/462, Loss: 1.0895123481750488
Epoch 1, Batch 16/462, Loss: 1.0619837045669556
Epoch 1, Batch 17/462, Loss: 1.085300326347351
Epoch 1, Batch 18/462, Loss: 0.9517677426338196
Epoch 1, Batch 19/462, Loss: 1.0503504276275635
Epoch 1, Batch 20/462, Loss: 1.1387302875518799
Epoch 1, Batch 21/462, Loss: 1.0425587892532349
Epo

### Evaluate Model

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def evaluate_model(model, dataloader):
    model.eval()
    all_preds = []
    all_labels = []
    
    with torch.no_grad():
        for batch_idx, batch in enumerate(dataloader):
            tweets, labels = batch
            tweets = {key: value.to(device) for key, value in tweets.items()}
            labels = labels.to(device)
            
            # run sequences through CT-BERT
            outputs = model(**tweets, labels=labels)
            
            # highest energy class is our prediction
            logits = outputs.logits
            preds = torch.argmax(logits, 1)
            
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
    
    class_labels = [0, 1, 2]
    per_class_accuracy = {}
    for class_label in class_labels:
        # get indices which match current class_label
        class_indices = np.where(np.array(all_labels) == class_label)[0]
        
        # get predictions of current class label
        class_preds = np.array(all_preds)[class_indices]
        
        # calculate accuracy for current class_label
        correct_class_preds = np.sum(class_preds == class_label)
        total_class_samples = len(class_indices)
        
        per_class_accuracy[class_label] = (correct_class_preds / total_class_samples) * 100
        
    accuracy = 100*accuracy_score(all_labels, all_preds)
    precision = 100*precision_score(all_labels, all_preds, labels=class_labels, average=None, zero_division=0)
    recall = 100*recall_score(all_labels, all_preds, labels=class_labels, average=None, zero_division=0)
    f1 = f1_score(all_labels, all_preds, labels=class_labels, average=None, zero_division=0)

    return accuracy, precision, recall, f1, per_class_accuracy


# evaluate the model on the test set (unaugmented)
accuracy, precision, recall, f1, per_class_accuracy = evaluate_model(model, test_dataloader)

print(f"Test Accuracy: {accuracy:.2f}%")
print(f"Per Class Accuracy: {per_class_accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1: {f1}")

Test Accuracy: 85.90%
Per Class Accuracy: {0: 22.413793103448278, 1: 54.285714285714285, 2: 89.8042414355628}
Precision: [32.5        14.96062992 95.57291667]
Recall: [22.4137931  54.28571429 89.80424144]
F1: [0.26530612 0.2345679  0.92598823]


### Save Model Weights

In [13]:
torch.save(model.state_dict(), f'./models/model_weights23-{accuracy:.1f}.pth')