In [19]:
%pip install numpy transformers pandas torch scikit-learn pyarrow accelerate transformers[torch] transformers[sentencepiece] ipywidgets tqdm datetime imblearn sentencepiece papermill

Note: you may need to restart the kernel to use updated packages.


In [20]:
import os
import time
from datetime import datetime
import string
import random
import logging
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertTokenizer, BertForSequenceClassification, DistilBertTokenizer, DistilBertForSequenceClassification, AlbertTokenizer, AlbertForSequenceClassification, RobertaTokenizer, RobertaForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

# Main Code


In [22]:
def train_model(model, epochs, optimizer, loss_fn, train_dataloader):
    for epoch in range(epochs):
        model.train()
        model.to(device)
        total_loss = 0
        for batch in train_dataloader:
            optimizer.zero_grad()
            inputs = {'input_ids': batch[0].to(device), 'attention_mask': batch[1].to(device), 'labels': batch[2].to(device)}
            outputs = model(**inputs)
            loss = loss_fn(outputs.logits, inputs['labels'])
            loss.backward()
            optimizer.step()
            # total_loss += loss.item()
            total_loss += loss
        avg_train_loss = total_loss / len(train_dataloader)
        print(f"Epoch {epoch+1} average train loss: {avg_train_loss}")
        logging.info(f"Epoch {epoch+1} average train loss: {avg_train_loss}")

# Main Code


In [23]:
# Default parameters, Papermill will overwrite these
categories = 'science'
train_data = None
test_data = None
select_model = 0
freeze_layers_up_to = 0
epochs = 1
weight_for_class_0 = None
weight_for_class_1 = None
learning_rate = 0.00001
min_acc = 0.85
batch_size = 8
epochs = 4

In [24]:
def generate_random_string(length=10):
    letters = string.ascii_letters + string.digits
    return ''.join(random.choice(letters) for _ in range(length))

run_id = generate_random_string()

In [25]:
log_dir_mapping = {
  'crime': './results/crime',
  'science': './results/science',
  'health': './results/health',
  'politics': './results/politics',
  'social_media': './results/social_media'
}

log_dir = log_dir_mapping.get(categories, './results')
os.makedirs(log_dir, exist_ok=True)

log_filename = f'training_log_{datetime.now().strftime("%Y%m%d_%H%M%S")}.log'
log_filepath = os.path.join(log_dir, log_filename)

logging.basicConfig(filename=log_filepath, filemode='w', format='%(asctime)s - %(message)s', level=logging.INFO)
logging.info(f'\nStarting detection model - {run_id}')

In [26]:
root_dir = '../data'

dataset_mapping = {
  'fa-kes': '../data/crime/FA-KES-Dataset.feather',
  'snope': '../data/crime/snope.feather',
  'covid_claims': '../data/health/covid_claims.feather',
  'covid_fake_news': '../data/health/covid_fake_news_dataset.feather',
  'covid_FNIR': '../data/health/covid_FNIR.feather',
  'fake_news': '../data/politics/fake_news_dataset.feather',
  'isot_dataset': '../data/politics/isot_dataset.feather',
  'liar_dataset': '../data/politics/liar_dataset.feather',
  'pheme': '../data/politics/pheme.feather',
  'politifact': '../data/politics/politifact_dataset.feather',
  'climate': '../data/science/climate_dataset.feather',
  'gossipcop': '../data/social_media/gossipcop.feather',
  'isot_social': '../data/social_media/isot_dataset.feather',
  'isot_multipurpose': '../data/isot_multipurpose_small.feather'
}

def load_data_from_category(category):
    files = os.listdir(os.path.join(root_dir, category))
    dataframes = []
    for file in files:
        if file.endswith('.feather'):
            df = pd.read_feather(os.path.join(root_dir, category, file))
            dataframes.append(df)
    return pd.concat(dataframes, ignore_index=True)

def load_as_train_or_test(dataset_list):
    dataframes = []
    for dataset in dataset_list:
        df = pd.read_feather(dataset_mapping.get(dataset))
        dataframes.append(df)
    return pd.concat(dataframes, ignore_index=True)

# Use this one when running multiple categories
#combined_dataframes = [load_data_from_category(category) for category in categories] 
# This one is for single category
if train_data is None and test_data is None:
    train_data_set = False
    combined_dataframes = [load_data_from_category(categories)]
    combined_df = pd.concat(combined_dataframes, ignore_index=True)
    combined_df.dropna(inplace=True)
    datasets_size = len(combined_df)
    print("combined dataframe: ")
    print(combined_df)

# This one is when specific training and testing sets are used
if train_data != None and test_data != None:
    train_data_set = True
    train_dataset_name, test_dataset_name = train_data, test_data
    train_df = load_as_train_or_test(train_data)
    test_df = load_as_train_or_test(test_data)
    train_df.dropna(inplace=True)
    test_df.dropna(inplace=True)
    train_dataset_size, test_dataset_size = len(train_df), len(test_df)
    print("training dataframe: ")
    print(train_df.head())
    print("testing dataframe: ")
    print(test_df.head())

                                                  text  label  \
0    Global warming is driving polar bears toward e...      0   
1    The sun has gone into ‘lockdown’ which could c...      0   
2    They tell us that we are the primary forces co...      0   
3    The Great Barrier Reef is experiencing the mos...      0   
4    Volcanoes Melting West Antarctic Glaciers, Not...      0   
..                                                 ...    ...   
902  No warming since at least 1995, no melting gla...      1   
903  This was the case last year too, while earlier...      1   
904  "Disasters Cost More Than Ever — But Not Becau...      1   
905  CO2 constitutes 80% of the non-condensing gree...      1   
906           the Great Barrier Reef is in fine fettle      1   

                                              metadata  
0    [{'article': 'Extinction risk from global warm...  
1    [{'article': 'Famine', 'entropy': 0.0, 'eviden...  
2    [{'article': 'Carbon dioxide', 'entropy': 0

In [27]:
'''
def load_data_from_category(category, filenames):
    dataframes = []
    for file_name in filenames:
        file_path = os.path.join(root_dir, category, file_name)
        if os.path.exists(file_path):
            df = pd.read_feather(file_path)
            dataframes.append(df)
        else:
            print(f"The file '{file_name}' in the '{category}' category does not exist.")
    return pd.concat(dataframes, ignore_index=True)

root_dir = '../data'
category = 'health'

# List of filenames to include in combined_df
#included_filenames = ['isot_dataset.feather', 'fake_news_dataset.feather', 'pheme.feather', 'liar_dataset.feather', 'politifact_dataset.feather']
included_filenames = ['covid_claims.feather', 'covid_fake_news_dataset.feather', 'covid_FNIR.feather']
combined_df = load_data_from_category(category, included_filenames)

# Drop NaN values
combined_df.dropna(inplace=True)

# Print the resulting DataFrame
print(combined_df)
'''

'\ndef load_data_from_category(category, filenames):\n    dataframes = []\n    for file_name in filenames:\n        file_path = os.path.join(root_dir, category, file_name)\n        if os.path.exists(file_path):\n            df = pd.read_feather(file_path)\n            dataframes.append(df)\n        else:\n            print(f"The file \'{file_name}\' in the \'{category}\' category does not exist.")\n    return pd.concat(dataframes, ignore_index=True)\n\nroot_dir = \'../data\'\ncategory = \'health\'\n\n# List of filenames to include in combined_df\n#included_filenames = [\'isot_dataset.feather\', \'fake_news_dataset.feather\', \'pheme.feather\', \'liar_dataset.feather\', \'politifact_dataset.feather\']\nincluded_filenames = [\'covid_claims.feather\', \'covid_fake_news_dataset.feather\', \'covid_FNIR.feather\']\ncombined_df = load_data_from_category(category, included_filenames)\n\n# Drop NaN values\ncombined_df.dropna(inplace=True)\n\n# Print the resulting DataFrame\nprint(combined_df)\n

In [28]:
# May need to include more columns to process metadata
if train_data is None and test_data is None:
  texts = combined_df['text'].values
  labels = combined_df['label'].values
  num_classes = combined_df['label'].nunique()
  print(texts[:5])
  print(labels[:5])

  unique_classes, class_counts = np.unique(labels, return_counts=True)

if train_data != None and test_data != None:
  train_texts = train_df['text'].values
  train_labels = train_df['label'].values
  test_texts = test_df['text'].values
  test_labels = test_df['label'].values
  num_classes = max(train_df['label'].nunique(), test_df['label'].nunique())
  print(train_texts[:5])
  print(train_labels[:5])
  print(test_texts[:5])
  print(test_labels[:5])

  unique_classes, class_counts = np.unique(train_labels, return_counts=True)

for class_label, count in zip(unique_classes, class_counts):
  print(f"Class {class_label}: {count} instances")

print(f"Number of unique classes: {num_classes}")

['Global warming is driving polar bears toward extinction'
 'The sun has gone into ‘lockdown’ which could cause freezing weather, earthquakes and famine, say scientists'
 'They tell us that we are the primary forces controlling earth temperatures by the burning of fossil fuels and releasing their carbon dioxide.'
 'The Great Barrier Reef is experiencing the most widespread bleaching ever recorded'
 'Volcanoes Melting West Antarctic Glaciers, Not Global Warming']
[0 0 0 0 0]
Number of unique classes: 2
Class 0: 654 instances
Class 1: 253 instances


In [29]:
# Split the dataset into training and testing sets
if train_data is None and test_data is None:
  train_texts, test_texts, train_labels, test_labels = train_test_split(texts, labels, test_size=0.2, random_state=7623)
  print("Split category data into training and testing sets")

In [30]:
balancing_strategy = None

if weight_for_class_0 is None and weight_for_class_1 is None:
  balancing_strategy = 'Default weights'
  # Calculate class frequencies
  class_counts = np.bincount(labels) if train_data is None else np.bincount(train_labels)
  total_samples = np.sum(class_counts)

  # Calculate class weights, the original distribution of weights
  weight_for_class_0 = class_counts[0] / sum(class_counts)
  weight_for_class_1 = class_counts[1] / sum(class_counts)
elif weight_for_class_0 == 'auto' and weight_for_class_1 == 'auto':
  balancing_strategy = 'Auto weights'
  # Calculate class frequencies
  class_counts = np.bincount(labels) if train_data is None else np.bincount(train_labels)
  total_samples = np.sum(class_counts)

  # Calculate class weights
  weight_for_class_0 = class_counts[1] / sum(class_counts)
  weight_for_class_1 = class_counts[0] / sum(class_counts)
else:
  balancing_strategy = 'Manual weights'

print(f"Weight for class 0: {weight_for_class_0:.2f}")
print(f"Weight for class 1: {weight_for_class_1:.2f}")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [30]:
# Load the pre-trained BERT (or BERT variation) model and tokenizer
model_mapping = {
    0: (BertTokenizer, BertForSequenceClassification, 'bert-base-uncased'),
    1: (BertTokenizer, BertForSequenceClassification, 'bert-base-cased'),
    2: (DistilBertTokenizer, DistilBertForSequenceClassification, 'distilbert-base-uncased-finetuned-sst-2-english'),
    3: (DistilBertTokenizer, DistilBertForSequenceClassification, 'distilbert-base-uncased'),
    4: (RobertaTokenizer, RobertaForSequenceClassification, 'roberta-base'),
    5: (AlbertTokenizer, AlbertForSequenceClassification, 'albert-base-v2')
}

tokenizer_class, model_class, model_name = model_mapping.get(select_model, (None, None, None))

if tokenizer_class and model_class and model_name:
    tokenizer = tokenizer_class.from_pretrained(model_name)
    model = model_class.from_pretrained(model_name, num_labels=2)
else:
    logging.error(f"Invalid model selection: {select_model}")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [31]:
# Freeze the layers up to the specified layer
if freeze_layers_up_to > 0:
    for param in model.parameters():
        param.requires_grad = False

    if select_model == 0 or select_model == 1:
        print("Layers: "+str(len(model.bert.encoder.layer)))
        for param in model.bert.embeddings.parameters():
            param.requires_grad = True
    elif select_model == 2 or select_model == 3:
        print("Layers: "+str(len(model.distilbert.transformer.layer)))
        for param in model.distilbert.embeddings.parameters():
            param.requires_grad = True
    elif select_model == 4:
        print("Layers: "+str(len(model.roberta.encoder.layer)))
        for param in model.roberta.embeddings.parameters():
            param.requires_grad = True
    elif select_model == 5:
        print("Layers: "+str(len(model.albert.encoder.albert_layer_groups)))
        for param in model.albert.embeddings.parameters():
            param.requires_grad = True


In [32]:
# Tokenize the texts
train_encodings = tokenizer(list(train_texts), truncation=True, padding=True)
test_encodings = tokenizer(list(test_texts), truncation=True, padding=True)

In [33]:
# Convert the encodings to PyTorch tensors
train_inputs = torch.tensor(train_encodings['input_ids'])
train_masks = torch.tensor(train_encodings['attention_mask'])
train_labels = torch.tensor(train_labels)

test_inputs = torch.tensor(test_encodings['input_ids'])
test_masks = torch.tensor(test_encodings['attention_mask'])
test_labels = torch.tensor(test_labels)

# Create a DataLoader for training and testing
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_dataloader = DataLoader(train_data, batch_size=batch_size)

test_data = TensorDataset(test_inputs, test_masks, test_labels)
test_dataloader = DataLoader(test_data, batch_size=batch_size)

In [34]:
# Check for GPU availability
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f'Using device: {device}')

Using device: cuda


In [35]:
# Move the model and data to the GPU
model.to(device)
train_inputs, train_masks, train_labels = train_inputs.to(device), train_masks.to(device), train_labels.to(device)
test_inputs, test_masks, test_labels = test_inputs.to(device), test_masks.to(device), test_labels.to(device)

# Define class weights based on class imbalance
class_weights = torch.tensor([weight_for_class_0, weight_for_class_1], dtype=torch.float)

# Define the loss function with class weights
loss_fn = nn.CrossEntropyLoss(weight=torch.tensor(class_weights).to(device))

# Define the optimizer with a learning rate
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

# Model training code
def train_model(model, epochs, optimizer, loss_fn, train_dataloader):
    for epoch in range(epochs):
        model.train()
        model.to(device)
        total_loss = 0
        for batch in train_dataloader:
            optimizer.zero_grad()
            inputs = {'input_ids': batch[0].to(device), 'attention_mask': batch[1].to(device), 'labels': batch[2].to(device)}
            outputs = model(**inputs)
            loss = loss_fn(outputs.logits, inputs['labels'])
            loss.backward()
            optimizer.step()
        avg_train_loss = total_loss / len(train_dataloader)
        print(f"Epoch {epoch+1} average train loss: {avg_train_loss}")
        logging.info(f"Epoch {epoch+1} average train loss: {avg_train_loss}")

# Fine-tune the pre-trained BERT model
train_start_time = time.time()
train_model(model, epochs, optimizer, loss_fn, train_dataloader)
train_end_time = time.time()

Epoch 1 average train loss: 0.0
Epoch 2 average train loss: 0.0
Epoch 3 average train loss: 0.0
Epoch 4 average train loss: 0.0
Epoch 5 average train loss: 0.0


In [36]:
# Evaluate the model
eval_start_time = time.time()
model.eval()
predictions = []
for batch in test_dataloader:
    inputs = {'input_ids': batch[0].to(device), 'attention_mask': batch[1].to(device), 'labels': None}
    with torch.no_grad():
        outputs = model(**inputs)
    logits = outputs.logits
    predictions.extend(logits.argmax(dim=1).cpu().tolist())
eval_end_time = time.time()

In [37]:
# Calculate additional metrics
precision = precision_score(test_labels.cpu(), predictions)
recall = recall_score(test_labels.cpu(), predictions)
f1 = f1_score(test_labels.cpu(), predictions)
accuracy = accuracy_score(test_labels.cpu(), predictions)
g_mean = (recall*accuracy)**0.5

In [38]:
save_dir = './models'
os.makedirs(save_dir, exist_ok=True)

saved_model_name = f"bert_model_{run_id}.pt"
model_path = os.path.join(save_dir, saved_model_name)

# Only save model if accuracy meets minimum threshold
if accuracy > min_acc:
    torch.save(model.state_dict(), model_path)

In [39]:
# Log the results
logging.info("Evaluation Results")
logging.info(f"Training time: {train_end_time - train_start_time} seconds")
logging.info(f"Inference time: {eval_end_time - eval_start_time} seconds")
logging.info(f"Precision: {precision}")
logging.info(f"Recall: {recall}")
logging.info(f"F-score: {f1}")
logging.info(f"Accuracy: {accuracy}")
logging.info(f"G-mean: {g_mean}")
logging.info("Additional Info")
logging.info(f"Model name: {model_name}")
if not train_data_set:
  logging.info(f"Datasets list: {categories}")
  logging.info(f"Dataset size: {datasets_size}")
else:
  logging.info(f"Datasets list: {categories} Train: {train_dataset_name}, Test: {test_dataset_name}")
  logging.info(f"Dataset size: Train: {train_dataset_size}, Test: {test_dataset_size}")
logging.info(f"Layers frozen: {freeze_layers_up_to}")
logging.info(f"Learning rate: {learning_rate}")
logging.info(f"Class weights: {balancing_strategy} - {[weight_for_class_0, weight_for_class_1]}")
if accuracy > min_acc:
  logging.info(f"Model saved to: {model_path}")
else:
  logging.info("Model not saved, didn't meet minimum accuracy threshold")

# Print the evaluation results
print("Evaluation Results:")
print(f"Training time: {train_end_time - train_start_time} seconds")
print(f"Inference time: {eval_end_time - eval_start_time} seconds")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F-score: {f1}")
print(f"Accuracy: {accuracy}")
print(f"G-mean: {g_mean}")
print(f"Epoch: {epochs}")
print(f"Batch size: {batch_size}")

Evaluation Results:
Training time: 31.90778636932373 seconds
Inference time: 0.4577946662902832 seconds
Precision: 0.5
Recall: 0.4772727272727273
F-score: 0.48837209302325585
Accuracy: 0.7582417582417582
G-mean: 0.6015713689065595
